text-generation-inference/backends/neuron/tests/server/test_prefill.py

from helpers import create_request
from text_generation_server.generator import NeuronGenerator
from text_generation_server.pb.generate_pb2 import Batch


def test_prefill(neuron_model_config):
    """Check the first prefill token for several batch sizes and decoding modes.

    A single generator is loaded from the exported model path and reused to
    prefill first one request, then ``max_batch_size`` requests. Each batch
    size is exercised with sampling and with greedy decoding, and
    ``generator.clear()`` is called after every combination.
    """
    config_name = neuron_model_config["name"]
    generator = NeuronGenerator.from_pretrained(
        neuron_model_config["neuron_model_path"]
    )
    max_batch_size = 4
    # The exported model must be able to hold the largest batch we submit.
    assert generator.model.batch_size >= max_batch_size
    mode_labels = {True: "sample", False: "greedy"}
    scenarios = [
        (num_requests, do_sample)
        for num_requests in (1, max_batch_size)
        for do_sample in (True, False)
    ]
    for num_requests, do_sample in scenarios:
        mode = mode_labels[do_sample]
        print(f"[{mode}]: {num_requests} requests")
        _test_prefill(config_name, generator, num_requests, do_sample)
        generator.clear()


def _test_prefill(config_name, generator, batch_size, do_sample):
    """Prefill ``batch_size`` identical requests and check the first token.

    Args:
        config_name: Key selecting the expected token for the model under test.
        generator: Object exposing ``model.max_length`` and ``prefill(batch)``.
        batch_size: Number of identical requests placed in the batch.
        do_sample: Whether the requests ask for sampling or greedy decoding.
    """
    prompt = "It was a bright cold day in April, and the clocks were striking thirteen."
    new_token_budget = 20
    requests = [
        create_request(
            id=request_id,
            inputs=prompt,
            do_sample=do_sample,
            max_new_tokens=new_token_budget,
        )
        for request_id in range(batch_size)
    ]
    # Let's be pessimistic when estimating max_tokens
    max_tokens = batch_size * generator.model.max_length
    batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=max_tokens)
    generations, next_batch = generator.prefill(batch)
    assert next_batch.size == batch_size
    # Whatever was passed as max_tokens, the server will correct it
    # because of static batching
    assert next_batch.max_tokens == max_tokens
    assert len(generations) == batch_size
    # Expected [token id, token text] of the first generated token, keyed by
    # decoding mode (True: sampling, False: greedy) and then by model config.
    first_token_table = {
        True: {
            "gpt2": [383, " The"],
            "llama": [10058, " George"],
            "mistral": [450, " The"],
            "qwen2": [362, " A"],
            "granite": [308, " ("],
        },
        False: {
            "gpt2": [198, "\n"],
            "llama": [10058, " George"],
            "mistral": [13, "\n"],
            "qwen2": [358, " I"],
            "granite": [203, "\n"],
        },
    }
    expected_id, expected_text = first_token_table[bool(do_sample)][config_name]
    for generation in generations:
        first = generation.tokens
        assert first.ids[0] == expected_id
        assert first.texts[0] == expected_text


def test_prefill_truncate(neuron_model_config):
    """Verify the first generated token when requests use different truncation.

    A full batch (``generator.model.batch_size`` requests) of identical prompts
    is prefilled. The first request is not truncated; request ``i`` (for
    ``i >= 1``) is created with ``truncate=i * 3``. The text of the first
    generated token of every request is compared with a per-model table.
    """
    config_name = neuron_model_config["name"]
    neuron_model_path = neuron_model_config["neuron_model_path"]
    generator = NeuronGenerator.from_pretrained(neuron_model_path)
    batch_size = generator.model.batch_size
    # We apply truncation to all requests but the first one
    truncate = [None] + [i * 3 for i in range(1, batch_size)]
    input_text = (
        "Two gin-scented tears trickled down the sides of his nose."
        " But it was all right, everything was all right, the struggle was finished."
        " He had won the victory over himself. He loved Big Brother."
    )
    requests = [
        create_request(id=i, inputs=input_text, truncate=truncate[i])
        for i in range(batch_size)
    ]
    max_length = generator.model.max_length
    batch = Batch(
        id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length
    )
    generations, _ = generator.prefill(batch)
    # Without this check a prefill returning too few generations would pass
    # silently, since the loop below only visits what was actually returned.
    assert len(generations) == batch_size
    # Even if the input text is identical for all requests, the first generated token might
    # be different because of the truncation
    expectations = {
        "gpt2": [" He", " He", "\n", " He"],
        "llama": [" —", " The", " He", " He"],
        "mistral": [" He", "\n", " He", " He"],
        "qwen2": [" He", " The", " He", " He"],
        "granite": ["\n", "\n", " I", " He"],
    }[config_name]
    # Fail with a readable message instead of an IndexError when the model
    # batch is larger than the expectation table.
    assert batch_size <= len(expectations), (
        f"no expectation for {batch_size} requests with config {config_name!r}"
    )
    for i, g in enumerate(generations):
        tokens = g.tokens
        assert tokens.texts[0] == expectations[i], f"unexpected first token for request {i}"
Add Neuron backend (#3033) * feat: add neuron backend * feat(neuron): add server standalone installation * feat(neuron): add server and integration tests * fix(neuron): increase ulimit when building image The base image used to compile the rust components seems to have a low ulimit for opened files, which leads to errors during compilation. * test(neuron): merge integration tests and fixtures * test: add --neuron option * review: do not use latest tag * review: remove ureq pinned version * review: --privileged should be the exception * feat: add neuron case to build ci * fix(neuron): export models from container in test fixtures The neuron tests require models to have been previously exported and cached on the hub. This is done automatically by the neuron.model fixture the first time the tests are ran for a specific version. This fixture used to export the models using optimum-neuron directly, but this package is not necessarily present on the system. Instead, it is now done through the neuron TGI itself, since it contains all the tools required to export the models. Note that since the CI runs docker in docker (dind) it does not seem possible to share a volume between the CI container and the container used to export the model. For that reason, a specific image with a modified entrypoint is built on-the-fly when a model export is required. * refactor: remove sagemaker entry-point The SageMaker image is built differently anyway. * fix(neuron): avoid using Levenshtein * test(neuron): use smaller llama model * feat(neuron): avoid installing CUDA in image * test(neuron): no error anymore when requesting too many tokens * ci: doing a precompilation step (with a different token). * test(neuron): avoid using image sha when exporting models We now manually evaluate the apparent hash of the neuron backend by combining the hash of the neuron backend directory and Dockerfile. This new hash is used to identify exported neuron models instead of the image sha. 
This has two benefits: - it changes less frequently (only when the neuron backend changes), which means fewer neuron models being pushed to the hub, - it can be evaluated locally, meaning that running the tests once locally will export the models before the CI uses them. * test(neuron): added a small script to prune test models --------- Co-authored-by: drbh <david.richard.holtz@gmail.com> Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> 2025-02-24 08:10:05 +00:00			`from helpers import create_request`
			`from text_generation_server.generator import NeuronGenerator`
			`from text_generation_server.pb.generate_pb2 import Batch`


			`def test_prefill(neuron_model_config):`
			`"""Verify that a prefill for a single request generates the expected output."""`
			`config_name = neuron_model_config["name"]`
			`neuron_model_path = neuron_model_config["neuron_model_path"]`
			`generator = NeuronGenerator.from_pretrained(neuron_model_path)`
			`max_batch_size = 4`
			`assert generator.model.batch_size >= max_batch_size`
			`for num_requests in [1, max_batch_size]:`
			`for do_sample in [True, False]:`
			`mode = "sample" if do_sample else "greedy"`
			`print(f"[{mode}]: {num_requests} requests")`
			`_test_prefill(config_name, generator, num_requests, do_sample)`
			`generator.clear()`


			`def _test_prefill(config_name, generator, batch_size, do_sample):`
			`requests = []`
			`max_new_tokens = 20`
fix: run linters and fix formatting (#3057) 2025-02-25 21:11:34 +00:00			`input_text = (`
			`"It was a bright cold day in April, and the clocks were striking thirteen."`
			`)`
Add Neuron backend (#3033) * feat: add neuron backend * feat(neuron): add server standalone installation * feat(neuron): add server and integration tests * fix(neuron): increase ulimit when building image The base image used to compile the rust components seems to have a low ulimit for opened files, which leads to errors during compilation. * test(neuron): merge integration tests and fixtures * test: add --neuron option * review: do not use latest tag * review: remove ureq pinned version * review: --privileged should be the exception * feat: add neuron case to build ci * fix(neuron): export models from container in test fixtures The neuron tests require models to have been previously exported and cached on the hub. This is done automatically by the neuron.model fixture the first time the tests are ran for a specific version. This fixture used to export the models using optimum-neuron directly, but this package is not necessarily present on the system. Instead, it is now done through the neuron TGI itself, since it contains all the tools required to export the models. Note that since the CI runs docker in docker (dind) it does not seem possible to share a volume between the CI container and the container used to export the model. For that reason, a specific image with a modified entrypoint is built on-the-fly when a model export is required. * refactor: remove sagemaker entry-point The SageMaker image is built differently anyway. * fix(neuron): avoid using Levenshtein * test(neuron): use smaller llama model * feat(neuron): avoid installing CUDA in image * test(neuron): no error anymore when requesting too many tokens * ci: doing a precompilation step (with a different token). * test(neuron): avoid using image sha when exporting models We now manually evaluate the apparent hash of the neuron backend by combining the hash of the neuron backend directory and Dockerfile. This new hash is used to identify exported neuron models instead of the image sha. 
This has two benefits: - it changes less frequently (only when the neuron backend changes), which means fewer neuron models being pushed to the hub, - it can be evaluated locally, meaning that running the tests once locally will export the models before the CI uses them. * test(neuron): added a small script to prune test models --------- Co-authored-by: drbh <david.richard.holtz@gmail.com> Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> 2025-02-24 08:10:05 +00:00			`for i in range(batch_size):`
fix: run linters and fix formatting (#3057) 2025-02-25 21:11:34 +00:00			`requests.append(`
			`create_request(`
			`id=i,`
			`inputs=input_text,`
			`do_sample=do_sample,`
			`max_new_tokens=max_new_tokens,`
			`)`
			`)`
Add Neuron backend (#3033) * feat: add neuron backend * feat(neuron): add server standalone installation * feat(neuron): add server and integration tests * fix(neuron): increase ulimit when building image The base image used to compile the rust components seems to have a low ulimit for opened files, which leads to errors during compilation. * test(neuron): merge integration tests and fixtures * test: add --neuron option * review: do not use latest tag * review: remove ureq pinned version * review: --privileged should be the exception * feat: add neuron case to build ci * fix(neuron): export models from container in test fixtures The neuron tests require models to have been previously exported and cached on the hub. This is done automatically by the neuron.model fixture the first time the tests are ran for a specific version. This fixture used to export the models using optimum-neuron directly, but this package is not necessarily present on the system. Instead, it is now done through the neuron TGI itself, since it contains all the tools required to export the models. Note that since the CI runs docker in docker (dind) it does not seem possible to share a volume between the CI container and the container used to export the model. For that reason, a specific image with a modified entrypoint is built on-the-fly when a model export is required. * refactor: remove sagemaker entry-point The SageMaker image is built differently anyway. * fix(neuron): avoid using Levenshtein * test(neuron): use smaller llama model * feat(neuron): avoid installing CUDA in image * test(neuron): no error anymore when requesting too many tokens * ci: doing a precompilation step (with a different token). * test(neuron): avoid using image sha when exporting models We now manually evaluate the apparent hash of the neuron backend by combining the hash of the neuron backend directory and Dockerfile. This new hash is used to identify exported neuron models instead of the image sha. 
This has two benefits: - it changes less frequently (only when the neuron backend changes), which means fewer neuron models being pushed to the hub, - it can be evaluated locally, meaning that running the tests once locally will export the models before the CI uses them. * test(neuron): added a small script to prune test models --------- Co-authored-by: drbh <david.richard.holtz@gmail.com> Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> 2025-02-24 08:10:05 +00:00			`# Let's be pessimistic when estimating max_tokens`
			`max_length = generator.model.max_length`
fix: run linters and fix formatting (#3057) 2025-02-25 21:11:34 +00:00			`batch = Batch(`
			`id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length`
			`)`
Add Neuron backend (#3033) * feat: add neuron backend * feat(neuron): add server standalone installation * feat(neuron): add server and integration tests * fix(neuron): increase ulimit when building image The base image used to compile the rust components seems to have a low ulimit for opened files, which leads to errors during compilation. * test(neuron): merge integration tests and fixtures * test: add --neuron option * review: do not use latest tag * review: remove ureq pinned version * review: --privileged should be the exception * feat: add neuron case to build ci * fix(neuron): export models from container in test fixtures The neuron tests require models to have been previously exported and cached on the hub. This is done automatically by the neuron.model fixture the first time the tests are ran for a specific version. This fixture used to export the models using optimum-neuron directly, but this package is not necessarily present on the system. Instead, it is now done through the neuron TGI itself, since it contains all the tools required to export the models. Note that since the CI runs docker in docker (dind) it does not seem possible to share a volume between the CI container and the container used to export the model. For that reason, a specific image with a modified entrypoint is built on-the-fly when a model export is required. * refactor: remove sagemaker entry-point The SageMaker image is built differently anyway. * fix(neuron): avoid using Levenshtein * test(neuron): use smaller llama model * feat(neuron): avoid installing CUDA in image * test(neuron): no error anymore when requesting too many tokens * ci: doing a precompilation step (with a different token). * test(neuron): avoid using image sha when exporting models We now manually evaluate the apparent hash of the neuron backend by combining the hash of the neuron backend directory and Dockerfile. This new hash is used to identify exported neuron models instead of the image sha. 
This has two benefits: - it changes less frequently (only when the neuron backend changes), which means fewer neuron models being pushed to the hub, - it can be evaluated locally, meaning that running the tests once locally will export the models before the CI uses them. * test(neuron): added a small script to prune test models --------- Co-authored-by: drbh <david.richard.holtz@gmail.com> Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> 2025-02-24 08:10:05 +00:00			`generations, next_batch = generator.prefill(batch)`
			`assert next_batch.size == batch_size`
			`# Whatever was passed as max_tokens, the server will correct it`
			`# because of static batching`
			`assert next_batch.max_tokens == batch_size * max_length`
			`assert len(generations) == batch_size`
			`if do_sample:`
			`expectations = {`
			`"gpt2": [383, " The"],`
			`"llama": [10058, " George"],`
			`"mistral": [450, " The"],`
			`"qwen2": [362, " A"],`
			`"granite": [308, " ("],`
			`}[config_name]`
			`else:`
			`expectations = {`
			`"gpt2": [198, "\n"],`
			`"llama": [10058, " George"],`
			`"mistral": [13, "\n"],`
			`"qwen2": [358, " I"],`
			`"granite": [203, "\n"],`
			`}[config_name]`
			`for g in generations:`
			`tokens = g.tokens`
			`assert tokens.ids[0] == expectations[0]`
			`assert tokens.texts[0] == expectations[1]`


			`def test_prefill_truncate(neuron_model_config):`
			`config_name = neuron_model_config["name"]`
			`neuron_model_path = neuron_model_config["neuron_model_path"]`
			`generator = NeuronGenerator.from_pretrained(neuron_model_path)`
			`batch_size = generator.model.batch_size`
			`# We apply truncation to all requests but the first one`
			`truncate = [`
			`None,`
			`] + [i * 3 for i in range(1, batch_size)]`
			`input_text = (`
			`"Two gin-scented tears trickled down the sides of his nose."`
			`" But it was all right, everything was all right, the struggle was finished."`
			`" He had won the victory over himself. He loved Big Brother."`
			`)`
			`requests = []`
			`for i in range(batch_size):`
			`requests.append(create_request(id=i, inputs=input_text, truncate=truncate[i]))`
			`max_length = generator.model.max_length`
fix: run linters and fix formatting (#3057) 2025-02-25 21:11:34 +00:00			`batch = Batch(`
			`id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length`
			`)`
Add Neuron backend (#3033) * feat: add neuron backend * feat(neuron): add server standalone installation * feat(neuron): add server and integration tests * fix(neuron): increase ulimit when building image The base image used to compile the rust components seems to have a low ulimit for opened files, which leads to errors during compilation. * test(neuron): merge integration tests and fixtures * test: add --neuron option * review: do not use latest tag * review: remove ureq pinned version * review: --privileged should be the exception * feat: add neuron case to build ci * fix(neuron): export models from container in test fixtures The neuron tests require models to have been previously exported and cached on the hub. This is done automatically by the neuron.model fixture the first time the tests are ran for a specific version. This fixture used to export the models using optimum-neuron directly, but this package is not necessarily present on the system. Instead, it is now done through the neuron TGI itself, since it contains all the tools required to export the models. Note that since the CI runs docker in docker (dind) it does not seem possible to share a volume between the CI container and the container used to export the model. For that reason, a specific image with a modified entrypoint is built on-the-fly when a model export is required. * refactor: remove sagemaker entry-point The SageMaker image is built differently anyway. * fix(neuron): avoid using Levenshtein * test(neuron): use smaller llama model * feat(neuron): avoid installing CUDA in image * test(neuron): no error anymore when requesting too many tokens * ci: doing a precompilation step (with a different token). * test(neuron): avoid using image sha when exporting models We now manually evaluate the apparent hash of the neuron backend by combining the hash of the neuron backend directory and Dockerfile. This new hash is used to identify exported neuron models instead of the image sha. 
This has two benefits: - it changes less frequently (only when the neuron backend changes), which means fewer neuron models being pushed to the hub, - it can be evaluated locally, meaning that running the tests once locally will export the models before the CI uses them. * test(neuron): added a small script to prune test models --------- Co-authored-by: drbh <david.richard.holtz@gmail.com> Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> 2025-02-24 08:10:05 +00:00			`generations, _ = generator.prefill(batch)`
			`# Even if the input text is identical for all requests, the first generated token might`
			`# be different because of the truncation`
			`expectations = {`
			`"gpt2": [" He", " He", "\n", " He"],`
			`"llama": [" —", " The", " He", " He"],`
			`"mistral": [" He", "\n", " He", " He"],`
			`"qwen2": [" He", " The", " He", " He"],`
			`"granite": ["\n", "\n", " I", " He"],`
			`}[config_name]`
			`for i, g in enumerate(generations):`
			`tokens = g.tokens`
			`assert tokens.texts[0] == expectations[i]`