From 856d7682cf582d6351d668f4b75b730d079a52ba Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 12 Feb 2025 09:10:47 +0000 Subject: [PATCH] feat(neuron): add server and integration tests --- backends/neuron/Makefile | 10 +- backends/neuron/tests/conftest.py | 1 + backends/neuron/tests/fixtures/model.py | 129 ++++++++++ backends/neuron/tests/fixtures/service.py | 240 ++++++++++++++++++ .../neuron/tests/integration/test_generate.py | 96 +++++++ .../tests/integration/test_implicit_env.py | 76 ++++++ backends/neuron/tests/pytest.ini | 2 + backends/neuron/tests/requirements.txt | 19 ++ backends/neuron/tests/server/helpers.py | 149 +++++++++++ .../tests/server/test_continuous_batching.py | 74 ++++++ backends/neuron/tests/server/test_decode.py | 55 ++++ .../tests/server/test_generator_slot.py | 61 +++++ backends/neuron/tests/server/test_info.py | 10 + backends/neuron/tests/server/test_prefill.py | 89 +++++++ 14 files changed, 1010 insertions(+), 1 deletion(-) create mode 100644 backends/neuron/tests/conftest.py create mode 100644 backends/neuron/tests/fixtures/model.py create mode 100644 backends/neuron/tests/fixtures/service.py create mode 100644 backends/neuron/tests/integration/test_generate.py create mode 100644 backends/neuron/tests/integration/test_implicit_env.py create mode 100644 backends/neuron/tests/pytest.ini create mode 100644 backends/neuron/tests/requirements.txt create mode 100644 backends/neuron/tests/server/helpers.py create mode 100644 backends/neuron/tests/server/test_continuous_batching.py create mode 100644 backends/neuron/tests/server/test_decode.py create mode 100644 backends/neuron/tests/server/test_generator_slot.py create mode 100644 backends/neuron/tests/server/test_info.py create mode 100644 backends/neuron/tests/server/test_prefill.py diff --git a/backends/neuron/Makefile b/backends/neuron/Makefile index 6da77f79..4806ac7d 100644 --- a/backends/neuron/Makefile +++ b/backends/neuron/Makefile @@ -16,7 +16,7 @@ mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST))) mkfile_dir := $(dir $(mkfile_path)) root_dir := "${mkfile_dir}/../.." 
-.PHONY: image install_server +.PHONY: image install_server test_server test_integration VERSION := $(shell gawk 'match($$0, /^version = "(.*)"/, a) {print a[1]}' ${root_dir}/Cargo.toml) @@ -28,3 +28,11 @@ image: install_server: make -C ${mkfile_dir}/server install VERSION:=${VERSION} + +test_server: install_server + python -m pip install -r ${mkfile_dir}/tests/requirements.txt + python -m pytest -sv ${mkfile_dir}/tests/server + +test_integration: image + python -m pip install -r ${mkfile_dir}/tests/requirements.txt + python -m pytest -sv ${mkfile_dir}/tests/integration diff --git a/backends/neuron/tests/conftest.py b/backends/neuron/tests/conftest.py new file mode 100644 index 00000000..f0fc72ab --- /dev/null +++ b/backends/neuron/tests/conftest.py @@ -0,0 +1 @@ +pytest_plugins = ["fixtures.service", "fixtures.model"] diff --git a/backends/neuron/tests/fixtures/model.py b/backends/neuron/tests/fixtures/model.py new file mode 100644 index 00000000..6fa63ce8 --- /dev/null +++ b/backends/neuron/tests/fixtures/model.py @@ -0,0 +1,129 @@ +import copy +import logging +import subprocess +import sys +from tempfile import TemporaryDirectory + +import huggingface_hub +import pytest +from transformers import AutoTokenizer + +from optimum.neuron import NeuronModelForCausalLM +from optimum.neuron.utils import synchronize_hub_cache +from optimum.neuron.version import __sdk_version__ as sdk_version +from optimum.neuron.version import __version__ as version + + +logging.basicConfig( + level=logging.INFO, + format="[%(asctime)s] %(levelname)s [%(filename)s.%(funcName)s:%(lineno)d] %(message)s", + stream=sys.stdout, +) +logger = logging.getLogger(__file__) + +OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache" + +# All model configurations below will be added to the neuron_model_config fixture +MODEL_CONFIGURATIONS = { + "gpt2": { + "model_id": "gpt2", + "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"}, + }, + "llama": { + "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B", + "export_kwargs": {"batch_size": 4, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16"}, + }, + "mistral": { + "model_id": "optimum/mistral-1.1b-testing", + "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"}, + }, + "qwen2": { + "model_id": "Qwen/Qwen2.5-0.5B", + "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"}, + }, + "granite": { + "model_id": "ibm-granite/granite-3.1-2b-instruct", + "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"}, + }, +} + + +def get_hub_neuron_model_id(config_name: str): + return f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}" + + +def export_model(model_id, export_kwargs, neuron_model_path): + export_command = ["optimum-cli", "export", "neuron", "-m", model_id, "--task", "text-generation"] + for kwarg, value in export_kwargs.items(): + export_command.append(f"--{kwarg}") + export_command.append(str(value)) + export_command.append(neuron_model_path) + logger.info(f"Exporting {model_id} with {export_kwargs}") + try: + subprocess.run(export_command, check=True) + except subprocess.CalledProcessError as e: + raise ValueError(f"Failed to export model: {e}") + + +@pytest.fixture(scope="session", params=MODEL_CONFIGURATIONS.keys()) +def neuron_model_config(request): + """Expose a pre-trained neuron model + + The fixture first makes sure the 
following model artifacts are present on the hub:
+    - exported neuron model under optimum-internal-testing/neuron-testing-<version>-<sdk_version>-<name>,
+    - cached artifacts under optimum-internal-testing/neuron-testing-cache.
+    If not, it will export the model and push it to the hub.
+
+    It then fetches the model locally and returns a dictionary containing:
+    - a configuration name,
+    - the original model id,
+    - the export parameters,
+    - the neuron model id,
+    - the neuron model local path.
+
+    For each exposed model, the local directory is maintained for the duration of the
+    test session and cleaned up afterwards.
+    The hub model artifacts are never cleaned up and persist across sessions.
+    They must be cleaned up manually when the optimum-neuron version changes.
+
+    """
+    config_name = request.param
+    model_config = copy.deepcopy(MODEL_CONFIGURATIONS[request.param])
+    model_id = model_config["model_id"]
+    export_kwargs = model_config["export_kwargs"]
+    neuron_model_id = get_hub_neuron_model_id(config_name)
+    with TemporaryDirectory() as neuron_model_path:
+        hub = huggingface_hub.HfApi()
+        if hub.repo_exists(neuron_model_id):
+            logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub")
+            hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path)
+        else:
+            export_model(model_id, export_kwargs, neuron_model_path)
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
+            tokenizer.save_pretrained(neuron_model_path)
+            del tokenizer
+            # Create the test model on the hub
+            hub.create_repo(neuron_model_id, private=True)
+            hub.upload_folder(
+                folder_path=neuron_model_path,
+                repo_id=neuron_model_id,
+                ignore_patterns=[NeuronModelForCausalLM.CHECKPOINT_DIR + "/*"],
+            )
+            # Make sure it is cached
+            synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID)
+        # Add dynamic parameters to the model configuration
+        model_config["neuron_model_path"] = neuron_model_path
+        model_config["neuron_model_id"] = neuron_model_id
+        # Also add model configuration name to allow tests to adapt their expectations
+        model_config["name"] = config_name
+        # Yield instead of returning to keep a reference to the temporary directory.
+        # It will go out of scope and be released only once all tests needing the fixture
+        # have been completed.
+        logger.info(f"{config_name} ready for testing ...")
+        yield model_config
+        logger.info(f"Done with {config_name}")
+
+
+@pytest.fixture(scope="module")
+def neuron_model_path(neuron_model_config):
+    yield neuron_model_config["neuron_model_path"]
diff --git a/backends/neuron/tests/fixtures/service.py b/backends/neuron/tests/fixtures/service.py
new file mode 100644
index 00000000..85b0adc5
--- /dev/null
+++ b/backends/neuron/tests/fixtures/service.py
@@ -0,0 +1,240 @@
+import asyncio
+import contextlib
+import logging
+import os
+import random
+import shutil
+import sys
+import tempfile
+import time
+from typing import List
+
+import docker
+import huggingface_hub
+import pytest
+from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
+from docker.errors import NotFound
+from huggingface_hub import AsyncInferenceClient, TextGenerationOutput
+
+
+OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"
+DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "text-generation-inference:latest-neuron")
+HF_TOKEN = huggingface_hub.get_token()
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="[%(asctime)s] %(levelname)s [%(filename)s.%(funcName)s:%(lineno)d] %(message)s",
+    stream=sys.stdout,
+)
+logger = logging.getLogger(__file__)
+
+
+class TestClient(AsyncInferenceClient):
+    def __init__(self, service_name: str, base_url: str):
+        super().__init__(model=base_url)
+        self.service_name = service_name
+
+
+class LauncherHandle:
+    def __init__(self, service_name: str, port: int):
+        self.client = TestClient(service_name, f"http://localhost:{port}")
+
+    def _inner_health(self):
+        raise NotImplementedError
+
+    async def health(self, timeout: int = 60):
+        assert timeout > 0
+        for i in range(timeout):
+            if not self._inner_health():
+                raise RuntimeError(f"Service crashed after {i} seconds.")
+
+            try:
+                await self.client.text_generation("test", max_new_tokens=1)
+                logger.info(f"Service started after {i} seconds")
+                return
+            except (ClientConnectorError, ClientOSError, ServerDisconnectedError):
+                time.sleep(1)
+            except Exception as e:
+                raise RuntimeError(f"Basic generation failed with: {e}")
+        raise RuntimeError(f"Service failed to start after {i} seconds.")
+
+
+class ContainerLauncherHandle(LauncherHandle):
+    def __init__(self, service_name, docker_client, container_name, port: int):
+        super(ContainerLauncherHandle, self).__init__(service_name, port)
+        self.docker_client = docker_client
+        self.container_name = container_name
+        self._log_since = time.time()
+
+    def _inner_health(self) -> bool:
+        container = self.docker_client.containers.get(self.container_name)
+        container_output = container.logs(since=self._log_since).decode("utf-8")
+        self._log_since = time.time()
+        if container_output != "":
+            print(container_output, end="")
+        return container.status in ["running", "created"]
+
+
+@pytest.fixture(scope="module")
+def event_loop():
+    loop = asyncio.get_event_loop()
+    yield loop
+    loop.close()
+
+
+@pytest.fixture(scope="module")
+def launcher(event_loop):
+    """Utility fixture to expose a TGI service.
+
+    The fixture uses a single event loop for each module, but it can create multiple
+    docker services with different parameters using the parametrized inner context.
+
+    Args:
+        service_name (`str`):
+            Used to identify test configurations and adjust test expectations,
+        model_name_or_path (`str`):
+            The model to use (can be a hub model or a path)
+        trust_remote_code (`bool`):
+            Must be set to True for gated models.
+ + Returns: + A `ContainerLauncherHandle` containing both a TGI server and client. + """ + + @contextlib.contextmanager + def docker_launcher( + service_name: str, + model_name_or_path: str, + trust_remote_code: bool = False, + ): + port = random.randint(8000, 10_000) + + client = docker.from_env() + + container_name = f"tgi-tests-{service_name}-{port}" + + try: + container = client.containers.get(container_name) + container.stop() + container.wait() + except NotFound: + pass + + env = {"LOG_LEVEL": "info,text_generation_router=debug", "CUSTOM_CACHE_REPO": OPTIMUM_CACHE_REPO_ID} + + if HF_TOKEN is not None: + env["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN + env["HF_TOKEN"] = HF_TOKEN + + for var in ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "HF_AUTO_CAST_TYPE", "HF_NUM_CORES"]: + if var in os.environ: + env[var] = os.environ[var] + + if os.path.isdir(model_name_or_path): + # Create a sub-image containing the model to workaround docker dind issues preventing + # to share a volume from the container running tests + + docker_tag = f"{container_name}-img" + logger.info( + "Building image on the flight derivated from %s, tagged with %s", + DOCKER_IMAGE, + docker_tag, + ) + with tempfile.TemporaryDirectory() as context_dir: + # Copy model directory to build context + model_path = os.path.join(context_dir, "model") + shutil.copytree(model_name_or_path, model_path) + # Create Dockerfile + container_model_id = f"/data/{model_name_or_path}" + docker_content = f""" + FROM {DOCKER_IMAGE} + COPY model {container_model_id} + """ + with open(os.path.join(context_dir, "Dockerfile"), "wb") as f: + f.write(docker_content.encode("utf-8")) + f.flush() + image, logs = client.images.build(path=context_dir, dockerfile=f.name, tag=docker_tag) + logger.info("Successfully built image %s", image.id) + logger.debug("Build logs %s", logs) + else: + docker_tag = DOCKER_IMAGE + image = None + container_model_id = model_name_or_path + + args = ["--model-id", container_model_id, "--env"] + + if trust_remote_code: + args.append("--trust-remote-code") + + container = client.containers.run( + docker_tag, + command=args, + name=container_name, + environment=env, + auto_remove=False, + detach=True, + devices=["/dev/neuron0"], + ports={"80/tcp": port}, + shm_size="1G", + ) + + logger.info(f"Starting {container_name} container") + yield ContainerLauncherHandle(service_name, client, container.name, port) + + try: + container.stop(timeout=60) + container.wait(timeout=60) + except Exception as e: + logger.exception(f"Ignoring exception while stopping container: {e}.") + pass + finally: + logger.info("Removing container %s", container_name) + try: + container.remove(force=True) + except Exception as e: + logger.error("Error while removing container %s, skipping", container_name) + logger.exception(e) + + # Cleanup the build image + if image: + logger.info("Cleaning image %s", image.id) + try: + image.remove(force=True) + except NotFound: + pass + except Exception as e: + logger.error("Error while removing image %s, skipping", image.id) + logger.exception(e) + + return docker_launcher + + +@pytest.fixture(scope="module") +def generate_load(): + """A utility fixture to launch multiple asynchronous TGI requests in parallel + + Args: + client (`AsyncClient`): + An async client + prompt (`str`): + The prompt to use (identical for all requests) + max_new_tokens (`int`): + The number of tokens to generate for each request. + n (`int`): + The number of requests + + Returns: + A list of `huggingface_hub.TextGenerationOutput`. 
+ """ + + async def generate_load_inner( + client: AsyncInferenceClient, prompt: str, max_new_tokens: int, n: int + ) -> List[TextGenerationOutput]: + futures = [ + client.text_generation(prompt, max_new_tokens=max_new_tokens, details=True, decoder_input_details=True) + for _ in range(n) + ] + + return await asyncio.gather(*futures) + + return generate_load_inner diff --git a/backends/neuron/tests/integration/test_generate.py b/backends/neuron/tests/integration/test_generate.py new file mode 100644 index 00000000..db716be5 --- /dev/null +++ b/backends/neuron/tests/integration/test_generate.py @@ -0,0 +1,96 @@ +import Levenshtein +import pytest + + +@pytest.fixture +async def tgi_service(launcher, neuron_model_config): + model_name_or_path = neuron_model_config["neuron_model_path"] + service_name = neuron_model_config["name"] + with launcher(service_name, model_name_or_path) as tgi_service: + await tgi_service.health(600) + yield tgi_service + + +@pytest.mark.asyncio +async def test_model_single_request(tgi_service): + service_name = tgi_service.client.service_name + prompt = "What is Deep Learning?" + # Greedy bounded without input + response = await tgi_service.client.text_generation( + prompt, max_new_tokens=17, details=True, decoder_input_details=True + ) + assert response.details.generated_tokens == 17 + greedy_expectations = { + "gpt2": "\n\nDeep learning is a new field of research that has been around for a while", + "llama": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use", + "mistral": "\nWhat is Deep Learning?\nDeep Learning is a type of machine learning that", + "qwen2": " - Part 1\n\nDeep Learning is a subset of Machine Learning that is based on", + "granite": "\n\nDeep Learning is a subset of Machine Learning, which is a branch of Art", + } + assert response.generated_text == greedy_expectations[service_name] + + # Greedy bounded with input + response = await tgi_service.client.text_generation( + "What is Deep Learning?", max_new_tokens=17, return_full_text=True, details=True, decoder_input_details=True + ) + assert response.details.generated_tokens == 17 + assert response.generated_text == prompt + greedy_expectations[service_name] + + # Sampling + response = await tgi_service.client.text_generation( + "What is Deep Learning?", + do_sample=True, + top_k=50, + top_p=0.9, + repetition_penalty=1.2, + max_new_tokens=128, + seed=42, + ) + sample_expectations = { + "gpt2": "Deep Learning", + "llama": "Deep Learning", + "mistral": "Deep learning", + "qwen2": "Deep Learning", + "granite": "Deep learning", + } + assert sample_expectations[service_name] in response + + # Sampling with stop sequence + stop_sequence = sample_expectations[service_name][-5:] + response = await tgi_service.client.text_generation( + "What is Deep Learning?", + do_sample=True, + top_k=50, + top_p=0.9, + repetition_penalty=1.2, + max_new_tokens=128, + seed=42, + stop_sequences=[stop_sequence], + ) + assert response.endswith(stop_sequence) + + +@pytest.mark.asyncio +async def test_model_multiple_requests(tgi_service, generate_load): + num_requests = 4 + responses = await generate_load( + tgi_service.client, + "What is Deep Learning?", + max_new_tokens=17, + n=num_requests, + ) + + assert len(responses) == 4 + expectations = { + "gpt2": "\n\nDeep learning is a new field of research that has been around for a while", + "llama": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use", + "mistral": "\nWhat is Deep Learning?\nDeep Learning 
is a type of machine learning that", + "qwen2": " - Part 1\n\nDeep Learning is a subset of Machine Learning that is based on", + "granite": "\n\nDeep Learning is a subset of Machine Learning, which is a branch of Art", + } + expected = expectations[tgi_service.client.service_name] + for r in responses: + assert r.details.generated_tokens == 17 + # Compute the similarity with the expectation using the levenshtein distance + # We should not have more than two substitutions or additions + assert Levenshtein.distance(r.generated_text, expected) < 3 diff --git a/backends/neuron/tests/integration/test_implicit_env.py b/backends/neuron/tests/integration/test_implicit_env.py new file mode 100644 index 00000000..fa88ab67 --- /dev/null +++ b/backends/neuron/tests/integration/test_implicit_env.py @@ -0,0 +1,76 @@ +import os + +import pytest +from huggingface_hub.errors import ValidationError + + +@pytest.fixture(scope="module", params=["hub-neuron", "hub", "local-neuron"]) +async def tgi_service(request, launcher, neuron_model_config): + """Expose a TGI service corresponding to a model configuration + + For each model configuration, the service will be started using the following + deployment options: + - from the hub original model (export parameters chosen after hub lookup), + - from the hub pre-exported neuron model, + - from a local path to the neuron model. + """ + # the tgi_env.py script will take care of setting these + for var in [ + "MAX_BATCH_SIZE", + "MAX_INPUT_TOKENS", + "MAX_TOTAL_TOKENS", + "HF_NUM_CORES", + "HF_AUTO_CAST_TYPE", + ]: + if var in os.environ: + del os.environ[var] + if request.param == "hub": + model_name_or_path = neuron_model_config["model_id"] + elif request.param == "hub-neuron": + model_name_or_path = neuron_model_config["neuron_model_id"] + else: + model_name_or_path = neuron_model_config["neuron_model_path"] + service_name = neuron_model_config["name"] + with launcher(service_name, model_name_or_path) as tgi_service: + await tgi_service.health(600) + yield tgi_service + + +@pytest.mark.asyncio +async def test_model_single_request(tgi_service): + # Just verify that the generation works, and nothing is raised, with several set of params + + # No params + await tgi_service.client.text_generation( + "What is Deep Learning?", + ) + + response = await tgi_service.client.text_generation( + "How to cook beans ?", + max_new_tokens=17, + details=True, + decoder_input_details=True, + ) + assert response.details.generated_tokens == 17 + + # check error + try: + await tgi_service.client.text_generation("What is Deep Learning?", max_new_tokens=170000) + except ValidationError: + pass + else: + raise AssertionError( + "The previous text generation request should have failed, " + "because too many tokens were requested, it succeeded" + ) + + # Sampling + await tgi_service.client.text_generation( + "What is Deep Learning?", + do_sample=True, + top_k=50, + top_p=0.9, + repetition_penalty=1.2, + max_new_tokens=128, + seed=42, + ) diff --git a/backends/neuron/tests/pytest.ini b/backends/neuron/tests/pytest.ini new file mode 100644 index 00000000..2f4c80e3 --- /dev/null +++ b/backends/neuron/tests/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +asyncio_mode = auto diff --git a/backends/neuron/tests/requirements.txt b/backends/neuron/tests/requirements.txt new file mode 100644 index 00000000..ef3c8543 --- /dev/null +++ b/backends/neuron/tests/requirements.txt @@ -0,0 +1,19 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +text-generation >= 0.6.0 +pytest >= 7.4.0 +pytest-asyncio >= 0.21.1 +requests < 2.32.0 +docker >= 6.1.3 +Levenshtein diff --git a/backends/neuron/tests/server/helpers.py b/backends/neuron/tests/server/helpers.py new file mode 100644 index 00000000..81547cb6 --- /dev/null +++ b/backends/neuron/tests/server/helpers.py @@ -0,0 +1,149 @@ +from text_generation_server.generator import NeuronGenerator +from text_generation_server.pb.generate_pb2 import ( + Batch, + NextTokenChooserParameters, + Request, + StoppingCriteriaParameters, +) + + +def create_request( + id: int, + inputs: str, + truncate: int = 0, + max_new_tokens: int = 20, + do_sample: bool = False, + top_k: int = 50, + top_p: float = 0.9, + temperature: float = 1.0, + seed: int = 42, + repetition_penalty: float = 1.0, +): + parameters = NextTokenChooserParameters( + temperature=temperature, + top_k=top_k, + top_p=top_p, + do_sample=do_sample, + seed=seed, + repetition_penalty=repetition_penalty, + ) + stopping_parameters = StoppingCriteriaParameters(max_new_tokens=max_new_tokens) + return Request( + id=id, inputs=inputs, truncate=truncate, parameters=parameters, stopping_parameters=stopping_parameters + ) + + +def check_prefill(input_text, expected_token_id, expected_token_text, do_sample, batch_size, model_path): + """Verify that a prefill for a single request generates the expected output.""" + generator = NeuronGenerator.from_pretrained(model_path) + assert generator.model.batch_size >= batch_size + requests = [] + max_new_tokens = 20 + for i in range(batch_size): + requests.append(create_request(id=0, inputs=input_text, do_sample=do_sample, max_new_tokens=max_new_tokens)) + # Let's be pessimistic when estimating max_tokens + batch_size * (len(input_text) + max_new_tokens) + max_length = generator.model.max_length + batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length) + generations, next_batch = generator.prefill(batch) + assert next_batch.size == batch_size + # Whatever was passed as max_tokens, the server will correct it + # because of static batching + assert next_batch.max_tokens == batch_size * max_length + assert len(generations) == batch_size + for g in generations: + tokens = g.tokens + assert tokens.ids == [expected_token_id] + assert tokens.texts == [expected_token_text] + + +def check_decode_single(input_text, max_new_tokens, generated_text, do_sample, model_path): + """Verify that a decoding for a single request generates the expected output.""" + generator = NeuronGenerator.from_pretrained(model_path) + request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample) + max_length = generator.model.max_length + batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length) + generations, next_batch = generator.prefill(batch) + # We already generated one token: call decode max_new_tokens - 1 times + for _ in range(max_new_tokens - 1): + assert next_batch.size == 1 + assert next_batch.max_tokens 
== max_length + assert len(generations) == 1 + assert len(generations[0].tokens.ids) == 1 + generations, next_batch = generator.decode([next_batch]) + assert next_batch is None + assert len(generations) == 1 + output = generations[0].generated_text + assert output.generated_tokens == max_new_tokens + assert output.finish_reason == 0 + assert output.text == generated_text + + +def check_decode_multiple(model_path): + """Verify that two requests added to the batch at different generation steps + generate the same outputs (continuous batching). + """ + generator = NeuronGenerator.from_pretrained(model_path) + assert generator.model.batch_size > 1 + input_text = "Once upon a time" + max_new_tokens = 20 + # Prefill a single request, remembering the generated token + tokens = {0: [], 1: []} + request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens) + max_length = generator.model.max_length + batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length) + generations, next_batch = generator.prefill(batch) + assert next_batch.size == 1 + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert len(tokens[0]) == 1 + # Decode a few tokens + gen_tokens = 4 + for _ in range(gen_tokens - 1): + generations, next_batch = generator.decode([next_batch]) + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert len(tokens[0]) == gen_tokens + assert next_batch.size == 1 + # Add a second request + request = create_request(id=1, inputs=input_text, max_new_tokens=max_new_tokens) + batch = Batch(id=1, requests=[request], size=1, max_tokens=max_length) + generations, next_batch_1 = generator.prefill(batch) + assert next_batch_1.size == 1 + # We should have generated only a single token + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert len(tokens[0]) == gen_tokens + assert len(tokens[1]) == 1 + # Decode more tokens until we reach the maximum for the first request + batches = [next_batch, next_batch_1] + for _ in range(max_new_tokens - gen_tokens): + generations, next_batch = generator.decode(batches) + for g in generations: + tokens[g.request_id].append(g.tokens.ids[0]) + batches = [next_batch] + # Verify we now only have one pending request + assert next_batch.size == 1 + assert len(tokens[0]) == max_new_tokens + assert len(tokens[1]) == max_new_tokens - gen_tokens + 1 + # Verify we have the output for the first request + for g in generations: + if g.request_id == 0: + output = g.generated_text + assert output.text != "" + assert output.generated_tokens == max_new_tokens + generated_text = output.text + # Continue decoding until the end of the second request + for _ in range(gen_tokens - 1): + generations, next_batch = generator.decode([next_batch]) + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert next_batch is None + output = generations[0].generated_text + assert output.generated_tokens == max_new_tokens + assert tokens[0] == tokens[1] + assert output.text == generated_text diff --git a/backends/neuron/tests/server/test_continuous_batching.py b/backends/neuron/tests/server/test_continuous_batching.py new file mode 100644 index 00000000..48bb70cc --- /dev/null +++ b/backends/neuron/tests/server/test_continuous_batching.py @@ -0,0 +1,74 @@ +from helpers import create_request +from text_generation_server.generator import NeuronGenerator +from 
text_generation_server.pb.generate_pb2 import Batch + + +def test_continuous_batching_two_requests(neuron_model_config): + """Verify that two requests added to the batch at different generation steps + generate the same outputs (continuous batching). + """ + neuron_model_path = neuron_model_config["neuron_model_path"] + generator = NeuronGenerator.from_pretrained(neuron_model_path) + assert generator.model.batch_size > 1 + input_text = "Once upon a time" + max_new_tokens = 20 + # Prefill a single request, remembering the generated token + tokens = {0: [], 1: []} + request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens) + max_length = generator.model.max_length + batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length) + generations, next_batch = generator.prefill(batch) + assert next_batch.size == 1 + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert len(tokens[0]) == 1 + # Decode a few tokens + gen_tokens = 4 + for _ in range(gen_tokens - 1): + generations, next_batch = generator.decode([next_batch]) + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert len(tokens[0]) == gen_tokens + assert next_batch.size == 1 + # Add a second request + request = create_request(id=1, inputs=input_text, max_new_tokens=max_new_tokens) + batch = Batch(id=1, requests=[request], size=1, max_tokens=max_length) + generations, next_batch_1 = generator.prefill(batch) + assert next_batch_1.size == 1 + # We should have generated only a single token + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert len(tokens[0]) == gen_tokens + assert len(tokens[1]) == 1 + # Decode more tokens until we reach the maximum for the first request + batches = [next_batch, next_batch_1] + for _ in range(max_new_tokens - gen_tokens): + generations, next_batch = generator.decode(batches) + for g in generations: + tokens[g.request_id].append(g.tokens.ids[0]) + batches = [next_batch] + # Verify we now only have one pending request + assert next_batch.size == 1 + assert len(tokens[0]) == max_new_tokens + assert len(tokens[1]) == max_new_tokens - gen_tokens + 1 + # Verify we have the output for the first request + for g in generations: + if g.request_id == 0: + output = g.generated_text + assert output.text != "" + assert output.generated_tokens == max_new_tokens + generated_text = output.text + # Continue decoding until the end of the second request + for _ in range(gen_tokens - 1): + generations, next_batch = generator.decode([next_batch]) + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert next_batch is None + output = generations[0].generated_text + assert output.generated_tokens == max_new_tokens + assert tokens[0] == tokens[1] + assert output.text == generated_text diff --git a/backends/neuron/tests/server/test_decode.py b/backends/neuron/tests/server/test_decode.py new file mode 100644 index 00000000..2ab4c2da --- /dev/null +++ b/backends/neuron/tests/server/test_decode.py @@ -0,0 +1,55 @@ +from helpers import create_request +from text_generation_server.generator import NeuronGenerator +from text_generation_server.pb.generate_pb2 import Batch + + +def test_decode(neuron_model_config): + """Verify that a decoding for a single request generates the expected output.""" + config_name = neuron_model_config["name"] + neuron_model_path = neuron_model_config["neuron_model_path"] + 
generator = NeuronGenerator.from_pretrained(neuron_model_path) + for do_sample in [True, False]: + mode = "sample" if do_sample else "greedy" + print(f"{config_name}[{mode}]") + _test_decode(config_name, generator, do_sample) + generator.clear() + + +def _test_decode(config_name, generator, do_sample): + input_text = "It was a bright cold day in April, and the clocks were striking thirteen." + max_new_tokens = 20 + request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample) + max_length = generator.model.max_length + batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length) + generations, next_batch = generator.prefill(batch) + # We already generated one token: call decode max_new_tokens - 1 times + for _ in range(max_new_tokens - 1): + assert next_batch.size == 1 + assert next_batch.max_tokens == max_length + assert len(generations) == 1 + assert len(generations[0].tokens.ids) == 1 + generations, next_batch = generator.decode([next_batch]) + assert next_batch is None + assert len(generations) == 1 + output = generations[0].generated_text + assert output.generated_tokens == max_new_tokens + assert output.finish_reason == 0 + if do_sample: + expected_text = { + "gpt2": " The sun was set", + "llama": "George Orwell, 1984", + "mistral": "The sky was", + "qwen2": " A young woman with", + "granite": "1984, George Orwell", + }[config_name] + assert expected_text in output.text + else: + print(output.text) + expected_text = { + "gpt2": '\n\n"I\'m going to go to bed," I said.\n\n"I\'m going', + "llama": " George Orwell’s classic dystopian novel, 1984, begins with this ominous sentence. The story", + "mistral": "\nThe clocks were striking thirteen.\nThe clocks were striking thirteen.", + "qwen2": " I was sitting in my room, staring at the ceiling, when the door opened and in came a", + "granite": "\n\nThis opening line from George Orwell's dystopian novel \"198", + }[config_name] + assert output.text == expected_text diff --git a/backends/neuron/tests/server/test_generator_slot.py b/backends/neuron/tests/server/test_generator_slot.py new file mode 100644 index 00000000..459ee3e5 --- /dev/null +++ b/backends/neuron/tests/server/test_generator_slot.py @@ -0,0 +1,61 @@ +import pytest +import torch +from text_generation_server.generator import Slot +from text_generation_server.pb.generate_pb2 import Request +from transformers import AutoTokenizer, GenerationConfig + + +TOKENIZERS = ["NousResearch/Llama-2-7b-hf", "gpt2"] + + +@pytest.fixture(params=TOKENIZERS) +def tokenizer(request): + t = AutoTokenizer.from_pretrained(request.param) + t.padding_side = "left" + t.pad_token_id = t.eos_token_id + return t + + +@pytest.mark.parametrize( + "input_text, generated_text", + [ + [ + "It was a bright cold day in April, and the clocks were striking thirteen.", + " Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind," + " slipped quickly through the glass doors of Victory Mansions, though not quickly enough" + " to prevent a swirl of gritty dust from entering along with him.", + ], + ["This sentence is written in chinese:", "我很感谢你的热情"], + ["Some text might contain a lot of emojis like 😃", "😍💪 👉 👀"], + ], + ids=["spaces", "chinese-utf8", "emojis"], +) +def test_decode_streaming(tokenizer, input_text, generated_text): + slot = Slot(0, tokenizer) + request = Request(id=0, inputs=input_text) + slot.assign(0, request, GenerationConfig()) + assert slot.cached_text == input_text + + inputs = tokenizer(input_text, 
padding="max_length", max_length=len(input_text) + 1, return_tensors="pt") + input_ids = inputs["input_ids"][0] + attention_mask = inputs["attention_mask"][0] + generated_tokens = tokenizer(generated_text, add_special_tokens=False)["input_ids"] + + # We need to regenerate the full text as the tokenizer might change it (extra spaces might be added) + all_input_ids = torch.cat([input_ids, torch.tensor(generated_tokens)]) + full_text = tokenizer.decode(all_input_ids, skip_special_tokens=True) + regenerated_text = full_text[len(input_text) :] + + # Initialize the slot with the inputs + slot.reset(input_ids, attention_mask, selector=None) + + assert slot.generated_tokens == 0 + + # Simulate an iterative generation (i.e. don't call select and use known tokens instead) + decoded_text = "" + for i in range(len(generated_tokens)): + text = slot.append(generated_tokens[i]) + assert slot.generated_tokens == i + 1 + decoded_text += text + + assert decoded_text == regenerated_text diff --git a/backends/neuron/tests/server/test_info.py b/backends/neuron/tests/server/test_info.py new file mode 100644 index 00000000..5913acec --- /dev/null +++ b/backends/neuron/tests/server/test_info.py @@ -0,0 +1,10 @@ +from text_generation_server.generator import NeuronGenerator + + +def test_info(neuron_model_path): + generator = NeuronGenerator.from_pretrained(neuron_model_path) + info = generator.info + assert info.requires_padding is True + assert info.device_type == "xla" + assert info.window_size == 0 + assert info.speculate == 0 diff --git a/backends/neuron/tests/server/test_prefill.py b/backends/neuron/tests/server/test_prefill.py new file mode 100644 index 00000000..2120e5c5 --- /dev/null +++ b/backends/neuron/tests/server/test_prefill.py @@ -0,0 +1,89 @@ +from helpers import create_request +from text_generation_server.generator import NeuronGenerator +from text_generation_server.pb.generate_pb2 import Batch + + +def test_prefill(neuron_model_config): + """Verify that a prefill for a single request generates the expected output.""" + config_name = neuron_model_config["name"] + neuron_model_path = neuron_model_config["neuron_model_path"] + generator = NeuronGenerator.from_pretrained(neuron_model_path) + max_batch_size = 4 + assert generator.model.batch_size >= max_batch_size + for num_requests in [1, max_batch_size]: + for do_sample in [True, False]: + mode = "sample" if do_sample else "greedy" + print(f"[{mode}]: {num_requests} requests") + _test_prefill(config_name, generator, num_requests, do_sample) + generator.clear() + + +def _test_prefill(config_name, generator, batch_size, do_sample): + requests = [] + max_new_tokens = 20 + input_text = "It was a bright cold day in April, and the clocks were striking thirteen." 
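+    # All requests below reuse the same prompt; only the request id varies, so the
+    # expected first token depends solely on the model and the sampling mode.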
+ for i in range(batch_size): + requests.append(create_request(id=i, inputs=input_text, do_sample=do_sample, max_new_tokens=max_new_tokens)) + # Let's be pessimistic when estimating max_tokens + max_length = generator.model.max_length + batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length) + generations, next_batch = generator.prefill(batch) + assert next_batch.size == batch_size + # Whatever was passed as max_tokens, the server will correct it + # because of static batching + assert next_batch.max_tokens == batch_size * max_length + assert len(generations) == batch_size + if do_sample: + expectations = { + "gpt2": [383, " The"], + "llama": [10058, " George"], + "mistral": [450, " The"], + "qwen2": [362, " A"], + "granite": [308, " ("], + }[config_name] + else: + expectations = { + "gpt2": [198, "\n"], + "llama": [10058, " George"], + "mistral": [13, "\n"], + "qwen2": [358, " I"], + "granite": [203, "\n"], + }[config_name] + for g in generations: + tokens = g.tokens + assert tokens.ids[0] == expectations[0] + assert tokens.texts[0] == expectations[1] + + +def test_prefill_truncate(neuron_model_config): + config_name = neuron_model_config["name"] + neuron_model_path = neuron_model_config["neuron_model_path"] + generator = NeuronGenerator.from_pretrained(neuron_model_path) + batch_size = generator.model.batch_size + # We apply truncation to all requests but the first one + truncate = [ + None, + ] + [i * 3 for i in range(1, batch_size)] + input_text = ( + "Two gin-scented tears trickled down the sides of his nose." + " But it was all right, everything was all right, the struggle was finished." + " He had won the victory over himself. He loved Big Brother." + ) + requests = [] + for i in range(batch_size): + requests.append(create_request(id=i, inputs=input_text, truncate=truncate[i])) + max_length = generator.model.max_length + batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length) + generations, _ = generator.prefill(batch) + # Even if the input text is identical for all requests, the first generated token might + # be different because of the truncation + expectations = { + "gpt2": [" He", " He", "\n", " He"], + "llama": [" —", " The", " He", " He"], + "mistral": [" He", "\n", " He", " He"], + "qwen2": [" He", " The", " He", " He"], + "granite": ["\n", "\n", " I", " He"], + }[config_name] + for i, g in enumerate(generations): + tokens = g.tokens + assert tokens.texts[0] == expectations[i]
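
Note: the fixtures added by this patch are designed to compose. Below is a minimal sketch (not part of the patch; the test name and prompt are illustrative only) of how a new integration test could combine the `launcher`, `neuron_model_config` and `generate_load` fixtures defined above:

    import pytest


    @pytest.fixture
    async def tgi_service(launcher, neuron_model_config):
        # Start a TGI container for the pre-exported neuron model and wait until it answers.
        with launcher(neuron_model_config["name"], neuron_model_config["neuron_model_path"]) as service:
            await service.health(600)
            yield service


    @pytest.mark.asyncio
    async def test_parallel_requests(tgi_service, generate_load):
        # Fire two identical requests in parallel and check the token budget is honored.
        responses = await generate_load(tgi_service.client, "What is Deep Learning?", max_new_tokens=10, n=2)
        assert len(responses) == 2
        for r in responses:
            assert r.details.generated_tokens == 10

The server tests run against a locally exported model via `make -C backends/neuron test_server`, while the container tests above require the docker image and run via `make -C backends/neuron test_integration`, as wired up in the Makefile changes at the top of this patch.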