feat: add neuron backend

David Corvoysier 2025-02-11 09:53:16 +00:00
parent 76bcb4948d
commit 9c25afb832
18 changed files with 1788 additions and 0 deletions

173 Dockerfile.neuron Normal file

@@ -0,0 +1,173 @@
# Fetch and extract the TGI sources
FROM alpine AS tgi
RUN mkdir -p /tgi
# Fetch the optimum-neuron sources directly to avoid relying on pypi deployments
FROM alpine AS optimum-neuron
RUN mkdir -p /optimum-neuron
ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.0.28.tar.gz /optimum-neuron/sources.tar.gz
RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1
# Build cargo components (adapted from TGI original Dockerfile)
# Note: we cannot use the cargo-chef base image as it uses python 3.11
FROM ubuntu:22.04 AS chef
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
curl ca-certificates build-essential \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.80.1 --profile minimal -y
ENV PATH="/root/.cargo/bin:${PATH}"
RUN cargo install cargo-chef --locked
WORKDIR /usr/src
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
FROM chef AS planner
COPY backends/neuron/Cargo.toml Cargo.toml
COPY Cargo.lock Cargo.lock
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json
FROM chef AS builder
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
unzip python3-dev libssl-dev pkg-config \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
rm -f $PROTOC_ZIP
COPY backends/neuron/Cargo.toml Cargo.toml
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --release --recipe-path recipe.json
COPY Cargo.lock Cargo.lock
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY router router
COPY backends backends
COPY launcher launcher
# Remove this line once TGI has fixed the conflict
RUN cargo update ureq --precise 2.9.7
RUN cargo build --release
# Python base image
FROM ubuntu:22.04 AS base
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
python3-pip \
python3-setuptools \
python-is-python3 \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
RUN pip3 --no-cache-dir install --upgrade pip
# Python server build image
FROM base AS pyserver
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
make \
python3-venv \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
RUN install -d /pyserver
WORKDIR /pyserver
COPY backends/neuron/server server
COPY proto proto
RUN pip3 install -r server/build-requirements.txt
RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto make -C server package
# Neuron base image (used for deployment)
FROM base AS neuron
# Install system prerequisites
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
gnupg2 \
wget \
python3-dev \
libexpat1 \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
RUN echo "deb https://apt.repos.neuron.amazonaws.com jammy main" > /etc/apt/sources.list.d/neuron.list
RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
# Install neuronx packages
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
aws-neuronx-dkms=2.18.20.0 \
aws-neuronx-collectives=2.22.33.0-d2128d1aa \
aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 \
aws-neuronx-tools=2.19.0.0 \
libxml2 \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}"
RUN pip3 install \
neuronx-cc==2.15.143.0 \
torch-neuronx==2.1.2.2.3.2 \
transformers-neuronx==0.12.313 \
neuronx-distributed==0.9.0 \
libneuronxla==2.0.5347.0 \
--extra-index-url=https://pip.repos.neuron.amazonaws.com
# Install HuggingFace packages
RUN pip3 install \
hf_transfer huggingface_hub
# Install optimum-neuron
COPY --from=optimum-neuron /optimum-neuron optimum-neuron
RUN pip3 install ./optimum-neuron
# TGI base env
ENV HUGGINGFACE_HUB_CACHE=/tmp \
HF_HUB_ENABLE_HF_TRANSFER=1 \
PORT=80
# Disable color logs as they are not supported by CloudWatch
ENV LOGURU_COLORIZE=NO
ENV LOG_COLORIZE=0
# Install router
COPY --from=builder /usr/src/target/release/text-generation-router-v2 /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
# Install python server
COPY --from=pyserver /pyserver/build/dist dist
RUN pip install dist/text_generation_server*.tar.gz
# AWS Sagemaker compatible image
FROM neuron AS sagemaker
COPY backends/neuron/sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh
ENTRYPOINT ["./entrypoint.sh"]
# Final image
FROM neuron
COPY backends/neuron/tgi_env.py /tgi_env.py
COPY backends/neuron/tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh
ENTRYPOINT ["/tgi-entrypoint.sh"]

@@ -0,0 +1,47 @@
[workspace]
members = [
"backends/v2",
"backends/grpc-metadata",
"launcher",
"router"
]
default-members = [
"backends/v2",
"backends/grpc-metadata",
"launcher",
"router"
]
resolver = "2"
[workspace.package]
version = "3.0.0"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"
[workspace.dependencies]
base64 = "0.22.0"
tokenizers = { version = "0.20.0", features = ["http"] }
hf-hub = { version = "0.3.1", features = ["tokio"] }
metrics = { version = "0.23.0" }
metrics-exporter-prometheus = { version = "0.15.1", features = [] }
minijinja = { version = "2.2.0", features = ["json"] }
minijinja-contrib = { version = "2.0.2", features = ["pycompat"] }
pyo3 = { version = "0.22.2", features = ["auto-initialize"] }
[profile.release]
incremental = true
[profile.release-binary]
inherits = "release"
debug = 1
incremental = true
panic = "abort"
[profile.release-opt]
inherits = "release"
debug = 0
incremental = false
lto = "fat"
opt-level = 3
codegen-units = 1

28 backends/neuron/Makefile Normal file

@@ -0,0 +1,28 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
mkfile_dir := $(dir $(mkfile_path))
root_dir := "${mkfile_dir}/../.."
.PHONY: image
VERSION := $(shell gawk 'match($$0, /^version = "(.*)"/, a) {print a[1]}' ${root_dir}/Cargo.toml)
image:
docker build --rm -f ${root_dir}/Dockerfile.neuron \
--build-arg VERSION=$(VERSION) \
--ulimit nofile=100000:100000 \
-t text-generation-inference:$(VERSION)-neuron ${root_dir}
docker tag text-generation-inference:$(VERSION)-neuron text-generation-inference:latest-neuron

25 backends/neuron/README.md Normal file

@@ -0,0 +1,25 @@
# Text-generation-inference - Neuron backend for AWS Trainium and Inferentia2
## Description
This is the TGI backend for the AWS Trainium and Inferentia family of chips.
This backend is composed of:
- the AWS Neuron SDK,
- the legacy v2 TGI launcher and router,
- a neuron-specific inference server for text-generation.
## Usage
Please refer to the official [documentation](https://huggingface.co/docs/text-generation-inference/backends/neuron).
## Build your own image
The simplest way to build TGI with the neuron backend is to use the provided `Makefile`:
```shell
$ make -C backends/neuron image
```
Alternatively, you can build the image directly from the top directory using a command similar to the one defined
in the `Makefile` under the `image` target.

@@ -0,0 +1,22 @@
#!/bin/bash
if [[ -z "${HF_MODEL_ID}" ]]; then
echo "HF_MODEL_ID must be set"
exit 1
fi
export MODEL_ID="${HF_MODEL_ID}"
if [[ -n "${HF_MODEL_REVISION}" ]]; then
export REVISION="${HF_MODEL_REVISION}"
fi
if [[ -n "${HF_MODEL_TRUST_REMOTE_CODE}" ]]; then
export TRUST_REMOTE_CODE="${HF_MODEL_TRUST_REMOTE_CODE}"
fi
if [[ -z "${MAX_BATCH_SIZE}" ]]; then
echo "MAX_BATCH_SIZE must be set to the model static batch size"
exit 1
fi
text-generation-launcher --port 8080

1 backends/neuron/server/.gitignore vendored Normal file

@@ -0,0 +1 @@
build

@@ -0,0 +1,57 @@
# Initialize base variables
pkg_name := text_generation_server
BUILDDIR ?= $(CURDIR)/build
VERSION ?= 0.0.1
mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
mkfile_dir := $(dir $(mkfile_path))
pkg_dir := $(BUILDDIR)/$(pkg_name)
pkg_dist := ${BUILDDIR}/dist/${pkg_name}-${VERSION}.tar.gz
clean:
rm -rf $(BUILDDIR)/*
# List static sources to be deployed in the package
src_dir := $(mkfile_dir)/$(pkg_name)
sources := $(wildcard $(src_dir)/*.py)
deployed_sources := $(subst $(src_dir), $(pkg_dir), $(sources))
# Static files are just copied
define COPY
cp -f $< $@
endef
$(BUILDDIR)/pyproject.toml: $(mkfile_dir)/pyproject.toml
mkdir -p $(BUILDDIR)
$(COPY)
sed -i -e 's/version = "VERSION"/version = \"${VERSION}\"/' $@
$(pkg_dir)/%.py: $(src_dir)/%.py
mkdir -p $(pkg_dir)
$(COPY)
# Generated files are produced by grpcio tools
# If not provided, get local proto files
ifndef PROTODIR
PROTODIR := $(mkfile_dir)/../../../proto
endif
# Three python files are generated for each protobuf
protobufs := $(PROTODIR)/generate.proto
pkg_pb_dir := $(pkg_dir)/pb
generated_sources_base := $(foreach proto, $(protobufs), $(proto:.proto=_pb2.py))
generated_sources := $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base))
generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=.pyi))
generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=_grpc.py))
$(pkg_pb_dir)/%_pb2.py $(pkg_pb_dir)/%_pb2.pyi $(pkg_pb_dir)/%_pb2_grpc.py: $(PROTODIR)/%.proto
mkdir -p $(pkg_pb_dir)
python -m grpc_tools.protoc -I$(PROTODIR) --python_out=$(pkg_pb_dir) \
--grpc_python_out=$(pkg_pb_dir) --mypy_out=$(pkg_pb_dir) $^
sed -i -e 's/^\(import.*pb2\)/from . \1/g' $(pkg_pb_dir)/$*_pb2_grpc.py
${pkg_dist}: $(BUILDDIR)/pyproject.toml $(deployed_sources) $(generated_sources)
python -m build $(BUILDDIR)
package: ${pkg_dist}

@@ -0,0 +1,3 @@
build
grpcio-tools==1.53.0
mypy-protobuf

@@ -0,0 +1,25 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
[project]
name = "text-generation-server"
version = "VERSION"
authors = [{name="David Corvoysier", email="david@huggingface.co" }]
description = "TGI compatible inference server for AWS Neuronx platforms"
dependencies = [
'protobuf > 3.20.1, < 4',
'grpcio == 1.57.0',
'grpcio-status == 1.48.2',
'grpcio-reflection == 1.48.2',
'grpc-interceptor == 0.15.2',
'typer == 0.6.1',
'safetensors',
'loguru == 0.6.0'
]
[tool.setuptools]
packages = ["text_generation_server", "text_generation_server.pb"]
[project.scripts]
text-generation-server = 'text_generation_server.cli:app'

@@ -0,0 +1,111 @@
import sys
from typing import Optional
import typer
from loguru import logger
app = typer.Typer()
@app.command()
def serve(
model_id: str,
revision: Optional[str] = None,
sharded: bool = False,
trust_remote_code: Optional[bool] = None,
uds_path: str = "/tmp/text-generation-server",
logger_level: str = "INFO",
json_output: bool = False,
otlp_endpoint: Optional[str] = None,
otlp_service_name: str = "text-generation-inference.server",
max_input_tokens: Optional[int] = None,
):
"""This is the main entry-point for the server CLI.
Args:
model_id (`str`):
The *model_id* of a model on the HuggingFace hub or the path to a local model.
revision (`Optional[str]`, defaults to `None`):
The revision of the model on the HuggingFace hub.
sharded (`bool`):
Whether the model must be sharded or not. Kept for compatibility with the
text-generation-launcher, but must be set to False.
trust_remote_code (`Optional[bool]`, defaults to `None`):
Kept for compatibility with text-generation-launcher. Ignored.
uds_path (`Union[Path, str]`):
The local path on which the server will expose its google RPC services.
logger_level (`str`):
The server logger level. Defaults to *INFO*.
json_output (`bool`):
Use JSON format for log serialization.
otlp_endpoint (`Optional[str]`, defaults to `None`):
The Open Telemetry endpoint to use.
otlp_service_name (`str`, defaults to `text-generation-inference.server`):
The name to use when pushing data to the Open Telemetry endpoint.
max_input_tokens (`Optional[int]`, defaults to `None`):
The maximum number of input tokens each request should contain.
"""
if sharded:
raise ValueError("Sharding is not supported.")
# Remove default handler
logger.remove()
logger.add(
sys.stdout,
format="{message}",
filter="text_generation_server",
level=logger_level,
serialize=json_output,
backtrace=True,
diagnose=False,
)
if trust_remote_code is not None:
logger.warning("'trust_remote_code' argument is not supported and will be ignored.")
# Import here after the logger is added to log potential import exceptions
from .server import serve
serve(model_id, revision, uds_path)
@app.command()
def download_weights(
model_id: str,
revision: Optional[str] = None,
logger_level: str = "INFO",
json_output: bool = False,
auto_convert: Optional[bool] = None,
extension: Optional[str] = None,
trust_remote_code: Optional[bool] = None,
merge_lora: Optional[bool] = None,
):
"""Download the model weights.
This command will be called by text-generation-launcher before serving the model.
"""
# Remove default handler
logger.remove()
logger.add(
sys.stdout,
format="{message}",
filter="text_generation_server",
level=logger_level,
serialize=json_output,
backtrace=True,
diagnose=False,
)
if extension is not None:
logger.warning("'extension' argument is not supported and will be ignored.")
if trust_remote_code is not None:
logger.warning("'trust_remote_code' argument is not supported and will be ignored.")
if auto_convert is not None:
logger.warning("'auto_convert' argument is not supported and will be ignored.")
if merge_lora is not None:
logger.warning("'merge_lora' argument is not supported and will be ignored.")
# Import here after the logger is added to log potential import exceptions
from .model import fetch_model
fetch_model(model_id, revision)
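
A quick way to sanity-check the CLI wiring above is typer's test runner (a sketch; it assumes the package is installed and importable as `text_generation_server`, and note that typer exposes `download_weights` as `download-weights`):

```python
from typer.testing import CliRunner

from text_generation_server.cli import app

runner = CliRunner()
result = runner.invoke(app, ["serve", "--help"])
print(result.output)  # lists the serve options documented above
```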

@@ -0,0 +1,636 @@
import copy
import logging
import time
from abc import ABC
from enum import Enum
from typing import List, Optional, Tuple
import torch
from loguru import logger
from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase
from transformers.generation import GenerationConfig
from optimum.neuron import NeuronModelForCausalLM
from optimum.neuron.generation import TokenSelector
from .model import get_export_kwargs_from_env
from .pb.generate_pb2 import (
Batch,
CachedBatch,
FinishReason,
GeneratedText,
Generation,
InfoResponse,
Request,
Tokens,
)
# Disable optimum-neuron warnings as it seems to block the server after a while
optimum_logger = logging.getLogger("optimum.neuron")
optimum_logger.setLevel("CRITICAL")
class Generator(ABC):
"""An abstract class to represent the workhorse behind TextGenerationService.
Ideally, it should not rely on protobuf constructs, but in a first step it does.
Implementations would typically need a model and a tokenizer to implement the Generator methods.
"""
@property
def info(self) -> InfoResponse:
"""This should simply return the expected InfoResponse"""
raise NotImplementedError
def warmup(self, batch: Batch) -> int:
"""Verify if the hardware can support the target load.
Args:
batch (`Batch`):
A batch corresponding to the maximum number of concurrent requests.
Return:
The maximum number of tokens the model supports.
"""
raise NotImplementedError
def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
"""Prefill is called whenever new requests need to be added.
When this method returns successfully, a decode method will follow
with both the current and newly prefilled batch(es).
Args:
batch (`Batch`):
A batch containing the new requests.
Return:
A list of `Generation` for each request and a `CachedBatch` containing all pending requests.
"""
raise NotImplementedError
def decode(self, batches: List[Batch]) -> Tuple[List[Generation], CachedBatch]:
"""Decode after a prefill or another decode."""
raise NotImplementedError
def filter(self, batch_id: int, request_ids: List[int]) -> CachedBatch:
"""Remove requests that are not listed from the specified batch"""
raise NotImplementedError
def clear(self):
"""Remove all requests from the generator"""
raise NotImplementedError
@classmethod
def from_pretrained(cls, model_id: str, revision: Optional[str]):
"""Factory method "a la transformers" """
raise NotImplementedError
class Slot:
"""Represents a slot in a static batch"""
class State(Enum):
EMPTY = 0
PAUSE = 1
READY = 2
def __init__(self, id: int, tokenizer: PreTrainedTokenizerBase):
self._id = id
self._tokenizer = tokenizer
self.clear()
def clear(self):
"""Clear the slot and mark it as available."""
self._state = Slot.State.EMPTY
self._batch_id = None
self._request_id = None
self._inputs = ""
self._truncate = 0
self._generation_config = None
self._tokens = []
self._mask = torch.tensor([])
self._selector = None
self._generated_tokens = 0
self._next_text_token_start = 0
self._next_text_token_end = 0
self._generated_text = ""
self._next_text = ""
@property
def id(self) -> int:
return self._id
@property
def state(self) -> "Slot.State":
return self._state
@property
def batch_id(self) -> int:
return self._batch_id
@property
def request_id(self) -> int:
return self._request_id
@property
def cached_text(self) -> str:
return self._inputs + self._generated_text
@property
def generation_config(self) -> GenerationConfig:
return self._generation_config
@property
def generated_tokens(self) -> int:
return self._generated_tokens
def assign(self, batch_id: int, request: Request, generation_config: GenerationConfig):
"""Assign a request to a slot.
Args:
request (`Request`):
The request to be assigned. Contains the inputs and tokens selection parameters.
generation_config (`transformers.GenerationConfig`):
The base generation config (might be modified by the request generation parameters).
"""
self._state = Slot.State.READY
self._batch_id = batch_id
self._request_id = request.id
self._inputs = request.inputs
if request.truncate:
self._truncate = request.truncate
self._generation_config = copy.deepcopy(generation_config)
# Update generation config with request parameters
self._generation_config.do_sample = request.parameters.do_sample
if self._generation_config.do_sample:
if request.parameters.temperature != 0:
self._generation_config.temperature = request.parameters.temperature
if request.parameters.top_k != 0:
self._generation_config.top_k = request.parameters.top_k
if request.parameters.top_p != 0:
self._generation_config.top_p = request.parameters.top_p
if request.parameters.typical_p != 0:
self._generation_config.typical_p = request.parameters.typical_p
if request.parameters.repetition_penalty != 0:
self._generation_config.repetition_penalty = request.parameters.repetition_penalty
self.seed = request.parameters.seed
self._generation_config.max_new_tokens = request.stopping_parameters.max_new_tokens
self._max_new_tokens = self._generation_config.max_new_tokens
stop_strings = request.stopping_parameters.stop_sequences
if stop_strings:
self._generation_config.stop_strings = stop_strings
def reset(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor, selector: TokenSelector):
"""Reset the slot for the next generation.
Args:
input_ids: (`torch.LongTensor`):
The new input_ids to use to generate the next token.
attention_mask: (`torch.LongTensor`):
The new attention_mask to use to generate the next token.
selector: (`optimum.neuron.generation.TokenSelector`):
An object implementing the updated token selection logic.
"""
self._tokens = input_ids.clone()
self._next_text_token_start = 0
self._next_text_token_end = torch.numel(self._tokens)
self._next_text = ""
self._mask = attention_mask.clone()
self._selector = selector
def pause(self, reset_on_pause: bool):
"""Mark the current slot as paused for generation.
Note that the KV cache for this slot will still be filled.
"""
if reset_on_pause:
# Drop the last token as it will be added back when resuming the slot
self._generated_tokens -= 1
# Since generated tokens are now part of the prefill, we need to reevaluate
# max_new_tokens for the next generation
self._generation_config.max_new_tokens = self._max_new_tokens - self._generated_tokens
self._state = Slot.State.PAUSE
def resume(self):
"""Mark the slot as ready for generation."""
self._state = Slot.State.READY
def _decode_next_tokens(
self,
) -> str:
"""Hack to hopefully support generate_stream for the maximum number of tokenizers"""
# We need to include the tokens that produced the last text to defeat cleanup algorithms in the decode
# which decide to add a space or not depending on the surrounding ids.
new_text = self._tokenizer.decode(self._tokens[self._next_text_token_start :], skip_special_tokens=False)
if new_text.endswith("�"):
# utf-8 char at the end means it's a potential unfinished byte sequence
# from byte fallback tokenization.
return ""
# Compare the generated text with the one using only the tokens producing the last one
last_text = self._tokenizer.decode(
self._tokens[self._next_text_token_start : self._next_text_token_end],
skip_special_tokens=False,
)
if len(new_text) == len(last_text):
# Nothing new was actually generated
return ""
# Return the decoded text and store its token offsets
self._next_text_token_start = self._next_text_token_end
self._next_text_token_end = torch.numel(self._tokens)
return new_text[len(last_text) :]
def append(self, next_token: int) -> str:
"""Append a new generated token to this slot
The new token is added to the list of generated tokens, which impacts
directly the generated_text and stopped property.
The new token is however not added immediately to the slot inputs: it will
be added later on when it has effectively been used to produce the next token.
Args:
next_token (`int`):
The newly generated token.
Return:
The corresponding decoded text (if any).
"""
self._tokens = torch.cat([self._tokens, torch.LongTensor([next_token])])
self._mask = torch.cat([self._mask, torch.LongTensor([1])])
self._generated_tokens += 1
next_text = self._decode_next_tokens()
# Now that a new token has been generated, we can append the previous one to the generated text
self._generated_text += self._next_text
self._next_text = next_text
return next_text
def select(self, input_ids: torch.LongTensor, logits: torch.Tensor) -> torch.LongTensor:
"""Select the next token from the candidate logits.
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation (not used in all generation modes).
logits (`torch.Tensor` of shape `(batch_size, sequence_length)`):
The logits corresponding to the generated tokens.
Return:
`torch.LongTensor`: A scalar `torch.LongTensor` containing the selected token.
"""
return self._selector.select(input_ids, logits)[0]
@property
def stopped(self) -> bool:
# Transformers stopping criteria expects a batch of input ids
input_ids = torch.unsqueeze(self._tokens, dim=0)
return self._selector.stopping_criteria(input_ids, None)
@property
def generated_text(self) -> str:
return self._generated_text + self._next_text
@property
def next_token(self) -> int:
return None if len(self._tokens) == 0 else self._tokens[-1]
@property
def attention_mask(self) -> torch.LongTensor:
return self._mask
@property
def max_token(self) -> int:
return self._generation_config.max_length
@property
def max_new_tokens(self) -> int:
# The current value of max_new_tokens: it might differ from the target max_new_tokens
# if the slot has been paused and resumed.
return self._generation_config.max_new_tokens
@property
def truncate(self) -> int:
return self._truncate
class NeuronGenerator(Generator):
"""A Generator for Neuron models."""
def __init__(
self,
model: NeuronModelForCausalLM,
tokenizer: PreTrainedTokenizerBase,
):
self.model = model
self.rebuild_cache_on_prefill = not self.model.continuous_batching
# Specify padding and truncation options for decoder-only architecture
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"
self.tokenizer = tokenizer
self.special_tokens = self.tokenizer.all_special_ids
self.slots = [Slot(i, tokenizer) for i in range(self.model.batch_size)]
self.batch_id = 0
@property
def info(self) -> InfoResponse:
"""Returns the expected InfoResponse."""
dtype = getattr(self.model.config, "torch_dtype", "float32")
return InfoResponse(
requires_padding=True,
dtype=str(dtype),
device_type="xla",
)
def warmup(self, batch: Batch) -> int:
"""Verify if the hardware can support the target load.
Args:
batch (`Batch`):
A batch corresponding to the maximum number of concurrent requests.
Return:
The maximum number of tokens the model supports.
"""
# Just check that the warmup request parameters match the model capacity
batch_size = self.model.batch_size
if len(batch.requests) > batch_size:
raise ValueError(
f"Inconsistent batch_size configuration: Please make sure the batch_size in the compiled model (currently {batch_size}) matches the batch_size passed to TGI. The compiled model batch_size is usually in the neuron section of the model config.json file. You may also have passed it into optimum-cli during the compilation process. The batch size for TGI is usually set in the environment as MAX_BATCH_SIZE."
)
self.prefill(batch)
self.clear()
return self.model.batch_size * self.model.max_length
def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
"""Prefill new requests.
Args:
batch (`Batch`):
A batch containing the new requests.
Return:
A list of `Generation` for each request and a `CachedBatch` containing all pending requests.
"""
slots = {state: [] for state in Slot.State}
for slot in self.slots:
slots[slot.state].append(slot)
active_slots = slots[Slot.State.READY]
empty_slots = slots[Slot.State.EMPTY]
if len(empty_slots) < len(batch.requests):
raise ValueError(
f"Cannot prefill {len(batch.requests)} new request(s) with only {len(empty_slots)} empty slots."
f" Please align max_batch_size with the static batch size: {self.model.batch_size}."
)
# Assign each request to an empty slot
logger.debug(f"Prefilling {len(batch.requests)} new request(s) with {len(empty_slots)} empty slot(s)")
new_slots = []
for request in batch.requests:
slot = empty_slots.pop()
slot.assign(self.batch_id, request, self.model.generation_config)
new_slots.append(slot)
logger.debug(
f"Request {slot.request_id} assigned to slot {slot.id} with and max_new_tokens {slot.max_new_tokens}"
)
if self.rebuild_cache_on_prefill:
# We will clear pending slots and prefill all slots
prefill_slots = self.slots
seq_ids = None
else:
# We only need to pass inputs for the new requests
prefill_slots = new_slots
seq_ids = torch.tensor([slot.id for slot in prefill_slots])
# Reconstruct the full inputs (without padding) as seen by the model.
# This comprises:
# - the inputs for new requests,
# - only when rebuilding the cache, the inputs and the generated text that has already
# been cached (i.e. excluding the last generated token) for unfinished requests.
inputs = []
max_length = 0
for slot in prefill_slots:
inputs.append(slot.cached_text)
# Apply truncation, making sure we fit into static dimensions
if slot.truncate == 0:
max_length = self.model.max_length
elif slot.truncate > max_length and slot.truncate < self.model.max_length:
max_length = slot.truncate
# Tokenize with padding and truncation
padded_inputs = self.tokenizer(
inputs, return_tensors="pt", padding=True, truncation=True, max_length=max_length
)
input_ids = padded_inputs.input_ids
attention_mask = padded_inputs.attention_mask
# Pause previously active slots during generation
next_tokens = []
for slot in active_slots:
slot.pause(reset_on_pause=self.rebuild_cache_on_prefill)
if self.rebuild_cache_on_prefill:
# The slot will be reset, so we need to store its next token
next_tokens.append(slot.next_token)
# Each slot must be reset with the padded inputs and masks
for i, slot in enumerate(prefill_slots):
if slot.state != Slot.State.EMPTY:
if slot.truncate > 0 and slot.truncate < input_ids.shape[-1]:
# Apply per-request truncation
input_ids[i, : -slot.truncate] = self.tokenizer.pad_token_id
attention_mask[i, : -slot.truncate] = 0
slot_input_ids = input_ids[i : i + 1, :]
# Padded input ids are also required to set logits processors and stopping criterias
selector = TokenSelector.create(
slot_input_ids,
slot.generation_config,
self.model,
self.model.max_length,
tokenizer=self.tokenizer,
seed=slot.seed,
)
slot_input_ids = slot_input_ids.squeeze(dim=0).type(torch.int64)
slot_attention_mask = attention_mask[i]
slot.reset(slot_input_ids, slot_attention_mask, selector)
# Note: when rebuilding cache on prefill, the new tokens on paused slots will be ignored,
# as they have already been generated and sent back in the last decode.
model_inputs = self.model.prepare_inputs_for_prefill(input_ids, attention_mask, seq_ids)
logits = self.model(**model_inputs)[0]
generation, next_batch = self._generate_token(prefill_slots, self.batch_id, logits, input_ids)
self.batch_id += 1
# Reactivate previously active slots for the next decode
for i, slot in enumerate(active_slots):
slot.resume()
if self.rebuild_cache_on_prefill:
# Append back the next token
slot.append(next_tokens[i])
logger.debug("Model ready for decoding")
if next_batch is not None:
logger.debug(f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}")
return generation, next_batch
def decode(self, batches: List[CachedBatch]) -> Tuple[List[Generation], CachedBatch]:
"""Decode the specified prefilled requests.
Args:
batches (`List[CachedBatch]`):
A list of previous batches containing the prefilled requests.
Return:
A list of `Generation` for each request and a `CachedBatch` containing all pending requests.
"""
# batches contains a list composed of:
# - the batch id returned by the last decode,
# - the batch id(s) returned by the last prefill(s)
# Batches are always concatenated during prefill, so we can
# just carry on with decoding. We adopt the id of the first
# batch in the list as our next batch id.
next_batch_id = batches[0].id
request_ids = []
for batch in batches:
request_ids += batch.request_ids
cleared_request_ids = []
for slot in self.slots:
if slot.state == slot.State.READY and slot.request_id not in request_ids:
cleared_request_ids.append(slot.request_id)
slot.clear()
if len(cleared_request_ids) > 0:
logger.info(f"Clearing slot for requests {cleared_request_ids} as they are not requested.")
active_slots = [slot for slot in self.slots if slot.state == slot.State.READY]
if len(active_slots) < len(request_ids):
raise ValueError("Unable to decode tokens for non-prefilled batches (probably due to a previous failure)")
if self.model.continuous_batching:
decode_slots = active_slots
seq_ids = torch.tensor([slot.id for slot in decode_slots])
else:
decode_slots = self.slots
seq_ids = None
# Reconstruct input_ids and attention_mask from decode slots
n_slots = len(decode_slots)
input_ids = torch.full([n_slots, 1], fill_value=self.tokenizer.eos_token_id, dtype=torch.int64)
max_length = 0
for slot in decode_slots:
max_length = max(max_length, slot.attention_mask.size(-1))
attention_mask = torch.zeros([n_slots, max_length], dtype=torch.int64)
for i, slot in enumerate(decode_slots):
if slot.state != Slot.State.EMPTY:
# input_ids are simply the tokens generated by the last decode or prefill requests (other tokens are cached)
input_ids[i, 0] = slot.next_token
attention_mask[i, : slot.attention_mask.size(-1)] = slot.attention_mask
model_inputs = self.model.prepare_inputs_for_decode(input_ids, attention_mask, seq_ids)
logits = self.model(**model_inputs)[0]
return self._generate_token(decode_slots, next_batch_id, logits, input_ids)
def _generate_token(
self, slots: List[Slot], next_batch_id: int, logits: torch.Tensor, input_ids: torch.LongTensor
) -> Tuple[List[Generation], CachedBatch]:
generations = []
active_slots = False
for i, slot in enumerate(slots):
if slot.state != Slot.State.READY:
continue
request_id = slot.request_id
next_token_logits = logits[i : i + 1, -1, :]
slot_input_ids = input_ids[i : i + 1, :]
next_token = slot.select(slot_input_ids, next_token_logits)
next_token_text = slot.append(next_token)
generated_text = None
finish_reason = None
if next_token == self.tokenizer.eos_token_id:
finish_reason = FinishReason.FINISH_REASON_EOS_TOKEN
elif slot.stopped:
if slot.generated_tokens == slot.max_new_tokens:
finish_reason = FinishReason.FINISH_REASON_LENGTH
else:
finish_reason = FinishReason.FINISH_REASON_STOP_SEQUENCE
if finish_reason is not None:
# We must include the generated text for each finished sequence in the response
generated_text = GeneratedText(
text=slot.generated_text, generated_tokens=slot.generated_tokens, finish_reason=finish_reason
)
logger.debug(f"Decode complete for request {request_id} with {slot.generated_tokens} tokens")
# mark the slot as available
slot.clear()
else:
active_slots = True
generations.append(
Generation(
request_id=request_id,
prefill_tokens=None,
tokens=Tokens(
ids=[next_token],
logprobs=[0],
texts=[next_token_text],
is_special=[next_token in self.special_tokens],
),
generated_text=generated_text,
)
)
batch = None
if active_slots:
# Whatever initial batch these requests came from, we always return all pending requests in a single batch
request_ids = [slot.request_id for slot in self.slots if slot.state == Slot.State.READY]
batch = self._cached_batch(next_batch_id, request_ids)
else:
logger.debug("No more pending requests")
return generations, batch
def _cached_batch(self, batch_id: int, request_ids: List):
size = len(request_ids)
max_tokens = size * self.model.max_length
return CachedBatch(id=batch_id, request_ids=request_ids, size=size, max_tokens=max_tokens)
def filter(self, batch_id: int, keep_request_ids: List[int]) -> CachedBatch:
"""Remove requests that are not listed from the specified batch
Args:
batch_id (`int`):
The id of a cached batch.
keep_request_ids (`List[int]`):
The list of requests that must be kept.
Return:
A `CachedBatch` containing the pending requests.
"""
keep_slot_ids = [slot.id for slot in self.slots if slot.request_id in keep_request_ids]
self._clear(keep_slot_ids)
return self._cached_batch(batch_id, keep_request_ids)
def clear(self, batch_id: Optional[int] = None):
"""Remove a subset or all requests from the generator"""
keep_ids = []
if batch_id is not None:
keep_ids = [slot.id for slot in self.slots if slot.batch_id != batch_id]
return self._clear(keep_ids)
def _clear(self, keep_slot_ids: List):
for slot in self.slots:
if slot.state != Slot.State.EMPTY and slot.id not in keep_slot_ids:
logger.debug(f"Removing slot {slot.id} with request {slot.request_id}")
slot.clear()
@classmethod
def from_pretrained(cls, model_id: str, revision: str = None):
"""Instantiate a NeuronGenerator.
Args:
model_id (`str`):
A hub model id or the path to a local model. This path must also contain a Tokenizer.
revision (`Optional[str]`, defaults to `None`):
The revision of the model on the HuggingFace hub.
Returns:
A NeuronGenerator.
"""
config = AutoConfig.from_pretrained(model_id)
neuron_config = getattr(config, "neuron", None)
start = time.time()
if neuron_config is None:
export_kwargs = get_export_kwargs_from_env()
logger.info(f"Exporting model to neuron with config: {export_kwargs}.")
model = NeuronModelForCausalLM.from_pretrained(
model_id, revision=revision, low_cpu_mem_usage=True, export=True, **export_kwargs
)
else:
logger.info("Loading model on neuron devices (this can take a few minutes).")
model = NeuronModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, revision=revision)
end = time.time()
logger.info(f"Model successfully loaded in {end - start:.2f} s.")
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
return cls(model, tokenizer)
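
To illustrate the prefill/decode protocol implemented by `NeuronGenerator`, here is a hedged sketch of a single request flowing through the generator. The local model path and parameter values are placeholders; the messages come from the generated `generate_pb2` stubs:

```python
from text_generation_server.generator import NeuronGenerator
from text_generation_server.pb.generate_pb2 import (
    Batch,
    NextTokenChooserParameters,
    Request,
    StoppingCriteriaParameters,
)

# Placeholder path to a model that has already been exported to Neuron
generator = NeuronGenerator.from_pretrained("/data/my-neuron-model")

request = Request(
    id=0,
    inputs="What is Deep Learning?",
    parameters=NextTokenChooserParameters(do_sample=False),
    stopping_parameters=StoppingCriteriaParameters(max_new_tokens=20),
)
batch = Batch(id=0, requests=[request], size=1)

# Prefill assigns the request to an empty slot and produces the first token
generations, cached_batch = generator.prefill(batch)
while True:
    for g in generations:
        print(g.tokens.texts[0], end="", flush=True)
    if cached_batch is None:  # all slots have finished
        break
    # Decode keeps producing one token per pending request
    generations, cached_batch = generator.decode([cached_batch])
```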

@@ -0,0 +1,27 @@
from typing import Any, Callable
import grpc
from google.rpc import code_pb2, status_pb2
from grpc_interceptor.server import AsyncServerInterceptor
from grpc_status import rpc_status
from loguru import logger
class ExceptionInterceptor(AsyncServerInterceptor):
async def intercept(
self,
method: Callable,
request_or_iterator: Any,
context: grpc.ServicerContext,
method_name: str,
) -> Any:
try:
response = method(request_or_iterator, context)
return await response
except Exception as err:
method_name = method_name.split("/")[-1]
logger.exception(f"Method {method_name} encountered an error.")
await context.abort_with_status(
rpc_status.to_status(status_pb2.Status(code=code_pb2.INTERNAL, message=str(err)))
)

@@ -0,0 +1,118 @@
import os
import shutil
import time
from typing import Optional
from huggingface_hub import snapshot_download
from huggingface_hub.constants import HF_HUB_CACHE
from loguru import logger
from transformers import AutoConfig
from optimum.neuron import NeuronModelForCausalLM
from optimum.neuron.utils import get_hub_cached_entries
def get_export_kwargs_from_env():
batch_size = os.environ.get("MAX_BATCH_SIZE", None)
if batch_size is not None:
batch_size = int(batch_size)
sequence_length = os.environ.get("MAX_TOTAL_TOKENS", None)
if sequence_length is not None:
sequence_length = int(sequence_length)
num_cores = os.environ.get("HF_NUM_CORES", None)
if num_cores is not None:
num_cores = int(num_cores)
auto_cast_type = os.environ.get("HF_AUTO_CAST_TYPE", None)
return {
"task": "text-generation",
"batch_size": batch_size,
"sequence_length": sequence_length,
"num_cores": num_cores,
"auto_cast_type": auto_cast_type,
}
def is_cached(model_id, neuron_config):
# Look for cached entries for the specified model
in_cache = False
entries = get_hub_cached_entries(model_id, "inference")
# Look for compatible entries
for entry in entries:
compatible = True
for key, value in neuron_config.items():
# Only weights can be different
if key in ["checkpoint_id", "checkpoint_revision"]:
continue
if entry[key] != value:
compatible = False
if compatible:
in_cache = True
break
return in_cache
def log_cache_size():
path = HF_HUB_CACHE
if os.path.exists(path):
usage = shutil.disk_usage(path)
gb = 2**30
logger.info(f"Cache disk [{path}]: total = {usage.total / gb:.2f} G, free = {usage.free / gb:.2f} G")
else:
raise ValueError(f"The cache directory ({path}) does not exist.")
def fetch_model(
model_id: str,
revision: Optional[str] = None,
) -> str:
"""Fetch a neuron model.
Args:
model_id (`str`):
The *model_id* of a model on the HuggingFace hub or the path to a local model.
revision (`Optional[str]`, defaults to `None`):
The revision of the model on the HuggingFace hub.
Returns:
A string corresponding to the model_id or path.
"""
if not os.path.isdir("/sys/class/neuron_device/"):
raise SystemError("No neuron cores detected on the host.")
if os.path.isdir(model_id) and revision is not None:
logger.warning("Revision {} ignored for local model at {}".format(revision, model_id))
revision = None
# Download the model from the Hub (HUGGING_FACE_HUB_TOKEN must be set for a private or gated model)
# Note that the model may already be present in the cache.
config = AutoConfig.from_pretrained(model_id, revision=revision)
neuron_config = getattr(config, "neuron", None)
if neuron_config is not None:
if os.path.isdir(model_id):
return model_id
# Prefetch the neuron model from the Hub
logger.info(f"Fetching revision [{revision}] for neuron model {model_id} under {HF_HUB_CACHE}")
log_cache_size()
return snapshot_download(model_id, revision=revision, ignore_patterns="*.bin")
# Model needs to be exported: look for compatible cached entries on the hub
export_kwargs = get_export_kwargs_from_env()
export_config = NeuronModelForCausalLM.get_export_config(model_id, config, revision=revision, **export_kwargs)
neuron_config = export_config.neuron
if not is_cached(model_id, neuron_config):
hub_cache_url = "https://huggingface.co/aws-neuron/optimum-neuron-cache"
neuron_export_url = "https://huggingface.co/docs/optimum-neuron/main/en/guides/export_model#exporting-neuron-models-using-neuronx-tgi"
error_msg = (
f"No cached version found for {model_id} with {neuron_config}."
f"You can start a discussion to request it on {hub_cache_url}"
f"Alternatively, you can export your own neuron model as explained in {neuron_export_url}"
)
raise ValueError(error_msg)
logger.warning(f"{model_id} is not a neuron model: it will be exported using cached artifacts.")
if os.path.isdir(model_id):
return model_id
# Prefetch weights, tokenizer and generation config so that they are in cache
log_cache_size()
start = time.time()
snapshot_download(model_id, revision=revision, ignore_patterns="*.bin")
end = time.time()
logger.info(f"Model weights fetched in {end - start:.2f} s.")
log_cache_size()
return model_id
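
For illustration, the environment variables set by the launcher (or by `tgi_env.py`) map onto the export arguments used above as follows (a sketch with example values):

```python
import os

from text_generation_server.model import get_export_kwargs_from_env

os.environ["MAX_BATCH_SIZE"] = "4"        # static batch size
os.environ["MAX_TOTAL_TOKENS"] = "4096"   # static sequence length
os.environ["HF_NUM_CORES"] = "2"          # neuron cores used for the export
os.environ["HF_AUTO_CAST_TYPE"] = "fp16"  # weight dtype

print(get_export_kwargs_from_env())
# {'task': 'text-generation', 'batch_size': 4, 'sequence_length': 4096,
#  'num_cores': 2, 'auto_cast_type': 'fp16'}
```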

@@ -0,0 +1,89 @@
import asyncio
from pathlib import Path
from typing import List
from grpc import aio
from grpc_reflection.v1alpha import reflection
from loguru import logger
from .generator import Generator, NeuronGenerator
from .interceptor import ExceptionInterceptor
from .pb import generate_pb2, generate_pb2_grpc
class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
def __init__(self, generator: Generator, server_urls: List[str]):
self.generator = generator
self.server_urls = server_urls
async def Info(self, request, context):
return self.generator.info
async def Health(self, request, context):
return generate_pb2.HealthResponse()
async def ServiceDiscovery(self, request, context):
return generate_pb2.ServiceDiscoveryResponse(urls=self.server_urls)
async def ClearCache(self, request, context):
if request.HasField("id"):
self.generator.clear(request.id)
else:
self.generator.clear()
return generate_pb2.ClearCacheResponse()
async def FilterBatch(self, request, context):
filtered_batch = self.generator.filter(request.batch_id, request.request_ids)
return generate_pb2.FilterBatchResponse(batch=filtered_batch)
async def Warmup(self, request, context):
max_tokens = self.generator.warmup(request.batch)
return generate_pb2.WarmupResponse(max_supported_total_tokens=max_tokens)
async def Prefill(self, request, context):
generations, batch = self.generator.prefill(request.batch)
return generate_pb2.PrefillResponse(generations=generations, batch=batch)
async def Decode(self, request, context):
generations, batch = self.generator.decode(request.batches)
return generate_pb2.DecodeResponse(generations=generations, batch=batch)
def serve(
model_id: str,
revision: str,
uds_path: Path,
):
async def serve_inner(model_id: str, revision: str):
unix_socket_template = "unix://{}-{}"
local_url = unix_socket_template.format(uds_path, 0)
server_urls = [local_url]
try:
generator = NeuronGenerator.from_pretrained(model_id, revision)
except Exception:
logger.exception("Error when initializing model")
raise
server = aio.server(interceptors=[ExceptionInterceptor()])
generate_pb2_grpc.add_TextGenerationServiceServicer_to_server(
TextGenerationService(generator, server_urls), server
)
SERVICE_NAMES = (
generate_pb2.DESCRIPTOR.services_by_name["TextGenerationService"].full_name,
reflection.SERVICE_NAME,
)
reflection.enable_server_reflection(SERVICE_NAMES, server)
server.add_insecure_port(local_url)
await server.start()
logger.info("Server started at {}".format(local_url))
try:
await server.wait_for_termination()
except KeyboardInterrupt:
logger.info("Signal received. Shutting down")
await server.stop(0)
asyncio.run(serve_inner(model_id, revision))
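
The server only listens on a unix socket. As a hedged sketch, a client (normally the TGI router) can reach it with the generated stubs, assuming the default `uds_path`:

```python
import asyncio

from grpc import aio

from text_generation_server.pb import generate_pb2, generate_pb2_grpc


async def probe(uds_path: str = "/tmp/text-generation-server"):
    # The server binds "unix://{uds_path}-0", so the client must use the same target
    async with aio.insecure_channel(f"unix://{uds_path}-0") as channel:
        stub = generate_pb2_grpc.TextGenerationServiceStub(channel)
        await stub.Health(generate_pb2.HealthRequest())
        info = await stub.Info(generate_pb2.InfoRequest())
        print(info)


asyncio.run(probe())
```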

@@ -0,0 +1,16 @@
#!/bin/bash
set -e -o pipefail -u
export ENV_FILEPATH=$(mktemp)
trap "rm -f ${ENV_FILEPATH}" EXIT
touch $ENV_FILEPATH
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
${SCRIPT_DIR}/tgi_env.py "$@"
source $ENV_FILEPATH
exec text-generation-launcher "$@"

226 backends/neuron/tgi_env.py Executable file

@@ -0,0 +1,226 @@
#!/usr/bin/env python
import argparse
import logging
import os
import sys
from typing import Any, Dict, List, Optional
from huggingface_hub import constants
from transformers import AutoConfig
from optimum.neuron.modeling_decoder import get_available_cores
from optimum.neuron.utils import get_hub_cached_entries
from optimum.neuron.utils.version_utils import get_neuronxcc_version
logger = logging.getLogger(__name__)
tgi_router_env_vars = ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "MAX_INPUT_TOKENS", "MAX_BATCH_PREFILL_TOKENS"]
tgi_server_env_vars = ["HF_NUM_CORES", "HF_AUTO_CAST_TYPE"]
env_config_peering = [
("MAX_BATCH_SIZE", "batch_size"),
("MAX_TOTAL_TOKENS", "sequence_length"),
("HF_AUTO_CAST_TYPE", "auto_cast_type"),
("HF_NUM_CORES", "num_cores"),
]
# By the end of this script all env var should be specified properly
env_vars = tgi_server_env_vars + tgi_router_env_vars
available_cores = get_available_cores()
neuronxcc_version = get_neuronxcc_version()
def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namespace:
parser = argparse.ArgumentParser()
if not argv:
argv = sys.argv
# All these are params passed to tgi and intercepted here
parser.add_argument(
"--max-input-tokens", type=int, default=os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0))
)
parser.add_argument("--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0))
parser.add_argument("--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0))
parser.add_argument("--max-batch-prefill-tokens", type=int, default=os.getenv("MAX_BATCH_PREFILL_TOKENS", 0))
parser.add_argument("--model-id", type=str, default=os.getenv("MODEL_ID"))
parser.add_argument("--revision", type=str, default=os.getenv("REVISION"))
args = parser.parse_known_args(argv)[0]
if not args.model_id:
raise Exception("No model id provided ! Either specify it using --model-id cmdline or MODEL_ID env var")
# Override env with cmdline params
os.environ["MODEL_ID"] = args.model_id
# Set all tgi router and tgi server values to consistent values as early as possible
# from the order of the parser defaults, the tgi router value can override the tgi server ones
if args.max_total_tokens > 0:
os.environ["MAX_TOTAL_TOKENS"] = str(args.max_total_tokens)
if args.max_input_tokens > 0:
os.environ["MAX_INPUT_TOKENS"] = str(args.max_input_tokens)
if args.max_batch_size > 0:
os.environ["MAX_BATCH_SIZE"] = str(args.max_batch_size)
if args.max_batch_prefill_tokens > 0:
os.environ["MAX_BATCH_PREFILL_TOKENS"] = str(args.max_batch_prefill_tokens)
if args.revision:
os.environ["REVISION"] = str(args.revision)
return args
def neuron_config_to_env(neuron_config):
with open(os.environ["ENV_FILEPATH"], "w") as f:
for env_var, config_key in env_config_peering:
f.write("export {}={}\n".format(env_var, neuron_config[config_key]))
max_input_tokens = os.getenv("MAX_INPUT_TOKENS")
if not max_input_tokens:
max_input_tokens = int(neuron_config["sequence_length"]) // 2
if max_input_tokens == 0:
raise Exception("Model sequence length should be greater than 1")
f.write("export MAX_INPUT_TOKENS={}\n".format(max_input_tokens))
max_batch_prefill_tokens = os.getenv("MAX_BATCH_PREFILL_TOKENS")
if not max_batch_prefill_tokens:
max_batch_prefill_tokens = int(neuron_config["batch_size"]) * int(max_input_tokens)
f.write("export MAX_BATCH_PREFILL_TOKENS={}\n".format(max_batch_prefill_tokens))
def sort_neuron_configs(dictionary):
return -dictionary["num_cores"], -dictionary["batch_size"]
def lookup_compatible_cached_model(model_id: str, revision: Optional[str]) -> Optional[Dict[str, Any]]:
# Reuse the same mechanic as the one in use to configure the tgi server part
# The only difference here is that we stay as flexible as possible on the compatibility part
entries = get_hub_cached_entries(model_id, "inference")
logger.debug("Found %d cached entries for model %s, revision %s", len(entries), model_id, revision)
all_compatible = []
for entry in entries:
if check_env_and_neuron_config_compatibility(entry, check_compiler_version=True):
all_compatible.append(entry)
if not all_compatible:
logger.debug(
"No compatible cached entry found for model %s, env %s, available cores %s, neuronxcc version %s",
model_id,
get_env_dict(),
available_cores,
neuronxcc_version,
)
return None
logger.info("%d compatible neuron cached models found", len(all_compatible))
all_compatible = sorted(all_compatible, key=sort_neuron_configs)
entry = all_compatible[0]
return entry
def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], check_compiler_version: bool) -> bool:
logger.debug(
"Checking the provided neuron config %s is compatible with the local setup and provided environment",
neuron_config,
)
# Local setup compat checks
if neuron_config["num_cores"] > available_cores:
logger.debug("Not enough neuron cores available to run the provided neuron config")
return False
if check_compiler_version and neuron_config["compiler_version"] != neuronxcc_version:
logger.debug(
"Compiler version conflict, the local one (%s) differs from the one used to compile the model (%s)",
neuronxcc_version,
neuron_config["compiler_version"],
)
return False
for env_var, config_key in env_config_peering:
neuron_config_value = str(neuron_config[config_key])
env_value = os.getenv(env_var, str(neuron_config_value))
if env_value != neuron_config_value:
logger.debug(
"The provided env var '%s' and the neuron config '%s' param differ (%s != %s)",
env_var,
config_key,
env_value,
neuron_config_value,
)
return False
max_input_tokens = int(os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)))
if max_input_tokens > 0:
sequence_length = neuron_config["sequence_length"]
if max_input_tokens >= sequence_length:
logger.debug(
"Specified max input tokens is not compatible with config sequence length ( %s >= %s)",
max_input_tokens,
sequence_length,
)
return False
return True
def get_env_dict() -> Dict[str, str]:
d = {}
for k in env_vars:
d[k] = os.getenv(k)
return d
def main():
"""
This script determines proper default TGI env variables for the neuron precompiled models to
work properly
:return:
"""
args = parse_cmdline_and_set_env()
for env_var in env_vars:
if not os.getenv(env_var):
break
else:
logger.info("All env vars %s already set, skipping, user know what they are doing", env_vars)
sys.exit(0)
cache_dir = constants.HF_HUB_CACHE
logger.info("Cache dir %s, model %s", cache_dir, args.model_id)
config = AutoConfig.from_pretrained(args.model_id, revision=args.revision)
neuron_config = getattr(config, "neuron", None)
if neuron_config is not None:
compatible = check_env_and_neuron_config_compatibility(neuron_config, check_compiler_version=False)
if not compatible:
env_dict = get_env_dict()
msg = (
"Invalid neuron config and env. Config {}, env {}, available cores {}, neuronxcc version {}"
).format(neuron_config, env_dict, available_cores, neuronxcc_version)
logger.error(msg)
raise Exception(msg)
else:
neuron_config = lookup_compatible_cached_model(args.model_id, args.revision)
if not neuron_config:
msg = ("No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}").format(
get_env_dict(), available_cores, neuronxcc_version
)
logger.error(msg)
raise Exception(msg)
neuron_config_to_env(neuron_config)
if __name__ == "__main__":
main()
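
As a worked example of the defaults written by `neuron_config_to_env`, a cached config with `batch_size=4` and `sequence_length=4096` (illustrative values, no env vars set beforehand) yields:

```python
neuron_config = {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"}

env = {
    "MAX_BATCH_SIZE": neuron_config["batch_size"],
    "MAX_TOTAL_TOKENS": neuron_config["sequence_length"],
    "HF_NUM_CORES": neuron_config["num_cores"],
    "HF_AUTO_CAST_TYPE": neuron_config["auto_cast_type"],
}
env["MAX_INPUT_TOKENS"] = neuron_config["sequence_length"] // 2                    # 2048
env["MAX_BATCH_PREFILL_TOKENS"] = env["MAX_BATCH_SIZE"] * env["MAX_INPUT_TOKENS"]  # 8192
print(env)
```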

@@ -50,6 +50,8 @@
title: Train Medusa
title: Tutorials
- sections:
- local: backends/neuron
title: Neuron
- local: backends/trtllm
title: TensorRT-LLM
title: Backends

@@ -0,0 +1,182 @@
# Neuron backend for AWS Trainium and Inferentia
The Neuron backend allows the deployment of TGI on AWS Trainium and Inferentia family of chips.
The following hardware targets are supported:
- Trainium 1,
- Inferentia 2.
## Features
The basic TGI features are supported:
- continuous batching,
- token streaming,
- greedy search and multinomial sampling using [transformers](https://huggingface.co/docs/transformers/generation_strategies#customize-text-generation).
## Deploy the service from the Hugging Face hub
The simplest way to deploy the NeuronX TGI service for a specific model is to follow the
deployment instructions in the model card:
- click on the "Deploy" button on the right,
- select your deployment service ("Inference Endpoints" and "SageMaker" are supported),
- select "AWS Trainum & Inferentia",
- follow the instructions.
## Deploy the service on a dedicated host
The service is launched simply by running the text-generation-inference container with two sets of parameters:
```
docker run <system_parameters> ghcr.io/huggingface/text-generation-inference:latest-neuron <service_parameters>
```
- system parameters are used to map ports, volumes and devices between the host and the service,
- service parameters are forwarded to the `text-generation-launcher`.
When deploying a service, you will need a pre-compiled Neuron model. The Neuron TGI backend supports two main modes of operation:
- you can either deploy the service on a model that has already been exported to Neuron,
- or alternatively you can take advantage of the Neuron Model Cache to export your own model.
### Common system parameters
Whenever you launch a TGI service, we highly recommend mounting a shared volume as `/data` in the container: this is where
the models will be cached to speed up further instantiations of the service.
Note also that enough neuron devices must be visible to the container. The simplest way to achieve that is to launch the service in `privileged` mode to get access to all neuron devices.
Alternatively, each device can be explicitly exposed using the `--device` option.
Finally, you might want to export the `HF_TOKEN` if you want to access gated repositories.
Here is an example of a service instantiation:
```
docker run -p 8080:80 \
-v $(pwd)/data:/data \
--privileged \
-e HF_TOKEN=${HF_TOKEN} \
ghcr.io/huggingface/text-generation-inference:latest-neuron \
<service_parameters>
```
If you only want to map the first device, the launch command becomes:
```
docker run -p 8080:80 \
-v $(pwd)/data:/data \
--device=/dev/neuron0 \
-e HF_TOKEN=${HF_TOKEN} \
ghcr.io/huggingface/text-generation-inference:latest-neuron \
<service_parameters>
```
### Using a standard model from the 🤗 [HuggingFace Hub](https://huggingface.co/aws-neuron) (recommended)
We maintain a Neuron Model Cache of the most popular architecture and deployment parameters under [aws-neuron/optimum-neuron-cache](https://huggingface.co/aws-neuron/optimum-neuron-cache).
If you want to try the service quickly with a model that has not been exported to Neuron yet, this is still possible, under some conditions:
- you must specify the export parameters when launching the service (or use default parameters),
- the model configuration must be cached.
The snippet below shows how you can deploy a service from a hub standard model:
```
export HF_TOKEN=<YOUR_TOKEN>
docker run -p 8080:80 \
-v $(pwd)/data:/data \
--privileged \
-e HF_TOKEN=${HF_TOKEN} \
-e HF_AUTO_CAST_TYPE="fp16" \
-e HF_NUM_CORES=2 \
ghcr.io/huggingface/text-generation-inference:latest-neuron \
--model-id meta-llama/Meta-Llama-3-8B \
--max-batch-size 1 \
--max-input-length 3164 \
--max-total-tokens 4096
```
### Using a model exported to a local path
Alternatively, you can first [export the model to neuron format](https://huggingface.co/docs/optimum-neuron/main/en/guides/export_model#exporting-neuron-models-using-neuronx-tgi) locally.
You can then deploy the service inside the shared volume:
```
docker run -p 8080:80 \
-v $(pwd)/data:/data \
--privileged \
ghcr.io/huggingface/text-generation-inference:latest-neuron \
--model-id /data/<neuron_model_path>
```
Note: You don't need to specify any service parameters, as they will all be deduced from the model export configuration.
### Using a neuron model from the 🤗 [HuggingFace Hub](https://huggingface.co/)
The easiest way to share a neuron model inside your organization is to push it on the Hugging Face hub, so that it can be deployed directly without requiring an export.
The snippet below shows how you can deploy a service from a hub neuron model:
```
docker run -p 8080:80 \
-v $(pwd)/data:/data \
--privileged \
-e HF_TOKEN=${HF_TOKEN} \
ghcr.io/huggingface/text-generation-inference:latest-neuron \
--model-id <organization>/<neuron-model>
```
### Choosing service parameters
Use the following command to list the available service parameters:
```
docker run ghcr.io/huggingface/text-generation-inference:latest-neuron --help
```
The configuration of an inference endpoint is always a compromise between throughput and latency: serving more requests in parallel will allow a higher throughput, but it will increase the latency.
The neuron models have static input dimensions `[batch_size, max_length]`.
This adds several restrictions to the following parameters:
- `--max-batch-size` must be set to `batch_size`,
- `--max-input-length` must be lower than `max_length`,
- `--max-total-tokens` must be set to `max_length` (it is per-request).
Although not strictly necessary, the following is also important for efficient prefilling (see the sketch below for how to read these values from the model configuration):
- `--max-batch-prefill-tokens` should be set to `batch_size` * `max-input-length`.
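A minimal sketch of that derivation, assuming the model has already been exported to Neuron and is available locally (the path is a placeholder):
```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("/data/<neuron_model_path>")
neuron = config.neuron  # static dimensions recorded at export time

batch_size = neuron["batch_size"]
max_length = neuron["sequence_length"]
print(f"--max-batch-size {batch_size}")
print(f"--max-total-tokens {max_length}")
print(f"--max-input-length {max_length // 2}")  # any value strictly below max_length
print(f"--max-batch-prefill-tokens {batch_size * (max_length // 2)}")
```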
### Choosing the correct batch size
As seen in the previous paragraph, neuron model static batch size has a direct influence on the endpoint latency and throughput.
Please refer to [text-generation-inference](https://github.com/huggingface/text-generation-inference) for optimization hints.
Note that the main constraint is to be able to fit the model for the specified `batch_size` within the total device memory available
on your instance (16GB per neuron core, with 2 cores per device).
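As a rough, illustrative check (weights only, ignoring the KV cache and runtime buffers):
```python
params = 8e9           # e.g. an 8B-parameter model
bytes_per_param = 2    # fp16 / bf16
per_core_gib = 16      # memory per neuron core (2 cores per device)

weight_gib = params * bytes_per_param / 2**30
print(f"~{weight_gib:.1f} GiB of weights vs {per_core_gib} GiB per core")
# ~14.9 GiB: this nearly fills one core, so such a model is typically sharded
# across both cores of a device to leave room for the KV cache, which grows
# with batch_size x max_length.
```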
## Query the service
You can query the model using either the `/generate` or `/generate_stream` routes:
```
curl 127.0.0.1:8080/generate \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
-H 'Content-Type: application/json'
```
```
curl 127.0.0.1:8080/generate_stream \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
-H 'Content-Type: application/json'
```
Note: replace 127.0.0.1:8080 with your actual IP address and port.
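The same request can also be sent from Python (a sketch using the `requests` package):
```python
import requests

response = requests.post(
    "http://127.0.0.1:8080/generate",
    json={"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 20}},
)
response.raise_for_status()
print(response.json()["generated_text"])
```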