diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 824a5a284..b7cc79556 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -230,7 +230,7 @@ jobs:
           echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT"
           echo "label_extension=${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT"
           echo "extra_pytest=${{ env.EXTRA_PYTEST }}" >> "$GITHUB_OUTPUT"
-  precompile_static_models:
+  precompile_neuron_models:
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label_extension }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
@@ -252,21 +252,18 @@ jobs:
       - name: Install
         run: |
           make install-integration-tests
-      - name: Run tests
+      - name: Export neuron models
         run: |
-          export DOCKER_VOLUME=${{ needs.build-and-push.outputs.docker_volume }}
           export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
-          export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
-          export EXTRA_PYTEST="${{ needs.build-and-push.outputs.extra_pytest }}"
-          export HF_TOKEN=${{ secrets.HF_TOKEN_NEURON }}
           echo $DOCKER_IMAGE
           docker pull $DOCKER_IMAGE
-          pytest -s -vv integration-tests ${PYTEST_FLAGS} ${EXTRA_PYTEST}
+          export HF_TOKEN=${{ secrets.HF_TOKEN_NEURON }}
+          python integration-tests/fixtures/neuron/export_models.py
   integration_tests:
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label_extension }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
-    needs: [precompile_static_models, build-and-push]
+    needs: [precompile_neuron_models, build-and-push]
     if: ${{ always() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') && needs.build-and-push.outputs.runs_on != 'ubuntu-latest' }}
     runs-on:
       group: ${{ needs.build-and-push.outputs.runs_on }}
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
index 17d256915..c7c4af68b 100644
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -24,8 +24,6 @@ RUN cargo install cargo-chef --locked

 WORKDIR /usr/src

-ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
-
 FROM chef AS planner
 COPY backends/neuron/Cargo.toml Cargo.toml
 COPY Cargo.lock Cargo.lock
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 39f0ef4bd..37b57d6f4 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -12,7 +12,7 @@
   - local: installation_gaudi
     title: Using TGI with Intel Gaudi
   - local: installation_inferentia
-    title: Using TGI with AWS Inferentia
+    title: Using TGI with AWS Trainium and Inferentia
   - local: installation_tpu
     title: Using TGI with Google TPUs
   - local: installation_intel
diff --git a/docs/source/architecture.md b/docs/source/architecture.md
index d3a6fa926..b475bb6dc 100644
--- a/docs/source/architecture.md
+++ b/docs/source/architecture.md
@@ -107,7 +107,7 @@ Several variants of the model server exist that are actively supported by Huggin
 - A [version optimized for AMD with ROCm](https://huggingface.co/docs/text-generation-inference/installation_amd) is hosted in the main TGI repository. Some model features differ.
 - A [version optimized for Intel GPUs](https://huggingface.co/docs/text-generation-inference/installation_intel) is hosted in the main TGI repository. Some model features differ.
 - The [version for Intel Gaudi](https://huggingface.co/docs/text-generation-inference/installation_gaudi) is maintained on a forked repository, often resynchronized with the main [TGI repository](https://github.com/huggingface/tgi-gaudi).
-- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained as part of [Optimum Neuron](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference).
+- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained in the main TGI repository. Some model features differ.
 - A version for Google TPUs is maintained as part of [Optimum TPU](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference).

 Not all variants provide the same features, as hardware and middleware capabilities do not provide the same optimizations.
diff --git a/docs/source/installation_inferentia.md b/docs/source/installation_inferentia.md
index 0394e6ded..bfd0f6577 100644
--- a/docs/source/installation_inferentia.md
+++ b/docs/source/installation_inferentia.md
@@ -1,3 +1,3 @@
 # Using TGI with Inferentia

-Check out this [guide](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on how to serve models with TGI on Inferentia2.
+You can use TGI on AWS Trainium and Inferentia platforms using the [TGI neuron backend](https://huggingface.co/docs/text-generation-inference/backends/neuron).
diff --git a/docs/source/multi_backend_support.md b/docs/source/multi_backend_support.md
index 03d6d30be..997503a4f 100644
--- a/docs/source/multi_backend_support.md
+++ b/docs/source/multi_backend_support.md
@@ -13,3 +13,4 @@ TGI remains consistent across backends, allowing you to switch between them seam
   However, it requires a model-specific compilation step for each GPU architecture.
 * **[TGI Llamacpp backend](./backends/llamacpp)**: This backend facilitates the deployment of large language models (LLMs) by integrating [llama.cpp][llama.cpp], an advanced inference engine optimized for both CPU and GPU computation.
+* **[TGI Neuron backend](./backends/neuron)**: This backend leverages the [AWS Neuron SDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) to allow the deployment of large language models (LLMs) on [AWS Trainium and Inferentia chips](https://aws.amazon.com/ai/machine-learning/trainium/).
diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py
index e04510521..0ffcd162b 100644
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@@ -1,4 +1,4 @@
-pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.model"]
+pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.export_models"]
 # ruff: noqa: E402
 from _pytest.fixtures import SubRequest
 import requests
diff --git a/integration-tests/fixtures/neuron/model.py b/integration-tests/fixtures/neuron/export_models.py
similarity index 89%
rename from integration-tests/fixtures/neuron/model.py
rename to integration-tests/fixtures/neuron/export_models.py
index 3345e2ea2..836402eca 100644
--- a/integration-tests/fixtures/neuron/model.py
+++ b/integration-tests/fixtures/neuron/export_models.py
@@ -118,10 +118,11 @@ def get_tgi_docker_image():
     return docker_image


-def export_model(config_name, model_config, neuron_model_name):
-    """Export a neuron model.
+def maybe_export_model(config_name, model_config):
+    """Export a neuron model for the specified test configuration.

-    The model is exported by a custom image built on the fly from the base TGI image.
+    If the neuron model has not already been compiled and pushed to the hub, it is
+    exported by a custom image built on the fly from the base TGI image.
     This makes sure the exported model and image are aligned and avoids introducing
     neuron specific imports in the test suite.

@@ -130,9 +131,15 @@ def export_model(config_name, model_config, neuron_model_name):
             Used to identify test configurations
         model_config (`str`):
             The model configuration for export (includes the original model id)
-        neuron_model_name (`str`):
-            The name of the exported model on the hub
     """
+    neuron_model_name = get_neuron_model_name(config_name)
+    neuron_model_id = f"{TEST_ORGANIZATION}/{neuron_model_name}"
+    hub = huggingface_hub.HfApi()
+    if hub.repo_exists(neuron_model_id):
+        logger.info(
+            f"Skipping model export for config {config_name} as {neuron_model_id} already exists"
+        )
+        return neuron_model_id

     client = docker.from_env()

@@ -183,7 +190,7 @@ def export_model(config_name, model_config, neuron_model_name):
         logger.debug("Build logs %s", logs)

     try:
-        container = client.containers.run(
+        client.containers.run(
             export_image,
             environment=env,
             auto_remove=True,
@@ -192,7 +199,6 @@ def export_model(config_name, model_config, neuron_model_name):
             shm_size="1G",
         )
         logger.info(f"Successfully exported model for config {config_name}")
-        container.logs()
     except Exception as e:
         logger.exception(f"An exception occurred while running container: {e}.")
         pass
@@ -206,6 +212,12 @@ def export_model(config_name, model_config, neuron_model_name):
     except Exception as e:
         logger.error("Error while removing image %s, skipping", image.id)
         logger.exception(e)
+    return neuron_model_id
+
+
+def maybe_export_models():
+    for config_name, model_config in MODEL_CONFIGURATIONS.items():
+        maybe_export_model(config_name, model_config)


 @pytest.fixture(scope="session", params=MODEL_CONFIGURATIONS.keys())
@@ -232,15 +244,14 @@ def neuron_model_config(request):
     """
     config_name = request.param
     model_config = copy.deepcopy(MODEL_CONFIGURATIONS[request.param])
-    neuron_model_name = get_neuron_model_name(config_name)
-    neuron_model_id = f"{TEST_ORGANIZATION}/{neuron_model_name}"
+    # Export the model first (only if needed)
+    neuron_model_id = maybe_export_model(config_name, model_config)
     with TemporaryDirectory() as neuron_model_path:
-        hub = huggingface_hub.HfApi()
-        if not hub.repo_exists(neuron_model_id):
-            # Export the model first
-            export_model(config_name, model_config, neuron_model_name)
         logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub")
-        hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path)
+        hub = huggingface_hub.HfApi()
+        hub.snapshot_download(
+            neuron_model_id, etag_timeout=30, local_dir=neuron_model_path
+        )
         # Add dynamic parameters to the model configuration
         model_config["neuron_model_path"] = neuron_model_path
         model_config["neuron_model_id"] = neuron_model_id
@@ -257,3 +268,7 @@ def neuron_model_config(request):
 @pytest.fixture(scope="module")
 def neuron_model_path(neuron_model_config):
     yield neuron_model_config["neuron_model_path"]
+
+
+if __name__ == "__main__":
+    maybe_export_models()
diff --git a/integration-tests/neuron/integration/test_generate.py b/integration-tests/neuron/test_generate.py
similarity index 100%
rename from integration-tests/neuron/integration/test_generate.py
rename to integration-tests/neuron/test_generate.py
diff --git a/integration-tests/neuron/integration/test_implicit_env.py b/integration-tests/neuron/test_implicit_env.py
similarity index 100%
rename from integration-tests/neuron/integration/test_implicit_env.py
rename to integration-tests/neuron/test_implicit_env.py
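Reviewer note: the key behavioral change above is that `maybe_export_model` now checks the hub first and returns early when the compiled model already exists, which is what lets the new `precompile_neuron_models` job and the pytest fixture share one code path. Below is a minimal standalone sketch of that guard, assuming only that `huggingface_hub` is installed; the repo id is hypothetical (the fixture derives the real one as `f"{TEST_ORGANIZATION}/{neuron_model_name}"`).

```python
import huggingface_hub


def needs_export(neuron_model_id: str) -> bool:
    # Same guard as maybe_export_model: skip the costly neuron compilation
    # when a precompiled model is already available on the hub.
    return not huggingface_hub.HfApi().repo_exists(neuron_model_id)


if __name__ == "__main__":
    # Hypothetical repo id, for illustration only.
    print(needs_export("optimum/llama-2-7b-neuron-test"))
```

In CI, the same logic runs end to end via `python integration-tests/fixtures/neuron/export_models.py`, whose `__main__` entry point calls `maybe_export_models()` to iterate over every entry in `MODEL_CONFIGURATIONS`.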