diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index c43d8eb9..2e3fe7ef 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -31,15 +31,28 @@ jobs: group: ${{ github.workflow }}-build-and-push-image-${{ inputs.hardware }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true runs-on: - group: aws-highmemory-32-plus-priv + group: aws-highmemory-64-plus-priv permissions: contents: write packages: write + id-token: write steps: - name: Checkout repository uses: actions/checkout@v4 - name: Inject slug/short variables uses: rlespinasse/github-slug-action@v4.4.1 + - name: Extract TensorRT-LLM version + run: | + echo "TENSORRT_LLM_VERSION=$(grep -oP '([a-z,0-9]{40})' $GITHUB_WORKSPACE/backends/trtllm/cmake/trtllm.cmake)" >> $GITHUB_ENV + echo "TensorRT-LLM version: ${{ env.TENSORRT_LLM_VERSION }}" + - name: "Configure AWS Credentials" + id: aws-creds + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: us-east-1 + role-to-assume: ${{ secrets.AWS_ROLE_GITHUB_TGI_TEST }} + role-duration-seconds: 7200 + output-credentials: true - name: Construct harware variables shell: bash run: | @@ -52,6 +65,7 @@ jobs: export runs_on="aws-g6-12xl-plus-priv-cache" export platform="" export extra_pytest="" + export target="nil" ;; cuda-trtllm) export dockerfile="Dockerfile_trtllm" @@ -61,6 +75,10 @@ jobs: export runs_on="ubuntu-latest" export platform="" export extra_pytest="" + export target="ci-runtime" + export sccache_s3_key_prefix="trtllm" + export sccache_region="us-east-1" + export build_type="dev" ;; rocm) export dockerfile="Dockerfile_amd" @@ -71,6 +89,7 @@ jobs: export runs_on="ubuntu-latest" export platform="" export extra_pytest="-k test_flash_gemma_gptq_load" + export target="nil" ;; intel-xpu) export dockerfile="Dockerfile_intel" @@ -80,6 +99,7 @@ jobs: export runs_on="ubuntu-latest" export platform="xpu" export extra_pytest="" + export target="nil" ;; intel-cpu) export dockerfile="Dockerfile_intel" @@ -90,6 +110,7 @@ jobs: export runs_on="aws-highmemory-32-plus-priv" export platform="cpu" export extra_pytest="-k test_flash_gemma_simple" + export target="nil" ;; esac echo $dockerfile @@ -106,6 +127,10 @@ jobs: echo "RUNS_ON=${runs_on}" >> $GITHUB_ENV echo "EXTRA_PYTEST=${extra_pytest}" >> $GITHUB_ENV echo REGISTRY_MIRROR=$REGISTRY_MIRROR >> $GITHUB_ENV + echo "TARGET=${target}" >> $GITHUB_ENV + echo "SCCACHE_S3_KEY_PREFIX=${sccache_s3_key_prefix}" >> $GITHUB_ENV + echo "SCCACHE_REGION=${sccache_region}" >> $GITHUB_ENV + echo "BUILD_TYPE=${build_type}" >> $GITHUB_ENV - name: Initialize Docker Buildx uses: docker/setup-buildx-action@v3 with: @@ -170,6 +195,14 @@ jobs: GIT_SHA=${{ env.GITHUB_SHA }} DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }} PLATFORM=${{ env.PLATFORM }} + build_type=${{ env.BUILD_TYPE }} + is_gha_build=true + aws_access_key_id=${{ steps.aws-creds.outputs.aws-access-key-id }} + aws_secret_access_key=${{ steps.aws-creds.outputs.aws-secret-access-key }} + aws_session_token=${{ steps.aws-creds.outputs.aws-session-token }} + sccache_bucket=${{ secrets.AWS_S3_BUCKET_GITHUB_TGI_TEST }} + sccache_s3_key_prefix=${{ env.SCCACHE_S3_KEY_PREFIX }} + sccache_region=${{ env.SCCACHE_REGION }} tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }} labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }} cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ 
secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min @@ -215,3 +248,22 @@ jobs: echo $DOCKER_IMAGE docker pull $DOCKER_IMAGE pytest -s -vv integration-tests ${PYTEST_FLAGS} ${EXTRA_PYTEST} + + backend_trtllm_cxx_tests: + needs: build-and-push + if: needs.build-and-push.outputs.label == '-trtllm' + concurrency: + group: ${{ github.workflow }}-${{ github.job }}-trtllm-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + runs-on: + group: aws-g6-12xl-plus-priv-cache + container: + image: ${{ needs.build-and-push.outputs.docker_image }} + credentials: + username: ${{ secrets.REGISTRY_USERNAME }} + password: ${{ secrets.REGISTRY_PASSWORD }} + options: --gpus all --shm-size=8g + + steps: + - name: Run C++/CUDA tests + run: /usr/local/tgi/bin/tgi_trtllm_backend_tests diff --git a/.github/workflows/ci_build.yaml b/.github/workflows/ci_build.yaml index 0d87cb29..d8746b65 100644 --- a/.github/workflows/ci_build.yaml +++ b/.github/workflows/ci_build.yaml @@ -42,6 +42,7 @@ jobs: permissions: contents: write packages: write + id-token: write with: hardware: ${{ matrix.hardware }} # https://github.com/actions/runner/issues/2206 diff --git a/Cargo.toml b/Cargo.toml index d8155153..9f49c9ab 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,21 +1,21 @@ [workspace] members = [ - "benchmark", - "backends/v2", - "backends/v3", - "backends/grpc-metadata", - "backends/trtllm", - "launcher", - "router" + "benchmark", + "backends/v2", + "backends/v3", + "backends/grpc-metadata", + "backends/trtllm", + "launcher", + "router" ] default-members = [ - "benchmark", - "backends/v2", - "backends/v3", - "backends/grpc-metadata", - # "backends/trtllm", - "launcher", - "router" + "benchmark", + "backends/v2", + "backends/v3", + "backends/grpc-metadata", + # "backends/trtllm", + "launcher", + "router" ] resolver = "2" diff --git a/Dockerfile_trtllm b/Dockerfile_trtllm index ecefc048..40972764 100644 --- a/Dockerfile_trtllm +++ b/Dockerfile_trtllm @@ -1,19 +1,7 @@ -ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real" -ARG OMPI_VERSION="4.1.7rc1" - -# Build dependencies resolver stage -FROM lukemathwalker/cargo-chef:latest-rust-1.84.0 AS chef -WORKDIR /usr/src/text-generation-inference/backends/trtllm - -FROM chef AS planner -COPY Cargo.lock Cargo.lock -COPY Cargo.toml Cargo.toml -COPY rust-toolchain.toml rust-toolchain.toml -COPY router router -COPY benchmark/ benchmark/ -COPY backends/ backends/ -COPY launcher/ launcher/ -RUN cargo chef prepare --recipe-path recipe.json +ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real" +ARG ompi_version="4.1.7rc1" +ARG build_type=release +ARG is_gha_build=false # CUDA dependent dependencies resolver stage FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder @@ -26,8 +14,11 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ g++-14 \ git \ git-lfs \ + lld \ libssl-dev \ libucx-dev \ + libasan8 \ + libubsan1 \ ninja-build \ pkg-config \ pipx \ @@ -43,9 +34,9 @@ ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt # Install OpenMPI FROM cuda-builder AS mpi-builder -ARG OMPI_VERSION +ARG ompi_version -ENV OMPI_TARBALL_FILENAME="openmpi-$OMPI_VERSION.tar.bz2" +ENV OMPI_TARBALL_FILENAME="openmpi-$ompi_version.tar.bz2" RUN wget "https://download.open-mpi.org/release/open-mpi/v4.1/$OMPI_TARBALL_FILENAME" -P /opt/src && \ mkdir /usr/src/mpi && \ tar -xf "/opt/src/$OMPI_TARBALL_FILENAME" -C /usr/src/mpi --strip-components=1 && \ @@ -65,34 +56,56 @@ RUN 
chmod +x /opt/install_tensorrt.sh && \ FROM cuda-builder AS tgi-builder WORKDIR /usr/src/text-generation-inference +# Scoped global args reuse +ARG is_gha_build +ARG build_type + # Install Rust +ENV PATH="/root/.cargo/bin:$PATH" RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \ chmod -R a+w /root/.rustup && \ - chmod -R a+w /root/.cargo + chmod -R a+w /root/.cargo && \ + cargo install sccache --locked -ENV PATH="/root/.cargo/bin:$PATH" -RUN cargo install cargo-chef +# SCCACHE Specifics args - before finding a better, more generic, way... +ARG aws_access_key_id +ARG aws_secret_access_key +ARG aws_session_token +ARG sccache_bucket +ARG sccache_s3_key_prefix +ARG sccache_region -# Cache dependencies -COPY --from=planner /usr/src/text-generation-inference/backends/trtllm/recipe.json . -RUN cargo chef cook --release --recipe-path recipe.json +ENV AWS_ACCESS_KEY_ID=$aws_access_key_id +ENV AWS_SECRET_ACCESS_KEY=$aws_secret_access_key +ENV AWS_SESSION_TOKEN=$aws_session_token +ENV SCCACHE_BUCKET=$sccache_bucket +ENV SCCACHE_S3_KEY_PREFIX=$sccache_s3_key_prefix +ENV SCCACHE_REGION=$sccache_region -# Build actual TGI -ARG CUDA_ARCH_LIST -ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt:$CMAKE_PREFIX_PATH" ENV LD_LIBRARY_PATH="/usr/local/mpi/lib:$LD_LIBRARY_PATH" ENV PKG_CONFIG_PATH="/usr/local/mpi/lib/pkgconfig:$PKG_CONFIG_PATH" +ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt:$CMAKE_PREFIX_PATH" + +ENV USE_LLD_LINKER=ON +ENV CUDA_ARCH_LIST=${cuda_arch_list} +ENV IS_GHA_BUILD=${is_gha_build} COPY Cargo.lock Cargo.lock COPY Cargo.toml Cargo.toml COPY rust-toolchain.toml rust-toolchain.toml COPY router router -COPY backends/trtllm backends/trtllm +COPY backends backends +COPY benchmark benchmark +COPY launcher launcher COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi + RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$TGI_INSTALL_PREFIX/lib" && \ - cd backends/trtllm && \ - CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release + python3 backends/trtllm/scripts/setup_sccache.py --is-gha-build ${is_gha_build} && \ + CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX \ + RUSTC_WRAPPER=sccache \ + cargo build --profile ${build_type} --package text-generation-backends-trtllm --bin text-generation-backends-trtllm && \ + sccache --show-stats FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS runtime RUN apt update && apt install -y libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \ @@ -116,6 +129,28 @@ FROM runtime LABEL co.huggingface.vendor="Hugging Face Inc." 
LABEL org.opencontainers.image.authors="hardware@hf.co" +LABEL org.opencontainers.title="Text-Generation-Inference TensorRT-LLM Backend" ENTRYPOINT ["./text-generation-launcher"] CMD ["--executor-worker", "/usr/local/tgi/bin/executorWorker"] + +# This is used only for the CI/CD +FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS ci-runtime +RUN apt update && apt install -y libasan8 libubsan1 libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \ + rm -rf /var/lib/{apt,dpkg,cache,log}/ && \ + pipx ensurepath && \ + pipx install --include-deps transformers tokenizers + +WORKDIR /usr/local/tgi/bin + +ENV PATH=/root/.local/share/pipx/venvs/transformers/bin/:$PATH +ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/mpi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH" +ENV TOKENIZERS_PARALLELISM=false +ENV OMPI_MCA_plm_rsh_agent="" + +COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi +COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt +COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi + +# Basically we copy from target/debug instead of target/release +COPY --from=tgi-builder /usr/src/text-generation-inference/target/debug/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher \ No newline at end of file diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt index 9c1f3436..2cbd3a07 100644 --- a/backends/trtllm/CMakeLists.txt +++ b/backends/trtllm/CMakeLists.txt @@ -1,11 +1,18 @@ cmake_minimum_required(VERSION 3.20) -if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER AND CMAKE_BUILD_TYPE STREQUAL "Debug") +if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER) find_program(CCACHE_EXECUTABLE "ccache") if (CCACHE_EXECUTABLE) message(STATUS "Using ccache") - set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}" CACHE PATH "Path to ccache" FORCE) + set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}") + set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}") + set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}") endif () +else () + message(STATUS "Using user specified cmake cxx compiler launcher: ${CMAKE_CXX_COMPILER_LAUNCHER}") + set(CMAKE_C_COMPILER_LAUNCHER "${CMAKE_CXX_COMPILER_LAUNCHER}") + set(CMAKE_CXX_COMPILER_LAUNCHER "${CMAKE_CXX_COMPILER_LAUNCHER}") + set(CMAKE_CUDA_COMPILER_LAUNCHER "${CMAKE_CXX_COMPILER_LAUNCHER}") endif () if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") @@ -21,28 +28,31 @@ include(CheckCXXCompilerFlag) option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF) option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF) +option(TGI_TRTLLM_BACKEND_BUILD_USE_LLD "Enable lld linker instead of ld" OFF) set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support") -set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path where TensorRT libraries and headers are located") +set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path rgo where TensorRT libraries and headers are located") set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE STRING "Path where TensorRT headers are located") set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located") # We are using nvidia-ml to query at runtime device information to enable some architecture-specific features find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml) +find_package(MPI REQUIRED) #### 
External dependencies #### include(cmake/json.cmake) include(cmake/spdlog.cmake) include(cmake/trtllm.cmake) -if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") +if (CMAKE_BUILD_TYPE STREQUAL "Debug") + set(TGI_TRTLLM_BACKEND_DEBUG ON) add_compile_definitions(TGI_TRTLLM_BACKEND_DEBUG=1) -endif() + add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE) +endif () -# This attempt to detect if the compiler can emit warning if it can't apply return value optimization from a function -check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO) -if(${COMPILER_SUPPORT_WARNING_ON_NVRO}) - set(CMAKE_CXX_FLAGS "{CMAKE_CXX_FLAGS} -Wnvro") -endif() +if (${TGI_TRTLLM_BACKEND_BUILD_USE_LLD}) + message(STATUS "Using lld linker") + add_link_options("-fuse-ld=lld") +endif () # Let's build TRTLLM as part of CMake add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..") @@ -55,51 +65,68 @@ add_library(tgi_trtllm_backend_impl STATIC csrc/hardware.hpp csrc/backend.hpp cs include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR}) target_include_directories(tgi_trtllm_backend_impl PRIVATE $ -# $ + # $ ) target_include_directories(tgi_trtllm_backend_impl PUBLIC "${trtllm_SOURCE_DIR}/cpp/include") target_link_libraries(tgi_trtllm_backend_impl PRIVATE CUDA::cudart CUDA::nvml) target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog) - -if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") - target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm) -else() - target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapperm) -endif () +target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper) # This install all the artifacts in CMAKE_INSTALL_PREFIX under include/ lib/ bin/ to make easy to link / find it back -install(TARGETS tgi_trtllm_backend_impl tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker) -install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB) +install(TARGETS tgi_trtllm_backend_impl) +install(TARGETS tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker) +install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} TYPE LIB) +if (NOT ${TGI_TRTLLM_BACKEND_DEBUG}) + install(FILES ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB) +endif () + #### Unit Tests #### -if (${TGI_TRTLLM_BACKEND_BUILD_TESTS}) +if (${TGI_TRTLLM_BACKEND_BUILD_TESTS} AND CMAKE_BUILD_TYPE MATCHES "Debug") message(STATUS "Building tests") + option(TGI_TRTLLM_BACKEND_ENABLE_ASAN "Enable AddressSanitizer") + option(TGI_TRTLLM_BACKEND_ENABLE_UBSAN "Enable UndefinedSanitizer") + FetchContent_Declare( Catch2 URL https://github.com/catchorg/Catch2/archive/refs/tags/v3.7.1.tar.gz ) FetchContent_MakeAvailable(Catch2) + # Detect whether the compiler can emit a warning when it cannot apply return value optimization (NRVO) to a function + check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO) + if (${COMPILER_SUPPORT_WARNING_ON_NVRO}) + message(STATUS "Enabling non-NRVO detection") + target_compile_options(tgi_trtllm_backend_impl PRIVATE "-Wnrvo") + endif () + + cmake_path(GET TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH PARENT_PATH TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH) + message(STATUS "Adding linking path: ${TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH}") + add_executable(tgi_trtllm_backend_tests tests/test_hardware.cpp tests/test_backend.cpp) + + # 
target_compile_options(tgi_trtllm_backend_tests PRIVATE -Werror) + target_link_directories(tgi_trtllm_backend_tests PRIVATE "${TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH}") target_include_directories(tgi_trtllm_backend_tests PUBLIC "${trtllm_SOURCE_DIR}/cpp/include") target_include_directories(tgi_trtllm_backend_tests PUBLIC "csrc/") target_link_libraries(tgi_trtllm_backend_tests PRIVATE ${TRTLLM_LIBS} CUDA::cudart CUDA::nvml) target_link_libraries(tgi_trtllm_backend_tests PUBLIC Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog tgi_trtllm_backend_impl) + target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper) - if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") - target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm) - else() - target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapperm) + if (${TGI_TRTLLM_BACKEND_ENABLE_ASAN}) + message(STATUS "Enabled AddressSanitizer") + target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=address) endif () - if(CMAKE_BUILD_TYPE MATCHES "Debug") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror -fsanitize=undefined -fsanitize=address") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fsanitize=undefined -fsanitize=address") - target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=undefined PUBLIC -fsanitize=address) - endif() + if (${TGI_TRTLLM_BACKEND_ENABLE_UBSAN}) + message(STATUS "Enabled UndefinedSanitizer") + target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=undefined) + endif () - list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras) - include(CTest) - include(Catch) - catch_discover_tests(tgi_trtllm_backend_tests) -endif () + install(TARGETS tgi_trtllm_backend_tests) + + # list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras) + # include(CTest) + # include(Catch) + # catch_discover_tests(tgi_trtllm_backend_tests) +endif () \ No newline at end of file diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs index d9c1aa15..c18b13a9 100644 --- a/backends/trtllm/build.rs +++ b/backends/trtllm/build.rs @@ -3,6 +3,7 @@ use pkg_config; use std::env; use std::env::consts::ARCH; use std::path::{absolute, PathBuf}; +use std::sync::LazyLock; const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"]; const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST"); @@ -12,12 +13,20 @@ const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX"); const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR"); const NCCL_ROOT_DIR: Option<&str> = option_env!("NCCL_ROOT_DIR"); +const IS_GHA_BUILD: LazyLock = LazyLock::new(|| { + option_env!("IS_GHA_BUILD").map_or(false, |value| match value.to_lowercase().as_str() { + "on" => true, + "true" => true, + "1" => true, + _ => false, + }) +}); + // Dependencies -const BACKEND_DEPS: [&str; 2] = ["tgi_trtllm_backend_impl", "tgi_trtllm_backend"]; +const BACKEND_DEPS: &str = "tgi_trtllm_backend_impl"; const CUDA_TRANSITIVE_DEPS: [&str; 4] = ["cuda", "cudart", "cublas", "nvidia-ml"]; -const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 5] = [ +const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 4] = [ ("dylib", "tensorrt_llm"), - ("static", "tensorrt_llm_executor_static"), ("dylib", "tensorrt_llm_nvrtc_wrapper"), ("dylib", "nvinfer_plugin_tensorrt_llm"), ("dylib", "decoder_attention"), @@ -32,6 +41,48 @@ macro_rules! 
probe { }; } +fn get_compiler_flag( + switch: bool, + true_case: &'static str, + false_case: &'static str, +) -> &'static str { + match switch { + true => true_case, + false => false_case, + } +} + +fn get_library_architecture() -> &'static str { + let os = env::var("CARGO_CFG_TARGET_OS").unwrap(); + let arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); + let env = env::var("CARGO_CFG_TARGET_ENV").unwrap(); + + match os.as_str() { + "linux" => { + if env != "gnu" { + panic!("unsupported linux ABI {env}, only 'gnu' is supported") + } + + match arch.as_str() { + "x86_64" => "x86_64-linux-gnu", + "aarch64" => "aarch64-linux-gnu", + _ => panic!("unsupported linux architecture {arch}"), + } + } + "windows" => { + if env != "msvc" { + panic!("unsupported windows ABI {env}, only 'msvc' is supported") + } + + match arch.as_str() { + "x86_64" => "x86_64-windows-msvc", + _ => panic!("unsupported windows architecture {arch}"), + } + } + _ => panic!("unsupported OS {os}"), + } +} + fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf, PathBuf) { // Build the backend implementation through CMake let install_path = INSTALL_PREFIX.unwrap_or("/usr/local/tgi"); @@ -54,10 +105,45 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf .env("OPT_LEVEL", opt_level) .define("CMAKE_INSTALL_PREFIX", &install_path) .define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc") - .define("Python3_ROOT_DIR", "../venv") + .define("CMAKE_LIBRARY_ARCHITECTURE", get_library_architecture()) .define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list) + .define( + "TGI_TRTLLM_BACKEND_DEBUG", + get_compiler_flag(is_debug, "ON", "OFF"), + ) .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path); + if is_debug || *IS_GHA_BUILD { + config.define("TGI_TRTLLM_BACKEND_BUILD_TESTS", "ON"); + } + + if option_env!("USE_LLD_LINKER").is_some() { + println!("cargo:warning=Using lld linker"); + config.define("TGI_TRTLLM_BACKEND_BUILD_USE_LLD", "ON"); + } + + if (is_debug && option_env!("ENABLE_ASAN").is_some()) || *IS_GHA_BUILD { + println!("cargo:warning=Enabling Address Sanitizer"); + config.define("TGI_TRTLLM_BACKEND_ENABLE_ASAN", "ON"); + } + + if (is_debug && option_env!("ENABLE_UBSAN").is_some()) || *IS_GHA_BUILD { + println!("cargo:warning=Enabling Undefined Sanitizer"); + config.define("TGI_TRTLLM_BACKEND_ENABLE_UBSAN", "ON"); + } + + if let Some(nvcc_host_compiler) = option_env!("CMAKE_CUDA_HOST_COMPILER") { + config.define("CMAKE_CUDA_HOST_COMPILER", nvcc_host_compiler); + } + + if let Some(wrapper) = option_env!("RUSTC_WRAPPER") { + println!("cargo:warning=Using caching tool: {wrapper}"); + + env::set_var("CMAKE_C_COMPILER_LAUNCHER", wrapper); + env::set_var("CMAKE_CXX_COMPILER_LAUNCHER", wrapper); + env::set_var("CMAKE_CUDA_COMPILER_LAUNCHER", wrapper); + } + // Allow to override which Python to use ... 
if let Some(python3) = option_env!("Python3_EXECUTABLE") { config.define("Python3_EXECUTABLE", python3); @@ -78,23 +164,18 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf } // Emit linkage information from the artifacts we just built - let install_lib_path = install_path.join("lib"); - - println!( - r"cargo:warning=Adding link search path: {}", - install_lib_path.display() - ); - println!(r"cargo:rustc-link-search={}", install_lib_path.display()); - + for path in ["lib", "lib64"] { + let install_lib_path = install_path.join(path); + println!( + r"cargo:warning=Adding link search path: {}", + install_lib_path.display() + ); + println!(r"cargo:rustc-link-search={}", install_lib_path.display()); + } (PathBuf::from(install_path), deps_folder) } fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) { - let ndebug = match is_debug { - true => "1", - false => "0", - }; - CFG.include_prefix = "backends/trtllm"; cxx_build::bridge("src/lib.rs") .static_flag(true) @@ -106,7 +187,10 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) { .include("/usr/local/tensorrt/include") .include("csrc/") .file("csrc/ffi.hpp") - .define("TGI_TRTLLM_BACKEND_DEBUG", ndebug) + .define( + "TGI_TRTLLM_BACKEND_DEBUG", + get_compiler_flag(is_debug, "ON", "OFF"), + ) .compile("tgi_trtllm_backend"); println!("cargo:rerun-if-changed=CMakeLists.txt"); @@ -125,6 +209,7 @@ fn main() { let build_profile = env::var("PROFILE").unwrap(); let (is_debug, opt_level) = match build_profile.as_ref() { "debug" => (true, "0"), + "dev" => (true, "0"), _ => (false, "3"), }; @@ -161,7 +246,5 @@ fn main() { }); // Backend - BACKEND_DEPS.iter().for_each(|name| { - println!("cargo:rustc-link-lib=static={}", name); - }); -} + println!("cargo:rustc-link-lib=static={}", &BACKEND_DEPS); +} \ No newline at end of file diff --git a/backends/trtllm/cmake/spdlog.cmake b/backends/trtllm/cmake/spdlog.cmake index 45e6790a..e7566cd7 100644 --- a/backends/trtllm/cmake/spdlog.cmake +++ b/backends/trtllm/cmake/spdlog.cmake @@ -4,14 +4,14 @@ set(SPDLOG_FMT_EXTERNAL OFF) # Define the level at which SPDLOG_ compilation level is defined if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") - add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG) + add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE) else () - add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO) + add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG) endif () fetchcontent_declare( spdlog -# DOWNLOAD_EXTRACT_TIMESTAMP - URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz + # DOWNLOAD_EXTRACT_TIMESTAMP + URL https://github.com/gabime/spdlog/archive/refs/tags/v1.15.0.tar.gz ) fetchcontent_makeavailable(spdlog) diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake index 4217892b..d789b1eb 100644 --- a/backends/trtllm/cmake/trtllm.cmake +++ b/backends/trtllm/cmake/trtllm.cmake @@ -14,11 +14,13 @@ message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}") set(ENABLE_UCX OFF) if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") set(FAST_BUILD ON) - set(NVTX_DISABLE OFF) + set(NVTX_DISABLE ON) + set(INDEX_RANGE_CHECK ON) else () set(FAST_BUILD OFF) set(FAST_MATH ON) - set(NVTX_DISABLE ON) + set(NVTX_DISABLE OFF) + set(INDEX_RANGE_CHECK OFF) endif () find_package(Python3 REQUIRED Interpreter) diff --git a/backends/trtllm/csrc/backend.cpp b/backends/trtllm/csrc/backend.cpp index b50044d8..2151466b 100644 --- a/backends/trtllm/csrc/backend.cpp +++ 
b/backends/trtllm/csrc/backend.cpp @@ -1,7 +1,6 @@ #include #include -#include #include "backend.hpp" #include "hardware.hpp" @@ -17,7 +16,8 @@ namespace huggingface::tgi::backends::trtllm { if (world_size > 1) { SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode"); mode = tle::CommunicationMode::kORCHESTRATOR; - orchestratorConfig = std::make_optional(true, executor_worker_path_, nullptr, true); + orchestratorConfig = std::make_optional(true, executor_worker_path_, nullptr, + true); } else { SPDLOG_INFO("Detected single engine deployment, using leader mode"); } @@ -44,21 +44,22 @@ namespace huggingface::tgi::backends::trtllm { } backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path) - : workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {} + : workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {} size_t backend_t::num_tokens_ready() const noexcept { return executor_.getNumResponsesReady(); } std::expected - backend_t::submit(std::span token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept { - SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params); - return executor_.enqueueRequest(tle::Request { + backend_t::submit(std::span token_ids, const generation_params_t g_params, + const sampling_params_t s_params) noexcept { + SPDLOG_DEBUG("Submit {:d} tokens for scheduling ({}, {})", token_ids.size(), g_params, s_params); + return executor_.enqueueRequest(tle::Request{ {token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens - static_cast(generation_params.max_new_tokens), + static_cast(g_params.max_new_tokens), true, - (tle::SamplingConfig) sampling_params, - tle::OutputConfig { /* returnLogProbs= */ true }, + (tle::SamplingConfig) s_params, + tle::OutputConfig{ /* returnLogProbs= */ true}, std::nullopt, std::nullopt, std::nullopt, diff --git a/backends/trtllm/csrc/ffi.hpp b/backends/trtllm/csrc/ffi.hpp index d0342d4b..840614bb 100644 --- a/backends/trtllm/csrc/ffi.hpp +++ b/backends/trtllm/csrc/ffi.hpp @@ -28,20 +28,62 @@ namespace huggingface::tgi::backends::trtllm { #include "backends/trtllm/src/lib.rs.h" + namespace huggingface::tgi::backends::trtllm { std::once_flag backend_initialized_flag; + constexpr finish_reason_t as_finish_reason_t(const tle::FinishReason reason) noexcept { + switch (reason) { + case tle::FinishReason::kNOT_FINISHED: + return finish_reason_t::kNOT_FINISHED; + case tle::FinishReason::kSTOP_WORDS: + return finish_reason_t::kSTOP_WORDS; + case tle::FinishReason::kEND_ID: + return finish_reason_t::kEND_ID; + case tle::FinishReason::kLENGTH: + return finish_reason_t::kLENGTH; + default: + std::unreachable(); + } + } + + static auto as_generation_step = [](const tle::Response &r) { + const auto reqId = r.getRequestId(); + if (!r.hasError()) [[likely]] { + const auto result = r.getResult(); + const auto logits = result.logProbs.value()[0]; + return generation_step_t{ + reqId, + static_cast(result.outputTokenIds[0][0]), + logits.back(), + result.isFinal, + as_finish_reason_t(result.finishReasons[0]), + false, + std::string() + }; + } else { + return generation_step_t{ + reqId, + 0, + 0.0, + true, + finish_reason_t::kNOT_FINISHED, + true, + std::move(r.getErrorMsg()) + }; + } + }; + + class tensorrt_llm_backend_t { private: backend_t inner_; public: 
tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path) - : inner_(engine_folder, executor_worker_path) {} + : inner_(engine_folder, executor_worker_path) {} - size_t num_tokens_ready() const noexcept { - return inner_.num_tokens_ready(); - } + size_t num_tokens_ready() const noexcept { return inner_.num_tokens_ready(); } request_id_t submit( rust::Slice tokens, @@ -59,13 +101,13 @@ namespace huggingface::tgi::backends::trtllm { // Submit the request to the executor and get back a potential request_id used to track request status const auto signed_tokens = std::vector(tokens.begin(), tokens.end()); const auto maybe_request_id = inner_.submit( - signed_tokens, - {max_new_tokens}, - {top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed} + signed_tokens, + {max_new_tokens}, + {top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed} ); // If we do have a value, let's return the request_id - if(maybe_request_id.has_value()) [[likely]] { + if (maybe_request_id.has_value()) [[likely]] { return *maybe_request_id; } else { SPDLOG_WARN("[FFI] Failed to submit request to the executor"); @@ -74,61 +116,45 @@ namespace huggingface::tgi::backends::trtllm { } std::unique_ptr> pull_tokens() noexcept { - if(num_tokens_ready() > 0) [[likely]] { + if (num_tokens_ready() > 0) [[likely]] { const auto responses = inner_.pull_tokens(); SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size()); - // Transform tle::Response to GenerationStep - auto steps = std::make_unique>(); - std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) { - const auto reqId = r.getRequestId(); - if (!r.hasError()) [[likely]] { - const auto result = r.getResult(); - return generation_step_t{ - reqId, - static_cast(result.outputTokenIds[0][0]), - result.logProbs.value()[0][0], - result.isFinal, - false, - std::string() - }; - } else { - return generation_step_t{ - reqId, - 0, - 0.0, - true, - true, - std::move(r.getErrorMsg()) - }; - } - }); - return steps; + + // Transform tle::Response to generation_step_t +#ifdef __cpp_lib_ranges_to_container + auto steps = responses | std::views::transform(as_generation_step) | std::ranges::to(); +#else + auto steps = std::vector(); + steps.reserve(responses.size()); + std::transform(responses.begin(), responses.end(), std::back_inserter(steps), as_generation_step); +#endif + return std::make_unique>(steps); } else { return std::make_unique>(); } } - void cancel(request_id_t requestId) noexcept { - SPDLOG_DEBUG("[FFI] cancelling request {:d}", requestId); - inner_.cancel(requestId); + void cancel(request_id_t request_id) noexcept { + SPDLOG_DEBUG("[FFI] cancelling request {:d}", request_id); + inner_.cancel(request_id); } }; void initialize_logging() { #ifndef TGI_TRTLLM_BACKEND_DEBUG if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) { - std::string log_level(TRTLLM_LOG_LEVEL_CSTR); - std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) { - return std::tolower(c); - }); + std::string log_level(TRTLLM_LOG_LEVEL_CSTR); + std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) { + return std::tolower(c); + }); - if (log_level == "debug") - spdlog::set_level(spdlog::level::debug); - else - spdlog::set_level(spdlog::level::info); - } + if (log_level == "debug") + spdlog::set_level(spdlog::level::debug); + else + 
spdlog::set_level(spdlog::level::info); + } #else spdlog::set_level(spdlog::level::debug); #endif @@ -151,11 +177,14 @@ namespace huggingface::tgi::backends::trtllm { } } - std::unique_ptr create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) { + std::unique_ptr + create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) { std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend); return std::make_unique( - std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), std::filesystem::path::format::auto_format), - std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()), std::filesystem::path::format::auto_format) + std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), + std::filesystem::path::format::auto_format), + std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()), + std::filesystem::path::format::auto_format) ); } } diff --git a/backends/trtllm/scripts/setup_sccache.py b/backends/trtllm/scripts/setup_sccache.py new file mode 100644 index 00000000..982f8c77 --- /dev/null +++ b/backends/trtllm/scripts/setup_sccache.py @@ -0,0 +1,49 @@ +from argparse import ArgumentParser + +AWS_S3_CACHING_VARIABLES = { + "AWS_ACCESS_KEY_ID": "aws_access_key_id", + "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key", + "AWS_SESSION_TOKEN": "aws_session_token", + "SCCACHE_REGION": "s3_region", + "SCCACHE_BUCKET": "s3_bucket_name", +} + +ALL_CACHING_STORAGE_VARIABLES = { + "AWS_S3_CACHING_VARIABLES" +} + + +def setup_sccache_locally(): + from os import environ + + print("Setting up Local Caching Layer") + for target in ALL_CACHING_STORAGE_VARIABLES: + for envvar in globals()[target].keys(): + if envvar in environ: + print(f"Deleted {envvar} from environment variables") + del environ[envvar] + + +def setup_sccache_for_s3(): + from os import environ + + print("Setting up AWS S3 Caching Layer") + for envvar in AWS_S3_CACHING_VARIABLES.keys(): + if not envvar in environ or not environ[envvar] or len(environ[envvar]) == 0: + print(f"Missing definition for environment variable {envvar}") + + +if __name__ == "__main__": + parser = ArgumentParser("TensorRT-LLM Build Caching Setup") + + parser.add_argument("--is-gha-build", type=str, default="FALSE", + help="Indicate if the build is from Github Actions") + + # Parse args + args = parser.parse_args() + args.is_gha_build = args.is_gha_build.lower() in {"on", "true", "1"} + + if args.is_gha_build: + setup_sccache_for_s3() + else: + setup_sccache_locally() diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs index d6acafa1..08507256 100644 --- a/backends/trtllm/src/lib.rs +++ b/backends/trtllm/src/lib.rs @@ -6,6 +6,26 @@ mod utils; #[cxx::bridge(namespace = "huggingface::tgi::backends::trtllm")] mod ffi { + #[cxx_name = "finish_reason_t"] + #[derive(Debug, Clone, Copy)] + pub enum FinishReason { + /// The request is not finished. + #[cxx_name = "kNOT_FINISHED"] + NotFinished = 0u8, + + /// The request finished because the end id was generated. + #[cxx_name = "kEND_ID"] + EndTokenId = 1u8, + + /// The request finished because a stop word was generated. + #[cxx_name = "kSTOP_WORDS"] + StopWords = 2u8, + + /// The request finished because the maximum number of tokens was reached. 
+ #[cxx_name = "kLENGTH"] + MaxLength = 3u8, + } + /// Struct used as shared type between rust and C++ to represent the result /// of a single decoding iteration #[cxx_name = "generation_step_t"] @@ -15,6 +35,7 @@ mod ffi { token_id: u32, log_prob: f32, is_final: bool, + finish_reason: FinishReason, has_error: bool, error_msg: String, } @@ -66,3 +87,17 @@ mod ffi { fn cancel(self: Pin<&mut TensorRtLlmBackendImpl>, request_id: u64); } } + +use ffi::FinishReason; +use text_generation_router::FinishReason as InferFinishReason; + +impl From for InferFinishReason { + fn from(reason: FinishReason) -> Self { + match reason { + FinishReason::StopWords => InferFinishReason::StopSequence, + FinishReason::MaxLength => InferFinishReason::Length, + FinishReason::EndTokenId => InferFinishReason::EndOfSequenceToken, + _ => panic!("Cannot convert {reason:?} to text_generation_router::FinishReason"), + } + } +} diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs index af299f7d..5af96ade 100644 --- a/backends/trtllm/src/main.rs +++ b/backends/trtllm/src/main.rs @@ -11,7 +11,7 @@ use text_generation_router::server::{ get_hub_model_info, legacy_tokenizer_handle, py_resolve_tokenizer, }; use text_generation_router::usage_stats::UsageStatsLevel; -use text_generation_router::{server, HubTokenizerConfig, Tokenizer}; +use text_generation_router::{server, Tokenizer}; /// App Configuration #[derive(Parser, Debug)] @@ -69,7 +69,7 @@ struct Args { async fn get_tokenizer( tokenizer_name: &str, - tokenizer_config_path: Option<&str>, + _tokenizer_config_path: Option<&str>, revision: Option<&str>, ) -> Option { // Parse Huggingface hub token diff --git a/backends/trtllm/tests/test_backend.cpp b/backends/trtllm/tests/test_backend.cpp index 14d92b75..f44cc03f 100644 --- a/backends/trtllm/tests/test_backend.cpp +++ b/backends/trtllm/tests/test_backend.cpp @@ -8,13 +8,13 @@ #include "backend.hpp" - - using namespace huggingface::tgi::backends::trtllm; TEST_CASE("parse generation_config.json all set", "[generation_config_t]") { - const json config_j = {{"temperature", 0.6}, {"top_p", 0.95}, {"eos_token_id", {1,2,3}}}; + const json config_j = {{"temperature", 0.6}, + {"top_p", 0.95}, + {"eos_token_id", {1, 2, 3}}}; const auto generation_config = generation_config_t(config_j); REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(0.6, 1e-6)); @@ -24,8 +24,9 @@ TEST_CASE("parse generation_config.json all set", "[generation_config_t]") REQUIRE_FALSE(generation_config.stop_words.empty()); REQUIRE(generation_config.stop_words.size() == config_j["/eos_token_id"_json_pointer].size()); - for (auto [lhs, rhs] : std::views::zip(generation_config.stop_words, std::list>{{1}, {2}, {3}})) - { + for (auto [lhs, rhs]: std::views::zip(generation_config.stop_words, std::list>{{1}, + {2}, + {3}})) { // Currently we do not support multi-tokens stop words REQUIRE(lhs.size() == 1); REQUIRE(rhs.size() == 1); @@ -35,7 +36,7 @@ TEST_CASE("parse generation_config.json all set", "[generation_config_t]") TEST_CASE("parse generation_config.json default", "[generation_config_t]") { - const json config_j = {{"eos_token_id", {1,2,3}}}; + const json config_j = {{"eos_token_id", {1, 2, 3}}}; const auto generation_config = generation_config_t(config_j); REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(1.0, 1e-6)); @@ -44,8 +45,9 @@ TEST_CASE("parse generation_config.json default", "[generation_config_t]") REQUIRE_FALSE(generation_config.stop_words.empty()); REQUIRE(generation_config.stop_words.size() == 
config_j["/eos_token_id"_json_pointer].size()); - for (auto [lhs, rhs] : std::views::zip(generation_config.stop_words, std::list>{{1}, {2}, {3}})) - { + for (auto [lhs, rhs]: std::views::zip(generation_config.stop_words, std::list>{{1}, + {2}, + {3}})) { // Currently we do not support multi-tokens stop words REQUIRE(lhs.size() == 1); REQUIRE(rhs.size() == 1);
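Note on the new finish-reason plumbing: the sketch below is a standalone illustration, not part of the patch. The enum names mirror the cxx-bridge `finish_reason_t` and the router-side conversion added in backends/trtllm/src/lib.rs, but the types here are local stand-ins (the real definitions live in the cxx bridge and in text_generation_router), and the conversion is written as a fallible function instead of the panicking `From` impl in the diff.

// Standalone sketch: `FinishReason` and `InferFinishReason` are local stand-ins for the
// cxx-bridge enum and `text_generation_router::FinishReason` referenced in the diff.
#[derive(Debug, Clone, Copy)]
enum FinishReason {
    NotFinished,
    EndTokenId,
    StopWords,
    MaxLength,
}

#[derive(Debug, PartialEq)]
enum InferFinishReason {
    Length,
    EndOfSequenceToken,
    StopSequence,
}

// Mirrors the mapping of the `From<FinishReason>` impl added in backends/trtllm/src/lib.rs,
// but made total: a not-finished step yields an error instead of a panic.
fn to_infer_finish_reason(reason: FinishReason) -> Result<InferFinishReason, &'static str> {
    match reason {
        FinishReason::StopWords => Ok(InferFinishReason::StopSequence),
        FinishReason::MaxLength => Ok(InferFinishReason::Length),
        FinishReason::EndTokenId => Ok(InferFinishReason::EndOfSequenceToken),
        FinishReason::NotFinished => Err("step is not final, no finish reason to report"),
    }
}

fn main() {
    // A final step returned by the executor with kSTOP_WORDS surfaces as a
    // stop-sequence finish reason on the router side.
    assert_eq!(
        to_infer_finish_reason(FinishReason::StopWords),
        Ok(InferFinishReason::StopSequence)
    );
    println!("{:?}", to_infer_finish_reason(FinishReason::NotFinished));
}

Only terminal steps carry a usable finish reason, which is why the patch converts the reason on the router side solely for final generation steps; the kNOT_FINISHED case is treated as unreachable there.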