diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs
index fe60769fe..0a0f6e6bf 100644
--- a/backends/trtllm/build.rs
+++ b/backends/trtllm/build.rs
@@ -4,7 +4,7 @@
 use std::env;
 use std::env::consts::ARCH;
 use std::path::{absolute, PathBuf};
-const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
+const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"];
 const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
 const CUDA_REQUIRED_VERSION: &str = "12.6";
 const MPI_REQUIRED_VERSION: &str = "4.1";
@@ -98,7 +98,6 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
     cxx_build::bridge("src/lib.rs")
         .static_flag(true)
         .std("c++23")
-        .include(deps_folder.join("fmt-src").join("include"))
         .include(deps_folder.join("spdlog-src").join("include"))
         .include(deps_folder.join("json-src").join("include"))
         .include(deps_folder.join("trtllm-src").join("cpp").join("include"))
@@ -112,7 +111,6 @@
     println!("cargo:rerun-if-changed=CMakeLists.txt");
     println!("cargo:rerun-if-changed=cmake/trtllm.cmake");
     println!("cargo:rerun-if-changed=cmake/json.cmake");
-    println!("cargo:rerun-if-changed=cmake/fmt.cmake");
     println!("cargo:rerun-if-changed=cmake/spdlog.cmake");
     println!("cargo:rerun-if-changed=csrc/backend.hpp");
     println!("cargo:rerun-if-changed=csrc/backend.cpp");
diff --git a/backends/trtllm/csrc/backend.cpp b/backends/trtllm/csrc/backend.cpp
index 362fe8783..e593f4396 100644
--- a/backends/trtllm/csrc/backend.cpp
+++ b/backends/trtllm/csrc/backend.cpp
@@ -48,7 +48,7 @@ namespace huggingface::tgi::backends::trtllm {
     }
 
     std::expected<request_id_t, backend_error_t>
-    backend_t::submit(const std::span<token_id_t> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
+    backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
         SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params);
         return executor_.enqueueRequest(tle::Request {
             {token_ids.begin(), token_ids.end()},  // Making actual copy of the tokens
diff --git a/backends/trtllm/csrc/backend.hpp b/backends/trtllm/csrc/backend.hpp
index c8f8f21cd..376513589 100644
--- a/backends/trtllm/csrc/backend.hpp
+++ b/backends/trtllm/csrc/backend.hpp
@@ -1,3 +1,4 @@
+#pragma once
 #include <cmath>
 #include <cstdint>
 #include <expected>
@@ -17,7 +18,7 @@ namespace huggingface::tgi::backends::trtllm {
     namespace tle = tensorrt_llm::executor;
     using json = nlohmann::json;
 
-    using request_id_t = uint32_t;
+    using request_id_t = uint64_t;
     using token_id_t = tle::TokenIdType;
 
     /**
@@ -35,7 +36,6 @@ namespace huggingface::tgi::backends::trtllm {
         float_t top_p;
         float_t repetition_penalty;
         float_t frequency_penalty;
-        float_t length_penalty;
         float_t temperature;
         uint64_t seed;
 
@@ -54,7 +54,7 @@
                 repetition_penalty,
                 std::nullopt,
                 frequency_penalty,
-                length_penalty
+                std::nullopt
             };
         }
     };
@@ -172,7 +172,7 @@
          */
        [[nodiscard("Discarded executor request_id needs to be assigned")]]
         std::expected<request_id_t, backend_error_t>
-        submit(std::span<token_id_t> token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept;
+        submit(std::span<const token_id_t> token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept;
         /**
          * Query the number of tokens available across all in-flight generations
          */
@@ -216,8 +216,8 @@ template <> struct fmt::formatter<huggingface::tgi::backends::trtllm::sampling_params_t> : formatter<string_view> {
     auto format(huggingface::tgi::backends::trtllm::sampling_params_t c, format_context& ctx) const -> format_context::iterator {
         return fmt::format_to(
             ctx.out(),
-            "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, length_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
-            c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.length_penalty, c.temperature, c.seed
+            "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
+            c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.temperature, c.seed
         );
     }
 };
\ No newline at end of file
diff --git a/backends/trtllm/csrc/ffi.hpp b/backends/trtllm/csrc/ffi.hpp
index b3f20b838..dc9fdd0fb 100644
--- a/backends/trtllm/csrc/ffi.hpp
+++ b/backends/trtllm/csrc/ffi.hpp
@@ -1,9 +1,18 @@
+#ifndef TGI_BACKEND_TRTLLM_FFI
+#define TGI_BACKEND_TRTLLM_FFI
+
 #include <exception>
 #include <memory>
 #include <thread>
 #include <utility>
+#include <algorithm>
+#include <ranges>
+#include <vector>
+
+#include <spdlog/spdlog.h>
+
 
 namespace rust::behavior {
     template <typename Try, typename Fail>
     static void trycatch(Try &&func, Fail &&fail) noexcept try {
         func();
@@ -13,11 +22,11 @@ namespace rust::behavior {
     }
 }
 
-#include <spdlog/spdlog.h>
-#include <tensorrt_llm/executor/executor.h>
-#include <backend.hpp>
+namespace huggingface::tgi::backends::trtllm {
+    class tensorrt_llm_backend_t;
+}
 
-#include <backends/trtllm/src/lib.rs.h>
+#include "backends/trtllm/src/lib.rs.h"
 
 namespace huggingface::tgi::backends::trtllm {
     std::once_flag backend_initialized_flag;
@@ -48,8 +57,9 @@ namespace huggingface::tgi::backends::trtllm {
             SPDLOG_TRACE(FMT_STRING("[FFI] Submitting {:d} prompt tokens to the executor"));
 
             // Submit the request to the executor and get back a potential request_id used to track request status
+            const auto signed_tokens = std::vector<int32_t>(tokens.begin(), tokens.end());
             const auto maybe_request_id = inner_.submit(
-                {tokens.data(), tokens.size()},
+                signed_tokens,
                 {max_new_tokens},
                 {top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed}
             );
@@ -63,6 +73,43 @@
             }
         }
 
+        std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexcept {
+            if(num_tokens_ready() > 0) [[likely]] {
+                const auto responses = inner_.pull_tokens();
+
+                SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());
+                // Transform tle::Response to GenerationStep
+                auto steps = std::make_unique<std::vector<generation_step_t>>();
+                std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
+                    const auto reqId = r.getRequestId();
+                    if (!r.hasError()) [[likely]] {
+                        const auto result = r.getResult();
+                        return generation_step_t{
+                            reqId,
+                            static_cast<uint32_t>(result.outputTokenIds[0][0]),
+                            result.logProbs.value()[0][0],
+                            result.isFinal,
+                            false,
+                            std::string()
+                        };
+                    } else {
+                        return generation_step_t{
+                            reqId,
+                            0,
+                            0.0,
+                            true,
+                            true,
+                            std::move(r.getErrorMsg())
+                        };
+                    }
+                });
+                return steps;
+
+            } else {
+                return std::make_unique<std::vector<generation_step_t>>();
+            }
+        }
+
         void cancel(request_id_t requestId) noexcept {
             SPDLOG_DEBUG(FMT_STRING("[FFI] cancelling request {:d}"), requestId);
             inner_.cancel(requestId);
         }
@@ -104,7 +151,7 @@
         }
     }
 
-    std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(rust::Str engines_folder, rust::Str executor_worker_path) {
+    std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
         std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);
         return std::make_unique<tensorrt_llm_backend_t>(
             std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), std::filesystem::path::format::auto_format),
@@ -112,3 +159,4 @@
             std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()), std::filesystem::path::format::auto_format)
         );
     }
 }
+#endif
\ No newline at end of file
diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp
deleted file mode 100644
index 0a92c050f..000000000
--- a/backends/trtllm/src/ffi.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-//
-// Created by mfuntowicz on 6/30/24.
-//
-#pragma once
-
-#include <algorithm>
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <filesystem>
-#include <iterator>
-#include <memory>
-#include <vector>
-
-#include <spdlog/spdlog.h>
-#include "backends/trtllm/include/ffi.h"
-
-
-huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl(
-        const std::string_view &engineFolder,
-        const std::string_view &executorWorker
-) : TensorRtLlmBackend(engineFolder, executorWorker) {}
-
-
-uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
-        rust::Slice<const uint32_t> tokens,
-        uint32_t maxNewTokens,
-        int32_t topK,
-        float_t topP,
-        float_t temperature,
-        float_t repetition_penalty,
-        float_t frequency_penalty,
-        uint64_t seed) {
-
-    // This will copy all the items from the initial slice
-    std::vector<int32_t> tokens_(tokens.begin(), tokens.end());
-    return TensorRtLlmBackend::Submit(
-            std::move(tokens_), maxNewTokens, topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
-}
-
-std::unique_ptr<std::vector<huggingface::tgi::backends::GenerationStep>>
-huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() {
-    const auto responses = TensorRtLlmBackend::PullNewTokens();
-
-    auto steps = std::make_unique<std::vector<GenerationStep>>();
-    steps->reserve(responses.size());
-
-#ifndef NDEBUG
-    SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses->size());
-#endif
-
-    // Transform tle::Response to GenerationStep
-    std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
-        const auto reqId = r.getRequestId();
-        if (!r.hasError()) {
-            const auto result = r.getResult();
-            return GenerationStep{
-                    reqId,
-                    static_cast<uint32_t>(result.outputTokenIds[0][0]),
-                    result.logProbs.value()[0][0],
-                    result.isFinal,
-                    false,
-                    std::string()
-            };
-        } else {
-            return GenerationStep{
-                    reqId,
-                    0,
-                    0.0,
-                    true,
-                    true,
-                    std::move(r.getErrorMsg())
-            };
-        }
-    });
-
-    return steps;
-}
-
-std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>
-huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) {
-    SPDLOG_INFO("Creating TensorRT-LLM Backend");
-    // Unconditionally call this to initialize and discover TRTLLM plugins
-    InitializeBackend();
-
-    const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
-    const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
-    return std::make_unique<TensorRtLlmBackendImpl>(std::move(enginePath), std::move(executorPath));
-}
diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs
index edd8caff1..7b60593df 100644
--- a/backends/trtllm/src/lib.rs
+++ b/backends/trtllm/src/lib.rs
@@ -4,10 +4,11 @@ pub mod errors;
 mod looper;
 mod utils;
 
-#[cxx::bridge(namespace = "huggingface::tgi::backends")]
+#[cxx::bridge(namespace = "huggingface::tgi::backends::trtllm")]
 mod ffi {
     /// Struct used as shared type between rust and C++ to represent the result
     /// of a single decoding iteration
+    #[cxx_name = "generation_step_t"]
     #[derive(Debug, Clone)]
     pub struct GenerationStep {
         request_id: u64,
@@ -19,9 +20,10 @@ mod ffi {
     }
 
     unsafe extern "C++" {
-        include!("backends/trtllm/src/ffi.cpp");
+        include!("backends/trtllm/csrc/ffi.hpp");
 
         /// Represent an instance of the underlying TensorRT-LLM backend
+        #[cxx_name = "tensorrt_llm_backend_t"]
         type TensorRtLlmBackendImpl;
 
         /// Create an instance backed behind a std::unique_ptr to manage the lifespan of the backend
@@ -38,21 +40,18 @@ mod ffi {
         /// ```
        ///
         /// ```
-        #[rust_name = "create_tensorrt_llm_backend"]
-        fn CreateTensorRtLlmBackend(
+        fn create_backend_from_engine_folder(
             engine_folder: &str,
             executor_worker: &str,
         ) -> Result<UniquePtr<TensorRtLlmBackendImpl>>;
 
-        #[rust_name = "num_responses_ready"]
-        fn NumResponsesReady(self: &TensorRtLlmBackendImpl) -> usize;
+        fn num_tokens_ready(self: &TensorRtLlmBackendImpl) -> usize;
 
-        #[rust_name = "submit"]
-        fn Submit(
+        fn submit(
             self: Pin<&mut TensorRtLlmBackendImpl>,
             tokens: &[u32],
             max_new_tokens: u32,
-            top_k: i32,
+            top_k: u32,
             top_p: f32,
             temperature: f32,
             repetition_penalty: f32,
@@ -60,8 +59,7 @@
             seed: u64,
         ) -> Result<u64>;
 
-        #[rust_name = "pull_tokens"]
-        fn PullTokens(
+        fn pull_tokens(
             self: Pin<&mut TensorRtLlmBackendImpl>,
         ) -> Result<UniquePtr<CxxVector<GenerationStep>>>;
     }
diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs
index e26155c16..8e9ff49d5 100644
--- a/backends/trtllm/src/looper.rs
+++ b/backends/trtllm/src/looper.rs
@@ -22,7 +22,7 @@ use text_generation_router::validation::{Chunk, ValidGenerateRequest};
 use text_generation_router::{FinishReason, Token};
 
 use crate::errors::TensorRtLlmBackendError;
-use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl};
+use crate::ffi::{create_backend_from_engine_folder, GenerationStep, TensorRtLlmBackendImpl};
 use crate::utils::first_line;
 
 type InferResult<T> = Result<T, InferError>;
@@ -93,7 +93,7 @@
                 match backend.pin_mut().submit(
                     &input_ids.unwrap(), // This is checked beforehand in validate()
                     stopping_params.max_new_tokens,
-                    generation_params.top_k as i32,
+                    generation_params.top_k,
                     generation_params.top_p,
                     generation_params.temperature,
                     generation_params.repetition_penalty,
@@ -120,7 +120,7 @@
             }
         }
 
-        if backend.num_responses_ready() > 0 {
+        if backend.num_tokens_ready() > 0 {
             match backend.pin_mut().pull_tokens() {
                 Ok(responses) => {
                     // Iterate through all the decoded token
@@ -298,7 +298,7 @@
         let (post_processor_sender, post_processor_receiver) = unbounded_channel();
 
         // Create the FFI backend
-        let backend = create_tensorrt_llm_backend(&engine_folder, &executor_worker_path)
+        let backend = create_backend_from_engine_folder(&engine_folder, &executor_worker_path)
            .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;
 
         // Executor looper is responsible for scheduling and pulling requests state at regular interval
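
Usage sketch (illustrative, not part of the patch): the snippet below shows how the renamed bridge functions fit together after this change, mirroring the flow of `executor_status_looper`. It assumes in-crate code (`mod ffi` is private), and the function name, paths, token ids, and sampling values are all placeholders.

```rust
use crate::ffi::create_backend_from_engine_folder;

fn drive_one_request() -> Result<(), cxx::Exception> {
    // Placeholder paths: a compiled TensorRT-LLM engine folder and the
    // executorWorker binary shipped with TensorRT-LLM.
    let mut backend =
        create_backend_from_engine_folder("/data/engine", "/usr/local/bin/executorWorker")?;

    // submit() schedules one request; request_id_t is now u64 end-to-end,
    // and top_k is passed as u32 (the `as i32` cast in looper.rs is gone).
    let request_id: u64 = backend.pin_mut().submit(
        &[1u32, 2, 3], // prompt token ids (placeholders)
        128,           // max_new_tokens
        50,            // top_k
        0.95,          // top_p
        1.0,           // temperature
        1.0,           // repetition_penalty
        0.0,           // frequency_penalty
        2014,          // seed
    )?;

    // Poll-and-pull, as the looper does: only pull when tokens are ready.
    if backend.num_tokens_ready() > 0 {
        let steps = backend.pin_mut().pull_tokens()?;
        // Each GenerationStep (generation_step_t on the C++ side) carries one
        // decoded token, its log-prob, and the final/error flags for a request.
        println!("request {request_id}: pulled {} steps", steps.len());
    }
    Ok(())
}
```

Since `pull_tokens` drains steps for every in-flight request, a real caller would dispatch the returned steps by their `request_id`, as the looper's post-processing path does.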