diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h
index b19b5d7e..d84bc253 100644
--- a/backends/trtllm/include/backend.h
+++ b/backends/trtllm/include/backend.h
@@ -22,7 +22,6 @@ namespace tle = tensorrt_llm::executor;
 namespace huggingface::tgi::backends {
     using RequestId = tle::IdType;
     using TokenId = tle::TokenIdType;
-    using TokenStreamingCallback = void(tle::TokenIdType);
 
     /**
      * Initialize all the components required by TRTLLM.
@@ -38,6 +37,23 @@ namespace huggingface::tgi::backends {
      */
    tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
 
+    /**
+     * Get the sampling configuration from the parameters provided by TGI
+     * @param topK
+     * @param topP
+     * @param temperature
+     * @param seed
+     * @param beamWidth
+     * @return
+     */
+    tle::SamplingConfig GetSamplingConfig(
+            uint32_t topK,
+            float_t topP,
+            float_t temperature,
+            uint64_t seed,
+            std::optional beamWidth
+    );
+
     /**
      *
      */
@@ -52,19 +68,19 @@ namespace huggingface::tgi::backends {
                 const std::filesystem::path &executorWorker
         );
 
-        /***
+        /**
          * Indicate if the backend is ready to accept incoming request
          * @return true if ready, false otherwise
          */
         [[nodiscard]] bool IsReady() const;
 
-        /***
+        /**
          * Query the executor for the number of token available for pulling
         * @return
          */
         [[nodiscard]] size_t NumResponsesReady() const;
 
-        /***
+        /**
          * Submit a new generation task to the executor
         * @param tokens
         * @param maxNewTokens
@@ -82,14 +98,14 @@ namespace huggingface::tgi::backends {
                 uint64_t seed
         );
 
-        /***
+        /**
         *
         * @param requestId The request id to poll the generation results
         * @return
         */
        std::vector<tle::Response> Poll(RequestId requestId);
 
-        /***
+        /**
         * Stop the underlying executor
         */
        void Shutdown();
diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp
index aca718c4..161dea5a 100644
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@@ -18,13 +18,11 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
 
     // Get the compute capabilities of the current hardware
     nvmlDevice_t device;
-    int32_t cudaComputeCapabilitiesMajor = 0, cudaComputeCapabilitiesMinor = 0;
+    int32_t cudaComputeMajor = 0, cudaComputeMinor = 0;
     if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
         SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
-        if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeCapabilitiesMajor, &cudaComputeCapabilitiesMinor) ==
-            NVML_SUCCESS) {
-            SPDLOG_INFO(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeCapabilitiesMajor,
-                        cudaComputeCapabilitiesMinor);
+        if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeMajor, &cudaComputeMinor) == NVML_SUCCESS) {
+            SPDLOG_DEBUG(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeMajor, cudaComputeMinor);
         }
     }
 
@@ -51,10 +49,30 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
 
     // Define some configuration variables
     execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
-    execConfig.setEnableChunkedContext(cudaComputeCapabilitiesMajor >= 8);
+    execConfig.setEnableChunkedContext(cudaComputeMajor >= 8);
     return execConfig;
 }
 
+tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(
+        uint32_t topK,
+        float_t topP,
+        float_t temperature,
+        uint64_t seed,
+        std::optional beamWidth = std::nullopt) {
+    return tle::SamplingConfig(
+            beamWidth.value_or(1),
+            topK,
+            topP,
+            std::nullopt,
+            std::nullopt,
+            std::nullopt,
+            seed,
+            std::nullopt,
+            temperature,
+            std::nullopt
+    );
+}
+
 huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
         const std::filesystem::path &enginesFolder,
         const std::filesystem::path &executorWorker
@@ -84,40 +102,32 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
         const float_t temperature,
         const uint64_t seed
 ) {
-#ifndef NDEBUG
-    SPDLOG_INFO(
+#ifdef NDEBUG
+    SPDLOG_DEBUG(
             FMT_STRING("Submitting inference over {:d} tokens to the executor ({:d} already in-flight)"),
             tokens.size(),
             executor.getLatestIterationStats().back().numActiveRequests
     );
 #else
-    SPDLOG_INFO(
+    SPDLOG_DEBUG(
             FMT_STRING("Submitting inference [{}] to the executor ({:d} already in-flight)"),
             fmt::join(tokens, ", "),
-            executor.getLatestIterationStats().back().numActiveRequests
+            executor.getLatestIterationStats().front().numActiveRequests
     );
 #endif
 
-    const auto sampling = tle::SamplingConfig{
-            1,
-            topK,
-            topP,
-            std::nullopt,
-            std::nullopt,
-            std::nullopt,
-            seed,
-            std::nullopt,
-            temperature,
-            std::nullopt,
-    };
-    const auto output = tle::OutputConfig{false, false, false};
+    const auto maxNumTokens = config["max_num_tokens"_json_pointer].get();
+    const auto maxNewTokens = static_cast(std::max(1ul, maxNumTokens - tokens.size()));
+
+    const auto sampling = GetSamplingConfig(topK, topP, temperature, seed);
+    const auto output = tle::OutputConfig(false, false, false, true, false);
     return executor.enqueueRequest(
-            tle::Request{tokens, std::numeric_limits::max(), true, sampling, output});
+            tle::Request{tokens, maxNewTokens, true, sampling, output});
 }
 
 [[nodiscard("Generated tokens result must be used")]]
 std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType requestId) {
-    SPDLOG_INFO("Polling status for request {}", requestId);
+    SPDLOG_DEBUG(FMT_STRING("Polling status for request {:d}"), requestId);
     return executor.awaitResponses(requestId);
 }
 
diff --git a/backends/trtllm/src/backend.rs b/backends/trtllm/src/backend.rs
index b59e2006..d3f56ad9 100644
--- a/backends/trtllm/src/backend.rs
+++ b/backends/trtllm/src/backend.rs
@@ -156,7 +156,7 @@ impl Backend for TensorRtLlmBackend {
                 );
                 info!("Releasing lock for submit");
 
-                return request_id;
+                request_id
             })
             .await;
 
diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp
index 2920eda0..43d6c9f2 100644
--- a/backends/trtllm/src/ffi.cpp
+++ b/backends/trtllm/src/ffi.cpp
@@ -4,7 +4,9 @@
 #pragma once
 
 #include
+#include
 #include
+#include
 #include
 #include
 
@@ -25,31 +27,30 @@ uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
         rust::Slice tokens, int32_t topK, float_t topP, float_t temperature, uint64_t seed) {
 
     // This will copy all the items from the initial slice
-    std::vector tokens_(tokens.size());
-    tokens_.assign(tokens.begin(), tokens.end());
-
+    std::vector tokens_(std::make_move_iterator(tokens.begin()), std::make_move_iterator(tokens.end()));
     return TensorRtLlmBackend::Submit(std::move(tokens_), topK, topP, temperature, seed);
 }
 
-size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(const uint64_t requestId,
-                                                                        rust::Box ctx,
-                                                                        rust::Fn, uint32_t, float_t, bool)> callback) {
+size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(
+        const uint64_t requestId,
+        rust::Box ctx,
+        rust::Fn, uint32_t, float_t, bool)> callback) {
 
-    SPDLOG_INFO("Entering StreamTokens");
+    size_t numTokens = 0;
     for (const auto &item: Poll(requestId)) {
         if (!item.hasError()) {
-            SPDLOG_INFO("\tStreamTokens -> Decoding token...");
+            SPDLOG_DEBUG("\tStreamTokens -> Decoding token...");
             const auto decoded = item.getResult();
-            SPDLOG_INFO("\tStreamTokens -> Successfully read decoded token ({})", decoded.outputTokenIds[0].size());
 
             const auto token = decoded.outputTokenIds[0][0];
             const auto isFinal = decoded.isFinal;
-//            const auto logProb = decoded.logProbs.value()[0][0];
-            const auto logProb = 0.0;
+            const auto logProb = decoded.logProbs.value()[0][0];
 
-            SPDLOG_INFO(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal);
+            ++numTokens;
+
+            SPDLOG_DEBUG(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal);
             callback(std::move(ctx), token, logProb, isFinal);
-            SPDLOG_INFO("\tStreamTokens -> Post callback");
+            SPDLOG_DEBUG("\tStreamTokens -> Post callback");
         } else {
             // TODO : Return rest::Result with error
             SPDLOG_WARN("\tStreamTokens -> Got error while decoding: {}", item.getErrorMsg());
@@ -57,8 +58,7 @@ size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(const ui
         }
     }
 
-    SPDLOG_INFO("Exiting StreamTokens");
-    return 0;
+    return numTokens;
 }
 
 std::unique_ptr