diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h
index cbfaacf1..abba906e 100644
--- a/backends/trtllm/include/backend.h
+++ b/backends/trtllm/include/backend.h
@@ -20,6 +20,9 @@
 using json = nlohmann::json;
 namespace tle = tensorrt_llm::executor;
+
+#define CAST_SIZETYPE(x) static_cast<tle::SizeType32>(x)
+
 namespace huggingface::tgi::backends {
     using RequestId = tle::IdType;
     using TokenId = tle::TokenIdType;
diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp
index a9d37bc1..ee8171bc 100644
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@@ -164,10 +164,9 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
 #endif
     const auto sampling = GetSamplingConfig(topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed);
-    const auto maxNewTokensChecked_ = static_cast<tle::SizeType32>(maxNewTokensChecked);
 
     // Build the request
-    auto request = tle::Request{tokens, maxNewTokensChecked_, true, sampling, OUTPUT_CONFIG};
+    auto request = tle::Request{tokens, CAST_SIZETYPE(maxNewTokensChecked), true, sampling, OUTPUT_CONFIG};
     request.setStopWords(stopWords);
 
     // Submit to the executor for batching
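
Note (not part of the patch): a minimal, self-contained sketch of what the new CAST_SIZETYPE macro does. The tle::SizeType32 alias below is a local stand-in for tensorrt_llm::executor::SizeType32 (a 32-bit integer in the executor API); the value 128 and the main() harness are illustrative only.

#include <cstdint>

// Stand-in for the real alias provided by tensorrt_llm::executor.
namespace tle { using SizeType32 = std::int32_t; }

// Same definition as in the patch: an explicit narrowing cast to the
// executor's 32-bit size type, spelled once instead of at each call site.
#define CAST_SIZETYPE(x) static_cast<tle::SizeType32>(x)

int main() {
    const std::uint64_t maxNewTokensChecked = 128;
    // Replaces the ad-hoc local `maxNewTokensChecked_` removed in backend.cpp.
    const auto maxTokens = CAST_SIZETYPE(maxNewTokensChecked);
    return maxTokens == 128 ? 0 : 1;
}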