feat(backend): delete previous backend impl

2025-09-11 20:34:54 +00:00 · 2024-12-01 23:49:25 +01:00 · 2024-12-01 23:49:25 +01:00 · df99164dc1
commit df99164dc1
parent 25c6bbe142
6 changed files with 16 additions and 492 deletions
--- a/backends/trtllm/CMakeLists.txt
+++ b/backends/trtllm/CMakeLists.txt
@ -34,12 +34,16 @@ include(cmake/json.cmake)
 include(cmake/spdlog.cmake)
 include(cmake/trtllm.cmake)
-# This attempt to detect if the compiler can emit warning if it can't apply return value optimization from a function
+if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
-check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO)
+    add_compile_definitions(TGI_TRTLLM_BACKEND_DEBUG=1)
 # Append the NRVO diagnostic to the existing flags (instead of replacing them) when the compiler supports it
 if(COMPILER_SUPPORT_WARNING_ON_NVRO)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wnrvo")
 endif()
 # Detect whether the compiler can emit a warning when it cannot apply named return value optimization (NRVO) to a function
 #check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO)
 #if(${COMPILER_SUPPORT_WARNING_ON_NVRO})
 #    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wnrvo")
 #endif()
 # Let's build TRTLLM as part of CMake
 add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")
--- a/backends/trtllm/build.rs
+++ b/backends/trtllm/build.rs
@ -90,15 +90,16 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
    CFG.include_prefix = "backends/trtllm";
    cxx_build::bridge("src/lib.rs")
        .static_flag(true)
        .std("c++23")
        .include(deps_folder.join("fmt-src").join("include"))
        .include(deps_folder.join("spdlog-src").join("include"))
        .include(deps_folder.join("json-src").join("include"))
        .include(deps_folder.join("trtllm-src").join("cpp").join("include"))
        .include("/usr/local/cuda/include")
        .include("/usr/local/tensorrt/include")
-        .file("src/ffi.cpp")
+        .include("csrc/")
-        .std("c++20")
+        .file("csrc/ffi.hpp")
-        .define("NDEBUG", ndebug)
+        .define("TGI_TRTLLM_BACKEND_DEBUG", ndebug)
        .compile("tgi_trtllm_backend");
    println!("cargo:rerun-if-changed=CMakeLists.txt");
@ -106,10 +107,10 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
    println!("cargo:rerun-if-changed=cmake/json.cmake");
    println!("cargo:rerun-if-changed=cmake/fmt.cmake");
    println!("cargo:rerun-if-changed=cmake/spdlog.cmake");
-    println!("cargo:rerun-if-changed=include/backend.h");
+    println!("cargo:rerun-if-changed=csrc/backend.hpp");
-    println!("cargo:rerun-if-changed=lib/backend.cpp");
+    println!("cargo:rerun-if-changed=csrc/backend.cpp");
-    println!("cargo:rerun-if-changed=include/ffi.h");
+    println!("cargo:rerun-if-changed=csrc/hardware.hpp");
-    println!("cargo:rerun-if-changed=src/ffi.cpp");
+    println!("cargo:rerun-if-changed=csrc/ffi.hpp");
 }
 fn main() {
--- a/backends/trtllm/include/backend.h
+++ b/backends/trtllm/include/backend.h
@ -1,144 +0,0 @@
 //
 // Created by Morgan Funtowicz on 6/30/24.
 //
 #ifndef TGI_TRTLLM_BACKEND_H
 #define TGI_TRTLLM_BACKEND_H
 #include <array>
 #include <cmath>
 #include <filesystem>
 #include <span>
 #include <vector>
 #include <nlohmann/json.hpp>
 #include <tensorrt_llm/runtime/common.h>
 #include <tensorrt_llm/executor/executor.h>
 #include <tensorrt_llm/plugins/api/tllmPlugin.h>
 using json = nlohmann::json;
 namespace tle = tensorrt_llm::executor;
 #define CAST_SIZETYPE(x) static_cast<tle::SizeType32>(x)
 namespace huggingface::tgi::backends {
    // Identifier handed back by the executor for an enqueued request
    using RequestId = tle::IdType;
    // Token identifier type used by the TensorRT-LLM executor
    using TokenId = tle::TokenIdType;
    // Output configuration attached to every request built by TensorRtLlmBackend::Submit
    // NOTE(review): the meaning of each positional boolean comes from tle::OutputConfig — confirm against the pinned TensorRT-LLM version
    const static auto OUTPUT_CONFIG = tle::OutputConfig(true, false, false, true, false);
    // Logged when fewer GPUs are detected than the engine's world size requires
    constexpr auto FMT_NOT_ENOUGH_GPUS = FMT_STRING(
            "Not enough GPUs to allocate requested model (detected: {:d}, required: {:d})");
    // Debug message emitted on submission: prompt tokens and number of active requests
    constexpr auto FMT_EXECUTOR_STATS = FMT_STRING(
            "Submitting inference [{}] to the executor ({:d} already in-flight)");
    // Debug message describing the sampling parameters of a submitted request
    constexpr auto FMT_SAMPLING_CONFIG = FMT_STRING(
            "Sampling: topK={:d}, topP={:.1f}, temperature={:.1f}, repetition_penalty={:.1f}, frequency_penalty={:.1f}, seed={:d}");
    /**
     * Initialize all the components required by TRTLLM (NVML, TensorRT-LLM plugins and logging)
     * and log the executor version along with the number of detected GPUs.
     * It is required to call this function before attempting to load any engine
     */
    void InitializeBackend();
    /**
     * Initialize logging mechanism.
     * Builds without NDEBUG always log at debug level. Builds with NDEBUG read the TRTLLM_LOG_LEVEL
     * environment variable: "debug" (case-insensitive) selects debug, any other value selects info,
     * and an unset variable leaves the current level untouched.
     */
    void InitializeLogging();
    /**
     * Build the executor configuration used to load an engine: single beam, parallel configuration derived
     * from the engine's world size, KV cache configuration, chunked context when the detected compute
     * capability major version is >= 8, and the max-utilization capacity scheduler policy.
     * @param config TensorRT-LLM configuration object; "/pretrained_config/mapping/world_size" is read from it
     * @param workerPath Path to the "executorWorker" provided by TensorRT-LLM when using orchestrator mode
     * @return Executor configuration ready to be given to tle::Executor
     */
    tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
    /**
     * Select the communication mode of the executor from the number of ranks of the engine.
     * A world size greater than 1 selects orchestrator mode (spawning workers from workerPath),
     * anything else selects leader mode. MPI is used as communication type in both cases.
     * @param worldSize Number of ranks the engine was built for
     * @param workerPath Path to the "executorWorker" binary, only used in orchestrator mode
     * @return Parallel configuration to install on the executor configuration
     */
    tle::ParallelConfig GetParallelConfig(size_t worldSize, std::string workerPath) noexcept;
    /**
     * Get the sampling configuration from the parameters provided by TGI.
     * The resulting configuration always uses a single beam.
     * @param topK Top-k sampling parameter
     * @param topP Top-p sampling parameter
     * @param temperature Sampling temperature
     * @param repetition_penalty Repetition penalty
     * @param frequency_penalty Frequency penalty
     * @param seed Random seed used for sampling
     * @return Sampling configuration to attach to a request
     */
    tle::SamplingConfig GetSamplingConfig(
            uint32_t topK,
            float_t topP,
            float_t temperature,
            float_t repetition_penalty,
            float_t frequency_penalty,
            uint64_t seed
    ) noexcept;
    /**
     * Attempt to retrieve the stop words from the "eos_token_id" array of a generation_config.json file.
     * Each EOS token id becomes a single-token stop word.
     * @param generationConfigPath Path to the generation_config.json file
     * @return One entry per EOS token id, or std::nullopt if the file doesn't exist or the entry is not an array
     */
    std::optional<std::list<std::vector<TokenId>>>
    GetStopWordsFromConfig(const std::filesystem::path &generationConfigPath) noexcept;
    /**
     * Wrapper around a TensorRT-LLM executor: loads an engine folder, submits generation requests
     * and retrieves the responses produced by the executor.
     */
    class TensorRtLlmBackend {
    private:
        // Content of the engine's config.json
        const json config;
        // Underlying TensorRT-LLM executor; declared after config because its initialization reads config
        tle::Executor executor;
        /** Frequently accessed variables cached here **/
        // Value of "/build_config/max_num_tokens" read from config.json
        uint32_t maxNumTokens;
        // Stop words discovered from generation_config.json (empty if none could be read)
        std::list<std::vector<TokenId>> stopWords;
    public:
        /**
         * Load the engine and create the executor
         * @param engineFolder Folder containing the engine, its config.json and optionally generation_config.json
         * @param executorWorker Path to the "executorWorker" binary used in orchestrator mode
         */
        explicit TensorRtLlmBackend(
                const std::filesystem::path &engineFolder,
                const std::filesystem::path &executorWorker
        );
        /**
         * Query the executor for the number of token available for pulling
         * @return Number of responses reported as ready by the executor
         */
        [[nodiscard]] size_t NumResponsesReady() const;
        /**
         * Submit a new generation task to the executor
         * @param tokens Prompt token ids
         * @param maxNewTokens Requested number of tokens to generate; capped by the cached max_num_tokens minus the prompt length
         * @param topK Top-k sampling parameter
         * @param topP Top-p sampling parameter
         * @param temperature Sampling temperature
         * @param repetitionPenalty Repetition penalty
         * @param frequencyPenalty Frequency penalty
         * @param seed Random seed used for sampling
         * @return Request id related to this generation for reference
         */
        [[nodiscard]] RequestId Submit(
                const std::vector<TokenId> &tokens,
                uint32_t maxNewTokens,
                int32_t topK,
                float_t topP,
                float_t temperature,
                float_t repetitionPenalty,
                float_t frequencyPenalty,
                uint64_t seed
        );
        /**
         * Retrieve the responses made available by the executor
         * NOTE(review): presumably blocks until at least one response is ready — confirm against tle::Executor::awaitResponses
         * @return Responses returned by the executor
         */
        [[nodiscard]] std::vector<tle::Response> PullNewTokens();
    };
 }
 #endif //TGI_TRTLLM_BACKEND_H
--- a/backends/trtllm/include/ffi.h
+++ b/backends/trtllm/include/ffi.h
@ -1,75 +0,0 @@
 //
 // Created by mfuntowicz on 7/11/24.
 //
 #ifndef TGI_TRTLLM_BACKEND_FFI_H
 #define TGI_TRTLLM_BACKEND_FFI_H
 #include <cmath>
 #include <cstddef>
 #include <exception>
 #include <memory>
 #include <string_view>
 #include "backend.h"
 namespace huggingface::tgi::backends {
    class TensorRtLlmBackendImpl;
 }
 // Template to support returning error from TllmException back to Rust in a Result<>
 #include <tensorrt_llm/common/tllmException.h>
 namespace rust::behavior {
    template<typename Try, typename Fail>
    static void trycatch(Try &&func, Fail &&fail) noexcept try {
        func();
    } catch (tensorrt_llm::common::TllmException &e) {
        fail(e.what());
    }
 }
 #include "backends/trtllm/src/lib.rs.h"
 namespace huggingface::tgi::backends {
    /**
     * Specialization of TensorRtLlmBackend exposing signatures based on the types of the Rust bridge
     * (rust::Slice, GenerationStep).
     * NOTE(review): the member definitions are not part of this header — descriptions below are
     * inferred from the declarations and should be confirmed against the implementation.
     */
    class TensorRtLlmBackendImpl : public TensorRtLlmBackend {
    public:
        /***
         * Create the backend from string views rather than filesystem paths
         * @param engineFolder Folder containing the engine to load
         * @param executorWorker Path to the "executorWorker" binary
         */
        TensorRtLlmBackendImpl(const std::string_view &engineFolder, const std::string_view &executorWorker);
        /***
         * Submit a new generation request
         * @param tokens Prompt token ids
         * @param maxNewTokens Requested number of tokens to generate
         * @param topK Top-k sampling parameter
         * @param topP Top-p sampling parameter
         * @param temperature Sampling temperature
         * @param repetition_penalty Repetition penalty
         * @param frequency_penalty Frequency penalty
         * @param seed Random seed used for sampling
         * @return Request id to use to refer to the request's generation result later on
         */
        [[nodiscard("returned request id should be used to refer to the request's generation result later on")]]
        uint64_t
        Submit(rust::Slice<const uint32_t> tokens, uint32_t maxNewTokens,
               int32_t topK, float_t topP, float_t temperature,
               float_t repetition_penalty, float_t frequency_penalty, uint64_t seed);
        /***
         * Retrieve newly available generation results
         * @return Heap-allocated vector of GenerationStep (presumably one per response pulled from the executor — TODO confirm)
         */
        std::unique_ptr<std::vector<GenerationStep>> PullTokens();
    };
    /***
    * Factory creating a TensorRtLlmBackendImpl
    * @param engineFolder Folder containing the engine to load
    * @param executorWorker Path to the "executorWorker" binary
    * @return Owning pointer to the newly created backend
    */
    std::unique_ptr<TensorRtLlmBackendImpl> CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker);
 }
 #endif //TGI_TRTLLM_BACKEND_FFI_H
--- a/backends/trtllm/include/hardware.h
+++ b/backends/trtllm/include/hardware.h
@ -1,59 +0,0 @@
 //
 // Created by mfuntowicz on 7/23/24.
 //
 #ifndef TGI_TRTLLM_BACKEND_HARDWARE_H
 #define TGI_TRTLLM_BACKEND_HARDWARE_H
 #include <cstdint>
 #include <limits>
 #include <fmt/base.h>
 #include <spdlog/spdlog.h>
 #include <nvml.h>
 namespace huggingface::hardware::cuda {
 #define AMPERE_SM_MAJOR 8
 #define HOPPER_SM_MAJOR 9
    /**
     * CUDA compute capability (major.minor) of a device, with helpers comparing the
     * major version against the AMPERE_SM_MAJOR and HOPPER_SM_MAJOR thresholds
     */
    struct CudaComputeCapabilities {
        int32_t major;
        int32_t minor;
        /** True when the major version is at least AMPERE_SM_MAJOR */
        [[nodiscard]] constexpr bool IsPostAmpere() const {
            return !(major < AMPERE_SM_MAJOR);
        }
        /** True when the major version is at least HOPPER_SM_MAJOR */
        [[nodiscard]] constexpr bool IsPostHopper() const {
            return !(major < HOPPER_SM_MAJOR);
        }
    };
    /**
     * Query NVML for the compute capabilities of the device at index 0.
     * Declared inline because it is defined in a header: including the header from more than
     * one translation unit must not produce duplicate definitions.
     * @return Detected capabilities, or {0, 0} when either NVML query fails
     */
    inline CudaComputeCapabilities GetCudaComputeCapabilities() {
        // Get the compute capabilities of the current hardware
        nvmlDevice_t device;
        CudaComputeCapabilities capabilities{0, 0};
        if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
            SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
            if (nvmlDeviceGetCudaComputeCapability(device, &capabilities.major, &capabilities.minor) == NVML_SUCCESS) {
                SPDLOG_INFO("Detected sm_{:d}{:d} compute capabilities", capabilities.major, capabilities.minor);
            }
        }
        return capabilities;
    }
    /**
     * Return the number of GPUs reported by NVML.
     * Declared inline because it is defined in a header.
     * @return Number of devices, or std::nullopt when the NVML query fails
     */
    inline std::optional<size_t> GetNumDevices() {
        uint32_t numGpus = 0;
        if (nvmlDeviceGetCount_v2(&numGpus) != NVML_SUCCESS) {
            return std::nullopt;
        }
        return static_cast<size_t>(numGpus);
    }
 }
 #endif //TGI_TRTLLM_BACKEND_HARDWARE_H
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@ -1,203 +0,0 @@
 #include <cstdlib>
 #include <fstream>
 #include <fmt/ranges.h>
 #include <spdlog/spdlog.h>
 #include <nvml.h>
 #include "backend.h"
 #include "hardware.h"
 /**
  * Configure the spdlog level.
  * Without NDEBUG the level is always debug. With NDEBUG the TRTLLM_LOG_LEVEL environment variable
  * is consulted: "debug" (case-insensitive) selects debug, any other value selects info, and an
  * unset variable leaves the level untouched.
  */
 void huggingface::tgi::backends::InitializeLogging() {
 #ifndef NDEBUG
    spdlog::set_level(spdlog::level::debug);
 #else
    const auto rawLevel = std::getenv("TRTLLM_LOG_LEVEL");
    if (!rawLevel) {
        // Nothing requested: keep the level spdlog is currently using
        return;
    }
    std::string requested(rawLevel);
    for (auto &ch: requested) {
        // Go through unsigned char, std::tolower is undefined for negative values
        ch = std::tolower(static_cast<unsigned char>(ch));
    }
    const auto level = requested == "debug" ? spdlog::level::debug : spdlog::level::info;
    spdlog::set_level(level);
 #endif
 }
 void huggingface::tgi::backends::InitializeBackend() {
    SPDLOG_INFO("Initializing Backend...");
    nvmlInit_v2();
    initTrtLlmPlugins();
    InitializeLogging();
    SPDLOG_INFO("Backend Executor Version: {}", tle::version());
    const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
    if (numGpus.has_value()) {
        SPDLOG_INFO("Detected {:d} Nvidia GPU(s)", numGpus.value());
    } else {
        SPDLOG_WARN("Failed to detected Nvidia GPU(s) on the system");
    }
 }
 /**
  * Pick the communication mode from the number of ranks: orchestrator mode (workers spawned
  * from workerPath) when worldSize > 1, leader mode otherwise. MPI is used in both cases.
  */
 [[nodiscard]]
 tle::ParallelConfig
 huggingface::tgi::backends::GetParallelConfig(const size_t worldSize, const std::string workerPath) noexcept {
    if (worldSize <= 1) {
        SPDLOG_INFO("Detected single engine deployment, using leader mode");
        // No orchestrator needed for a single engine
        const std::optional<tle::OrchestratorConfig> noOrchestrator = std::nullopt;
        return tle::ParallelConfig(
                tle::CommunicationType::kMPI,
                tle::CommunicationMode::kLEADER,
                std::nullopt,
                std::nullopt,
                noOrchestrator
        );
    }
    SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
    const std::optional<tle::OrchestratorConfig> orchestrator =
            std::make_optional<tle::OrchestratorConfig>(true, workerPath, nullptr, true);
    return tle::ParallelConfig(
            tle::CommunicationType::kMPI,
            tle::CommunicationMode::kORCHESTRATOR,
            std::nullopt,
            std::nullopt,
            orchestrator
    );
 }
 /**
  * Assemble the executor configuration for an engine.
  * @param config Content of the engine's config.json
  * @param workerPath Path to the "executorWorker" binary, forwarded to GetParallelConfig
  * @return Configuration with a single beam, the parallel setup matching the engine's world size,
  *         a KV cache configuration, chunked context when the GPU major compute capability is >= 8
  *         and the max-utilization capacity scheduler policy
  */
 [[nodiscard]]
 tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
    tle::ExecutorConfig executorConfig(/* maxBeamWidth = */ 1);
    // Hardware features decide which runtime options can be turned on
    const auto capabilities = huggingface::hardware::cuda::GetCudaComputeCapabilities();
    // The number of ranks selects between leader (single engine) and orchestrator (sharded engine) mode
    const auto &worldSizeNode = config["/pretrained_config/mapping/world_size"_json_pointer];
    const auto numRanks = worldSizeNode.get<size_t>();
    executorConfig.setParallelConfig(GetParallelConfig(numRanks, workerPath));
    // Remaining runtime options
    executorConfig.setKvCacheConfig(tle::KvCacheConfig(true));
    executorConfig.setEnableChunkedContext(capabilities.IsPostAmpere());
    executorConfig.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
    return executorConfig;
 }
 /**
  * Translate the sampling parameters provided by TGI into a TensorRT-LLM sampling configuration.
  * Arguments are positional; the slot right after temperature is left unset instead of receiving
  * the temperature a second time, which silently truncated a float into an integer parameter.
  * NOTE(review): slot names below follow the tle::SamplingConfig constructor — confirm against the pinned TensorRT-LLM version.
  * @return Sampling configuration using a single beam
  */
 tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(
        const uint32_t topK,
        const float_t topP,
        const float_t temperature,
        const float_t repetition_penalty,
        const float_t frequency_penalty,
        const uint64_t seed) noexcept {
    return tle::SamplingConfig(
            1,  // TGI only use a single beam
            topK,
            topP,
            std::nullopt,  // topPMin
            std::nullopt,  // topPResetIds
            std::nullopt,  // topPDecay
            seed,
            temperature,
            std::nullopt,  // minimum number of tokens: not exposed by TGI
            std::nullopt,  // beamSearchDiversityRate
            repetition_penalty,
            std::nullopt,  // presencePenalty
            frequency_penalty
    );
 }
 /**
  * Read the EOS token ids from a generation_config.json file and expose each of them as a
  * single-token stop word.
  * The function is noexcept, so filesystem and JSON errors are handled here rather than
  * being allowed to terminate the process.
  * @param generationConfigPath Path to the generation_config.json file
  * @return One stop word per EOS token id, or std::nullopt when the file is missing, cannot be
  *         parsed, or has no "eos_token_id" array
  */
 std::optional<std::list<std::vector<huggingface::tgi::backends::TokenId>>>
 huggingface::tgi::backends::GetStopWordsFromConfig(
        const std::filesystem::path &generationConfigPath) noexcept {
    // Non-throwing overload: a filesystem error is treated as "file not available"
    std::error_code existsError;
    if (!std::filesystem::exists(generationConfigPath, existsError)) {
        SPDLOG_INFO("No EOS tokens found, generation_config.json doesn't exist");
        return std::nullopt;
    }
    try {
        std::ifstream generationConfigStream(generationConfigPath);
        const auto generationConfig = json::parse(generationConfigStream);
        const auto eosPointer = "/eos_token_id"_json_pointer;
        // contains() guards the lookup: the entry is optional in generation_config.json
        if (generationConfig.contains(eosPointer) && generationConfig.at(eosPointer).is_array()) {
            const auto &eosTokenIds = generationConfig.at(eosPointer);
            SPDLOG_INFO(FMT_STRING("Found {:d} EOS tokens"), eosTokenIds.size());
            std::list<std::vector<huggingface::tgi::backends::TokenId>> stopWords;
            for (const auto &tokenIdObj: eosTokenIds) {
                stopWords.push_back({tokenIdObj.get<tle::TokenIdType>()});
            }
            return stopWords;
        }
        SPDLOG_INFO("Invalid EOS tokens entry found (not an array)");
    } catch (const json::exception &e) {
        // Malformed JSON or an EOS entry which is not an integer
        SPDLOG_WARN("Failed to read EOS tokens from generation_config.json: {}", e.what());
    }
    return std::nullopt;
 }
 /**
  * Load the engine's config.json, create the executor from it and cache the values read
  * frequently afterwards (max number of tokens, stop words).
  * @param enginesFolder Folder containing the engine, config.json and optionally generation_config.json
  * @param executorWorker Path to the "executorWorker" binary, used when the engine is sharded
  */
 huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
        const std::filesystem::path &enginesFolder,
        const std::filesystem::path &executorWorker
 ) :
        config(json::parse(std::ifstream(enginesFolder / "config.json"))),
        executor(enginesFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY,
                 GetExecutorConfig(config, executorWorker.string())) {
    const auto engineVersion = config["/version"_json_pointer].get<std::string_view>();
    SPDLOG_INFO(FMT_STRING("Engine (version={})"), engineVersion);
    // The engine needs one GPU per rank
    const auto requiredGpus = config["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();
    const auto availableGpus = huggingface::hardware::cuda::GetNumDevices().value_or(0);
    if (requiredGpus > availableGpus) {
        SPDLOG_CRITICAL(FMT_NOT_ENOUGH_GPUS, availableGpus, requiredGpus);
        // todo : raise exception to catch on rust side
    }
    // Values read on every submission are cached
    maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get<uint32_t>();
    // Stop words come from generation_config.json when it is available, none otherwise
    stopWords = GetStopWordsFromConfig(enginesFolder / "generation_config.json")
            .value_or(std::list<std::vector<TokenId>>());
 }
 /**
  * Ask the executor how many responses are ready to be pulled.
  * Builds without NDEBUG additionally log the count when it is positive.
  */
 [[nodiscard("Returned number of requests needs to be consumed")]]
 size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const {
    const auto readyCount = executor.getNumResponsesReady();
 #ifndef NDEBUG
    if (readyCount > 0) {
        SPDLOG_INFO(FMT_STRING("Num responses ready: {:d}"), readyCount);
    }
 #endif
    return readyCount;
 }
 /**
  * Build a generation request from the prompt and sampling parameters and enqueue it.
  * @param tokens Prompt token ids
  * @param maxNewTokens Requested number of tokens to generate, capped by the budget left once the
  *        prompt is accounted for (max_num_tokens - prompt length, never below zero)
  * @param topK Top-k sampling parameter
  * @param topP Top-p sampling parameter
  * @param temperature Sampling temperature
  * @param repetitionPenalty Repetition penalty
  * @param frequencyPenalty Frequency penalty
  * @param seed Random seed used for sampling
  * @return Request id assigned by the executor
  */
 [[nodiscard("Returned request id needs to be provided back to gather generated tokens")]]
 tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
        const std::vector<tle::TokenIdType> &tokens,
        const uint32_t maxNewTokens,
        const int32_t topK,
        const float_t topP,
        const float_t temperature,
        const float_t repetitionPenalty,
        const float_t frequencyPenalty,
        const uint64_t seed
 ) {
    // Saturate at zero: an unsigned subtraction would wrap around when the prompt exceeds the budget
    const auto promptLength = tokens.size();
    const uint32_t remainingBudget =
            promptLength < maxNumTokens ? static_cast<uint32_t>(maxNumTokens - promptLength) : uint32_t{0};
    const auto maxNewTokensChecked = std::min(maxNewTokens, remainingBudget);
 #ifndef NDEBUG
    {
        const auto iterations = executor.getLatestIterationStats();
        // No statistics may be available yet, reading the front of an empty container is undefined
        if (!iterations.empty()) {
            SPDLOG_DEBUG(FMT_EXECUTOR_STATS, fmt::join(tokens, ", "), iterations.front().numActiveRequests);
        }
        SPDLOG_DEBUG(FMT_SAMPLING_CONFIG, topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed);
        SPDLOG_DEBUG(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked);
    }
 #endif
    const auto sampling = GetSamplingConfig(topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed);
    // Build the request
    auto request = tle::Request{tokens, CAST_SIZETYPE(maxNewTokensChecked), true, sampling, OUTPUT_CONFIG};
    request.setStopWords(stopWords);
    // Submit to the executor for batching
    return executor.enqueueRequest(request);
 }
 /**
  * Hand over the responses returned by the executor's awaitResponses() call.
  */
 std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::PullNewTokens() {
    auto responses = executor.awaitResponses();
    return responses;
 }