diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt
index 552537f0..32c215e3 100644
--- a/backends/trtllm/CMakeLists.txt
+++ b/backends/trtllm/CMakeLists.txt
@@ -34,12 +34,16 @@ include(cmake/json.cmake)
 include(cmake/spdlog.cmake)
 include(cmake/trtllm.cmake)
 
-# This attempt to detect if the compiler can emit warning if it can't apply return value optimization from a function
-check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO)
-if(${COMPILER_SUPPORT_WARNING_ON_NVRO})
-    set(CMAKE_CXX_FLAGS "{CMAKE_CXX_FLAGS} -Wnvro")
+if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
+    add_compile_definitions(TGI_TRTLLM_BACKEND_DEBUG=1)
 endif()
 
+# This attempt to detect if the compiler can emit warning if it can't apply return value optimization from a function
+#check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO)
+#if(${COMPILER_SUPPORT_WARNING_ON_NVRO})
+#    set(CMAKE_CXX_FLAGS "{CMAKE_CXX_FLAGS} -Wnvro")
+#endif()
+
 # Let's build TRTLLM as part of CMake
 add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")
 
diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs
index 98501926..9970d84f 100644
--- a/backends/trtllm/build.rs
+++ b/backends/trtllm/build.rs
@@ -90,15 +90,16 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
     CFG.include_prefix = "backends/trtllm";
     cxx_build::bridge("src/lib.rs")
         .static_flag(true)
+        .std("c++23")
         .include(deps_folder.join("fmt-src").join("include"))
         .include(deps_folder.join("spdlog-src").join("include"))
         .include(deps_folder.join("json-src").join("include"))
         .include(deps_folder.join("trtllm-src").join("cpp").join("include"))
         .include("/usr/local/cuda/include")
         .include("/usr/local/tensorrt/include")
-        .file("src/ffi.cpp")
-        .std("c++20")
-        .define("NDEBUG", ndebug)
+        .include("csrc/")
+        .file("csrc/ffi.hpp")
+        .define("TGI_TRTLLM_BACKEND_DEBUG", ndebug)
         .compile("tgi_trtllm_backend");
 
     println!("cargo:rerun-if-changed=CMakeLists.txt");
@@ -106,10 +107,10 @@
     println!("cargo:rerun-if-changed=cmake/json.cmake");
     println!("cargo:rerun-if-changed=cmake/fmt.cmake");
     println!("cargo:rerun-if-changed=cmake/spdlog.cmake");
-    println!("cargo:rerun-if-changed=include/backend.h");
-    println!("cargo:rerun-if-changed=lib/backend.cpp");
-    println!("cargo:rerun-if-changed=include/ffi.h");
-    println!("cargo:rerun-if-changed=src/ffi.cpp");
+    println!("cargo:rerun-if-changed=csrc/backend.hpp");
+    println!("cargo:rerun-if-changed=csrc/backend.cpp");
+    println!("cargo:rerun-if-changed=csrc/hardware.hpp");
+    println!("cargo:rerun-if-changed=csrc/ffi.hpp");
 }
 
 fn main() {
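
Note on the build changes above: a Debug CMake build now defines TGI_TRTLLM_BACKEND_DEBUG=1, and build.rs forwards the same macro through cxx_build instead of toggling NDEBUG. The sketch below is illustrative only — the new csrc/ sources are not shown in this hunk, and the function and template names are made up — and shows how such a dedicated macro can gate verbose paths, mirroring the NDEBUG-guarded logging the old lib/backend.cpp used:

// Illustrative sketch, not part of the PR: debug-only logging keyed on the new
// TGI_TRTLLM_BACKEND_DEBUG macro rather than on the absence of NDEBUG.
#include <cstddef>
#include <spdlog/spdlog.h>

template <typename Executor>
size_t NumResponsesReadyExample(const Executor &executor) {
#ifdef TGI_TRTLLM_BACKEND_DEBUG
    const auto numResponses = executor.getNumResponsesReady();
    if (numResponses > 0) SPDLOG_INFO("Num responses ready: {:d}", numResponses);
    return numResponses;
#else
    return executor.getNumResponsesReady();
#endif
}
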
diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h
deleted file mode 100644
index d23f6288..00000000
--- a/backends/trtllm/include/backend.h
+++ /dev/null
@@ -1,144 +0,0 @@
-//
-// Created by Morgan Funtowicz on 6/30/24.
-//
-
-#ifndef TGI_TRTLLM_BACKEND_H
-#define TGI_TRTLLM_BACKEND_H
-
-#include
-#include
-#include
-#include
-#include
-
-#include
-
-#include
-#include
-#include
-
-using json = nlohmann::json;
-namespace tle = tensorrt_llm::executor;
-
-
-#define CAST_SIZETYPE(x) static_cast<tle::SizeType32>(x)
-
-namespace huggingface::tgi::backends {
-    using RequestId = tle::IdType;
-    using TokenId = tle::TokenIdType;
-
-    const static auto OUTPUT_CONFIG = tle::OutputConfig(true, false, false, true, false);
-    constexpr auto FMT_NOT_ENOUGH_GPUS = FMT_STRING(
-            "Not enough GPUs to allocate requested model (detected: {:d}, required: {:d})");
-    constexpr auto FMT_EXECUTOR_STATS = FMT_STRING(
-            "Submitting inference [{}] to the executor ({:d} already in-flight)");
-    constexpr auto FMT_SAMPLING_CONFIG = FMT_STRING(
-            "Sampling: topK={:d}, topP={:.1f}, temperature={:.1f}, repetition_penalty={:.1f}, frequency_penalty={:.1f}, seed={:d}");
-
-    /**
-     * Initialize all the components required by TRTLLM.
-     * It is required to call this function before attempting to load any engine
-     */
-    void InitializeBackend();
-
-    /**
-     * Initialize logging mechanism
-     */
-    void InitializeLogging();
-
-
-    /**
-     *
-     * @param config TensorRT-LLM configuration object
-     * @param workerPath Path to the "executorWorker" provided by TensorRT-LLM when using orchestrator mode
-     * @return
-     */
-    tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
-
-    /**
-     *
-     * @param worldSize
-     * @param workerPath
-     * @return
-     */
-    tle::ParallelConfig GetParallelConfig(size_t worldSize, std::string workerPath) noexcept;
-
-    /**
-     * Get the sampling configuration from the parameters provided by TGI
-     * @param topK
-     * @param topP
-     * @param temperature
-     * @param repetition_penalty
-     * @param frequency_penalty
-     * @param seed
-     * @return
-     */
-    tle::SamplingConfig GetSamplingConfig(
-            uint32_t topK,
-            float_t topP,
-            float_t temperature,
-            float_t repetition_penalty,
-            float_t frequency_penalty,
-            uint64_t seed
-    ) noexcept;
-
-    /**
-     * Attempt to retrieve the
-     * @param generationConfigPath
-     * @return
-     */
-    std::optional<std::list<std::vector<TokenId>>>
-    GetStopWordsFromConfig(const std::filesystem::path &generationConfigPath) noexcept;
-
-    /**
-     *
-     */
-    class TensorRtLlmBackend {
-    private:
-        const json config;
-        tle::Executor executor;
-
-        /** Frequently accessed variables cached here **/
-        uint32_t maxNumTokens;
-        std::list<std::vector<TokenId>> stopWords;
-
-    public:
-        explicit TensorRtLlmBackend(
-                const std::filesystem::path &engineFolder,
-                const std::filesystem::path &executorWorker
-        );
-
-        /**
-         * Query the executor for the number of token available for pulling
-         * @return
-         */
-        [[nodiscard]] size_t NumResponsesReady() const;
-
-        /**
-         * Submit a new generation task to the executor
-         * @param tokens
-         * @param topK
-         * @param topP
-         * @param temperature
-         * @param repetitionPenalty
-         * @param frequencyPenalty
-         * @param seed
-         * @return Request id related to this generation for reference
-         */
-        [[nodiscard]] RequestId Submit(
-                const std::vector<TokenId> &tokens,
-                uint32_t maxNewTokens,
-                int32_t topK,
-                float_t topP,
-                float_t temperature,
-                float_t repetitionPenalty,
-                float_t frequencyPenalty,
-                uint64_t seed
-        );
-
-        [[nodiscard]] std::vector<tle::Response> PullNewTokens();
-    };
-}
-
-#endif //TGI_TRTLLM_BACKEND_H
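
For context, the deleted backend.h exposed a submit-then-poll API around the TensorRT-LLM executor. Below is a minimal driver sketch, illustrative only: the paths, token ids and sampling values are placeholders, and a real caller would decode the returned tle::Response objects instead of merely counting them.

// Illustrative usage of the (now removed) TensorRtLlmBackend API declared above.
#include <vector>
#include "backend.h"

int main() {
    using namespace huggingface::tgi::backends;

    InitializeBackend();  // must be called before loading any engine

    // Placeholder paths: engines folder and the executorWorker binary shipped by TensorRT-LLM.
    TensorRtLlmBackend backend("/data/engines", "/usr/local/tgi/bin/executorWorker");

    const std::vector<TokenId> prompt = {1, 15043, 3186};  // placeholder token ids
    const auto requestId = backend.Submit(prompt, /* maxNewTokens = */ 128,
                                          /* topK = */ 10, /* topP = */ 0.95f,
                                          /* temperature = */ 0.8f,
                                          /* repetitionPenalty = */ 1.0f,
                                          /* frequencyPenalty = */ 0.0f,
                                          /* seed = */ 2024);
    (void) requestId;  // a real caller keeps it to route streamed tokens back to the request

    // Poll the executor and drain whatever responses are ready.
    size_t drained = 0;
    while (drained == 0) {
        if (backend.NumResponsesReady() == 0) continue;  // a real caller would sleep/yield here
        const auto responses = backend.PullNewTokens();  // std::vector<tle::Response>
        drained = responses.size();
    }
    return 0;
}
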
diff --git a/backends/trtllm/include/ffi.h b/backends/trtllm/include/ffi.h
deleted file mode 100644
index 449bcd4d..00000000
--- a/backends/trtllm/include/ffi.h
+++ /dev/null
@@ -1,75 +0,0 @@
-//
-// Created by mfuntowicz on 7/11/24.
-//
-
-#ifndef TGI_TRTLLM_BACKEND_FFI_H
-#define TGI_TRTLLM_BACKEND_FFI_H
-
-#include
-#include
-#include
-#include "backend.h"
-
-namespace huggingface::tgi::backends {
-    class TensorRtLlmBackendImpl;
-}
-
-// Template to support returning error from TllmException back to Rust in a Result<>
-#include
-
-namespace rust::behavior {
-    template<typename Try, typename Fail>
-    static void trycatch(Try &&func, Fail &&fail) noexcept try {
-        func();
-    } catch (tensorrt_llm::common::TllmException &e) {
-        fail(e.what());
-    }
-}
-
-#include "backends/trtllm/src/lib.rs.h"
-
-namespace huggingface::tgi::backends {
-
-    class TensorRtLlmBackendImpl : public TensorRtLlmBackend {
-    public:
-        /***
-         *
-         * @param engineFolder
-         * @param executorWorker
-         */
-        TensorRtLlmBackendImpl(const std::string_view &engineFolder, const std::string_view &executorWorker);
-
-        /***
-         *
-         * @param tokens
-         * @param maxNewTokens
-         * @param topK
-         * @param topP
-         * @param temperature
-         * @param repetition_penalty
-         * @param frequency_penalty
-         * @param seed
-         * @return
-         */
-        [[nodiscard("returned request id should be used to refer to the request's generation result later on")]]
-        uint64_t
-        Submit(rust::Slice<const uint32_t> tokens, uint32_t maxNewTokens,
-               int32_t topK, float_t topP, float_t temperature,
-               float_t repetition_penalty, float_t frequency_penalty, uint64_t seed);
-
-        /***
-         *
-         * @return
-         */
-        std::unique_ptr<std::vector<GenerationStep>> PullTokens();
-    };
-
-    /***
-     *
-     * @param engineFolder
-     * @return
-     */
-    std::unique_ptr<TensorRtLlmBackendImpl> CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker);
-}
-
-#endif //TGI_TRTLLM_BACKEND_FFI_H
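
The rust::behavior::trycatch specialization above is the cxx hook that lets the generated bridge return Result<…> to Rust: any tensorrt_llm::common::TllmException thrown on the C++ side is caught and surfaced as Err instead of aborting the process. Below is a hypothetical sketch of the kind of thin wrapper the deleted src/ffi.cpp provided around TensorRtLlmBackend::Submit; the real implementation is removed elsewhere in this PR and is not reproduced here.

// Hypothetical wrapper body, for illustration only.
#include <cmath>
#include <vector>
#include "ffi.h"

uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
        rust::Slice<const uint32_t> tokens, uint32_t maxNewTokens,
        int32_t topK, float_t topP, float_t temperature,
        float_t repetition_penalty, float_t frequency_penalty, uint64_t seed) {
    // Copy the borrowed Rust slice into the std::vector<TokenId> the backend expects.
    const std::vector<TokenId> tokens_(tokens.begin(), tokens.end());

    // Any TllmException thrown below is converted into a Rust Err(...) by the
    // rust::behavior::trycatch specialization declared in ffi.h.
    return TensorRtLlmBackend::Submit(tokens_, maxNewTokens, topK, topP, temperature,
                                      repetition_penalty, frequency_penalty, seed);
}
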
diff --git a/backends/trtllm/include/hardware.h b/backends/trtllm/include/hardware.h
deleted file mode 100644
index 9633495f..00000000
--- a/backends/trtllm/include/hardware.h
+++ /dev/null
@@ -1,59 +0,0 @@
-//
-// Created by mfuntowicz on 7/23/24.
-//
-
-#ifndef TGI_TRTLLM_BACKEND_HARDWARE_H
-#define TGI_TRTLLM_BACKEND_HARDWARE_H
-
-#include
-#include
-#include
-#include
-#include
-
-namespace huggingface::hardware::cuda {
-
-#define AMPERE_SM_MAJOR 8
-#define HOPPER_SM_MAJOR 9
-
-    /**
-     * Store information about the version of the CUDA Compute Capabilities detected on the device
-     */
-    struct CudaComputeCapabilities {
-        int32_t major;
-        int32_t minor;
-
-        [[nodiscard]] constexpr bool IsPostAmpere() const { return major >= AMPERE_SM_MAJOR; }
-
-        [[nodiscard]] constexpr bool IsPostHopper() const { return major >= HOPPER_SM_MAJOR; }
-    };
-
-    CudaComputeCapabilities GetCudaComputeCapabilities() {
-        // Get the compute capabilities of the current hardware
-        nvmlDevice_t device;
-        CudaComputeCapabilities capabilities{0, 0};
-        if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
-            SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
-            if (nvmlDeviceGetCudaComputeCapability(device, &capabilities.major, &capabilities.minor) == NVML_SUCCESS) {
-                SPDLOG_INFO("Detected sm_{:d}{:d} compute capabilities", capabilities.major, capabilities.minor);
-            }
-        }
-
-        return capabilities;
-    }
-
-    /**
-     * Return the number of GPU detected. If no GPU is detected, return size_t::max()
-     * @return
-     */
-    std::optional<size_t> GetNumDevices() {
-        uint32_t numGpus = 0;
-        if (nvmlDeviceGetCount_v2(&numGpus) == NVML_SUCCESS) {
-            return std::optional(numGpus);
-        } else {
-            return std::nullopt;
-        }
-    }
-}
-
-#endif //TGI_TRTLLM_BACKEND_HARDWARE_H
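
The helpers above assume NVML has already been initialized; in the old code that happened in InitializeBackend() via nvmlInit_v2(). A small standalone check follows (illustrative; hardware.h defines its functions in the header, so this is a single translation unit):

// Illustrative: query the same NVML-backed helpers the backend uses to size
// deployments and to enable Ampere-or-newer features.
#include <nvml.h>
#include <spdlog/spdlog.h>
#include "hardware.h"

int main() {
    nvmlInit_v2();  // the helpers assume NVML is initialized, as InitializeBackend() does

    const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
    const auto caps = huggingface::hardware::cuda::GetCudaComputeCapabilities();

    SPDLOG_INFO("GPUs detected: {:d}", numGpus.value_or(0));
    // GetExecutorConfig() uses the same predicate to decide whether chunked context is enabled.
    SPDLOG_INFO("Post-Ampere (chunked context eligible): {}", caps.IsPostAmpere());

    nvmlShutdown();
    return 0;
}
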
diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp
deleted file mode 100644
index 4dd41de0..00000000
--- a/backends/trtllm/lib/backend.cpp
+++ /dev/null
@@ -1,203 +0,0 @@
-#include
-#include
-
-#include
-#include
-#include
-
-#include "backend.h"
-#include "hardware.h"
-
-
-void huggingface::tgi::backends::InitializeLogging() {
-#ifdef NDEBUG
-    if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
-        std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
-        std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
-            return std::tolower(c);
-        });
-
-        if (log_level == "debug")
-            spdlog::set_level(spdlog::level::debug);
-        else
-            spdlog::set_level(spdlog::level::info);
-    }
-#else
-    spdlog::set_level(spdlog::level::debug);
-#endif
-}
-
-void huggingface::tgi::backends::InitializeBackend() {
-    SPDLOG_INFO("Initializing Backend...");
-    nvmlInit_v2();
-    initTrtLlmPlugins();
-
-    InitializeLogging();
-
-    SPDLOG_INFO("Backend Executor Version: {}", tle::version());
-    const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
-    if (numGpus.has_value()) {
-        SPDLOG_INFO("Detected {:d} Nvidia GPU(s)", numGpus.value());
-    } else {
-        SPDLOG_WARN("Failed to detected Nvidia GPU(s) on the system");
-    }
-}
-
-[[nodiscard]]
-tle::ParallelConfig
-huggingface::tgi::backends::GetParallelConfig(const size_t worldSize, const std::string workerPath) noexcept {
-    auto mode = tle::CommunicationMode::kLEADER;
-    std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;
-
-    if (worldSize > 1) {
-        SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
-        mode = tle::CommunicationMode::kORCHESTRATOR;
-        orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, workerPath, nullptr, true);
-    } else {
-        SPDLOG_INFO("Detected single engine deployment, using leader mode");
-    }
-
-    return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
-}
-
-[[nodiscard]]
-tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
-    tle::ExecutorConfig execConfig(/* maxBeamWidth = */ 1);
-
-    // Retrieve the compute capabilities to enable some options at runtime
-    const auto computeCapabilities = huggingface::hardware::cuda::GetCudaComputeCapabilities();
-
-    // Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
-    const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();
-    execConfig.setParallelConfig(GetParallelConfig(worldSize, workerPath));
-
-    // Define some configuration variables
-    execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
-    execConfig.setEnableChunkedContext(computeCapabilities.IsPostAmpere());
-    execConfig.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
-    return execConfig;
-}
-
-tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(
-        const uint32_t topK,
-        const float_t topP,
-        const float_t temperature,
-        const float_t repetition_penalty,
-        const float_t frequency_penalty,
-        const uint64_t seed) noexcept {
-
-    return tle::SamplingConfig(
-            1, // TGI only use a single beam
-            topK,
-            topP,
-            std::nullopt,
-            std::nullopt,
-            std::nullopt,
-            seed,
-            temperature,
-            temperature,
-            std::nullopt,
-            repetition_penalty,
-            std::nullopt,
-            frequency_penalty
-    );
-}
-
-std::optional<std::list<std::vector<huggingface::tgi::backends::TokenId>>>
-huggingface::tgi::backends::GetStopWordsFromConfig(
-        const std::filesystem::path &generationConfigPath) noexcept {
-    if (exists(generationConfigPath)) {
-        const auto generationConfig = json::parse(std::ifstream(generationConfigPath));
-        if (const auto eosTokenIds = generationConfig["/eos_token_id"_json_pointer]; eosTokenIds.is_array()) {
-            SPDLOG_INFO(FMT_STRING("Found {:d} EOS tokens"), eosTokenIds.size());
-            std::list<std::vector<TokenId>> stopWords(eosTokenIds.size());
-
-            const auto to_single_token = [](const auto tokenIdObj) -> decltype(stopWords)::value_type {
-                return {tokenIdObj.template get<TokenId>()};
-            };
-
-            std::transform(eosTokenIds.cbegin(), eosTokenIds.cend(), stopWords.begin(), to_single_token);
-            return stopWords;
-        } else {
-            SPDLOG_INFO("Invalid EOS tokens entry found (not an array)");
-        }
-    } else {
-        SPDLOG_INFO("No EOS tokens found, generation_config.json doesn't exist");
-    }
-
-    return std::nullopt;
-}
-
-huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
-        const std::filesystem::path &enginesFolder,
-        const std::filesystem::path &executorWorker
-) :
-        config(json::parse(std::ifstream(enginesFolder / "config.json"))),
-        executor(enginesFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY,
-                 GetExecutorConfig(config, executorWorker.string())) {
-
-    SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get());
-
-    // Ensure we have enough GPUs on the system
-    const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();
-    const auto numGpus = huggingface::hardware::cuda::GetNumDevices().value_or(0);
-    if (numGpus < worldSize) {
-        SPDLOG_CRITICAL(FMT_NOT_ENOUGH_GPUS, numGpus, worldSize);
-        // todo : raise exception to catch on rust side
-    }
-
-    // Cache variables
-    maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get<uint32_t>();
-
-    // Attempt to discover stopWords from the generation_config.json
-    const auto generationConfigPath = enginesFolder / "generation_config.json";
-    stopWords = GetStopWordsFromConfig(generationConfigPath).value_or(std::list<std::vector<TokenId>>());
-}
-
-[[nodiscard("Returned number of requests needs to be consumed")]]
-size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const {
-#ifdef NDEBUG
-    return executor.getNumResponsesReady();
-#else
-    const auto numResponses = executor.getNumResponsesReady();
-    if (numResponses > 0) SPDLOG_INFO(FMT_STRING("Num responses ready: {:d}"), numResponses);
-    return numResponses;
-#endif
-}
-
-[[nodiscard("Returned request id needs to be provided back to gather generated tokens")]]
-tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
-        const std::vector<TokenId> &tokens,
-        const uint32_t maxNewTokens,
-        const int32_t topK,
-        const float_t topP,
-        const float_t temperature,
-        const float_t repetitionPenalty,
-        const float_t frequencyPenalty,
-        const uint64_t seed
-) {
-    const auto maxNewTokensChecked = std::min(maxNewTokens, static_cast<uint32_t>(maxNumTokens - tokens.size()));
-#ifndef NDEBUG
-    {
-        const auto &iterations = executor.getLatestIterationStats();
-        const auto &lastIteration = iterations.front();
-
-        SPDLOG_DEBUG(FMT_EXECUTOR_STATS, fmt::join(tokens, ", "), lastIteration.numActiveRequests);
-        SPDLOG_DEBUG(FMT_SAMPLING_CONFIG, topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed);
-        SPDLOG_DEBUG(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked);
-    }
-#endif
-
-    const auto sampling = GetSamplingConfig(topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed);
-
-    // Build the request
-    auto request = tle::Request{tokens, CAST_SIZETYPE(maxNewTokensChecked), true, sampling, OUTPUT_CONFIG};
-    request.setStopWords(stopWords);
-
-    // Submit to the executor for batching
-    return executor.enqueueRequest(request);
-}
-
-std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::PullNewTokens() {
-    return executor.awaitResponses();
-}
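
For reference, the executor configuration in the deleted backend.cpp is driven entirely by the engine's config.json, read through nlohmann json pointers: world_size selects leader vs orchestrator mode and max_num_tokens caps max_new_tokens per request. A standalone sketch against a made-up config snippet:

// Standalone sketch of the config.json reads used above; the JSON below is a
// made-up, minimal stand-in for a real TensorRT-LLM engine config.
#include <cstdint>
#include <cstdio>
#include <nlohmann/json.hpp>

int main() {
    using json = nlohmann::json;

    const auto config = json::parse(R"({
        "version": "x.y.z",
        "pretrained_config": { "mapping": { "world_size": 2 } },
        "build_config": { "max_num_tokens": 8192 }
    })");

    const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();
    const auto maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get<uint32_t>();

    // worldSize > 1 -> orchestrator mode over MPI (spawning executorWorker processes),
    // otherwise leader mode, mirroring GetParallelConfig() above.
    std::printf("world_size=%zu max_num_tokens=%u orchestrator=%d\n",
                worldSize, maxNumTokens, worldSize > 1);
    return 0;
}
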
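Similarly, GetStopWordsFromConfig() turns each eos_token_id entry of generation_config.json into a single-token stop sequence that Submit() attaches to every request via setStopWords(). A sketch with fabricated ids follows; the int32_t alias is an assumption standing in for tle::TokenIdType.

// Sketch only: map an eos_token_id array onto the list-of-single-token stop
// sequences GetStopWordsFromConfig() builds; the ids below are fabricated.
#include <cstdint>
#include <cstdio>
#include <list>
#include <vector>
#include <nlohmann/json.hpp>

int main() {
    using json = nlohmann::json;
    using TokenId = int32_t;  // assumption: stands in for tle::TokenIdType

    const auto generationConfig = json::parse(R"({ "eos_token_id": [128000, 128008, 128009] })");

    std::list<std::vector<TokenId>> stopWords;
    if (const auto eosTokenIds = generationConfig["/eos_token_id"_json_pointer]; eosTokenIds.is_array())
        for (const auto &tokenId : eosTokenIds)
            stopWords.push_back({tokenId.get<TokenId>()});  // one stop sequence per EOS id

    std::printf("stop word sequences: %zu\n", stopWords.size());
    return 0;
}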