First version loading engines and making it ready for inference

Morgan Funtowicz 2024-07-03 21:12:24 +00:00
parent f8a1463915
commit f57f2a4521
8 changed files with 114 additions and 63 deletions

View File: CMakeLists.txt

@@ -7,11 +7,27 @@ include(FetchContent)
include(ExternalProject)
option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
-set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "native" CACHE STRING "List of CUDA architectures to support")
+set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support")
set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE PATH "Path where TensorRT libraries and headers are located")
set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE PATH "Path where TensorRT headers are located")
set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE PATH "Path where TensorRT libraries are located")
+#### External dependencies ####
+include(cmake/json.cmake)
+include(cmake/spdlog.cmake)
+include(cmake/trtllm.cmake)
+# TGI TRTLLM Backend definition
+add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
+target_include_directories(tgi_trtllm_backend_impl PRIVATE
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+        $<INSTALL_INTERFACE:include>
+)
+include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
+target_link_libraries(tgi_trtllm_backend_impl PUBLIC tensorrt_llm nvinfer_plugin_tensorrt_llm)
+target_link_libraries(tgi_trtllm_backend_impl PRIVATE nlohmann_json::nlohmann_json spdlog::spdlog)
#### Unit Tests ####
if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
    message(STATUS "Building tests")
@@ -23,22 +39,10 @@ if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
    FetchContent_MakeAvailable(Catch2)
    add_executable(tgi_trtllm_backend_tests tests/infer_test.cpp)
-    target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain)
+    target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog)
    list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
    include(CTest)
    include(Catch)
    catch_discover_tests(tgi_trtllm_backend_tests)
endif ()
-#### External dependencies ####
-include(cmake/spdlog.cmake)
-include(cmake/trtllm.cmake)
-# TGI TRTLLM Backend definition
-add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
-target_include_directories(tgi_trtllm_backend_impl PRIVATE
-        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-        $<INSTALL_INTERFACE:include>
-)
-target_link_libraries(tgi_trtllm_backend_impl PUBLIC spdlog::spdlog tensorrt_llm nvinfer_plugin_tensorrt_llm)
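For reference, the cache options defined above would typically be set at configure time; an illustrative invocation, not part of the commit (build directory and values are arbitrary):

cmake -S . -B build \
    -DCMAKE_BUILD_TYPE=Debug \
    -DTGI_TRTLLM_BACKEND_BUILD_TESTS=ON \
    -DTGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST="89-real" \
    -DTGI_TRTLLM_BACKEND_TRT_ROOT=/usr/local/tensorrt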

View File: cmake/json.cmake

@@ -0,0 +1,5 @@
+fetchcontent_declare(
+        json
+        URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
+)
+fetchcontent_makeavailable(json)

View File: cmake/trtllm.cmake

@@ -9,18 +9,6 @@ set(TRT_INCLUDE_DIR ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
set(TRT_LIB_DIR ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
set(CMAKE_CUDA_ARCHITECTURES ${TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST})
-#if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_ROOT})
-#    message(FATAL_ERROR "TensorRT specified location: ${TGI_TRTLLM_BACKEND_TRT_ROOT} doesn't exist")
-#else ()
-#    if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
-#        message(FATAL_ERROR "TensorRT headers were not found at: ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR}")
-#    endif ()
-#
-#    if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
-#        message(FATAL_ERROR "TensorRT libraries were not found at: ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR}")
-#    endif ()
-#endif ()
message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")

View File: include/backend.h

@@ -8,18 +8,30 @@
#include <filesystem>
#include <span>
+#include <nlohmann/json.hpp>
+#include <fmt/format.h>
#include <tensorrt_llm/runtime/common.h>
#include <tensorrt_llm/executor/executor.h>
+#include <tensorrt_llm/plugins/api/tllmPlugin.h>
+using json = nlohmann::json;
namespace tle = tensorrt_llm::executor;
namespace huggingface::tgi::backends {
+    tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
    class TensorRtLlmBackend {
    private:
+        const json config;
        tle::Executor executor;
    public:
-        explicit TensorRtLlmBackend(const std::filesystem::path &engineFolder);
+        explicit TensorRtLlmBackend(
+                const std::filesystem::path &engineFolder,
+                const std::filesystem::path &executorWorker
+        );
        /***
         * Indicate if the backend is ready to accept incoming request
@@ -58,4 +70,5 @@ namespace huggingface::tgi::backends {
    };
}
#endif //TGI_TRTLLM_BACKEND_H
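For context on the new constructor contract, a minimal usage sketch against this header; the paths are placeholders and the snippet is illustrative, not part of the commit:

#include <filesystem>
#include "backend.h"

int main() {
    // Folder containing config.json plus the serialized TRTLLM engines.
    const std::filesystem::path engines{"/path/to/engines"};
    // TensorRT-LLM executor worker binary, spawned by the executor in orchestrator mode.
    const std::filesystem::path worker{"/path/to/executorWorker"};

    huggingface::tgi::backends::TensorRtLlmBackend backend(engines, worker);
    return backend.IsReady() ? 0 : 1;
}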

View File: lib/backend.cpp

@@ -3,9 +3,31 @@
#include "backend.h"
-huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(const std::filesystem::path &engineFolder)
-        : executor(engineFolder, tle::ModelType::kDECODER_ONLY, tle::ExecutorConfig{}) {
-    SPDLOG_INFO(FMT_STRING("Loading engines from {}"), engineFolder);
+tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
+    tle::ExecutorConfig execConfig(
+            config["/build_config/max_beam_width"_json_pointer].get<int32_t>()
+    );
+    execConfig.setParallelConfig(tle::ParallelConfig(
+            tle::CommunicationType::kMPI,
+            tle::CommunicationMode::kORCHESTRATOR,
+            std::nullopt,
+            std::nullopt,
+            tle::OrchestratorConfig(true, workerPath)
+    ));
+    return execConfig;
+}
+huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
+        const std::filesystem::path &engineFolder,
+        const std::filesystem::path &executorWorker
+):
+        config(json::parse(std::ifstream(engineFolder / "config.json"))),
+        executor(engineFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY, GetExecutorConfig(config, executorWorker.string()))
+{
+    initTrtLlmPlugins();
+    SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["version"].get<std::string>());
}
tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
@@ -20,32 +42,32 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
        const std::optional<uint32_t> seed,
        const std::optional<uint32_t> nTopTokens
) {
-    if (IsReady()) {
-        spdlog::debug(
-                "Submitting inference over {:d} tokens to the executor {:d}",
-                tokens.size(),
-                executor.getLatestIterationStats().back().numActiveRequests
-        );
-
-        const auto sampling = tle::SamplingConfig{
-                1,
-                topK,
-                topP,
-                std::nullopt,
-                std::nullopt,
-                std::nullopt,
-                seed,
-                temperature,
-                minLength,
-                std::nullopt,
-                repetitionPenalty.value_or(0.0),
-                std::nullopt,
-                frequencePenalty.value_or(1.0),
-        };
-        const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};
-        const auto request = tle::Request{std::move(tokens), maxNewTokens, true, sampling, output};
-
-        return executor.enqueueRequest(request);
-    }
+//    if (IsReady()) {
+//        spdlog::debug(
+//                "Submitting inference over {:d} tokens to the executor {:d}",
+//                tokens.size(),
+//                executor.getLatestIterationStats().back().numActiveRequests
+//        );
+//
+//        const auto sampling = tle::SamplingConfig{
+//                1,
+//                topK,
+//                topP,
+//                std::nullopt,
+//                std::nullopt,
+//                std::nullopt,
+//                seed,
+//                temperature,
+//                minLength,
+//                std::nullopt,
+//                repetitionPenalty.value_or(0.0),
+//                std::nullopt,
+//                frequencePenalty.value_or(1.0),
+//        };
+//        const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};
+//        const auto request = tle::Request{std::move(tokens), maxNewTokens, true, sampling, output};
+//
+//        return executor.enqueueRequest(request);
+//    }
    return 0;
}
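The only non-obvious call in GetExecutorConfig is the "_json_pointer" lookup; the sketch below exercises the same lookup against a hypothetical, trimmed-down config.json (values invented for illustration, keeping only fields the backend reads):

#include <cstdint>
#include <cstdio>
#include <nlohmann/json.hpp>

int main() {
    using json = nlohmann::json;
    // Hypothetical excerpt of an engine's config.json.
    const json config = json::parse(R"({
        "version": "0.11.0.dev2024062500",
        "build_config": { "max_beam_width": 1 }
    })");

    // A JSON pointer addresses nested members by path, so this is equivalent to
    // config["build_config"]["max_beam_width"].
    const auto beamWidth = config["/build_config/max_beam_width"_json_pointer].get<int32_t>();
    std::printf("max_beam_width = %d\n", static_cast<int>(beamWidth));
    return 0;
}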

View File

@@ -12,9 +12,9 @@ namespace huggingface::tgi::backends {
     * @param engineFolder
     * @return
     */
-    std::unique_ptr<TensorRtLlmBackend> create_trtllm_backend(rust::Str engineFolder) {
+    std::unique_ptr<TensorRtLlmBackend> create_trtllm_backend(rust::Str engineFolder, rust::Str executorWorker) {
        const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
-        return std::make_unique<TensorRtLlmBackend>(std::move(enginePath));
+        const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
+        return std::make_unique<TensorRtLlmBackend>(std::move(enginePath), std::move(executorPath));
    }
}

View File

@@ -10,7 +10,21 @@ mod ffi {
        type TensorRtLlmBackend;
-        fn create_trtllm_backend(engine_folder: &str) -> UniquePtr<TensorRtLlmBackend>;
+        /// Create an instance backed by an std::unique_ptr to manage the lifespan of the backend
+        ///
+        /// # Arguments
+        ///
+        /// * `engine_folder`: Path to the folder containing all the TRTLLM engines
+        /// * `executor_worker`: Path to the TRTLLM executor worker
+        ///
+        /// returns: `UniquePtr<TensorRtLlmBackend>`
+        ///
+        /// # Examples
+        ///
+        /// ```
+        ///
+        /// ```
+        fn create_trtllm_backend(engine_folder: &str, executor_worker: &str) -> UniquePtr<TensorRtLlmBackend>;
        #[rust_name = "is_ready"]
        fn IsReady(&self) -> bool;
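The # Examples section above is left empty by the commit; a hypothetical call site for the new binding could look like the following Rust sketch (paths are placeholders, and the crate plumbing generated by cxx around `mod ffi` is assumed):

fn load_backend() -> cxx::UniquePtr<ffi::TensorRtLlmBackend> {
    let backend = ffi::create_trtllm_backend(
        "/path/to/engines",        // folder holding config.json and the compiled engines
        "/path/to/executorWorker", // TensorRT-LLM executor worker binary
    );
    assert!(backend.is_ready());
    backend
}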

View File: tests/infer_test.cpp

@@ -2,8 +2,13 @@
// Created by mfuntowicz on 7/2/24.
//
#include <catch2/catch_all.hpp>
+#include <spdlog/spdlog.h>
#include "../include/backend.h"
TEST_CASE("Load TRTLLM Engine on the TGI Backend", "[trtllm][engine][load]") {
-    huggingface::tgi::backends::TensorRtLlmBackend backend("fixtures/engines/llama3-8b-instruct.engine");
+    const auto engines = std::filesystem::path("/home/mfuntowicz/.cache/huggingface/assets/trtllm/0.11.0.dev2024062500/meta-llama--Meta-Llama-3-8B-Instruct/4090/engines/");
+    const auto executor = std::filesystem::path("/home/mfuntowicz/Workspace/text-generation-inference/backends/trtllm/cmake-build-debug/cmake-build-debug/_deps/trtllm-src/cpp/tensorrt_llm/executor_worker/executorWorker");
+    spdlog::info("Loading config from: {}", absolute(engines).string());
+    huggingface::tgi::backends::TensorRtLlmBackend backend(engines, executor);
}
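The engine and executor paths above are hard-coded to a developer machine; if the fixture locations ever need to vary, a small helper reading them from the environment would keep the test body unchanged. A sketch only, with invented variable names (TGI_TEST_ENGINES, TGI_TEST_EXECUTOR_WORKER) that are not part of the commit:

#include <cstdlib>
#include <filesystem>

// Resolve a fixture path from an environment variable, falling back to a default location.
static std::filesystem::path PathFromEnv(const char *name, const char *fallback) {
    const char *value = std::getenv(name);
    return std::filesystem::path(value != nullptr ? value : fallback);
}

// e.g. const auto engines  = PathFromEnv("TGI_TEST_ENGINES", "/home/mfuntowicz/.cache/huggingface/assets/trtllm/...");
//      const auto executor = PathFromEnv("TGI_TEST_EXECUTOR_WORKER", "/home/mfuntowicz/Workspace/...");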