From 52d57dca798f7eb0ba92b91733e33579921fa03a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 4 Oct 2024 10:42:31 +0200 Subject: [PATCH] feat(llamacpp): initial end2end build --- backends/llamacpp/CMakeLists.txt | 18 ++- backends/llamacpp/Cargo.toml | 17 +++ backends/llamacpp/build.rs | 94 +++++++++++++ backends/llamacpp/cmake/spdlog.cmake | 7 +- backends/llamacpp/csrc/backend.cpp | 11 +- backends/llamacpp/csrc/backend.hpp | 7 +- backends/llamacpp/offline/main.cpp | 22 +++ backends/llamacpp/src/backend.rs | 18 +++ backends/llamacpp/src/lib.rs | 11 ++ backends/llamacpp/src/main.rs | 203 ++++++++++++++++++++++++++- backends/trtllm/CMakeLists.txt | 2 + 11 files changed, 398 insertions(+), 12 deletions(-) create mode 100644 backends/llamacpp/build.rs create mode 100644 backends/llamacpp/offline/main.cpp create mode 100644 backends/llamacpp/src/backend.rs create mode 100644 backends/llamacpp/src/lib.rs diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index 2f9026f1..4671314f 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -6,12 +6,18 @@ set(CMAKE_CXX_STANDARD 20) include(FetchContent) set(LLAMA_CPP_TARGET_VERSION "b3837" STRING "Version of llama.cpp to build against") - +option(LLAMA_CPP_BUILD_OFFLINE_RUNNER "Flag to build the standalone c++ backend runner") +option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama.cpp") # Add dependencies include(cmake/fmt.cmake) include(cmake/spdlog.cmake) +if(${LLAMA_CPP_BUILD_CUDA}) + message(STATUS "Enabling llama.cpp CUDA support") + set(GGML_CUDA ON) +endif() + # Download llama.cpp repo at the specific version fetchcontent_declare( llama @@ -25,4 +31,12 @@ fetchcontent_makeavailable(llama) add_library(tgi_llama_cpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp) target_compile_features(tgi_llama_cpp_backend_impl PRIVATE cxx_std_11) -target_link_libraries(tgi_llama_cpp_backend_impl fmt::fmt spdlog::spdlog llama common) +target_link_libraries(tgi_llama_cpp_backend_impl PUBLIC fmt::fmt spdlog::spdlog llama common) + +if(${LLAMA_CPP_BUILD_OFFLINE_RUNNER}) + message(STATUS "Building llama.cpp offline runner") + add_executable(tgi_llama_cpp_offline_runner offline/main.cpp) + target_link_libraries(tgi_llama_cpp_offline_runner tgi_llama_cpp_backend_impl) +endif() + + diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml index 2e8ed7dd..fdd980c3 100644 --- a/backends/llamacpp/Cargo.toml +++ b/backends/llamacpp/Cargo.toml @@ -6,3 +6,20 @@ authors.workspace = true homepage.workspace = true [dependencies] +clap = { version = "4.5.19", features = ["derive"] } +cxx = "1.0" +hf-hub = { workspace = true } +image = { version = "0.25.1", features = ["default-formats"] } +metrics = { workspace = true } +metrics-exporter-prometheus = { workspace = true } +serde_json = "1.0.128" +text-generation-router = { path = "../../router" } +thiserror = "1.0.64" +tokio = "1.40.0" +tokio-stream = "0.1.16" +tokenizers = { workspace = true } + +[build-dependencies] +cmake = "0.1" +cxx-build = { version = "1.0", features = ["parallel"] } +pkg-config = "0.3" \ No newline at end of file diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs new file mode 100644 index 00000000..4e8859ab --- /dev/null +++ b/backends/llamacpp/build.rs @@ -0,0 +1,94 @@ +use cxx_build::CFG; +use std::env; +use std::path::PathBuf; + +const CMAKE_LLAMA_CPP_TARGET: &str = "tgi_llama_cpp_backend_impl"; +const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"]; 
+const MPI_REQUIRED_VERSION: &str = "4.1"; + +macro_rules! probe { + ($name: expr, $version: expr) => { + if let Err(_) = pkg_config::probe_library($name) { + pkg_config::probe_library(&format!("{}-{}", $name, $version)) + .expect(&format!("Failed to locate {}", $name)); + } + }; +} + +fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> PathBuf { + let install_path = env::var("CMAKE_INSTALL_PREFIX") + .map(|val| PathBuf::from(val)) + .unwrap_or(out_dir.join("dist")); + + let _ = cmake::Config::new(".") + .uses_cxx11() + .generator("Ninja") + .profile(match is_debug { + true => "Debug", + false => "Release", + }) + .env("OPT_LEVEL", opt_level) + .define("CMAKE_INSTALL_PREFIX", &install_path) + // .define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc") + // .define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list) + // .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path) + .build(); + + // Additional transitive CMake dependencies + let deps_folder = out_dir.join("build").join("_deps"); + for dependency in ADDITIONAL_BACKEND_LINK_LIBRARIES { + let dep_name = match is_debug { + true => format!("{}d", dependency), + false => String::from(dependency), + }; + let dep_path = deps_folder.join(format!("{}-build", dependency)); + println!("cargo:rustc-link-search={}", dep_path.display()); + println!("cargo:rustc-link-lib=static={}", dep_name); + } + + let deps_folder = out_dir.join("build").join("_deps"); + deps_folder +} + +fn build_ffi_layer(deps_folder: &PathBuf) { + println!("cargo:warning={}", &deps_folder.display()); + CFG.include_prefix = "backends/llamacpp"; + cxx_build::bridge("src/lib.rs") + .static_flag(true) + .include(deps_folder.join("fmt-src").join("include")) + .include(deps_folder.join("spdlog-src").join("include")) + .include(deps_folder.join("llama-src").join("common")) + .include(deps_folder.join("llama-src").join("ggml").join("include")) + .include(deps_folder.join("llama-src").join("include")) + .file("csrc/backend.cpp") + .std("c++20") + .compile(CMAKE_LLAMA_CPP_TARGET); + + println!("cargo:rerun-if-changed=CMakeLists.txt"); + println!("cargo:rerun-if-changed=csrc/backend.hpp"); + println!("cargo:rerun-if-changed=csrc/backend.cpp"); +} + +fn main() { + // Misc variables + let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); + let build_profile = env::var("PROFILE").unwrap(); + let (is_debug, opt_level) = match build_profile.as_ref() { + "debug" => (true, "0"), + _ => (false, "3"), + }; + + // Build the backend + let deps_folder = build_backend(is_debug, opt_level, &out_dir); + + // Build the FFI layer calling the backend above + build_ffi_layer(&deps_folder); + + // Emit linkage search path + probe!("ompi", MPI_REQUIRED_VERSION); + + // Backend + // BACKEND_DEPS.iter().for_each(|name| { + // println!("cargo:rustc-link-lib=static={}", name); + // }); +} diff --git a/backends/llamacpp/cmake/spdlog.cmake b/backends/llamacpp/cmake/spdlog.cmake index 9cd210dd..68658ba5 100644 --- a/backends/llamacpp/cmake/spdlog.cmake +++ b/backends/llamacpp/cmake/spdlog.cmake @@ -4,9 +4,10 @@ set(SPDLOG_FMT_EXTERNAL ON) # Define the level at which SPDLOG_ compilation level is defined if (CMAKE_BUILD_TYPE STREQUAL "Debug") - add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG) -else () - add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO) + message(STATUS "Verbose logging is enabled in debug build") + add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_DEBUG) +else() + add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_INFO) 
 endif ()

 fetchcontent_declare(
diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp
index 9ce1dbc9..875fdb68 100644
--- a/backends/llamacpp/csrc/backend.cpp
+++ b/backends/llamacpp/csrc/backend.cpp
@@ -46,8 +46,11 @@ namespace huggingface::tgi::backends::llama {
     }

     TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx)
-    : model(model), ctx(ctx), batch() {
-
+    : model(model), ctx(ctx), batch()
+    {
+        char modelName[128];
+        llama_model_meta_val_str(model, "general.name", modelName, sizeof(modelName));
+        SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName));
     }

     TgiLlamaCppBackend::~TgiLlamaCppBackend() {
@@ -63,4 +66,8 @@ namespace huggingface::tgi::backends::llama {
             llama_free(ctx);
         }
     }
+
+    void TgiLlamaCppBackend::schedule() {
+        std::vector<llama_token> tokens;
+    }
 }
\ No newline at end of file
diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp
index a643454e..7e3c9020 100644
--- a/backends/llamacpp/csrc/backend.hpp
+++ b/backends/llamacpp/csrc/backend.hpp
@@ -1,7 +1,6 @@
 //
 // Created by Morgan Funtowicz on 9/28/2024.
 //
-
 #ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
 #define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP

@@ -9,7 +8,7 @@
 #include <llama.h>

 namespace huggingface::tgi::backends::llama {
-    const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp";
+//    const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp";


     class TgiLlamaCppBackend {
@@ -18,8 +17,10 @@ namespace huggingface::tgi::backends::llama {
         llama_context* ctx;
         llama_batch batch;
     public:
-        TgiLlamaCppBackend(llama_model* const model, llama_context* const);
+        TgiLlamaCppBackend(llama_model *model, llama_context *ctx);
         ~TgiLlamaCppBackend();
+
+        void schedule();
     };

     std::unique_ptr<TgiLlamaCppBackend> CreateLlamaCppBackend(std::string_view root);
diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp
new file mode 100644
index 00000000..4009588d
--- /dev/null
+++ b/backends/llamacpp/offline/main.cpp
@@ -0,0 +1,22 @@
+//
+// Created by mfuntowicz on 10/3/24.
+//
+
+#include <string_view>
+#include <fmt/format.h>
+#include <fmt/color.h>
+#include <spdlog/spdlog.h>
+#include "../csrc/backend.hpp"
+
+int main(int argc, char** argv) {
+    if(argc < 2) {
+        fmt::print("No model folder provided");
+        return 1;
+    }
+
+    spdlog::set_level(spdlog::level::debug);
+
+    const std::string_view model_root = argv[1];
+    auto backend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(model_root);
+    fmt::print(fmt::emphasis::bold | fg(fmt::color::yellow), "Successfully initialized llama.cpp model from {}\n", model_root);
+}
\ No newline at end of file
diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs
new file mode 100644
index 00000000..8af1067b
--- /dev/null
+++ b/backends/llamacpp/src/backend.rs
@@ -0,0 +1,18 @@
+use text_generation_router::infer::{Backend, InferError, InferStreamResponse};
+use text_generation_router::validation::ValidGenerateRequest;
+use tokio_stream::wrappers::UnboundedReceiverStream;
+
+pub struct TgiLlamaCppBackend {}
+
+impl Backend for TgiLlamaCppBackend {
+    fn schedule(
+        &self,
+        request: ValidGenerateRequest,
+    ) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
+        Err(InferError::GenerationError("Not implemented yet".into()))
+    }
+
+    async fn health(&self, current_health: bool) -> bool {
+        todo!()
+    }
+}
diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs
new file mode 100644
index 00000000..d4c3caf9
--- /dev/null
+++ b/backends/llamacpp/src/lib.rs
@@ -0,0 +1,11 @@
+pub mod backend;
+
+#[cxx::bridge(namespace = "huggingface::tgi::backends::llama")]
+mod ffi {
+    unsafe extern "C++" {
+        include!("backends/llamacpp/csrc/backend.cpp");
+
+        /// Represent an instance of the llama.cpp backend instance on C++ side
+        type LlamaCppBackendImpl;
+    }
+}
diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs
index e7a11a96..7226473c 100644
--- a/backends/llamacpp/src/main.rs
+++ b/backends/llamacpp/src/main.rs
@@ -1,3 +1,202 @@
-fn main() {
-    println!("Hello, world!");
+use clap::{Parser, Subcommand};
+use text_generation_router::{server, usage_stats};
+use thiserror::Error;
+use text_generation_router::server::ApiDoc;
+
+/// App Configuration
+#[derive(Parser, Debug)]
+#[clap(author, version, about, long_about = None)]
+struct Args {
+    #[command(subcommand)]
+    command: Option<Commands>,
+
+    #[clap(default_value = "128", long, env)]
+    max_concurrent_requests: usize,
+    #[clap(default_value = "2", long, env)]
+    max_best_of: usize,
+    #[clap(default_value = "4", long, env)]
+    max_stop_sequences: usize,
+    #[clap(default_value = "5", long, env)]
+    max_top_n_tokens: u32,
+    #[clap(default_value = "1024", long, env)]
+    max_input_tokens: usize,
+    #[clap(default_value = "2048", long, env)]
+    max_total_tokens: usize,
+    #[clap(default_value = "1.2", long, env)]
+    waiting_served_ratio: f32,
+    #[clap(default_value = "4096", long, env)]
+    max_batch_prefill_tokens: u32,
+    #[clap(long, env)]
+    max_batch_total_tokens: Option<u32>,
+    #[clap(default_value = "20", long, env)]
+    max_waiting_tokens: usize,
+    #[clap(long, env)]
+    max_batch_size: Option<usize>,
+    #[clap(default_value = "0.0.0.0", long, env)]
+    hostname: String,
+    #[clap(default_value = "3000", long, short, env)]
+    port: u16,
+    #[clap(default_value = "/tmp/text-generation-server-0", long, env)]
+    master_shard_uds_path: String,
+    #[clap(default_value = "bigscience/bloom", long, env)]
+    tokenizer_name: String,
+    #[clap(long, env)]
+    tokenizer_config_path: Option<String>,
+    #[clap(long, env)]
+    revision: Option<String>,
+    #[clap(default_value = "2", long, env)]
+    validation_workers: usize,
+    #[clap(long, env)]
+    api_key: Option<String>,
+    #[clap(long, env)]
+    json_output: bool,
+    #[clap(long, env)]
+    otlp_endpoint: Option<String>,
+    #[clap(default_value = "text-generation-inference.router", long, env)]
+    otlp_service_name: String,
+    #[clap(long, env)]
+    cors_allow_origin: Option<Vec<String>>,
+    #[clap(long, env)]
+    ngrok: bool,
+    #[clap(long, env)]
+    ngrok_authtoken: Option<String>,
+    #[clap(long, env)]
+    ngrok_edge: Option<String>,
+    #[clap(long, env, default_value_t = false)]
+    messages_api_enabled: bool,
+    #[clap(long, env, default_value_t = false)]
+    disable_grammar_support: bool,
+    #[clap(default_value = "4", long, env)]
+    max_client_batch_size: usize,
+    #[clap(default_value = "on", long, env)]
+    usage_stats: usage_stats::UsageStatsLevel,
 }
+
+#[derive(Debug, Subcommand)]
+enum Commands {
+    PrintSchema,
+}
+
+#[tokio::main]
+async fn main() -> Result<(), RouterError> {
+    // Get args
+    let args = Args::parse();
+    // Pattern match configuration
+    let Args {
+        command,
+        max_concurrent_requests,
+        max_best_of,
+        max_stop_sequences,
+        max_top_n_tokens,
+        max_input_tokens,
+        max_total_tokens,
+        waiting_served_ratio,
+        max_batch_prefill_tokens,
+        max_batch_total_tokens,
+        max_waiting_tokens,
+        max_batch_size,
+        hostname,
+        port,
+        master_shard_uds_path,
+        tokenizer_name,
+        tokenizer_config_path,
+        revision,
+        validation_workers,
+        api_key,
+        json_output,
+        otlp_endpoint,
+        otlp_service_name,
+        cors_allow_origin,
+        ngrok,
+        ngrok_authtoken,
+        ngrok_edge,
+        messages_api_enabled,
+        disable_grammar_support,
+        max_client_batch_size,
+        usage_stats,
+    } = args;
+
+    if let Some(Commands::PrintSchema) = command {
+        use utoipa::OpenApi;
+        let api_doc = ApiDoc::openapi();
+        let api_doc = serde_json::to_string_pretty(&api_doc).unwrap();
+        println!("{}", api_doc);
+        std::process::exit(0);
+    };
+    text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output);
+
+    // Validate args
+    if max_input_tokens >= max_total_tokens {
+        return Err(RouterError::ArgumentValidation(
+            "`max_input_tokens` must be < `max_total_tokens`".to_string(),
+        ));
+    }
+    if max_input_tokens as u32 > max_batch_prefill_tokens {
+        return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}")));
+    }
+
+    if validation_workers == 0 {
+        return Err(RouterError::ArgumentValidation(
+            "`validation_workers` must be > 0".to_string(),
+        ));
+    }
+
+    if let Some(ref max_batch_total_tokens) = max_batch_total_tokens {
+        if max_batch_prefill_tokens > *max_batch_total_tokens {
+            return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}")));
+        }
+        if max_total_tokens as u32 > *max_batch_total_tokens {
+            return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`.
Given: {max_total_tokens} and {max_batch_total_tokens}"))); + } + } + + if let Some(max_batch_size) = max_batch_size { + if max_batch_size == 0 { + return Err(RouterError::ArgumentValidation( + "`max_batch_size` must be > 0".to_string(), + )); + } + } + + let backend = LlamaCppBackend::new(); + + // Run server + server::run( + backend, + max_concurrent_requests, + max_best_of, + max_stop_sequences, + max_top_n_tokens, + max_input_tokens, + max_total_tokens, + validation_workers, + api_key, + tokenizer_name, + tokenizer_config_path, + revision, + hostname, + port, + cors_allow_origin, + ngrok, + ngrok_authtoken, + ngrok_edge, + messages_api_enabled, + disable_grammar_support, + max_client_batch_size, + usage_stats, + ) + .await?; + Ok(()) +} + +#[derive(Debug, Error)] +enum RouterError { + #[error("Argument validation error: {0}")] + ArgumentValidation(String), + #[error("Backend failed: {0}")] + Backend(#[from] V3Error), + #[error("WebServer error: {0}")] + WebServer(#[from] server::WebServerError), + #[error("Tokio runtime failed to start: {0}")] + Tokio(#[from] std::io::Error), +} \ No newline at end of file diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt index 831372cd..80b2b430 100644 --- a/backends/trtllm/CMakeLists.txt +++ b/backends/trtllm/CMakeLists.txt @@ -18,6 +18,8 @@ set(CMAKE_CXX_STANDARD 20) include(FetchContent) include(ExternalProject) +set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--allow-unsupported-compiler -ccbin=gcc") + option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF) option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF) set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support")
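
Note: src/main.rs above constructs the backend with LlamaCppBackend::new(), which this patch never defines; the only backend type added is the stub in src/backend.rs. Below is a minimal sketch of the missing piece, reusing only the trait surface already visible in src/backend.rs (schedule / health). The LlamaCppBackend name and its new() signature are assumptions taken from the call site, and the health/schedule bodies are placeholders, not part of the commit.

    // Hypothetical sketch: gives src/main.rs something to construct while the
    // real llama.cpp scheduling loop is still unimplemented.
    use text_generation_router::infer::{Backend, InferError, InferStreamResponse};
    use text_generation_router::validation::ValidGenerateRequest;
    use tokio_stream::wrappers::UnboundedReceiverStream;

    pub struct LlamaCppBackend {}

    impl LlamaCppBackend {
        // Assumed constructor; referenced by src/main.rs but not defined in this patch.
        pub fn new() -> Self {
            Self {}
        }
    }

    impl Backend for LlamaCppBackend {
        fn schedule(
            &self,
            _request: ValidGenerateRequest,
        ) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
            // Same placeholder behaviour as the stub in src/backend.rs.
            Err(InferError::GenerationError("Not implemented yet".into()))
        }

        async fn health(&self, _current_health: bool) -> bool {
            // Placeholder: report healthy until the C++ backend is actually polled.
            true
        }
    }

With such a type in place, server::run(backend, ...) receives a value implementing Backend; a later change would replace the GenerationError stub by streaming InferStreamResponse values through the UnboundedReceiverStream channel, backed by the C++ LlamaCppBackendImpl exposed through the cxx bridge in src/lib.rs.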