mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-04-22 23:42:06 +00:00

commit 52d57dca79 (parent 7d1f8a2bd6)
feat(llamacpp): initial end2end build
backends/llamacpp/CMakeLists.txt

@@ -6,12 +6,18 @@ set(CMAKE_CXX_STANDARD 20)
 include(FetchContent)
 
 set(LLAMA_CPP_TARGET_VERSION "b3837" STRING "Version of llama.cpp to build against")
+option(LLAMA_CPP_BUILD_OFFLINE_RUNNER "Flag to build the standalone c++ backend runner")
+option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama.cpp")
 
 # Add dependencies
 include(cmake/fmt.cmake)
 include(cmake/spdlog.cmake)
 
+if(${LLAMA_CPP_BUILD_CUDA})
+    message(STATUS "Enabling llama.cpp CUDA support")
+    set(GGML_CUDA ON)
+endif()
 
 # Download llama.cpp repo at the specific version
 fetchcontent_declare(
         llama

@@ -25,4 +31,12 @@ fetchcontent_makeavailable(llama)
 
 add_library(tgi_llama_cpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp)
 target_compile_features(tgi_llama_cpp_backend_impl PRIVATE cxx_std_11)
-target_link_libraries(tgi_llama_cpp_backend_impl fmt::fmt spdlog::spdlog llama common)
+target_link_libraries(tgi_llama_cpp_backend_impl PUBLIC fmt::fmt spdlog::spdlog llama common)
+
+if(${LLAMA_CPP_BUILD_OFFLINE_RUNNER})
+    message(STATUS "Building llama.cpp offline runner")
+    add_executable(tgi_llama_cpp_offline_runner offline/main.cpp)
+    target_link_libraries(tgi_llama_cpp_offline_runner tgi_llama_cpp_backend_impl)
+endif()
backends/llamacpp/Cargo.toml

@@ -6,3 +6,20 @@ authors.workspace = true
 homepage.workspace = true
 
 [dependencies]
+clap = { version = "4.5.19", features = ["derive"] }
+cxx = "1.0"
+hf-hub = { workspace = true }
+image = { version = "0.25.1", features = ["default-formats"] }
+metrics = { workspace = true }
+metrics-exporter-prometheus = { workspace = true }
+serde_json = "1.0.128"
+text-generation-router = { path = "../../router" }
+thiserror = "1.0.64"
+tokio = "1.40.0"
+tokio-stream = "0.1.16"
+tokenizers = { workspace = true }
+
+[build-dependencies]
+cmake = "0.1"
+cxx-build = { version = "1.0", features = ["parallel"] }
+pkg-config = "0.3"
backends/llamacpp/build.rs (new file, 94 lines)

use cxx_build::CFG;
use std::env;
use std::path::PathBuf;

const CMAKE_LLAMA_CPP_TARGET: &str = "tgi_llama_cpp_backend_impl";
const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
const MPI_REQUIRED_VERSION: &str = "4.1";

macro_rules! probe {
    ($name: expr, $version: expr) => {
        if let Err(_) = pkg_config::probe_library($name) {
            pkg_config::probe_library(&format!("{}-{}", $name, $version))
                .expect(&format!("Failed to locate {}", $name));
        }
    };
}

fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> PathBuf {
    let install_path = env::var("CMAKE_INSTALL_PREFIX")
        .map(|val| PathBuf::from(val))
        .unwrap_or(out_dir.join("dist"));

    let _ = cmake::Config::new(".")
        .uses_cxx11()
        .generator("Ninja")
        .profile(match is_debug {
            true => "Debug",
            false => "Release",
        })
        .env("OPT_LEVEL", opt_level)
        .define("CMAKE_INSTALL_PREFIX", &install_path)
        // .define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc")
        // .define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list)
        // .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path)
        .build();

    // Additional transitive CMake dependencies
    let deps_folder = out_dir.join("build").join("_deps");
    for dependency in ADDITIONAL_BACKEND_LINK_LIBRARIES {
        let dep_name = match is_debug {
            true => format!("{}d", dependency),
            false => String::from(dependency),
        };
        let dep_path = deps_folder.join(format!("{}-build", dependency));
        println!("cargo:rustc-link-search={}", dep_path.display());
        println!("cargo:rustc-link-lib=static={}", dep_name);
    }

    let deps_folder = out_dir.join("build").join("_deps");
    deps_folder
}

fn build_ffi_layer(deps_folder: &PathBuf) {
    println!("cargo:warning={}", &deps_folder.display());
    CFG.include_prefix = "backends/llamacpp";
    cxx_build::bridge("src/lib.rs")
        .static_flag(true)
        .include(deps_folder.join("fmt-src").join("include"))
        .include(deps_folder.join("spdlog-src").join("include"))
        .include(deps_folder.join("llama-src").join("common"))
        .include(deps_folder.join("llama-src").join("ggml").join("include"))
        .include(deps_folder.join("llama-src").join("include"))
        .file("csrc/backend.cpp")
        .std("c++20")
        .compile(CMAKE_LLAMA_CPP_TARGET);

    println!("cargo:rerun-if-changed=CMakeLists.txt");
    println!("cargo:rerun-if-changed=csrc/backend.hpp");
    println!("cargo:rerun-if-changed=csrc/backend.cpp");
}

fn main() {
    // Misc variables
    let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
    let build_profile = env::var("PROFILE").unwrap();
    let (is_debug, opt_level) = match build_profile.as_ref() {
        "debug" => (true, "0"),
        _ => (false, "3"),
    };

    // Build the backend
    let deps_folder = build_backend(is_debug, opt_level, &out_dir);

    // Build the FFI layer calling the backend above
    build_ffi_layer(&deps_folder);

    // Emit linkage search path
    probe!("ompi", MPI_REQUIRED_VERSION);

    // Backend
    // BACKEND_DEPS.iter().for_each(|name| {
    //     println!("cargo:rustc-link-lib=static={}", name);
    // });
}
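Illustrative note (not part of the commit): the probe! macro above is equivalent to the plain function below, which asks pkg-config for the unversioned library name first and only falls back to the "<name>-<version>" form when that lookup fails.

// Sketch only: the same lookup logic as probe!, written as a function for clarity.
fn probe(name: &str, version: &str) {
    if pkg_config::probe_library(name).is_err() {
        pkg_config::probe_library(&format!("{}-{}", name, version))
            .unwrap_or_else(|_| panic!("Failed to locate {}", name));
    }
}

// Usage mirrors the call in main(): probe("ompi", MPI_REQUIRED_VERSION);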
cmake/spdlog.cmake

@@ -4,9 +4,10 @@ set(SPDLOG_FMT_EXTERNAL ON)
 
 # Define the level at which SPDLOG_ compilation level is defined
 if (CMAKE_BUILD_TYPE STREQUAL "Debug")
-    add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG)
+    message(STATUS "Verbose logging is enabled in debug build")
+    add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_DEBUG)
 else()
-    add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO)
+    add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_INFO)
 endif ()
 
 fetchcontent_declare(
backends/llamacpp/csrc/backend.cpp

@@ -46,8 +46,11 @@ namespace huggingface::tgi::backends::llama {
     }
 
     TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx)
-    : model(model), ctx(ctx), batch() {
+    : model(model), ctx(ctx), batch()
+    {
+        char modelName[128];
+        llama_model_meta_val_str(model, "general.name", modelName, sizeof(modelName));
+        SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName));
     }
 
     TgiLlamaCppBackend::~TgiLlamaCppBackend() {

@@ -63,4 +66,8 @@ namespace huggingface::tgi::backends::llama {
             llama_free(ctx);
         }
     }
+
+    void TgiLlamaCppBackend::schedule() {
+        std::vector<llama_token> tokens;
+    }
 }
backends/llamacpp/csrc/backend.hpp

@@ -1,7 +1,6 @@
 //
 // Created by Morgan Funtowicz on 9/28/2024.
 //
 
 #ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
 #define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP

@@ -9,7 +8,7 @@
 #include <llama.h>
 
 namespace huggingface::tgi::backends::llama {
-    const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp";
+    // const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp";
 
 
     class TgiLlamaCppBackend {

@@ -18,8 +17,10 @@ namespace huggingface::tgi::backends::llama {
         llama_context* ctx;
         llama_batch batch;
     public:
-        TgiLlamaCppBackend(llama_model* const model, llama_context* const);
+        TgiLlamaCppBackend(llama_model *model, llama_context *ctx);
         ~TgiLlamaCppBackend();
+
+        void schedule();
     };
 
     std::unique_ptr<TgiLlamaCppBackend> CreateLlamaCppBackend(std::string_view root);
backends/llamacpp/offline/main.cpp (new file, 22 lines)

//
// Created by mfuntowicz on 10/3/24.
//

#include <string_view>
#include <fmt/format.h>
#include <fmt/color.h>
#include <spdlog/spdlog.h>
#include "../csrc/backend.hpp"

int main(int argc, char** argv) {
    if(argc < 2) {
        fmt::print("No model folder provider");
        return 1;
    }

    spdlog::set_level(spdlog::level::debug);

    const std::string_view model_root = argv[1];
    auto backend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(model_root);
    fmt::print(fmt::emphasis::bold | fg(fmt::color::yellow), "Successfully initialized llama.cpp model from {}\n", model_root);
}
backends/llamacpp/src/backend.rs (new file, 18 lines)

use text_generation_router::infer::{Backend, InferError, InferStreamResponse};
use text_generation_router::validation::ValidGenerateRequest;
use tokio_stream::wrappers::UnboundedReceiverStream;

pub struct TgiLlamaCppBakend {}

impl Backend for TgiLlamaCppBakend {
    fn schedule(
        &self,
        request: ValidGenerateRequest,
    ) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
        Err(InferError::GenerationError("Not implemented yet".into()))
    }

    async fn health(&self, current_health: bool) -> bool {
        todo!()
    }
}
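The schedule signature above hands the router an UnboundedReceiverStream, while the stub simply returns an error for now. As a point of reference, the sketch below shows the channel plumbing that return type implies, using a plain String payload and hypothetical names rather than real InferStreamResponse values (illustrative only, not part of the commit):

use tokio::sync::mpsc::{unbounded_channel, UnboundedSender};
use tokio_stream::wrappers::UnboundedReceiverStream;

// schedule() would return the stream half, while a background task pushes
// generated tokens into the sender half as llama.cpp produces them.
fn make_stream() -> (UnboundedSender<String>, UnboundedReceiverStream<String>) {
    let (sender, receiver) = unbounded_channel();
    (sender, UnboundedReceiverStream::new(receiver))
}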
backends/llamacpp/src/lib.rs (new file, 11 lines)

pub mod backend;

#[cxx::bridge(namespace = "huggingface::tgi::backends::llama")]
mod ffi {
    unsafe extern "C++" {
        include!("backends/llamacpp/csrc/backend.cpp");

        /// Represent an instance of the llama.cpp backend instance on C++ side
        type LlamaCppBackendImpl;
    }
}
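For context, a cxx bridge like this one is usually extended with free functions so the Rust side can construct and drive the opaque C++ type. The sketch below is hypothetical: the factory name, its signature, and the header include are assumptions, not part of this commit.

#[cxx::bridge(namespace = "huggingface::tgi::backends::llama")]
mod ffi_sketch {
    unsafe extern "C++" {
        include!("backends/llamacpp/csrc/backend.hpp");

        type LlamaCppBackendImpl;

        // Hypothetical factory: builds the C++ backend from a model directory
        // and returns ownership to Rust as a UniquePtr.
        fn CreateLlamaCppBackendImpl(model_root: &str) -> UniquePtr<LlamaCppBackendImpl>;
    }
}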
backends/llamacpp/src/main.rs (rewritten from the "Hello, world!" placeholder)

@@ -1,3 +1,202 @@
use clap::{Parser, Subcommand};
use text_generation_router::{server, usage_stats};
use thiserror::Error;
use text_generation_router::server::ApiDoc;

/// App Configuration
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
    #[command(subcommand)]
    command: Option<Commands>,

    #[clap(default_value = "128", long, env)]
    max_concurrent_requests: usize,
    #[clap(default_value = "2", long, env)]
    max_best_of: usize,
    #[clap(default_value = "4", long, env)]
    max_stop_sequences: usize,
    #[clap(default_value = "5", long, env)]
    max_top_n_tokens: u32,
    #[clap(default_value = "1024", long, env)]
    max_input_tokens: usize,
    #[clap(default_value = "2048", long, env)]
    max_total_tokens: usize,
    #[clap(default_value = "1.2", long, env)]
    waiting_served_ratio: f32,
    #[clap(default_value = "4096", long, env)]
    max_batch_prefill_tokens: u32,
    #[clap(long, env)]
    max_batch_total_tokens: Option<u32>,
    #[clap(default_value = "20", long, env)]
    max_waiting_tokens: usize,
    #[clap(long, env)]
    max_batch_size: Option<usize>,
    #[clap(default_value = "0.0.0.0", long, env)]
    hostname: String,
    #[clap(default_value = "3000", long, short, env)]
    port: u16,
    #[clap(default_value = "/tmp/text-generation-server-0", long, env)]
    master_shard_uds_path: String,
    #[clap(default_value = "bigscience/bloom", long, env)]
    tokenizer_name: String,
    #[clap(long, env)]
    tokenizer_config_path: Option<String>,
    #[clap(long, env)]
    revision: Option<String>,
    #[clap(default_value = "2", long, env)]
    validation_workers: usize,
    #[clap(long, env)]
    api_key: Option<String>,
    #[clap(long, env)]
    json_output: bool,
    #[clap(long, env)]
    otlp_endpoint: Option<String>,
    #[clap(default_value = "text-generation-inference.router", long, env)]
    otlp_service_name: String,
    #[clap(long, env)]
    cors_allow_origin: Option<Vec<String>>,
    #[clap(long, env)]
    ngrok: bool,
    #[clap(long, env)]
    ngrok_authtoken: Option<String>,
    #[clap(long, env)]
    ngrok_edge: Option<String>,
    #[clap(long, env, default_value_t = false)]
    messages_api_enabled: bool,
    #[clap(long, env, default_value_t = false)]
    disable_grammar_support: bool,
    #[clap(default_value = "4", long, env)]
    max_client_batch_size: usize,
    #[clap(default_value = "on", long, env)]
    usage_stats: usage_stats::UsageStatsLevel,
}

#[derive(Debug, Subcommand)]
enum Commands {
    PrintSchema,
}

#[tokio::main]
async fn main() -> Result<(), RouterError> {
    // Get args
    let args = Args::parse();
    // Pattern match configuration
    let Args {
        command,
        max_concurrent_requests,
        max_best_of,
        max_stop_sequences,
        max_top_n_tokens,
        max_input_tokens,
        max_total_tokens,
        waiting_served_ratio,
        max_batch_prefill_tokens,
        max_batch_total_tokens,
        max_waiting_tokens,
        max_batch_size,
        hostname,
        port,
        master_shard_uds_path,
        tokenizer_name,
        tokenizer_config_path,
        revision,
        validation_workers,
        api_key,
        json_output,
        otlp_endpoint,
        otlp_service_name,
        cors_allow_origin,
        ngrok,
        ngrok_authtoken,
        ngrok_edge,
        messages_api_enabled,
        disable_grammar_support,
        max_client_batch_size,
        usage_stats,
    } = args;

    if let Some(Commands::PrintSchema) = command {
        use utoipa::OpenApi;
        let api_doc = ApiDoc::openapi();
        let api_doc = serde_json::to_string_pretty(&api_doc).unwrap();
        println!("{}", api_doc);
        std::process::exit(0);
    };
    text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output);

    // Validate args
    if max_input_tokens >= max_total_tokens {
        return Err(RouterError::ArgumentValidation(
            "`max_input_tokens` must be < `max_total_tokens`".to_string(),
        ));
    }
    if max_input_tokens as u32 > max_batch_prefill_tokens {
        return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}")));
    }

    if validation_workers == 0 {
        return Err(RouterError::ArgumentValidation(
            "`validation_workers` must be > 0".to_string(),
        ));
    }

    if let Some(ref max_batch_total_tokens) = max_batch_total_tokens {
        if max_batch_prefill_tokens > *max_batch_total_tokens {
            return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}")));
        }
        if max_total_tokens as u32 > *max_batch_total_tokens {
            return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_batch_total_tokens}")));
        }
    }

    if let Some(max_batch_size) = max_batch_size {
        if max_batch_size == 0 {
            return Err(RouterError::ArgumentValidation(
                "`max_batch_size` must be > 0".to_string(),
            ));
        }
    }

    let backend = LlamaCppBackend::new();

    // Run server
    server::run(
        backend,
        max_concurrent_requests,
        max_best_of,
        max_stop_sequences,
        max_top_n_tokens,
        max_input_tokens,
        max_total_tokens,
        validation_workers,
        api_key,
        tokenizer_name,
        tokenizer_config_path,
        revision,
        hostname,
        port,
        cors_allow_origin,
        ngrok,
        ngrok_authtoken,
        ngrok_edge,
        messages_api_enabled,
        disable_grammar_support,
        max_client_batch_size,
        usage_stats,
    )
    .await?;
    Ok(())
}

#[derive(Debug, Error)]
enum RouterError {
    #[error("Argument validation error: {0}")]
    ArgumentValidation(String),
    #[error("Backend failed: {0}")]
    Backend(#[from] V3Error),
    #[error("WebServer error: {0}")]
    WebServer(#[from] server::WebServerError),
    #[error("Tokio runtime failed to start: {0}")]
    Tokio(#[from] std::io::Error),
}
CMakeLists.txt (TensorRT-LLM backend)

@@ -18,6 +18,8 @@ set(CMAKE_CXX_STANDARD 20)
 include(FetchContent)
 include(ExternalProject)
 
+set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--allow-unsupported-compiler -ccbin=gcc")
+
 option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
 option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF)
 set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support")