From 862a519fdd818fd492696021bb3d19e993ec7b8b Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Fri, 22 Nov 2024 15:35:55 +0100
Subject: [PATCH] misc(doc): rust documentation

---
 backends/llamacpp/src/backend.rs | 135 +++++++++++++++++++++++++------
 backends/llamacpp/src/lib.rs     |  60 ++++++++++++++
 2 files changed, 169 insertions(+), 26 deletions(-)

diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs
index 709e5d42..32547655 100644
--- a/backends/llamacpp/src/backend.rs
+++ b/backends/llamacpp/src/backend.rs
@@ -24,6 +24,10 @@ use tokio::time::Instant;
 use tokio_stream::wrappers::UnboundedReceiverStream;
 use tracing::{debug, error, info};
 
+/// Detect the number of CPU cores on the machine
+///
+/// returns: usize - Integer greater than 0 representing the number of CPU cores on the machine
+///
 fn get_num_cores() -> usize {
     match option_env!("TGI_USE_PHYSICAL_CORES")
         .unwrap_or("OFF")
@@ -41,6 +45,45 @@ fn get_num_cores() -> usize {
     }
 }
 
+/// Subdivide the set of CPU cores available on the system into equal, non-overlapping subsets of CPU cores
+///
+/// # Arguments
+///
+/// * `num_cores_per_instance`: Minimum number of cores for each instance
+///
+/// returns: Vec<Range<usize>>
+///
+/// # Examples
+///
+/// ```
+///
+/// ```
+fn get_cores_allocation(num_cores_per_instance: usize) -> Vec<Range<usize>> {
+    // Get the total number of cores on the CPU
+    let cores_count = get_num_cores();
+
+    // Make sure each instance has some cores available
+    let mut effective_num_cores_per_instance = match num_cores_per_instance {
+        0 => cores_count,
+        _ => num_cores_per_instance,
+    };
+
+    // If we have spare cores, let's see if we can give everyone one more core
+    let num_instances = cores_count / effective_num_cores_per_instance;
+    if cores_count - (num_instances * effective_num_cores_per_instance) >= num_instances {
+        effective_num_cores_per_instance = effective_num_cores_per_instance + 1;
+        warn!("Overriding cores allocation to {effective_num_cores_per_instance} per instance");
+    }
+
+    (0..num_instances)
+        .map(|ordinal| {
+            let start = ordinal * effective_num_cores_per_instance;
+            let end = (ordinal + 1) * effective_num_cores_per_instance - 1;
+            start..end
+        })
+        .collect()
+}
+
 type InferResult = Result<InferStreamResponse, InferError>;
 
 unsafe impl Send for LlamaCppWorkerFrontend {}
@@ -96,6 +139,20 @@ pub struct LlamaCppBackend {
 }
 
 impl LlamaCppBackend {
+    /// Attempt to create a new llama.cpp worker from the provided model path
+    ///
+    /// # Arguments
+    ///
+    /// * `path`: Path to the GGUF model file to load
+    /// * `num_threads`: Number of threads the model is allowed to spawn for its computations
+    ///
+    /// returns: Result<UniquePtr<LlamaCppWorkerFrontend>, LlamaCppBackendError>
+    ///
+    /// # Examples
+    ///
+    /// ```
+    ///
+    /// ```
     fn allocate_worker(
         path: &Path,
         num_threads: u32,
@@ -144,32 +201,27 @@ impl LlamaCppBackend {
     }
 }
 
-fn get_cores_allocation(num_cores_per_instance: usize) -> Vec<Range<usize>> {
-    // Get the total number of cores on the CPU
-    let cores_count = get_num_cores();
-
-    // Make sure each instance has some cores available
-    let mut effective_num_cores_per_instance = match num_cores_per_instance {
-        0 => cores_count,
-        _ => num_cores_per_instance,
-    };
-
-    // If we have spare cores, let's see if we can give everyone one more core
-    let num_instances = cores_count / effective_num_cores_per_instance;
-    if cores_count - (num_instances * effective_num_cores_per_instance) >= num_instances {
-        effective_num_cores_per_instance = effective_num_cores_per_instance + 1;
-        warn!("Overriding cores allocation to {effective_num_cores_per_instance} per instance");
-    }
-
-    (0..num_instances)
-        .map(|ordinal| {
-            let start = ordinal * effective_num_cores_per_instance;
-            let end = (ordinal + 1) * effective_num_cores_per_instance - 1;
-            start..end
-        })
-        .collect()
-}
-
+/// llama.cpp worker's actual streaming callback, called every time a new token is generated
+///
+/// # Arguments
+///
+/// * `ctx`: InferContext holding the channel used to stream generated tokens back to the client.
+/// *UNSAFE* This parameter is unsafe and represented as a mutable pointer to avoid automatic drop of its
+/// referenced resources after the first iteration step.
+/// It is the caller's responsibility to ensure a `Box::from_raw` takes back full ownership of the pointer
+/// for correct deletion.
+/// * `new_token_id`: The sampled token identifier
+/// * `new_token_logit`: The sampled token's log probability
+/// * `is_final`: Flag indicating whether the sampled token is the final one
+/// * `n_generated_tokens`: Counter representing the actual number of tokens generated at this stage
+///
+/// returns: bool - `true` if the worker should stop the generation at this stage, `false` to continue
+///
+/// # Examples
+///
+/// ```
+///
+/// ```
 fn llama_generate_callback(
     ctx: *mut InferContext,
     new_token_id: u32,
@@ -234,6 +286,20 @@ fn llama_generate_callback(
     status.is_err()
 }
 
+/// Main loop scheduling incoming requests without blocking the main execution thread
+///
+/// # Arguments
+///
+/// * `queue`: Synchronized container receiving new requests
+/// * `backlog`: Synchronized container dispatching new requests to the workers, for one of them to pick up.
+///
+/// returns: ()
+///
+/// # Examples
+///
+/// ```
+///
+/// ```
 async fn scheduler_loop(
     mut queue: UnboundedReceiver<(GenerationContext, UnboundedSender<InferResult>)>,
     backlog: MpmcSender<(GenerationContext, UnboundedSender<InferResult>)>,
@@ -251,6 +317,23 @@ async fn scheduler_loop(
         }
     }
 }
+/// llama.cpp worker thread receiving incoming requests from the scheduler and handling the whole
+/// generation process, along with streaming the results back to the client.
+///
+/// # Arguments
+///
+/// * `backend`: Owned llama.cpp worker with allocated execution resources
+/// * `affinity`: Set of CPU cores the worker's thread is allowed to be scheduled on
+/// * `tokenizer`: Tokenizer used to decode the generated tokens
+/// * `backlog`: Multi-consumer queue holding the requests waiting to be handled by a worker
+///
+/// returns: ()
+///
+/// # Examples
+///
+/// ```
+///
+/// ```
 fn worker_loop(
     mut backend: UniquePtr<LlamaCppWorkerFrontend>,
     affinity: Vec<usize>,
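As a purely illustrative sketch of the behaviour documented for `get_cores_allocation` above (hypothetical core counts, simplified to omit the extra-core redistribution step the patched function performs, and not part of the patch itself), the subdivision into non-overlapping core ranges could look like:

```rust
use std::ops::Range;

// Split `total` cores into contiguous, non-overlapping ranges of `per_instance`
// cores each, mirroring the documented intent; a value of 0 means "use all cores".
fn split_cores(total: usize, per_instance: usize) -> Vec<Range<usize>> {
    let per_instance = if per_instance == 0 { total } else { per_instance };
    (0..total / per_instance)
        .map(|i| i * per_instance..(i + 1) * per_instance)
        .collect()
}

fn main() {
    // With 16 cores and 4 cores per instance: [0..4, 4..8, 8..12, 12..16]
    println!("{:?}", split_cores(16, 4));
}
```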
diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs
index e06220f2..d844bb9f 100644
--- a/backends/llamacpp/src/lib.rs
+++ b/backends/llamacpp/src/lib.rs
@@ -49,14 +49,74 @@ mod ffi {
         #[cxx_name = "llama_cpp_worker_frontend_t"]
         type LlamaCppWorkerFrontend;
 
+        /// Create a new llama.cpp worker
+        ///
+        /// # Arguments
+        ///
+        /// * `modelPath`: Path to the GGUF model file to load
+        /// * `num_threads`: Number of threads the worker is allowed to spawn to run computations
+        ///
+        /// returns: Result<UniquePtr<LlamaCppWorkerFrontend>>
+        ///
+        /// # Examples
+        ///
+        /// ```
+        ///
+        /// ```
        fn create_worker_frontend(
            modelPath: &str,
            num_threads: u32,
        ) -> Result<UniquePtr<LlamaCppWorkerFrontend>>;

+        /// Define the NUMA cores affinity on which the current thread is allowed to be scheduled.
+        ///
+        /// # Arguments
+        ///
+        /// * `affinity`: Set of CPU cores allowed for scheduling
+        ///
+        /// returns: ()
+        ///
+        /// # Examples
+        ///
+        /// ```
+        /// // Bind the current thread for execution on cores 0, 1, 2, 3
+        /// set_numa_core_affinity(&[0, 1, 2, 3]);
+        /// ```
        fn set_numa_core_affinity(affinity: &[usize]);
+
+        /// Force llama.cpp to reevaluate the allowed NUMA context (core and memory affinity) for
+        /// scheduling its internal threads.
+        /// This method can potentially cause llama.cpp / ggml to reallocate its internal threadpool to
+        /// match the new affinity constraints.
+        ///
+        /// returns: ()
+        ///
+        /// # Examples
+        ///
+        /// ```
+        /// set_numa_core_affinity(&[0, 1, 2, 3]);
+        /// update_numa_affinity();
+        /// ```
        fn update_numa_affinity();

+        /// Generate new tokens from the provided prompt input `tokens` and the generation and sampling parameters,
+        /// streaming each generated token back through the `callback`.
+        ///
+        /// # Arguments
+        ///
+        /// * `tokens`: Prompt input tokenized from the request's text input
+        /// * `generation_params`: Parameters controlling the generation loop
+        /// * `sampling_params`: Parameters controlling the sampling from the token distribution
+        /// * `stream`: Opaque structure mapping HTTP client transport to stream tokens back
+        /// * `callback`: Function pointer called every time a new token is generated
+        ///
+        /// returns: Result<usize>
+        ///
+        /// # Examples
+        ///
+        /// ```
+        ///
+        /// ```
        unsafe fn stream(
            self: Pin<&mut LlamaCppWorkerFrontend>,
            tokens: &[u32],
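The `callback` argument documented for `stream` above follows the same contract as `llama_generate_callback` in backend.rs: it is invoked once per sampled token, and its boolean return value tells the worker whether to stop. A minimal, FFI-free sketch of that contract (dummy token values, no `InferContext` pointer, and not the actual llama.cpp binding):

```rust
// Hypothetical stand-in for the documented callback shape: token id, log-probability,
// final-token flag, and the number of tokens generated so far; returns `true` to stop.
type TokenCallback = fn(new_token_id: u32, new_token_logit: f32, is_final: bool, n_generated: usize) -> bool;

fn drive_generation(max_tokens: usize, callback: TokenCallback) {
    for n in 1..=max_tokens {
        let (token_id, logit) = (42_u32, -0.25_f32); // stand-ins for a sampled token and its log-probability
        let is_final = n == max_tokens;
        // Honour the contract: a `true` return value stops generation early.
        if callback(token_id, logit, is_final, n) {
            break;
        }
    }
}

fn main() {
    drive_generation(8, |id, logit, is_final, n| {
        println!("token #{n}: id={id}, logit={logit}, final={is_final}");
        is_final
    });
}
```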