Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-06-01 20:02:08 +00:00
misc(doc): rust documentation
This commit is contained in:
parent b9c04b9c07
commit 862a519fdd
@@ -24,6 +24,10 @@ use tokio::time::Instant;
use tokio_stream::wrappers::UnboundedReceiverStream;
use tracing::{debug, error, info};

/// Detect the number of CPU cores on the machine
///
/// returns: usize Integer greater than 0 representing the number of CPU cores on the machine
///
fn get_num_cores() -> usize {
    match option_env!("TGI_USE_PHYSICAL_CORES")
        .unwrap_or("OFF")
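The hunk cuts off right after `.unwrap_or("OFF")`, so the match arms selecting between physical and logical cores are not visible here. A minimal sketch of how such a body could be completed, assuming the `num_cpus` crate (an assumption; the diff does not show which crate the real implementation uses):

```rust
// Hypothetical completion of get_num_cores(); the num_cpus crate is an assumption.
fn get_num_cores() -> usize {
    match option_env!("TGI_USE_PHYSICAL_CORES")
        .unwrap_or("OFF")
        .to_uppercase()
        .as_str()
    {
        // "ON": count only physical cores, ignoring SMT / hyper-threading siblings
        "ON" => num_cpus::get_physical(),
        // anything else: count logical cores as seen by the OS scheduler
        _ => num_cpus::get(),
    }
}
```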
@@ -41,6 +45,45 @@ fn get_num_cores() -> usize {
    }
}

/// Subdivide the set of CPU cores available on the system into equal, non-overlapping subsets of CPU cores
///
/// # Arguments
///
/// * `num_cores_per_instance`: Minimum number of cores for each instance
///
/// returns: Vec<Range<usize>>
///
/// # Examples
///
/// ```
///
/// ```
fn get_cores_allocation(num_cores_per_instance: usize) -> Vec<Range<usize>> {
    // Get the total number of cores on the CPU
    let cores_count = get_num_cores();

    // Make sure each instance has some cores available
    let mut effective_num_cores_per_instance = match num_cores_per_instance {
        0 => cores_count,
        _ => num_cores_per_instance,
    };

    // If we have spare cores, let's see if we can give everyone one more core
    let num_instances = cores_count / effective_num_cores_per_instance;
    if cores_count - (num_instances * effective_num_cores_per_instance) >= num_instances {
        effective_num_cores_per_instance = effective_num_cores_per_instance + 1;
        warn!("Overriding cores allocation to {effective_num_cores_per_instance} per instance");
    }

    (0..num_instances)
        .map(|ordinal| {
            let start = ordinal * effective_num_cores_per_instance;
            let end = (ordinal + 1) * effective_num_cores_per_instance - 1;
            start..end
        })
        .collect()
}

type InferResult = Result<InferStreamResponse, InferError>;

unsafe impl Send for LlamaCppWorkerFrontend {}
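The `# Examples` block in the rustdoc above is left empty by this commit. A worked illustration of the partitioning arithmetic, with the machine's core count fixed to 16 instead of calling `get_num_cores()` (both the fixed count and the small `plan` helper are assumptions made purely for the example):

```rust
// Illustration of the core-partitioning arithmetic documented above,
// with the core count hard-coded instead of queried from the system.
fn plan(cores_count: usize, num_cores_per_instance: usize) -> (usize, usize) {
    // 0 means "give a single instance everything"
    let mut per_instance = match num_cores_per_instance {
        0 => cores_count,
        n => n,
    };
    let num_instances = cores_count / per_instance;
    // Spread spare cores if every instance can receive one more
    if cores_count - (num_instances * per_instance) >= num_instances {
        per_instance += 1;
    }
    (num_instances, per_instance)
}

fn main() {
    assert_eq!(plan(16, 0), (1, 16)); // one instance owns every core
    assert_eq!(plan(16, 5), (3, 5));  // 3 x 5 cores, 1 core left unused
    assert_eq!(plan(16, 7), (2, 8));  // 2 spare cores >= 2 instances -> 8 each
}
```

The last case shows the spare-core override: splitting 16 cores into 2 instances of 7 leaves 2 spare cores, enough to hand one more core to each instance.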
@@ -96,6 +139,20 @@ pub struct LlamaCppBackend {
}

impl LlamaCppBackend {
    /// Attempt to create a new llama.cpp worker from the provided model path
    ///
    /// # Arguments
    ///
    /// * `path`: Path to the GGUF model file to load
    /// * `num_threads`: Number of threads the model is allowed to spawn for its computations
    ///
    /// returns: Result<UniquePtr<LlamaCppWorkerFrontend>, LlamaCppBackendError>
    ///
    /// # Examples
    ///
    /// ```
    ///
    /// ```
    fn allocate_worker(
        path: &Path,
        num_threads: u32,
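The hunk stops at the `allocate_worker` signature. A plausible body simply forwards to the `create_worker_frontend` FFI constructor documented further down and maps the cxx error into the backend's error type; this is a hedged sketch only, and the `LlamaCppBackendError::ModelInitializationFailed` variant is an assumption not shown in this diff:

```rust
// Hypothetical body for allocate_worker(); the error variant name is an assumption.
fn allocate_worker(
    path: &Path,
    num_threads: u32,
) -> Result<UniquePtr<LlamaCppWorkerFrontend>, LlamaCppBackendError> {
    // create_worker_frontend is the cxx-bridged constructor declared in mod ffi below.
    create_worker_frontend(&path.display().to_string(), num_threads).map_err(|err| {
        LlamaCppBackendError::ModelInitializationFailed(path.to_path_buf(), err.to_string())
    })
}
```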
@@ -144,32 +201,27 @@ impl LlamaCppBackend {
    }
}

fn get_cores_allocation(num_cores_per_instance: usize) -> Vec<Range<usize>> {
    // Get the total number of cores on the CPU
    let cores_count = get_num_cores();

    // Make sure each instance has some cores available
    let mut effective_num_cores_per_instance = match num_cores_per_instance {
        0 => cores_count,
        _ => num_cores_per_instance,
    };

    // If we have spare cores, let's see if we can give everyone one more core
    let num_instances = cores_count / effective_num_cores_per_instance;
    if cores_count - (num_instances * effective_num_cores_per_instance) >= num_instances {
        effective_num_cores_per_instance = effective_num_cores_per_instance + 1;
        warn!("Overriding cores allocation to {effective_num_cores_per_instance} per instance");
    }

    (0..num_instances)
        .map(|ordinal| {
            let start = ordinal * effective_num_cores_per_instance;
            let end = (ordinal + 1) * effective_num_cores_per_instance - 1;
            start..end
        })
        .collect()
}

/// llama.cpp worker's streaming callback, called every time a new token is generated
///
/// # Arguments
///
/// * `ctx`: InferContext holding the channel used to stream generated tokens back to the client.
///   *UNSAFE* This parameter is unsafe and represented as a mutable pointer to avoid automatic drop of its
///   referenced resources after the first iteration step.
///   It is the caller's responsibility to ensure a `Box::from_raw` takes back full ownership of the pointer
///   for correct deletion.
/// * `new_token_id`: The sampled token identifier
/// * `new_token_logit`: The sampled token's log probability
/// * `is_final`: Flag indicating whether the sampled token is the final one
/// * `n_generated_tokens`: Counter representing the number of tokens generated so far
///
/// returns: bool `true` if the worker should stop the generation at this stage, `false` to continue
///
/// # Examples
///
/// ```
///
/// ```
fn llama_generate_callback(
    ctx: *mut InferContext,
    new_token_id: u32,
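The `ctx` contract described above (hand the context across the FFI boundary as a leaked raw pointer, never drop it inside the callback, reclaim it exactly once with `Box::from_raw`) is easy to get wrong. Below is a minimal, self-contained sketch of that ownership pattern, using a simplified stand-in context rather than the real `InferContext`:

```rust
/// Simplified stand-in for InferContext, used only to illustrate the ownership contract.
struct DemoContext {
    n_streamed: usize,
}

/// Same shape of contract as llama_generate_callback: borrow the context through the raw
/// pointer on every invocation, never drop it here, and return `true` to request a stop.
fn demo_callback(ctx: *mut DemoContext, is_final: bool) -> bool {
    // SAFETY: the caller guarantees ctx is valid and not aliased during the call.
    let ctx = unsafe { &mut *ctx };
    ctx.n_streamed += 1;
    is_final
}

fn main() {
    // Hand ownership to the "C++ side": the Box is leaked into a raw pointer.
    let ctx = Box::into_raw(Box::new(DemoContext { n_streamed: 0 }));

    // The worker would call the callback once per generated token.
    demo_callback(ctx, false);
    demo_callback(ctx, true);

    // After generation completes, the caller reclaims ownership exactly once,
    // so the context is dropped correctly.
    let ctx = unsafe { Box::from_raw(ctx) };
    assert_eq!(ctx.n_streamed, 2);
}
```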
@@ -234,6 +286,20 @@ fn llama_generate_callback(
    status.is_err()
}

/// Main loop scheduling incoming requests without blocking the main execution thread
///
/// # Arguments
///
/// * `queue`: Synchronized container receiving new requests
/// * `backlog`: Synchronized container dispatching new requests towards the workers, for one of them to pick up.
///
/// returns: ()
///
/// # Examples
///
/// ```
///
/// ```
async fn scheduler_loop(
    mut queue: UnboundedReceiver<(GenerationContext, UnboundedSender<InferResult>)>,
    backlog: MpmcSender<(GenerationContext, UnboundedSender<InferResult>)>,
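The `scheduler_loop` body is outside this hunk. Its documented role suggests the usual fan-out shape: drain the single-consumer tokio queue and push each request into a multi-consumer backlog so that any idle worker can claim it. The sketch below is an assumption-laden stand-in, with `crossbeam_channel` in place of whatever `MpmcSender` aliases and a placeholder `Request` type:

```rust
use tokio::sync::mpsc::UnboundedReceiver;

/// Placeholder standing in for (GenerationContext, UnboundedSender<InferResult>).
type Request = String;

/// Hypothetical scheduler shape: forward every incoming request from the single-consumer
/// tokio queue into a multi-consumer backlog so any idle worker thread can pick it up.
async fn scheduler_loop(
    mut queue: UnboundedReceiver<Request>,
    backlog: crossbeam_channel::Sender<Request>,
) {
    // recv() yields None once every sender is dropped, ending the loop cleanly.
    while let Some(request) = queue.recv().await {
        // With an unbounded crossbeam channel, send() never blocks, so calling it from
        // async context is fine; it only fails once all receivers are gone.
        if backlog.send(request).is_err() {
            break;
        }
    }
}
```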
@@ -251,6 +317,23 @@ async fn scheduler_loop(
    }
}

/// llama.cpp worker thread receiving incoming requests from the scheduler and handling the whole
/// generation process along with the streaming logic back to the client.
///
/// # Arguments
///
/// * `backend`: Owned llama.cpp worker with allocated execution resources
/// * `affinity`: Set of CPU cores to bind the worker's thread to for scheduling
/// * `tokenizer`: Tokenizer used to decode generated tokens
/// * `backlog`: Multi-consumer queue holding the requests waiting to be handled by a worker
///
/// returns: ()
///
/// # Examples
///
/// ```
///
/// ```
fn worker_loop(
    mut backend: UniquePtr<LlamaCppWorkerFrontend>,
    affinity: Vec<usize>,
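Likewise, the `worker_loop` body is not shown here. Based on the documented arguments, a hedged sketch of its shape: pin the thread with the FFI affinity helpers declared by this same commit, then serve the shared backlog until the scheduler hangs up (again with placeholder types; the generation call itself is reduced to a comment):

```rust
/// Placeholder standing in for (GenerationContext, UnboundedSender<InferResult>).
type Request = String;

/// Hypothetical worker shape; crossbeam_channel stands in for the real multi-consumer queue.
fn worker_loop(affinity: Vec<usize>, backlog: crossbeam_channel::Receiver<Request>) {
    // Bind this OS thread to its dedicated core subset before any heavy work,
    // using the two FFI helpers documented in the mod ffi hunk below.
    set_numa_core_affinity(&affinity);
    update_numa_affinity();

    // Serve requests until the scheduler side hangs up.
    while let Ok(request) = backlog.recv() {
        // backend.stream(..) would run the generation here, pushing tokens
        // through llama_generate_callback as they are sampled.
        let _ = request;
    }
}
```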
@@ -49,14 +49,74 @@ mod ffi {
        #[cxx_name = "llama_cpp_worker_frontend_t"]
        type LlamaCppWorkerFrontend;

        /// Create a new llama.cpp worker
        ///
        /// # Arguments
        ///
        /// * `modelPath`: Path to the GGUF model file to load
        /// * `num_threads`: Number of threads the worker is allowed to spawn to run computations
        ///
        /// returns: Result<UniquePtr<LlamaCppWorkerFrontend>>
        ///
        /// # Examples
        ///
        /// ```
        ///
        /// ```
        fn create_worker_frontend(
            modelPath: &str,
            num_threads: u32,
        ) -> Result<UniquePtr<LlamaCppWorkerFrontend>>;
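A short usage sketch for the bridged constructor above, combined with the affinity helpers documented next; the model path is a made-up example and error handling is reduced to `?`:

```rust
// Hypothetical call site on the Rust side of the bridge; the path is an example only.
fn boot_worker() -> Result<(), Box<dyn std::error::Error>> {
    // Restrict the current thread before the worker allocates its threadpool.
    set_numa_core_affinity(&[0, 1, 2, 3]);
    update_numa_affinity();

    // Bridge call declared in mod ffi: loads the GGUF file and returns an owning
    // UniquePtr to the C++ worker frontend, or a cxx exception on failure.
    let _worker = create_worker_frontend("/models/llama-3.1-8b-q4_k_m.gguf", 4)?;
    Ok(())
}
```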
        /// Define the NUMA core affinity on which the current thread is allowed to be scheduled.
        ///
        /// # Arguments
        ///
        /// * `affinity`: Set of CPU cores allowed for scheduling
        ///
        /// returns: ()
        ///
        /// # Examples
        ///
        /// ```
        /// // Bind the current thread for execution on cores 0, 1, 2, 3
        /// set_numa_core_affinity(&[0, 1, 2, 3]);
        /// ```
        fn set_numa_core_affinity(affinity: &[usize]);

        /// Force llama.cpp to reevaluate the allowed NUMA context (core and memory affinity) for
        /// its internal thread scheduling.
        /// This method can potentially cause llama.cpp / ggml to reallocate its internal threadpool to
        /// match the new affinity constraints.
        ///
        /// returns: ()
        ///
        /// # Examples
        ///
        /// ```
        /// set_numa_core_affinity(&[0, 1, 2, 3]);
        /// update_numa_affinity();
        /// ```
        fn update_numa_affinity();

        /// Generate new tokens from the provided prompt input `tokens` and the generation and sampling
        /// parameters, streaming each generated token back through `callback`.
        ///
        /// # Arguments
        ///
        /// * `tokens`: Prompt input, tokenized from the request's text input
        /// * `generation_params`: Parameters controlling the generation loop
        /// * `sampling_params`: Parameters controlling the sampling from the token distribution
        /// * `stream`: Opaque structure wrapping the HTTP client transport used to stream tokens back
        /// * `callback`: Function pointer called every time a new token is generated
        ///
        /// returns: Result<usize>
        ///
        /// # Examples
        ///
        /// ```
        ///
        /// ```
        unsafe fn stream(
            self: Pin<&mut LlamaCppWorkerFrontend>,
            tokens: &[u32],