misc(doc): rust documentation

Morgan Funtowicz 2024-11-22 15:35:55 +01:00
parent b9c04b9c07
commit 862a519fdd
2 changed files with 169 additions and 26 deletions


@@ -24,6 +24,10 @@ use tokio::time::Instant;
use tokio_stream::wrappers::UnboundedReceiverStream;
use tracing::{debug, error, info, warn};
/// Detect the number of CPU cores on the machine
///
/// returns: usize Number of CPU cores on the machine, always greater than 0
///
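/// # Examples
///
/// ```
/// // Minimal sketch (not part of the original commit): the detected count depends on the
/// // machine and on the TGI_USE_PHYSICAL_CORES build-time flag, but is never 0.
/// let num_cores = get_num_cores();
/// assert!(num_cores > 0);
/// ```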
fn get_num_cores() -> usize {
match option_env!("TGI_USE_PHYSICAL_CORES")
.unwrap_or("OFF")
@@ -41,6 +45,45 @@ fn get_num_cores() -> usize {
}
}
/// Subdivide the set of CPU cores available on the system into equal, non-overlapping subsets of CPU cores
///
/// # Arguments
///
/// * `num_cores_per_instance`: Minimum number of cores for each instance
///
/// returns: Vec<Range<usize>>
///
/// # Examples
///
/// ```
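/// // Illustrative sketch (values are hypothetical): request at least 4 cores per
/// // instance and inspect the non-overlapping core range assigned to each worker.
/// for (ordinal, cores) in get_cores_allocation(4).iter().enumerate() {
///     println!("worker {ordinal} -> cores {cores:?}");
/// }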
///
/// ```
fn get_cores_allocation(num_cores_per_instance: usize) -> Vec<Range<usize>> {
// Get the total number of cores on the CPU
let cores_count = get_num_cores();
// Make sure each instance has some cores available
let mut effective_num_cores_per_instance = match num_cores_per_instance {
0 => cores_count,
_ => num_cores_per_instance,
};
// If we have spare cores, let's see if we can give everyone one more core
let num_instances = cores_count / effective_num_cores_per_instance;
if cores_count - (num_instances * effective_num_cores_per_instance) >= num_instances {
effective_num_cores_per_instance = effective_num_cores_per_instance + 1;
warn!("Overriding cores allocation to {effective_num_cores_per_instance} per instance");
}
(0..num_instances)
.map(|ordinal| {
let start = ordinal * effective_num_cores_per_instance;
let end = (ordinal + 1) * effective_num_cores_per_instance - 1;
start..end
})
.collect()
}
type InferResult = Result<InferStreamResponse, InferError>;
unsafe impl Send for LlamaCppWorkerFrontend {}
@@ -96,6 +139,20 @@ pub struct LlamaCppBackend {
}
impl LlamaCppBackend {
/// Attempt to create a new llama.cpp worker from the provided model path
///
/// # Arguments
///
/// * `path`: Path to the GGUF model file to load
/// * `num_threads`: Number of threads the worker is allowed to spawn for its computations
///
/// returns: Result<UniquePtr<LlamaCppWorkerFrontend>, LlamaCppBackendError>
///
/// # Examples
///
/// ```
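/// use std::path::Path;
///
/// // Sketch with a hypothetical model path: build a worker allowed to spawn 8 threads.
/// let worker = LlamaCppBackend::allocate_worker(Path::new("/models/model.gguf"), 8)?;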
///
/// ```
fn allocate_worker(
path: &Path,
num_threads: u32,
@@ -144,32 +201,27 @@
}
}
fn get_cores_allocation(num_cores_per_instance: usize) -> Vec<Range<usize>> {
// Get the total number of cores on the CPU
let cores_count = get_num_cores();
// Make sure each instance has some cores available
let mut effective_num_cores_per_instance = match num_cores_per_instance {
0 => cores_count,
_ => num_cores_per_instance,
};
// If we have spare cores, let's see if we can give everyone one more core
let num_instances = cores_count / effective_num_cores_per_instance;
if cores_count - (num_instances * effective_num_cores_per_instance) >= num_instances {
effective_num_cores_per_instance = effective_num_cores_per_instance + 1;
warn!("Overriding cores allocation to {effective_num_cores_per_instance} per instance");
}
(0..num_instances)
.map(|ordinal| {
let start = ordinal * effective_num_cores_per_instance;
let end = (ordinal + 1) * effective_num_cores_per_instance - 1;
start..end
})
.collect()
}
/// llama.cpp worker streaming callback, called every time a new token is generated
///
/// # Arguments
///
/// * `ctx`: InferContext holding the channel used to stream generated tokens back to the client.
///   *UNSAFE* This parameter is represented as a raw mutable pointer to avoid automatically dropping the
///   resources it references after the first iteration step.
///   It is the caller's responsibility to ensure that `Box::from_raw` eventually takes back full ownership
///   of the pointer so it is correctly deleted.
/// * `new_token_id`: The sampled token identifier
/// * `new_token_logit`: The log probability of the sampled token
/// * `is_final`: Flag indicating if the sampled token is a final one
/// * `n_generated_tokens`: Number of tokens generated so far
///
/// returns: bool `true` if the worker should stop the generation at this stage, `false` to continue
///
/// # Examples
///
/// ```
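/// // This callback is handed to the C++ worker and invoked for every sampled token;
/// // it is not meant to be called directly from Rust. Conceptually (sketch only,
/// // parameter values are hypothetical):
/// //
/// //   let ctx = Box::into_raw(Box::new(infer_context));
/// //   let must_stop = llama_generate_callback(ctx, token_id, logit, is_final, n_generated);
/// //   // on the final step, `Box::from_raw(ctx)` must reclaim ownership of the context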
///
/// ```
fn llama_generate_callback(
ctx: *mut InferContext,
new_token_id: u32,
@@ -234,6 +286,20 @@ fn llama_generate_callback(
status.is_err()
}
/// Main loop scheduling incoming requests without blocking the main execution thread
///
/// # Arguments
///
/// * `queue`: Synchronized container receiving new requests
/// * `backlog`: Synchronized container dispatching new requests to the workers, for one of them to pick up
///
/// returns: ()
///
/// # Examples
///
/// ```
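/// // Sketch: bridge the tokio request queue to the MPMC backlog consumed by the workers.
/// // `backlog_tx` is assumed to come from the same channel crate that provides `MpmcSender`.
/// let (request_tx, request_rx) = tokio::sync::mpsc::unbounded_channel();
/// tokio::spawn(scheduler_loop(request_rx, backlog_tx));
/// // `request_tx` is then used by the HTTP layer to submit (GenerationContext, stream) pairs.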
///
/// ```
async fn scheduler_loop(
mut queue: UnboundedReceiver<(GenerationContext, UnboundedSender<InferResult>)>,
backlog: MpmcSender<(GenerationContext, UnboundedSender<InferResult>)>,
@@ -251,6 +317,23 @@ async fn scheduler_loop(
}
}
/// llama.cpp worker thread receiving incoming requests from the scheduler and handling the whole generation
/// process, along with streaming the generated tokens back to the client.
///
/// # Arguments
///
/// * `backend`: Owned llama.cpp worker with allocated execution resources
/// * `affinity`: Set of CPUs to bind the worker's thread for scheduling
/// * `tokenizer`: Tokenizer used to decode the generated tokens
/// * `backlog`: Multi-consumer queue holding the requests waiting to be handled by a worker
///
/// returns: ()
///
/// # Examples
///
/// ```
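/// // Sketch (handles are hypothetical): each worker runs on a dedicated OS thread,
/// // pinned to the core range it was allocated by `get_cores_allocation`.
/// let affinity: Vec<usize> = (0..4).collect();
/// std::thread::spawn(move || worker_loop(backend, affinity, tokenizer, backlog_rx));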
///
/// ```
fn worker_loop(
mut backend: UniquePtr<LlamaCppWorkerFrontend>,
affinity: Vec<usize>,


@@ -49,14 +49,74 @@ mod ffi {
#[cxx_name = "llama_cpp_worker_frontend_t"]
type LlamaCppWorkerFrontend;
/// Create a new llama.cpp worker
///
/// # Arguments
///
/// * `modelPath`: Path to the GGUF model file to load
/// * `num_threads`: Number of threads the worker is allowed to spawn to run computations
///
/// returns: Result<UniquePtr<LlamaCppWorkerFrontend>>
///
/// # Examples
///
/// ```
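/// // Sketch with a hypothetical model path: spin up a C++ worker with 8 compute threads.
/// let worker = create_worker_frontend("/data/models/model.gguf", 8)?;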
///
/// ```
fn create_worker_frontend(
modelPath: &str,
num_threads: u32,
) -> Result<UniquePtr<LlamaCppWorkerFrontend>>;
/// Define the set of NUMA cores on which the current thread is allowed to be scheduled.
///
/// # Arguments
///
/// * `affinity`: Set of CPU cores allowed for scheduling
///
/// returns: ()
///
/// # Examples
///
/// ```
/// // Bind the current thread for execution on cores 0, 1, 2, 3
/// set_numa_core_affinity(&[0, 1, 2, 3]);
/// ```
fn set_numa_core_affinity(affinity: &[usize]);
/// Force llama.cpp to reevaluate the allowed NUMA context (core and memory affinity) for
/// its internal thread scheduling.
/// This method can potentially cause llama.cpp / ggml to reallocate its internal threadpool to
/// match the new affinity constraints.
///
/// returns: ()
///
/// # Examples
///
/// ```
/// set_numa_core_affinity(&[0, 1, 2, 3]);
/// update_numa_affinity();
/// ```
fn update_numa_affinity();
/// Generate new tokens from the provided prompt `tokens` and the generation and sampling parameters,
/// streaming each generated token back through the `callback`.
///
/// # Arguments
///
/// * `tokens`: Prompt input tokenized from the request's text input
/// * `generation_params`: Parameters controlling the generation loop
/// * `sampling_params`: Parameters controlling the sampling from the token distribution
/// * `stream`: Opaque structure mapping the HTTP client transport used to stream tokens back
/// * `callback`: Function pointer called every time a new token is generated
///
/// returns: Result<usize>
///
/// # Examples
///
/// ```
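/// // Sketch (all argument values are assumptions): feed prompt tokens to the worker
/// // and let it push every generated token back through `llama_generate_callback`.
/// let n_generated = unsafe {
///     worker.pin_mut().stream(
///         &prompt_tokens,
///         generation_params,
///         sampling_params,
///         infer_context_ptr,
///         llama_generate_callback,
///     )?
/// };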
///
/// ```
unsafe fn stream(
self: Pin<&mut LlamaCppWorkerFrontend>,
tokens: &[u32],