text-generation-inference/backends/llamacpp/src/backend.rs


use crate::ffi::{
create_worker_frontend, GenerationParams, LlamaCppWorkerFrontend, SamplingParams,
};
use async_trait::async_trait;
use cxx::UniquePtr;
use std::ops::Deref;
use std::path::{Path, PathBuf};
use std::sync::mpsc::{channel, Receiver, Sender};
use std::sync::Arc;
use std::thread::{spawn, JoinHandle};
use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
use text_generation_router::validation::{
ValidGenerateRequest, ValidParameters, ValidStoppingParameters,
};
use text_generation_router::{FinishReason, Token};
use thiserror::Error;
use tokenizers::Tokenizer;
use tokio::sync::mpsc::{unbounded_channel, UnboundedSender};
use tokio::time::Instant;
use tokio_stream::wrappers::UnboundedReceiverStream;
use tracing::{debug, error, info};
type InferResult = Result<InferStreamResponse, InferError>;
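// SAFETY: the worker frontend is created on the calling thread but is then
// owned and driven exclusively by the dedicated scheduler thread (see
// `scheduler_loop`), so it only crosses a thread boundary once at startup
// (assumption: the underlying C++ worker has no thread-affine state at
// construction time).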
unsafe impl Send for LlamaCppWorkerFrontend {}
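/// Maps the router's validated sampling parameters onto the FFI-side
/// `SamplingParams` struct, field by field.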
impl From<&ValidParameters> for SamplingParams {
fn from(v: &ValidParameters) -> Self {
Self {
top_k: v.top_k,
top_p: v.top_p,
frequency_penalty: v.frequency_penalty,
repetition_penalty: v.repetition_penalty,
seed: v.seed,
}
}
}
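/// Maps the router's validated stopping parameters onto the FFI-side
/// `GenerationParams` struct.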
impl From<&ValidStoppingParameters> for GenerationParams {
fn from(v: &ValidStoppingParameters) -> Self {
Self {
max_new_tokens: v.max_new_tokens,
ignore_eos_token: v.ignore_eos_token,
}
}
}
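/// Per-request generation state shared with the worker: the prompt tokens,
/// the tokens generated so far, and the generation/sampling parameters.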
#[cfg_attr(debug_assertions, derive(Debug))]
pub(crate) struct GenerationContext {
pub(crate) input_tokens: Arc<Vec<u32>>,
pub(crate) generated_tokens: Vec<u32>,
pub(crate) generation_params: GenerationParams,
pub(crate) sampling_params: SamplingParams,
}
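/// Everything the token callback needs to stream a response back to the
/// client: the request start time, the response channel, the tokenizer and
/// the generation state.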
pub(crate) struct InferContext {
pub(crate) start: Instant,
pub(crate) stream: UnboundedSender<InferResult>,
pub(crate) tokenizer: Tokenizer,
pub(crate) generation: GenerationContext,
}
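/// Errors that can occur while setting up the llama.cpp backend.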
#[derive(Debug, Error)]
pub enum LlamaCppBackendError {
#[error("Provided GGUF model path {0} doesn't exist")]
ModelFileDoesntExist(String),
#[error("Failed to initialize model from GGUF file {0}: {1}")]
ModelInitializationFailed(PathBuf, String),
}
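/// Handle over a single worker: the channel used to submit requests and the
/// `JoinHandle` of the scheduler thread driving the llama.cpp frontend.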
struct LlamaCppWorker {
sender: Sender<(GenerationContext, UnboundedSender<InferResult>)>,
handle: JoinHandle<()>,
}
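/// Backend dispatching requests to llama.cpp worker threads. Only the
/// single-worker variant is implemented for now.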
pub enum LlamaCppBackend {
Single(LlamaCppWorker),
// Multi(Vec<LlamaCppWorker>)
}
impl LlamaCppBackend {
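    /// Creates the C++ worker frontend for the GGUF model at `path`, mapping
    /// any FFI error into `ModelInitializationFailed`.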
fn allocate_worker(
path: &Path,
) -> Result<UniquePtr<LlamaCppWorkerFrontend>, LlamaCppBackendError> {
create_worker_frontend(&path.display().to_string()).map_err(|ref err| {
LlamaCppBackendError::ModelInitializationFailed(path.to_path_buf(), err.to_string())
})
}
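    /// Creates the backend, verifying that `model_path` points to an existing
    /// GGUF file and spawning a single scheduler thread when
    /// `num_cores_per_instance` is 0 (the only mode supported so far; any
    /// other value currently panics).
    ///
    /// A minimal usage sketch; the file names below are hypothetical
    /// placeholders:
    ///
    /// ```no_run
    /// let tokenizer = tokenizers::Tokenizer::from_file("tokenizer.json").unwrap();
    /// let backend = LlamaCppBackend::new("model.gguf", tokenizer, 0).unwrap();
    /// ```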
pub fn new<P: AsRef<Path>>(
model_path: P,
tokenizer: Tokenizer,
num_cores_per_instance: u16,
) -> Result<Self, LlamaCppBackendError> {
let shared_path = Arc::new(model_path);
let path = shared_path.deref().as_ref();
if !path.exists() {
return Err(LlamaCppBackendError::ModelFileDoesntExist(
path.display().to_string(),
));
}
let worker = match num_cores_per_instance {
0 => {
let worker = Self::allocate_worker(path)?;
let (sender, receiver) = channel();
let handle = spawn(|| scheduler_loop(worker, tokenizer, receiver));
LlamaCppBackend::Single(LlamaCppWorker { sender, handle })
}
_ => panic!("num_cores_per_instance != 0 is not supported yet"),
};
Ok(worker)
}
}
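/// Callback invoked by the C++ worker for every generated token.
///
/// Decodes the token, wraps it in an intermediate or final
/// `InferStreamResponse` and pushes it onto the client stream. Returns `true`
/// when the response could not be sent (i.e. the client went away) so the
/// worker can stop generating.
///
/// `ctx` must point to a valid `InferContext` that outlives the whole
/// streaming call; see `scheduler_loop`, which leaks the box before the call
/// and reclaims it afterwards.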
fn llama_generate_callback(
ctx: *mut InferContext,
new_token_id: u32,
new_token_logit: f32,
is_final: bool,
n_generated_tokens: usize,
) -> bool {
debug!("Generated token: {new_token_id} -> logit={new_token_logit}, is_final={is_final} ({n_generated_tokens})");
let ctx = unsafe { &mut *ctx };
// Append the new token to the generated ones
ctx.generation.generated_tokens.push(new_token_id);
// Generate response
let response = match ctx.tokenizer.decode(&[new_token_id], false) {
Ok(text) => {
let special = ctx.tokenizer.get_added_vocabulary().is_special_token(&text);
let token = Token {
id: new_token_id,
text,
logprob: new_token_logit,
special,
};
// Should we generate an ending or intermediate response?
match is_final {
false => Ok(InferStreamResponse::Intermediate {
token,
top_tokens: vec![],
}),
true => {
// Decode the whole text
match ctx
.tokenizer
.decode(&ctx.generation.generated_tokens, false)
{
Ok(text) => Ok(InferStreamResponse::End {
token,
top_tokens: vec![],
generated_text: GeneratedText {
text,
generated_tokens: n_generated_tokens as u32,
finish_reason: FinishReason::Length,
seed: Some(ctx.generation.sampling_params.seed),
},
start: ctx.start,
queued: ctx.start,
}),
Err(err) => Err(InferError::GenerationError(err.to_string())),
}
}
}
}
Err(ref err) => Err(InferError::GenerationError(err.to_string())),
};
// Send back to the client
let status = ctx.stream.send(response).inspect_err(|err| {
error!("Failed to send back the response: {}", err);
});
status.is_err()
}
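/// Dedicated scheduler thread: pops requests off the backlog, heap-allocates
/// an `InferContext` for each of them and hands it to the C++ worker together
/// with the token callback. Exits once the request channel is closed.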
fn scheduler_loop(
mut backend: UniquePtr<LlamaCppWorkerFrontend>,
tokenizer: Tokenizer,
backlog: Receiver<(GenerationContext, UnboundedSender<InferResult>)>,
) {
// This loop will mostly decode a single token at every step, so there is no need to rely on the tokenizer's parallelism
tokenizers::utils::parallelism::set_parallelism(false);
loop {
if let Ok((generation, stream)) = backlog.recv() {
let start = Instant::now();
let tokenizer = tokenizer.clone();
let generation_params = generation.generation_params; // copy
let sampling_params = generation.sampling_params; // copy
let input_tokens = Arc::clone(&generation.input_tokens);
// Create the whole InferContext and move it to the heap
{
let ctx = Box::new(InferContext {
start,
stream,
tokenizer,
generation,
});
// We leak the box so the context is not freed when it goes out of scope,
// as the callback will be invoked repeatedly while streaming
unsafe {
let boxed_ctx = Box::into_raw(ctx);
if let Err(e) = backend.pin_mut().stream(
&input_tokens,
generation_params,
&sampling_params,
boxed_ctx,
llama_generate_callback,
) {
error!("Error while decoding tokens... {}", e.what());
}
// Reclaim ownership of the InferContext box so it is dropped and freed
let _ = Box::from_raw(boxed_ctx);
}
}
} else {
info!("IPC channel is closed, exiting the scheduler loop");
break;
}
}
}
#[async_trait]
impl Backend for LlamaCppBackend {
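    // Builds the per-request generation context and submits it to the worker,
    // handing back the receiving end of the response stream.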
fn schedule(
&self,
request: ValidGenerateRequest,
) -> Result<UnboundedReceiverStream<InferResult>, InferError> {
if let Some(input_ids) = request.input_ids {
let (sx, rx) = unbounded_channel();
let sampling_params = SamplingParams::from(&request.parameters);
let generation_params = GenerationParams::from(&request.stopping_parameters);
let ctx = GenerationContext {
input_tokens: Arc::clone(&input_ids),
generated_tokens: Vec::with_capacity(generation_params.max_new_tokens as usize),
generation_params,
sampling_params,
};
match self {
LlamaCppBackend::Single(worker) => match worker.sender.send((ctx, sx)) {
Ok(_) => Ok(UnboundedReceiverStream::new(rx)),
Err(_) => Err(InferError::GenerationError(
"Failed to sent the request".to_string(),
)),
},
}
} else {
Err(InferError::GenerationError(
"Unsupported modalities".to_string(),
))
}
}
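    // The worker loop has no health probe yet; report healthy unconditionally.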
async fn health(&self, _: bool) -> bool {
true
}
}