text-generation-inference/backends/llamacpp/src/lib.rs

use crate::backend::InferContext;
use crate::ffi::SamplingParams;
pub mod backend;

impl Default for SamplingParams {
    fn default() -> Self {
        Self {
            // top_k = u32::MAX and top_p = 1.0 effectively leave the token distribution untruncated.
            top_k: u32::MAX,
            top_p: 1.0f32,
            frequency_penalty: 0.0f32,
            repetition_penalty: 0.0f32,
            // Fixed default seed.
            seed: 2014u64,
        }
    }
}

#[cxx::bridge(namespace = "huggingface::tgi::backends::llamacpp")]
mod ffi {
    #[derive(Debug, Copy, Clone)]
    struct GenerationParams {
        max_new_tokens: u32,
        ignore_eos_token: bool,
    }

    #[derive(Debug, Copy, Clone)]
    struct SamplingParams {
        top_k: u32,
        top_p: f32,
        frequency_penalty: f32,
        repetition_penalty: f32,
        seed: u64,
    }

    extern "Rust" {
        type InferContext<'a>;
    }

    unsafe extern "C++" {
        include!("backends/llamacpp/csrc/ffi.hpp");

        #[cxx_name = "generation_params_t"]
        type GenerationParams;

        #[cxx_name = "sampling_params_t"]
        type SamplingParams;

        /// Represents an instance of the llama.cpp backend on the C++ side.
        #[cxx_name = "llama_cpp_worker_frontend_t"]
        type LlamaCppWorkerFrontend;

        /// Creates a llama.cpp worker frontend for the model stored at `modelPath`.
        fn create_worker_frontend(modelPath: &str) -> Result<UniquePtr<LlamaCppWorkerFrontend>>;

        /// Runs generation for the prompt `tokens`, surfacing produced tokens through `callback`.
        unsafe fn stream(
            self: Pin<&mut LlamaCppWorkerFrontend>,
            tokens: &[u32],
            generation_params: GenerationParams,
            sampling_params: &SamplingParams,
            stream: *mut InferContext,
            callback: unsafe fn(*mut InferContext, u32, f32, bool, usize) -> bool,
        ) -> Result<usize>;
    }
}
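
// A minimal sketch, not part of the original file: a unit test exercising the
// `Default` implementation above. It assumes the crate's C++ sources build and link
// so that `cargo test` can compile the cxx bridge.
#[cfg(test)]
mod tests {
    use super::ffi::SamplingParams;

    #[test]
    fn default_sampling_params_match_declared_values() {
        let params = SamplingParams::default();
        assert_eq!(params.top_k, u32::MAX);
        assert_eq!(params.top_p, 1.0f32);
        assert_eq!(params.frequency_penalty, 0.0f32);
        assert_eq!(params.repetition_penalty, 0.0f32);
        assert_eq!(params.seed, 2014u64);
    }
}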