diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp
index a30eb217e..54f1cf736 100644
--- a/backends/llamacpp/csrc/backend.cpp
+++ b/backends/llamacpp/csrc/backend.cpp
@@ -39,7 +39,7 @@ namespace huggingface::tgi::backends::llamacpp {
         return {pSampler, llama_sampler_deleter};
     }
 
-    worker_t::worker_t(std::shared_ptr<llama_model> model, const llama_context_params &params)
+    worker_t::worker_t(std::shared_ptr<llama_model> model, const llama_context_params &&params)
             : model_(model), context_(llama_new_context_with_model(model_.get(), params)) {
 
 #ifdef TGI_LLAMACPP_BACKEND_DEBUG
diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp
index de37df75e..039d4eac9 100644
--- a/backends/llamacpp/csrc/backend.hpp
+++ b/backends/llamacpp/csrc/backend.hpp
@@ -85,7 +85,7 @@ namespace huggingface::tgi::backends::llamacpp {
          * @param model
          * @param params
          */
-        worker_t(std::shared_ptr<llama_model>, const llama_context_params &);
+        worker_t(std::shared_ptr<llama_model>, const llama_context_params &&);
 
         /**
          *
diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp
index 147f81aef..f9eec7819 100644
--- a/backends/llamacpp/csrc/ffi.hpp
+++ b/backends/llamacpp/csrc/ffi.hpp
@@ -51,8 +51,8 @@ namespace huggingface::tgi::backends::llamacpp {
         worker_t worker_;
 
     public:
-        explicit llama_cpp_worker_frontend_t(llama_model *model):
-            model_{ make_shared_llama_model(model) }, worker_(model_, {.no_perf = true}) {}
+        explicit llama_cpp_worker_frontend_t(llama_model *model, int32_t num_threads):
+            model_{ make_shared_llama_model(model) }, worker_(model_, {.n_ubatch = 1, .n_threads = num_threads, .no_perf = true}) {}
 
         size_t stream(
                 rust::Slice<const uint32_t> input_tokens,
@@ -88,7 +88,7 @@ namespace huggingface::tgi::backends::llamacpp {
         }
     };
 
-    std::unique_ptr<llama_cpp_worker_frontend_t> create_worker_frontend(rust::Str modelPath) {
+    std::unique_ptr<llama_cpp_worker_frontend_t> create_worker_frontend(rust::Str modelPath, uint32_t num_threads) {
 #ifdef TGI_LLAMACPP_BACKEND_DEBUG
         spdlog::set_level(spdlog::level::debug);
 #endif
@@ -105,7 +105,7 @@
 
         // Allocate the model from the Rust provided, string path
         auto *model = (llama_load_model_from_file(static_cast<std::string>(modelPath).c_str(), params));
-        return std::make_unique<llama_cpp_worker_frontend_t>(model);
+        return std::make_unique<llama_cpp_worker_frontend_t>(model, static_cast<int32_t>(num_threads));
     }
 
     struct numa_cpumask_deleter { void operator()(struct bitmask* cpumask){ numa_free_cpumask(cpumask); }};
diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs
index 1ef959a82..e846a476e 100644
--- a/backends/llamacpp/src/backend.rs
+++ b/backends/llamacpp/src/backend.rs
@@ -122,8 +122,9 @@ pub struct LlamaCppBackend {
 impl LlamaCppBackend {
     fn allocate_worker(
         path: &Path,
+        num_threads: u32,
     ) -> Result<UniquePtr<LlamaCppWorkerFrontend>, LlamaCppBackendError> {
-        create_worker_frontend(&path.display().to_string()).map_err(|ref err| {
+        create_worker_frontend(&path.display().to_string(), num_threads).map_err(|ref err| {
             LlamaCppBackendError::ModelInitializationFailed(path.to_path_buf(), err.to_string())
         })
     }
@@ -145,17 +146,19 @@
         // Allocate all the workers
         let streams = cores_allocation
             .iter()
-            .map(|affinity| match Self::allocate_worker(path) {
-                Ok(worker) => {
-                    let tokenizer = Arc::clone(&tokenizer);
-                    let (sender, receiver) = channel();
-                    let affinity = affinity.clone().collect::<Vec<_>>();
-                    spawn(move || worker_loop(worker, affinity, tokenizer, receiver));
+            .map(
+                |affinity| match Self::allocate_worker(path, num_cores_per_instance as u32) {
+                    Ok(worker) => {
+                        let tokenizer = Arc::clone(&tokenizer);
+                        let (sender, receiver) = channel();
+                        let affinity = affinity.clone().collect::<Vec<_>>();
+                        spawn(move || worker_loop(worker, affinity, tokenizer, receiver));
 
-                    Ok(LlamaCppWorker { sender })
-                }
-                Err(e) => Err(e),
-            })
+                        Ok(LlamaCppWorker { sender })
+                    }
+                    Err(e) => Err(e),
+                },
+            )
             .collect::<Result<Vec<_>, _>>()?;
 
         // Start the scheduler loop
diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs
index f9fc72e51..6b047bf53 100644
--- a/backends/llamacpp/src/lib.rs
+++ b/backends/llamacpp/src/lib.rs
@@ -49,7 +49,10 @@ mod ffi {
         #[cxx_name = "llama_cpp_worker_frontend_t"]
         type LlamaCppWorkerFrontend;
 
-        fn create_worker_frontend(modelPath: &str) -> Result<UniquePtr<LlamaCppWorkerFrontend>>;
+        fn create_worker_frontend(
+            modelPath: &str,
+            num_threads: u32,
+        ) -> Result<UniquePtr<LlamaCppWorkerFrontend>>;
 
         fn set_numactl_core_affinity(affinity: &[usize]);
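
Below is a minimal sketch, not part of the patch above, of how the per-worker thread count handed to create_worker_frontend could be derived on the Rust side. The helper name worker_thread_count and the fallback to std::thread::available_parallelism() are illustrative assumptions; the patch itself simply forwards num_cores_per_instance as u32.

    use std::thread;

    // Sketch only (assumed helper): derive a per-worker thread count from the number
    // of cores allocated to this instance, falling back to the host's available
    // parallelism when no explicit per-instance core count is configured.
    fn worker_thread_count(num_cores_per_instance: usize) -> u32 {
        if num_cores_per_instance > 0 {
            num_cores_per_instance as u32
        } else {
            thread::available_parallelism()
                .map(|n| n.get() as u32)
                .unwrap_or(1)
        }
    }

A caller could then pass worker_thread_count(num_cores_per_instance) where the patch currently writes num_cores_per_instance as u32.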