From 02cd6fe427b8ba705a4a138926971f8dc5562a9f Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Wed, 13 Nov 2024 00:08:26 +0100
Subject: [PATCH] chore(backend): minor improvements

---
 backends/llamacpp/csrc/ffi.hpp | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp
index 948e96a0..43694fa3 100644
--- a/backends/llamacpp/csrc/ffi.hpp
+++ b/backends/llamacpp/csrc/ffi.hpp
@@ -50,6 +50,8 @@ namespace huggingface::tgi::backends::llamacpp {
                 InferContext *ctx,
                 rust::Fn<bool(InferContext *, uint32_t, float_t, bool, size_t)> callback
         ) {
+            // Wrapper around the provided Rust callback, re-injecting the InferContext when control returns across the C++ FFI boundary.
+            // It captures the context (ctx) by reference and automatically invokes the Rust callback, forwarding the InferContext.
             auto context_forwarding_callback =
                     [=, &ctx](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens) -> bool {
                 return callback(ctx, new_token_id, logits, is_eos, n_generated_tokens);
@@ -76,11 +78,18 @@ namespace huggingface::tgi::backends::llamacpp {
     };
 
     std::unique_ptr<llama_cpp_worker_frontend_t> create_worker_frontend(rust::Str modelPath) {
-        const auto cxxPath = std::string(modelPath);
+        // Initialize the NUMA context via numactl, exactly once per process
+        static const bool INITIALIZED_NUMA_CONTEXT_ONCE = [](){
+            llama_numa_init(GGML_NUMA_STRATEGY_NUMACTL);
+            return true;
+        }();
+
+        // Set up the model loading parameters
         auto params = llama_model_default_params();
         params.use_mmap = true;
 
-        auto *model = (llama_load_model_from_file(cxxPath.c_str(), params));
+        // Load the model from the Rust-provided string path
+        auto *model = (llama_load_model_from_file(static_cast<std::string>(modelPath).c_str(), params));
         return std::make_unique<llama_cpp_worker_frontend_t>(model);
     }
 }
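
Note on the first hunk: the context_forwarding_callback being documented there is the usual trick for adapting a context-taking FFI callback into a context-free callable, by capturing the context in a lambda and re-injecting it on every invocation. A minimal standalone sketch of the same pattern follows; InferContext's fields, on_token, and the two callback type aliases are illustrative stand-ins, not types from the patch.

#include <cstdint>
#include <cstdio>
#include <functional>

// Illustrative stand-in for the backend's per-request context.
struct InferContext {
    int request_id;
};

// Shape of the callback the Rust side hands over: it expects the context back.
using context_callback_t = bool (*)(InferContext *, uint32_t, float, bool, std::size_t);

// Shape the inner generation loop wants: token data only, no context argument.
using token_callback_t = std::function<bool(uint32_t, float, bool, std::size_t)>;

static bool on_token(InferContext *ctx, uint32_t new_token_id, float logits, bool is_eos,
                     std::size_t n_generated_tokens) {
    std::printf("request %d: token %u (generated so far: %zu)\n",
                ctx->request_id, static_cast<unsigned>(new_token_id), n_generated_tokens);
    return is_eos;  // returning true tells the loop to stop
}

int main() {
    InferContext ctx{42};
    context_callback_t callback = &on_token;

    // Same adaptation as context_forwarding_callback: capture ctx and re-inject it,
    // so the generation loop never needs to know about InferContext.
    token_callback_t context_forwarding =
            [callback, &ctx](uint32_t id, float logits, bool eos, std::size_t n) -> bool {
        return callback(&ctx, id, logits, eos, n);
    };

    context_forwarding(7, 0.5f, false, 1);
    return 0;
}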
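
Note on the second hunk: INITIALIZED_NUMA_CONTEXT_ONCE relies on the C++11 guarantee that a function-local static is initialized exactly once, even when the function is entered concurrently, which makes the immediately-invoked lambda a cheap way to run one-time process setup such as llama_numa_init. A minimal sketch of the idiom, where expensive_one_time_setup and entry_point are illustrative names:

#include <cstdio>

// Illustrative stand-in for a process-wide, one-time setup call
// such as llama_numa_init(GGML_NUMA_STRATEGY_NUMACTL).
static void expensive_one_time_setup() {
    std::puts("setup runs exactly once");
}

void entry_point() {
    // The immediately-invoked lambda runs only on the first call to entry_point();
    // since C++11, initialization of a function-local static is thread-safe.
    static const bool initialized_once = []() {
        expensive_one_time_setup();
        return true;
    }();
    (void) initialized_once;  // keep -Wunused-variable quiet
}

int main() {
    entry_point();
    entry_point();  // no second "setup" line is printed
    return 0;
}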