diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp
index 38a94c8a..332bb4d5 100644
--- a/backends/llamacpp/csrc/backend.cpp
+++ b/backends/llamacpp/csrc/backend.cpp
@@ -2,52 +2,40 @@
 // Created by Morgan Funtowicz on 9/28/2024.
 //
-#include
-#include
+#include
+#include
+#include
+#include
 #include
+#include
 #include
 
 #include "backend.hpp"
 
 namespace huggingface::tgi::backends::llama {
-    std::unique_ptr<TgiLlamaCppBackend>
-    CreateLlamaCppBackend(std::string_view root) {
-        SPDLOG_INFO(FMT_STRING("Loading model from {}"), root);
-        gpt_init();
+    std::expected<std::unique_ptr<TgiLlamaCppBackend>, TgiLlamaCppBackendError>
+    CreateLlamaCppBackend(const std::filesystem::path& modelPath) {
+        SPDLOG_INFO(FMT_STRING("Loading model from {}"), modelPath);
+        llama_backend_init();
+        llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL);
 
-        // Fake argv
-        std::vector<std::string_view> args = {"tgi_llama_cpp_backend", "--model", root};
-        std::vector<char*> argv;
-        for (const auto &arg: args) {
-            argv.push_back(const_cast<char*>(arg.data()));
-        }
-        argv.push_back(nullptr);
-
-        // Create the GPT parameters
-        gpt_params params;
-        if (!gpt_params_parse(args.size(), argv.data(), params, LLAMA_EXAMPLE_SERVER)) {
-            throw std::runtime_error("Failed to create GPT Params from model");
+        // Load the model
+        if(!exists(modelPath)) {
+            return std::unexpected(TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST);
         }
-
-        // Create the inference engine
-        SPDLOG_INFO("Allocating llama.cpp model from gpt_params");
-        auto result = llama_init_from_gpt_params(params);
-
-        // Unpack all the inference engine components
-        auto model = result.model;
-        auto context = result.context;
-        auto loras = result.lora_adapters;
-
-        // Make sure everything is correctly initialized
-        if (model == nullptr)
-            throw std::runtime_error(fmt::format("Failed to load model from {}", root));
+        auto params = llama_model_default_params();
+        auto* model = llama_load_model_from_file(modelPath.c_str(), params);
+        auto* context = llama_new_context_with_model(model, {
+            .n_batch = 1,
+            .attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL,
+            .flash_attn = true,
+        });
 
         return std::make_unique<TgiLlamaCppBackend>(model, context);
     }
 
-    huggingface::tgi::backends::llama::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model,
-                                                                              llama_context *const ctx)
+    huggingface::tgi::backends::llama::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx)
             : model(model), ctx(ctx), batch() {
         char modelName[128];
         llama_model_meta_val_str(model, "general.name", modelName, sizeof(modelName));
diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp
index 7e3c9020..bcf728db 100644
--- a/backends/llamacpp/csrc/backend.hpp
+++ b/backends/llamacpp/csrc/backend.hpp
@@ -4,12 +4,17 @@
 #ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
 #define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
 
+#include
 #include
 #include
 
 namespace huggingface::tgi::backends::llama {
//    const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp";
 
+    enum TgiLlamaCppBackendError {
+        MODEL_FILE_DOESNT_EXIST = 1
+    };
+
     class TgiLlamaCppBackend {
     private:
@@ -23,7 +28,8 @@ namespace huggingface::tgi::backends::llama {
         void schedule();
     };
 
-    std::unique_ptr<TgiLlamaCppBackend> CreateLlamaCppBackend(std::string_view root);
+    std::expected<std::unique_ptr<TgiLlamaCppBackend>, TgiLlamaCppBackendError>
+    CreateLlamaCppBackend(const std::filesystem::path& root);
 }
 
 #endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp
index 4009588d..2f50cac1 100644
--- a/backends/llamacpp/offline/main.cpp
+++ b/backends/llamacpp/offline/main.cpp
@@ -4,6 +4,7 @@
 #include
 #include
+#include
 #include
 #include
 
 #include "../csrc/backend.hpp"
@@ -16,7 +17,7 @@ int main(int argc, char** argv) {
     spdlog::set_level(spdlog::level::debug);
 
-    const std::string_view model_root = argv[1];
-    auto backend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(model_root);
-    fmt::print(fmt::emphasis::bold | fg(fmt::color::yellow), "Successfully initialized llama.cpp model from {}\n", model_root);
+    const auto modelPath = absolute(std::filesystem::path(argv[1]));
+    if(auto backend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(modelPath); backend.has_value())
+        fmt::print(fmt::emphasis::bold | fg(fmt::color::yellow), "Successfully initialized llama.cpp model from {}\n", modelPath);
 }
\ No newline at end of file
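
Note: the offline example above only reports the success path of the new std::expected-based factory. The sketch below shows how a caller might also handle the error branch of CreateLlamaCppBackend. It reuses the headers, namespace, and error enum introduced by this patch (and requires C++23 for std::expected); the fmt includes, the argc check, the namespace alias, and the message wording are illustrative assumptions, not part of the change.

// Hypothetical caller of CreateLlamaCppBackend; not part of the patch.
#include <filesystem>

#include <fmt/color.h>
#include <fmt/format.h>

#include "../csrc/backend.hpp"  // assumed include path, mirroring offline/main.cpp

namespace llama = huggingface::tgi::backends::llama;

int main(int argc, char** argv) {
    if (argc < 2) {
        fmt::print("Usage: {} <path/to/model.gguf>\n", argv[0]);
        return 1;
    }

    const auto modelPath = std::filesystem::absolute(std::filesystem::path(argv[1]));

    // CreateLlamaCppBackend returns
    // std::expected<std::unique_ptr<TgiLlamaCppBackend>, TgiLlamaCppBackendError>,
    // so failures can be inspected instead of being thrown as exceptions.
    auto backend = llama::CreateLlamaCppBackend(modelPath);
    if (!backend.has_value()) {
        // MODEL_FILE_DOESNT_EXIST is the only error code defined in this patch.
        if (backend.error() == llama::TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST)
            fmt::print(fg(fmt::color::red), "Model file {} does not exist\n", modelPath.string());
        return 1;
    }

    fmt::print(fg(fmt::color::green), "Backend ready for {}\n", modelPath.string());
    return 0;
}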