diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp
index 332bb4d5..859041c2 100644
--- a/backends/llamacpp/csrc/backend.cpp
+++ b/backends/llamacpp/csrc/backend.cpp
@@ -2,20 +2,23 @@
 // Created by Morgan Funtowicz on 9/28/2024.
 //
-#include <cmath>
 #include <expected>
+#include <chrono>
+
 #include <filesystem>
 #include <memory>
+#include <span>
 #include <ggml.h>
 #include <llama.h>
 #include <spdlog/spdlog.h>
+
 #include "backend.hpp"

 namespace huggingface::tgi::backends::llama {
     std::expected<std::unique_ptr<TgiLlamaCppBackend>, TgiLlamaCppBackendError>
     CreateLlamaCppBackend(const std::filesystem::path& modelPath) {
-        SPDLOG_INFO(FMT_STRING("Loading model from {}"), modelPath);
+        SPDLOG_DEBUG(FMT_STRING("Loading model from {}"), modelPath);
         llama_backend_init();
         llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL);
@@ -28,39 +31,109 @@
         auto* model = llama_load_model_from_file(modelPath.c_str(), params);
         auto* context = llama_new_context_with_model(model, {
             .n_batch = 1,
+            .n_threads = 16,
             .attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL,
-            .flash_attn = true,
+            .flash_attn = false,
         });

         return std::make_unique<TgiLlamaCppBackend>(model, context);
     }

     huggingface::tgi::backends::llama::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx)
-        : model(model), ctx(ctx), batch() {
-        char modelName[128];
-        llama_model_meta_val_str(model, "general.name", modelName, sizeof(modelName));
+        : model(model), ctx(ctx) {
+#ifndef NDEBUG
+        char modelName[256];
+        llama_model_meta_val_str(llama_get_model(ctx), "general.name", modelName, sizeof(modelName));
         SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName));
+#endif
     }

     huggingface::tgi::backends::llama::TgiLlamaCppBackend::~TgiLlamaCppBackend() {
-        if (model) {
-            SPDLOG_DEBUG("Freeing llama.cpp model");
-            llama_free_model(model);
-        }
-
         if (ctx) {
             SPDLOG_DEBUG("Freeing llama.cpp context");
             llama_free(ctx);
         }
+
+        if(model) {
+            SPDLOG_DEBUG("Freeing llama.cpp model");
+            llama_free_model(model);
+        }
     }

-    void huggingface::tgi::backends::llama::TgiLlamaCppBackend::schedule() {
-        std::vector<llama_token> tokens;
+    std::vector<TgiLlamaCppBackend::TokenId> TgiLlamaCppBackend::Tokenize(const std::string &text) const {
+        std::vector<TokenId> tokens(llama_n_seq_max(ctx));
+
+        if(auto nTokens = llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true, true); nTokens < 0){
+            tokens.resize(-nTokens);
+            llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true, true);
+        } else {
+            tokens.resize(nTokens);
+        }
+
+        SPDLOG_DEBUG(FMT_STRING("Tokenized input with {:d} tokens"), tokens.size());
+        return tokens;
     }

-    namespace impl {
-        class LlamaCppBackendImpl {
-
-        };
-    }
+    std::unique_ptr<llama_sampler *> TgiLlamaCppBackend::GetSamplerFromArgs(
+            const uint32_t topK, const float_t topP, const float_t frequencyPenalty, const float_t repetitionPenalty, const uint64_t seed) {
+        auto *sampler = llama_sampler_chain_init({.no_perf = false});
+
+        // Penalties
+        llama_sampler_chain_add(sampler, llama_sampler_init_penalties(
+                llama_n_vocab(model),
+                llama_token_eos(model),
+                llama_token_nl (model),
+                0.0f,
+                repetitionPenalty,
+                frequencyPenalty,
+                0.0f,
+                false,
+                false
+        ));
+        llama_sampler_chain_add(sampler, llama_sampler_init_top_k(static_cast<int32_t>(topK)));
+
+        if(0 < topP && topP < 1) {
+            llama_sampler_chain_add(sampler, llama_sampler_init_top_p(topP, 1));
+        }
+
+        llama_sampler_chain_add(sampler, llama_sampler_init_dist(seed));
+        return std::make_unique<llama_sampler *>(sampler);
+    }
+
+    std::vector<TgiLlamaCppBackend::TokenId> huggingface::tgi::backends::llama::TgiLlamaCppBackend::Generate(
+            std::span<const TokenId> tokens, const uint32_t topK, const float_t topP, const uint32_t maxNewTokens) {
+        SPDLOG_DEBUG(FMT_STRING("Received {:d} tokens to schedule"), tokens.size());
+
+        // Allocate generation result
+        std::vector<TokenId> generated;
+        generated.reserve(llama_n_seq_max(ctx) - tokens.size());
+
+        // Retrieve decoding context
+        auto batch = llama_batch_get_one(const_cast<TokenId *>(tokens.data()), static_cast<int32_t>(tokens.size()));
+        auto sampler = GetSamplerFromArgs(topK, topP, 1.0, 1.0, 2014);
+
+        // Decode
+        for(auto [generating, nDecoded] = std::pair{true, 0uz}; generating && nDecoded < maxNewTokens; ++nDecoded) {
+#ifndef NDEBUG
+            const auto start = std::chrono::steady_clock::now();
+            const auto status = llama_decode(ctx, batch);
+            const auto end = std::chrono::steady_clock::now();
+            const auto latency = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+            SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency);
+#else
+            const auto status = llama_decode(ctx, batch);
+#endif
+            if (status == LLAMA_SUCCESS) {
+                // Sample the new token
+                auto new_token_id = llama_sampler_sample(*sampler, ctx, -1);
+                generated.emplace_back(new_token_id);
+                generating = !llama_token_is_eog(model, new_token_id);
+
+                // Next iteration
+                batch = llama_batch_get_one(&new_token_id, 1);
+            }
+        }
+
+        generated.shrink_to_fit();
+        return generated;
+    }
 }
\ No newline at end of file
diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp
index bcf728db..e109a158 100644
--- a/backends/llamacpp/csrc/backend.hpp
+++ b/backends/llamacpp/csrc/backend.hpp
@@ -4,28 +4,61 @@
 #ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
 #define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP

+#include <cmath>
+#include <span>
 #include <expected>
 #include <filesystem>
 #include <llama.h>

-namespace huggingface::tgi::backends::llama {
-//    const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp";
+#define LLAMA_SUCCESS 0

+namespace huggingface::tgi::backends::llama {
     enum TgiLlamaCppBackendError {
         MODEL_FILE_DOESNT_EXIST = 1
     };

     class TgiLlamaCppBackend {
+        using TokenId = int32_t;
+
     private:
         llama_model* model;
         llama_context* ctx;
-        llama_batch batch;
+
+        /**
+         *
+         * @param topK
+         * @param topP
+         * @return
+         */
+        std::unique_ptr<llama_sampler *> GetSamplerFromArgs(
+            uint32_t topK, float_t topP, float_t frequencyPenalty, float_t repetitionPenalty, uint64_t seed);
+
     public:
         TgiLlamaCppBackend(llama_model *model, llama_context *ctx);
         ~TgiLlamaCppBackend();

-        void schedule();
+        /**
+         *
+         * @param text
+         * @return
+         */
+        [[nodiscard]] std::vector<TokenId> Tokenize(const std::string& text) const;
+
+        /**
+         *
+         * @param tokens
+         * @param topK
+         * @param topP
+         * @param maxNewTokens
+         * @return
+         */
+        [[nodiscard]] std::vector<TokenId> Generate(
+            std::span<const TokenId> tokens,
+            uint32_t topK,
+            float_t topP = 1.0f,
+            uint32_t maxNewTokens = std::numeric_limits<uint32_t>::max()
+        );
     };

     std::expected<std::unique_ptr<TgiLlamaCppBackend>, TgiLlamaCppBackendError>
diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp
index 2f50cac1..3165261f 100644
--- a/backends/llamacpp/offline/main.cpp
+++ b/backends/llamacpp/offline/main.cpp
@@ -3,21 +3,37 @@
 //

 #include <filesystem>
+#include <fmt/ranges.h>
 #include <fmt/color.h>
 #include <fmt/format.h>
-#include <llama.h>
 #include <spdlog/spdlog.h>
 #include "../csrc/backend.hpp"

 int main(int argc, char** argv) {
-    if(argc < 2) {
+    if (argc < 2) {
         fmt::print("No model folder provider");
         return 1;
     }

     spdlog::set_level(spdlog::level::debug);

+    const auto prompt = "My name is Morgan";
+
     const auto modelPath = absolute(std::filesystem::path(argv[1]));
-    if(auto backend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(modelPath); backend.has_value())
-        fmt::print(fmt::emphasis::bold | fg(fmt::color::yellow), "Successfully initialized llama.cpp model from {}\n", modelPath);
-}
\ No newline at end of file
+    if (auto maybeBackend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(modelPath); maybeBackend.has_value()) {
+        // Retrieve the backend
+        const auto& backend = *maybeBackend;
+
+        // Generate
+        const auto promptTokens = backend->Tokenize(prompt);
+        const auto out = backend->Generate(promptTokens, 30, 1.0, 32);
+        fmt::print(FMT_STRING("Generated: {}"), out);
+    } else {
+        switch (maybeBackend.error()) {
+            case huggingface::tgi::backends::llama::TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST:
+                fmt::print(fmt::emphasis::bold | fg(fmt::color::red), "Specified file {} doesn't exist", modelPath);
+                return maybeBackend.error();
+        }
+    }
+}