From 0c1dd0ed2b3d38dfaa2aa5409b39c7b73eca9493 Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Tue, 29 Oct 2024 22:30:36 +0100
Subject: [PATCH] feat(llamacpp): wip explosion

---
 backends/llamacpp/csrc/backend.cpp | 172 ++++++++++-------------------
 backends/llamacpp/csrc/backend.hpp | 103 +++++++++--------
 backends/llamacpp/csrc/ffi.hpp     |   4 +-
 3 files changed, 120 insertions(+), 159 deletions(-)

diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp
index 907fe58e..080a4401 100644
--- a/backends/llamacpp/csrc/backend.cpp
+++ b/backends/llamacpp/csrc/backend.cpp
@@ -15,82 +15,15 @@
 #include "backend.hpp"

 namespace huggingface::tgi::backends::llamacpp {
-    [[nodiscard]]
-    std::expected<std::pair<llama_model *, llama_context *>, TgiLlamaCppBackendError>
-    TgiLlamaCppBackend::FromGGUF(const std::filesystem::path &modelPath, const uint16_t nThreads) noexcept {
-        SPDLOG_DEBUG(FMT_STRING("Loading model from {}"), modelPath);
-
-        llama_backend_init();
-        llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL);
-
-#ifdef TGI_LLAMACPP_BACKEND_DEBUG
-        llama_print_system_info();
-#endif
-
-        // Load the model
-        if (!exists(modelPath)) {
-            return std::unexpected(TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST);
-        }
-
-        auto params = llama_model_default_params();
-        auto *model = llama_load_model_from_file(modelPath.c_str(), params);
-        auto *context = llama_new_context_with_model(model, {
-                .n_batch = 1,
-                .n_threads = nThreads,
-                .attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL,
-                .flash_attn = false,
-        });
-
-        return std::make_pair(model, context);
-    }
-
-    huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model,
-                                                                                 llama_context *const ctx)
-            : model(model), ctx(ctx) {
-#ifdef TGI_LLAMACPP_BACKEND_DEBUG
-        char modelName[256];
-        llama_model_meta_val_str(llama_get_model(ctx), "general.name", modelName, sizeof(modelName));
-        SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName));
-#endif
-    }
-
-    huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::~TgiLlamaCppBackend() {
-        if (ctx) {
-            SPDLOG_DEBUG("Freeing llama.cpp context");
-            llama_free(ctx);
-        }
-
-        if (model) {
-            SPDLOG_DEBUG("Freeing llama.cpp model");
-            llama_free_model(model);
-        }
-    }
-
-    std::vector<llama_token> TgiLlamaCppBackend::Tokenize(const std::string &text) const {
-        std::vector<llama_token> tokens(llama_n_seq_max(ctx));
-
-        if (auto nTokens = llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true,
-                                          true); nTokens < 0) {
-            tokens.resize(-nTokens);
-            llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true, true);
-        } else {
-            tokens.resize(nTokens);
-        }
-
-        SPDLOG_DEBUG(FMT_STRING("Tokenized input with {:d} tokens"), tokens.size());
-        return tokens;
-    }
-
-    std::unique_ptr<llama_sampler> TgiLlamaCppBackend::GetSamplerFromArgs(
-            const uint32_t topK, const float_t topP, const float_t frequencyPenalty, const float_t repetitionPenalty,
-            const uint64_t seed) {
-        auto *sampler = llama_sampler_chain_init({.no_perf = false});
+    std::unique_ptr<llama_sampler> SamplingParams::IntoLlamaSampler(const llama_model *pModel) const {
+        auto *pSampler = llama_sampler_chain_init({.no_perf = false});

         // Penalties
-        llama_sampler_chain_add(sampler, llama_sampler_init_penalties(
-                llama_n_vocab(model),
-                llama_token_eos(model),
-                llama_token_nl(model),
+        llama_sampler_chain_add(pSampler, llama_sampler_init_penalties(
+                llama_n_vocab(pModel),
+                llama_token_eos(pModel),
+                llama_token_nl(pModel),
                 0.0f,
                 repetitionPenalty,
                 frequencyPenalty,
@@ -98,57 +31,70 @@ namespace huggingface::tgi::backends::llamacpp {
                 false,
                 false
         ));
-        llama_sampler_chain_add(sampler, llama_sampler_init_top_k(static_cast<int32_t>(topK)));
+        llama_sampler_chain_add(pSampler, llama_sampler_init_top_k(static_cast<int32_t>(topK)));

         if (0 < topP && topP < 1) {
-            llama_sampler_chain_add(sampler, llama_sampler_init_top_p(topP, 1));
+            llama_sampler_chain_add(pSampler, llama_sampler_init_top_p(topP, 1));
         }

-        llama_sampler_chain_add(sampler, llama_sampler_init_dist(seed));
-        return std::make_unique(sampler);
+        llama_sampler_chain_add(pSampler, llama_sampler_init_dist(seed));
+        return std::unique_ptr<llama_sampler>(pSampler);
     }

-    std::expected<std::vector<llama_token>, TgiLlamaCppBackendError>
-    huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::Generate(
-            std::span<const llama_token> tokens,
-            const uint32_t topK,
-            const float_t topP,
-            const float_t frequencyPenalty,
-            const float_t repetitionPenalty,
-            const uint32_t maxNewTokens,
-            const uint64_t seed
-    ) {
-        SPDLOG_DEBUG(FMT_STRING("Received {:d} tokens to schedule"), tokens.size());
+    Worker::Worker(std::shared_ptr<llama_model> pModel, const llama_context_params &params)
+            : mModel_(pModel), mParams_(params) {

-        // Allocate generation result
-        std::vector<llama_token> generated;
-        generated.reserve(llama_n_seq_max(ctx) - tokens.size());
-
-        // Retrieve decoding context
-        auto batch = llama_batch_get_one(const_cast<llama_token *>(tokens.data()), static_cast<int32_t>(tokens.size()));
-        auto sampler = GetSamplerFromArgs(topK, topP, frequencyPenalty, repetitionPenalty, seed);
-
-        // Decode
-        for (auto [generating, nDecoded] = std::pair{true, 0uz}; generating && nDecoded < maxNewTokens; ++nDecoded) {
 #ifdef TGI_LLAMACPP_BACKEND_DEBUG
-            const auto start = std::chrono::steady_clock::now();
-            const auto status = llama_decode(ctx, batch);
-            const auto end = std::chrono::steady_clock::now();
-            const auto latency = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
-            SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency);
-#else
-            const auto status = llama_decode(ctx, batch);
+        char modelName[256];
+        llama_model_meta_val_str(pModel.get(), "general.name", modelName, sizeof(modelName));
+        SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName));
 #endif
-        if (LLAMA_SUCCESS(status)) {
-            // Sample the new token
-            auto new_token_id = llama_sampler_sample(*sampler, ctx, -1);
-            generated.emplace_back(new_token_id);
-            generating = !llama_token_is_eog(model, new_token_id);
+    }
+
+    void Worker::Loop(std::atomic_flag &running, std::atomic_uint8_t &waiting, std::queue<SamplingParams> &backlog) {
+        auto *context = llama_new_context_with_model(mModel_.get(), mParams_);
+
+        while (running.test(std::memory_order_acquire)) {
+            if (waiting.load(std::memory_order_acquire) > 0) {
+                --waiting;
+
+                auto request = backlog.front();
+                auto sampler = request.IntoLlamaSampler(mModel_.get());
+
+                // Retrieve decoding context
+                auto batch = llama_batch_get_one(tokens.data(), tokens.size());
+                // Decode
+                for (auto [generating, nDecoded] = std::pair{true, 0uz}; generating && nDecoded < 1; ++nDecoded) {
+#ifdef TGI_LLAMACPP_BACKEND_DEBUG
+                    const auto start = std::chrono::steady_clock::now();
+                    const auto status = llama_decode(context, batch);
+                    const auto end = std::chrono::steady_clock::now();
+                    const auto latency = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+                    SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency);
+#else
+                    const auto status = llama_decode(ctx, batch);
+#endif
+                    if (LLAMA_SUCCESS(status)) {
+                        // Sample the new token
+                        auto new_token_id = llama_sampler_sample(sampler.get(), context, -1);
+                        generated.emplace_back(new_token_id);
+                        generating = !llama_token_is_eog(mModel_.get(), new_token_id);
+
+                        // Next iteration
+                        batch = llama_batch_get_one(&new_token_id, 1);
+                    }
+                }
+
+                backlog.pop();

-            // Next iteration
-            batch = llama_batch_get_one(&new_token_id, 1);
             }
         }
-        return generated;
+
+        llama_free(context);
     }
+
+    huggingface::tgi::backends::llamacpp::BackendBase::BackendBase(llama_model *model)
+            : mModel_(model, llama_free_model) { llama_backend_init(); }
+
+    BackendBase::~BackendBase() { llama_backend_free(); }
 }
\ No newline at end of file
diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp
index 24b49949..e4814d45 100644
--- a/backends/llamacpp/csrc/backend.hpp
+++ b/backends/llamacpp/csrc/backend.hpp
@@ -4,9 +4,11 @@
 #ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
 #define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP

+#include <atomic>
 #include <cmath>
 #include <expected>
 #include <filesystem>
+#include <memory>
 #include <queue>
 #include <span>
 #include <vector>
@@ -16,72 +18,85 @@
 #define LLAMA_SUCCESS(x) x == 0

 namespace huggingface::tgi::backends::llamacpp {
-    enum TgiLlamaCppBackendError : uint8_t {
+    enum BackendError : uint8_t {
         MODEL_FILE_DOESNT_EXIST = 1
     };

-    class TgiLlamaCppBackend {
-        using TokenId = llama_token;
-
-    private:
-        llama_model *model;
-        llama_context *ctx;
+    struct SamplingParams {
+        uint32_t topK = std::numeric_limits<uint32_t>::max();
+        float_t topP = 1.0f;
+        float_t frequencyPenalty = 0.0f;
+        float_t repetitionPenalty = 0.0f;
+        uint64_t seed = 2014;

         /**
-         *
-         * @param topK
-         * @param topP
+         * Convert this GenerationParams to the respective llama_sampler structure
+         * @param Pointer to the model data
          * @return
          */
-        std::unique_ptr<llama_sampler> GetSamplerFromArgs(
-                uint32_t topK, float_t topP, float_t frequencyPenalty, float_t repetitionPenalty, uint64_t seed);
+        std::unique_ptr<llama_sampler> IntoLlamaSampler(const llama_model *) const;
+    };
+
+    class Worker {
+    protected:
+        constexpr static auto llama_context_deleter = [](llama_context *pContext) { llama_free(pContext); };

     public:
-        /**
-         *
-         * @return
-         */
-        static std::expected<std::pair<llama_model *, llama_context *>, TgiLlamaCppBackendError>
-        FromGGUF(const std::filesystem::path &, uint16_t) noexcept;
+        using model_ptr_type = std::shared_ptr<llama_model>;
+        using context_params_type = llama_context_params;
+        using token_id_type = llama_token;

-        TgiLlamaCppBackend(llama_model *model, llama_context *ctx);
+    private:
+        const model_ptr_type mModel_;
+        context_params_type mParams_;

-        ~TgiLlamaCppBackend();
+    public:
+        Worker(std::shared_ptr<llama_model> pModel, const llama_context_params &params);

-        /**
-         *
-         * @param text
-         * @return
-         */
-        [[nodiscard("Tokens will be freed after this call if not assigned to an lvalue")]]
-        std::vector<llama_token> Tokenize(const std::string &text) const;
+        void Loop(std::atomic_flag &, std::atomic_uint8_t &, std::queue<SamplingParams> &) const;
+    };
+
+
+    class BackendBase {
+
+    private:
+        std::shared_ptr<llama_model> mModel_;
+
+    public:
+        explicit BackendBase(llama_model *model);
+
+        ~BackendBase();

         /**
          *
          * @param tokens
-         * @param topK
-         * @param topP
-         * @param frequencyPenalty
-         * @param repetitionPenalty
+         * @params out
+         * @param params
          * @param maxNewTokens
-         * @param seed
          * @return
          */
         [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]]
-        std::expected<std::vector<llama_token>, TgiLlamaCppBackendError> Generate(
-                std::span<const llama_token> tokens,
-                uint32_t topK,
-                float_t topP = 1.0f,
-                float_t frequencyPenalty = 0.0f,
-                float_t repetitionPenalty = 0.0f,
-                uint32_t maxNewTokens = std::numeric_limits<uint32_t>::max() - 1,
-                uint64_t seed = 2014
+        std::expected<std::vector<llama_token>, BackendError> Generate(
+                std::span<const llama_token> tokens,
+                std::span<llama_token> out,
+                const SamplingParams &params,
+                uint32_t maxNewTokens = std::numeric_limits<uint32_t>::max() - 1
+        );
+
+        /**
+         *
+         * @param tokens
+         * @param params
+         * @param maxNewTokens
+         * @return
+         */
+        [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]]
+        std::expected<std::vector<llama_token>, BackendError> Generate(
+                std::span<const llama_token> tokens,
+                const SamplingParams &params,
+                uint32_t maxNewTokens = std::numeric_limits<uint32_t>::max() - 1
         );
     };
-
-    [[nodiscard("Create backend will be freed after this call if not assigned to an lvalue")]]
-    std::expected<std::unique_ptr<TgiLlamaCppBackend>, TgiLlamaCppBackendError>
-    CreateLlamaCppBackend(const std::filesystem::path &root);
 }

 #endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp
index 09d8af2d..d15728b9 100644
--- a/backends/llamacpp/csrc/ffi.hpp
+++ b/backends/llamacpp/csrc/ffi.hpp
@@ -28,10 +28,10 @@ namespace huggingface::tgi::backends::llamacpp::impl {

     class LlamaCppBackendImpl {
     private:
-        TgiLlamaCppBackend _inner;
+        BackendBase _inner;

     public:
-        LlamaCppBackendImpl(llama_model *model, llama_context *context) : _inner(model, context) {}
+        LlamaCppBackendImpl(llama_model *model) : _inner(model) {}
     };

     std::unique_ptr<LlamaCppBackendImpl> CreateLlamaCppBackendImpl(rust::Str modelPath, uint16_t nThreads) {
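
A minimal sketch of how the SamplingParams / BackendBase surface introduced by this patch could be driven. It is not part of the patch: the include of <llama.h>, the model path, and the sampling values are assumptions for illustration, error handling is omitted, and generation is only referenced since the Worker loop is still being wired up in this WIP commit.

    // Illustrative sketch only -- assumes the declarations from backend.hpp
    // above and llama.cpp's C API.
    #include <llama.h>
    #include "backend.hpp"

    using namespace huggingface::tgi::backends::llamacpp;

    int main() {
        // Load a GGUF model through llama.cpp (placeholder path); BackendBase
        // takes ownership and releases it via llama_free_model, as shown in
        // its constructor in backend.cpp.
        auto *model = llama_load_model_from_file("/path/to/model.gguf", llama_model_default_params());
        BackendBase backend(model);

        // Per-request sampling configuration; IntoLlamaSampler() materialises
        // the llama_sampler chain (penalties, top-k, top-p, dist) built in backend.cpp.
        SamplingParams params;
        params.topK = 40;
        params.topP = 0.95f;
        params.seed = 2014;
        auto sampler = params.IntoLlamaSampler(model);

        // Token generation itself would go through BackendBase::Generate(tokens,
        // params, maxNewTokens), which this commit is still connecting to Worker::Loop.
        return 0;
    }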