feat(backend): refactor the callback to handle intermediate and end inference message

2025-09-11 20:34:54 +00:00 · 2024-11-04 16:17:43 +01:00 · 2024-11-04 16:17:43 +01:00 · 5b7a951389
commit 5b7a951389
parent 11c593dc69
5 changed files with 142 additions and 141 deletions
--- a/backends/llamacpp/csrc/backend.cpp
+++ b/backends/llamacpp/csrc/backend.cpp
@ -114,9 +114,6 @@ namespace huggingface::tgi::backends::llamacpp {
                auto is_eog = llama_token_is_eog(mModel_.get(), new_token_id);
                auto new_token_logits = 0.0f; // TODO: return logit

-                // Push the token to the generated vector on Rust side
-                generation_context.generated_tokens[n_decoded_tokens] = new_token_id;
-
                // Handle termination cases
                const auto has_reach_max_tokens = n_decoded_tokens >= max_new_tokens - 1;
                const auto has_reach_eog = !generation_context.generation_params.ignore_eos_token & is_eog;
@ -150,10 +147,15 @@ namespace huggingface::tgi::backends::llamacpp {
    ) {
        // TODO: Should we provide a way to change this value?
        auto generated = std::vector<llama_token>(2 << 8);
+        auto inner_callback = [&](uint32_t new_token_id, float_t new_token_logit, bool is_eos,
+                                  size_t num_generated_tokens) {
+            generated.emplace_back(new_token_id);

-        auto nTokensGenerated = generate(tokens, generated, generation_params, sampling_params, callback);
-        if (nTokensGenerated.has_value())
-            generated.resize(*nTokensGenerated);
+            if (callback.has_value())
+                (*callback)(new_token_id, new_token_logit, is_eos, num_generated_tokens);
+        };
+
+        auto nTokensGenerated = stream(tokens, generation_params, sampling_params, inner_callback);
        return generated;
    }

@ -168,25 +170,24 @@ namespace huggingface::tgi::backends::llamacpp {
        llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL);
    }

-    std::expected<std::size_t, backend_error_t>
-    single_worker_backend_t::generate(
+    std::expected<size_t, backend_error_t>
+    single_worker_backend_t::stream(
            std::span<const llama_token> tokens,
-            std::span<llama_token> out,
            const generation_params_t &generation_params,
            const sampling_params_t &sampling_params,
-            const std::optional<llama_decode_callback> &callback
+            const llama_decode_callback &callback
    ) {
-        return mWorker_.generate(mContext_.get(), {generation_params, sampling_params, tokens, out}, callback);
+        return mWorker_.generate(mContext_.get(), {generation_params, sampling_params, tokens}, callback);
    }

    std::expected<size_t, backend_error_t>
-    multi_worker_backend_t::generate(
-            std::span<const llama_token>,
-            std::span<llama_token>,
+    multi_worker_backend_t::stream(
+            std::span<const llama_token> tokens,
            const generation_params_t &generation_params,
            const sampling_params_t &sampling_params,
-            const std::optional<llama_decode_callback> &callback) {
-        SPDLOG_ERROR("Not implemented yet");
-        return 0uz;
+            const llama_decode_callback &callback
+    ) {
+        SPDLOG_WARN("Not implemented for multi_worker_t");
+        return 0;
    }
 }
--- a/backends/llamacpp/csrc/backend.hpp
+++ b/backends/llamacpp/csrc/backend.hpp
@ -69,7 +69,6 @@ namespace huggingface::tgi::backends::llamacpp {
        generation_params_t generation_params;
        sampling_params_t sampling_params;
        std::span<const llama_token> input_tokens;
-        std::span<llama_token> generated_tokens;
    };

    /**
@ -125,25 +124,9 @@ namespace huggingface::tgi::backends::llamacpp {
        /**
         *
         * @param tokens
-         * @params out
-         * @param params
-         * @param maxNewTokens
-         * @return
-         */
-        [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]]
-        virtual std::expected<size_t, backend_error_t> generate(
-                std::span<const llama_token> input_tokens,
-                std::span<llama_token> generated_tokens,
-                const generation_params_t &generation_params,
-                const sampling_params_t &sampling_params,
-                const std::optional<llama_decode_callback> &callback
-        ) = 0;
-
-        /**
-         *
-         * @param tokens
-         * @param params
-         * @param maxNewTokens
+         * @param generation_params
+         * @param sampling_params
+         * @param callback
         * @return
         */
        [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]]
@ -153,6 +136,22 @@ namespace huggingface::tgi::backends::llamacpp {
                const sampling_params_t &sampling_params,
                const std::optional<llama_decode_callback> &callback = std::nullopt
        );
+
+        /**
+         *
+         * @param tokens
+         * @param generation_params
+         * @param sampling_params
+         * @params callback
+         * @return
+         */
+        [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]]
+        virtual std::expected<size_t, backend_error_t> stream(
+                std::span<const llama_token> tokens,
+                const generation_params_t &generation_params,
+                const sampling_params_t &sampling_params,
+                const llama_decode_callback &callback
+        ) = 0;
    };


@ -174,16 +173,11 @@ namespace huggingface::tgi::backends::llamacpp {
    public:
        explicit single_worker_backend_t(llama_model *pModel, const std::optional<llama_context_params> &);

-        using backend_base_t::generate;
-
-        std::expected<size_t, backend_error_t>
-        generate(
+        std::expected<size_t, backend_error_t> stream(
                std::span<const llama_token> tokens,
-                std::span<llama_token> out,
                const generation_params_t &generation_params,
                const sampling_params_t &sampling_params,
-                const std::optional<llama_decode_callback> &callback
-        ) override;
+                const llama_decode_callback &callback) override;
    };

    class multi_worker_backend_t : backend_base_t {
@ -191,13 +185,11 @@ namespace huggingface::tgi::backends::llamacpp {
        llama_context_ptr mContext_;

    public:
-        std::expected<size_t, backend_error_t> generate(
-                std::span<const llama_token>,
-                std::span<llama_token>,
+        std::expected<size_t, backend_error_t> stream(
+                std::span<const llama_token> tokens,
                const generation_params_t &generation_params,
                const sampling_params_t &sampling_params,
-                const std::optional<llama_decode_callback> &callback
-        ) override;
+                const llama_decode_callback &callback) override;
    };
 }

--- a/backends/llamacpp/csrc/ffi.hpp
+++ b/backends/llamacpp/csrc/ffi.hpp
@ -28,23 +28,20 @@ namespace huggingface::tgi::backends::llamacpp {

    // Concept identifying types which have a .generate() -> size_t method to do in-place generation
    template<typename T>
-    concept has_emplace_generate = requires(
+    concept has_stream_method = requires(
            T t,
            std::span<const llama_token> input_tokens,
-            std::span<llama_token> generated_tokens,
            const generation_params_t &generation_params,
            const sampling_params_t &sampling_params,
            llama_decode_callback callback
    ) {
        {
-        t.generate(input_tokens, generated_tokens, generation_params, sampling_params, callback)
+        t.stream(input_tokens, generation_params, sampling_params, callback)
        } -> std::same_as<std::expected<size_t, backend_error_t>>;
    };

-    static_assert(has_emplace_generate<single_worker_backend_t>,
-                  "single_worker_backend_t doesn't meet concept is_generate_emplace_capable");
-    static_assert(has_emplace_generate<multi_worker_backend_t>,
-                  "multi_worker_backend_t doesn't meet concept is_generate_emplace_capable");
+    static_assert(has_stream_method<single_worker_backend_t>, "single_worker_backend_t doesn't meet concept has_stream_method");
+    static_assert(has_stream_method<multi_worker_backend_t>, "multi_worker_backend_t doesn't meet concept has_stream_method");

    class llama_cpp_backend_exception_t : std::exception {

@ -64,29 +61,25 @@ namespace huggingface::tgi::backends::llamacpp {

        size_t stream(
                rust::Slice<const uint32_t> input_tokens,
-                rust::Slice <uint32_t> generated_tokens,
                const generation_params_t generation_params,
                const sampling_params_t &sampling_params,
-                OpaqueStream *stream,
-                rust::Fn<void(OpaqueStream *, uint32_t, float_t, bool, size_t)> callback
+                InferContext *ctx,
+                rust::Fn<void(InferContext *, uint32_t, float_t, bool, size_t)> callback
        ) {
            // Define the visitor lambda function which requires the has_emplace_generate constraint on T
-            auto inner_fw = [=, &sampling_params, &stream, &callback]<has_emplace_generate T>(T &&backend)
+            auto inner_fw = [=, &sampling_params, &ctx, &callback]<has_stream_method T>(T &&backend)
                    -> std::expected<size_t, backend_error_t> {

-                auto context_forwarding_callback = [=, &stream](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens){
-                    callback(stream, new_token_id, logits, is_eos, n_generated_tokens);
+                auto context_forwarding_callback = [=, &ctx](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens){
+                    callback(ctx, new_token_id, logits, is_eos, n_generated_tokens);
                };

                // Ask the compiler to create view over Rust slice transmuting from uint32_t* to int32_t*
                auto input_tokens_v =
                        std::span(reinterpret_cast<const llama_token *>(input_tokens.data()), input_tokens.size());
-                auto generated_tokens_v =
-                        std::span(reinterpret_cast<llama_token *>(generated_tokens.data()), generated_tokens.size());

-                return backend.generate(
+                return backend.stream(
                        input_tokens_v,
-                        generated_tokens_v,
                        generation_params,
                        sampling_params,
                        context_forwarding_callback
--- a/backends/llamacpp/src/backend.rs
+++ b/backends/llamacpp/src/backend.rs
@ -1,7 +1,6 @@
 use crate::ffi::{
    create_single_worker_backend, GenerationParams, LlamaCppBackendImpl, SamplingParams,
 };
-use crate::OpaqueStream;
 use async_trait::async_trait;
 use cxx::UniquePtr;
 use std::path::{Path, PathBuf};
@ -14,12 +13,13 @@ use text_generation_router::validation::{
 };
 use text_generation_router::{FinishReason, Token};
 use thiserror::Error;
+use tokio::sync::mpsc::error::SendError;
 use tokio::sync::mpsc::{unbounded_channel, UnboundedSender};
 use tokio::time::Instant;
 use tokio_stream::wrappers::UnboundedReceiverStream;
 use tracing::{debug, error, info};

-type BoxedOpaqueStream = Box<OpaqueStream>;
+type InferResult = Result<InferStreamResponse, InferError>;

 unsafe impl Send for LlamaCppBackendImpl {}

@ -45,14 +45,19 @@ impl From<&ValidStoppingParameters> for GenerationParams {
 }

 #[cfg_attr(debug_assertions, derive(Debug))]
-struct InferContext {
-    pub(crate) stream: UnboundedSender<Result<InferStreamResponse, InferError>>,
+struct GenerationContext {
    pub(crate) input_tokens: Arc<Vec<u32>>,
    pub(crate) generated_tokens: Vec<u32>,
    pub(crate) generation_params: GenerationParams,
    pub(crate) sampling_params: SamplingParams,
 }

+pub(crate) struct InferContext {
+    pub(crate) start: Instant,
+    pub(crate) stream: UnboundedSender<InferResult>,
+    pub(crate) generation: GenerationContext,
+}
+
 #[derive(Debug, Error)]
 pub enum LlamaCppBackendError {
    #[error("Provided GGUF model path {0} doesn't exist")]
@ -63,7 +68,7 @@ pub enum LlamaCppBackendError {
 }

 pub struct LlamaCppBackend {
-    backlog: Sender<InferContext>,
+    backlog: Sender<(GenerationContext, UnboundedSender<InferResult>)>,
    scheduler_handle: JoinHandle<()>,
 }

@ -98,81 +103,96 @@ impl LlamaCppBackend {
 }

 fn llama_generate_callback(
-    channel: *mut OpaqueStream,
+    ctx: *mut InferContext,
    new_token_id: u32,
    new_token_logit: f32,
-    is_eos: bool,
+    is_final: bool,
    n_generated_tokens: usize,
 ) {
-    let response = InferStreamResponse::Intermediate {
-        token: Token {
+    info!("Generated token: {new_token_id} -> logits={new_token_logit}, is_final={is_final} ({n_generated_tokens})");
+
+    // Decode token
+    let token = Token {
        id: new_token_id,
        text: "".to_string(),
        logprob: new_token_logit,
        special: false,
-        },
-        top_tokens: vec![],
    };
-    info!("Generated token: {new_token_id} -> logits={new_token_logit}, is_eos={is_eos} ({n_generated_tokens})");

-    unsafe {
-        if let Err(ref err) = (*channel).0.send(Ok(response)) {
-            error!(
-                "Failed to send back token to the client: {}",
-                err.to_string()
-            );
+    let ctx = unsafe { &mut *ctx };
+
+    // Append the new token to the generated ones
+    ctx.generation.generated_tokens.push(new_token_id);
+
+    // Create the streamed response
+    let response = match is_final {
+        false => InferStreamResponse::Intermediate {
+            token,
+            top_tokens: vec![],
+        },
+        true => {
+            // Decode the whole text
+            let text = String::new();
+
+            // Stream end response
+            InferStreamResponse::End {
+                token,
+                top_tokens: vec![],
+                generated_text: GeneratedText {
+                    text,
+                    generated_tokens: n_generated_tokens as u32,
+                    finish_reason: FinishReason::Length,
+                    seed: Some(ctx.generation.sampling_params.seed),
+                },
+                start: ctx.start,
+                queued: ctx.start,
+            }
+        }
    };
+
+    // Send back to the client
+    if let Err(ref err) = ctx.stream.send(Ok(response)) {
+        error!("Failed to send back the response to the client, cancelling request");
+        // TODO: cancel the request
    }
 }

 unsafe fn scheduler_loop(
    mut backend: UniquePtr<LlamaCppBackendImpl>,
-    backlog: Receiver<InferContext>,
+    backlog: Receiver<(GenerationContext, UnboundedSender<InferResult>)>,
 ) {
+    // This loop will mostly decode single token at every step, so no need to rely on parallelism
+    tokenizers::utils::parallelism::set_parallelism(false);
+
    loop {
-        if let Ok(mut ctx) = backlog.recv() {
+        if let Ok((generation, stream)) = backlog.recv() {
            let start = Instant::now();
-            let stream = BoxedOpaqueStream::new(OpaqueStream(ctx.stream));
-            let stream_ptr = Box::into_raw(stream);
-            let result = backend.pin_mut().stream(
-                &ctx.input_tokens,
-                &mut ctx.generated_tokens,
-                ctx.generation_params,
-                &ctx.sampling_params,
-                stream_ptr,
+            let generation_params = generation.generation_params; // copy
+            let sampling_params = generation.sampling_params; // copy
+            let input_tokens = Arc::clone(&generation.input_tokens);
+
+            // Creating the whole InferContext and pushing it to the heap
+            {
+                let ctx = Box::new(InferContext {
+                    start,
+                    stream,
+                    generation,
+                });
+
+                let boxed_ctx = Box::into_raw(ctx);
+
+                if let Err(e) = backend.pin_mut().stream(
+                    &input_tokens,
+                    generation_params,
+                    &sampling_params,
+                    boxed_ctx,
                    llama_generate_callback,
-            );
+                ) {
+                    error!("Error while decoding tokens... {}", e.what());
+                }

                // Make sure we re-keep track of the OpaqueStream box
-            let stream = Box::from_raw(stream_ptr);
-
-            match result {
-                Ok(n_tokens) => {
-                    unsafe {
-                        ctx.generated_tokens.set_len(n_tokens);
-                    }
-
-                    let _ = stream.0.send(Ok(InferStreamResponse::End {
-                        token: Token {
-                            id: ctx.generated_tokens[n_tokens - 1],
-                            text: "".to_string(),
-                            logprob: 0.0,
-                            special: false,
-                        },
-                        top_tokens: vec![],
-                        generated_text: GeneratedText {
-                            text: "".to_string(),
-                            generated_tokens: n_tokens as u32,
-                            finish_reason: FinishReason::Length,
-                            seed: Some(ctx.sampling_params.seed),
-                        },
-                        start,
-                        queued: start,
-                    }));
-
-                    debug!("Generated {n_tokens} tokens -> {:?}", ctx.generated_tokens);
-                }
-                Err(err) => println!("Error: {err}"),
+                let _ = Box::from_raw(boxed_ctx);
            }
        } else {
            info!("IPC channel is closed, exiting the scheduler loop");
@ -186,21 +206,20 @@ impl Backend for LlamaCppBackend {
    fn schedule(
        &self,
        request: ValidGenerateRequest,
-    ) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
+    ) -> Result<UnboundedReceiverStream<InferResult>, InferError> {
        if let Some(input_ids) = request.input_ids {
            let (sx, rx) = unbounded_channel();
            let sampling_params = SamplingParams::from(&request.parameters);
            let generation_params = GenerationParams::from(&request.stopping_parameters);

-            let ctx = InferContext {
-                stream: sx,
+            let ctx = GenerationContext {
                input_tokens: Arc::clone(&input_ids),
                generated_tokens: Vec::with_capacity(generation_params.max_new_tokens as usize),
                generation_params,
                sampling_params,
            };

-            match self.backlog.send(ctx) {
+            match self.backlog.send((ctx, sx)) {
                Ok(_) => Ok(UnboundedReceiverStream::new(rx)),
                Err(_) => Err(InferError::GenerationError(
                    "Failed to sent the request".to_string(),
--- a/backends/llamacpp/src/lib.rs
+++ b/backends/llamacpp/src/lib.rs
@ -1,6 +1,5 @@
+use crate::backend::InferContext;
 use crate::ffi::SamplingParams;
-use text_generation_router::infer::{InferError, InferStreamResponse};
-use tokio::sync::mpsc::UnboundedSender;

 pub mod backend;

@ -16,8 +15,6 @@ impl Default for SamplingParams {
    }
 }

-struct OpaqueStream(UnboundedSender<Result<InferStreamResponse, InferError>>);
-
 #[cxx::bridge(namespace = "huggingface::tgi::backends::llamacpp")]
 mod ffi {
    #[derive(Debug, Copy, Clone)]
@ -36,7 +33,7 @@ mod ffi {
    }

    extern "Rust" {
-        type OpaqueStream;
+        type InferContext;
    }

    unsafe extern "C++" {
@ -66,11 +63,10 @@ mod ffi {
        unsafe fn stream(
            self: Pin<&mut LlamaCppBackendImpl>,
            tokens: &[u32],
-            generated: &mut [u32],
            generation_params: GenerationParams,
            sampling_params: &SamplingParams,
-            stream: *mut OpaqueStream,
-            callback: unsafe fn(*mut OpaqueStream, u32, f32, bool, usize),
+            stream: *mut InferContext,
+            callback: unsafe fn(*mut InferContext, u32, f32, bool, usize),
        ) -> Result<usize>;
    }
 }