// text-generation-inference/backends/trtllm/src/looper.rs


use std::hint;
use std::ops::Deref;
use std::path::Path;
use async_trait::async_trait;
use cxx::UniquePtr;
use hashbrown::HashMap;
use log::warn;
use tokenizers::{Encoding, Tokenizer};
use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender};
use tokio::task::{spawn_blocking, JoinHandle};
use tokio::time::Instant;
use tokio_stream::wrappers::UnboundedReceiverStream;
use tracing::{debug, error};
use text_generation_router::infer::InferError::{GenerationError, ValidationError};
use text_generation_router::infer::{Backend, InferError, InferStreamResponse};
use text_generation_router::validation::ValidationError::{
EmptyInput, Grammar, TopNTokensDisabled, UnsupportedModality,
};
use text_generation_router::validation::{Chunk, ValidGenerateRequest};
use crate::errors::TensorRtLlmBackendError;
use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl};
use crate::utils::first_line;
type InferResult<T> = Result<T, InferError>;
struct IdentifiableRequest<T> {
request_id: u64,
inner: T,
}
/// Wraps the `ValidGenerateRequest` forwarded by the TGI server together with the tokenized view of the prompt
struct ValidGenerateRequestWithTokens {
encoding: Encoding,
inner: ValidGenerateRequest,
}
/// Wraps the request together with the channel used to stream the decoded tokens back to the client
struct GenerationContext {
request: ValidGenerateRequestWithTokens,
start: Option<Instant>,
queued: Instant,
streamer: UnboundedSender<InferResult<InferStreamResponse>>,
}
#[derive(Debug, Copy, Clone)]
struct DecodedToken {
id: u32,
log_prob: f32,
is_final: bool,
}
impl<'step> TryFrom<&'step GenerationStep> for DecodedToken {
type Error = InferError;
fn try_from(step: &'step GenerationStep) -> Result<Self, Self::Error> {
if !step.has_error {
Ok(Self {
id: step.token_id,
log_prob: step.log_prob,
is_final: step.is_final,
})
} else {
Err(GenerationError(step.error_msg.clone()))
}
}
}
/// Wraps the decoded token together with the channel used to stream it back to the client
struct DecodedTokenContext {
token: DecodedToken,
channel: UnboundedSender<InferResult<InferStreamResponse>>,
}
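
/// Background loop driving the TensorRT-LLM executor: it drains newly submitted requests,
/// hands them to the executor for scheduling, and forwards every decoded step to the
/// post-processor loop through `post_processor_sender`.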
fn executor_status_looper(
mut backend: UniquePtr<TensorRtLlmBackendImpl>,
mut waiting_requests: UnboundedReceiver<GenerationContext>,
post_processor_sender: UnboundedSender<(u64, InferResult<DecodedTokenContext>)>,
) {
// Track the in-flight generation context (request, stream, timings) for each request_id
let mut in_flights = HashMap::<u64, GenerationContext>::with_capacity(128);
// TODO: Does it need a spin-loop?
'scheduler: loop {
// Is there any request pending to be scheduled?
let awaiting_requests = waiting_requests.len();
for _ in 0..awaiting_requests {
// Retrieve all the requests
if let Some(mut ctx) = waiting_requests.blocking_recv() {
// Submit the request to the executor and move the context to the in-flight tracker
let request = &ctx.request;
let generation_params = &request.inner.parameters;
let stopping_params = &request.inner.stopping_parameters;
// Submit to the TensorRT-LLM executor for scheduling
match backend.pin_mut().submit(
request.encoding.get_ids(),
stopping_params.max_new_tokens,
generation_params.top_k as i32,
generation_params.top_p,
generation_params.temperature,
generation_params.repetition_penalty,
generation_params.frequency_penalty,
generation_params.seed,
) {
Ok(request_id) => {
// Insert the context linked to the generated request id in the tracker
debug!("[in-flight] Added {}", request_id);
ctx.start = Some(Instant::now());
in_flights.insert(request_id, ctx);
}
Err(e) => {
// Return to the caller
let what = e.to_string();
error!(error = what.as_str(), "Failed to schedule request");
let err = Err(InferError::SchedulingError(what));
if ctx.streamer.send(err).is_err() {
error!("Failed to send back error to the client");
}
}
};
}
}
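// Pull the tokens the executor has produced since the last iteration, if any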
if backend.num_responses_ready() > 0 {
match backend.pin_mut().pull_tokens() {
Ok(responses) => {
// Iterate through all the decoded tokens
for step in responses.deref() {
if let Some(ctx) = in_flights.get(&step.request_id) {
// Wrap the decoded token (or the error) together with the client stream
let parcel = DecodedToken::try_from(step).map(|dt| DecodedTokenContext {
token: dt,
channel: ctx.streamer.clone(),
});
// Submit the work to the post-processor
let posted = post_processor_sender.send((step.request_id, parcel));
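// Drop the request from the in-flight tracker once generation is finished
// or the post-processor can no longer be reached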
if posted.is_err() || step.is_final {
debug!("Removing {}", step.request_id);
let _ = in_flights.remove(&step.request_id);
}
} else {
warn!("Untracked request {}", step.request_id,);
}
};
}
Err(ref err) => {
error!("Failed to get responses from the executor: {}.", err.what());
break 'scheduler;
}
}
}
// Hint to the CPU that we are busy-waiting in a spin loop
hint::spin_loop();
}
}
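
/// Background loop receiving the decoded steps emitted by the executor loop and
/// accumulating the per-request token ids; decoding with `tokenizer` and streaming the
/// text back to the client is left as a TODO below.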
fn post_processor_looper(
tokenizer: Tokenizer,
mut decoded_tokens: UnboundedReceiver<(u64, InferResult<DecodedTokenContext>)>,
) {
// Per-request accumulation of the generated token ids, kept across loop iterations
let mut states: HashMap<u64, Vec<u32>> = HashMap::with_capacity(128);

'post_processor: loop {
if decoded_tokens.is_closed() {
warn!("Post processor IPC is closed, loop will exit now.");
break 'post_processor;
}

if let Some((request_id, decoded)) = decoded_tokens.blocking_recv() {
if let Ok(ctx) = &decoded {
// Accumulate the token ids generated so far for this request
let state = states.entry(request_id).or_default();
state.push(ctx.token.id);

// TODO: decode the accumulated ids with `tokenizer` and stream the text back
// to the client through `ctx.channel`; errors received here carry no channel yet.
if ctx.token.is_final {
// Generation is complete, drop the per-request state
states.remove(&request_id);
}
}
}
}
}
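
// SAFETY: the FFI backend is created on the caller's thread and then moved, as a whole,
// into the single blocking task running `executor_status_looper`; it is never shared
// across threads, so declaring it `Send` is sound here.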
unsafe impl Send for crate::ffi::TensorRtLlmBackendImpl {}
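
/// TGI `Backend` implementation backed by the TensorRT-LLM executor: validated requests
/// are pushed to the executor loop through `executor`, while decoded tokens are streamed
/// back to clients by the post-processor loop.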
pub struct TensorRtLlmBackendV2 {
tokenizer: Tokenizer,
executor_looper: JoinHandle<()>,
post_processor_looper: JoinHandle<()>,
executor: UnboundedSender<GenerationContext>,
}
impl TensorRtLlmBackendV2 {
pub fn new<P: AsRef<Path> + Send, PP: AsRef<Path> + Send>(
tokenizer: Tokenizer,
engine_folder: P,
executor_worker_path: PP,
) -> Result<Self, TensorRtLlmBackendError> {
// Retrieve paths as &str for the backend creation
let engine_folder = engine_folder.as_ref();
let executor_worker_path = executor_worker_path.as_ref();
let engine_folder = String::from(
engine_folder
.to_str()
.expect("Failed to convert engine_folder to valid UTF-8"),
);
let executor_worker_path = String::from(
executor_worker_path
.to_str()
.expect("Failed to convert executor_worker_path to valid UTF-8"),
);
// Allocate the IPC layer to communicate with the backend
let (executor_sender, executor_receiver) = unbounded_channel();
let (post_processor_sender, post_processor_receiver) = unbounded_channel();
// Create the FFI backend
let backend = create_tensorrt_llm_backend(&engine_folder, &executor_worker_path)
.map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;
// The executor looper is responsible for scheduling requests and polling the executor for new tokens at regular intervals
let executor_looper = spawn_blocking(move || {
executor_status_looper(backend, executor_receiver, post_processor_sender)
});
// The post-processor looper is responsible for receiving the generated tokens, decoding them and streaming them back to the user
let tokenizer_ = tokenizer.clone();
let post_processor_looper =
spawn_blocking(move || post_processor_looper(tokenizer_, post_processor_receiver));
Ok(TensorRtLlmBackendV2 {
tokenizer,
executor_looper,
post_processor_looper,
executor: executor_sender,
})
}
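
/// Ensure the request only uses features supported by the TensorRT-LLM backend
/// (a single text chunk, no grammar, no top-n tokens) and return the prompt text.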
fn validate(request: &ValidGenerateRequest) -> InferResult<&String> {
if request.top_n_tokens > 1 {
return Err(ValidationError(TopNTokensDisabled));
}
// TODO: Is it really needed? How can it be validated before?
if request.parameters.grammar.is_some() {
return Err(ValidationError(Grammar));
}
match request.inputs.len() {
0 => Err(ValidationError(EmptyInput)),
2.. => Err(GenerationError(
"TensorRT-LLM backend don't support multi-chunk".into(),
)),
1 => match request.inputs.first().expect("Single item-chunk") {
Chunk::Text(text) => Ok(text),
Chunk::Image(_) => Err(ValidationError(UnsupportedModality("image"))),
},
}
}
}
#[async_trait]
impl Backend for TensorRtLlmBackendV2 {
fn schedule(
&self,
inner: ValidGenerateRequest,
) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
let prompt = Self::validate(&inner)?;
// We encode the prompt in every request context/thread
let encoding = self
.tokenizer
.encode(prompt.as_str(), true)
.map_err(|e| GenerationError(format!("Tokenization failed {}", e.to_string())))?;
let request = ValidGenerateRequestWithTokens { encoding, inner };
// Open-up the stream to send tokens
let (streamer, receiver) = unbounded_channel::<InferResult<InferStreamResponse>>();
// Send the context to the executor for scheduling
let queued = Instant::now();
match self.executor.send(GenerationContext {
request,
start: None,
queued,
streamer,
}) {
Ok(_) => Ok(UnboundedReceiverStream::new(receiver)),
Err(_) => Err(GenerationError(
"Failed to submit request to the backend".into(),
)),
}
}
async fn health(&self, current_health: bool) -> bool {
current_health
&& !self.executor_looper.is_finished()
&& !self.post_processor_looper.is_finished()
}
}
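
// Minimal usage sketch (not part of the original file): the tokenizer file, the engine
// folder, the executor worker path and `valid_request` below are placeholders.
//
//     let tokenizer = Tokenizer::from_file("tokenizer.json").expect("valid tokenizer");
//     let backend = TensorRtLlmBackendV2::new(tokenizer, "/path/to/engines", "/path/to/executorWorker")?;
//     let stream = backend.schedule(valid_request)?;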