Mirror of https://github.com/huggingface/text-generation-inference.git

Commit: 289b0721c4 ("Adding some wiggle room.")
Parent: 9176ecbcea
@@ -1340,7 +1340,9 @@ fn main() -> Result<(), LauncherError> {
         let value: u32 = if let Some(max_batch_size) = args.max_batch_size {
             max_batch_size * max_input_tokens
         } else {
-            max_input_tokens
+            // Adding some edge in order to account for potential block_size
+            // alignment issues.
+            max_input_tokens + 50
         } as u32;
         tracing::info!("Default `max_batch_prefill_tokens` to {value}");
         value
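The new default only matters when `--max-batch-size` is unset. A minimal sketch of the defaulting logic above, outside the launcher; the argument values below are invented for illustration:

// Sketch of the `max_batch_prefill_tokens` default with the new wiggle room.
fn default_max_batch_prefill_tokens(max_batch_size: Option<usize>, max_input_tokens: usize) -> u32 {
    if let Some(max_batch_size) = max_batch_size {
        (max_batch_size * max_input_tokens) as u32
    } else {
        // 50 extra tokens of wiggle room for block_size alignment rounding.
        (max_input_tokens + 50) as u32
    }
}

fn main() {
    assert_eq!(default_max_batch_prefill_tokens(None, 4096), 4146);
    assert_eq!(default_max_batch_prefill_tokens(Some(4), 4096), 16384);
}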
@@ -190,12 +190,14 @@ impl State {
         token_budget: u32,
     ) -> Option<NextBatch> {
         if self.entries.is_empty() {
+            tracing::debug!("No queue");
             return None;
         }

         // Check if we have enough entries
         if let Some(min_size) = min_size {
             if self.entries.len() < min_size {
+                tracing::debug!("Not enough entries");
                 return None;
             }
         }
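These `tracing::debug!` calls make every early-return path in batch formation observable. A self-contained sketch of the pattern, using stand-in types rather than the router's real `Entry`/`NextBatch`, and assuming the `tracing` and `tracing-subscriber` crates as dependencies:

// Stand-in types; the router's real queue state is richer.
use std::collections::VecDeque;

struct Entry;

struct State {
    entries: VecDeque<(u64, Entry)>,
}

impl State {
    fn next_batch(&mut self, min_size: Option<usize>) -> Option<Vec<u64>> {
        if self.entries.is_empty() {
            tracing::debug!("No queue");
            return None;
        }
        if let Some(min_size) = min_size {
            if self.entries.len() < min_size {
                tracing::debug!("Not enough entries");
                return None;
            }
        }
        Some(self.entries.iter().map(|(id, _)| *id).collect())
    }
}

fn main() {
    // Install a debug-level subscriber so the new traces actually print.
    tracing_subscriber::fmt()
        .with_max_level(tracing::Level::DEBUG)
        .init();
    let mut state = State { entries: VecDeque::new() };
    assert!(state.next_batch(None).is_none()); // logs "No queue"
}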
@@ -218,6 +220,7 @@ impl State {
             // was dropped by the client)
             if entry.response_tx.is_closed() {
                 metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
+                tracing::debug!("Dropping entry");
                 continue;
             }

@@ -254,10 +257,12 @@ impl State {
                 {
                     // Entry is over budget
                     // Add it back to the front
+                    tracing::debug!("Over budget: prefill_tokens={prefill_tokens} > {prefill_token_budget} || {prefill_tokens} + {decode_tokens} + {} > {token_budget}", self.speculate);
                     self.entries.push_front((id, entry));
                     break;
                 }

+            tracing::debug!("Accepting entry");
             // Create a new span to link the batch back to this entry
             let entry_batch_span = info_span!(parent: &entry.span, "infer");
             // Add relationships
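The over-budget message spells out the exact condition being checked. A hedged restatement of that predicate as a free function, with invented numbers:

// An entry is skipped when prefill alone exceeds the prefill budget, or when
// prefill + decode + speculative tokens exceed the total token budget.
fn over_budget(
    prefill_tokens: u32,
    decode_tokens: u32,
    speculate: u32,
    prefill_token_budget: u32,
    token_budget: u32,
) -> bool {
    prefill_tokens > prefill_token_budget
        || prefill_tokens + decode_tokens + speculate > token_budget
}

fn main() {
    assert!(over_budget(600, 400, 2, 1024, 1000)); // 1002 > 1000: over budget
    assert!(!over_budget(600, 300, 2, 1024, 1000)); // 902 <= 1000: fits
}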
@@ -288,6 +293,7 @@ impl State {

         // Empty batch
         if batch_requests.is_empty() {
+            tracing::debug!("Filtered out all entries");
             return None;
         }

@@ -161,13 +161,13 @@ impl Validation {
         } else {
             return Err(ValidationError::UnsetMaxNewTokens);
         };
-        let input_length = truncate.unwrap_or(self.max_input_length);
+        let mut input_length = truncate.unwrap_or(self.max_input_length);

         // We don't have a tokenizer, therefore we have no idea how long the query is, let
         // them through and hope for the best.
         // Validate MaxNewTokens
         if (input_length as u32 + max_new_tokens) > self.max_total_tokens as u32 {
-            // input_length = input_length.saturating_sub(max_new_tokens as usize);
+            input_length = input_length.saturating_sub(max_new_tokens as usize);
             // return Err(ValidationError::MaxNewTokens(
             //     self.max_total_tokens - self.max_input_length,
             //     max_new_tokens,
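With no tokenizer available, `input_length` is only an upper-bound guess, so the change clamps it instead of rejecting the request outright. A sketch of the new behaviour with invented limits:

// When the guessed input length plus max_new_tokens would exceed
// max_total_tokens, shrink the guess rather than erroring out.
fn clamp_input_length(mut input_length: usize, max_new_tokens: u32, max_total_tokens: usize) -> usize {
    if (input_length as u32 + max_new_tokens) > max_total_tokens as u32 {
        input_length = input_length.saturating_sub(max_new_tokens as usize);
    }
    input_length
}

fn main() {
    // 4096 guessed input tokens + 512 new tokens overflow a 4096 total budget,
    // so the guess is clamped to 3584.
    assert_eq!(clamp_input_length(4096, 512, 4096), 3584);
    // Within budget: unchanged.
    assert_eq!(clamp_input_length(1000, 512, 4096), 1000);
}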