diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 405d1d7f8..d0f9e3cf1 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -1340,7 +1340,9 @@ fn main() -> Result<(), LauncherError> { let value: u32 = if let Some(max_batch_size) = args.max_batch_size { max_batch_size * max_input_tokens } else { - max_input_tokens + // Adding some headroom in order to account for potential block_size alignment + // issue. + max_input_tokens + 50 } as u32; tracing::info!("Default `max_batch_prefill_tokens` to {value}"); value diff --git a/router/src/queue.rs b/router/src/queue.rs index 52ea16cae..b8147fb58 100644 --- a/router/src/queue.rs +++ b/router/src/queue.rs @@ -190,12 +190,14 @@ impl State { token_budget: u32, ) -> Option<NextBatch> { if self.entries.is_empty() { + tracing::debug!("No queue"); return None; } // Check if we have enough entries if let Some(min_size) = min_size { if self.entries.len() < min_size { + tracing::debug!("Not enough entries"); return None; } } @@ -218,6 +220,7 @@ // was dropped by the client) if entry.response_tx.is_closed() { metrics::increment_counter!("tgi_request_failure", "err" => "dropped"); + tracing::debug!("Dropping entry"); continue; } @@ -254,10 +257,12 @@ { // Entry is over budget // Add it back to the front + tracing::debug!("Over budget: prefill_tokens={prefill_tokens} > {prefill_token_budget} || {prefill_tokens} + {decode_tokens} + {} > {token_budget}", self.speculate); self.entries.push_front((id, entry)); break; } + tracing::debug!("Accepting entry"); // Create a new span to link the batch back to this entry let entry_batch_span = info_span!(parent: &entry.span, "infer"); // Add relationships @@ -288,6 +293,7 @@ // Empty batch if batch_requests.is_empty() { + tracing::debug!("Filtered out all entries"); return None; } diff --git a/router/src/validation.rs b/router/src/validation.rs index b6dbe1e88..2029c7e0a 100644 @@ 
-161,13 +161,13 @@ impl Validation { } else { return Err(ValidationError::UnsetMaxNewTokens); }; - let input_length = truncate.unwrap_or(self.max_input_length); + let mut input_length = truncate.unwrap_or(self.max_input_length); // We don't have a tokenizer, therefore we have no idea how long is the query, let // them through and hope for the best. // Validate MaxNewTokens if (input_length as u32 + max_new_tokens) > self.max_total_tokens as u32 { - // input_length = input_length.saturating_sub(max_new_tokens as usize); + input_length = input_length.saturating_sub(max_new_tokens as usize); // return Err(ValidationError::MaxNewTokens( // self.max_total_tokens - self.max_input_length, // max_new_tokens,