diff --git a/router/src/infer/v2/queue.rs b/router/src/infer/v2/queue.rs
index 0b51645a..9f3636ad 100644
--- a/router/src/infer/v2/queue.rs
+++ b/router/src/infer/v2/queue.rs
@@ -205,6 +205,13 @@ impl State {
             }
         }
 
+        if let Some(max_size) = max_size {
+            if max_size == 0 {
+                tracing::debug!("No capacity");
+                return None;
+            }
+        }
+
         // Pad prefill_token_budget to be a multiple of block size
         let prefill_token_budget = ((prefill_token_budget + self.block_size - 1) / self.block_size)
             * self.block_size;
@@ -297,7 +304,7 @@ impl State {
             batch_entries.insert(id, entry);
 
             // Check if max_size
-            if Some(batch_requests.len()) == max_size {
+            if max_size.is_some_and(|max_size| batch_requests.len() >= max_size) {
                 break;
             }
         }
diff --git a/router/src/infer/v2/scheduler.rs b/router/src/infer/v2/scheduler.rs
index 3d6c36cf..6ad07ea8 100644
--- a/router/src/infer/v2/scheduler.rs
+++ b/router/src/infer/v2/scheduler.rs
@@ -161,7 +161,10 @@ pub(crate) async fn batching_task(
             };
 
             let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
 
-            let max_size = max_batch_size.map(|max_size| max_size - batch_size as usize);
+            let max_size = max_batch_size.map(|max_size| {
+                if batch_size as usize > max_size { 0 } else { max_size - batch_size as usize }
+            });
+
             // Try to get a new batch
             if let Some((mut new_entries, new_batch, span)) = queue
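
A minimal, self-contained sketch of the capacity arithmetic these hunks guard, for reviewers who want to poke at it outside the router. The function name and the `saturating_sub` formulation are illustrative, not code from this repository; `saturating_sub` is equivalent to the patch's explicit branch.

// Illustrative only: mirrors the scheduler's "remaining slots" computation and the
// Option ordering pitfall the queue-side batch-size check has to avoid.
fn remaining_slots(max_batch_size: Option<usize>, batch_size: usize) -> Option<usize> {
    // Clamp at zero instead of underflowing when the running batch already
    // meets or exceeds the configured maximum.
    max_batch_size.map(|max_size| max_size.saturating_sub(batch_size))
}

fn main() {
    assert_eq!(remaining_slots(Some(8), 3), Some(5)); // room for 5 more requests
    assert_eq!(remaining_slots(Some(8), 9), Some(0)); // over the cap: no capacity left
    assert_eq!(remaining_slots(None, 9), None); // no max batch size configured: unbounded

    // Option's derived ordering treats None as smaller than any Some, so a bare
    // `Some(len) >= max_size` would be true whenever max_size is None; the batch
    // check therefore has to unwrap the limit before comparing.
    assert!(Some(1_usize) > None);
}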