Revert "Prefer prefill instead of decode when max_waiting_tokens==0 (#18)" (#45) (#76)

Co-authored-by: mswiniarsk <156412439+mswiniarsk@users.noreply.github.com>
Karol Damaszke 2024-02-27 11:56:45 +01:00 committed by GitHub
parent 83b059bd27
commit 6248c5610e

@@ -310,9 +310,8 @@ async fn batching_task(
                     Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
                 };
-                let mut token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
-                loop {
+                let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
                 // Try to get a new batch
                 if let Some((mut new_entries, new_batch, span)) = queue
                     .next_batch(min_size, max_batch_prefill_tokens, token_budget)
@@ -345,20 +344,9 @@ async fn batching_task(
                     waiting_tokens = 1;
                     // Extend current batch with the new batch
                     if let Some(new_cached_batch) = new_cached_batch {
-                        token_budget = token_budget.saturating_sub(new_cached_batch.max_tokens);
                         entries.extend(new_entries);
                         batches.push(new_cached_batch);
                     }
-                } else {
-                    // Break as there is no batch
-                    break;
-                }
-                // Loop again in case of max_waiting_tokens == 0
-                // to prefer doing next prefill. Break otherwise
-                if max_waiting_tokens != 0 {
-                    break;
-                }
                 }
                 // Create span for this batch to add context to inference calls
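For context, a minimal standalone sketch of the control flow this commit reverts. The function and variable names (extend_with_prefill, pending, batch) are hypothetical and simplified; the real batching_task works with queues, token budgets, and cached batches. The sketch only illustrates the loop semantics removed here: with max_waiting_tokens == 0, the batcher kept pulling new prefill batches until the queue was empty, whereas the reverted behavior attempts a single prefill extension per decode iteration.

// Hypothetical, simplified sketch of the reverted behavior; not the actual router code.
fn extend_with_prefill(pending: &mut Vec<u32>, batch: &mut Vec<u32>, max_waiting_tokens: u32) {
    // Behavior introduced by #18 and removed by this revert:
    // when max_waiting_tokens == 0, keep prefilling until the queue is empty.
    loop {
        match pending.pop() {
            Some(req) => batch.push(req), // prefill one more waiting request
            None => break,                // no batch available -> stop
        }
        // With a non-zero max_waiting_tokens, only one prefill attempt is made
        // per decode step; after the revert this is the behavior in all cases.
        if max_waiting_tokens != 0 {
            break;
        }
    }
}

fn main() {
    let mut pending = vec![1, 2, 3];
    let mut batch = Vec::new();
    extend_with_prefill(&mut pending, &mut batch, 0);
    println!("batched: {:?}, still waiting: {:?}", batch, pending);
}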