Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-28 13:32:10 +00:00)
Commit da0f874d49: Prefer prefill instead of decode when max_waiting_tokens==0 (#18)
Parent: 60f63262db
@@ -310,8 +310,9 @@ async fn batching_task(
                 Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
             };

-            let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
+            let mut token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);

+            loop {
             // Try to get a new batch
             if let Some((mut new_entries, new_batch, span)) = queue
                 .next_batch(min_size, max_batch_prefill_tokens, token_budget)
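The first hunk makes token_budget mutable so that later code can shrink it as new prefill batches are admitted, and it opens a loop around the batch-building step (closed in the next hunk). The budget itself is computed with Rust's saturating subtraction, which clamps at zero instead of underflowing when the running batch already uses more tokens than the configured maximum. A minimal standalone sketch of that accounting, with made-up numbers rather than the router's real configuration:

fn remaining_token_budget(max_batch_total_tokens: u32, batch_max_tokens: u32) -> u32 {
    // saturating_sub clamps at zero instead of wrapping on underflow, so a fully
    // loaded batch simply leaves no budget for additional prefill batches.
    max_batch_total_tokens.saturating_sub(batch_max_tokens)
}

fn main() {
    // Hypothetical values: 16k total token budget, current batch holds 10k tokens.
    assert_eq!(remaining_token_budget(16_384, 10_240), 6_144);
    // If the current batch already exceeds the limit, the budget clamps to 0.
    assert_eq!(remaining_token_budget(16_384, 20_000), 0);
}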
@@ -344,9 +345,20 @@ async fn batching_task(
                     waiting_tokens = 1;
                     // Extend current batch with the new batch
                     if let Some(new_cached_batch) = new_cached_batch {
+                        token_budget = token_budget.saturating_sub(new_cached_batch.max_tokens);
                         entries.extend(new_entries);
                         batches.push(new_cached_batch);
                     }
+                } else {
+                    // Break as there is no batch
+                    break;
+                }
+
+                // Loop again in case of max_waiting_tokens == 0
+                // to prefer doing next prefill. Break otherwise
+                if max_waiting_tokens != 0 {
+                    break;
+                }
             }

             // Create span for this batch to add context to inference calls
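Taken together, the change turns the single "try to get a new batch" step into a loop: as long as max_waiting_tokens == 0 and the token budget allows, the router keeps pulling new prefill batches before returning to decode, while any other max_waiting_tokens value still admits at most one new batch, as before. A simplified, self-contained sketch of that control flow (the CachedBatch struct and next_batch closure below are stand-ins, not the router's real types):

struct CachedBatch {
    max_tokens: u32,
}

// Keep admitting new prefill batches while budget remains when max_waiting_tokens == 0;
// otherwise accept at most one new batch, matching the previous behaviour.
fn extend_with_new_batches(
    mut token_budget: u32,
    max_waiting_tokens: usize,
    mut next_batch: impl FnMut(u32) -> Option<CachedBatch>,
    batches: &mut Vec<CachedBatch>,
) {
    loop {
        // Try to get a new batch that fits into the remaining token budget
        if let Some(new_cached_batch) = next_batch(token_budget) {
            token_budget = token_budget.saturating_sub(new_cached_batch.max_tokens);
            batches.push(new_cached_batch);
        } else {
            // Break as there is no batch
            break;
        }

        // Loop again in case of max_waiting_tokens == 0 to prefer doing the next
        // prefill. Break otherwise
        if max_waiting_tokens != 0 {
            break;
        }
    }
}

fn main() {
    // Hypothetical queue: three waiting batches of 4k tokens each, 10k total budget.
    let mut pending = vec![4_096u32, 4_096, 4_096];
    let mut batches = Vec::new();
    let next = |budget: u32| {
        if pending.first().map_or(false, |t| *t <= budget) {
            Some(CachedBatch { max_tokens: pending.remove(0) })
        } else {
            None
        }
    };
    // With max_waiting_tokens == 0, two batches fit (4k + 4k <= 10k); the third would
    // exceed the remaining budget, so the loop stops.
    extend_with_new_batches(10_240, 0, next, &mut batches);
    assert_eq!(batches.len(), 2);
}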