Revert "Prefer prefill instead of decode when max_waiting_tokens==0 (#18)" (#45) (#76)

Co-authored-by: mswiniarsk <156412439+mswiniarsk@users.noreply.github.com>
Karol Damaszke 2024-02-27 11:56:45 +01:00 committed by GitHub
parent 83b059bd27
commit 6248c5610e

@@ -310,9 +310,8 @@ async fn batching_task(
                     Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
                 };
-                let mut token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
-                loop {
+                let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
                 // Try to get a new batch
                 if let Some((mut new_entries, new_batch, span)) = queue
                     .next_batch(min_size, max_batch_prefill_tokens, token_budget)
@@ -345,20 +344,9 @@ async fn batching_task(
                     waiting_tokens = 1;
                     // Extend current batch with the new batch
                     if let Some(new_cached_batch) = new_cached_batch {
-                        token_budget = token_budget.saturating_sub(new_cached_batch.max_tokens);
                         entries.extend(new_entries);
                         batches.push(new_cached_batch);
                     }
-                } else {
-                    // Break as there is no batch
-                    break;
-                }
-                // Loop again in case of max_waiting_tokens == 0
-                // to prefer doing next prefill. Break otherwise
-                if max_waiting_tokens != 0 {
-                    break;
-                }
                 }
                 // Create span for this batch to add context to inference calls
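For context, a minimal standalone sketch of the control flow this commit reverts. The function and variable names (extend_with_prefill, pending, batch) are hypothetical and simplified; the real batching_task works with queues, token budgets, and cached batches. The sketch only illustrates the loop semantics removed here: with max_waiting_tokens == 0, the batcher kept pulling new prefill batches until the queue was empty, whereas the reverted behavior attempts a single prefill extension per decode iteration.

// Hypothetical, simplified sketch of the reverted behavior; not the actual router code.
fn extend_with_prefill(pending: &mut Vec<u32>, batch: &mut Vec<u32>, max_waiting_tokens: u32) {
    // Behavior introduced by #18 and removed by this revert:
    // when max_waiting_tokens == 0, keep prefilling until the queue is empty.
    loop {
        match pending.pop() {
            Some(req) => batch.push(req), // prefill one more waiting request
            None => break,                // no batch available -> stop
        }
        // With a non-zero max_waiting_tokens, only one prefill attempt is made
        // per decode step; after the revert this is the behavior in all cases.
        if max_waiting_tokens != 0 {
            break;
        }
    }
}

fn main() {
    let mut pending = vec![1, 2, 3];
    let mut batch = Vec::new();
    extend_with_prefill(&mut pending, &mut batch, 0);
    println!("batched: {:?}, still waiting: {:?}", batch, pending);
}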