mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-20 14:22:08 +00:00
Adding some wiggle room.
This commit is contained in:
parent
9176ecbcea
commit
289b0721c4
@ -1340,7 +1340,9 @@ fn main() -> Result<(), LauncherError> {
|
||||
let value: u32 = if let Some(max_batch_size) = args.max_batch_size {
|
||||
max_batch_size * max_input_tokens
|
||||
} else {
|
||||
max_input_tokens
|
||||
// Adding some margin in order to account for potential block_size alignment
|
||||
// issue.
|
||||
max_input_tokens + 50
|
||||
} as u32;
|
||||
tracing::info!("Default `max_batch_prefill_tokens` to {value}");
|
||||
value
|
||||
|
@ -190,12 +190,14 @@ impl State {
|
||||
token_budget: u32,
|
||||
) -> Option<NextBatch> {
|
||||
if self.entries.is_empty() {
|
||||
tracing::debug!("No queue");
|
||||
return None;
|
||||
}
|
||||
|
||||
// Check if we have enough entries
|
||||
if let Some(min_size) = min_size {
|
||||
if self.entries.len() < min_size {
|
||||
tracing::debug!("Not enough entries");
|
||||
return None;
|
||||
}
|
||||
}
|
||||
@ -218,6 +220,7 @@ impl State {
|
||||
// was dropped by the client)
|
||||
if entry.response_tx.is_closed() {
|
||||
metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
|
||||
tracing::debug!("Dropping entry");
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -254,10 +257,12 @@ impl State {
|
||||
{
|
||||
// Entry is over budget
|
||||
// Add it back to the front
|
||||
tracing::debug!("Over budget: prefill_tokens={prefill_tokens} > {prefill_token_budget} || {prefill_tokens} + {decode_tokens} + {} > {token_budget}", self.speculate);
|
||||
self.entries.push_front((id, entry));
|
||||
break;
|
||||
}
|
||||
|
||||
tracing::debug!("Accepting entry");
|
||||
// Create a new span to link the batch back to this entry
|
||||
let entry_batch_span = info_span!(parent: &entry.span, "infer");
|
||||
// Add relationships
|
||||
@ -288,6 +293,7 @@ impl State {
|
||||
|
||||
// Empty batch
|
||||
if batch_requests.is_empty() {
|
||||
tracing::debug!("Filterered out all entries");
|
||||
return None;
|
||||
}
|
||||
|
||||
|
@ -161,13 +161,13 @@ impl Validation {
|
||||
} else {
|
||||
return Err(ValidationError::UnsetMaxNewTokens);
|
||||
};
|
||||
let input_length = truncate.unwrap_or(self.max_input_length);
|
||||
let mut input_length = truncate.unwrap_or(self.max_input_length);
|
||||
|
||||
// We don't have a tokenizer, therefore we have no idea how long the query is; let
|
||||
// them through and hope for the best.
|
||||
// Validate MaxNewTokens
|
||||
if (input_length as u32 + max_new_tokens) > self.max_total_tokens as u32 {
|
||||
// input_length = input_length.saturating_sub(max_new_tokens as usize);
|
||||
input_length = input_length.saturating_sub(max_new_tokens as usize);
|
||||
// return Err(ValidationError::MaxNewTokens(
|
||||
// self.max_total_tokens - self.max_input_length,
|
||||
// max_new_tokens,
|
||||
|
Loading…
Reference in New Issue
Block a user