Mirror of https://github.com/huggingface/text-generation-inference.git

Commit: 289b0721c4 ("Adding some wiggle room.")
Parent: 9176ecbcea
@@ -1340,7 +1340,9 @@ fn main() -> Result<(), LauncherError> {
         let value: u32 = if let Some(max_batch_size) = args.max_batch_size {
             max_batch_size * max_input_tokens
         } else {
-            max_input_tokens
+            // Adding some edge in order to account for potential block_size
+            // alignment issues.
+            max_input_tokens + 50
         } as u32;
         tracing::info!("Default `max_batch_prefill_tokens` to {value}");
         value
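The new default only matters when `--max-batch-size` is unset. A minimal sketch of the defaulting logic above, outside the launcher; the argument values below are invented for illustration:

// Sketch of the `max_batch_prefill_tokens` default with the new wiggle room.
fn default_max_batch_prefill_tokens(max_batch_size: Option<usize>, max_input_tokens: usize) -> u32 {
    if let Some(max_batch_size) = max_batch_size {
        (max_batch_size * max_input_tokens) as u32
    } else {
        // 50 extra tokens of wiggle room for block_size alignment rounding.
        (max_input_tokens + 50) as u32
    }
}

fn main() {
    assert_eq!(default_max_batch_prefill_tokens(None, 4096), 4146);
    assert_eq!(default_max_batch_prefill_tokens(Some(4), 4096), 16384);
}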
@@ -190,12 +190,14 @@ impl State {
         token_budget: u32,
     ) -> Option<NextBatch> {
         if self.entries.is_empty() {
+            tracing::debug!("No queue");
             return None;
         }

         // Check if we have enough entries
         if let Some(min_size) = min_size {
             if self.entries.len() < min_size {
+                tracing::debug!("Not enough entries");
                 return None;
             }
         }
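These `tracing::debug!` calls make every early-return path in batch formation observable. A self-contained sketch of the pattern, using stand-in types rather than the router's real `Entry`/`NextBatch`, and assuming the `tracing` and `tracing-subscriber` crates as dependencies:

// Stand-in types; the router's real queue state is richer.
use std::collections::VecDeque;

struct Entry;

struct State {
    entries: VecDeque<(u64, Entry)>,
}

impl State {
    fn next_batch(&mut self, min_size: Option<usize>) -> Option<Vec<u64>> {
        if self.entries.is_empty() {
            tracing::debug!("No queue");
            return None;
        }
        if let Some(min_size) = min_size {
            if self.entries.len() < min_size {
                tracing::debug!("Not enough entries");
                return None;
            }
        }
        Some(self.entries.iter().map(|(id, _)| *id).collect())
    }
}

fn main() {
    // Install a debug-level subscriber so the new traces actually print.
    tracing_subscriber::fmt()
        .with_max_level(tracing::Level::DEBUG)
        .init();
    let mut state = State { entries: VecDeque::new() };
    assert!(state.next_batch(None).is_none()); // logs "No queue"
}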
@@ -218,6 +220,7 @@ impl State {
             // was dropped by the client)
             if entry.response_tx.is_closed() {
                 metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
+                tracing::debug!("Dropping entry");
                 continue;
             }

@@ -254,10 +257,12 @@ impl State {
                 {
                     // Entry is over budget
                     // Add it back to the front
+                    tracing::debug!("Over budget: prefill_tokens={prefill_tokens} > {prefill_token_budget} || {prefill_tokens} + {decode_tokens} + {} > {token_budget}", self.speculate);
                     self.entries.push_front((id, entry));
                     break;
                 }

+            tracing::debug!("Accepting entry");
             // Create a new span to link the batch back to this entry
             let entry_batch_span = info_span!(parent: &entry.span, "infer");
             // Add relationships
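The over-budget message spells out the exact condition being checked. A hedged restatement of that predicate as a free function, with invented numbers:

// An entry is skipped when prefill alone exceeds the prefill budget, or when
// prefill + decode + speculative tokens exceed the total token budget.
fn over_budget(
    prefill_tokens: u32,
    decode_tokens: u32,
    speculate: u32,
    prefill_token_budget: u32,
    token_budget: u32,
) -> bool {
    prefill_tokens > prefill_token_budget
        || prefill_tokens + decode_tokens + speculate > token_budget
}

fn main() {
    assert!(over_budget(600, 400, 2, 1024, 1000)); // 1002 > 1000: over budget
    assert!(!over_budget(600, 300, 2, 1024, 1000)); // 902 <= 1000: fits
}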
@@ -288,6 +293,7 @@ impl State {

         // Empty batch
         if batch_requests.is_empty() {
+            tracing::debug!("Filtered out all entries");
             return None;
         }

@@ -161,13 +161,13 @@ impl Validation {
         } else {
             return Err(ValidationError::UnsetMaxNewTokens);
         };
-        let input_length = truncate.unwrap_or(self.max_input_length);
+        let mut input_length = truncate.unwrap_or(self.max_input_length);

         // We don't have a tokenizer, therefore we have no idea how long the query is, let
         // them through and hope for the best.
         // Validate MaxNewTokens
         if (input_length as u32 + max_new_tokens) > self.max_total_tokens as u32 {
-            // input_length = input_length.saturating_sub(max_new_tokens as usize);
+            input_length = input_length.saturating_sub(max_new_tokens as usize);
             // return Err(ValidationError::MaxNewTokens(
             //     self.max_total_tokens - self.max_input_length,
             //     max_new_tokens,
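With no tokenizer available, `input_length` is only an upper-bound guess, so the change clamps it instead of rejecting the request outright. A sketch of the new behaviour with invented limits:

// When the guessed input length plus max_new_tokens would exceed
// max_total_tokens, shrink the guess rather than erroring out.
fn clamp_input_length(mut input_length: usize, max_new_tokens: u32, max_total_tokens: usize) -> usize {
    if (input_length as u32 + max_new_tokens) > max_total_tokens as u32 {
        input_length = input_length.saturating_sub(max_new_tokens as usize);
    }
    input_length
}

fn main() {
    // 4096 guessed input tokens + 512 new tokens overflow a 4096 total budget,
    // so the guess is clamped to 3584.
    assert_eq!(clamp_input_length(4096, 512, 4096), 3584);
    // Within budget: unchanged.
    assert_eq!(clamp_input_length(1000, 512, 4096), 1000);
}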