mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 12:24:53 +00:00
feat(router): add max_batch_size
This commit is contained in:
parent
39af000cb9
commit
faaa9dfe0a
38
Cargo.lock
generated
38
Cargo.lock
generated
@ -2787,7 +2787,7 @@ dependencies = [
|
|||||||
"tabled",
|
"tabled",
|
||||||
"text-generation-client",
|
"text-generation-client",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"tokenizers",
|
"tokenizers 0.14.1",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
@ -2850,7 +2850,7 @@ dependencies = [
|
|||||||
"serde_json",
|
"serde_json",
|
||||||
"text-generation-client",
|
"text-generation-client",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"tokenizers",
|
"tokenizers 0.15.1",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-stream",
|
"tokio-stream",
|
||||||
"tower-http",
|
"tower-http",
|
||||||
@ -2972,6 +2972,40 @@ dependencies = [
|
|||||||
"unicode_categories",
|
"unicode_categories",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tokenizers"
|
||||||
|
version = "0.15.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6db445cceba5dfeb0f9702be7d6bfd91801ddcbe8fe8722defe7f2e96da75812"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"clap",
|
||||||
|
"derive_builder",
|
||||||
|
"esaxx-rs",
|
||||||
|
"getrandom",
|
||||||
|
"hf-hub",
|
||||||
|
"indicatif",
|
||||||
|
"itertools 0.11.0",
|
||||||
|
"lazy_static",
|
||||||
|
"log",
|
||||||
|
"macro_rules_attribute",
|
||||||
|
"monostate",
|
||||||
|
"onig",
|
||||||
|
"paste",
|
||||||
|
"rand",
|
||||||
|
"rayon",
|
||||||
|
"rayon-cond",
|
||||||
|
"regex",
|
||||||
|
"regex-syntax 0.7.5",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
"spm_precompiled",
|
||||||
|
"thiserror",
|
||||||
|
"unicode-normalization-alignments",
|
||||||
|
"unicode-segmentation",
|
||||||
|
"unicode_categories",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tokio"
|
name = "tokio"
|
||||||
version = "1.35.1"
|
version = "1.35.1"
|
||||||
|
@ -279,6 +279,11 @@ struct Args {
|
|||||||
#[clap(default_value = "20", long, env)]
|
#[clap(default_value = "20", long, env)]
|
||||||
max_waiting_tokens: usize,
|
max_waiting_tokens: usize,
|
||||||
|
|
||||||
|
/// Enforce a maximum number of requests per batch
|
||||||
|
/// Specific flag for hardware targets that do not support unpadded inference
|
||||||
|
#[clap(long, env)]
|
||||||
|
max_batch_size: Option<usize>,
|
||||||
|
|
||||||
/// The IP address to listen on
|
/// The IP address to listen on
|
||||||
#[clap(default_value = "0.0.0.0", long, env)]
|
#[clap(default_value = "0.0.0.0", long, env)]
|
||||||
hostname: String,
|
hostname: String,
|
||||||
@ -1046,6 +1051,12 @@ fn spawn_webserver(
|
|||||||
router_args.push(max_batch_total_tokens.to_string());
|
router_args.push(max_batch_total_tokens.to_string());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Router optional max batch size
|
||||||
|
if let Some(max_batch_size) = args.max_batch_size {
|
||||||
|
router_args.push("--max-batch-size".to_string());
|
||||||
|
router_args.push(max_batch_size.to_string());
|
||||||
|
}
|
||||||
|
|
||||||
// Model optional revision
|
// Model optional revision
|
||||||
if let Some(ref revision) = args.revision {
|
if let Some(ref revision) = args.revision {
|
||||||
router_args.push("--revision".to_string());
|
router_args.push("--revision".to_string());
|
||||||
|
@ -105,6 +105,7 @@ impl Client {
|
|||||||
max_input_length: u32,
|
max_input_length: u32,
|
||||||
max_prefill_tokens: u32,
|
max_prefill_tokens: u32,
|
||||||
max_total_tokens: u32,
|
max_total_tokens: u32,
|
||||||
|
max_batch_size: Option<usize>,
|
||||||
) -> Result<Option<u32>> {
|
) -> Result<Option<u32>> {
|
||||||
let mut n_tokens = 0;
|
let mut n_tokens = 0;
|
||||||
let mut requests = Vec::new();
|
let mut requests = Vec::new();
|
||||||
@ -136,6 +137,11 @@ impl Client {
|
|||||||
top_n_tokens: 20,
|
top_n_tokens: 20,
|
||||||
});
|
});
|
||||||
n_tokens += max_input_length;
|
n_tokens += max_input_length;
|
||||||
|
|
||||||
|
// Check max_batch_size
|
||||||
|
if Some(requests.len()) == max_batch_size {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let batch = Batch {
|
let batch = Batch {
|
||||||
|
@ -97,12 +97,18 @@ impl ShardedClient {
|
|||||||
max_input_length: u32,
|
max_input_length: u32,
|
||||||
max_prefill_tokens: u32,
|
max_prefill_tokens: u32,
|
||||||
max_total_tokens: u32,
|
max_total_tokens: u32,
|
||||||
|
max_batch_size: Option<usize>,
|
||||||
) -> Result<Option<u32>> {
|
) -> Result<Option<u32>> {
|
||||||
let futures: Vec<_> = self
|
let futures: Vec<_> = self
|
||||||
.clients
|
.clients
|
||||||
.iter_mut()
|
.iter_mut()
|
||||||
.map(|client| {
|
.map(|client| {
|
||||||
Box::pin(client.warmup(max_input_length, max_prefill_tokens, max_total_tokens))
|
Box::pin(client.warmup(
|
||||||
|
max_input_length,
|
||||||
|
max_prefill_tokens,
|
||||||
|
max_total_tokens,
|
||||||
|
max_batch_size,
|
||||||
|
))
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
// Take the minimum value
|
// Take the minimum value
|
||||||
|
@ -61,6 +61,7 @@ impl Infer {
|
|||||||
max_batch_prefill_tokens: u32,
|
max_batch_prefill_tokens: u32,
|
||||||
max_batch_total_tokens: u32,
|
max_batch_total_tokens: u32,
|
||||||
max_waiting_tokens: usize,
|
max_waiting_tokens: usize,
|
||||||
|
max_batch_size: Option<usize>,
|
||||||
max_concurrent_requests: usize,
|
max_concurrent_requests: usize,
|
||||||
requires_padding: bool,
|
requires_padding: bool,
|
||||||
window_size: Option<u32>,
|
window_size: Option<u32>,
|
||||||
@ -69,7 +70,7 @@ impl Infer {
|
|||||||
tokenizer_config: HubTokenizerConfig,
|
tokenizer_config: HubTokenizerConfig,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
// Infer shared state
|
// Infer shared state
|
||||||
let queue = Queue::new(requires_padding, 16, window_size, speculate);
|
let queue = Queue::new(requires_padding, max_batch_size, 16, window_size, speculate);
|
||||||
let shared = Arc::new(Shared {
|
let shared = Arc::new(Shared {
|
||||||
batching_task: Notify::new(),
|
batching_task: Notify::new(),
|
||||||
});
|
});
|
||||||
|
@ -73,6 +73,8 @@ pub struct Info {
|
|||||||
pub max_batch_total_tokens: u32,
|
pub max_batch_total_tokens: u32,
|
||||||
#[schema(example = "20")]
|
#[schema(example = "20")]
|
||||||
pub max_waiting_tokens: usize,
|
pub max_waiting_tokens: usize,
|
||||||
|
#[schema(nullable = true, example = "null")]
|
||||||
|
pub max_batch_size: Option<usize>,
|
||||||
#[schema(example = "2")]
|
#[schema(example = "2")]
|
||||||
pub validation_workers: usize,
|
pub validation_workers: usize,
|
||||||
/// Router Info
|
/// Router Info
|
||||||
|
@ -45,6 +45,8 @@ struct Args {
|
|||||||
max_batch_total_tokens: Option<u32>,
|
max_batch_total_tokens: Option<u32>,
|
||||||
#[clap(default_value = "20", long, env)]
|
#[clap(default_value = "20", long, env)]
|
||||||
max_waiting_tokens: usize,
|
max_waiting_tokens: usize,
|
||||||
|
#[clap(long, env)]
|
||||||
|
max_batch_size: Option<usize>,
|
||||||
#[clap(default_value = "0.0.0.0", long, env)]
|
#[clap(default_value = "0.0.0.0", long, env)]
|
||||||
hostname: String,
|
hostname: String,
|
||||||
#[clap(default_value = "3000", long, short, env)]
|
#[clap(default_value = "3000", long, short, env)]
|
||||||
@ -91,6 +93,7 @@ async fn main() -> Result<(), RouterError> {
|
|||||||
max_batch_prefill_tokens,
|
max_batch_prefill_tokens,
|
||||||
max_batch_total_tokens,
|
max_batch_total_tokens,
|
||||||
max_waiting_tokens,
|
max_waiting_tokens,
|
||||||
|
max_batch_size,
|
||||||
hostname,
|
hostname,
|
||||||
port,
|
port,
|
||||||
master_shard_uds_path,
|
master_shard_uds_path,
|
||||||
@ -288,6 +291,7 @@ async fn main() -> Result<(), RouterError> {
|
|||||||
max_input_length as u32,
|
max_input_length as u32,
|
||||||
max_batch_prefill_tokens,
|
max_batch_prefill_tokens,
|
||||||
max_total_tokens as u32,
|
max_total_tokens as u32,
|
||||||
|
max_batch_size,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
.map_err(RouterError::Warmup)?
|
.map_err(RouterError::Warmup)?
|
||||||
@ -344,6 +348,7 @@ async fn main() -> Result<(), RouterError> {
|
|||||||
max_batch_prefill_tokens,
|
max_batch_prefill_tokens,
|
||||||
max_supported_batch_total_tokens,
|
max_supported_batch_total_tokens,
|
||||||
max_waiting_tokens,
|
max_waiting_tokens,
|
||||||
|
max_batch_size,
|
||||||
sharded_client,
|
sharded_client,
|
||||||
tokenizer,
|
tokenizer,
|
||||||
validation_workers,
|
validation_workers,
|
||||||
|
@ -36,6 +36,7 @@ pub(crate) struct Queue {
|
|||||||
impl Queue {
|
impl Queue {
|
||||||
pub(crate) fn new(
|
pub(crate) fn new(
|
||||||
requires_padding: bool,
|
requires_padding: bool,
|
||||||
|
max_batch_size: Option<usize>,
|
||||||
block_size: u32,
|
block_size: u32,
|
||||||
window_size: Option<u32>,
|
window_size: Option<u32>,
|
||||||
speculate: u32,
|
speculate: u32,
|
||||||
@ -46,6 +47,7 @@ impl Queue {
|
|||||||
// Launch background queue task
|
// Launch background queue task
|
||||||
tokio::spawn(queue_task(
|
tokio::spawn(queue_task(
|
||||||
requires_padding,
|
requires_padding,
|
||||||
|
max_batch_size,
|
||||||
block_size,
|
block_size,
|
||||||
window_size,
|
window_size,
|
||||||
speculate,
|
speculate,
|
||||||
@ -95,6 +97,7 @@ impl Queue {
|
|||||||
// Background task responsible of the queue state
|
// Background task responsible of the queue state
|
||||||
async fn queue_task(
|
async fn queue_task(
|
||||||
requires_padding: bool,
|
requires_padding: bool,
|
||||||
|
max_size: Option<usize>,
|
||||||
block_size: u32,
|
block_size: u32,
|
||||||
window_size: Option<u32>,
|
window_size: Option<u32>,
|
||||||
speculate: u32,
|
speculate: u32,
|
||||||
@ -115,7 +118,8 @@ async fn queue_task(
|
|||||||
response_sender,
|
response_sender,
|
||||||
span,
|
span,
|
||||||
} => span.in_scope(|| {
|
} => span.in_scope(|| {
|
||||||
let next_batch = state.next_batch(min_size, prefill_token_budget, token_budget);
|
let next_batch =
|
||||||
|
state.next_batch(min_size, max_size, prefill_token_budget, token_budget);
|
||||||
response_sender.send(next_batch).unwrap();
|
response_sender.send(next_batch).unwrap();
|
||||||
metrics::gauge!("tgi_queue_size", state.entries.len() as f64);
|
metrics::gauge!("tgi_queue_size", state.entries.len() as f64);
|
||||||
}),
|
}),
|
||||||
@ -181,6 +185,7 @@ impl State {
|
|||||||
fn next_batch(
|
fn next_batch(
|
||||||
&mut self,
|
&mut self,
|
||||||
min_size: Option<usize>,
|
min_size: Option<usize>,
|
||||||
|
max_size: Option<usize>,
|
||||||
prefill_token_budget: u32,
|
prefill_token_budget: u32,
|
||||||
token_budget: u32,
|
token_budget: u32,
|
||||||
) -> Option<NextBatch> {
|
) -> Option<NextBatch> {
|
||||||
@ -274,6 +279,11 @@ impl State {
|
|||||||
entry.batch_time = Some(Instant::now());
|
entry.batch_time = Some(Instant::now());
|
||||||
// Insert in batch_entries IntMap
|
// Insert in batch_entries IntMap
|
||||||
batch_entries.insert(id, entry);
|
batch_entries.insert(id, entry);
|
||||||
|
|
||||||
|
// Check if max_size
|
||||||
|
if Some(batch_requests.len()) == max_size {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Empty batch
|
// Empty batch
|
||||||
@ -393,8 +403,8 @@ mod tests {
|
|||||||
fn test_next_batch_empty() {
|
fn test_next_batch_empty() {
|
||||||
let mut state = State::new(false, 1, None, 0);
|
let mut state = State::new(false, 1, None, 0);
|
||||||
|
|
||||||
assert!(state.next_batch(None, 1, 1).is_none());
|
assert!(state.next_batch(None, None, 1, 1).is_none());
|
||||||
assert!(state.next_batch(Some(1), 1, 1).is_none());
|
assert!(state.next_batch(Some(1), None, 1, 1).is_none());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@ -405,7 +415,7 @@ mod tests {
|
|||||||
state.append(entry1);
|
state.append(entry1);
|
||||||
state.append(entry2);
|
state.append(entry2);
|
||||||
|
|
||||||
let (entries, batch, _) = state.next_batch(None, 2, 2).unwrap();
|
let (entries, batch, _) = state.next_batch(None, None, 2, 2).unwrap();
|
||||||
assert_eq!(entries.len(), 2);
|
assert_eq!(entries.len(), 2);
|
||||||
assert!(entries.contains_key(&0));
|
assert!(entries.contains_key(&0));
|
||||||
assert!(entries.contains_key(&1));
|
assert!(entries.contains_key(&1));
|
||||||
@ -421,7 +431,7 @@ mod tests {
|
|||||||
let (entry3, _guard3) = default_entry();
|
let (entry3, _guard3) = default_entry();
|
||||||
state.append(entry3);
|
state.append(entry3);
|
||||||
|
|
||||||
assert!(state.next_batch(Some(2), 2, 2).is_none());
|
assert!(state.next_batch(Some(2), None, 2, 2).is_none());
|
||||||
|
|
||||||
assert_eq!(state.next_id, 3);
|
assert_eq!(state.next_id, 3);
|
||||||
assert_eq!(state.entries.len(), 1);
|
assert_eq!(state.entries.len(), 1);
|
||||||
@ -429,6 +439,26 @@ mod tests {
|
|||||||
assert_eq!(id, 2);
|
assert_eq!(id, 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_next_batch_max_size() {
|
||||||
|
let mut state = State::new(false, 1, None, 0);
|
||||||
|
let (entry1, _guard1) = default_entry();
|
||||||
|
let (entry2, _guard2) = default_entry();
|
||||||
|
state.append(entry1);
|
||||||
|
state.append(entry2);
|
||||||
|
|
||||||
|
let (entries, batch, _) = state.next_batch(None, Some(1), 2, 2).unwrap();
|
||||||
|
assert_eq!(entries.len(), 1);
|
||||||
|
assert!(entries.contains_key(&0));
|
||||||
|
assert!(entries.get(&0).unwrap().batch_time.is_some());
|
||||||
|
assert_eq!(batch.id, 0);
|
||||||
|
assert_eq!(batch.size, 1);
|
||||||
|
|
||||||
|
assert_eq!(state.next_id, 2);
|
||||||
|
assert_eq!(state.entries.len(), 1);
|
||||||
|
assert_eq!(state.next_batch_id, 1);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_next_batch_token_budget() {
|
fn test_next_batch_token_budget() {
|
||||||
let mut state = State::new(false, 1, None, 0);
|
let mut state = State::new(false, 1, None, 0);
|
||||||
@ -437,7 +467,7 @@ mod tests {
|
|||||||
state.append(entry1);
|
state.append(entry1);
|
||||||
state.append(entry2);
|
state.append(entry2);
|
||||||
|
|
||||||
let (entries, batch, _) = state.next_batch(None, 1, 1).unwrap();
|
let (entries, batch, _) = state.next_batch(None, None, 1, 1).unwrap();
|
||||||
assert_eq!(entries.len(), 1);
|
assert_eq!(entries.len(), 1);
|
||||||
assert!(entries.contains_key(&0));
|
assert!(entries.contains_key(&0));
|
||||||
assert_eq!(batch.id, 0);
|
assert_eq!(batch.id, 0);
|
||||||
@ -450,7 +480,7 @@ mod tests {
|
|||||||
let (entry3, _guard3) = default_entry();
|
let (entry3, _guard3) = default_entry();
|
||||||
state.append(entry3);
|
state.append(entry3);
|
||||||
|
|
||||||
let (entries, batch, _) = state.next_batch(None, 3, 3).unwrap();
|
let (entries, batch, _) = state.next_batch(None, None, 3, 3).unwrap();
|
||||||
assert_eq!(entries.len(), 2);
|
assert_eq!(entries.len(), 2);
|
||||||
assert!(entries.contains_key(&1));
|
assert!(entries.contains_key(&1));
|
||||||
assert!(entries.contains_key(&2));
|
assert!(entries.contains_key(&2));
|
||||||
@ -464,14 +494,14 @@ mod tests {
|
|||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_queue_append() {
|
async fn test_queue_append() {
|
||||||
let queue = Queue::new(false, 1, None, 0);
|
let queue = Queue::new(false, None, 1, None, 0);
|
||||||
let (entry, _guard) = default_entry();
|
let (entry, _guard) = default_entry();
|
||||||
queue.append(entry);
|
queue.append(entry);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_queue_next_batch_empty() {
|
async fn test_queue_next_batch_empty() {
|
||||||
let queue = Queue::new(false, 1, None, 0);
|
let queue = Queue::new(false, None, 1, None, 0);
|
||||||
|
|
||||||
assert!(queue.next_batch(None, 1, 1).await.is_none());
|
assert!(queue.next_batch(None, 1, 1).await.is_none());
|
||||||
assert!(queue.next_batch(Some(1), 1, 1).await.is_none());
|
assert!(queue.next_batch(Some(1), 1, 1).await.is_none());
|
||||||
@ -479,7 +509,7 @@ mod tests {
|
|||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_queue_next_batch_min_size() {
|
async fn test_queue_next_batch_min_size() {
|
||||||
let queue = Queue::new(false, 1, None, 0);
|
let queue = Queue::new(false, None, 1, None, 0);
|
||||||
let (entry1, _guard1) = default_entry();
|
let (entry1, _guard1) = default_entry();
|
||||||
let (entry2, _guard2) = default_entry();
|
let (entry2, _guard2) = default_entry();
|
||||||
queue.append(entry1);
|
queue.append(entry1);
|
||||||
@ -510,9 +540,25 @@ mod tests {
|
|||||||
assert_eq!(batch2.size, 1);
|
assert_eq!(batch2.size, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_queue_next_batch_max_size() {
|
||||||
|
let queue = Queue::new(false, Some(1), 1, None, 0);
|
||||||
|
let (entry1, _guard1) = default_entry();
|
||||||
|
let (entry2, _guard2) = default_entry();
|
||||||
|
queue.append(entry1);
|
||||||
|
queue.append(entry2);
|
||||||
|
|
||||||
|
let (entries, batch, _) = queue.next_batch(None, 2, 2).await.unwrap();
|
||||||
|
assert_eq!(entries.len(), 1);
|
||||||
|
assert!(entries.contains_key(&0));
|
||||||
|
assert!(entries.get(&0).unwrap().batch_time.is_some());
|
||||||
|
assert_eq!(batch.id, 0);
|
||||||
|
assert_eq!(batch.size, 1);
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_queue_next_batch_token_budget() {
|
async fn test_queue_next_batch_token_budget() {
|
||||||
let queue = Queue::new(false, 1, None, 0);
|
let queue = Queue::new(false, None, 1, None, 0);
|
||||||
let (entry1, _guard1) = default_entry();
|
let (entry1, _guard1) = default_entry();
|
||||||
let (entry2, _guard2) = default_entry();
|
let (entry2, _guard2) = default_entry();
|
||||||
queue.append(entry1);
|
queue.append(entry1);
|
||||||
@ -537,7 +583,7 @@ mod tests {
|
|||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_queue_next_batch_token_speculate() {
|
async fn test_queue_next_batch_token_speculate() {
|
||||||
let queue = Queue::new(false, 1, None, 2);
|
let queue = Queue::new(false, None, 1, None, 2);
|
||||||
let (entry1, _guard1) = default_entry();
|
let (entry1, _guard1) = default_entry();
|
||||||
let (entry2, _guard2) = default_entry();
|
let (entry2, _guard2) = default_entry();
|
||||||
queue.append(entry1);
|
queue.append(entry1);
|
||||||
@ -556,7 +602,7 @@ mod tests {
|
|||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_queue_next_batch_dropped_receiver() {
|
async fn test_queue_next_batch_dropped_receiver() {
|
||||||
let queue = Queue::new(false, 1, None, 0);
|
let queue = Queue::new(false, None, 1, None, 0);
|
||||||
let (entry, _) = default_entry();
|
let (entry, _) = default_entry();
|
||||||
queue.append(entry);
|
queue.append(entry);
|
||||||
|
|
||||||
|
@ -762,6 +762,7 @@ pub async fn run(
|
|||||||
max_batch_prefill_tokens: u32,
|
max_batch_prefill_tokens: u32,
|
||||||
max_batch_total_tokens: u32,
|
max_batch_total_tokens: u32,
|
||||||
max_waiting_tokens: usize,
|
max_waiting_tokens: usize,
|
||||||
|
max_batch_size: Option<usize>,
|
||||||
client: ShardedClient,
|
client: ShardedClient,
|
||||||
tokenizer: Option<Tokenizer>,
|
tokenizer: Option<Tokenizer>,
|
||||||
validation_workers: usize,
|
validation_workers: usize,
|
||||||
@ -843,6 +844,7 @@ pub async fn run(
|
|||||||
max_batch_prefill_tokens,
|
max_batch_prefill_tokens,
|
||||||
max_batch_total_tokens,
|
max_batch_total_tokens,
|
||||||
max_waiting_tokens,
|
max_waiting_tokens,
|
||||||
|
max_batch_size,
|
||||||
max_concurrent_requests,
|
max_concurrent_requests,
|
||||||
shard_info.requires_padding,
|
shard_info.requires_padding,
|
||||||
shard_info.window_size,
|
shard_info.window_size,
|
||||||
@ -924,6 +926,7 @@ pub async fn run(
|
|||||||
waiting_served_ratio,
|
waiting_served_ratio,
|
||||||
max_batch_total_tokens,
|
max_batch_total_tokens,
|
||||||
max_waiting_tokens,
|
max_waiting_tokens,
|
||||||
|
max_batch_size,
|
||||||
validation_workers,
|
validation_workers,
|
||||||
version: env!("CARGO_PKG_VERSION"),
|
version: env!("CARGO_PKG_VERSION"),
|
||||||
sha: option_env!("VERGEN_GIT_SHA"),
|
sha: option_env!("VERGEN_GIT_SHA"),
|
||||||
|
Loading…
Reference in New Issue
Block a user