From 412f605e32b6f3feae463814cf991fba2d53364c Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 12 Feb 2025 08:29:06 +0100 Subject: [PATCH] Preventing single user hugging the server to death by asking for way too many tokens. --- router/src/validation.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/router/src/validation.rs b/router/src/validation.rs index 7ac05b21..3b021b67 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -147,7 +147,13 @@ impl Validation { // Get total tokens let (max_new_tokens, max_total_new_tokens) = if let Some(max_new_tokens) = max_new_tokens { - (max_new_tokens, max_new_tokens) + // Do not accept humongous max_new_tokens queries. + // We preallocate the default, but we prevent a single user + // from taking up all the slots with a handful of queries that consume + // few tokens. (You can have a 10-token-long query that generates a handful of tokens + // while the requested amount is 120k.) + let chunk_size = min(max_new_tokens, DEFAULT_GENERATION_LENGTH); + (chunk_size, max_new_tokens) } else { // Use the maximum possible number of tokens as default // However, the system will re-queue the request everytime it completes