From 8a211dc7fc278efaf4688c6566b6ba06ae028f39 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 13 Feb 2025 11:23:17 +0100 Subject: [PATCH] Preventing single user hugging the server to death by asking (#3016) for way too many tokens. --- router/src/validation.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/router/src/validation.rs b/router/src/validation.rs index 7ac05b21..3b021b67 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -147,7 +147,13 @@ impl Validation { // Get total tokens let (max_new_tokens, max_total_new_tokens) = if let Some(max_new_tokens) = max_new_tokens { - (max_new_tokens, max_new_tokens) + // Do not accept humongous max_new_tokens queries. + // We preallocate at most the default generation length, which prevents a single user + // from taking up all the slots with a handful of queries that consume only a small + // number of tokens. (You can have a 10-token-long query that generates a handful of tokens + // while the requested amount is 120k.) + let chunk_size = min(max_new_tokens, DEFAULT_GENERATION_LENGTH); + (chunk_size, max_new_tokens) } else { // Use the maximum possible number of tokens as default // However, the system will re-queue the request everytime it completes