Prevent a single user from hugging the server to death by asking
for way too many tokens.
This commit is contained in:
Nicolas Patry 2025-02-12 08:29:06 +01:00
parent b86c3947ab
commit 412f605e32
No known key found for this signature in database
GPG Key ID: 4242CEF24CB6DBF9

View File

@ -147,7 +147,13 @@ impl Validation {
// Get total tokens // Get total tokens
let (max_new_tokens, max_total_new_tokens) = if let Some(max_new_tokens) = max_new_tokens { let (max_new_tokens, max_total_new_tokens) = if let Some(max_new_tokens) = max_new_tokens {
(max_new_tokens, max_new_tokens) // Do not accept humongous max_new_tokens queries.
// We preallocate the default, but we prevent a single user
// from taking up all the slots with a handful of queries that actually
// consume very few tokens. (You can have a 10-token-long query that only
// generates a handful of tokens, yet requests up to 120k.)
let chunk_size = min(max_new_tokens, DEFAULT_GENERATION_LENGTH);
(chunk_size, max_new_tokens)
} else { } else {
// Use the maximum possible number of tokens as default // Use the maximum possible number of tokens as default
// However, the system will re-queue the request every time it completes // However, the system will re-queue the request every time it completes