Preventing a single user from hugging the server to death by asking for way too many tokens. (#3016)
This commit is contained in:
Nicolas Patry 2025-02-13 11:23:17 +01:00 committed by GitHub
parent 4cccce4b44
commit 8a211dc7fc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -147,7 +147,13 @@ impl Validation {
// Get total tokens
let (max_new_tokens, max_total_new_tokens) = if let Some(max_new_tokens) = max_new_tokens {
(max_new_tokens, max_new_tokens)
// Do not accept humongous max_new_tokens queries.
// We preallocate the default, but we prevent a single user
// from taking up all the slots with a handful of queries that consume only a
// small number of tokens. (You can have a 10-token-long query that generates
// only a handful of tokens, even though the requested amount is 120k.)
let chunk_size = min(max_new_tokens, DEFAULT_GENERATION_LENGTH);
(chunk_size, max_new_tokens)
} else {
// Use the maximum possible number of tokens as default
// However, the system will re-queue the request everytime it completes