From d43e10e097653de734d140d832b1487451d02cfc Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 11 Apr 2024 18:30:38 +0000 Subject: [PATCH] Making things work most of the time. --- launcher/src/main.rs | 7 ++++++- router/src/validation.rs | 14 ++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 3f8bd424d..405d1d7f8 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -1290,7 +1290,12 @@ fn main() -> Result<(), LauncherError> { let content = std::fs::read_to_string(filename)?; let config: Config = serde_json::from_str(&content)?; - let max_default = 2usize.pow(14); + // Quantization usually means you're even more RAM constrained. + let max_default = if args.quantize.is_some() { + 4096 + } else { + 2usize.pow(14) + }; let max_position_embeddings = if config.max_position_embeddings > max_default { let max = config.max_position_embeddings; diff --git a/router/src/validation.rs b/router/src/validation.rs index 24bcf1910..ba6f4f6d6 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -163,13 +163,15 @@ impl Validation { }; let input_length = truncate.unwrap_or(self.max_input_length); + // We don't have a tokenizer, therefore we have no idea how long the query is, so let + // it through and hope for the best. // Validate MaxNewTokens - if (input_length as u32 + max_new_tokens) > self.max_total_tokens as u32 { - return Err(ValidationError::MaxNewTokens( - self.max_total_tokens - self.max_input_length, - max_new_tokens, - )); - } + // if (input_length as u32 + max_new_tokens) > self.max_total_tokens as u32 { + // return Err(ValidationError::MaxNewTokens( + // self.max_total_tokens - self.max_input_length, + // max_new_tokens, + // )); + // } Ok((inputs, input_length, max_new_tokens)) }