Fixed load test. Bad sanitization on the router meant CUDA OOM.

This commit is contained in:
Nicolas Patry 2024-04-08 14:08:02 +00:00
parent 99771cfad5
commit 39620ce29f
4 changed files with 59188 additions and 3 deletions

View File

@ -13,7 +13,7 @@ def get_chicken():
def flash_llava_next_handle(launcher): def flash_llava_next_handle(launcher):
with launcher( with launcher(
"llava-hf/llava-v1.6-mistral-7b-hf", "llava-hf/llava-v1.6-mistral-7b-hf",
num_shard=1, num_shard=2,
max_input_length=4000, max_input_length=4000,
max_total_tokens=4096, max_total_tokens=4096,
) as handle: ) as handle:

View File

@ -164,7 +164,8 @@ async fn generate(
let start_time = Instant::now(); let start_time = Instant::now();
metrics::increment_counter!("tgi_request_count"); metrics::increment_counter!("tgi_request_count");
tracing::debug!("Input: {}", req.inputs); // Do not log ultra long inputs, like image payloads.
tracing::debug!("Input: {}", &req.inputs[..1000.min(req.inputs.len())]);
let compute_characters = req.inputs.chars().count(); let compute_characters = req.inputs.chars().count();
let mut add_prompt = None; let mut add_prompt = None;

View File

@ -432,7 +432,11 @@ fn prepare_input(
) -> Result<(tokenizers::Encoding, String), ValidationError> { ) -> Result<(tokenizers::Encoding, String), ValidationError> {
let simplified_query = if is_multimodal { let simplified_query = if is_multimodal {
static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap()); static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
RE.replace_all(&inputs, "<image>").into() // HACK: Llava uses arbitrary number of tokens (between 576 and ~576 * 5).
// Idefics uses a sequence encoder which doesn't take as much space in the KV cache, but
// there are still some encoder values.
// This hack just forces more "allocation" for the given image.
RE.replace_all(&inputs, "<image>".repeat(576)).into()
} else { } else {
inputs.clone() inputs.clone()
}; };
@ -443,6 +447,8 @@ fn prepare_input(
// Optionally truncate // Optionally truncate
if let Some(truncate) = truncate { if let Some(truncate) = truncate {
// XXX: Critical to keep the multimodal check, otherwise this modifies the original string,
// which we really don't want.
if truncate < encoding.len() && !is_multimodal { if truncate < encoding.len() && !is_multimodal {
encoding.truncate(truncate, 0, TruncationDirection::Left); encoding.truncate(truncate, 0, TruncationDirection::Left);
inputs = tokenizer inputs = tokenizer