Fixed load test. Bad sanitization on the router meant CUDA OOM.

Nicolas Patry 2024-04-08 14:08:02 +00:00
parent 99771cfad5
commit 39620ce29f
4 changed files with 59188 additions and 3 deletions
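In rough numbers (taken from the comment in the validation hunk below): Llava-NeXT spends between 576 and roughly 576 * 5 = 2880 tokens per image, while the router previously counted a single <image> placeholder per image. Under load the scheduler therefore budgeted far fewer KV-cache blocks than the model actually consumed, which is where the CUDA out-of-memory errors came from.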

View File

@@ -13,7 +13,7 @@ def get_chicken():
 def flash_llava_next_handle(launcher):
     with launcher(
         "llava-hf/llava-v1.6-mistral-7b-hf",
-        num_shard=1,
+        num_shard=2,
         max_input_length=4000,
         max_total_tokens=4096,
     ) as handle:

View File

@@ -164,7 +164,8 @@ async fn generate(
     let start_time = Instant::now();
     metrics::increment_counter!("tgi_request_count");
-    tracing::debug!("Input: {}", req.inputs);
+    // Do not log ultra long inputs, like image payloads.
+    tracing::debug!("Input: {}", &req.inputs[..1000.min(req.inputs.len())]);
     let compute_characters = req.inputs.chars().count();
     let mut add_prompt = None;
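A side note on the logging hunk above: &req.inputs[..1000.min(req.inputs.len())] slices by byte offset, and Rust string slicing panics if the offset lands inside a multi-byte UTF-8 character. A minimal boundary-safe sketch, not part of the commit (the helper name and the 1000-byte cap are illustrative):

// Return a prefix of `s` at most `max_bytes` long, ending on a UTF-8
// character boundary so that slicing can never panic.
fn log_prefix(s: &str, max_bytes: usize) -> &str {
    if s.len() <= max_bytes {
        return s;
    }
    let mut end = max_bytes;
    // Walk back at most 3 bytes until we land on a char boundary.
    while !s.is_char_boundary(end) {
        end -= 1;
    }
    &s[..end]
}

fn main() {
    let inputs = "describe this image 🐔 ".repeat(100);
    // Stand-in for the router's tracing::debug! call.
    println!("Input: {}", log_prefix(&inputs, 1000));
}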

View File

@@ -432,7 +432,11 @@ fn prepare_input(
 ) -> Result<(tokenizers::Encoding, String), ValidationError> {
     let simplified_query = if is_multimodal {
         static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
-        RE.replace_all(&inputs, "<image>").into()
+        // HACK: Llava uses an arbitrary number of tokens per image (between 576 and ~576 * 5).
+        // Idefics uses a sequence encoder which doesn't take as much space in the KV cache, but
+        // there are still some encoder values.
+        // This hack just forces more "allocation" for the given image.
+        RE.replace_all(&inputs, "<image>".repeat(576)).into()
     } else {
         inputs.clone()
     };
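To make the new replacement concrete, here is a small standalone sketch. It assumes the regex and once_cell crates already used in the hunk above; the helper name expand_image_placeholders and the sample prompt are made up for illustration. Each markdown image reference ![](...) is rewritten into 576 <image> placeholders, so downstream tokenization and KV-cache budgeting see a count much closer to what the model will really consume:

use once_cell::sync::Lazy;
use regex::Regex;

// Same pattern as the router uses above: a markdown image reference `![](...)`.
static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());

// Hypothetical helper mirroring the multimodal branch of prepare_input.
fn expand_image_placeholders(inputs: &str) -> String {
    // Budget 576 `<image>` tokens per image instead of a single placeholder.
    RE.replace_all(inputs, "<image>".repeat(576)).into_owned()
}

fn main() {
    let prompt = "What is in this picture? ![](data:image/png;base64,AAAA)";
    let expanded = expand_image_placeholders(prompt);
    // The image payload is now 576 `<image>` tokens in the simplified query.
    assert_eq!(expanded.matches("<image>").count(), 576);
    println!("expanded length: {} chars", expanded.chars().count());
}

Note that 576 is the lower bound from the comment above; an image can still cost up to roughly five times that, so this is a coarse budget rather than an exact accounting.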
@@ -443,6 +447,8 @@ fn prepare_input(
     // Optionally truncate
     if let Some(truncate) = truncate {
+        // XXX: Critical to keep the multimodal check, otherwise this modifies the original string,
+        // which we really don't want.
         if truncate < encoding.len() && !is_multimodal {
             encoding.truncate(truncate, 0, TruncationDirection::Left);
             inputs = tokenizer