Fixed load test. Bad sanitization on the router meant CUDA OOM.

Nicolas Patry 2024-04-08 14:08:02 +00:00
parent 99771cfad5
commit 39620ce29f
4 changed files with 59188 additions and 3 deletions
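In rough numbers (taken from the comment in the validation hunk below): Llava-NeXT spends between 576 and roughly 576 * 5 = 2880 tokens per image, while the router previously counted a single <image> placeholder per image. Under load the scheduler therefore budgeted far fewer KV-cache blocks than the model actually consumed, which is where the CUDA out-of-memory errors came from.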

View File

@@ -13,7 +13,7 @@ def get_chicken():
 def flash_llava_next_handle(launcher):
     with launcher(
         "llava-hf/llava-v1.6-mistral-7b-hf",
-        num_shard=1,
+        num_shard=2,
         max_input_length=4000,
         max_total_tokens=4096,
     ) as handle:

View File

@@ -164,7 +164,8 @@ async fn generate(
     let start_time = Instant::now();
     metrics::increment_counter!("tgi_request_count");
-    tracing::debug!("Input: {}", req.inputs);
+    // Do not log ultra long inputs, like image payloads.
+    tracing::debug!("Input: {}", &req.inputs[..1000.min(req.inputs.len())]);
     let compute_characters = req.inputs.chars().count();
     let mut add_prompt = None;
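A side note on the logging hunk above: &req.inputs[..1000.min(req.inputs.len())] slices by byte offset, and Rust string slicing panics if the offset lands inside a multi-byte UTF-8 character. A minimal boundary-safe sketch, not part of the commit (the helper name and the 1000-byte cap are illustrative):

// Return a prefix of `s` at most `max_bytes` long, ending on a UTF-8
// character boundary so that slicing can never panic.
fn log_prefix(s: &str, max_bytes: usize) -> &str {
    if s.len() <= max_bytes {
        return s;
    }
    let mut end = max_bytes;
    // Walk back at most 3 bytes until we land on a char boundary.
    while !s.is_char_boundary(end) {
        end -= 1;
    }
    &s[..end]
}

fn main() {
    let inputs = "describe this image 🐔 ".repeat(100);
    // Stand-in for the router's tracing::debug! call.
    println!("Input: {}", log_prefix(&inputs, 1000));
}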

View File

@@ -432,7 +432,11 @@ fn prepare_input(
 ) -> Result<(tokenizers::Encoding, String), ValidationError> {
     let simplified_query = if is_multimodal {
         static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
-        RE.replace_all(&inputs, "<image>").into()
+        // HACK: Llava uses an arbitrary number of tokens per image (between 576 and ~576 * 5).
+        // Idefics uses a sequence encoder which doesn't take as much space in the KV cache, but
+        // there are still some encoder values.
+        // This hack just forces more "allocation" for the given image.
+        RE.replace_all(&inputs, "<image>".repeat(576)).into()
     } else {
         inputs.clone()
     };
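To make the new replacement concrete, here is a small standalone sketch. It assumes the regex and once_cell crates already used in the hunk above; the helper name expand_image_placeholders and the sample prompt are made up for illustration. Each markdown image reference ![](...) is rewritten into 576 <image> placeholders, so downstream tokenization and KV-cache budgeting see a count much closer to what the model will really consume:

use once_cell::sync::Lazy;
use regex::Regex;

// Same pattern as the router uses above: a markdown image reference `![](...)`.
static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());

// Hypothetical helper mirroring the multimodal branch of prepare_input.
fn expand_image_placeholders(inputs: &str) -> String {
    // Budget 576 `<image>` tokens per image instead of a single placeholder.
    RE.replace_all(inputs, "<image>".repeat(576)).into_owned()
}

fn main() {
    let prompt = "What is in this picture? ![](data:image/png;base64,AAAA)";
    let expanded = expand_image_placeholders(prompt);
    // The image payload is now 576 `<image>` tokens in the simplified query.
    assert_eq!(expanded.matches("<image>").count(), 576);
    println!("expanded length: {} chars", expanded.chars().count());
}

Note that 576 is the lower bound from the comment above; an image can still cost up to roughly five times that, so this is a coarse budget rather than an exact accounting.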
@@ -443,6 +447,8 @@ fn prepare_input(
     // Optionally truncate
     if let Some(truncate) = truncate {
+        // XXX: Critical to keep the multimodal check, otherwise this modifies the original string,
+        // which we really don't want.
         if truncate < encoding.len() && !is_multimodal {
             encoding.truncate(truncate, 0, TruncationDirection::Left);
             inputs = tokenizer