mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 12:24:53 +00:00
Fixed load test. Bad input sanitization on the router meant CUDA OOM.
This commit is contained in:
parent 99771cfad5
commit 39620ce29f
@@ -13,7 +13,7 @@ def get_chicken():
 def flash_llava_next_handle(launcher):
     with launcher(
         "llava-hf/llava-v1.6-mistral-7b-hf",
-        num_shard=1,
+        num_shard=2,
         max_input_length=4000,
         max_total_tokens=4096,
     ) as handle:
@@ -164,7 +164,8 @@ async fn generate(
     let start_time = Instant::now();
     metrics::increment_counter!("tgi_request_count");

-    tracing::debug!("Input: {}", req.inputs);
+    // Do not log ultra long inputs, like image payloads.
+    tracing::debug!("Input: {}", &req.inputs[..1000.min(req.inputs.len())]);

     let compute_characters = req.inputs.chars().count();
     let mut add_prompt = None;
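
As an aside, the new debug line slices the request by byte offset, which in Rust panics if offset 1000 lands inside a multi-byte UTF-8 character. A minimal standalone sketch of the same log-truncation idea, with a hypothetical boundary-safe `preview` helper (not TGI code):

fn preview(s: &str, max_bytes: usize) -> &str {
    // Walk back from `max_bytes` to the nearest UTF-8 character boundary
    // so the slice below cannot panic on multi-byte input.
    let mut end = max_bytes.min(s.len());
    while !s.is_char_boundary(end) {
        end -= 1;
    }
    &s[..end]
}

fn main() {
    // Stand-in for req.inputs: a huge inlined image payload.
    let inputs = "![](data:image/png;base64,AAAA)".repeat(100);
    // Same intent as the diff's `&req.inputs[..1000.min(req.inputs.len())]`.
    println!("Input: {}", preview(&inputs, 1000));
}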
@@ -432,7 +432,11 @@ fn prepare_input(
 ) -> Result<(tokenizers::Encoding, String), ValidationError> {
     let simplified_query = if is_multimodal {
         static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
-        RE.replace_all(&inputs, "<image>").into()
+        // HACK: Llava uses an arbitrary number of tokens (between 576 and ~576 * 5).
+        // Idefics uses a sequence encoder which doesn't take as much space in the KV cache, but
+        // there are still some encoder values.
+        // This hack just forces more "allocation" for the given image.
+        RE.replace_all(&inputs, "<image>".repeat(576)).into()
     } else {
         inputs.clone()
     };
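
For context, the replacement above is what fixes the OOM: a single "<image>" placeholder made the router budget one token per image, while Llava actually consumes hundreds of KV-cache slots per image. A standalone sketch of the technique, assuming the regex and once_cell crates (the function name is illustrative, not TGI's API):

use once_cell::sync::Lazy;
use regex::Regex;

fn simplify_multimodal(inputs: &str) -> String {
    // Matches markdown image references like ![](data:image/png;base64,...).
    static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
    // Budget ~576 placeholder tokens per image so the scheduler reserves
    // enough KV-cache space instead of over-committing and hitting CUDA OOM.
    RE.replace_all(inputs, "<image>".repeat(576)).into_owned()
}

fn main() {
    let prompt = "Describe this: ![](data:image/png;base64,iVBORw0KGgo=)";
    let simplified = simplify_multimodal(prompt);
    assert_eq!(simplified.matches("<image>").count(), 576);
}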
@@ -443,6 +447,8 @@ fn prepare_input(

     // Optionally truncate
     if let Some(truncate) = truncate {
+        // XXX: Critical to keep the multimodal check, otherwise this modifies the original
+        // string, which we really don't want.
         if truncate < encoding.len() && !is_multimodal {
             encoding.truncate(truncate, 0, TruncationDirection::Left);
             inputs = tokenizer
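
The guard matters because, just past where the hunk is cut off, `inputs` appears to be rebuilt from the truncated encoding, so truncating a multimodal request would rewrite the prompt and drop the repeated "<image>" placeholders. A minimal illustrative sketch of that control flow, with simplified stand-in types rather than TGI's:

struct Encoding {
    ids: Vec<u32>,
}

impl Encoding {
    fn len(&self) -> usize {
        self.ids.len()
    }
    // Keep only the last `max_len` ids, mirroring TruncationDirection::Left.
    fn truncate_left(&mut self, max_len: usize) {
        let start = self.len().saturating_sub(max_len);
        self.ids.drain(..start);
    }
}

fn maybe_truncate(encoding: &mut Encoding, truncate: Option<usize>, is_multimodal: bool) {
    if let Some(truncate) = truncate {
        // Skip truncation entirely for multimodal inputs: the caller would
        // otherwise re-derive the input string from the shortened encoding.
        if truncate < encoding.len() && !is_multimodal {
            encoding.truncate_left(truncate);
        }
    }
}

fn main() {
    let mut enc = Encoding { ids: (0..10).collect() };
    maybe_truncate(&mut enc, Some(4), false);
    assert_eq!(enc.ids, vec![6, 7, 8, 9]);
}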