Mirror of https://github.com/huggingface/text-generation-inference.git
Synced 2025-09-11 12:24:53 +00:00
Fixed load test. Bad sanitization on the router meant CUDA OOM.
This commit is contained in:
parent 99771cfad5
commit 39620ce29f
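
As far as the diff shows, the bad sanitization was an under-count rather than unsafe input: validation replaced each markdown image payload with a single <image> placeholder, so a multimodal prompt looked about one token larger while Llava actually consumes between 576 and ~576 * 5 vision tokens per image. Under the load test, that under-estimate presumably let the scheduler admit more concurrent requests than the KV cache could hold, hence the CUDA OOM. The hunks below inflate the placeholder count, stop logging full image payloads, and keep truncation away from multimodal inputs.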
@@ -13,7 +13,7 @@ def get_chicken():
 def flash_llava_next_handle(launcher):
     with launcher(
         "llava-hf/llava-v1.6-mistral-7b-hf",
-        num_shard=1,
+        num_shard=2,
         max_input_length=4000,
         max_total_tokens=4096,
     ) as handle:
@@ -164,7 +164,8 @@ async fn generate(
     let start_time = Instant::now();
     metrics::increment_counter!("tgi_request_count");

-    tracing::debug!("Input: {}", req.inputs);
+    // Do not log ultra long inputs, like image payloads.
+    tracing::debug!("Input: {}", &req.inputs[..1000.min(req.inputs.len())]);

     let compute_characters = req.inputs.chars().count();
     let mut add_prompt = None;
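One caveat worth flagging on the new debug line (not addressed by this commit): &req.inputs[..1000.min(req.inputs.len())] slices by byte offset, and Rust panics if byte 1000 lands inside a multi-byte UTF-8 character. A minimal boundary-safe sketch; safe_prefix is a hypothetical helper, not router code:

// Hypothetical helper, not part of the router: take at most `max_bytes`
// of a &str without panicking inside a multi-byte UTF-8 character.
fn safe_prefix(s: &str, max_bytes: usize) -> &str {
    if s.len() <= max_bytes {
        return s;
    }
    // Back off from `max_bytes` to the nearest char boundary.
    let mut end = max_bytes;
    while !s.is_char_boundary(end) {
        end -= 1;
    }
    &s[..end]
}

fn main() {
    let ascii = "a".repeat(2000);
    assert_eq!(safe_prefix(&ascii, 1000).len(), 1000);

    // Each '😀' is 4 bytes, so byte 999 is not a char boundary;
    // safe_prefix backs off to 996 instead of panicking.
    let emoji = "😀".repeat(600);
    assert_eq!(safe_prefix(&emoji, 999).len(), 996);
}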
@@ -432,7 +432,11 @@ fn prepare_input(
 ) -> Result<(tokenizers::Encoding, String), ValidationError> {
     let simplified_query = if is_multimodal {
         static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
-        RE.replace_all(&inputs, "<image>").into()
+        // HACK: Llava uses an arbitrary number of tokens (between 576 and ~576 * 5).
+        // Idefics uses a sequence encoder which doesn't take as much space in the
+        // KV cache, but there are still some encoder values.
+        // This hack just forces more "allocation" for the given input.
+        RE.replace_all(&inputs, "<image>".repeat(576)).into()
     } else {
         inputs.clone()
     };
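To make the hack concrete, here is a self-contained sketch of the substitution above, built on the same regex and crates (regex, once_cell); the prompt string is invented for illustration:

// Self-contained sketch of the placeholder inflation above.
// Requires the `regex` and `once_cell` crates.
use once_cell::sync::Lazy;
use regex::Regex;

static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());

fn main() {
    let inputs = "Describe this picture: ![](data:image/png;base64,iVBORw0KGgo...)";
    // Each markdown image becomes 576 "<image>" placeholders, so the token
    // count seen by validation approximates the real KV-cache footprint.
    let simplified: String = RE.replace_all(inputs, "<image>".repeat(576)).into();
    assert_eq!(simplified.matches("<image>").count(), 576);
    assert!(simplified.starts_with("Describe this picture: <image>"));
}

Repeating the literal placeholder keeps the fix purely string-level: the validation tokenizer now counts roughly as many placeholder tokens as the vision tower will occupy, so block allocation errs on the side of over-reserving.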
@@ -443,6 +447,8 @@ fn prepare_input(

     // Optionally truncate
     if let Some(truncate) = truncate {
+        // XXX: Critical to keep the multimodal check, otherwise this modifies the
+        // original string, which we really don't want.
         if truncate < encoding.len() && !is_multimodal {
             encoding.truncate(truncate, 0, TruncationDirection::Left);
             inputs = tokenizer
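Why the new guard comment matters: for multimodal requests, inputs is the placeholder-inflated string from the previous hunk, and the truncation branch rebuilds inputs by decoding the truncated encoding; left-truncation would silently drop <image> placeholders and desynchronize the prompt from its image payloads. A toy sketch of the control flow with stand-in types (the real code uses tokenizers::Encoding):

// Stand-in types to sketch the guard; the real router uses
// tokenizers::Encoding and re-decodes `inputs` after truncating.
struct Encoding {
    ids: Vec<u32>,
}

impl Encoding {
    fn len(&self) -> usize {
        self.ids.len()
    }
    // Left truncation: keep only the last `max_len` ids.
    fn truncate_left(&mut self, max_len: usize) {
        let drop = self.len().saturating_sub(max_len);
        self.ids.drain(..drop);
    }
}

fn main() {
    let is_multimodal = true;
    let truncate = Some(8usize);
    let mut encoding = Encoding { ids: (0..600).collect() };

    if let Some(truncate) = truncate {
        // Mirrors the XXX comment: without `!is_multimodal` the inflated
        // multimodal prompt would be truncated and the rebuilt input string
        // would lose its "<image>" placeholders.
        if truncate < encoding.len() && !is_multimodal {
            encoding.truncate_left(truncate);
        }
    }

    // Untouched because is_multimodal is true.
    assert_eq!(encoding.len(), 600);
}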