diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md index 0f39ff28..5ac90351 100644 --- a/docs/source/supported_models.md +++ b/docs/source/supported_models.md @@ -5,6 +5,7 @@ Text Generation Inference enables serving optimized models. The following sectio - [Deepseek V2](https://huggingface.co/deepseek-ai/DeepSeek-V2) - [Idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b) (Multimodal) +- [Idefics 3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) (Multimodal) - [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal) - [Llama](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f) - [Phi 3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) diff --git a/router/src/validation.rs b/router/src/validation.rs index 8137ac58..6d5b06bd 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -614,6 +614,73 @@ fn image_tokens( image_string } + Idefics3(config) => { + const FAKE: &str = ""; + const IMAGE: &str = ""; + const GLOBAL_IMG: &str = ""; + + let max_longest_edge_for_image_resize = config.get_max_longest_edge_for_image_resize(); + + // resize image if it is larger than max_longest_edge_for_image_resize keeping aspect ratio + let (height, width) = if height > max_longest_edge_for_image_resize + || width > max_longest_edge_for_image_resize + { + let aspect_ratio = height as f32 / width as f32; + if height > width { + ( + max_longest_edge_for_image_resize, + (max_longest_edge_for_image_resize as f32 / aspect_ratio) as usize, + ) + } else { + ( + (max_longest_edge_for_image_resize as f32 * aspect_ratio) as usize, + max_longest_edge_for_image_resize, + ) + } + } else { + (height, width) + }; + + let image_seq_len = config.get_number_of_features(); + let max_edge = config.get_max_longest_edge(); + + let (image_rows, image_cols) = if height > max_edge || width > max_edge { + ( + (height as f32 / max_edge as f32).ceil() as usize, + (width as f32 / max_edge as f32).ceil() as usize, + ) + } else { + (0, 0) + }; + + let mut image_string = String::new(); + + if image_rows == 0 && image_cols == 0 { + // Single image case + image_string.push_str(FAKE); + image_string.push_str(GLOBAL_IMG); + image_string.push_str(&IMAGE.repeat(image_seq_len)); + image_string.push_str(FAKE); + } else { + // Split image case + for n_h in 0..image_rows { + for n_w in 0..image_cols { + image_string.push_str(FAKE); + image_string.push_str(&format!("", n_h + 1, n_w + 1)); + image_string.push_str(&IMAGE.repeat(image_seq_len)); + } + image_string.push('\n'); + } + + image_string.push('\n'); + image_string.push_str(FAKE); + image_string.push_str(GLOBAL_IMG); + image_string.push_str(&IMAGE.repeat(image_seq_len)); + image_string.push_str(FAKE); + } + + image_string + } Paligemma(config) => "".repeat(config.get_number_of_features(height, width)), LlavaNext(config) => "".repeat(config.get_number_of_features(height, width)), Qwen2Vl(config) => format!( @@ -647,7 +714,8 @@ fn prepare_input( static RE: Lazy = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap()); let (tokenizer_query, input_chunks) = match config { Some( - config @ (Idefics | Mllama | Idefics2(_) | Paligemma(_) | LlavaNext(_) | Qwen2Vl(_)), + config @ (Idefics | Mllama | Idefics2(_) | Idefics3(_) | Paligemma(_) | LlavaNext(_) + | Qwen2Vl(_)), ) => { let mut input_chunks = Vec::new(); let mut tokenizer_query = String::with_capacity(inputs.len()); diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index 3e565109..63207938 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -534,7 +534,7 @@ class FlashLlamaModel(torch.nn.Module): prefix=( f"model.layers.{layer_id}" if not prefix - else f"{prefix}.model.layers.{layer_id}" + else f"{prefix}.layers.{layer_id}" ), config=config, weights=weights, @@ -547,7 +547,7 @@ class FlashLlamaModel(torch.nn.Module): prefix=( f"model.layers.{layer_id}" if not prefix - else f"{prefix}.model.layers.{layer_id}" + else f"{prefix}.layers.{layer_id}" ), config=config, weights=weights, diff --git a/server/text_generation_server/models/custom_modeling/idefics2.py b/server/text_generation_server/models/custom_modeling/idefics2.py index 6040625b..b1967ec3 100644 --- a/server/text_generation_server/models/custom_modeling/idefics2.py +++ b/server/text_generation_server/models/custom_modeling/idefics2.py @@ -774,7 +774,7 @@ class Idefics3ForConditionalGeneration(nn.Module): kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], block_tables: torch.Tensor, slots: torch.Tensor, - input_lengths: torch.Tensor, + seqlen: Seqlen, max_s: int, prefill_cache_indices: Optional[torch.Tensor], lm_head_indices: Optional[torch.Tensor] = None, @@ -783,6 +783,10 @@ class Idefics3ForConditionalGeneration(nn.Module): # Unused here image_sizes: Optional[torch.Tensor] = None, adapter_data: Optional[torch.Tensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + cross_attention_states: Optional[torch.Tensor] = None, + image_indices=None, ): inputs_embeds = self.text_model.embed_tokens(input_ids) if pixel_values is not None: @@ -872,7 +876,7 @@ class Idefics3ForConditionalGeneration(nn.Module): kv_cache=kv_cache, block_tables=block_tables, slots=slots, - input_lengths=input_lengths, + seqlen=seqlen, max_s=max_s, true_max_s=max_s, prefill_cache_indices=None,