feat: update docs, image token logic and weight names

2025-09-11 12:24:53 +00:00 · 2024-12-17 19:44:24 +00:00 · 2024-12-17 19:44:24 +00:00 · dbe1666bc7
commit dbe1666bc7
parent ebef284b3d
4 changed files with 78 additions and 5 deletions
--- a/docs/source/supported_models.md
+++ b/docs/source/supported_models.md
@ -5,6 +5,7 @@ Text Generation Inference enables serving optimized models. The following sectio

 - [Deepseek V2](https://huggingface.co/deepseek-ai/DeepSeek-V2)
 - [Idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b) (Multimodal)
+- [Idefics 3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) (Multimodal)
 - [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal)
 - [Llama](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
 - [Phi 3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@ -614,6 +614,73 @@ fn image_tokens(

            image_string
        }
+        Idefics3(config) => {
+            const FAKE: &str = "<fake_token_around_image>";
+            const IMAGE: &str = "<image>";
+            const GLOBAL_IMG: &str = "<global-img>";
+
+            let max_longest_edge_for_image_resize = config.get_max_longest_edge_for_image_resize();
+
+            // resize image if it is larger than max_longest_edge_for_image_resize keeping aspect ratio
+            let (height, width) = if height > max_longest_edge_for_image_resize
+                || width > max_longest_edge_for_image_resize
+            {
+                let aspect_ratio = height as f32 / width as f32;
+                if height > width {
+                    (
+                        max_longest_edge_for_image_resize,
+                        (max_longest_edge_for_image_resize as f32 / aspect_ratio) as usize,
+                    )
+                } else {
+                    (
+                        (max_longest_edge_for_image_resize as f32 * aspect_ratio) as usize,
+                        max_longest_edge_for_image_resize,
+                    )
+                }
+            } else {
+                (height, width)
+            };
+
+            let image_seq_len = config.get_number_of_features();
+            let max_edge = config.get_max_longest_edge();
+
+            let (image_rows, image_cols) = if height > max_edge || width > max_edge {
+                (
+                    (height as f32 / max_edge as f32).ceil() as usize,
+                    (width as f32 / max_edge as f32).ceil() as usize,
+                )
+            } else {
+                (0, 0)
+            };
+
+            let mut image_string = String::new();
+
+            if image_rows == 0 && image_cols == 0 {
+                // Single image case
+                image_string.push_str(FAKE);
+                image_string.push_str(GLOBAL_IMG);
+                image_string.push_str(&IMAGE.repeat(image_seq_len));
+                image_string.push_str(FAKE);
+            } else {
+                // Split image case
+                for n_h in 0..image_rows {
+                    for n_w in 0..image_cols {
+                        image_string.push_str(FAKE);
+                        image_string.push_str(&format!("<row_{}_col_{}>", n_h + 1, n_w + 1));
+                        image_string.push_str(&IMAGE.repeat(image_seq_len));
+                    }
+                    image_string.push('\n');
+                }
+
+                image_string.push('\n');
+                image_string.push_str(FAKE);
+                image_string.push_str(GLOBAL_IMG);
+                image_string.push_str(&IMAGE.repeat(image_seq_len));
+                image_string.push_str(FAKE);
+            }
+
+            image_string
+        }
        Paligemma(config) => "<image>".repeat(config.get_number_of_features(height, width)),
        LlavaNext(config) => "<image>".repeat(config.get_number_of_features(height, width)),
        Qwen2Vl(config) => format!(
@ -647,7 +714,8 @@ fn prepare_input<T: TokenizerTrait>(
    static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
    let (tokenizer_query, input_chunks) = match config {
        Some(
-            config @ (Idefics | Mllama | Idefics2(_) | Paligemma(_) | LlavaNext(_) | Qwen2Vl(_)),
+            config @ (Idefics | Mllama | Idefics2(_) | Idefics3(_) | Paligemma(_) | LlavaNext(_)
+            | Qwen2Vl(_)),
        ) => {
            let mut input_chunks = Vec::new();
            let mut tokenizer_query = String::with_capacity(inputs.len());
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@ -534,7 +534,7 @@ class FlashLlamaModel(torch.nn.Module):
                        prefix=(
                            f"model.layers.{layer_id}"
                            if not prefix
-                            else f"{prefix}.model.layers.{layer_id}"
+                            else f"{prefix}.layers.{layer_id}"
                        ),
                        config=config,
                        weights=weights,
@ -547,7 +547,7 @@ class FlashLlamaModel(torch.nn.Module):
                        prefix=(
                            f"model.layers.{layer_id}"
                            if not prefix
-                            else f"{prefix}.model.layers.{layer_id}"
+                            else f"{prefix}.layers.{layer_id}"
                        ),
                        config=config,
                        weights=weights,
--- a/server/text_generation_server/models/custom_modeling/idefics2.py
+++ b/server/text_generation_server/models/custom_modeling/idefics2.py
@ -774,7 +774,7 @@ class Idefics3ForConditionalGeneration(nn.Module):
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
-        input_lengths: torch.Tensor,
+        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor] = None,
@ -783,6 +783,10 @@ class Idefics3ForConditionalGeneration(nn.Module):
        # Unused here
        image_sizes: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        cross_attention_states: Optional[torch.Tensor] = None,
+        image_indices=None,
    ):
        inputs_embeds = self.text_model.embed_tokens(input_ids)
        if pixel_values is not None:
@ -872,7 +876,7 @@ class Idefics3ForConditionalGeneration(nn.Module):
            kv_cache=kv_cache,
            block_tables=block_tables,
            slots=slots,
-            input_lengths=input_lengths,
+            seqlen=seqlen,
            max_s=max_s,
            true_max_s=max_s,
            prefill_cache_indices=None,