feat: update docs, image token logic and weight names

2025-09-11 12:24:53 +00:00 · 2024-12-17 19:44:24 +00:00 · 2024-12-17 19:44:24 +00:00 · dbe1666bc7
commit dbe1666bc7
parent ebef284b3d
4 changed files with 78 additions and 5 deletions
--- a/docs/source/supported_models.md
+++ b/docs/source/supported_models.md
@ -5,6 +5,7 @@ Text Generation Inference enables serving optimized models. The following sectio
 - [Deepseek V2](https://huggingface.co/deepseek-ai/DeepSeek-V2)
 - [Idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b) (Multimodal)
 - [Idefics 3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) (Multimodal)
 - [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal)
 - [Llama](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
 - [Phi 3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@ -614,6 +614,73 @@ fn image_tokens(
            image_string
        }
        Idefics3(config) => {
            const FAKE: &str = "<fake_token_around_image>";
            const IMAGE: &str = "<image>";
            const GLOBAL_IMG: &str = "<global-img>";
            let max_longest_edge_for_image_resize = config.get_max_longest_edge_for_image_resize();
            // resize image if it is larger than max_longest_edge_for_image_resize keeping aspect ratio
            let (height, width) = if height > max_longest_edge_for_image_resize
                || width > max_longest_edge_for_image_resize
            {
                let aspect_ratio = height as f32 / width as f32;
                if height > width {
                    (
                        max_longest_edge_for_image_resize,
                        (max_longest_edge_for_image_resize as f32 / aspect_ratio) as usize,
                    )
                } else {
                    (
                        (max_longest_edge_for_image_resize as f32 * aspect_ratio) as usize,
                        max_longest_edge_for_image_resize,
                    )
                }
            } else {
                (height, width)
            };
            let image_seq_len = config.get_number_of_features();
            let max_edge = config.get_max_longest_edge();
            let (image_rows, image_cols) = if height > max_edge || width > max_edge {
                (
                    (height as f32 / max_edge as f32).ceil() as usize,
                    (width as f32 / max_edge as f32).ceil() as usize,
                )
            } else {
                (0, 0)
            };
            let mut image_string = String::new();
            if image_rows == 0 && image_cols == 0 {
                // Single image case
                image_string.push_str(FAKE);
                image_string.push_str(GLOBAL_IMG);
                image_string.push_str(&IMAGE.repeat(image_seq_len));
                image_string.push_str(FAKE);
            } else {
                // Split image case
                for n_h in 0..image_rows {
                    for n_w in 0..image_cols {
                        image_string.push_str(FAKE);
                        image_string.push_str(&format!("<row_{}_col_{}>", n_h + 1, n_w + 1));
                        image_string.push_str(&IMAGE.repeat(image_seq_len));
                    }
                    image_string.push('\n');
                }
                image_string.push('\n');
                image_string.push_str(FAKE);
                image_string.push_str(GLOBAL_IMG);
                image_string.push_str(&IMAGE.repeat(image_seq_len));
                image_string.push_str(FAKE);
            }
            image_string
        }
        Paligemma(config) => "<image>".repeat(config.get_number_of_features(height, width)),
        LlavaNext(config) => "<image>".repeat(config.get_number_of_features(height, width)),
        Qwen2Vl(config) => format!(
@ -647,7 +714,8 @@ fn prepare_input<T: TokenizerTrait>(
    static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
    let (tokenizer_query, input_chunks) = match config {
        Some(
-            config @ (Idefics | Mllama | Idefics2(_) | Paligemma(_) | LlavaNext(_) | Qwen2Vl(_)),
+            config @ (Idefics | Mllama | Idefics2(_) | Idefics3(_) | Paligemma(_) | LlavaNext(_)
            | Qwen2Vl(_)),
        ) => {
            let mut input_chunks = Vec::new();
            let mut tokenizer_query = String::with_capacity(inputs.len());
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@ -534,7 +534,7 @@ class FlashLlamaModel(torch.nn.Module):
                        prefix=(
                            f"model.layers.{layer_id}"
                            if not prefix
-                            else f"{prefix}.model.layers.{layer_id}"
+                            else f"{prefix}.layers.{layer_id}"
                        ),
                        config=config,
                        weights=weights,
@ -547,7 +547,7 @@ class FlashLlamaModel(torch.nn.Module):
                        prefix=(
                            f"model.layers.{layer_id}"
                            if not prefix
-                            else f"{prefix}.model.layers.{layer_id}"
+                            else f"{prefix}.layers.{layer_id}"
                        ),
                        config=config,
                        weights=weights,
--- a/server/text_generation_server/models/custom_modeling/idefics2.py
+++ b/server/text_generation_server/models/custom_modeling/idefics2.py
@ -774,7 +774,7 @@ class Idefics3ForConditionalGeneration(nn.Module):
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
-        input_lengths: torch.Tensor,
+        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor] = None,
@ -783,6 +783,10 @@ class Idefics3ForConditionalGeneration(nn.Module):
        # Unused here
        image_sizes: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        cross_attention_states: Optional[torch.Tensor] = None,
        image_indices=None,
    ):
        inputs_embeds = self.text_model.embed_tokens(input_ids)
        if pixel_values is not None:
@ -872,7 +876,7 @@ class Idefics3ForConditionalGeneration(nn.Module):
            kv_cache=kv_cache,
            block_tables=block_tables,
            slots=slots,
-            input_lengths=input_lengths,
+            seqlen=seqlen,
            max_s=max_s,
            true_max_s=max_s,
            prefill_cache_indices=None,