diff --git a/Dockerfile b/Dockerfile
index 65376b0b..9782f931 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -193,7 +193,8 @@ RUN cd server && \
     pwd && \
     text-generation-server --help

-RUN uv pip install torchvision --no-deps
+# This shouldn't be necessary.
+# RUN uv pip install torchvision --no-deps

 # Copy build artifacts from flash attention builder
 COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
diff --git a/integration-tests/models/test_transformers_llama4.py b/integration-tests/models/test_transformers_llama4.py
index a73138d1..a20d3284 100644
--- a/integration-tests/models/test_transformers_llama4.py
+++ b/integration-tests/models/test_transformers_llama4.py
@@ -152,4 +152,4 @@ async def test_flash_llama4_image_base64_rgb_jpg(flash_llama4, response_snapshot
         ],
         max_tokens=100,
     )
-    assert response == response_snapshot
\ No newline at end of file
+    assert response == response_snapshot
diff --git a/router/src/config.rs b/router/src/config.rs
index 0074b29a..8188e535 100644
--- a/router/src/config.rs
+++ b/router/src/config.rs
@@ -153,12 +153,9 @@ fn find_supported_resolutions(max_num_chunks: usize, height: usize) -> Vec<(usiz

         for (h, w) in _asp_ratios {
             let divisor = gcd(h, w);
-            let key = (h / divisor, w / divisor);  // reduced aspect ratio as key
+            let key = (h / divisor, w / divisor); // reduced aspect ratio as key

-            if !asp_dict.contains_key(&key) {
-                asp_dict.insert(key, vec![]);
-            }
-            asp_dict.get_mut(&key).unwrap().push((h, w));
+            asp_dict.entry(key).or_default().push((h, w));
         }
     }

@@ -176,7 +173,7 @@ fn find_supported_resolutions(max_num_chunks: usize, height: usize) -> Vec<(usiz
 fn get_best_fit(
     original_height: usize,
     original_width: usize,
-    possible_resolutions: &Vec<(usize, usize)>,
+    possible_resolutions: &[(usize, usize)],
     resize_to_max_canvas: bool,
 ) -> (usize, usize) {
     let orig_h = original_height as f32;
@@ -194,20 +191,13 @@ fn get_best_fit(
     let upscaling_options: Vec<f32> = scales.iter().copied().filter(|&s| s >= 1.0).collect();
     let selected_scale = if !upscaling_options.is_empty() {
         if resize_to_max_canvas {
-            upscaling_options
-                .into_iter()
-                .fold(f32::MIN, f32::max)
+            upscaling_options.into_iter().fold(f32::MIN, f32::max)
         } else {
-            upscaling_options
-                .into_iter()
-                .fold(f32::MAX, f32::min)
+            upscaling_options.into_iter().fold(f32::MAX, f32::min)
         }
     } else {
-        let downscaling_options: Vec<f32> =
-            scales.iter().copied().filter(|&s| s < 1.0).collect();
-        downscaling_options
-            .into_iter()
-            .fold(f32::MIN, f32::max)
+        let downscaling_options: Vec<f32> = scales.iter().copied().filter(|&s| s < 1.0).collect();
+        downscaling_options.into_iter().fold(f32::MIN, f32::max)
     };

     let chosen_canvas: Vec<(usize, usize)> = possible_resolutions
@@ -375,7 +365,6 @@ pub struct Gemma3 {
     vision_config: Gemma3VisionConfig,
 }

-
 #[derive(Clone, Debug, Serialize, Deserialize)]
 #[serde(tag = "model_type")]
 #[serde(rename_all = "snake_case")]
diff --git a/router/src/lib.rs b/router/src/lib.rs
index e2c0f921..50adb5cf 100644
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -207,7 +207,6 @@ pub struct Llama4Processor {
     do_image_splitting: bool,
 }

-
 #[derive(Debug, Clone, Deserialize, Default)]
 pub struct HubProcessorConfig {
     pub chat_template: Option<ChatTemplateVersions>,
diff --git a/router/src/validation.rs b/router/src/validation.rs
index 3813e358..2d1d9a3d 100644
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@@ -698,7 +698,8 @@ fn image_tokens(
             let image_height = config.image_size();
             let patch_size = config.patch_size();
             let pixel_shuffle_ratio = config.pixel_shuffle_ratio();
-            let downsample_ratio = (1.0 / (pixel_shuffle_ratio * pixel_shuffle_ratio)).round() as usize;
+            let downsample_ratio =
+                (1.0 / (pixel_shuffle_ratio * pixel_shuffle_ratio)).round() as usize;

             let (ratio_h, ratio_w) = config.get_aspect_ratios(height, width);
             let image_width = image_height; // Assuming pixel shape: [H][W][C]
@@ -726,7 +727,7 @@ fn image_tokens(
             img_string.push_str(IMAGE_END);

             img_string
-        },
+        }
         Qwen2Vl(config) => format!(
             "<|vision_start|>{:?}<|vision_end|>",
             "<|image_pad|>".repeat(config.get_number_of_features(height, width))
@@ -770,8 +771,8 @@ fn prepare_input(
     static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
     let (tokenizer_query, input_chunks) = match config {
         Some(
-            config @ (Idefics | Mllama | Idefics2(_) | Idefics3(_) | Gemma3(_) | Llama4(_) | Paligemma(_)
-            | LlavaNext(_) | Qwen2Vl(_) | Qwen2_5Vl(_)),
+            config @ (Idefics | Mllama | Idefics2(_) | Idefics3(_) | Gemma3(_) | Llama4(_)
+            | Paligemma(_) | LlavaNext(_) | Qwen2Vl(_) | Qwen2_5Vl(_)),
         ) => {
             let mut input_chunks = Vec::new();
             let mut tokenizer_query = String::with_capacity(inputs.len());
diff --git a/server/text_generation_server/models/transformers_flash_vlm.py b/server/text_generation_server/models/transformers_flash_vlm.py
index ff385017..a7beb68b 100644
--- a/server/text_generation_server/models/transformers_flash_vlm.py
+++ b/server/text_generation_server/models/transformers_flash_vlm.py
@@ -395,7 +395,7 @@ class TransformersFlashVlmCausalLM(VlmCausalLM):
             image_grid_thw=image_grid_thw,
             attention_mask=inputs.get("attention_mask", None),
             use_sdpa=inputs.get("use_sdpa", False),
-            cache_position=inputs.get("cache_position", None)
+            cache_position=inputs.get("cache_position", None),
         ).logits

         logits = self.post_process_outputs(logits, lm_head_indices)
@@ -560,9 +560,7 @@ class TransformersGemma3VlmCausalLM(TransformersFlashVlmCausalLM):

 class TransformersLlama4VlmCausalLM(TransformersFlashVlmCausalLM):
     def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill):
-        inputs = super().pre_process_inputs(
-            input_ids, position_ids, cu_seqlen_prefill
-        )
+        inputs = super().pre_process_inputs(input_ids, position_ids, cu_seqlen_prefill)
         inputs["cache_position"] = position_ids
         inputs["attention_mask"] = torch.zeros((1, 1, 1, 1), device=input_ids.device)
-        return inputs
\ No newline at end of file
+        return inputs