diff --git a/router/src/config.rs b/router/src/config.rs
index 93b6f4fa..650f1b47 100644
--- a/router/src/config.rs
+++ b/router/src/config.rs
@@ -89,7 +89,12 @@ impl LlavaNext {
     pub fn get_number_of_features(&self, height: usize, width: usize) -> usize {
         let image_size = self.vision_config.image_size;
         let patch_size = self.vision_config.patch_size;
-        assert!(image_size % patch_size == 0);
+        if image_size % patch_size != 0 {
+            warn!(
+                "Image size {} is not divisible by patch size {}, will round down",
+                image_size, patch_size
+            );
+        }
         let npatches = image_size / patch_size;
         // Dimensions are intentionally swapped to be bug-compatible with
         // upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59
@@ -461,4 +466,26 @@ mod test {
         let slots = config.get_number_of_features(1067, 1600);
         assert_eq!(slots, 2144);
     }
+
+    #[test]
+    fn test_uneven_division() {
+        let config = LlavaNext {
+            text_config: TextConfig {},
+            vision_config: VisionConfig {
+                image_size: 337, // Intentionally uneven
+                patch_size: 14,
+            },
+            image_grid_pinpoints: vec![
+                (336, 672),
+                (672, 336),
+                (672, 672),
+                (1008, 336),
+                (336, 1008),
+            ],
+        };
+
+        // Should still work even with uneven division
+        let slots = config.get_number_of_features(640, 640);
+        assert_eq!(slots, 2928);
+    }
 }
diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py
index 2b1e01df..11901e34 100644
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@@ -225,7 +225,10 @@ def get_number_of_features(height: int, width: int, config) -> int:
     image_size = config.vision_config.image_size
     patch_size = config.vision_config.patch_size
 
-    assert image_size % patch_size == 0
+    if image_size % patch_size != 0:
+        logger.warning(
+            f"Image size {image_size} is not divisible by patch size {patch_size}"
+        )
 
    npatches = image_size // patch_size
 
@@ -579,9 +582,9 @@ class VlmCausalLM(FlashCausalLM):
         cuda_graph["input_lengths"].zero_()
         cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths
         cuda_graph["cache_lengths"].zero_()
-        cuda_graph["cache_lengths"][
-            : cache_lengths_tensor.shape[0]
-        ] = cache_lengths_tensor
+        cuda_graph["cache_lengths"][: cache_lengths_tensor.shape[0]] = (
+            cache_lengths_tensor
+        )
 
         with self._forward_context(
             block_tables=cuda_graph["block_tables"],
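
Note: both hunks make the same behavioral change: an image size that is not a
multiple of the patch size no longer aborts the process, it just floors the
patch count. A minimal standalone sketch (not part of the patch) using the
values from test_uneven_division:

    image_size, patch_size = 337, 14  # 337 = 14 * 24 + 1

    # Before this change: `assert image_size % patch_size == 0` raised here.
    # After: emit a warning and fall through to floor division.
    if image_size % patch_size != 0:
        print(f"Image size {image_size} is not divisible by patch size {patch_size}")

    npatches = image_size // patch_size
    print(npatches)  # 24 -- the trailing pixel of each dimension is dropped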