llava next image encoder to allow un-aligned patch / image sizes

Jiayu Liu 2025-01-22 17:09:59 +08:00
parent 64a33c1f05
commit fd88b1d6b9
2 changed files with 35 additions and 5 deletions


@@ -88,7 +88,12 @@ impl LlavaNext {
     pub fn get_number_of_features(&self, height: usize, width: usize) -> usize {
         let image_size = self.vision_config.image_size;
         let patch_size = self.vision_config.patch_size;
-        assert!(image_size % patch_size == 0);
+        if image_size % patch_size != 0 {
+            warn!(
+                "Image size {} is not divisible by patch size {}, will round down",
+                image_size, patch_size
+            );
+        }
         let npatches = image_size / patch_size;
         // Dimensions are intentionally swapped to be bug-compatible with
         // upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59
@@ -271,4 +276,26 @@ mod test {
         let slots = config.get_number_of_features(1067, 1600);
         assert_eq!(slots, 2144);
     }
+
+    #[test]
+    fn test_uneven_division() {
+        let config = LlavaNext {
+            text_config: TextConfig {},
+            vision_config: VisionConfig {
+                image_size: 337, // Intentionally uneven
+                patch_size: 14,
+            },
+            image_grid_pinpoints: vec![
+                (336, 672),
+                (672, 336),
+                (672, 672),
+                (1008, 336),
+                (336, 1008),
+            ],
+        };
+        // Should still work even with uneven division
+        let slots = config.get_number_of_features(640, 640);
+        assert_eq!(slots, 2928);
+    }
 }


@@ -170,7 +170,10 @@ def get_number_of_features(height: int, width: int, config) -> int:
     image_size = config.vision_config.image_size
     patch_size = config.vision_config.patch_size
-    assert image_size % patch_size == 0
+    if image_size % patch_size != 0:
+        logger.warning(
+            f"Image size {image_size} is not divisible by patch size {patch_size}"
+        )
     npatches = image_size // patch_size
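
Both the Rust router config and the Python server now warn instead of asserting when the vision tower's image size is not a multiple of the patch size, and the patch count simply rounds down via integer division. A minimal standalone sketch of that behavior, in the spirit of the new Rust test above (the helper name and logger setup here are illustrative, not part of the patch):

import logging

logger = logging.getLogger(__name__)


def patch_count(image_size: int, patch_size: int) -> int:
    # Mirror the patched behavior: warn and round down instead of asserting.
    if image_size % patch_size != 0:
        logger.warning(
            f"Image size {image_size} is not divisible by patch size {patch_size}, rounding down"
        )
    return image_size // patch_size


# An aligned config divides evenly; the 337 / 14 config from the new Rust test
# now rounds down to 24 patches per side instead of tripping an assert.
assert patch_count(336, 14) == 24
assert patch_count(337, 14) == 24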
@@ -520,9 +523,9 @@ class VlmCausalLM(FlashCausalLM):
         cuda_graph["input_lengths"].zero_()
         cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths
         cuda_graph["cache_lengths"].zero_()
-        cuda_graph["cache_lengths"][
-            : cache_lengths_tensor.shape[0]
-        ] = cache_lengths_tensor
+        cuda_graph["cache_lengths"][: cache_lengths_tensor.shape[0]] = (
+            cache_lengths_tensor
+        )

         with self._forward_context(
             block_tables=cuda_graph["block_tables"],
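
The second hunk in this file only reflows an existing assignment: the cache_lengths buffer captured by the CUDA graph is zeroed and the live batch's values are copied into its leading slice, so a batch smaller than the captured maximum leaves the tail padded with zeros. A minimal sketch of that padding pattern with made-up sizes (not the model code itself):

import torch

# Tensors captured in a CUDA graph have a fixed maximum batch size, so a
# smaller live batch is written into a zeroed prefix of the captured buffer.
max_batch_size = 8
cache_lengths = torch.zeros(max_batch_size, dtype=torch.int32)

cache_lengths_tensor = torch.tensor([3, 7, 1], dtype=torch.int32)  # live batch of 3
cache_lengths.zero_()
cache_lengths[: cache_lengths_tensor.shape[0]] = cache_lengths_tensor
# cache_lengths is now tensor([3, 7, 1, 0, 0, 0, 0, 0], dtype=torch.int32)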