Merge fd88b1d6b9 into 8f8819795f

2025-09-17 23:34:52 +00:00 · 2025-04-18 09:19:18 -04:00 · 2025-04-18 09:19:18 -04:00 · d73e2184ef
commit d73e2184ef
parent 8f8819795f fd88b1d6b9
2 changed files with 35 additions and 5 deletions
--- a/router/src/config.rs
+++ b/router/src/config.rs
@ -89,7 +89,12 @@ impl LlavaNext {
    pub fn get_number_of_features(&self, height: usize, width: usize) -> usize {
        let image_size = self.vision_config.image_size;
        let patch_size = self.vision_config.patch_size;
-        assert!(image_size % patch_size == 0);
+        if image_size % patch_size != 0 {
            warn!(
                "Image size {} is not divisible by patch size {}, will round down",
                image_size, patch_size
            );
        }
        let npatches = image_size / patch_size;
        // Dimensions are intentionally swapped to be bug-compatible with
        // upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59
@ -461,4 +466,26 @@ mod test {
        let slots = config.get_number_of_features(1067, 1600);
        assert_eq!(slots, 2144);
    }
    /// Exercises `get_number_of_features` with a vision `image_size` (337)
    /// that is not a multiple of `patch_size` (14), and pins the number of
    /// feature slots expected for a 640x640 input.
    #[test]
    fn test_uneven_division() {
        let vision_config = VisionConfig {
            image_size: 337, // Intentionally uneven
            patch_size: 14,
        };
        let image_grid_pinpoints = vec![
            (336, 672),
            (672, 336),
            (672, 672),
            (1008, 336),
            (336, 1008),
        ];
        let config = LlavaNext {
            text_config: TextConfig {},
            vision_config,
            image_grid_pinpoints,
        };

        // The call is expected to return a value instead of aborting on the
        // uneven image/patch ratio.
        let (height, width) = (640, 640);
        let slots = config.get_number_of_features(height, width);
        assert_eq!(slots, 2928);
    }
 }
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@ -225,7 +225,10 @@ def get_number_of_features(height: int, width: int, config) -> int:
    image_size = config.vision_config.image_size
    patch_size = config.vision_config.patch_size
-    assert image_size % patch_size == 0
+    if image_size % patch_size != 0:
        logger.warning(
            f"Image size {image_size} is not divisible by patch size {patch_size}"
        )
    npatches = image_size // patch_size
@ -579,9 +582,9 @@ class VlmCausalLM(FlashCausalLM):
        cuda_graph["input_lengths"].zero_()
        cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths
        cuda_graph["cache_lengths"].zero_()
-        cuda_graph["cache_lengths"][
+        cuda_graph["cache_lengths"][: cache_lengths_tensor.shape[0]] = (
-            : cache_lengths_tensor.shape[0]
+            cache_lengths_tensor
-        ] = cache_lengths_tensor
+        )
        with self._forward_context(
            block_tables=cuda_graph["block_tables"],