llava next image encoder to allow un-aligned patch / image sizes

Jiayu Liu 2025-01-22 17:09:59 +08:00
parent 64a33c1f05
commit fd88b1d6b9
2 changed files with 35 additions and 5 deletions


@@ -88,7 +88,12 @@ impl LlavaNext {
     pub fn get_number_of_features(&self, height: usize, width: usize) -> usize {
         let image_size = self.vision_config.image_size;
         let patch_size = self.vision_config.patch_size;
-        assert!(image_size % patch_size == 0);
+        if image_size % patch_size != 0 {
+            warn!(
+                "Image size {} is not divisible by patch size {}, will round down",
+                image_size, patch_size
+            );
+        }
         let npatches = image_size / patch_size;
         // Dimensions are intentionally swapped to be bug-compatible with
         // upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59
@@ -271,4 +276,26 @@ mod test {
         let slots = config.get_number_of_features(1067, 1600);
         assert_eq!(slots, 2144);
     }
+
+    #[test]
+    fn test_uneven_division() {
+        let config = LlavaNext {
+            text_config: TextConfig {},
+            vision_config: VisionConfig {
+                image_size: 337, // Intentionally uneven
+                patch_size: 14,
+            },
+            image_grid_pinpoints: vec![
+                (336, 672),
+                (672, 336),
+                (672, 672),
+                (1008, 336),
+                (336, 1008),
+            ],
+        };
+        // Should still work even with uneven division
+        let slots = config.get_number_of_features(640, 640);
+        assert_eq!(slots, 2928);
+    }
 }


@@ -170,7 +170,10 @@ def get_number_of_features(height: int, width: int, config) -> int:
     image_size = config.vision_config.image_size
     patch_size = config.vision_config.patch_size
-    assert image_size % patch_size == 0
+    if image_size % patch_size != 0:
+        logger.warning(
+            f"Image size {image_size} is not divisible by patch size {patch_size}"
+        )
     npatches = image_size // patch_size
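
Both the Rust router config and the Python server now warn instead of asserting when the vision tower's image size is not a multiple of the patch size, and the patch count simply rounds down via integer division. A minimal standalone sketch of that behavior, in the spirit of the new Rust test above (the helper name and logger setup here are illustrative, not part of the patch):

import logging

logger = logging.getLogger(__name__)


def patch_count(image_size: int, patch_size: int) -> int:
    # Mirror the patched behavior: warn and round down instead of asserting.
    if image_size % patch_size != 0:
        logger.warning(
            f"Image size {image_size} is not divisible by patch size {patch_size}, rounding down"
        )
    return image_size // patch_size


# An aligned config divides evenly; the 337 / 14 config from the new Rust test
# now rounds down to 24 patches per side instead of tripping an assert.
assert patch_count(336, 14) == 24
assert patch_count(337, 14) == 24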
@@ -520,9 +523,9 @@ class VlmCausalLM(FlashCausalLM):
         cuda_graph["input_lengths"].zero_()
         cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths
         cuda_graph["cache_lengths"].zero_()
-        cuda_graph["cache_lengths"][
-            : cache_lengths_tensor.shape[0]
-        ] = cache_lengths_tensor
+        cuda_graph["cache_lengths"][: cache_lengths_tensor.shape[0]] = (
+            cache_lengths_tensor
+        )

         with self._forward_context(
             block_tables=cuda_graph["block_tables"],
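
The second hunk in this file only reflows an existing assignment: the cache_lengths buffer captured by the CUDA graph is zeroed and the live batch's values are copied into its leading slice, so a batch smaller than the captured maximum leaves the tail padded with zeros. A minimal sketch of that padding pattern with made-up sizes (not the model code itself):

import torch

# Tensors captured in a CUDA graph have a fixed maximum batch size, so a
# smaller live batch is written into a zeroed prefix of the captured buffer.
max_batch_size = 8
cache_lengths = torch.zeros(max_batch_size, dtype=torch.int32)

cache_lengths_tensor = torch.tensor([3, 7, 1], dtype=torch.int32)  # live batch of 3
cache_lengths.zero_()
cache_lengths[: cache_lengths_tensor.shape[0]] = cache_lengths_tensor
# cache_lengths is now tensor([3, 7, 1, 0, 0, 0, 0, 0], dtype=torch.int32)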