diff --git a/router/src/config.rs b/router/src/config.rs index ef672870..ccbdd8b2 100644 --- a/router/src/config.rs +++ b/router/src/config.rs @@ -71,10 +71,12 @@ fn get_unpadded_features( let current_aspect_ratio: f64 = current_width as f64 / current_height as f64; let (current_height, current_width) = if aspect_ratio > current_aspect_ratio { let new_height = (height * current_width) / width; - (new_height, current_width) + let padding = (current_height - new_height) / 2; + (current_height - (2 * padding), current_width) } else { let new_width = (width * current_height) / height; - (current_height, new_width) + let padding = (current_width - new_width) / 2; + (current_height, current_width - (2 * padding)) }; let unpadded_features = current_height * current_width; @@ -88,7 +90,9 @@ impl LlavaNext { let patch_size = self.vision_config.patch_size; assert!(image_size % patch_size == 0); let npatches = image_size / patch_size; - let (num_patch_height, num_patch_width) = + // Dimensions are intentionally swapped to be bug-compatible with + // upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59 + let (num_patch_width, num_patch_height) = get_anyres_image_grid_shape(height, width, &self.image_grid_pinpoints, image_size); let (unpadded_features, newline_features) = diff --git a/server/text_generation_server/models/custom_modeling/llava_next.py b/server/text_generation_server/models/custom_modeling/llava_next.py index de9673aa..9e0caf7b 100644 --- a/server/text_generation_server/models/custom_modeling/llava_next.py +++ b/server/text_generation_server/models/custom_modeling/llava_next.py @@ -39,7 +39,7 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): Args: image_size (`tuple`): - The size of the input image in the format (width, height). + The size of the input image in the format (height, width). grid_pinpoints (`List`): A list containing possible resolutions. Each item in the list should be a tuple or list of the form `(height, width)`. @@ -47,7 +47,7 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): The size of each image patch. Returns: - tuple: The shape of the image patch grid in the format (width, height). + tuple: The shape of the image patch grid in the format (height, width). """ if not isinstance(grid_pinpoints, list): raise ValueError("grid_pinpoints should be a list of tuples or lists") @@ -229,7 +229,10 @@ class LlavaNextForConditionalGeneration(nn.Module): raise ValueError( "The number of patches is not consistent with the image size." ) - num_patch_height, num_patch_width = get_anyres_image_grid_shape( + + # Dimensions are intentionally swapped to be bug-compatible with + # upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59 + num_patch_width, num_patch_height = get_anyres_image_grid_shape( image_sizes[image_idx], self.config.image_grid_pinpoints, self.config.vision_config.image_size, diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index a02a2091..60bec05d 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -22,7 +22,7 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): Args: image_size (`tuple`): - The size of the input image in the format (width, height). + The size of the input image in the format (height, width). grid_pinpoints (`List`): A list containing possible resolutions. Each item in the list should be a tuple or list of the form `(height, width)`. @@ -64,19 +64,26 @@ def image_text_replacement(processor, image_input, config, image_id) -> str: def get_unpadded_features( - height: int, width: int, npatches: int, num_patch_height: int, num_patch_width: int + original_height: int, + original_width: int, + npatches: int, + num_patch_height: int, + num_patch_width: int, ) -> Tuple[int, int]: current_height = npatches * num_patch_height current_width = npatches * num_patch_width - aspect_ratio: float = width / height + aspect_ratio: float = original_width / original_height current_aspect_ratio: float = current_width / current_height + if aspect_ratio > current_aspect_ratio: - new_height = (height * current_width) // width - current_height = new_height + new_height = (original_height * current_width) // original_width + padding = (current_height - new_height) // 2 + current_height = current_height - (2 * padding) else: - new_width = (width * current_height) // height - current_width = new_width + new_width = (original_width * current_height) // original_height + padding = (current_width - new_width) // 2 + current_width = current_width - (2 * padding) unpadded_features = current_height * current_width newline_features = current_height @@ -95,7 +102,9 @@ def get_number_of_features(height: int, width: int, config) -> int: npatches = image_size // patch_size - num_patch_height, num_patch_width = get_anyres_image_grid_shape( + # Dimensions are intentionally swapped to be bug-compatible with + # upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59 + num_patch_width, num_patch_height = get_anyres_image_grid_shape( [height, width], image_grid_pinpoints, image_size,