From 1aa812da43253840c994d49882ac1de16aaddbb7 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 22 Apr 2024 20:40:21 +0000 Subject: [PATCH] Making image passes image per image to save VRAM. --- router/client/src/client.rs | 6 +- .../models/custom_modeling/idefics2.py | 100 ++++++++++-------- .../models/custom_modeling/llava_next.py | 7 +- 3 files changed, 65 insertions(+), 48 deletions(-) diff --git a/router/client/src/client.rs b/router/client/src/client.rs index 24ecd2ad..e8035106 100644 --- a/router/client/src/client.rs +++ b/router/client/src/client.rs @@ -115,7 +115,11 @@ impl Client { let mut inputs = String::new(); inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize)); - inputs.push_str("![]()"); + if n_tokens == 0 { + // 1 request is enough to test vision heads. + // Sending images on other queries messes up easily with truncation. + inputs.push_str("![]()"); + } requests.push(Request { id: 0, diff --git a/server/text_generation_server/models/custom_modeling/idefics2.py b/server/text_generation_server/models/custom_modeling/idefics2.py index 0029e3b2..cb2ee7db 100644 --- a/server/text_generation_server/models/custom_modeling/idefics2.py +++ b/server/text_generation_server/models/custom_modeling/idefics2.py @@ -745,58 +745,66 @@ class Idefics2ForConditionalGeneration(nn.Module): inputs_embeds = self.text_model.embed_tokens(input_ids) if pixel_values is not None: batch_size, num_images, num_channels, height, width = pixel_values.shape - pixel_values = pixel_values.to(dtype=self.dtype) # fp16 compatibility - pixel_values = pixel_values.view( - batch_size * num_images, *pixel_values.shape[2:] - ) + all_states = [] + all_pixel_values = pixel_values + all_pixel_mask = pixel_attention_mask + for i in range(batch_size): + pixel_values = all_pixel_values.to( + dtype=self.dtype + ) # fp16 compatibility + pixel_values = pixel_values[i : i + 1] + pixel_values = pixel_values.view(num_images, *pixel_values.shape[2:]) - # Remove padding images - padding images are full 0. - nb_values_per_image = pixel_values.shape[1:].numel() - real_images_inds = (pixel_values == 0.0).sum( - dim=(-1, -2, -3) - ) != nb_values_per_image - pixel_values = pixel_values[real_images_inds].contiguous() + # Remove padding images - padding images are full 0. + nb_values_per_image = pixel_values.shape[1:].numel() + real_images_inds = (pixel_values == 0.0).sum( + dim=(-1, -2, -3) + ) != nb_values_per_image + pixel_values = pixel_values[real_images_inds].contiguous() - # Handle the vision attention mask - if pixel_attention_mask is None: - pixel_attention_mask = torch.ones( - size=( - pixel_values.size(0), - pixel_values.size(2), - pixel_values.size(3), - ), - dtype=torch.bool, - device=pixel_values.device, + # Handle the vision attention mask + if pixel_attention_mask is None: + pixel_attention_mask = torch.ones( + size=( + pixel_values.size(0), + pixel_values.size(2), + pixel_values.size(3), + ), + dtype=torch.bool, + device=pixel_values.device, + ) + else: + # Remove padding images from the mask/pP p + pixel_attention_mask = all_pixel_mask[i : i + 1] + pixel_attention_mask = pixel_attention_mask.view( + 1 * num_images, *pixel_attention_mask.shape[2:] + ) + pixel_attention_mask = pixel_attention_mask[ + real_images_inds + ].contiguous() + + patch_size = self.config.vision_config.patch_size + patches_subgrid = pixel_attention_mask.unfold( + dimension=1, size=patch_size, step=patch_size ) - else: - # Remove padding images from the mask/pP p - pixel_attention_mask = pixel_attention_mask.view( - batch_size * num_images, *pixel_attention_mask.shape[2:] + patches_subgrid = patches_subgrid.unfold( + dimension=2, size=patch_size, step=patch_size ) - pixel_attention_mask = pixel_attention_mask[ - real_images_inds - ].contiguous() + patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() - patch_size = self.config.vision_config.patch_size - patches_subgrid = pixel_attention_mask.unfold( - dimension=1, size=patch_size, step=patch_size - ) - patches_subgrid = patches_subgrid.unfold( - dimension=2, size=patch_size, step=patch_size - ) - patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + # Get sequence from the vision encoder + image_hidden_states = self.vision_model( + pixel_values=pixel_values, + patch_attention_mask=patch_attention_mask, + ) - # Get sequence from the vision encoder - image_hidden_states = self.vision_model( - pixel_values=pixel_values, - patch_attention_mask=patch_attention_mask, - ) - - # Modality projection & resampling - image_hidden_states = self.connector( - image_hidden_states, - attention_mask=patch_attention_mask.view(pixel_values.size(0), -1), - ) + # Modality projection & resampling + image_hidden_states = self.connector( + image_hidden_states, + attention_mask=patch_attention_mask.view(pixel_values.size(0), -1), + ) + all_states.append(image_hidden_states) + image_hidden_states = torch.stack(all_states, dim=0) # When we generate, we don't want to replace the potential image_token_id that we generated by images # that simply don't exist inputs_embeds = self._merge_input_ids_with_image_features( diff --git a/server/text_generation_server/models/custom_modeling/llava_next.py b/server/text_generation_server/models/custom_modeling/llava_next.py index 14bf19e1..0d93791f 100644 --- a/server/text_generation_server/models/custom_modeling/llava_next.py +++ b/server/text_generation_server/models/custom_modeling/llava_next.py @@ -154,7 +154,12 @@ class LlavaNextForConditionalGeneration(nn.Module): """In place merges in vision_embeddings with inputs_embeds.""" mask = input_ids == self.config.image_token_index # Let's pray we have enabled enough slots ! - inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1]) + try: + inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1]) + except Exception as e: + raise RuntimeError( + f"Cannot fill images right now. If error happens at warmup, make sure you have enough `--max-input-tokens` to handle images. If error happens at regular runtime, please fill in an issue: {e}" + ) return inputs_embeds def forward(