Making image passes image per image to save VRAM.

2025-09-11 12:24:53 +00:00 · 2024-04-22 20:40:21 +00:00 · 2024-04-22 20:40:21 +00:00 · 1aa812da43
commit 1aa812da43
parent 60d2757c36
3 changed files with 65 additions and 48 deletions
--- a/router/client/src/client.rs
+++ b/router/client/src/client.rs
@ -115,7 +115,11 @@ impl Client {

            let mut inputs = String::new();
            inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
+            if n_tokens == 0 {
+                // 1 request is enough to test vision heads.
+                // Sending images on other queries messes up easily with truncation.
                inputs.push_str("![](data:image/jpeg;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII=)");
+            }

            requests.push(Request {
                id: 0,
--- a/server/text_generation_server/models/custom_modeling/idefics2.py
+++ b/server/text_generation_server/models/custom_modeling/idefics2.py
@ -745,10 +745,15 @@ class Idefics2ForConditionalGeneration(nn.Module):
        inputs_embeds = self.text_model.embed_tokens(input_ids)
        if pixel_values is not None:
            batch_size, num_images, num_channels, height, width = pixel_values.shape
-            pixel_values = pixel_values.to(dtype=self.dtype)  # fp16 compatibility
-            pixel_values = pixel_values.view(
-                batch_size * num_images, *pixel_values.shape[2:]
-            )
+            all_states = []
+            all_pixel_values = pixel_values
+            all_pixel_mask = pixel_attention_mask
+            for i in range(batch_size):
+                pixel_values = all_pixel_values.to(
+                    dtype=self.dtype
+                )  # fp16 compatibility
+                pixel_values = pixel_values[i : i + 1]
+                pixel_values = pixel_values.view(num_images, *pixel_values.shape[2:])

                # Remove padding images - padding images are full 0.
                nb_values_per_image = pixel_values.shape[1:].numel()
@ -770,8 +775,9 @@ class Idefics2ForConditionalGeneration(nn.Module):
                    )
                else:
                    # Remove padding images from the mask/pP p
+                    pixel_attention_mask = all_pixel_mask[i : i + 1]
                    pixel_attention_mask = pixel_attention_mask.view(
-                    batch_size * num_images, *pixel_attention_mask.shape[2:]
+                        1 * num_images, *pixel_attention_mask.shape[2:]
                    )
                    pixel_attention_mask = pixel_attention_mask[
                        real_images_inds
@ -797,6 +803,8 @@ class Idefics2ForConditionalGeneration(nn.Module):
                    image_hidden_states,
                    attention_mask=patch_attention_mask.view(pixel_values.size(0), -1),
                )
+                all_states.append(image_hidden_states)
+            image_hidden_states = torch.stack(all_states, dim=0)
            # When we generate, we don't want to replace the potential image_token_id that we generated by images
            # that simply don't exist
            inputs_embeds = self._merge_input_ids_with_image_features(
--- a/server/text_generation_server/models/custom_modeling/llava_next.py
+++ b/server/text_generation_server/models/custom_modeling/llava_next.py
@ -154,7 +154,12 @@ class LlavaNextForConditionalGeneration(nn.Module):
        """In place merges in vision_embeddings with inputs_embeds."""
        mask = input_ids == self.config.image_token_index
        # Let's pray we have enabled enough slots !
+        try:
            inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1])
+        except Exception as e:
+            raise RuntimeError(
+                f"Cannot fill images right now. If error happens at warmup, make sure you have enough `--max-input-tokens`  to handle images. If error happens at regular runtime, please fill in an issue: {e}"
+            )
        return inputs_embeds

    def forward(