Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-09-11 12:24:53 +00:00.
Make image passes process one image at a time to save VRAM.
commit 1aa812da43
parent 60d2757c36
@@ -115,7 +115,11 @@ impl Client {
             let mut inputs = String::new();
             inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
+            if n_tokens == 0 {
+                // 1 request is enough to test vision heads.
+                // Sending images on other queries messes up easily with truncation.
+                inputs.push_str("");
+            }
 
             requests.push(Request {
                 id: 0,
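Below is a minimal Python sketch of the warmup pattern in the Rust hunk above; `build_warmup_inputs` and `image_markdown` are hypothetical names, not TGI API. Only the first request carries an image: one request is enough to exercise the vision heads, and images on later requests interact badly with truncation.

def build_warmup_inputs(n_requests: int, max_input_length: int, image_markdown: str) -> list[str]:
    batch = []
    for i in range(n_requests):
        # Filler text sized to the maximum input length.
        inputs = "_test " * max_input_length
        if i == 0:
            # Attach the image payload to the first request only.
            inputs += image_markdown
        batch.append(inputs)
    return batch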
@@ -745,10 +745,15 @@ class Idefics2ForConditionalGeneration(nn.Module):
         inputs_embeds = self.text_model.embed_tokens(input_ids)
         if pixel_values is not None:
             batch_size, num_images, num_channels, height, width = pixel_values.shape
-            pixel_values = pixel_values.to(dtype=self.dtype)  # fp16 compatibility
-            pixel_values = pixel_values.view(
-                batch_size * num_images, *pixel_values.shape[2:]
-            )
+            all_states = []
+            all_pixel_values = pixel_values
+            all_pixel_mask = pixel_attention_mask
+            for i in range(batch_size):
+                pixel_values = all_pixel_values.to(
+                    dtype=self.dtype
+                )  # fp16 compatibility
+                pixel_values = pixel_values[i : i + 1]
+                pixel_values = pixel_values.view(num_images, *pixel_values.shape[2:])
 
             # Remove padding images - padding images are full 0.
             nb_values_per_image = pixel_values.shape[1:].numel()
@@ -770,8 +775,9 @@ class Idefics2ForConditionalGeneration(nn.Module):
                 )
             else:
                 # Remove padding images from the mask.
+                pixel_attention_mask = all_pixel_mask[i : i + 1]
                 pixel_attention_mask = pixel_attention_mask.view(
-                    batch_size * num_images, *pixel_attention_mask.shape[2:]
+                    1 * num_images, *pixel_attention_mask.shape[2:]
                 )
             pixel_attention_mask = pixel_attention_mask[
                 real_images_inds
@@ -797,6 +803,8 @@ class Idefics2ForConditionalGeneration(nn.Module):
                     image_hidden_states,
                     attention_mask=patch_attention_mask.view(pixel_values.size(0), -1),
                 )
+                all_states.append(image_hidden_states)
+            image_hidden_states = torch.stack(all_states, dim=0)
             # When we generate, we don't want to replace the potential image_token_id that we generated by images
             # that simply don't exist
             inputs_embeds = self._merge_input_ids_with_image_features(
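This loop is the core of the VRAM saving: instead of pushing all batch_size * num_images images through the vision tower in one call, each batch element's images are encoded separately and the per-element hidden states are stacked afterwards. A standalone sketch of the pattern (not the Idefics2 code itself; `vision_encoder` is a hypothetical stand-in for the vision tower):

import torch

def encode_images_per_element(pixel_values: torch.Tensor, vision_encoder) -> torch.Tensor:
    # pixel_values: (batch_size, num_images, channels, height, width)
    batch_size = pixel_values.shape[0]
    all_states = []
    for i in range(batch_size):
        # One batch element at a time: peak activation memory now scales with
        # num_images rather than batch_size * num_images.
        element = pixel_values[i]  # (num_images, channels, height, width)
        all_states.append(vision_encoder(element))
    return torch.stack(all_states, dim=0)

The trade-off is batch_size sequential encoder calls instead of one batched call: lower peak memory in exchange for some extra latency.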
@@ -154,7 +154,12 @@ class LlavaNextForConditionalGeneration(nn.Module):
         """In place merges in vision_embeddings with inputs_embeds."""
         mask = input_ids == self.config.image_token_index
-        inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1])
+        # Let's pray we have enabled enough slots!
+        try:
+            inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1])
+        except Exception as e:
+            raise RuntimeError(
+                f"Cannot fill images right now. If this error happens at warmup, make sure you have enough `--max-input-tokens` to handle images. If it happens at regular runtime, please file an issue: {e}"
+            )
         return inputs_embeds
 
     def forward(