mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-21 14:52:20 +00:00
Qwen2-VL runtime error fix when prompted with multiple images (#2840)
* Fix runtime error when Qwen2-VL was prompted with multiple images
Fix a runtime error that occurs when the Qwen2-VL model is given a prompt
containing more than one image. The runtime error was:
File "text-generation-inference/server/text_generation_server/models/custom_modeling/qwen2_vl.py", line 459, in get_position_ids
text_pos_ids = torch.arange(text_length, device=d)
RuntimeError: upper bound and larger bound inconsistent with step sign
The error was caused by the text_length variable becoming negative
when multiple images caused multiple iterations of the main loop in
the get_position_ids function.
The error is a simple logic mistake: next_image_pos is initialized
as a relative offset from current_pos, but it was used as if it were an
absolute position counted from zero.
* Fix runtime error when Qwen2-VL was prompted with multiple images
Fix a runtime error that occurs when the Qwen2-VL model is given a prompt
containing more than one image. The runtime error was:
File "text-generation-inference/server/text_generation_server/models/custom_modeling/qwen2_vl.py", line 534, in forward
inputs_embeds[input_ids == self.image_token_id] = image_embeds
RuntimeError: shape mismatch: value tensor of shape [512, 3584] cannot be broadcast to indexing result of shape [1024, 3584]
(The shape numbers in the error message can differ depending on the
resolutions of the input images.)
The error was caused by adding the wrong number of <|image_pad|> tokens
to the tokenized input in the image_text_replacement function.
The error is a simple logic mistake: the number of image pad tokens
was derived from the length of the first dimension of the pixel_values
tensor (pixel_values.shape[0]). However, pixel_values contains the
patches from all of the images. The code therefore added the total
number of image pad tokens required for the whole input at each image's
location, which left extra image pad tokens in the tokenized input.
The fix is to compute the number of required tokens from the
image_grid_thw tensor, which holds the grid_t, grid_h, and grid_w
values for each image. grid_t * grid_h * grid_w gives the total
number of patches for that image [1]. The number of required image pad
tokens is number_of_patches // 4.
[1] 31f9a289a6/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py (L311)
---------
Co-authored-by: Janne Alatalo <janne.alatalo@jamk.fi>
This commit is contained in:
parent
a72f339c79
commit
7eeefa3b57
@ -450,7 +450,7 @@ class Qwen2VLForConditionalGeneration(nn.Module):
|
|||||||
width //= self.spatial_merge_size
|
width //= self.spatial_merge_size
|
||||||
|
|
||||||
# calculate the length of the text and image tokens
|
# calculate the length of the text and image tokens
|
||||||
text_length = next_image_pos - current_pos
|
text_length = next_image_pos
|
||||||
start_idx = (
|
start_idx = (
|
||||||
llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
|
llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
|
||||||
)
|
)
|
||||||
@ -480,7 +480,7 @@ class Qwen2VLForConditionalGeneration(nn.Module):
|
|||||||
)
|
)
|
||||||
llm_pos_ids_list.append(image_pos_ids)
|
llm_pos_ids_list.append(image_pos_ids)
|
||||||
|
|
||||||
current_pos = next_image_pos + time_steps * height * width
|
current_pos += next_image_pos + time_steps * height * width
|
||||||
image_index += 1
|
image_index += 1
|
||||||
|
|
||||||
if current_pos < batch_input_ids.size(1):
|
if current_pos < batch_input_ids.size(1):
|
||||||
|
@ -68,7 +68,8 @@ def image_text_replacement(processor, image_input, config, image_id: int) -> str
|
|||||||
elif config.model_type == "paligemma":
|
elif config.model_type == "paligemma":
|
||||||
return "<image>" * config.text_config.num_image_tokens
|
return "<image>" * config.text_config.num_image_tokens
|
||||||
elif config.model_type == "qwen2_vl":
|
elif config.model_type == "qwen2_vl":
|
||||||
num_pads = image_input.pixel_values.shape[0] // 4
|
grid_t, grid_h, grid_w = image_input["image_grid_thw"][image_id]
|
||||||
|
num_pads = grid_t * grid_h * grid_w // 4
|
||||||
padding = "<|image_pad|>" * num_pads
|
padding = "<|image_pad|>" * num_pads
|
||||||
return f"<|vision_start|>{padding}<|vision_end|>"
|
return f"<|vision_start|>{padding}<|vision_end|>"
|
||||||
else:
|
else:
|
||||||
|
Loading…
Reference in New Issue
Block a user