From fca2218fa91ab8208b3c3633315415be10325069 Mon Sep 17 00:00:00 2001
From: Janne Alatalo
Date: Fri, 13 Dec 2024 11:58:07 +0200
Subject: [PATCH] Fix runtime error when Qwen2-VL was prompted with multiple
 images

Fix a runtime error that occurred when the Qwen2-VL model was prompted with
more than one image. The runtime error was:

  File "text-generation-inference/server/text_generation_server/models/custom_modeling/qwen2_vl.py", line 459, in get_position_ids
    text_pos_ids = torch.arange(text_length, device=d)
RuntimeError: upper bound and larger bound inconsistent with step sign

The error was caused by the text_length variable going negative when multiple
images triggered multiple iterations of the main loop in get_position_ids.
The bug is a simple logic mistake: next_image_pos holds an offset relative to
current_pos, but was used as if it were an absolute position from zero.
---
 .../text_generation_server/models/custom_modeling/qwen2_vl.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/server/text_generation_server/models/custom_modeling/qwen2_vl.py b/server/text_generation_server/models/custom_modeling/qwen2_vl.py
index ddb4e36d..a8e1e8c1 100644
--- a/server/text_generation_server/models/custom_modeling/qwen2_vl.py
+++ b/server/text_generation_server/models/custom_modeling/qwen2_vl.py
@@ -450,7 +450,7 @@ class Qwen2VLForConditionalGeneration(nn.Module):
                 width //= self.spatial_merge_size
 
                 # calculate the length of the text and image tokens
-                text_length = next_image_pos - current_pos
+                text_length = next_image_pos
                 start_idx = (
                     llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
                 )
@@ -480,7 +480,7 @@ class Qwen2VLForConditionalGeneration(nn.Module):
                 )
                 llm_pos_ids_list.append(image_pos_ids)
 
-                current_pos = next_image_pos + time_steps * height * width
+                current_pos += next_image_pos + time_steps * height * width
                 image_index += 1
 
             if current_pos < batch_input_ids.size(1):
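
Note: for reference, below is a standalone sketch of the failure mode. It is
not part of the patch; it only mimics the indexing in get_position_ids with
made-up token ids and a fixed 3-token image placeholder, to show why the old
text_length computation goes negative on the second image while the fixed
version does not.

    # Standalone sketch (not part of the patch). Variable names mirror
    # get_position_ids; token ids and the placeholder length are made up.
    import torch

    image_token_id = 9
    # two images (3 placeholder tokens each) separated by two text tokens
    input_ids = torch.tensor([1, 2, 9, 9, 9, 3, 4, 9, 9, 9])
    image_len = 3  # stands in for time_steps * height * width

    current_pos = 0
    for _ in range(2):
        # index of the next image token, *relative* to current_pos
        next_image_pos = (
            (input_ids[current_pos:] == image_token_id).nonzero()[0].item()
        )

        # buggy: on the 2nd iteration this gives 2 - 5 = -3, so torch.arange
        # raises "upper bound and larger bound inconsistent with step sign"
        # text_length = next_image_pos - current_pos

        # fixed: the relative offset already is the number of text tokens
        text_length = next_image_pos
        text_pos_ids = torch.arange(text_length)

        # fixed: advance by the relative offset plus the image placeholder
        # length (the buggy version assigned instead of adding)
        current_pos += next_image_pos + image_len

The first iteration hides the bug because current_pos is zero, so the
relative and absolute positions coincide; the mistake only surfaces once a
second image forces another pass through the loop.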