diff --git a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json
index 1c74a405..2f7ffb08 100644
--- a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json
+++ b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json
@@ -5,7 +5,7 @@
       "index": 0,
       "logprobs": null,
       "message": {
-        "content": "The image shows a rabbit with a is on floating in outer a a in outer and seems a as an in the be an astronaut suit a a a have crew the front ag a suit the chalet",
+        "content": "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape.",
         "name": null,
         "role": "assistant",
         "tool_calls": null
@@ -13,14 +13,14 @@
       "usage": null
     }
   ],
-  "created": 1730084696,
+  "created": 1730164250,
   "id": "",
   "model": "Qwen/Qwen2-VL-7B-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "2.3.2-dev0-native",
+  "system_fingerprint": "2.4.1-dev0-native",
   "usage": {
-    "completion_tokens": 41,
+    "completion_tokens": 58,
     "prompt_tokens": 349,
-    "total_tokens": 390
+    "total_tokens": 407
   }
 }
diff --git a/integration-tests/models/test_flash_qwen2_vl.py b/integration-tests/models/test_flash_qwen2_vl.py
index 73413eb0..357de2b1 100644
--- a/integration-tests/models/test_flash_qwen2_vl.py
+++ b/integration-tests/models/test_flash_qwen2_vl.py
@@ -3,7 +3,7 @@ import pytest
 
 @pytest.fixture(scope="module")
 def flash_qwen2_vl_handle(launcher):
-    with launcher("Qwen/Qwen2-VL-7B-Instruct") as handle:
+    with launcher("Qwen/Qwen2-VL-7B-Instruct", cuda_graphs=[0]) as handle:
         yield handle
 
 
@@ -36,13 +36,7 @@ async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot):
 
     assert (
         response.choices[0].message.content
-        == "The image shows a rabbit with a is on floating in outer a a in outer and seems a as an in the be an astronaut suit a a a have crew the front ag a suit the chalet"
+        == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape."
     )
-    # # TODO: return reference response
-    # assert (
-    #     response.choices[0].message.content
-    #     == "The image depicts an astronaut with a rabbit's head standing on a rocky, reddish terrain. The astronaut is wearing a space suit with various buttons and"
-    # )
-
     assert response == response_snapshot
diff --git a/server/text_generation_server/models/custom_modeling/qwen2_vl.py b/server/text_generation_server/models/custom_modeling/qwen2_vl.py
index 8eee045a..6ebc3d4e 100644
--- a/server/text_generation_server/models/custom_modeling/qwen2_vl.py
+++ b/server/text_generation_server/models/custom_modeling/qwen2_vl.py
@@ -409,7 +409,7 @@ class Qwen2VLForConditionalGeneration(nn.Module):
                 .item()
             )
             # TODO: revisit above to get all next_image_pos in one go to avoid copying in the loop
-            time_steps, height, width = image_grid_thw[image_index]
+            time_steps, height, width = image_grid_thw[image_index].clone()
             height //= self.spatial_merge_size
             width //= self.spatial_merge_size
 
@@ -487,12 +487,13 @@ class Qwen2VLForConditionalGeneration(nn.Module):
         # apply the visual model to the pixel values if they are provided
         if pixel_values is not None and len(pixel_values) > 0:
             if pixel_values is not None:
-                image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
+                image_embeds = self.visual(
+                    pixel_values, grid_thw=image_grid_thw
+                ).squeeze(0)
                 inputs_embeds[input_ids == self.image_token_id] = image_embeds
 
-        position_ids = self.get_position_ids(input_ids.unsqueeze(0), image_grid_thw)
         hidden_states = self.text_model(
-            inputs_embeds=inputs_embeds.squeeze(0),
+            inputs_embeds=inputs_embeds,
             position_ids=position_ids,
             cu_seqlen_prefill=cu_seqlen_prefill,
             kv_cache=kv_cache,
diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py
index a8467059..fc813b30 100644
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@@ -360,6 +360,16 @@ class VlmCausalLM(FlashCausalLM):
             max_s = batch.max_current_length
             lm_head_indices = batch.prefill_head_indices
 
+        if self.model.get_position_ids:
+            if position_ids.shape[0] != 1:
+                position_ids = self.model.get_position_ids(
+                    input_ids.unsqueeze(0), batch.image_grid_thw
+                )
+                batch.position_ids = position_ids[0, 0, :]
+            else:
+                position_ids = position_ids.repeat(3, 1, 1).clone()
+                batch.position_ids = position_ids[0, 0, :]
+
         if cu_seqlen_prefill is None and self.max_past() is not None:
             # In decode, not prefill, we're actually overwriting the KV-cache
             # in a circular buffer mode.
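Note on the `.clone()` added in `qwen2_vl.py`: indexing a row of a tensor and unpacking it yields 0-d views into the original storage, so the in-place `//=` that follows would otherwise silently mutate `image_grid_thw` itself. A minimal sketch of the aliasing (illustrative values only, not taken from the test suite):

```python
import torch

grid_thw = torch.tensor([[1, 98, 146]])  # (t, h, w) for one image

# Without .clone(), unpacking yields 0-d views into grid_thw's storage,
t, h, w = grid_thw[0]
h //= 2  # so this in-place floor division also rewrites grid_thw
assert grid_thw[0, 1] == 49  # the original 98 was silently overwritten

# With .clone(), the unpacked scalars own their own storage,
grid_thw = torch.tensor([[1, 98, 146]])
t, h, w = grid_thw[0].clone()
h //= 2
assert grid_thw[0, 1] == 98  # and grid_thw stays intact for later reuse
```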
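And on the `repeat(3, 1, 1)` in the `vlm_causal_lm.py` hunk: Qwen2-VL's multimodal rotary embedding (M-RoPE) tracks three position axes (temporal, height, width), and for text tokens, which is all a decode step appends, the three axes share the same index, so the flat decode positions can simply be replicated across the three rows. A sketch under that assumption (tensor values are made up):

```python
import torch

# Flat next-token positions for two running sequences (hypothetical values).
flat = torch.tensor([412, 97])

# Replicate across the three M-RoPE axes -> shape (3, 1, batch); for text
# tokens the temporal/height/width components are identical by construction.
mrope = flat.repeat(3, 1, 1)
assert mrope.shape == (3, 1, 2)
assert torch.equal(mrope[0], mrope[1]) and torch.equal(mrope[1], mrope[2])
```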