Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-19 22:02:06 +00:00)
Improve qwen vl impl (#2943)
* feat: refactor model, improve startup and re enable tests
* fix: improve multimodal rotary embed caching
* fix: limit vision flop calc to qwen2 vl models and update config typing
* fix: include clippy lint
* feat: refactor position ids in warmup and bump tests
* fix: prefer default dtype
* fix: enable all cuda graphs and bump snapshots
* fix: adjust rotaty init path
* fix: simplify get position ids and remove usused vision config
* fix: update position ids so first dim is batch, simplify rotary and bump vlm default token limit
* fix: improve position id init during cuda warmup for mrope and simplfy rotary forward
* fix: check existance before accessing rope type in cuda warmup
* fix: check key before access
* fix: improve mrope check in cuda graph warmup
* fix: remove check for default rope type
* fix: add more test and improve model generation
* fix: improve and simplify get_cos_sin, refactors and cleanup get_position_ids
* fix: adjust signatures with types
This commit is contained in:
parent dd2bd5fdb3
commit c1cf36c0dc
@@ -1,73 +1,469 @@
 {
   "details": {
     "best_of_sequences": null,
-    "finish_reason": "length",
+    "finish_reason": "eos_token",
-    "generated_tokens": 10,
+    "generated_tokens": 76,
     "prefill": [],
     "seed": null,
     "tokens": [
       {
         "id": 18183,
-        "logprob": -1.6669922,
+        "logprob": -1.5195312,
         "special": false,
         "text": " Deep"
       },
       ... (the remaining per-token entries follow the same pattern: the logprobs of the other nine original tokens, " learning is a subset of machine learning that uses", shift slightly, and 66 new entries are appended for the longer generation, ending with the special "<|endoftext|>" token, id 151643, logprob -2.0019531) ...
     ],
     "top_tokens": null
   },
-  "generated_text": " Deep learning is a subset of machine learning that uses"
+  "generated_text": " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future."
 }
@@ -7,67 +7,67 @@
     "seed": 0,
     "tokens": [
      (each entry keeps the fields "id", "logprob", "special": false and "text"; they are condensed here to text, id, logprob)
-      "?\n\n" 1939 -2.2460938, "Deep" 33464 0.0, " Learning" 20909 -0.48608398, " " 4102 -2.265625, "is" 285 0.0, " an" 458 -0.6328125, " artificial" 20443 -0.1796875, " intelligence" 11229 0.0, " (" 320 -0.37695312, "AI" 15469 0.0
+      "?\n" 5267 -1.1464844, "Deep" 33464 -0.83203125, " Learning" 20909 -0.5625, " (" 320 -2.1464844, "DL" 16524 0.0, ")," 701 -2.2089844, " or" 476 -0.27368164, " artificial" 20443 -0.09442139, " neural" 29728 0.0, " networks" 14155 0.0
     ],
     "top_tokens": null
   },
-  "generated_text": "What is deep learning?\n\nDeep Learning is an artificial intelligence (AI"
+  "generated_text": "What is deep learning?\nDeep Learning (DL), or artificial neural networks"
 }
@@ -9,61 +9,61 @@
     "tokens": [
      (only the logprob of each entry changes, shown here as old -> new)
       { "id": 18183, "logprob": -1.4912109 -> -1.5195312, "special": false, "text": " Deep" },
       { "id": 6832, "logprob": -0.075683594 -> -0.06817627, "special": false, "text": " learning" },
       { "id": 374, "logprob": -0.12408447 -> -0.13122559, "special": false, "text": " is" },
       { "id": 264, "logprob": -0.12768555 -> -0.13415527, "special": false, "text": " a" },
       { "id": 25993, "logprob": -0.82128906 -> -0.87353516, "special": false, "text": " subset" },
       { "id": 315, "logprob": -0.0012636185 -> -0.0011396408, "special": false, "text": " of" },
       { "id": 5662, "logprob": -0.12878418 -> -0.16442871, "special": false, "text": " machine" },
       { "id": 6832, "logprob": -0.0015888214 -> -0.0026416779, "special": false, "text": " learning" },
       { "id": 429, "logprob": -0.49194336 -> -0.48754883, "special": false, "text": " that" },
       { "id": 5711, "logprob": -1.2626953 -> -1.2294922, "special": false, "text": " uses" }
     ]
(the same changes are repeated in the hunks @@ -82,61 +82,61 @@, @@ -155,61 +155,61 @@ and @@ -228,61 +228,61 @@ for the other sequences in this snapshot)
@@ -0,0 +1,26 @@ (new file)
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The image showcases a stunning cityscape, featuring the iconic Statue of Liberty in the foreground. The image displays Lady Liberty's imposing presence, with her towering base standing beside her. Behind the statue, the city's skyline extends across the horizon, adorned with numerous tall buildings, including the Empire State Building and other notable skyscrapers. The water reflecting the sun's rays creates a serene and picturesque scene, emphasizing the beauty and resilience of this global landmark. The sky is a clear, pale blue, adding to the overall tranquility of the scene.",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1738348090,
  "id": "",
  "model": "Qwen/Qwen2-VL-7B-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.1.1-dev0-native",
  "usage": {
    "completion_tokens": 110,
    "prompt_tokens": 8736,
    "total_tokens": 8846
  }
}
@@ -0,0 +1,26 @@ (new file)
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The image shows a stylized scene set in what appears to be a diner or restaurant. In the foreground, there is a table with various food items, including a burger with lettuce and tomato, a bowl of fries, and a drink in a cup with a straw. On the right side of the table, there is an owl sitting alertly, looking directly at the camera. Behind the owl and the table, there is a large, green, dinosaur-like creature resembling Godzilla, with its mouth open and tongue visible. In the background, the diner's decor includes various signs and posters, with a green sign reading \"Basta\" and another sign that says \"Tabasco.\" The setting has a retro or vintage feel, with fluorescent lighting overhead and clean, polished surfaces.",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1738348100,
  "id": "",
  "model": "Qwen/Qwen2-VL-7B-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.1.1-dev0-native",
  "usage": {
    "completion_tokens": 156,
    "prompt_tokens": 5375,
    "total_tokens": 5531
  }
}
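Both snapshots are ordinary OpenAI-style chat.completion payloads returned by TGI's OpenAI-compatible endpoint. A minimal sketch of reproducing the first one, assuming a TGI instance serving Qwen/Qwen2-VL-7B-Instruct is already running locally (the base URL, port and api_key placeholder are assumptions, not part of this commit):

# Sketch only: assumes a local TGI server on port 8080 exposing the /v1 chat API.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="Qwen/Qwen2-VL-7B-Instruct",
    seed=42,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                    },
                },
                {"type": "text", "text": "Describe the image"},
            ],
        },
    ],
)
print(response.choices[0].message.content)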
@@ -5,7 +5,7 @@
       "index": 0,
       "logprobs": null,
       "message": {
-        "content": "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape.",
+        "content": "The image depicts an anthropomorphic rabbit, wearing a spacesuit, standing in a barren, rocky landscape that resembles the surface of another planet, possibly Mars. The rabbit has a red digestive system label on its chest, and the surrounding environment features red sandy terrain and a hazy, floating planet or moon in the background. The scene has a surreal, fantastical quality, blending elements of science fiction and space exploration with a whimsical character.",
         "name": null,
         "role": "assistant",
         "tool_calls": null
@@ -13,14 +13,14 @@
       "usage": null
     }
   ],
-  "created": 1730164250,
+  "created": 1738347908,
   "id": "",
   "model": "Qwen/Qwen2-VL-7B-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "2.4.2-dev0-native",
+  "system_fingerprint": "3.1.1-dev0-native",
   "usage": {
-    "completion_tokens": 58,
+    "completion_tokens": 89,
-    "prompt_tokens": 349,
+    "prompt_tokens": 1364,
-    "total_tokens": 407
+    "total_tokens": 1453
   }
 }
@@ -11,10 +11,10 @@
       "logprobs": null
     }
   ],
-  "created": 1730416361,
+  "created": 1737646031,
   "id": "",
   "model": "Qwen/Qwen2-VL-7B-Instruct",
   "object": "chat.completion.chunk",
-  "system_fingerprint": "2.4.2-dev0-native",
+  "system_fingerprint": "3.0.2-dev0-native",
   "usage": null
 }
@@ -27,15 +27,16 @@ async def test_compressed_tensors_w8a8_int_dynamic_weight(
 ):
     response = await compressed_tensors_w8a8_int_dynamic_weight.generate(
         "What is deep learning?",
-        max_new_tokens=10,
+        # prefer a longer response than the default, allow the llm to end generation
+        max_new_tokens=1000,
         decoder_input_details=True,
     )

     assert (
         response.generated_text
-        == " Deep learning is a subset of machine learning that uses"
+        == " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future."
     )
-    assert response.details.generated_tokens == 10
+    assert response.details.generated_tokens == 76
     assert response == response_snapshot


@@ -64,7 +65,7 @@ async def test_compressed_tensors_w8a8_int_dynamic_weight_all_params(
     assert response.details.generated_tokens == 10
     assert (
         response.generated_text
-        == "What is deep learning?\n\nDeep Learning is an artificial intelligence (AI"
+        == "What is deep learning?\nDeep Learning (DL), or artificial neural networks"
     )
     assert response == response_snapshot
@@ -1,81 +1,122 @@
(The old version of this file was entirely commented out behind "# Disabled because it's broken."; that disabled code, which passed max_tokens=100 and expected 58 streaming chunks with the earlier "futuristic spacesuit ... lunar landscape" description, is removed. The re-enabled tests follow, plus two new cases, test_flash_qwen2_vl_bay and test_flash_qwen2_vl_inpaint:)

import pytest


@pytest.fixture(scope="module")
def flash_qwen2_vl_handle(launcher):
    with launcher("Qwen/Qwen2-VL-7B-Instruct") as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_qwen2(flash_qwen2_vl_handle):
    await flash_qwen2_vl_handle.health(300)
    return flash_qwen2_vl_handle.client


@pytest.mark.private
async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot):
    response = await flash_qwen2.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
                        },
                    },
                    {"type": "text", "text": "Describe this image."},
                ],
            },
        ],
    )

    assert (
        response.choices[0].message.content
        == "The image depicts an anthropomorphic rabbit, wearing a spacesuit, standing in a barren, rocky landscape that resembles the surface of another planet, possibly Mars. The rabbit has a red digestive system label on its chest, and the surrounding environment features red sandy terrain and a hazy, floating planet or moon in the background. The scene has a surreal, fantastical quality, blending elements of science fiction and space exploration with a whimsical character."
    )

    assert response == response_snapshot


@pytest.mark.private
async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_snapshot):
    responses = await flash_qwen2.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
                        },
                    },
                    {"type": "text", "text": "Describe this image."},
                ],
            },
        ],
        stream=True,
    )

    count = 0
    generated = ""
    last_response = None
    async for response in responses:
        count += 1
        generated += response.choices[0].delta.content
        last_response = response

    assert (
        generated
        == "The image depicts an anthropomorphic rabbit, wearing a spacesuit, standing in a barren, rocky landscape that resembles the surface of another planet, possibly Mars. The rabbit has a red digestive system label on its chest, and the surrounding environment features red sandy terrain and a hazy, floating planet or moon in the background. The scene has a surreal, fantastical quality, blending elements of science fiction and space exploration with a whimsical character."
    )
    assert count == 89
    assert last_response == response_snapshot


@pytest.mark.private
async def test_flash_qwen2_vl_bay(flash_qwen2, response_snapshot):
    response = await flash_qwen2.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        },
                    },
                    {"type": "text", "text": "Describe the image"},
                ],
            },
        ],
    )
    assert response == response_snapshot


@pytest.mark.private
async def test_flash_qwen2_vl_inpaint(flash_qwen2, response_snapshot):
    response = await flash_qwen2.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-inpaint.png"
                        },
                    },
                    {"type": "text", "text": "Describe the image"},
                ],
            },
        ],
    )
    assert response == response_snapshot
@@ -2049,7 +2049,16 @@ fn main() -> Result<(), LauncherError> {
         None => {
             let compute_type = compute_type(num_shard);
             let compute_optimal = compute_optimal(config.as_ref(), compute_type.as_ref());
-            let default = compute_optimal.unwrap_or(4096);
+            // TODO: remove this when we correctly esimate the flops for VLMs
+            // this is a short term temporary fix to enable vlms to avoid rejecting images
+            let default_optimal = match config {
+                Some(ref config) => match config.model_type.as_deref() {
+                    Some("qwen2_vl") => 10_000,
+                    _ => 4096,
+                },
+                None => 4096,
+            };
+            let default = compute_optimal.unwrap_or(default_optimal);
             let vram_maximum = vram_maximum(
                 config.as_ref(),
                 compute_type.as_ref(),
@@ -86,11 +86,18 @@ class PositionRotaryEmbedding(nn.Module):
         # `rope_type` is now standard in transformers, but some existing models
         # have `type` instead.
         rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))
+        mrope_section = rope_scaling.get("mrope_section", None)

         if rope_type == "linear":
             pass
         elif rope_type == "default":
             pass
+        elif rope_type == "mrope":
+            mrope_section = rope_scaling["mrope_section"]
+            if mrope_section is not None:
+                return RotaryPositionEmbeddingMultimodalSections(
+                    inv_freq, scaling_factor, mrope_section
+                )
         elif rope_type == "dynamic":
             scaling_factor = rope_scaling["factor"]
             return DynamicPositionRotaryEmbedding(

@@ -548,3 +555,66 @@ def apply_llama3_scaling(
         new_freqs.append((1 - smooth) * freq / scaling_factor + smooth * freq)

     return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device)

(new class appended at the end of the file:)

class RotaryPositionEmbeddingMultimodalSections(PositionRotaryEmbedding):
    def __init__(self, inv_freq: torch.Tensor, scaling_factor: float, sections: list):
        super().__init__(inv_freq, scaling_factor)
        self.sections = sections
        self._cos_cached = None
        self._sin_cached = None
        self.section_indices = (
            torch.arange(len(self.sections))
            .repeat_interleave(torch.tensor(self.sections))
            .view(1, 1, -1)
            .to(inv_freq.device)
        )

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
    ):
        # rotate half the sequence length
        rot = cos.shape[-1] // 2
        q2 = torch.cat([-query[..., rot:], query[..., :rot]], dim=-1)
        k2 = torch.cat([-key[..., rot:], key[..., :rot]], dim=-1)

        # apply the rotation
        rotary_emb.apply_rotary(query, q2, cos, sin, query, q2, True)
        rotary_emb.apply_rotary(key, k2, cos, sin, key, k2, True)

    def _update_cos_sin_cache(
        self, dtype: torch.dtype, device: torch.device, seqlen: int
    ):
        # always cache the cos/sin for the full sequence length to avoid
        # recomputing if the sequence length is smaller than the cached one
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)
            self._sections = self.section_indices.expand(seqlen, -1, -1)

    def get_cos_sin(
        self,
        position_ids: torch.Tensor,
        max_s: int,
        dtype: torch.dtype,
    ):
        self._update_cos_sin_cache(dtype, position_ids.device, max_s)
        slen = position_ids.shape[0]

        cos = self._cos_cached[position_ids].gather(1, self._sections[:slen])
        sin = self._sin_cached[position_ids].gather(1, self._sections[:slen])

        cos = torch.cat([cos, cos], dim=-1)
        sin = torch.cat([sin, sin], dim=-1)
        return cos, sin
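The new class gathers different blocks of rotary frequency dimensions from different rows of the 3-component (temporal, height, width) position ids. A minimal standalone sketch of that gather, with invented section sizes and a toy token layout (none of these numbers come from the commit, and the real code additionally concatenates the result to cover the full rotary dimension):

import torch

# Hypothetical sizes for illustration only: 3 sections of 2 frequency dims each.
sections = [2, 2, 2]                  # stand-in for mrope_section from the config
dim = sum(sections)                   # half of the rotary head dim in the real model
inv_freq = 1.0 / (10000 ** (torch.arange(dim) / dim))

seqlen = 8
t = torch.arange(seqlen, dtype=inv_freq.dtype)
freqs = torch.outer(t, inv_freq)      # (seqlen, dim)
cos_table, sin_table = freqs.cos(), freqs.sin()

# One 3-component position id per token: (temporal, height, width).
# Here: 2 text tokens, a 1x2x2 image patch grid, then 2 more text tokens.
position_ids = torch.tensor(
    [[0, 0, 0], [1, 1, 1],                          # text advances all components
     [2, 2, 2], [2, 2, 3], [2, 3, 2], [2, 3, 3],    # image: t fixed, h/w vary
     [4, 4, 4], [5, 5, 5]]                          # trailing text
)

# Map each frequency dim to the position component (section) it reads from.
section_indices = torch.arange(len(sections)).repeat_interleave(torch.tensor(sections))
section_indices = section_indices.view(1, 1, -1).expand(seqlen, 1, -1)  # (seqlen, 1, dim)

# Index the tables with all three components, then keep, per frequency dim,
# only the component that belongs to its section.
cos = cos_table[position_ids].gather(1, section_indices).squeeze(1)  # (seqlen, dim)
sin = sin_table[position_ids].gather(1, section_indices).squeeze(1)
print(cos.shape, sin.shape)  # torch.Size([8, 6]) torch.Size([8, 6])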
@@ -1363,6 +1363,7 @@ def get_model(
             quantize=quantize,
             speculator=speculator,
             dtype=dtype,
+            default_dtype=torch.bfloat16,
             kv_cache_dtype=kv_cache_dtype,
             trust_remote_code=trust_remote_code,
             lora_adapter_ids=lora_adapter_ids,
@@ -61,11 +61,6 @@ class Qwen2Attention(torch.nn.Module):
             config.sliding_window if config.sliding_window is not None else -1
         )
         self.num_heads = config.num_attention_heads
-        self.mrope_section = (
-            config.rope_scaling.get("mrope_section", None)
-            if config.rope_scaling is not None
-            else None
-        )
         self.hidden_size = config.hidden_size
         self.head_size = self.hidden_size // self.num_heads

@@ -127,17 +122,6 @@ class Qwen2Attention(torch.nn.Module):
         query = query.view(-1, self.num_heads, self.head_size)
         kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

-        if self.mrope_section is not None:
-            # if mrope_section is set, we need to split the cos and sin into 3 parts and concatenate them in a specific order
-            cos = torch.cat(
-                [m[i % 3] for i, m in enumerate(cos.split(self.mrope_section, dim=-1))],
-                dim=-1,
-            )
-            sin = torch.cat(
-                [m[i % 3] for i, m in enumerate(sin.split(self.mrope_section, dim=-1))],
-                dim=-1,
-            )
-
         self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

         if prefill_cache_indices is not None:

@@ -251,7 +235,7 @@ class Qwen2Layer(nn.Module):
         max_s,
         prefill_cache_indices,
     ):
-        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)
+        normed_hidden_states, residual = self.input_layernorm(hidden_states)

         # Self Attention
         attn_output = self.self_attn(

@@ -266,15 +250,13 @@ class Qwen2Layer(nn.Module):
             max_s,
             prefill_cache_indices,
         )
+        hidden_states = attn_output + residual

         # faster post attention rms norm
-        normed_attn_res_output, attn_res = self.post_attention_layernorm(
-            attn_output, res
-        )
-        mlp_output = self.mlp(normed_attn_res_output)
-
-        return mlp_output, attn_res
+        hidden_states, residual = self.post_attention_layernorm(hidden_states)
+        mlp_output = self.mlp(hidden_states)
+        hidden_states = mlp_output + residual
+        return hidden_states


 class Qwen2Model(torch.nn.Module):
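For readers unfamiliar with the pattern being refactored here: a minimal sketch (with an invented class name and plain LayerNorm instead of TGI's fused RMSNorm, which also returns the residual) of the simplified pre-norm flow the new Qwen2Layer follows, where each sub-block normalizes its input, applies its op, and adds the result back onto the running hidden state:

import torch
import torch.nn as nn

class PreNormBlock(nn.Module):
    """Toy transformer block mirroring the refactored flow: norm -> op -> add residual."""

    def __init__(self, hidden_size: int = 64):
        super().__init__()
        self.input_layernorm = nn.LayerNorm(hidden_size)
        self.post_attention_layernorm = nn.LayerNorm(hidden_size)
        # stand-ins for attention and the MLP
        self.self_attn = nn.Linear(hidden_size, hidden_size)
        self.mlp = nn.Linear(hidden_size, hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        residual = hidden_states
        attn_out = self.self_attn(self.input_layernorm(hidden_states))
        hidden_states = attn_out + residual          # first residual add

        residual = hidden_states
        mlp_out = self.mlp(self.post_attention_layernorm(hidden_states))
        return mlp_out + residual                    # second residual add

x = torch.randn(2, 8, 64)
print(PreNormBlock()(x).shape)  # torch.Size([2, 8, 64])

The practical effect of the refactor is that the layer now returns a single tensor instead of a (hidden_states, residual) pair, which simplifies both Qwen2Model below and the vision block further down.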
@@ -322,18 +304,15 @@ class Qwen2Model(torch.nn.Module):
     ) -> torch.Tensor:
         hidden_states = inputs_embeds

-        # flatten position ids from 2D to 1D
         cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
-            position_ids.flatten(), true_max_s, hidden_states.dtype
+            position_ids,
+            true_max_s,
+            hidden_states.dtype,
         )
-        # reshape back to 2D if the position_ids were 2D
-        if position_ids.size(0) != cos.size(0):
-            cos = cos.view(position_ids.size(0), position_ids.size(-1), -1).unsqueeze(2)
-            sin = sin.view(position_ids.size(0), position_ids.size(-1), -1).unsqueeze(2)

         residual = None
         for i, layer in enumerate(self.layers):
-            hidden_states, residual = layer(
+            hidden_states = layer(
                 hidden_states,
                 residual,
                 cos,

@@ -347,7 +326,7 @@ class Qwen2Model(torch.nn.Module):
             prefill_cache_indices,
         )

-        hidden_states, _ = self.norm(hidden_states, residual)
+        hidden_states, _ = self.norm(hidden_states)

         return hidden_states
@@ -222,12 +222,11 @@ class Qwen2VLVisionBlock(nn.Module):
     def forward(
         self, hidden_states, cu_seqlens, rotary_pos_emb, max_seqlen
     ) -> torch.Tensor:
-        hidden_states_post_norm1, res = self.norm1(hidden_states)
-        hidden_states = hidden_states + self.attn(
-            hidden_states_post_norm1, cu_seqlens, rotary_pos_emb, max_seqlen
-        )
-        hidden_states_post_norm2, res = self.norm2(hidden_states)
-        hidden_states = hidden_states + self.mlp(hidden_states_post_norm2)
+        norm1_out, residual = self.norm1(hidden_states)
+        attn_out = self.attn(norm1_out, cu_seqlens, rotary_pos_emb, max_seqlen)
+        hidden_states = attn_out + residual
+        norm2_out, residual = self.norm2(hidden_states)
+        hidden_states = hidden_states + self.mlp(norm2_out)
         return hidden_states
@@ -378,8 +377,12 @@ class Qwen2VLForConditionalGeneration(nn.Module):
         self.config = config
         config.vision_config.quantize = None
         config.vision_config.speculator = config.speculator
+        # set rope_scaling.type == "mrope" since AutoConfig.from_pretrained incorrectly
+        # returns rope_scaling.type == "default" for Qwen2-VL model at the moment
+        config.rope_scaling.update({"rope_type": "mrope"})
         self.hidden_size = config.hidden_size
         self.vision_start_token_id = config.vision_start_token_id
+        self.vision_end_token_id = config.vision_end_token_id
         self.image_token_id = config.image_token_id
         self.video_token_id = config.video_token_id
         self.spatial_merge_size = config.vision_config.spatial_merge_size
@@ -407,98 +410,88 @@ class Qwen2VLForConditionalGeneration(nn.Module):
         )
         self.device = weights.device

(The previous get_position_ids implementation, which took a 2D batch_input_ids tensor, returned a (3, batch, seq_len) tensor, and looped in Python over each sequence and each image, locating the next image token with GPU<->CPU .item() copies before building per-segment text and image position ids, is removed and replaced by the segment-based version below:)

    # based on https://github.com/huggingface/transformers/blob/e284c7e954abe12c34b50461c17f8115a0afe115/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1391
    # modified to first find segments then initialize position ids for each segment
    # Steps:
    #  locate all vision and text segments
    #  calculate `vision_segment_lengths` for each vision segment to be use as offset
    #  calculate `text_segment_lengths` for each text segment to be used as offset
    #  create position ids for each vision segment based on the image grid
    #  create position ids for each text segment
    #  combine all the position ids
    #  the final segment is the difference between the last vision segment and the end of the input
    #  combine all the position ids and reshape to (3, input_ids_len) then swap dimensions to (input_ids_len, 3)
    def get_position_ids(
        self,
        input_ids: torch.Tensor,
        image_grid_thw: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if image_grid_thw is None:
            return (
                torch.arange(input_ids.shape[0], device=input_ids.device)
                .unsqueeze(1)
                .repeat(1, 3)
            )

        spatial_merge_size = self.spatial_merge_size
        vision_start_token_id = self.vision_start_token_id
        vision_end_token_id = self.vision_end_token_id
        device = input_ids.device
        dtype = input_ids.dtype
        input_ids_len = input_ids.shape[0]

        vision_starts = torch.where(input_ids == vision_start_token_id)[0]
        vision_ends = torch.where(input_ids == vision_end_token_id)[0]
        vision_segments = torch.stack((vision_starts, vision_ends), dim=1)
        prev_vision_end = torch.cat(
            [torch.zeros(1, device=vision_ends.device, dtype=dtype), vision_ends[:-1]]
        )
        text_lengths_between_vision = vision_segments[:, 0] - prev_vision_end + 1
        vision_widths_max = torch.cat(
            [
                torch.zeros(1, device=image_grid_thw.device, dtype=dtype),
                image_grid_thw[:-1, 2] // spatial_merge_size,
            ]
        )
        vision_segment_lengths = vision_widths_max + text_lengths_between_vision
        vision_segment_lengths = vision_segment_lengths.cumsum(dim=0)
        text_segment_lengths = vision_segment_lengths - text_lengths_between_vision

        # create position ids for each vision segment based on the image grid
        llm_pos_ids_list = []
        for i, _ in enumerate(vision_segments):
            t, h, w = (
                image_grid_thw[i][0],
                image_grid_thw[i][1] // spatial_merge_size,
                image_grid_thw[i][2] // spatial_merge_size,
            )
            t_indices = torch.arange(t, device=device).repeat_interleave(h * w)
            h_indices = torch.arange(h, device=device).repeat_interleave(w).repeat(t)
            w_indices = torch.arange(w, device=device).repeat(t * h)
            image_position_ids = torch.stack([t_indices, h_indices, w_indices], dim=0)

            # offset by the position of the last vision segment
            im = image_position_ids + vision_segment_lengths[i]
            llm_pos_ids_list.append(im)

        # create position ids for each text segment
        text_ranges = [
            torch.arange(seq_len, device=device).view(1, -1).expand(3, -1)
            + text_segment_lengths[i]
            for i, seq_len in enumerate(text_lengths_between_vision)
        ]

        full_llm_pos_ids_list = [
            item for sublist in zip(text_ranges, llm_pos_ids_list) for item in sublist
        ]
        max_s = full_llm_pos_ids_list[-1].max() + 1
        final_text_len = input_ids_len - vision_ends[-1]
        if final_text_len > 0:
            m = torch.arange(final_text_len, device=device).view(1, -1).expand(3, -1)
            full_llm_pos_ids_list.append(m + max_s)

        position_ids = (
            torch.cat(full_llm_pos_ids_list, dim=1).reshape(3, -1).transpose(0, 1)
        )
        return position_ids
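A hedged, standalone illustration of the (seq_len, 3) layout this kind of scheme produces, using toy token ids and a made-up 1x2x2 merged patch grid rather than the real Qwen2-VL vocabulary, and with simplified offset bookkeeping (the actual method above also folds the image width into the offset of the following text segment):

import torch

# Toy sequence: text, text, <vision_start>, 4 image patches, <vision_end>, text, text
VISION_START, VISION_END, IMAGE = 100, 101, 102
input_ids = torch.tensor([1, 2, VISION_START, IMAGE, IMAGE, IMAGE, IMAGE, VISION_END, 3, 4])
grid_t, grid_h, grid_w = 1, 2, 2   # assumed merged patch grid for the single image

pos = []
next_pos = 0
i = 0
while i < len(input_ids):
    if input_ids[i] == IMAGE:
        # image block: temporal/height/width indices, offset by the running position
        t_idx = torch.arange(grid_t).repeat_interleave(grid_h * grid_w)
        h_idx = torch.arange(grid_h).repeat_interleave(grid_w).repeat(grid_t)
        w_idx = torch.arange(grid_w).repeat(grid_t * grid_h)
        block = torch.stack([t_idx, h_idx, w_idx], dim=1) + next_pos
        pos.append(block)
        next_pos = int(block.max()) + 1
        i += grid_t * grid_h * grid_w
    else:
        # text tokens (and the vision start/end markers): all three components share one index
        pos.append(torch.full((1, 3), next_pos))
        next_pos += 1
        i += 1

position_ids = torch.cat(pos, dim=0)   # shape (seq_len, 3), one (t, h, w) triple per token
print(position_ids)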
@@ -527,6 +520,7 @@ class Qwen2VLForConditionalGeneration(nn.Module):

         # apply the visual model to the pixel values if they are provided
         if pixel_values is not None and len(pixel_values) > 0:
+            pixel_values = pixel_values.to(inputs_embeds.dtype)
             if pixel_values is not None:
                 image_embeds = self.visual(
                     pixel_values, grid_thw=image_grid_thw

@@ -545,7 +539,6 @@ class Qwen2VLForConditionalGeneration(nn.Module):
             true_max_s=max_s,
             prefill_cache_indices=prefill_cache_indices,
         )
-        hidden_states, _ = self.norm(hidden_states)
         if lm_head_indices is not None:
             hidden_states = hidden_states[lm_head_indices]
         logits, speculative_logits = self.lm_head(hidden_states)
@@ -1401,6 +1401,13 @@ class FlashCausalLM(Model):
         if max_bs is None:
             input_ids = torch.zeros(bs, dtype=torch.int64, device=self.device)
             position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device)
+            config = getattr(self.model, "config", None)
+            rope_scaling = getattr(config, "rope_scaling", None) if config else None
+            if (  # mrope have position_ids per section, if so repeat n times
+                isinstance(rope_scaling, dict) and rope_scaling["rope_type"] == "mrope"
+            ):
+                n_sections = len(self.model.config.rope_scaling["mrope_section"])
+                position_ids = position_ids.unsqueeze(1).repeat(1, n_sections)
             slots = torch.arange(bs, dtype=torch.int64, device=self.device)
             input_lengths_tensor = (
                 torch.ones(bs, dtype=torch.int32, device=self.device) * max_s
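A small sketch of the shape change made in the warmup above (batch size and section count are arbitrary values here, not taken from the commit): for mrope models the dummy position ids go from (bs,) to (bs, n_sections) so the CUDA graph is captured with the same shape the real forward pass will use.

import torch

bs, n_sections = 4, 3                  # illustrative values only
position_ids = torch.zeros(bs, dtype=torch.int32)
position_ids = position_ids.unsqueeze(1).repeat(1, n_sections)
print(position_ids.shape)              # torch.Size([4, 3])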
@@ -1456,14 +1463,6 @@ class FlashCausalLM(Model):
         else:
             state = None

-        if (
-            hasattr(self.model, "config")
-            and hasattr(self.model.config, "model_type")
-            and self.model.config.model_type == "qwen2_vl"
-        ):
-            if position_ids.dim() == 1:
-                position_ids = self.model.get_position_ids(input_ids)
-
         graph = torch.cuda.CUDAGraph()
         self.cuda_graphs[bs] = {
             "input_ids": input_ids,
self.cuda_graphs[bs] = {
|
self.cuda_graphs[bs] = {
|
||||||
"input_ids": input_ids,
|
"input_ids": input_ids,
|
||||||
@ -2050,7 +2049,7 @@ class FlashCausalLM(Model):
|
|||||||
# instantly become of shape [BATCH_SIZE]
|
# instantly become of shape [BATCH_SIZE]
|
||||||
if prefill and finished_prefilling:
|
if prefill and finished_prefilling:
|
||||||
indices = batch.cu_seqlen_prefill[1:] - 1
|
indices = batch.cu_seqlen_prefill[1:] - 1
|
||||||
batch.position_ids = batch.position_ids[(..., indices)]
|
batch.position_ids = batch.position_ids[indices]
|
||||||
batch.slot_indices = batch.slot_indices[indices]
|
batch.slot_indices = batch.slot_indices[indices]
|
||||||
batch.adapter_meta.adapter_indices = batch.adapter_meta.adapter_indices[
|
batch.adapter_meta.adapter_indices = batch.adapter_meta.adapter_indices[
|
||||||
indices
|
indices
|
||||||