From 7ab99bc6b3ae44362658de6f4eaa41c8861f4c8b Mon Sep 17 00:00:00 2001 From: drbh Date: Wed, 22 Jan 2025 20:51:20 +0000 Subject: [PATCH] feat: refactor position ids in warmup and bump tests --- ...essed_tensors_w8a8_int_dynamic_weight.json | 422 +++++++++++++++++- ...rs_w8a8_int_dynamic_weight_all_params.json | 78 ++-- ..._tensors_w8a8_int_dynamic_weight_load.json | 80 ++-- ...pressed_tensors_w8a8_int_dynamic_weight.py | 9 +- .../models/flash_causal_lm.py | 20 +- 5 files changed, 499 insertions(+), 110 deletions(-) diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json index 2525f72c..7dbfc627 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json @@ -1,73 +1,469 @@ { "details": { "best_of_sequences": null, - "finish_reason": "length", - "generated_tokens": 10, + "finish_reason": "eos_token", + "generated_tokens": 76, "prefill": [], "seed": null, "tokens": [ { "id": 18183, - "logprob": -1.6669922, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.08959961, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.14685059, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.125, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.81640625, + "logprob": -0.8769531, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0013418198, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.16027832, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0016393661, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.4477539, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2802734, + "logprob": -1.2294922, "special": false, "text": " uses" + }, + { + "id": 29728, + "logprob": -0.66503906, + "special": false, + "text": " neural" + }, + { + "id": 14155, + "logprob": -0.02960205, + "special": false, + "text": " networks" + }, + { + "id": 311, + "logprob": -0.7236328, + "special": false, + "text": " to" + }, + { + "id": 3960, + "logprob": -1.1914062, + "special": false, + "text": " learn" + }, + { + "id": 504, + "logprob": -0.7089844, + "special": false, + "text": " from" + }, + { + "id": 821, + "logprob": -0.7729492, + "special": false, + "text": " data" + }, + { + "id": 13, + "logprob": -0.7836914, + "special": false, + "text": "." 
+ }, + { + "id": 1084, + "logprob": -0.9941406, + "special": false, + "text": " It" + }, + { + "id": 374, + "logprob": -0.52441406, + "special": false, + "text": " is" + }, + { + "id": 264, + "logprob": -0.9511719, + "special": false, + "text": " a" + }, + { + "id": 943, + "logprob": -0.8642578, + "special": false, + "text": " type" + }, + { + "id": 315, + "logprob": -0.00030231476, + "special": false, + "text": " of" + }, + { + "id": 20443, + "logprob": -0.14416504, + "special": false, + "text": " artificial" + }, + { + "id": 11229, + "logprob": -0.013824463, + "special": false, + "text": " intelligence" + }, + { + "id": 429, + "logprob": -0.18762207, + "special": false, + "text": " that" + }, + { + "id": 646, + "logprob": -1.0087891, + "special": false, + "text": " can" + }, + { + "id": 3960, + "logprob": -0.90234375, + "special": false, + "text": " learn" + }, + { + "id": 504, + "logprob": -0.54345703, + "special": false, + "text": " from" + }, + { + "id": 323, + "logprob": -1.0400391, + "special": false, + "text": " and" + }, + { + "id": 1281, + "logprob": -0.072509766, + "special": false, + "text": " make" + }, + { + "id": 19898, + "logprob": -0.16516113, + "special": false, + "text": " predictions" + }, + { + "id": 389, + "logprob": -0.4416504, + "special": false, + "text": " on" + }, + { + "id": 3460, + "logprob": -0.5385742, + "special": false, + "text": " large" + }, + { + "id": 14713, + "logprob": -0.4387207, + "special": false, + "text": " amounts" + }, + { + "id": 315, + "logprob": -0.00015091896, + "special": false, + "text": " of" + }, + { + "id": 821, + "logprob": -0.061431885, + "special": false, + "text": " data" + }, + { + "id": 13, + "logprob": -0.71875, + "special": false, + "text": "." + }, + { + "id": 18183, + "logprob": -0.23632812, + "special": false, + "text": " Deep" + }, + { + "id": 6832, + "logprob": -0.0017204285, + "special": false, + "text": " learning" + }, + { + "id": 374, + "logprob": -1.1738281, + "special": false, + "text": " is" + }, + { + "id": 1483, + "logprob": -0.61083984, + "special": false, + "text": " used" + }, + { + "id": 304, + "logprob": -0.035003662, + "special": false, + "text": " in" + }, + { + "id": 264, + "logprob": -0.118652344, + "special": false, + "text": " a" + }, + { + "id": 8045, + "logprob": -0.42016602, + "special": false, + "text": " variety" + }, + { + "id": 315, + "logprob": -1.6212463e-05, + "special": false, + "text": " of" + }, + { + "id": 8357, + "logprob": -0.1315918, + "special": false, + "text": " applications" + }, + { + "id": 11, + "logprob": -0.12915039, + "special": false, + "text": "," + }, + { + "id": 2670, + "logprob": -0.12463379, + "special": false, + "text": " including" + }, + { + "id": 2168, + "logprob": -0.37402344, + "special": false, + "text": " image" + }, + { + "id": 323, + "logprob": -0.1451416, + "special": false, + "text": " and" + }, + { + "id": 8806, + "logprob": -0.028869629, + "special": false, + "text": " speech" + }, + { + "id": 17843, + "logprob": -0.00024068356, + "special": false, + "text": " recognition" + }, + { + "id": 11, + "logprob": -0.00031018257, + "special": false, + "text": "," + }, + { + "id": 5810, + "logprob": -0.019821167, + "special": false, + "text": " natural" + }, + { + "id": 4128, + "logprob": -0.00012528896, + "special": false, + "text": " language" + }, + { + "id": 8692, + "logprob": -0.00089263916, + "special": false, + "text": " processing" + }, + { + "id": 11, + "logprob": -0.00073862076, + "special": false, + "text": "," + }, + { + "id": 323, + "logprob": 
-0.040161133, + "special": false, + "text": " and" + }, + { + "id": 38193, + "logprob": -0.4519043, + "special": false, + "text": " autonomous" + }, + { + "id": 11474, + "logprob": -0.39941406, + "special": false, + "text": " vehicles" + }, + { + "id": 13, + "logprob": -0.21166992, + "special": false, + "text": "." + }, + { + "id": 1084, + "logprob": -0.9082031, + "special": false, + "text": " It" + }, + { + "id": 374, + "logprob": -0.44213867, + "special": false, + "text": " is" + }, + { + "id": 264, + "logprob": -1.2177734, + "special": false, + "text": " a" + }, + { + "id": 18512, + "logprob": -0.5205078, + "special": false, + "text": " rapidly" + }, + { + "id": 7826, + "logprob": -0.15332031, + "special": false, + "text": " growing" + }, + { + "id": 2070, + "logprob": -0.0039978027, + "special": false, + "text": " field" + }, + { + "id": 448, + "logprob": -0.9091797, + "special": false, + "text": " with" + }, + { + "id": 1657, + "logprob": -0.17114258, + "special": false, + "text": " many" + }, + { + "id": 4650, + "logprob": -0.70703125, + "special": false, + "text": " potential" + }, + { + "id": 8357, + "logprob": -0.025131226, + "special": false, + "text": " applications" + }, + { + "id": 304, + "logprob": -0.6699219, + "special": false, + "text": " in" + }, + { + "id": 279, + "logprob": -0.35205078, + "special": false, + "text": " the" + }, + { + "id": 3853, + "logprob": -0.049194336, + "special": false, + "text": " future" + }, + { + "id": 13, + "logprob": -0.21972656, + "special": false, + "text": "." + }, + { + "id": 151643, + "logprob": -2.0019531, + "special": true, + "text": "<|endoftext|>" } ], "top_tokens": null }, - "generated_text": " Deep learning is a subset of machine learning that uses" + "generated_text": " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future." 
} diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json index 6b3f5092..2c840e67 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json @@ -7,67 +7,67 @@ "seed": 0, "tokens": [ { - "id": 1939, - "logprob": -2.2460938, + "id": 5267, + "logprob": -1.1464844, "special": false, - "text": "?\n\n" + "text": "?\n" }, { "id": 33464, - "logprob": 0.0, + "logprob": -0.83203125, "special": false, "text": "Deep" }, { "id": 20909, - "logprob": -0.48608398, + "logprob": -0.5625, "special": false, "text": " Learning" }, - { - "id": 4102, - "logprob": -2.265625, - "special": false, - "text": " " - }, - { - "id": 285, - "logprob": 0.0, - "special": false, - "text": "is" - }, - { - "id": 458, - "logprob": -0.6328125, - "special": false, - "text": " an" - }, - { - "id": 20443, - "logprob": -0.1796875, - "special": false, - "text": " artificial" - }, - { - "id": 11229, - "logprob": 0.0, - "special": false, - "text": " intelligence" - }, { "id": 320, - "logprob": -0.37695312, + "logprob": -2.1464844, "special": false, "text": " (" }, { - "id": 15469, + "id": 16524, "logprob": 0.0, "special": false, - "text": "AI" + "text": "DL" + }, + { + "id": 701, + "logprob": -2.2089844, + "special": false, + "text": ")," + }, + { + "id": 476, + "logprob": -0.27368164, + "special": false, + "text": " or" + }, + { + "id": 20443, + "logprob": -0.09442139, + "special": false, + "text": " artificial" + }, + { + "id": 29728, + "logprob": 0.0, + "special": false, + "text": " neural" + }, + { + "id": 14155, + "logprob": 0.0, + "special": false, + "text": " networks" } ], "top_tokens": null }, - "generated_text": "What is deep learning?\n\nDeep Learning is an artificial intelligence (AI" + "generated_text": "What is deep learning?\nDeep Learning (DL), or artificial neural networks" } diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json index 1fa4e33a..aee5698b 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json @@ -9,61 +9,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": -0.0011396408, 
"special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } @@ -82,61 +82,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } @@ -155,61 +155,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } @@ -228,61 +228,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, "special": false, 
"text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } diff --git a/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py b/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py index a0b0416b..17e12c22 100644 --- a/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py +++ b/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py @@ -27,15 +27,16 @@ async def test_compressed_tensors_w8a8_int_dynamic_weight( ): response = await compressed_tensors_w8a8_int_dynamic_weight.generate( "What is deep learning?", - max_new_tokens=10, + # prefer a longer response than the default, allow the llm to end generation + max_new_tokens=1000, decoder_input_details=True, ) assert ( response.generated_text - == " Deep learning is a subset of machine learning that uses" + == " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future." ) - assert response.details.generated_tokens == 10 + assert response.details.generated_tokens == 76 assert response == response_snapshot @@ -64,7 +65,7 @@ async def test_compressed_tensors_w8a8_int_dynamic_weight_all_params( assert response.details.generated_tokens == 10 assert ( response.generated_text - == "What is deep learning?\n\nDeep Learning is an artificial intelligence (AI" + == "What is deep learning?\nDeep Learning (DL), or artificial neural networks" ) assert response == response_snapshot diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 6bc3c2ca..a7d7f711 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -1400,7 +1400,11 @@ class FlashCausalLM(Model): cache_lengths = [0] * bs if max_bs is None: input_ids = torch.zeros(bs, dtype=torch.int64, device=self.device) - position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device) + if hasattr(self.model, "get_position_ids"): + # use model specific position ids for initialization + position_ids = self.model.get_position_ids(input_ids) + else: + position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device) slots = torch.arange(bs, dtype=torch.int64, device=self.device) input_lengths_tensor = ( torch.ones(bs, dtype=torch.int32, device=self.device) * max_s @@ -1427,7 +1431,7 @@ class FlashCausalLM(Model): "Cuda graphs should be generated in decreasing order size to reduce VRAM usage" ) input_ids = self.cuda_graphs[max_bs]["input_ids"][:bs] - position_ids = self.cuda_graphs[max_bs]["position_ids"][:bs] + position_ids = self.cuda_graphs[max_bs]["position_ids"][..., :bs] if ATTENTION == "flashinfer": block_tables = self.cuda_graphs[max_bs]["block_tables"][: bs * max_bt] else: @@ -1456,14 +1460,6 @@ class FlashCausalLM(Model): else: state = None - if ( - hasattr(self.model, "config") - and hasattr(self.model.config, "model_type") - and self.model.config.model_type == "qwen2_vl" - ): - if position_ids.dim() == 1: - position_ids = self.model.get_position_ids(input_ids) - graph = torch.cuda.CUDAGraph() self.cuda_graphs[bs] = { "input_ids": input_ids, 
@@ -1486,10 +1482,6 @@ class FlashCausalLM(Model): state=state, cache_lengths_tensor=cache_lengths_tensor, ): - # in the case of N dimensional position ids we need to slice the - # position ids to match the input_ids size for cuda graphs warmup - position_ids = position_ids[..., : input_ids.shape[0]] - seqlen = Seqlen( input_lengths=input_lengths_tensor, cache_lengths=cache_lengths_tensor,
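
Note on the warmup change above: the qwen2_vl-specific branch is replaced by a generic hook. During CUDA graph warmup, if the model exposes get_position_ids, that method supplies the (possibly multi-dimensional) position ids up front, and tensors reused from larger graphs are sliced with [..., :bs] so any leading rope dimensions are preserved. Below is a minimal, illustrative sketch of such a hook and the slicing behaviour; the DummyMRopeModel class, the (3, seq_len) shape, and the method body are assumptions for illustration, not taken from this patch.

    import torch


    class DummyMRopeModel(torch.nn.Module):
        """Illustrative model exposing the get_position_ids hook assumed above."""

        def get_position_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
            # Assumed mrope-style layout: one row per rope section
            # (e.g. temporal/height/width), i.e. shape (3, seq_len)
            # instead of the usual (seq_len,).
            seq_len = input_ids.shape[0]
            pos = torch.arange(seq_len, dtype=torch.int32, device=input_ids.device)
            return pos.unsqueeze(0).expand(3, seq_len).contiguous()


    # Warmup-time usage mirroring the patched flash_causal_lm.py logic:
    model = DummyMRopeModel()
    bs = 4
    input_ids = torch.zeros(bs, dtype=torch.int64)
    if hasattr(model, "get_position_ids"):
        position_ids = model.get_position_ids(input_ids)  # shape (3, bs) here
    else:
        position_ids = torch.zeros(bs, dtype=torch.int32)

    # Slicing with [..., :bs] keeps any leading rope dimensions intact,
    # so the same expression works for both 1-D and N-D position ids.
    smaller = position_ids[..., :2]
    assert smaller.shape[-1] == 2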