Working state, except for the all_params test (??)

2025-09-11 12:24:53 +00:00 · 2023-12-01 18:49:01 +00:00 · 2023-12-01 18:49:01 +00:00 · e7e07342bd
commit e7e07342bd
parent 657ccd8276
6 changed files with 518 additions and 344 deletions
--- a/integration-tests/models/snapshots/test_flash_medusa/test_flash_medusa_all_params.json
+++ b/integration-tests/models/snapshots/test_flash_medusa/test_flash_medusa_all_params.json
@ -1,8 +1,8 @@
 {
  "details": {
    "best_of_sequences": null,
-    "finish_reason": "stop_sequence",
+    "finish_reason": "length",
-    "generated_tokens": 5,
+    "generated_tokens": 10,
    "prefill": [
      {
        "id": 1,
@ -10,49 +10,89 @@
        "text": "<s>"
      },
      {
-        "id": 4321,
+        "id": 338,
-        "logprob": -10.0625,
+        "logprob": -10.0078125,
-        "text": "Test"
+        "text": "is"
      },
      {
-        "id": 2009,
+        "id": 21784,
-        "logprob": -12.28125,
+        "logprob": -15.515625,
-        "text": "request"
+        "text": "Deep"
      },
      {
        "id": 29257,
        "logprob": -2.8847656,
        "text": "Learning"
      },
      {
        "id": 29973,
        "logprob": -4.140625,
        "text": "?"
      }
    ],
    "seed": 0,
    "tokens": [
      {
-        "id": 5229,
+        "id": 13,
-        "logprob": -1.7587891,
+        "logprob": -1.1582031,
        "special": false,
-        "text": " failed"
+        "text": "\n"
      },
      {
-        "id": 363,
+        "id": 2772,
        "logprob": -0.5175781,
        "special": false,
        "text": " for"
      },
      {
        "id": 1404,
        "logprob": 0.0,
        "special": false,
-        "text": " user"
+        "text": "De"
      },
      {
-        "id": 376,
+        "id": 1022,
        "logprob": 0.0,
        "special": false,
-        "text": " \""
+        "text": "ep"
      },
      {
-        "id": 1688,
+        "id": 6509,
-        "logprob": -0.20422363,
+        "logprob": 0.0,
        "special": false,
-        "text": "test"
+        "text": " learning"
      },
      {
        "id": 313,
        "logprob": -1.0712891,
        "special": false,
        "text": " ("
      },
      {
        "id": 15189,
        "logprob": -0.7578125,
        "special": false,
        "text": "also"
      },
      {
        "id": 2998,
        "logprob": 0.0,
        "special": false,
        "text": " known"
      },
      {
        "id": 408,
        "logprob": 0.0,
        "special": false,
        "text": " as"
      },
      {
        "id": 6483,
        "logprob": 0.0,
        "special": false,
        "text": " deep"
      },
      {
        "id": 19677,
        "logprob": 0.0,
        "special": false,
        "text": " neural"
      }
    ]
  },
-  "generated_text": "Test request failed for user \"test"
+  "generated_text": "What is Deep Learning?\nDeep learning (also known as deep neural"
 }
--- a/integration-tests/models/snapshots/test_flash_medusa/test_flash_medusa_load.json
+++ b/integration-tests/models/snapshots/test_flash_medusa/test_flash_medusa_load.json
@ -11,81 +11,108 @@
          "text": "<s>"
        },
        {
-          "id": 4321,
+          "id": 1724,
-          "logprob": -10.0625,
+          "logprob": -10.734375,
-          "text": "Test"
+          "text": "What"
        },
        {
-          "id": 2009,
+          "id": 338,
-          "logprob": -12.28125,
+          "logprob": -1.5488281,
-          "text": "request"
+          "text": "is"
        },
        {
          "id": 21784,
          "logprob": -9.2890625,
          "text": "Deep"
        },
        {
          "id": 29257,
          "logprob": -1.2753906,
          "text": "Learning"
        },
        {
          "id": 29973,
          "logprob": -0.48046875,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 363,
          "logprob": -2.0878906,
          "special": false,
          "text": " for"
        },
        {
          "id": 278,
          "logprob": -3.4082031,
          "special": false,
          "text": " the"
        },
        {
          "id": 376,
          "logprob": -3.8457031,
          "special": false,
          "text": " \""
        },
        {
          "id": 2577,
          "logprob": -3.5605469,
          "special": false,
          "text": "Get"
        },
        {
          "id": 599,
          "logprob": -3.4707031,
          "special": false,
          "text": " all"
        },
        {
          "id": 4160,
          "logprob": -3.2421875,
          "special": false,
          "text": " users"
        },
        {
          "id": 29908,
          "logprob": -0.49072266,
          "special": false,
          "text": "\""
        },
        {
          "id": 16248,
          "logprob": -1.2353516,
          "special": false,
          "text": " endpoint"
        },
        {
          "id": 29889,
          "logprob": -0.8833008,
          "special": false,
          "text": "."
        },
        {
          "id": 13,
-          "logprob": -0.42089844,
+          "logprob": -1.1845703,
          "special": false,
          "text": "\n"
        },
        {
          "id": 2772,
          "logprob": -0.5727539,
          "special": false,
          "text": "De"
        },
        {
          "id": 1022,
          "logprob": -0.00010967255,
          "special": false,
          "text": "ep"
        },
        {
          "id": 6509,
          "logprob": -0.1239624,
          "special": false,
          "text": " learning"
        },
        {
          "id": 338,
          "logprob": -0.04510498,
          "special": false,
          "text": " is"
        },
        {
          "id": 263,
          "logprob": -0.018295288,
          "special": false,
          "text": " a"
        },
        {
          "id": 11306,
          "logprob": -0.45922852,
          "special": false,
          "text": " subset"
        },
        {
          "id": 310,
          "logprob": -0.00020992756,
          "special": false,
          "text": " of"
        },
        {
          "id": 4933,
          "logprob": -0.0046539307,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6509,
          "logprob": -0.00025844574,
          "special": false,
          "text": " learning"
        },
        {
          "id": 393,
          "logprob": -0.09185791,
          "special": false,
          "text": " that"
        },
        {
          "id": 20789,
          "logprob": -0.4951172,
          "special": false,
          "text": " involves"
        }
      ]
    },
-    "generated_text": " for the \"Get all users\" endpoint.\n"
+    "generated_text": "ep learning is a subset of machine learning that involves"
  },
  {
    "details": {
@ -99,81 +126,108 @@
          "text": "<s>"
        },
        {
-          "id": 4321,
+          "id": 1724,
-          "logprob": -10.0625,
+          "logprob": -10.734375,
-          "text": "Test"
+          "text": "What"
        },
        {
-          "id": 2009,
+          "id": 338,
-          "logprob": -12.28125,
+          "logprob": -1.5488281,
-          "text": "request"
+          "text": "is"
        },
        {
          "id": 21784,
          "logprob": -9.2890625,
          "text": "Deep"
        },
        {
          "id": 29257,
          "logprob": -1.2724609,
          "text": "Learning"
        },
        {
          "id": 29973,
          "logprob": -0.47729492,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 363,
          "logprob": -2.0878906,
          "special": false,
          "text": " for"
        },
        {
          "id": 278,
          "logprob": -3.4082031,
          "special": false,
          "text": " the"
        },
        {
          "id": 376,
          "logprob": -3.8457031,
          "special": false,
          "text": " \""
        },
        {
          "id": 2577,
          "logprob": -3.5625,
          "special": false,
          "text": "Get"
        },
        {
          "id": 599,
          "logprob": -3.4726562,
          "special": false,
          "text": " all"
        },
        {
          "id": 4160,
          "logprob": -3.2382812,
          "special": false,
          "text": " users"
        },
        {
          "id": 29908,
          "logprob": -0.49047852,
          "special": false,
          "text": "\""
        },
        {
          "id": 16248,
          "logprob": -1.2412109,
          "special": false,
          "text": " endpoint"
        },
        {
          "id": 29889,
          "logprob": -0.87402344,
          "special": false,
          "text": "."
        },
        {
          "id": 13,
-          "logprob": -0.41723633,
+          "logprob": -1.1826172,
          "special": false,
          "text": "\n"
        },
        {
          "id": 2772,
          "logprob": -0.56689453,
          "special": false,
          "text": "De"
        },
        {
          "id": 1022,
          "logprob": -0.000108003616,
          "special": false,
          "text": "ep"
        },
        {
          "id": 6509,
          "logprob": -0.1239624,
          "special": false,
          "text": " learning"
        },
        {
          "id": 338,
          "logprob": -0.044433594,
          "special": false,
          "text": " is"
        },
        {
          "id": 263,
          "logprob": -0.018295288,
          "special": false,
          "text": " a"
        },
        {
          "id": 11306,
          "logprob": -0.45922852,
          "special": false,
          "text": " subset"
        },
        {
          "id": 310,
          "logprob": -0.0002104044,
          "special": false,
          "text": " of"
        },
        {
          "id": 4933,
          "logprob": -0.004711151,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6509,
          "logprob": -0.00025892258,
          "special": false,
          "text": " learning"
        },
        {
          "id": 393,
          "logprob": -0.091918945,
          "special": false,
          "text": " that"
        },
        {
          "id": 20789,
          "logprob": -0.50097656,
          "special": false,
          "text": " involves"
        }
      ]
    },
-    "generated_text": " for the \"Get all users\" endpoint.\n"
+    "generated_text": "ep learning is a subset of machine learning that involves"
  },
  {
    "details": {
@ -187,81 +241,108 @@
          "text": "<s>"
        },
        {
-          "id": 4321,
+          "id": 1724,
-          "logprob": -10.0625,
+          "logprob": -10.734375,
-          "text": "Test"
+          "text": "What"
        },
        {
-          "id": 2009,
+          "id": 338,
-          "logprob": -12.28125,
+          "logprob": -1.5488281,
-          "text": "request"
+          "text": "is"
        },
        {
          "id": 21784,
          "logprob": -9.2890625,
          "text": "Deep"
        },
        {
          "id": 29257,
          "logprob": -1.2724609,
          "text": "Learning"
        },
        {
          "id": 29973,
          "logprob": -0.47729492,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 363,
          "logprob": -2.0878906,
          "special": false,
          "text": " for"
        },
        {
          "id": 278,
          "logprob": -3.4082031,
          "special": false,
          "text": " the"
        },
        {
          "id": 376,
          "logprob": -3.8457031,
          "special": false,
          "text": " \""
        },
        {
          "id": 2577,
          "logprob": -3.5605469,
          "special": false,
          "text": "Get"
        },
        {
          "id": 599,
          "logprob": -3.4707031,
          "special": false,
          "text": " all"
        },
        {
          "id": 4160,
          "logprob": -3.2421875,
          "special": false,
          "text": " users"
        },
        {
          "id": 29908,
          "logprob": -0.49072266,
          "special": false,
          "text": "\""
        },
        {
          "id": 16248,
          "logprob": -1.2353516,
          "special": false,
          "text": " endpoint"
        },
        {
          "id": 29889,
          "logprob": -0.8833008,
          "special": false,
          "text": "."
        },
        {
          "id": 13,
-          "logprob": -0.42089844,
+          "logprob": -1.1826172,
          "special": false,
          "text": "\n"
        },
        {
          "id": 2772,
          "logprob": -0.56689453,
          "special": false,
          "text": "De"
        },
        {
          "id": 1022,
          "logprob": -0.000108003616,
          "special": false,
          "text": "ep"
        },
        {
          "id": 6509,
          "logprob": -0.1239624,
          "special": false,
          "text": " learning"
        },
        {
          "id": 338,
          "logprob": -0.044433594,
          "special": false,
          "text": " is"
        },
        {
          "id": 263,
          "logprob": -0.018295288,
          "special": false,
          "text": " a"
        },
        {
          "id": 11306,
          "logprob": -0.45922852,
          "special": false,
          "text": " subset"
        },
        {
          "id": 310,
          "logprob": -0.0002104044,
          "special": false,
          "text": " of"
        },
        {
          "id": 4933,
          "logprob": -0.004711151,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6509,
          "logprob": -0.00025892258,
          "special": false,
          "text": " learning"
        },
        {
          "id": 393,
          "logprob": -0.091918945,
          "special": false,
          "text": " that"
        },
        {
          "id": 20789,
          "logprob": -0.50097656,
          "special": false,
          "text": " involves"
        }
      ]
    },
-    "generated_text": " for the \"Get all users\" endpoint.\n"
+    "generated_text": "ep learning is a subset of machine learning that involves"
  },
  {
    "details": {
@ -275,80 +356,107 @@
          "text": "<s>"
        },
        {
-          "id": 4321,
+          "id": 1724,
-          "logprob": -10.0625,
+          "logprob": -10.734375,
-          "text": "Test"
+          "text": "What"
        },
        {
-          "id": 2009,
+          "id": 338,
-          "logprob": -12.28125,
+          "logprob": -1.5488281,
-          "text": "request"
+          "text": "is"
        },
        {
          "id": 21784,
          "logprob": -9.2890625,
          "text": "Deep"
        },
        {
          "id": 29257,
          "logprob": -1.2724609,
          "text": "Learning"
        },
        {
          "id": 29973,
          "logprob": -0.47729492,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 363,
          "logprob": -2.0878906,
          "special": false,
          "text": " for"
        },
        {
          "id": 278,
          "logprob": -3.4082031,
          "special": false,
          "text": " the"
        },
        {
          "id": 376,
          "logprob": -3.8457031,
          "special": false,
          "text": " \""
        },
        {
          "id": 2577,
          "logprob": -3.5605469,
          "special": false,
          "text": "Get"
        },
        {
          "id": 599,
          "logprob": -3.4707031,
          "special": false,
          "text": " all"
        },
        {
          "id": 4160,
          "logprob": -3.2421875,
          "special": false,
          "text": " users"
        },
        {
          "id": 29908,
          "logprob": -0.49072266,
          "special": false,
          "text": "\""
        },
        {
          "id": 16248,
          "logprob": -1.2353516,
          "special": false,
          "text": " endpoint"
        },
        {
          "id": 29889,
          "logprob": -0.8833008,
          "special": false,
          "text": "."
        },
        {
          "id": 13,
-          "logprob": -0.42089844,
+          "logprob": -1.1826172,
          "special": false,
          "text": "\n"
        },
        {
          "id": 2772,
          "logprob": -0.56689453,
          "special": false,
          "text": "De"
        },
        {
          "id": 1022,
          "logprob": -0.000108003616,
          "special": false,
          "text": "ep"
        },
        {
          "id": 6509,
          "logprob": -0.1239624,
          "special": false,
          "text": " learning"
        },
        {
          "id": 338,
          "logprob": -0.044433594,
          "special": false,
          "text": " is"
        },
        {
          "id": 263,
          "logprob": -0.018295288,
          "special": false,
          "text": " a"
        },
        {
          "id": 11306,
          "logprob": -0.45922852,
          "special": false,
          "text": " subset"
        },
        {
          "id": 310,
          "logprob": -0.0002104044,
          "special": false,
          "text": " of"
        },
        {
          "id": 4933,
          "logprob": -0.004711151,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6509,
          "logprob": -0.00025892258,
          "special": false,
          "text": " learning"
        },
        {
          "id": 393,
          "logprob": -0.091918945,
          "special": false,
          "text": " that"
        },
        {
          "id": 20789,
          "logprob": -0.50097656,
          "special": false,
          "text": " involves"
        }
      ]
    },
-    "generated_text": " for the \"Get all users\" endpoint.\n"
+    "generated_text": "ep learning is a subset of machine learning that involves"
  }
 ]
--- a/integration-tests/models/snapshots/test_flash_medusa/test_flash_medusa_simple.json
+++ b/integration-tests/models/snapshots/test_flash_medusa/test_flash_medusa_simple.json
@ -10,79 +10,106 @@
        "text": "<s>"
      },
      {
-        "id": 4321,
+        "id": 1724,
-        "logprob": -10.0625,
+        "logprob": -10.734375,
-        "text": "Test"
+        "text": "What"
      },
      {
-        "id": 2009,
+        "id": 338,
-        "logprob": -12.28125,
+        "logprob": -1.5488281,
-        "text": "request"
+        "text": "is"
      },
      {
        "id": 21784,
        "logprob": -9.2890625,
        "text": "Deep"
      },
      {
        "id": 29257,
        "logprob": -1.2753906,
        "text": "Learning"
      },
      {
        "id": 29973,
        "logprob": -0.48046875,
        "text": "?"
      }
    ],
    "seed": null,
    "tokens": [
      {
        "id": 363,
        "logprob": -2.0878906,
        "special": false,
        "text": " for"
      },
      {
        "id": 278,
        "logprob": -3.4121094,
        "special": false,
        "text": " the"
      },
      {
        "id": 376,
        "logprob": -3.8457031,
        "special": false,
        "text": " \""
      },
      {
        "id": 2577,
        "logprob": -3.5566406,
        "special": false,
        "text": "Get"
      },
      {
        "id": 599,
        "logprob": -3.4746094,
        "special": false,
        "text": " all"
      },
      {
        "id": 4160,
        "logprob": -3.2363281,
        "special": false,
        "text": " users"
      },
      {
        "id": 29908,
        "logprob": -0.49023438,
        "special": false,
        "text": "\""
      },
      {
        "id": 16248,
        "logprob": -1.2402344,
        "special": false,
        "text": " endpoint"
      },
      {
        "id": 29889,
        "logprob": -0.88134766,
        "special": false,
        "text": "."
      },
      {
        "id": 13,
-        "logprob": -0.41870117,
+        "logprob": -1.1845703,
        "special": false,
        "text": "\n"
      },
      {
        "id": 2772,
        "logprob": -0.5727539,
        "special": false,
        "text": "De"
      },
      {
        "id": 1022,
        "logprob": -0.000108122826,
        "special": false,
        "text": "ep"
      },
      {
        "id": 6509,
        "logprob": -0.1239624,
        "special": false,
        "text": " learning"
      },
      {
        "id": 338,
        "logprob": -0.044433594,
        "special": false,
        "text": " is"
      },
      {
        "id": 263,
        "logprob": -0.01852417,
        "special": false,
        "text": " a"
      },
      {
        "id": 11306,
        "logprob": -0.45922852,
        "special": false,
        "text": " subset"
      },
      {
        "id": 310,
        "logprob": -0.0002104044,
        "special": false,
        "text": " of"
      },
      {
        "id": 4933,
        "logprob": -0.004787445,
        "special": false,
        "text": " machine"
      },
      {
        "id": 6509,
        "logprob": -0.00026226044,
        "special": false,
        "text": " learning"
      },
      {
        "id": 393,
        "logprob": -0.09161377,
        "special": false,
        "text": " that"
      },
      {
        "id": 20789,
        "logprob": -0.49560547,
        "special": false,
        "text": " involves"
      }
    ]
  },
-  "generated_text": " for the \"Get all users\" endpoint.\n"
+  "generated_text": "ep learning is a subset of machine learning that involves"
 }
--- a/integration-tests/models/test_flash_medusa.py
+++ b/integration-tests/models/test_flash_medusa.py
@ -17,7 +17,7 @@ async def flash_medusa(flash_medusa_handle):
@pytest.mark.private
 async def test_flash_medusa_simple(flash_medusa, response_snapshot):
    response = await flash_medusa.generate(
-        "Test request", max_new_tokens=10, decoder_input_details=True
+        "What is Deep Learning?", max_new_tokens=10, decoder_input_details=True
    )
    assert response.details.generated_tokens == 10
@ -28,7 +28,7 @@ async def test_flash_medusa_simple(flash_medusa, response_snapshot):
@pytest.mark.private
 async def test_flash_medusa_all_params(flash_medusa, response_snapshot):
    response = await flash_medusa.generate(
-        "Test request",
+        "What is Deep Learning?",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
@ -43,17 +43,17 @@ async def test_flash_medusa_all_params(flash_medusa, response_snapshot):
        seed=0,
    )
-    assert response.details.generated_tokens == 5
+    assert response.details.generated_tokens == 10
    assert response == response_snapshot
@pytest.mark.asyncio
@pytest.mark.private
 async def test_flash_medusa_load(flash_medusa, generate_load, response_snapshot):
-    responses = await generate_load(flash_medusa, "Test request", max_new_tokens=10, n=4)
+    responses = await generate_load(flash_medusa, "What is Deep Learning?", max_new_tokens=10, n=4)
    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses]), f"{[r.generated_text for r in responses]}"
-    assert responses[0].generated_text == ' for the "Get all users" endpoint.\n' 
+    assert responses[0].generated_text == 'ep learning is a subset of machine learning that involves' 
    assert responses == response_snapshot
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@ -232,7 +232,7 @@ class FlashCausalLMBatch(Batch):
            cumulative_max_length += total_tokens
            max_seqlen = max(max_seqlen, input_length)
            max_blocks = max(max_blocks, needed_blocks)
-            max_length = max(max_length, input_length + max_new_tokens)
+            max_length = max(max_length, input_length + max_new_tokens + speculative_length)
        next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
            next_token_chooser_parameters, dtype, device
@ -479,6 +479,7 @@ class FlashCausalLMBatch(Batch):
        max_blocks = 0
        max_length = 0
        max_seqlen = 0
        speculative_length = 0 if batches[0].speculative_ids is None else batches[0].speculative_ids.shape[1]
        for b in batches:
            total_batch_size += len(b)
            total_slots += len(b.slots)
@ -489,6 +490,7 @@ class FlashCausalLMBatch(Batch):
                max_length,
                max(
                    input_length
                    + speculative_length
                    + stopping_criteria.max_new_tokens
                    - stopping_criteria.current_tokens
                    for input_length, stopping_criteria in zip(
--- a/server/text_generation_server/utils/tokens.py
+++ b/server/text_generation_server/utils/tokens.py
@ -16,7 +16,6 @@ from text_generation_server.utils.logits_process import (
 from text_generation_server.utils.watermark import WatermarkLogitsProcessor
 from transformers import PreTrainedTokenizerBase, RepetitionPenaltyLogitsProcessor
 class NextTokenChooser:
    def __init__(
        self,
@ -289,8 +288,6 @@ class HeterogeneousNextTokenChooser:
                        indices.append(index)
                    else:
                        break
                # if accepted > 1:
                #     import ipdb;ipdb.set_trace()
                accepted_ids.append(accepted)
            accepted_ids = torch.tensor(accepted_ids, device=input_ids.device, dtype=input_ids.dtype)
            next_ids = next_ids[indices]