From a78b6fd1e89ae033ea4815a13511b8a9f23e1071 Mon Sep 17 00:00:00 2001
From: Nicolas Patry <patry.nicolas@protonmail.com>
Date: Wed, 4 Dec 2024 21:34:46 +0100
Subject: [PATCH] Fixing a few tests.

---
 docs/source/reference/launcher.md             |  10 +
 .../models/__snapshots__/test.py              |  22 +
 .../test_bloom_560m/test_bloom_560m.json      |  58 ++-
 .../test_bloom_560m_all_params.json           |  28 +-
 .../test_bloom_560m/test_bloom_560m_load.json | 232 ++++++++++-
 .../test_bloom_560m_sharded.json              |  58 ++-
 .../test_bloom_560m_sharded_load.json         | 232 ++++++++++-
 .../test_flash_llama_awq_load_sharded.json    | 132 +-----
 .../test_idefics/test_idefics.json            |  98 ++++-
 .../test_idefics/test_idefics_load.json       | 392 +++++++++++++++++-
 .../test_mamba/test_mamba_all_params.json     |  28 +-
 .../test_mamba/test_mamba_load.json           | 112 ++++-
 .../__snapshots__/test_mpt/test_mpt.json      |  28 +-
 .../__snapshots__/test_mpt/test_mpt_load.json | 112 ++++-
 .../test_mt0_base/test_mt0_base.json          |   8 +-
 .../test_mt0_base_all_params.json             |   8 +-
 .../test_mt0_base/test_mt0_base_load.json     |  32 +-
 .../test_t5_sharded/test_t5_sharded.json      |   8 +-
 .../test_t5_sharded/test_t5_sharded_load.json |  32 +-
 launcher/src/main.rs                          |  13 +-
 20 files changed, 1476 insertions(+), 167 deletions(-)
 create mode 100644 integration-tests/models/__snapshots__/test.py

diff --git a/docs/source/reference/launcher.md b/docs/source/reference/launcher.md
index 90246aa4..159b22e7 100644
--- a/docs/source/reference/launcher.md
+++ b/docs/source/reference/launcher.md
@@ -467,6 +467,16 @@ Options:
           [env: PAYLOAD_LIMIT=]
           [default: 2000000]
 
+```
+## ENABLE_PREFILL_LOGPROBS
+```shell
+      --enable-prefill-logprobs
+          Enables prefill logprobs
+          
+          Logprobs in the prompt are deactivated by default because they consume a large amount of VRAM (especially for long prompts). Using this flag reallows users to ask for them.
+          
+          [env: ENABLE_PREFILL_LOGPROBS=]
+
 ```
 ## HELP
 ```shell
diff --git a/integration-tests/models/__snapshots__/test.py b/integration-tests/models/__snapshots__/test.py
new file mode 100644
index 00000000..f6c9a6a9
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test.py
@@ -0,0 +1,22 @@
+import os
+import json
+
+# Reset `details.prefill` to [] in every JSON file below the current directory.
+for root, _dirs, files in os.walk("."):
+    for filename in files:
+        if filename.endswith(".json"):
+            path = os.path.join(root, filename)
+            # Snapshots hold non-ASCII text; don't rely on the locale's default encoding.
+            with open(path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+
+            print(path)
+            try:
+                # `*_load.json` files are treated as a list of responses, others as one.
+                for entry in data if filename.endswith("_load.json") else [data]:
+                    entry["details"]["prefill"] = []
+            except (KeyError, TypeError):
+                continue  # unexpected shape: leave the file untouched, not half-edited
+
+            with open(path, "w", encoding="utf-8") as f:
+                json.dump(data, f, indent=2, ensure_ascii=False)
diff --git a/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m.json b/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m.json
index ac276749..54c66408 100644
--- a/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m.json
+++ b/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m.json
@@ -3,7 +3,63 @@
     "best_of_sequences": null,
     "finish_reason": "length",
     "generated_tokens": 10,
-    "prefill": [],
+    "prefill": [
+      {
+        "id": 17934,
+        "logprob": null,
+        "text": "Pour"
+      },
+      {
+        "id": 49833,
+        "logprob": -10.5703125,
+        "text": " dég"
+      },
+      {
+        "id": 21543,
+        "logprob": -0.14746094,
+        "text": "uster"
+      },
+      {
+        "id": 447,
+        "logprob": -1.9277344,
+        "text": " un"
+      },
+      {
+        "id": 46341,
+        "logprob": -15.421875,
+        "text": " ort"
+      },
+      {
+        "id": 35567,
+        "logprob": -7.5820312,
+        "text": "olan"
+      },
+      {
+        "id": 15,
+        "logprob": -1.4013672,
+        "text": ","
+      },
+      {
+        "id": 1669,
+        "logprob": -1.5595703,
+        "text": " il"
+      },
+      {
+        "id": 11580,
+        "logprob": -0.9428711,
+        "text": " faut"
+      },
+      {
+        "id": 3913,
+        "logprob": -3.703125,
+        "text": " tout"
+      },
+      {
+        "id": 39261,
+        "logprob": -1.7763672,
+        "text": " d'abord"
+      }
+    ],
     "seed": 0,
     "tokens": [
       {
diff --git a/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m_all_params.json b/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m_all_params.json
index e538ba35..9422f27f 100644
--- a/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m_all_params.json
+++ b/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m_all_params.json
@@ -3,7 +3,33 @@
     "best_of_sequences": null,
     "finish_reason": "length",
     "generated_tokens": 10,
-    "prefill": [],
+    "prefill": [
+      {
+        "id": 15,
+        "logprob": null,
+        "text": ","
+      },
+      {
+        "id": 1669,
+        "logprob": -5.4453125,
+        "text": " il"
+      },
+      {
+        "id": 11580,
+        "logprob": -2.3378906,
+        "text": " faut"
+      },
+      {
+        "id": 3913,
+        "logprob": -4.3320312,
+        "text": " tout"
+      },
+      {
+        "id": 39261,
+        "logprob": -2.9160156,
+        "text": " d'abord"
+      }
+    ],
     "seed": 0,
     "tokens": [
       {
diff --git a/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m_load.json b/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m_load.json
index 59c62253..0a86bef8 100644
--- a/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m_load.json
+++ b/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m_load.json
@@ -4,7 +4,63 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 17934,
+          "logprob": null,
+          "text": "Pour"
+        },
+        {
+          "id": 49833,
+          "logprob": -10.5625,
+          "text": " dég"
+        },
+        {
+          "id": 21543,
+          "logprob": -0.14770508,
+          "text": "uster"
+        },
+        {
+          "id": 447,
+          "logprob": -1.9287109,
+          "text": " un"
+        },
+        {
+          "id": 46341,
+          "logprob": -15.4609375,
+          "text": " ort"
+        },
+        {
+          "id": 35567,
+          "logprob": -7.5585938,
+          "text": "olan"
+        },
+        {
+          "id": 15,
+          "logprob": -1.4003906,
+          "text": ","
+        },
+        {
+          "id": 1669,
+          "logprob": -1.5673828,
+          "text": " il"
+        },
+        {
+          "id": 11580,
+          "logprob": -0.94628906,
+          "text": " faut"
+        },
+        {
+          "id": 3913,
+          "logprob": -3.703125,
+          "text": " tout"
+        },
+        {
+          "id": 39261,
+          "logprob": -1.5732422,
+          "text": " d'abord"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -76,7 +132,63 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 17934,
+          "logprob": null,
+          "text": "Pour"
+        },
+        {
+          "id": 49833,
+          "logprob": -10.53125,
+          "text": " dég"
+        },
+        {
+          "id": 21543,
+          "logprob": -0.14770508,
+          "text": "uster"
+        },
+        {
+          "id": 447,
+          "logprob": -1.9287109,
+          "text": " un"
+        },
+        {
+          "id": 46341,
+          "logprob": -15.4140625,
+          "text": " ort"
+        },
+        {
+          "id": 35567,
+          "logprob": -7.5234375,
+          "text": "olan"
+        },
+        {
+          "id": 15,
+          "logprob": -1.3613281,
+          "text": ","
+        },
+        {
+          "id": 1669,
+          "logprob": -1.5458984,
+          "text": " il"
+        },
+        {
+          "id": 11580,
+          "logprob": -0.94189453,
+          "text": " faut"
+        },
+        {
+          "id": 3913,
+          "logprob": -3.7011719,
+          "text": " tout"
+        },
+        {
+          "id": 39261,
+          "logprob": -1.5732422,
+          "text": " d'abord"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -148,7 +260,63 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 17934,
+          "logprob": null,
+          "text": "Pour"
+        },
+        {
+          "id": 49833,
+          "logprob": -10.53125,
+          "text": " dég"
+        },
+        {
+          "id": 21543,
+          "logprob": -0.14770508,
+          "text": "uster"
+        },
+        {
+          "id": 447,
+          "logprob": -1.9287109,
+          "text": " un"
+        },
+        {
+          "id": 46341,
+          "logprob": -15.4140625,
+          "text": " ort"
+        },
+        {
+          "id": 35567,
+          "logprob": -7.5234375,
+          "text": "olan"
+        },
+        {
+          "id": 15,
+          "logprob": -1.3613281,
+          "text": ","
+        },
+        {
+          "id": 1669,
+          "logprob": -1.5458984,
+          "text": " il"
+        },
+        {
+          "id": 11580,
+          "logprob": -0.94189453,
+          "text": " faut"
+        },
+        {
+          "id": 3913,
+          "logprob": -3.7011719,
+          "text": " tout"
+        },
+        {
+          "id": 39261,
+          "logprob": -1.5732422,
+          "text": " d'abord"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -220,7 +388,63 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 17934,
+          "logprob": null,
+          "text": "Pour"
+        },
+        {
+          "id": 49833,
+          "logprob": -10.53125,
+          "text": " dég"
+        },
+        {
+          "id": 21543,
+          "logprob": -0.14770508,
+          "text": "uster"
+        },
+        {
+          "id": 447,
+          "logprob": -1.9287109,
+          "text": " un"
+        },
+        {
+          "id": 46341,
+          "logprob": -15.4140625,
+          "text": " ort"
+        },
+        {
+          "id": 35567,
+          "logprob": -7.5234375,
+          "text": "olan"
+        },
+        {
+          "id": 15,
+          "logprob": -1.3613281,
+          "text": ","
+        },
+        {
+          "id": 1669,
+          "logprob": -1.5458984,
+          "text": " il"
+        },
+        {
+          "id": 11580,
+          "logprob": -0.94189453,
+          "text": " faut"
+        },
+        {
+          "id": 3913,
+          "logprob": -3.7011719,
+          "text": " tout"
+        },
+        {
+          "id": 39261,
+          "logprob": -1.5732422,
+          "text": " d'abord"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
diff --git a/integration-tests/models/__snapshots__/test_bloom_560m_sharded/test_bloom_560m_sharded.json b/integration-tests/models/__snapshots__/test_bloom_560m_sharded/test_bloom_560m_sharded.json
index 697edf6a..b17c889e 100644
--- a/integration-tests/models/__snapshots__/test_bloom_560m_sharded/test_bloom_560m_sharded.json
+++ b/integration-tests/models/__snapshots__/test_bloom_560m_sharded/test_bloom_560m_sharded.json
@@ -3,7 +3,63 @@
     "best_of_sequences": null,
     "finish_reason": "length",
     "generated_tokens": 10,
-    "prefill": [],
+    "prefill": [
+      {
+        "id": 17934,
+        "logprob": null,
+        "text": "Pour"
+      },
+      {
+        "id": 49833,
+        "logprob": -10.546875,
+        "text": " dég"
+      },
+      {
+        "id": 21543,
+        "logprob": -0.14819336,
+        "text": "uster"
+      },
+      {
+        "id": 447,
+        "logprob": -1.9257812,
+        "text": " un"
+      },
+      {
+        "id": 46341,
+        "logprob": -15.4296875,
+        "text": " ort"
+      },
+      {
+        "id": 35567,
+        "logprob": -7.5625,
+        "text": "olan"
+      },
+      {
+        "id": 15,
+        "logprob": -1.4199219,
+        "text": ","
+      },
+      {
+        "id": 1669,
+        "logprob": -1.5634766,
+        "text": " il"
+      },
+      {
+        "id": 11580,
+        "logprob": -0.9458008,
+        "text": " faut"
+      },
+      {
+        "id": 3913,
+        "logprob": -3.6816406,
+        "text": " tout"
+      },
+      {
+        "id": 39261,
+        "logprob": -1.7753906,
+        "text": " d'abord"
+      }
+    ],
     "seed": 0,
     "tokens": [
       {
diff --git a/integration-tests/models/__snapshots__/test_bloom_560m_sharded/test_bloom_560m_sharded_load.json b/integration-tests/models/__snapshots__/test_bloom_560m_sharded/test_bloom_560m_sharded_load.json
index 1528e963..2dd480b9 100644
--- a/integration-tests/models/__snapshots__/test_bloom_560m_sharded/test_bloom_560m_sharded_load.json
+++ b/integration-tests/models/__snapshots__/test_bloom_560m_sharded/test_bloom_560m_sharded_load.json
@@ -4,7 +4,63 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 17934,
+          "logprob": null,
+          "text": "Pour"
+        },
+        {
+          "id": 49833,
+          "logprob": -10.5390625,
+          "text": " dég"
+        },
+        {
+          "id": 21543,
+          "logprob": -0.14758301,
+          "text": "uster"
+        },
+        {
+          "id": 447,
+          "logprob": -1.9296875,
+          "text": " un"
+        },
+        {
+          "id": 46341,
+          "logprob": -15.4453125,
+          "text": " ort"
+        },
+        {
+          "id": 35567,
+          "logprob": -7.59375,
+          "text": "olan"
+        },
+        {
+          "id": 15,
+          "logprob": -1.3994141,
+          "text": ","
+        },
+        {
+          "id": 1669,
+          "logprob": -1.578125,
+          "text": " il"
+        },
+        {
+          "id": 11580,
+          "logprob": -0.9453125,
+          "text": " faut"
+        },
+        {
+          "id": 3913,
+          "logprob": -3.7011719,
+          "text": " tout"
+        },
+        {
+          "id": 39261,
+          "logprob": -1.5732422,
+          "text": " d'abord"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -76,7 +132,63 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 17934,
+          "logprob": null,
+          "text": "Pour"
+        },
+        {
+          "id": 49833,
+          "logprob": -10.515625,
+          "text": " dég"
+        },
+        {
+          "id": 21543,
+          "logprob": -0.1484375,
+          "text": "uster"
+        },
+        {
+          "id": 447,
+          "logprob": -1.9287109,
+          "text": " un"
+        },
+        {
+          "id": 46341,
+          "logprob": -15.34375,
+          "text": " ort"
+        },
+        {
+          "id": 35567,
+          "logprob": -7.515625,
+          "text": "olan"
+        },
+        {
+          "id": 15,
+          "logprob": -1.4199219,
+          "text": ","
+        },
+        {
+          "id": 1669,
+          "logprob": -1.5664062,
+          "text": " il"
+        },
+        {
+          "id": 11580,
+          "logprob": -0.94091797,
+          "text": " faut"
+        },
+        {
+          "id": 3913,
+          "logprob": -3.6660156,
+          "text": " tout"
+        },
+        {
+          "id": 39261,
+          "logprob": -1.7753906,
+          "text": " d'abord"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -148,7 +260,63 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 17934,
+          "logprob": null,
+          "text": "Pour"
+        },
+        {
+          "id": 49833,
+          "logprob": -10.515625,
+          "text": " dég"
+        },
+        {
+          "id": 21543,
+          "logprob": -0.1484375,
+          "text": "uster"
+        },
+        {
+          "id": 447,
+          "logprob": -1.9287109,
+          "text": " un"
+        },
+        {
+          "id": 46341,
+          "logprob": -15.34375,
+          "text": " ort"
+        },
+        {
+          "id": 35567,
+          "logprob": -7.515625,
+          "text": "olan"
+        },
+        {
+          "id": 15,
+          "logprob": -1.4199219,
+          "text": ","
+        },
+        {
+          "id": 1669,
+          "logprob": -1.5664062,
+          "text": " il"
+        },
+        {
+          "id": 11580,
+          "logprob": -0.94091797,
+          "text": " faut"
+        },
+        {
+          "id": 3913,
+          "logprob": -3.6660156,
+          "text": " tout"
+        },
+        {
+          "id": 39261,
+          "logprob": -1.7753906,
+          "text": " d'abord"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -220,7 +388,63 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 17934,
+          "logprob": null,
+          "text": "Pour"
+        },
+        {
+          "id": 49833,
+          "logprob": -10.515625,
+          "text": " dég"
+        },
+        {
+          "id": 21543,
+          "logprob": -0.1484375,
+          "text": "uster"
+        },
+        {
+          "id": 447,
+          "logprob": -1.9287109,
+          "text": " un"
+        },
+        {
+          "id": 46341,
+          "logprob": -15.34375,
+          "text": " ort"
+        },
+        {
+          "id": 35567,
+          "logprob": -7.515625,
+          "text": "olan"
+        },
+        {
+          "id": 15,
+          "logprob": -1.4199219,
+          "text": ","
+        },
+        {
+          "id": 1669,
+          "logprob": -1.5664062,
+          "text": " il"
+        },
+        {
+          "id": 11580,
+          "logprob": -0.94091797,
+          "text": " faut"
+        },
+        {
+          "id": 3913,
+          "logprob": -3.6660156,
+          "text": " tout"
+        },
+        {
+          "id": 39261,
+          "logprob": -1.7753906,
+          "text": " d'abord"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
diff --git a/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_load_sharded.json b/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_load_sharded.json
index f1d9129d..56a10a75 100644
--- a/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_load_sharded.json
+++ b/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_load_sharded.json
@@ -4,38 +4,7 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [
-        {
-          "id": 1,
-          "logprob": null,
-          "text": "<s>"
-        },
-        {
-          "id": 1724,
-          "logprob": -7.6914062,
-          "text": "What"
-        },
-        {
-          "id": 338,
-          "logprob": -1.4746094,
-          "text": "is"
-        },
-        {
-          "id": 21784,
-          "logprob": -9.390625,
-          "text": "Deep"
-        },
-        {
-          "id": 29257,
-          "logprob": -1.8623047,
-          "text": "Learning"
-        },
-        {
-          "id": 29973,
-          "logprob": -0.7558594,
-          "text": "?"
-        }
-      ],
+      "prefill": [],
       "seed": null,
       "tokens": [
         {
@@ -108,38 +77,7 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [
-        {
-          "id": 1,
-          "logprob": null,
-          "text": "<s>"
-        },
-        {
-          "id": 1724,
-          "logprob": -7.6914062,
-          "text": "What"
-        },
-        {
-          "id": 338,
-          "logprob": -1.4746094,
-          "text": "is"
-        },
-        {
-          "id": 21784,
-          "logprob": -9.390625,
-          "text": "Deep"
-        },
-        {
-          "id": 29257,
-          "logprob": -1.8623047,
-          "text": "Learning"
-        },
-        {
-          "id": 29973,
-          "logprob": -0.7558594,
-          "text": "?"
-        }
-      ],
+      "prefill": [],
       "seed": null,
       "tokens": [
         {
@@ -212,38 +150,7 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [
-        {
-          "id": 1,
-          "logprob": null,
-          "text": "<s>"
-        },
-        {
-          "id": 1724,
-          "logprob": -7.6914062,
-          "text": "What"
-        },
-        {
-          "id": 338,
-          "logprob": -1.4746094,
-          "text": "is"
-        },
-        {
-          "id": 21784,
-          "logprob": -9.390625,
-          "text": "Deep"
-        },
-        {
-          "id": 29257,
-          "logprob": -1.8623047,
-          "text": "Learning"
-        },
-        {
-          "id": 29973,
-          "logprob": -0.7558594,
-          "text": "?"
-        }
-      ],
+      "prefill": [],
       "seed": null,
       "tokens": [
         {
@@ -316,38 +223,7 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [
-        {
-          "id": 1,
-          "logprob": null,
-          "text": "<s>"
-        },
-        {
-          "id": 1724,
-          "logprob": -7.6914062,
-          "text": "What"
-        },
-        {
-          "id": 338,
-          "logprob": -1.4746094,
-          "text": "is"
-        },
-        {
-          "id": 21784,
-          "logprob": -9.390625,
-          "text": "Deep"
-        },
-        {
-          "id": 29257,
-          "logprob": -1.8623047,
-          "text": "Learning"
-        },
-        {
-          "id": 29973,
-          "logprob": -0.7558594,
-          "text": "?"
-        }
-      ],
+      "prefill": [],
       "seed": null,
       "tokens": [
         {
diff --git a/integration-tests/models/__snapshots__/test_idefics/test_idefics.json b/integration-tests/models/__snapshots__/test_idefics/test_idefics.json
index 9774f84b..90fb6dcc 100644
--- a/integration-tests/models/__snapshots__/test_idefics/test_idefics.json
+++ b/integration-tests/models/__snapshots__/test_idefics/test_idefics.json
@@ -3,7 +3,103 @@
     "best_of_sequences": null,
     "finish_reason": "length",
     "generated_tokens": 10,
-    "prefill": [],
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 4911,
+        "logprob": -6.9765625,
+        "text": "User"
+      },
+      {
+        "id": 29901,
+        "logprob": -0.0059432983,
+        "text": ":"
+      },
+      {
+        "id": 32000,
+        "logprob": -0.8408203,
+        "text": "<fake_token_around_image>"
+      },
+      {
+        "id": 32001,
+        "logprob": -9.906292e-05,
+        "text": "<image>"
+      },
+      {
+        "id": 32000,
+        "logprob": -2.3841858e-07,
+        "text": "<fake_token_around_image>"
+      },
+      {
+        "id": 1815,
+        "logprob": -4.1679688,
+        "text": "Can"
+      },
+      {
+        "id": 366,
+        "logprob": -0.014099121,
+        "text": "you"
+      },
+      {
+        "id": 2649,
+        "logprob": -4.4609375,
+        "text": "tell"
+      },
+      {
+        "id": 592,
+        "logprob": -0.29882812,
+        "text": "me"
+      },
+      {
+        "id": 263,
+        "logprob": -4.1445312,
+        "text": "a"
+      },
+      {
+        "id": 1407,
+        "logprob": -9.3828125,
+        "text": "very"
+      },
+      {
+        "id": 3273,
+        "logprob": -1.9736328,
+        "text": "short"
+      },
+      {
+        "id": 5828,
+        "logprob": -0.2800293,
+        "text": "story"
+      },
+      {
+        "id": 2729,
+        "logprob": -3.5625,
+        "text": "based"
+      },
+      {
+        "id": 373,
+        "logprob": -0.0006427765,
+        "text": "on"
+      },
+      {
+        "id": 278,
+        "logprob": -0.13952637,
+        "text": "the"
+      },
+      {
+        "id": 1967,
+        "logprob": -0.068115234,
+        "text": "image"
+      },
+      {
+        "id": 29973,
+        "logprob": -0.16357422,
+        "text": "?"
+      }
+    ],
     "seed": null,
     "tokens": [
       {
diff --git a/integration-tests/models/__snapshots__/test_idefics/test_idefics_load.json b/integration-tests/models/__snapshots__/test_idefics/test_idefics_load.json
index 0b10b285..21d6161b 100644
--- a/integration-tests/models/__snapshots__/test_idefics/test_idefics_load.json
+++ b/integration-tests/models/__snapshots__/test_idefics/test_idefics_load.json
@@ -4,7 +4,103 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4911,
+          "logprob": -6.9804688,
+          "text": "User"
+        },
+        {
+          "id": 29901,
+          "logprob": -0.006122589,
+          "text": ":"
+        },
+        {
+          "id": 32000,
+          "logprob": -0.8417969,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -9.918213e-05,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -2.3841858e-07,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 1815,
+          "logprob": -4.1679688,
+          "text": "Can"
+        },
+        {
+          "id": 366,
+          "logprob": -0.014091492,
+          "text": "you"
+        },
+        {
+          "id": 2649,
+          "logprob": -4.4726562,
+          "text": "tell"
+        },
+        {
+          "id": 592,
+          "logprob": -0.2998047,
+          "text": "me"
+        },
+        {
+          "id": 263,
+          "logprob": -4.15625,
+          "text": "a"
+        },
+        {
+          "id": 1407,
+          "logprob": -9.3828125,
+          "text": "very"
+        },
+        {
+          "id": 3273,
+          "logprob": -1.9716797,
+          "text": "short"
+        },
+        {
+          "id": 5828,
+          "logprob": -0.27734375,
+          "text": "story"
+        },
+        {
+          "id": 2729,
+          "logprob": -3.5605469,
+          "text": "based"
+        },
+        {
+          "id": 373,
+          "logprob": -0.00064468384,
+          "text": "on"
+        },
+        {
+          "id": 278,
+          "logprob": -0.14160156,
+          "text": "the"
+        },
+        {
+          "id": 1967,
+          "logprob": -0.06915283,
+          "text": "image"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.16381836,
+          "text": "?"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -76,7 +172,103 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4911,
+          "logprob": -6.9804688,
+          "text": "User"
+        },
+        {
+          "id": 29901,
+          "logprob": -0.006122589,
+          "text": ":"
+        },
+        {
+          "id": 32000,
+          "logprob": -0.8417969,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -9.942055e-05,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -2.3841858e-07,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 1815,
+          "logprob": -4.1679688,
+          "text": "Can"
+        },
+        {
+          "id": 366,
+          "logprob": -0.014091492,
+          "text": "you"
+        },
+        {
+          "id": 2649,
+          "logprob": -4.4726562,
+          "text": "tell"
+        },
+        {
+          "id": 592,
+          "logprob": -0.2998047,
+          "text": "me"
+        },
+        {
+          "id": 263,
+          "logprob": -4.15625,
+          "text": "a"
+        },
+        {
+          "id": 1407,
+          "logprob": -9.3828125,
+          "text": "very"
+        },
+        {
+          "id": 3273,
+          "logprob": -1.9716797,
+          "text": "short"
+        },
+        {
+          "id": 5828,
+          "logprob": -0.27734375,
+          "text": "story"
+        },
+        {
+          "id": 2729,
+          "logprob": -3.5605469,
+          "text": "based"
+        },
+        {
+          "id": 373,
+          "logprob": -0.0006451607,
+          "text": "on"
+        },
+        {
+          "id": 278,
+          "logprob": -0.14160156,
+          "text": "the"
+        },
+        {
+          "id": 1967,
+          "logprob": -0.06915283,
+          "text": "image"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.16381836,
+          "text": "?"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -148,7 +340,103 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4911,
+          "logprob": -6.9804688,
+          "text": "User"
+        },
+        {
+          "id": 29901,
+          "logprob": -0.006122589,
+          "text": ":"
+        },
+        {
+          "id": 32000,
+          "logprob": -0.8417969,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -9.918213e-05,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -2.3841858e-07,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 1815,
+          "logprob": -4.1679688,
+          "text": "Can"
+        },
+        {
+          "id": 366,
+          "logprob": -0.014091492,
+          "text": "you"
+        },
+        {
+          "id": 2649,
+          "logprob": -4.4726562,
+          "text": "tell"
+        },
+        {
+          "id": 592,
+          "logprob": -0.2998047,
+          "text": "me"
+        },
+        {
+          "id": 263,
+          "logprob": -4.15625,
+          "text": "a"
+        },
+        {
+          "id": 1407,
+          "logprob": -9.3828125,
+          "text": "very"
+        },
+        {
+          "id": 3273,
+          "logprob": -1.9716797,
+          "text": "short"
+        },
+        {
+          "id": 5828,
+          "logprob": -0.27734375,
+          "text": "story"
+        },
+        {
+          "id": 2729,
+          "logprob": -3.5605469,
+          "text": "based"
+        },
+        {
+          "id": 373,
+          "logprob": -0.00064468384,
+          "text": "on"
+        },
+        {
+          "id": 278,
+          "logprob": -0.14160156,
+          "text": "the"
+        },
+        {
+          "id": 1967,
+          "logprob": -0.06915283,
+          "text": "image"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.16381836,
+          "text": "?"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -220,7 +508,103 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4911,
+          "logprob": -6.9804688,
+          "text": "User"
+        },
+        {
+          "id": 29901,
+          "logprob": -0.006122589,
+          "text": ":"
+        },
+        {
+          "id": 32000,
+          "logprob": -0.8417969,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -9.942055e-05,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -2.3841858e-07,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 1815,
+          "logprob": -4.1679688,
+          "text": "Can"
+        },
+        {
+          "id": 366,
+          "logprob": -0.014091492,
+          "text": "you"
+        },
+        {
+          "id": 2649,
+          "logprob": -4.4726562,
+          "text": "tell"
+        },
+        {
+          "id": 592,
+          "logprob": -0.2998047,
+          "text": "me"
+        },
+        {
+          "id": 263,
+          "logprob": -4.15625,
+          "text": "a"
+        },
+        {
+          "id": 1407,
+          "logprob": -9.3828125,
+          "text": "very"
+        },
+        {
+          "id": 3273,
+          "logprob": -1.9716797,
+          "text": "short"
+        },
+        {
+          "id": 5828,
+          "logprob": -0.27734375,
+          "text": "story"
+        },
+        {
+          "id": 2729,
+          "logprob": -3.5605469,
+          "text": "based"
+        },
+        {
+          "id": 373,
+          "logprob": -0.0006451607,
+          "text": "on"
+        },
+        {
+          "id": 278,
+          "logprob": -0.14160156,
+          "text": "the"
+        },
+        {
+          "id": 1967,
+          "logprob": -0.06915283,
+          "text": "image"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.16381836,
+          "text": "?"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
diff --git a/integration-tests/models/__snapshots__/test_mamba/test_mamba_all_params.json b/integration-tests/models/__snapshots__/test_mamba/test_mamba_all_params.json
index 3895b48d..ef88926c 100644
--- a/integration-tests/models/__snapshots__/test_mamba/test_mamba_all_params.json
+++ b/integration-tests/models/__snapshots__/test_mamba/test_mamba_all_params.json
@@ -3,7 +3,33 @@
     "best_of_sequences": null,
     "finish_reason": "length",
     "generated_tokens": 10,
-    "prefill": [],
+    "prefill": [
+      {
+        "id": 2502,
+        "logprob": null,
+        "text": " red"
+      },
+      {
+        "id": 13,
+        "logprob": -2.734375,
+        "text": ","
+      },
+      {
+        "id": 8862,
+        "logprob": -3.6875,
+        "text": " yellow"
+      },
+      {
+        "id": 13,
+        "logprob": -0.40234375,
+        "text": ","
+      },
+      {
+        "id": 209,
+        "logprob": -8.25,
+        "text": " "
+      }
+    ],
     "seed": 0,
     "tokens": [
       {
diff --git a/integration-tests/models/__snapshots__/test_mamba/test_mamba_load.json b/integration-tests/models/__snapshots__/test_mamba/test_mamba_load.json
index 99edd6dd..4921c14b 100644
--- a/integration-tests/models/__snapshots__/test_mamba/test_mamba_load.json
+++ b/integration-tests/models/__snapshots__/test_mamba/test_mamba_load.json
@@ -4,7 +4,33 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 1276,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 310,
+          "logprob": -0.83984375,
+          "text": " is"
+        },
+        {
+          "id": 18147,
+          "logprob": -12.8125,
+          "text": " Deep"
+        },
+        {
+          "id": 20727,
+          "logprob": -2.84375,
+          "text": " Learning"
+        },
+        {
+          "id": 32,
+          "logprob": -1.25,
+          "text": "?"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -77,7 +103,33 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 1276,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 310,
+          "logprob": -0.80078125,
+          "text": " is"
+        },
+        {
+          "id": 18147,
+          "logprob": -13.25,
+          "text": " Deep"
+        },
+        {
+          "id": 20727,
+          "logprob": -2.828125,
+          "text": " Learning"
+        },
+        {
+          "id": 32,
+          "logprob": -1.1953125,
+          "text": "?"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -150,7 +202,33 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 1276,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 310,
+          "logprob": -0.80078125,
+          "text": " is"
+        },
+        {
+          "id": 18147,
+          "logprob": -13.25,
+          "text": " Deep"
+        },
+        {
+          "id": 20727,
+          "logprob": -2.828125,
+          "text": " Learning"
+        },
+        {
+          "id": 32,
+          "logprob": -1.1953125,
+          "text": "?"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -223,7 +301,33 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 10,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 1276,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 310,
+          "logprob": -0.80078125,
+          "text": " is"
+        },
+        {
+          "id": 18147,
+          "logprob": -13.25,
+          "text": " Deep"
+        },
+        {
+          "id": 20727,
+          "logprob": -2.828125,
+          "text": " Learning"
+        },
+        {
+          "id": 32,
+          "logprob": -1.1953125,
+          "text": "?"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
diff --git a/integration-tests/models/__snapshots__/test_mpt/test_mpt.json b/integration-tests/models/__snapshots__/test_mpt/test_mpt.json
index ba8ee809..abbbf03c 100644
--- a/integration-tests/models/__snapshots__/test_mpt/test_mpt.json
+++ b/integration-tests/models/__snapshots__/test_mpt/test_mpt.json
@@ -3,7 +3,33 @@
     "best_of_sequences": null,
     "finish_reason": "length",
     "generated_tokens": 17,
-    "prefill": [],
+    "prefill": [
+      {
+        "id": 1276,
+        "logprob": null,
+        "text": "What"
+      },
+      {
+        "id": 310,
+        "logprob": -1.5117188,
+        "text": " is"
+      },
+      {
+        "id": 18147,
+        "logprob": -8.96875,
+        "text": " Deep"
+      },
+      {
+        "id": 20727,
+        "logprob": -1.953125,
+        "text": " Learning"
+      },
+      {
+        "id": 32,
+        "logprob": -0.94189453,
+        "text": "?"
+      }
+    ],
     "seed": null,
     "tokens": [
       {
diff --git a/integration-tests/models/__snapshots__/test_mpt/test_mpt_load.json b/integration-tests/models/__snapshots__/test_mpt/test_mpt_load.json
index bb7b8846..e3bc57ed 100644
--- a/integration-tests/models/__snapshots__/test_mpt/test_mpt_load.json
+++ b/integration-tests/models/__snapshots__/test_mpt/test_mpt_load.json
@@ -4,7 +4,33 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 17,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 1276,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 310,
+          "logprob": -1.5117188,
+          "text": " is"
+        },
+        {
+          "id": 18147,
+          "logprob": -8.96875,
+          "text": " Deep"
+        },
+        {
+          "id": 20727,
+          "logprob": -1.953125,
+          "text": " Learning"
+        },
+        {
+          "id": 32,
+          "logprob": -0.94189453,
+          "text": "?"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -118,7 +144,33 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 17,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 1276,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 310,
+          "logprob": -1.5,
+          "text": " is"
+        },
+        {
+          "id": 18147,
+          "logprob": -8.984375,
+          "text": " Deep"
+        },
+        {
+          "id": 20727,
+          "logprob": -1.96875,
+          "text": " Learning"
+        },
+        {
+          "id": 32,
+          "logprob": -0.93359375,
+          "text": "?"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -232,7 +284,33 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 17,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 1276,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 310,
+          "logprob": -1.5,
+          "text": " is"
+        },
+        {
+          "id": 18147,
+          "logprob": -8.984375,
+          "text": " Deep"
+        },
+        {
+          "id": 20727,
+          "logprob": -1.96875,
+          "text": " Learning"
+        },
+        {
+          "id": 32,
+          "logprob": -0.93359375,
+          "text": "?"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -346,7 +424,33 @@
       "best_of_sequences": null,
       "finish_reason": "length",
       "generated_tokens": 17,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 1276,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 310,
+          "logprob": -1.5,
+          "text": " is"
+        },
+        {
+          "id": 18147,
+          "logprob": -8.984375,
+          "text": " Deep"
+        },
+        {
+          "id": 20727,
+          "logprob": -1.96875,
+          "text": " Learning"
+        },
+        {
+          "id": 32,
+          "logprob": -0.93359375,
+          "text": "?"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
diff --git a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base.json b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base.json
index b83e31a5..c1cd24cd 100644
--- a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base.json
+++ b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base.json
@@ -3,7 +3,13 @@
     "best_of_sequences": null,
     "finish_reason": "eos_token",
     "generated_tokens": 5,
-    "prefill": [],
+    "prefill": [
+      {
+        "id": 0,
+        "logprob": null,
+        "text": "<pad>"
+      }
+    ],
     "seed": 0,
     "tokens": [
       {
diff --git a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json
index 06864988..9fd950a2 100644
--- a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json
+++ b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json
@@ -3,7 +3,13 @@
     "best_of_sequences": null,
     "finish_reason": "length",
     "generated_tokens": 10,
-    "prefill": [],
+    "prefill": [
+      {
+        "id": 0,
+        "logprob": null,
+        "text": "<pad>"
+      }
+    ],
     "seed": 0,
     "tokens": [
       {
diff --git a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_load.json b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_load.json
index 205e6656..c0834ae1 100644
--- a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_load.json
+++ b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_load.json
@@ -4,7 +4,13 @@
       "best_of_sequences": null,
       "finish_reason": "eos_token",
       "generated_tokens": 6,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 0,
+          "logprob": null,
+          "text": "<pad>"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -52,7 +58,13 @@
       "best_of_sequences": null,
       "finish_reason": "eos_token",
       "generated_tokens": 6,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 0,
+          "logprob": null,
+          "text": "<pad>"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -100,7 +112,13 @@
       "best_of_sequences": null,
       "finish_reason": "eos_token",
       "generated_tokens": 6,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 0,
+          "logprob": null,
+          "text": "<pad>"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -148,7 +166,13 @@
       "best_of_sequences": null,
       "finish_reason": "eos_token",
       "generated_tokens": 6,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 0,
+          "logprob": null,
+          "text": "<pad>"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
diff --git a/integration-tests/models/__snapshots__/test_t5_sharded/test_t5_sharded.json b/integration-tests/models/__snapshots__/test_t5_sharded/test_t5_sharded.json
index 6bdf9606..6090e2c9 100644
--- a/integration-tests/models/__snapshots__/test_t5_sharded/test_t5_sharded.json
+++ b/integration-tests/models/__snapshots__/test_t5_sharded/test_t5_sharded.json
@@ -3,7 +3,13 @@
     "best_of_sequences": null,
     "finish_reason": "eos_token",
     "generated_tokens": 7,
-    "prefill": [],
+    "prefill": [
+      {
+        "id": 0,
+        "logprob": null,
+        "text": "<pad>"
+      }
+    ],
     "seed": null,
     "tokens": [
       {
diff --git a/integration-tests/models/__snapshots__/test_t5_sharded/test_t5_sharded_load.json b/integration-tests/models/__snapshots__/test_t5_sharded/test_t5_sharded_load.json
index 16b92294..3e9af12e 100644
--- a/integration-tests/models/__snapshots__/test_t5_sharded/test_t5_sharded_load.json
+++ b/integration-tests/models/__snapshots__/test_t5_sharded/test_t5_sharded_load.json
@@ -4,7 +4,13 @@
       "best_of_sequences": null,
       "finish_reason": "eos_token",
       "generated_tokens": 7,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 0,
+          "logprob": null,
+          "text": "<pad>"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -58,7 +64,13 @@
       "best_of_sequences": null,
       "finish_reason": "eos_token",
       "generated_tokens": 7,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 0,
+          "logprob": null,
+          "text": "<pad>"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -112,7 +124,13 @@
       "best_of_sequences": null,
       "finish_reason": "eos_token",
       "generated_tokens": 7,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 0,
+          "logprob": null,
+          "text": "<pad>"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
@@ -166,7 +184,13 @@
       "best_of_sequences": null,
       "finish_reason": "eos_token",
       "generated_tokens": 7,
-      "prefill": [],
+      "prefill": [
+        {
+          "id": 0,
+          "logprob": null,
+          "text": "<pad>"
+        }
+      ],
       "seed": null,
       "tokens": [
         {
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index 610d6227..0530d521 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -80,7 +80,7 @@ fn get_config(
     };
 
     let content = std::fs::read_to_string(filename)?;
-    let config: RawConfig = serde_json::from_str(&content)?;
+    let config: RawConfig = serde_json::from_str(&content).expect("?");
 
     let config: Config = config.into();
     Ok(config)
@@ -171,6 +171,8 @@ struct RawConfig {
     head_dim: Option<usize>,
     vision_config: Option<VisionConfig>,
     is_encoder_decoder: Option<bool>,
+    #[serde(rename = "num_experts_per_tok")]
+    experts: Option<usize>,
 }
 
 #[derive(Deserialize)]
@@ -194,6 +196,7 @@ struct Config {
     model_type: Option<String>,
     vision_config: Option<VisionConfig>,
     is_encoder_decoder: bool,
+    experts: Option<usize>,
 }
 
 impl Config {
@@ -202,7 +205,11 @@ impl Config {
         let num_kv_heads = self.num_kv_heads? as u64;
         let head_dim = self.head_dim? as u64;
         let hidden_size = self.hidden_size? as u64;
-        let intermediate_size = self.intermediate_size? as u64;
+        let intermediate_size = if let Some(experts) = self.experts {
+            (self.intermediate_size? * experts) as u64
+        } else {
+            self.intermediate_size? as u64
+        };
         let num_layers = self.num_layers? as u64;
 
         let q_flops = 2 * num_heads * head_dim * hidden_size;
@@ -245,6 +252,7 @@ impl From<RawConfig> for Config {
         let model_type = other.model_type;
         let vision_config = other.vision_config;
         let is_encoder_decoder = other.is_encoder_decoder.unwrap_or(false);
+        let experts = other.experts;
         Config {
             max_position_embeddings,
             quantize,
@@ -257,6 +265,7 @@ impl From<RawConfig> for Config {
             num_kv_heads,
             intermediate_size,
             num_layers,
+            experts,
         }
     }
 }