From d57b7091aa444f5be560df8dd4b1ecf38aca4d2f Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Tue, 10 Sep 2024 10:24:56 +0200
Subject: [PATCH] Are we done yet?

---
 backends/v3/src/backend.rs | 3 +-
 backends/v3/src/queue.rs | 2 +-
 backends/v3/src/radix.rs | 2 +-
 ...t_flash_llama_completion_many_prompts.json | 20 +-
 ..._llama_completion_many_prompts_stream.json | 932 +++++++++---------
 ..._flash_llama_completion_single_prompt.json | 10 +-
 .../models/test_completion_prompts.py | 38 +-
 launcher/src/main.rs | 3 +-
 router/src/infer/mod.rs | 3 +
 router/src/server.rs | 6 +-
 .../models/flash_causal_lm.py | 86 +-
 11 files changed, 571 insertions(+), 534 deletions(-)

diff --git a/backends/v3/src/backend.rs b/backends/v3/src/backend.rs
index 935f7980..f8a10ca2 100644
--- a/backends/v3/src/backend.rs
+++ b/backends/v3/src/backend.rs
@@ -376,10 +376,9 @@ fn filter_send_generations(generations: Vec<Generation>, entries: &mut IntMap<u64, Entry>) {
         // Send generation responses back to the infer task
         // If we receive an error from the Flume channel, it means that the client dropped the
         // request and we need to stop generating hence why we unwrap_or(true)
-        let stopped = send_responses(generation, entry).map_err(|err| {
+        let stopped = send_responses(generation, entry).inspect_err(|_err| {
             tracing::error!("Entry response channel error.");
             metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
-            err
         }).unwrap_or(true);
         if stopped {
             entries.remove(&id).expect("ID not found in entries. This is a bug.");
diff --git a/backends/v3/src/queue.rs b/backends/v3/src/queue.rs
index 978a495c..c36afefa 100644
--- a/backends/v3/src/queue.rs
+++ b/backends/v3/src/queue.rs
@@ -366,7 +366,7 @@ impl State {
                     break;
                 }
                 Some(block_allocation) => {
-                    tracing::debug!("Allocation: {block_allocation:?}");
+                    // tracing::debug!("Allocation: {block_allocation:?}");
                     max_blocks = max(max_blocks, block_allocation.blocks.len() as u32);
                     Some(block_allocation)
                 }
diff --git a/backends/v3/src/radix.rs b/backends/v3/src/radix.rs
index 1f3bef15..db6028ac 100644
--- a/backends/v3/src/radix.rs
+++ b/backends/v3/src/radix.rs
@@ -123,7 +123,7 @@ impl Allocator for RadixAllocator {
            prefill_tokens: prefill_tokens.clone(),
        };

-        tracing::debug!("Blocks {blocks:?}");
+        // tracing::debug!("Blocks {blocks:?}");

        self.allocation_id += 1;
        self.allocations.insert(self.allocation_id, allocation);
diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json
index 9f3faffc..abbc29ac 100644
--- a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json
+++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json
@@ -1,38 +1,38 @@
 {
   "choices": [
     {
-      "finish_reason": "stop",
+      "finish_reason": "length",
       "index": 1,
       "logprobs": null,
-      "text": " PR for more information?"
+ "text": " This is a question that has puzzled many people for" }, { "finish_reason": "length", "index": 3, "logprobs": null, - "text": "hd20220811-" + "text": "usculas_minusculas(s):\n \"\"\"\n" }, { "finish_reason": "length", "index": 0, "logprobs": null, - "text": "le Business Incubator is providing a workspace" + "text": " A Beginner’s Guide\nDeep learning is a subset" }, { "finish_reason": "length", "index": 2, "logprobs": null, - "text": " severely flawed and often has a substandard" + "text": " Paris\nWhat is the capital of France?\nThe" } ], - "created": 1722014725, + "created": 1725877154, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native", "usage": { - "completion_tokens": 36, - "prompt_tokens": 8, - "total_tokens": 44 + "completion_tokens": 40, + "prompt_tokens": 22, + "total_tokens": 62 } } diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json index 8e3c8f4d..dd22ceae 100644 --- a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json @@ -5,12 +5,12 @@ "finish_reason": "", "index": 0, "logprobs": null, - "text": "\n" + "text": " A" } ], - "created": 1725874430, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -20,12 +20,72 @@ "finish_reason": "", "index": 1, "logprobs": null, - "text": "\n" + "text": " This" } ], - "created": 1725874430, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 2, + "logprobs": null, + "text": " Paris" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 3, + "logprobs": null, + "text": "us" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 0, + "logprobs": null, + "text": " Beginner" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 1, + "logprobs": null, + "text": " is" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -38,9 +98,9 @@ "text": "\n" } ], - "created": 1725874430, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -50,12 +110,12 @@ 
"finish_reason": "", "index": 3, "logprobs": null, - "text": "hd" + "text": "cul" } ], - "created": 1725874430, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -65,12 +125,12 @@ "finish_reason": "", "index": 0, "logprobs": null, - "text": "What" + "text": "’s" } ], - "created": 1725874430, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -80,447 +140,27 @@ "finish_reason": "", "index": 1, "logprobs": null, - "text": "rig" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": "\n" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "\n" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " Business" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " Business" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": "2" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "2" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": ":" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " Process" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": "0" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "0" - } - ], - "created": 1725874430, - "id": "", - "model": 
"TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " And" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " And" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": "2" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "2" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " Stock" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " Stock" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": " Stock" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "0" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " Moh" - } - ], - "created": 1725874431, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " ," - } - ], - "created": 1725874431, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": " Moh" - } - ], - "created": 1725874431, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "7" - } - ], - "created": 1725874431, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": "s" - } - ], - "created": 1725874431, - "id": "", - "model": 
"TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": "s" - } - ], - "created": 1725874431, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": "s" - } - ], - "created": 1725874431, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "\n" - } - ], - "created": 1725874431, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": "`" - } - ], - "created": 1725874431, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": "," - } - ], - "created": 1725874431, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, "text": " a" } ], - "created": 1725874431, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 2, + "logprobs": null, + "text": "What" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -530,12 +170,372 @@ "finish_reason": "", "index": 3, "logprobs": null, - "text": "R" + "text": "as" } ], - "created": 1725874431, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 0, + "logprobs": null, + "text": " Guide" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 1, + "logprobs": null, + "text": " question" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 2, + "logprobs": null, + "text": " is" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 3, + "logprobs": null, + "text": "_minus" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + 
"finish_reason": "", + "index": 0, + "logprobs": null, + "text": "\n" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 1, + "logprobs": null, + "text": " that" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 2, + "logprobs": null, + "text": " the" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 3, + "logprobs": null, + "text": "cul" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 0, + "logprobs": null, + "text": "Deep" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 1, + "logprobs": null, + "text": " has" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 2, + "logprobs": null, + "text": " capital" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 3, + "logprobs": null, + "text": "as" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 0, + "logprobs": null, + "text": " learning" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 1, + "logprobs": null, + "text": " puzzled" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 2, + "logprobs": null, + "text": " of" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 3, + "logprobs": null, + "text": "(s" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 0, + "logprobs": null, + "text": " is" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, 
+ { + "choices": [ + { + "finish_reason": "", + "index": 1, + "logprobs": null, + "text": " many" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 2, + "logprobs": null, + "text": " France" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 3, + "logprobs": null, + "text": "):\n" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 0, + "logprobs": null, + "text": " a" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 1, + "logprobs": null, + "text": " people" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 2, + "logprobs": null, + "text": "?\n" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 3, + "logprobs": null, + "text": " " + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -545,12 +545,12 @@ "finish_reason": "length", "index": 0, "logprobs": null, - "text": "('\\" + "text": " subset" } ], - "created": 1725874431, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -560,12 +560,12 @@ "finish_reason": "length", "index": 1, "logprobs": null, - "text": " And" + "text": " for" } ], - "created": 1725874431, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -575,12 +575,12 @@ "finish_reason": "length", "index": 2, "logprobs": null, - "text": " Service" + "text": "The" } ], - "created": 1725874431, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -590,12 +590,12 @@ "finish_reason": "length", "index": 3, "logprobs": null, - "text": "1" + "text": " \"\"\"\n" } ], - "created": 1725874431, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" } diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json 
b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json index c4e37a01..7ad56271 100644 --- a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json @@ -4,17 +4,17 @@ "finish_reason": "length", "index": 0, "logprobs": null, - "text": "\n2.2 How" + "text": " A Beginner’s Guide\nDeep learning is a subset" } ], - "created": 1725874238, + "created": 1725876621, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native", "usage": { - "completion_tokens": 5, + "completion_tokens": 10, "prompt_tokens": 6, - "total_tokens": 11 + "total_tokens": 16 } } diff --git a/integration-tests/models/test_completion_prompts.py b/integration-tests/models/test_completion_prompts.py index 65f9a1a0..a3b6651d 100644 --- a/integration-tests/models/test_completion_prompts.py +++ b/integration-tests/models/test_completion_prompts.py @@ -11,7 +11,7 @@ from text_generation.types import ( @pytest.fixture(scope="module") def flash_llama_completion_handle(launcher): with launcher( - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "meta-llama/Meta-Llama-3.1-8B-Instruct", ) as handle: yield handle @@ -35,15 +35,18 @@ def test_flash_llama_completion_single_prompt( json={ "model": "tgi", "prompt": "What is Deep Learning?", - "max_tokens": 5, - "seed": 0, + "max_tokens": 10, + "temperature": 0.0, }, headers=flash_llama_completion.headers, stream=False, ) response = response.json() assert len(response["choices"]) == 1 - assert response["choices"][0]["text"] == "\n2.2 How" + assert ( + response["choices"][0]["text"] + == " A Beginner’s Guide\nDeep learning is a subset" + ) assert response == response_snapshot @@ -53,9 +56,15 @@ def test_flash_llama_completion_many_prompts(flash_llama_completion, response_sn f"{flash_llama_completion.base_url}/v1/completions", json={ "model": "tgi", - "prompt": ["Say", "this", "is", "a"], + "prompt": [ + "What is Deep Learning?", + "Is water wet?", + "What is the capital of France?", + "def mai", + ], "max_tokens": 10, "seed": 0, + "temperature": 0.0, }, headers=flash_llama_completion.headers, stream=False, @@ -63,9 +72,16 @@ def test_flash_llama_completion_many_prompts(flash_llama_completion, response_sn response = response.json() assert len(response["choices"]) == 4 - all_indexes = [choice["index"] for choice in response["choices"]] + all_indexes = [(choice["index"], choice["text"]) for choice in response["choices"]] all_indexes.sort() - assert all_indexes == [0, 1, 2, 3] + all_indices, all_strings = zip(*all_indexes) + assert list(all_indices) == [0, 1, 2, 3] + assert list(all_strings) == [ + " A Beginner’s Guide\nDeep learning is a subset", + " This is a question that has puzzled many people for", + " Paris\nWhat is the capital of France?\nThe", + 'usculas_minusculas(s):\n """\n', + ] assert response == response_snapshot @@ -84,6 +100,7 @@ async def test_flash_llama_completion_many_prompts_stream( ], "max_tokens": 10, "seed": 0, + "temperature": 0.0, "stream": True, } @@ -114,5 +131,10 @@ async def test_flash_llama_completion_many_prompts_stream( strings[index] += c["choices"][0]["text"] assert response.status == 200 - # assert strings == ["What Business: And Stock Mohs`('\\", '\nrig Business Process And Stock ,s, And', '\n\n202 Stock Mohs a Service', 
'hd\n20207\nR1']
+    assert list(strings) == [
+        " A Beginner’s Guide\nDeep learning is a subset",
+        " This is a question that has puzzled many people for",
+        " Paris\nWhat is the capital of France?\nThe",
+        'usculas_minusculas(s):\n    """\n',
+    ]
     assert chunks == response_snapshot
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index 8e5c9dcd..2cdccfe0 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -1843,9 +1843,8 @@ fn main() -> Result<(), LauncherError> {
         shutdown.clone(),
         &shutdown_receiver,
     )
-    .map_err(|err| {
+    .inspect_err(|_| {
         shutdown_shards(shutdown.clone(), &shutdown_receiver);
-        err
     })?;

     // Default exit code
diff --git a/router/src/infer/mod.rs b/router/src/infer/mod.rs
index 240282d9..4a2341da 100644
--- a/router/src/infer/mod.rs
+++ b/router/src/infer/mod.rs
@@ -336,6 +336,8 @@ pub enum InferError {
     ValidationError(#[from] ValidationError),
     #[error("Incomplete generation")]
     IncompleteGeneration,
+    #[error("Incomplete generation stream")]
+    IncompleteGenerationStream,
     #[error("Template error: {0}")]
     TemplateError(#[from] minijinja::Error),
     #[error("Missing template vatiable: {0}")]
@@ -351,6 +353,7 @@ impl InferError {
             InferError::Overloaded(_) => "overloaded",
             InferError::ValidationError(_) => "validation",
             InferError::IncompleteGeneration => "incomplete_generation",
+            InferError::IncompleteGenerationStream => "incomplete_generation_stream",
             InferError::TemplateError(_) => "template_error",
             InferError::MissingTemplateVariable(_) => "missing_template_variable",
             InferError::ToolError(_) => "tool_error",
diff --git a/router/src/server.rs b/router/src/server.rs
index 6a4316bd..913f2011 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -540,6 +540,7 @@ async fn generate_stream_internal(
     // Inference
     let mut end_reached = false;
     let mut error = false;
+    let mut index = 0;

     let mut add_prompt = None;
     if req.parameters.return_full_text.unwrap_or(false) {
@@ -562,7 +563,6 @@ async fn generate_stream_internal(
         match infer.generate_stream(req).instrument(info_span!(parent: &span, "async_stream")).await {
             // Keep permit as long as generate_stream lives
             Ok((_permit, input_length, response_stream)) => {
-                let mut index = 0;
                 let mut response_stream = Box::pin(response_stream);
                 // Server-Sent Event stream
                 while let Some(response) = response_stream.next().await {
@@ -677,8 +677,9 @@ async fn generate_stream_internal(
         // Check if generation reached the end
         // Skip if we already sent an error
         if !end_reached && !error {
-            let err = InferError::IncompleteGeneration;
+            let err = InferError::IncompleteGenerationStream;
             metrics::counter!("tgi_request_failure", "err" => "incomplete").increment(1);
+            tracing::info!("n iterations {index}");
             tracing::error!("{err}");
             yield Ok(Event::from(err));
         }
@@ -2558,6 +2559,7 @@ impl From<InferError> for (StatusCode, Json<ErrorResponse>) {
             InferError::Overloaded(_) => StatusCode::TOO_MANY_REQUESTS,
             InferError::ValidationError(_) => StatusCode::UNPROCESSABLE_ENTITY,
             InferError::IncompleteGeneration => StatusCode::INTERNAL_SERVER_ERROR,
+            InferError::IncompleteGenerationStream => StatusCode::INTERNAL_SERVER_ERROR,
             InferError::TemplateError(_) => StatusCode::UNPROCESSABLE_ENTITY,
             InferError::MissingTemplateVariable(_) => StatusCode::UNPROCESSABLE_ENTITY,
             InferError::ToolError(_) => StatusCode::UNPROCESSABLE_ENTITY,
diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index 0ac55b42..d7308041 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ 
b/server/text_generation_server/models/flash_causal_lm.py @@ -515,6 +515,7 @@ class FlashCausalLMBatch(Batch): dtype: torch.dtype, device: torch.device, ) -> "FlashCausalLMBatch": + assert len(pb.requests) > 0 batch_tokenized_inputs = cls.batch_tokenized_inputs(pb.requests, tokenizer) return cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device) @@ -640,6 +641,7 @@ class FlashCausalLMBatch(Batch): adapter_segments = torch.tensor( adapter_segments, dtype=torch.int32, device=device ) + # assert sum(len(b) for b in block_tables) == (block_tables_tensor != 0).sum() return type(self)( batch_id=self.batch_id, @@ -834,6 +836,8 @@ class FlashCausalLMBatch(Batch): start_slots = torch.concat(start_slots) + # assert sum(len(b) for b in block_tables) == (block_tables_tensor != 0).sum() + next_token_chooser = HeterogeneousNextTokenChooser.from_pb( next_token_chooser_parameters, dtype=batches[0].next_token_chooser.dtype, @@ -1083,12 +1087,12 @@ class FlashCausalLM(Model): if ATTENTION in {"flashdecoding", "flashinfer"}: self.kv_cache = [ ( - torch.empty( + torch.zeros( (num_blocks, BLOCK_SIZE, num_heads, head_size), dtype=dtype, device=device, ), - torch.empty( + torch.zeros( (num_blocks, BLOCK_SIZE, num_heads, head_size), dtype=dtype, device=device, @@ -1099,12 +1103,12 @@ class FlashCausalLM(Model): elif SYSTEM == "ipex" and device == torch.device("cpu"): self.kv_cache = [ ( - torch.empty( + torch.zeros( (num_blocks, num_heads, BLOCK_SIZE, head_size), dtype=dtype, device=device, ), - torch.empty( + torch.zeros( (num_blocks, num_heads, BLOCK_SIZE, head_size), dtype=dtype, device=device, @@ -1150,20 +1154,6 @@ class FlashCausalLM(Model): input_lengths=input_lengths, prefix_lens=prefix_lengths, ) - - self.cuda_graphs[bs] = { - "input_ids": input_ids, - "position_ids": position_ids, - "kv_cache": self.kv_cache, - "block_tables": block_tables, - "slots": slots, - "input_lengths": input_lengths_tensor, - "prefix_lengths": prefix_lengths_tensor, - } - graph = torch.cuda.CUDAGraph() - self.cuda_graphs[bs]["graph"] = graph - - if ATTENTION == "flashinfer": from text_generation_server.layers.attention.flashinfer import ( create_decode_state_cuda_graphs, ) @@ -1180,19 +1170,29 @@ class FlashCausalLM(Model): num_heads=self.num_heads, num_kv_heads=self.num_kv_heads, ) - self.cuda_graphs[bs]["state"] = state else: state = None + graph = torch.cuda.CUDAGraph() + self.cuda_graphs[bs] = { + "input_ids": input_ids, + "position_ids": position_ids, + "kv_cache": self.kv_cache, + "block_tables": block_tables, + "slots": slots, + "input_lengths": input_lengths_tensor, + "prefix_lengths": prefix_lengths_tensor, + "state": state, + "graph": graph, + } + torch.cuda.synchronize() # Run once outside to warmup with self._forward_context( block_tables=block_tables, cu_seqlen_prefill=None, - input_lengths=input_lengths, input_lengths_tensor=input_lengths_tensor, state=state, - prefix_lens=prefix_lengths, prefix_lens_tensor=prefix_lengths_tensor, ): seqlen = Seqlen( @@ -1214,6 +1214,7 @@ class FlashCausalLM(Model): prefill_cache_indices=None, lm_head_indices=None, ) + del seqlen torch.cuda.synchronize() @@ -1479,9 +1480,7 @@ class FlashCausalLM(Model): with self._forward_context( block_tables=block_tables, cu_seqlen_prefill=cu_seqlen_prefill, - input_lengths=batch.input_lengths, - input_lengths_tensor=input_lengths + prefix_lens_tensor, - prefix_lens=batch.prefix_lens, + input_lengths_tensor=input_lengths, prefix_lens_tensor=prefix_lens_tensor, ): max_k = (input_lengths + prefix_lens_tensor).max().item() @@ 
-1519,12 +1518,27 @@ class FlashCausalLM(Model): input_lengths=batch.input_lengths, prefix_lens=batch.prefix_lens, ) + # assert block_tables.shape[0] >= slots.shape[0] cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables - else: - cuda_graph["block_tables"][ - : block_tables.shape[0], : block_tables.shape[1] - ] = block_tables - cuda_graph["slots"].fill_(-1) + page_size = BLOCK_SIZE + indptr = torch.zeros( + input_lengths.shape[0] + 1, + device=input_lengths.device, + dtype=torch.int32, + ) + # Round up to page size and then calculate the cumulative sum to get + # the indices into the block table. + torch.add(input_lengths, page_size - 1, out=indptr[1:]) + indptr[1:].div_(page_size, rounding_mode="floor") + indptr[1:].cumsum_(-1) + # Get the lengths of the last page in a block. + last_page_len = torch.empty( + input_lengths.shape[0], dtype=torch.int32, device=input_lengths.device + ) + torch.sub(input_lengths, 1, out=last_page_len) + last_page_len.remainder_(page_size) + last_page_len += 1 + cuda_graph["slots"].fill_(0) cuda_graph["slots"][: slots.shape[0]] = slots cuda_graph["input_lengths"].zero_() cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths @@ -1534,11 +1548,9 @@ class FlashCausalLM(Model): with self._forward_context( block_tables=cuda_graph["block_tables"], cu_seqlen_prefill=None, - input_lengths=batch.input_lengths, input_lengths_tensor=cuda_graph["input_lengths"], - prefix_lens=batch.prefix_lens, prefix_lens_tensor=cuda_graph["prefix_lengths"], - state=cuda_graph.get("state"), + state=cuda_graph["state"], ): # Replay the graph cuda_graph["graph"].replay() @@ -1767,7 +1779,7 @@ class FlashCausalLM(Model): left = 0 if n_accepted_ids > 1: - log_master(logger.debug, f"Speculated ids {n_accepted_ids - 1}") + log_master(logger.debug, f"speculated ids {n_accepted_ids - 1}") current_stopped = False for j in range(index, index + n_accepted_ids): @@ -1886,6 +1898,8 @@ class FlashCausalLM(Model): top_tokens, ) + # assert all(n is not None for n in next_token_texts) + generations.append(generation) # accept each new token for this specific request since we may @@ -1922,9 +1936,7 @@ class FlashCausalLM(Model): *, block_tables: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], - input_lengths: List[int], input_lengths_tensor: torch.Tensor, - prefix_lens: List[int], prefix_lens_tensor: torch.Tensor, state: Optional[Any] = None, ) -> ContextManager: @@ -1950,7 +1962,7 @@ class FlashCausalLM(Model): # ), block_tables=block_tables, cu_seqlens=cu_seqlen_prefill, - input_lengths=input_lengths_tensor, + input_lengths=input_lengths_tensor + prefix_lens_tensor, num_heads=self.num_heads, num_kv_heads=self.num_kv_heads, head_size=self.head_size, @@ -1960,7 +1972,7 @@ class FlashCausalLM(Model): assert input_lengths_tensor is not None return use_decode_state( state=state if state is not None else self.decode_state, - input_lengths=input_lengths_tensor, + input_lengths=input_lengths_tensor + prefix_lens_tensor, block_tables=block_tables, num_heads=self.num_heads, num_kv_heads=self.num_kv_heads,
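
Note on the paging arithmetic added to flash_causal_lm.py above: `indptr` is an
exclusive prefix sum of per-sequence page counts (ceil of sequence length over
page size), so sequence i owns block-table rows indptr[i]..indptr[i+1], and
`last_page_len` maps lengths onto 1..page_size rather than 0..page_size-1,
which is what flashinfer-style layouts expect for the final, partially filled
page. A standalone sketch of the same arithmetic, assuming a page size of 16
and made-up sequence lengths (neither value comes from the patch):

    import torch

    page_size = 16
    input_lengths = torch.tensor([1, 16, 17, 40], dtype=torch.int32)

    # Pages needed per sequence: ceil(length / page_size).
    pages = (input_lengths + page_size - 1) // page_size    # [1, 1, 2, 3]

    # Exclusive prefix sum: sequence i uses block-table rows
    # indptr[i] .. indptr[i+1].
    indptr = torch.zeros(input_lengths.shape[0] + 1, dtype=torch.int32)
    indptr[1:] = torch.cumsum(pages, dim=0)                 # [0, 1, 2, 4, 7]

    # Tokens occupying each sequence's last page, in 1..page_size.
    last_page_len = (input_lengths - 1) % page_size + 1     # [1, 16, 1, 8]

The in-place variant in the patch (torch.add with out=, div_ with floor
rounding, cumsum_, remainder_) computes the same values without allocating
intermediates.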
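Note on the `_forward_context` signature change: with prefix caching, the
attention state has to be sized to the total KV length of each sequence,
cached prefix plus newly supplied tokens, which is presumably why the
redundant Python-list `input_lengths`/`prefix_lens` arguments are dropped and
the two tensors are summed at the point of use. A minimal sketch of the
invariant, with made-up values:

    import torch

    input_lengths_tensor = torch.tensor([5, 12])  # tokens fed in this pass
    prefix_lens_tensor = torch.tensor([32, 0])    # tokens already in the KV cache
    # What the attention state must cover per sequence:
    total_kv_lengths = input_lengths_tensor + prefix_lens_tensor  # [37, 12]

The switch from torch.empty to torch.zeros for the KV cache allocation looks
related: zero-filled blocks make reads of never-written slots deterministic,
which is easier to debug than uninitialized memory when a block-table entry
is wrong.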