diff --git a/backends/v3/src/backend.rs b/backends/v3/src/backend.rs index 935f7980..05a26370 100644 --- a/backends/v3/src/backend.rs +++ b/backends/v3/src/backend.rs @@ -168,8 +168,6 @@ pub(crate) async fn batching_task( None } else { // Minimum batch size - // TODO: temporarily disable to avoid incorrect deallocation + - // reallocation when using prefix caching. Some((batch_size as f32 * waiting_served_ratio).floor() as usize) }; diff --git a/backends/v3/src/radix.rs b/backends/v3/src/radix.rs index 1f3bef15..40b6a399 100644 --- a/backends/v3/src/radix.rs +++ b/backends/v3/src/radix.rs @@ -70,9 +70,13 @@ impl Allocator for RadixAllocator { ) -> Option { let mut blocks = vec![]; let prefix_node = if let Some(prefill_tokens) = prefill_tokens.as_ref() { - let node_id = self - .cache_blocks - .find(prefill_tokens.as_slice(), &mut blocks); + let node_id = self.cache_blocks.find( + &prefill_tokens.as_slice()[..prefill_tokens.len().saturating_sub(1)], + &mut blocks, + ); + // Even if this allocation fails below, we need to increase the + // refcount to ensure that the prefix that was found is not evicted. 
+ node_id } else { self.cache_blocks.root_id() @@ -89,8 +93,6 @@ impl Allocator for RadixAllocator { let suffix_blocks = (suffix_len + self.block_size - 1) / self.block_size; - tracing::info!("Prefix {prefix_len} - Suffix {suffix_len}"); - match self.alloc_or_reclaim(suffix_blocks as usize) { Some(suffix_blocks) => blocks.extend(suffix_blocks), None => { diff --git a/integration-tests/models/__snapshots__/test_flash_llama_prefix/test_flash_llama_load.json b/integration-tests/models/__snapshots__/test_flash_llama_prefix/test_flash_llama_load.json index b61eb1f9..dbf3c03a 100644 --- a/integration-tests/models/__snapshots__/test_flash_llama_prefix/test_flash_llama_load.json +++ b/integration-tests/models/__snapshots__/test_flash_llama_prefix/test_flash_llama_load.json @@ -14,7 +14,7 @@ "usage": null } ], - "created": 1725522218, + "created": 1725525936, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -40,7 +40,7 @@ "usage": null } ], - "created": 1725522218, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -66,7 +66,7 @@ "usage": null } ], - "created": 1725522218, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -92,7 +92,7 @@ "usage": null } ], - "created": 1725522218, + "created": 1725525935, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -118,7 +118,7 @@ "usage": null } ], - "created": 1725522218, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -144,7 +144,7 @@ "usage": null } ], - "created": 1725522218, + "created": 1725525941, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -170,7 +170,7 @@ "usage": null } ], - "created": 1725522218, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": 
"chat.completion", @@ -196,7 +196,7 @@ "usage": null } ], - "created": 1725522218, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -222,7 +222,7 @@ "usage": null } ], - "created": 1725522218, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -248,7 +248,7 @@ "usage": null } ], - "created": 1725522218, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -274,7 +274,7 @@ "usage": null } ], - "created": 1725522218, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -300,7 +300,7 @@ "usage": null } ], - "created": 1725522218, + "created": 1725525935, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -326,7 +326,7 @@ "usage": null } ], - "created": 1725522218, + "created": 1725525935, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -352,7 +352,7 @@ "usage": null } ], - "created": 1725522218, + "created": 1725525936, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -378,7 +378,7 @@ "usage": null } ], - "created": 1725522218, + "created": 1725525936, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -404,7 +404,7 @@ "usage": null } ], - "created": 1725522218, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -430,7 +430,7 @@ "usage": null } ], - "created": 1725522217, + "created": 1725525935, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -456,7 +456,7 @@ "usage": null } ], - "created": 1725522218, + "created": 1725525941, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -482,7 +482,7 @@ 
"usage": null } ], - "created": 1725522227, + "created": 1725525941, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -508,7 +508,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -534,7 +534,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -560,7 +560,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -586,7 +586,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525935, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -604,7 +604,7 @@ "index": 0, "logprobs": null, "message": { - "content": "/u/Cr!!!!!!!", + "content": "/u/CruxHub: Hey Alice, I", "name": null, "role": "assistant", "tool_calls": null @@ -612,7 +612,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -638,7 +638,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525936, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -664,7 +664,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -690,7 +690,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525941, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -716,7 +716,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525941, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -742,7 +742,7 @@ "usage": null } ], - "created": 
1725522227, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -768,7 +768,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525935, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -794,7 +794,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -820,7 +820,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -846,7 +846,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -872,7 +872,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525941, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -898,7 +898,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525935, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -924,7 +924,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -950,7 +950,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525936, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -976,7 +976,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525941, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1002,7 +1002,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1028,7 +1028,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525936, 
"id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1054,7 +1054,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525941, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1080,7 +1080,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525936, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1106,7 +1106,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1132,7 +1132,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1158,7 +1158,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1184,7 +1184,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525941, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1210,7 +1210,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525935, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1236,7 +1236,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1262,7 +1262,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1288,7 +1288,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1314,7 +1314,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": 
"meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1340,7 +1340,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525936, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1366,7 +1366,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1392,7 +1392,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1418,7 +1418,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1444,7 +1444,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525936, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1470,7 +1470,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525935, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1496,7 +1496,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1522,7 +1522,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1548,7 +1548,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1574,7 +1574,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525941, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1600,7 +1600,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": 
"meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1626,7 +1626,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525941, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1652,7 +1652,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525936, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1678,7 +1678,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525935, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1704,7 +1704,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525941, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1730,7 +1730,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525941, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1756,7 +1756,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1782,7 +1782,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1808,7 +1808,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1834,7 +1834,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525941, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1860,7 +1860,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1886,7 +1886,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": 
"meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1912,7 +1912,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525941, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1938,7 +1938,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1964,7 +1964,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -1990,7 +1990,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2016,7 +2016,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525936, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2042,7 +2042,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2068,7 +2068,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525941, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2094,7 +2094,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2120,7 +2120,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525935, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2146,7 +2146,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2172,7 +2172,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525936, "id": "", "model": 
"meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2198,7 +2198,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525935, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2224,7 +2224,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2250,7 +2250,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2276,7 +2276,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2302,7 +2302,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2328,7 +2328,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2354,7 +2354,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525941, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2380,7 +2380,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525936, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2406,7 +2406,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525941, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2432,7 +2432,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525943, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2458,7 +2458,7 @@ "usage": null } ], - "created": 1725522228, + "created": 1725525936, "id": "", "model": 
"meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2484,7 +2484,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525935, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2510,7 +2510,7 @@ "usage": null } ], - "created": 1725522228, + "created": 1725525941, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2536,7 +2536,7 @@ "usage": null } ], - "created": 1725522227, + "created": 1725525942, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", @@ -2562,7 +2562,7 @@ "usage": null } ], - "created": 1725522228, + "created": 1725525935, "id": "", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "chat.completion", diff --git a/integration-tests/models/test_flash_llama_prefix.py b/integration-tests/models/test_flash_llama_prefix.py index 89ec1d89..ae97b301 100644 --- a/integration-tests/models/test_flash_llama_prefix.py +++ b/integration-tests/models/test_flash_llama_prefix.py @@ -3,7 +3,7 @@ import pytest @pytest.fixture(scope="module") def flash_llama_handle(launcher): - with launcher("meta-llama/Meta-Llama-3.1-8B-Instruct", num_shard=4) as handle: + with launcher("meta-llama/Meta-Llama-3.1-8B-Instruct", num_shard=2) as handle: yield handle diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 9a60d06c..dc509a55 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -268,9 +268,6 @@ class FlashCausalLMBatch(Batch): assert ( prefix_len <= orig_input_length ), f"Prefix {prefix_len} vs input {orig_input_length}" - if prefix_len == orig_input_length: - assert prefix_len > 0 - prefix_len -= 1 prefix_ids.append(tokenized_input[:prefix_len]) tokenized_input = tokenized_input[prefix_len:]