From d57b7091aa444f5be560df8dd4b1ecf38aca4d2f Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Tue, 10 Sep 2024 10:24:56 +0200
Subject: [PATCH] Are we done yet?

---
 backends/v3/src/backend.rs | 3 +-
 backends/v3/src/queue.rs | 2 +-
 backends/v3/src/radix.rs | 2 +-
 ...t_flash_llama_completion_many_prompts.json | 20 +-
 ..._llama_completion_many_prompts_stream.json | 932 +++++++++---------
 ..._flash_llama_completion_single_prompt.json | 10 +-
 .../models/test_completion_prompts.py | 38 +-
 launcher/src/main.rs | 3 +-
 router/src/infer/mod.rs | 3 +
 router/src/server.rs | 6 +-
 .../models/flash_causal_lm.py | 86 +-
 11 files changed, 571 insertions(+), 534 deletions(-)

diff --git a/backends/v3/src/backend.rs b/backends/v3/src/backend.rs
index 935f7980..f8a10ca2 100644
--- a/backends/v3/src/backend.rs
+++ b/backends/v3/src/backend.rs
@@ -376,10 +376,9 @@ fn filter_send_generations(generations: Vec<Generation>, entries: &mut IntMap<u64, Entry>) {
         // Send generation responses back to the infer task
         // If we receive an error from the Flume channel, it means that the client dropped the
         // request and we need to stop generating hence why we unwrap_or(true)
-        let stopped = send_responses(generation, entry).map_err(|err| {
+        let stopped = send_responses(generation, entry).inspect_err(|_err| {
             tracing::error!("Entry response channel error.");
             metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
-            err
         }).unwrap_or(true);
         if stopped {
             entries.remove(&id).expect("ID not found in entries. This is a bug.");
diff --git a/backends/v3/src/queue.rs b/backends/v3/src/queue.rs
index 978a495c..c36afefa 100644
--- a/backends/v3/src/queue.rs
+++ b/backends/v3/src/queue.rs
@@ -366,7 +366,7 @@ impl State {
                     break;
                 }
                 Some(block_allocation) => {
-                    tracing::debug!("Allocation: {block_allocation:?}");
+                    // tracing::debug!("Allocation: {block_allocation:?}");
                     max_blocks = max(max_blocks, block_allocation.blocks.len() as u32);
                     Some(block_allocation)
                 }
diff --git a/backends/v3/src/radix.rs b/backends/v3/src/radix.rs
index 1f3bef15..db6028ac 100644
--- a/backends/v3/src/radix.rs
+++ b/backends/v3/src/radix.rs
@@ -123,7 +123,7 @@ impl Allocator for RadixAllocator {
            prefill_tokens: prefill_tokens.clone(),
        };

-        tracing::debug!("Blocks {blocks:?}");
+        // tracing::debug!("Blocks {blocks:?}");

        self.allocation_id += 1;
        self.allocations.insert(self.allocation_id, allocation);
diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json
index 9f3faffc..abbc29ac 100644
--- a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json
+++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json
@@ -1,38 +1,38 @@
 {
   "choices": [
     {
-      "finish_reason": "stop",
+      "finish_reason": "length",
       "index": 1,
       "logprobs": null,
-      "text": " PR for more information?"
+ "text": " This is a question that has puzzled many people for" }, { "finish_reason": "length", "index": 3, "logprobs": null, - "text": "hd20220811-" + "text": "usculas_minusculas(s):\n \"\"\"\n" }, { "finish_reason": "length", "index": 0, "logprobs": null, - "text": "le Business Incubator is providing a workspace" + "text": " A Beginner’s Guide\nDeep learning is a subset" }, { "finish_reason": "length", "index": 2, "logprobs": null, - "text": " severely flawed and often has a substandard" + "text": " Paris\nWhat is the capital of France?\nThe" } ], - "created": 1722014725, + "created": 1725877154, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native", "usage": { - "completion_tokens": 36, - "prompt_tokens": 8, - "total_tokens": 44 + "completion_tokens": 40, + "prompt_tokens": 22, + "total_tokens": 62 } } diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json index 8e3c8f4d..dd22ceae 100644 --- a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json @@ -5,12 +5,12 @@ "finish_reason": "", "index": 0, "logprobs": null, - "text": "\n" + "text": " A" } ], - "created": 1725874430, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -20,12 +20,72 @@ "finish_reason": "", "index": 1, "logprobs": null, - "text": "\n" + "text": " This" } ], - "created": 1725874430, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 2, + "logprobs": null, + "text": " Paris" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 3, + "logprobs": null, + "text": "us" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 0, + "logprobs": null, + "text": " Beginner" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 1, + "logprobs": null, + "text": " is" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -38,9 +98,9 @@ "text": "\n" } ], - "created": 1725874430, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -50,12 +110,12 @@ 
"finish_reason": "", "index": 3, "logprobs": null, - "text": "hd" + "text": "cul" } ], - "created": 1725874430, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -65,12 +125,12 @@ "finish_reason": "", "index": 0, "logprobs": null, - "text": "What" + "text": "’s" } ], - "created": 1725874430, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -80,447 +140,27 @@ "finish_reason": "", "index": 1, "logprobs": null, - "text": "rig" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": "\n" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "\n" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " Business" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " Business" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": "2" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "2" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": ":" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " Process" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": "0" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "0" - } - ], - "created": 1725874430, - "id": "", - "model": 
"TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " And" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " And" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": "2" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "2" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " Stock" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " Stock" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": " Stock" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "0" - } - ], - "created": 1725874430, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " Moh" - } - ], - "created": 1725874431, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " ," - } - ], - "created": 1725874431, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": " Moh" - } - ], - "created": 1725874431, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "7" - } - ], - "created": 1725874431, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": "s" - } - ], - "created": 1725874431, - "id": "", - "model": 
"TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": "s" - } - ], - "created": 1725874431, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": "s" - } - ], - "created": 1725874431, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "\n" - } - ], - "created": 1725874431, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": "`" - } - ], - "created": 1725874431, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": "," - } - ], - "created": 1725874431, - "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, "text": " a" } ], - "created": 1725874431, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 2, + "logprobs": null, + "text": "What" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -530,12 +170,372 @@ "finish_reason": "", "index": 3, "logprobs": null, - "text": "R" + "text": "as" } ], - "created": 1725874431, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 0, + "logprobs": null, + "text": " Guide" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 1, + "logprobs": null, + "text": " question" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 2, + "logprobs": null, + "text": " is" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 3, + "logprobs": null, + "text": "_minus" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + 
"finish_reason": "", + "index": 0, + "logprobs": null, + "text": "\n" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 1, + "logprobs": null, + "text": " that" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 2, + "logprobs": null, + "text": " the" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 3, + "logprobs": null, + "text": "cul" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 0, + "logprobs": null, + "text": "Deep" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 1, + "logprobs": null, + "text": " has" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 2, + "logprobs": null, + "text": " capital" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 3, + "logprobs": null, + "text": "as" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 0, + "logprobs": null, + "text": " learning" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 1, + "logprobs": null, + "text": " puzzled" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 2, + "logprobs": null, + "text": " of" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 3, + "logprobs": null, + "text": "(s" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 0, + "logprobs": null, + "text": " is" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, 
+ { + "choices": [ + { + "finish_reason": "", + "index": 1, + "logprobs": null, + "text": " many" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 2, + "logprobs": null, + "text": " France" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 3, + "logprobs": null, + "text": "):\n" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 0, + "logprobs": null, + "text": " a" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 1, + "logprobs": null, + "text": " people" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 2, + "logprobs": null, + "text": "?\n" + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "object": "text_completion", + "system_fingerprint": "2.2.1-dev0-native" + }, + { + "choices": [ + { + "finish_reason": "", + "index": 3, + "logprobs": null, + "text": " " + } + ], + "created": 1725883643, + "id": "", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -545,12 +545,12 @@ "finish_reason": "length", "index": 0, "logprobs": null, - "text": "('\\" + "text": " subset" } ], - "created": 1725874431, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -560,12 +560,12 @@ "finish_reason": "length", "index": 1, "logprobs": null, - "text": " And" + "text": " for" } ], - "created": 1725874431, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -575,12 +575,12 @@ "finish_reason": "length", "index": 2, "logprobs": null, - "text": " Service" + "text": "The" } ], - "created": 1725874431, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" }, @@ -590,12 +590,12 @@ "finish_reason": "length", "index": 3, "logprobs": null, - "text": "1" + "text": " \"\"\"\n" } ], - "created": 1725874431, + "created": 1725883643, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native" } diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json 
b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json index c4e37a01..7ad56271 100644 --- a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json @@ -4,17 +4,17 @@ "finish_reason": "length", "index": 0, "logprobs": null, - "text": "\n2.2 How" + "text": " A Beginner’s Guide\nDeep learning is a subset" } ], - "created": 1725874238, + "created": 1725876621, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "text_completion", "system_fingerprint": "2.2.1-dev0-native", "usage": { - "completion_tokens": 5, + "completion_tokens": 10, "prompt_tokens": 6, - "total_tokens": 11 + "total_tokens": 16 } } diff --git a/integration-tests/models/test_completion_prompts.py b/integration-tests/models/test_completion_prompts.py index 65f9a1a0..a3b6651d 100644 --- a/integration-tests/models/test_completion_prompts.py +++ b/integration-tests/models/test_completion_prompts.py @@ -11,7 +11,7 @@ from text_generation.types import ( @pytest.fixture(scope="module") def flash_llama_completion_handle(launcher): with launcher( - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "meta-llama/Meta-Llama-3.1-8B-Instruct", ) as handle: yield handle @@ -35,15 +35,18 @@ def test_flash_llama_completion_single_prompt( json={ "model": "tgi", "prompt": "What is Deep Learning?", - "max_tokens": 5, - "seed": 0, + "max_tokens": 10, + "temperature": 0.0, }, headers=flash_llama_completion.headers, stream=False, ) response = response.json() assert len(response["choices"]) == 1 - assert response["choices"][0]["text"] == "\n2.2 How" + assert ( + response["choices"][0]["text"] + == " A Beginner’s Guide\nDeep learning is a subset" + ) assert response == response_snapshot @@ -53,9 +56,15 @@ def test_flash_llama_completion_many_prompts(flash_llama_completion, response_sn f"{flash_llama_completion.base_url}/v1/completions", json={ "model": "tgi", - "prompt": ["Say", "this", "is", "a"], + "prompt": [ + "What is Deep Learning?", + "Is water wet?", + "What is the capital of France?", + "def mai", + ], "max_tokens": 10, "seed": 0, + "temperature": 0.0, }, headers=flash_llama_completion.headers, stream=False, @@ -63,9 +72,16 @@ def test_flash_llama_completion_many_prompts(flash_llama_completion, response_sn response = response.json() assert len(response["choices"]) == 4 - all_indexes = [choice["index"] for choice in response["choices"]] + all_indexes = [(choice["index"], choice["text"]) for choice in response["choices"]] all_indexes.sort() - assert all_indexes == [0, 1, 2, 3] + all_indices, all_strings = zip(*all_indexes) + assert list(all_indices) == [0, 1, 2, 3] + assert list(all_strings) == [ + " A Beginner’s Guide\nDeep learning is a subset", + " This is a question that has puzzled many people for", + " Paris\nWhat is the capital of France?\nThe", + 'usculas_minusculas(s):\n """\n', + ] assert response == response_snapshot @@ -84,6 +100,7 @@ async def test_flash_llama_completion_many_prompts_stream( ], "max_tokens": 10, "seed": 0, + "temperature": 0.0, "stream": True, } @@ -114,5 +131,10 @@ async def test_flash_llama_completion_many_prompts_stream( strings[index] += c["choices"][0]["text"] assert response.status == 200 - # assert strings == ["What Business: And Stock Mohs`('\\", '\nrig Business Process And Stock ,s, And', '\n\n202 Stock Mohs a Service', 
'hd\n20207\nR1']
+    assert list(strings) == [
+        " A Beginner’s Guide\nDeep learning is a subset",
+        " This is a question that has puzzled many people for",
+        " Paris\nWhat is the capital of France?\nThe",
+        'usculas_minusculas(s):\n    """\n',
+    ]
     assert chunks == response_snapshot
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index 8e5c9dcd..2cdccfe0 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -1843,9 +1843,8 @@ fn main() -> Result<(), LauncherError> {
         shutdown.clone(),
         &shutdown_receiver,
     )
-    .map_err(|err| {
+    .inspect_err(|_| {
         shutdown_shards(shutdown.clone(), &shutdown_receiver);
-        err
     })?;

     // Default exit code
diff --git a/router/src/infer/mod.rs b/router/src/infer/mod.rs
index 240282d9..4a2341da 100644
--- a/router/src/infer/mod.rs
+++ b/router/src/infer/mod.rs
@@ -336,6 +336,8 @@ pub enum InferError {
     ValidationError(#[from] ValidationError),
     #[error("Incomplete generation")]
     IncompleteGeneration,
+    #[error("Incomplete generation stream")]
+    IncompleteGenerationStream,
     #[error("Template error: {0}")]
     TemplateError(#[from] minijinja::Error),
     #[error("Missing template vatiable: {0}")]
@@ -351,6 +353,7 @@ impl InferError {
             InferError::Overloaded(_) => "overloaded",
             InferError::ValidationError(_) => "validation",
             InferError::IncompleteGeneration => "incomplete_generation",
+            InferError::IncompleteGenerationStream => "incomplete_generation_stream",
             InferError::TemplateError(_) => "template_error",
             InferError::MissingTemplateVariable(_) => "missing_template_variable",
             InferError::ToolError(_) => "tool_error",
diff --git a/router/src/server.rs b/router/src/server.rs
index 6a4316bd..913f2011 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -540,6 +540,7 @@ async fn generate_stream_internal(
     // Inference
     let mut end_reached = false;
     let mut error = false;
+    let mut index = 0;

     let mut add_prompt = None;
     if req.parameters.return_full_text.unwrap_or(false) {
@@ -562,7 +563,6 @@ async fn generate_stream_internal(
         match infer.generate_stream(req).instrument(info_span!(parent: &span, "async_stream")).await {
             // Keep permit as long as generate_stream lives
             Ok((_permit, input_length, response_stream)) => {
-                let mut index = 0;
                 let mut response_stream = Box::pin(response_stream);
                 // Server-Sent Event stream
                 while let Some(response) = response_stream.next().await {
@@ -677,8 +677,9 @@ async fn generate_stream_internal(
         // Check if generation reached the end
         // Skip if we already sent an error
         if !end_reached && !error {
-            let err = InferError::IncompleteGeneration;
+            let err = InferError::IncompleteGenerationStream;
             metrics::counter!("tgi_request_failure", "err" => "incomplete").increment(1);
+            tracing::info!("n iterations {index}");
             tracing::error!("{err}");
             yield Ok(Event::from(err));
         }
@@ -2558,6 +2559,7 @@ impl From<InferError> for (StatusCode, Json<ErrorResponse>) {
             InferError::Overloaded(_) => StatusCode::TOO_MANY_REQUESTS,
             InferError::ValidationError(_) => StatusCode::UNPROCESSABLE_ENTITY,
             InferError::IncompleteGeneration => StatusCode::INTERNAL_SERVER_ERROR,
+            InferError::IncompleteGenerationStream => StatusCode::INTERNAL_SERVER_ERROR,
             InferError::TemplateError(_) => StatusCode::UNPROCESSABLE_ENTITY,
             InferError::MissingTemplateVariable(_) => StatusCode::UNPROCESSABLE_ENTITY,
             InferError::ToolError(_) => StatusCode::UNPROCESSABLE_ENTITY,
diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index 0ac55b42..d7308041 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ 
b/server/text_generation_server/models/flash_causal_lm.py @@ -515,6 +515,7 @@ class FlashCausalLMBatch(Batch): dtype: torch.dtype, device: torch.device, ) -> "FlashCausalLMBatch": + assert len(pb.requests) > 0 batch_tokenized_inputs = cls.batch_tokenized_inputs(pb.requests, tokenizer) return cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device) @@ -640,6 +641,7 @@ class FlashCausalLMBatch(Batch): adapter_segments = torch.tensor( adapter_segments, dtype=torch.int32, device=device ) + # assert sum(len(b) for b in block_tables) == (block_tables_tensor != 0).sum() return type(self)( batch_id=self.batch_id, @@ -834,6 +836,8 @@ class FlashCausalLMBatch(Batch): start_slots = torch.concat(start_slots) + # assert sum(len(b) for b in block_tables) == (block_tables_tensor != 0).sum() + next_token_chooser = HeterogeneousNextTokenChooser.from_pb( next_token_chooser_parameters, dtype=batches[0].next_token_chooser.dtype, @@ -1083,12 +1087,12 @@ class FlashCausalLM(Model): if ATTENTION in {"flashdecoding", "flashinfer"}: self.kv_cache = [ ( - torch.empty( + torch.zeros( (num_blocks, BLOCK_SIZE, num_heads, head_size), dtype=dtype, device=device, ), - torch.empty( + torch.zeros( (num_blocks, BLOCK_SIZE, num_heads, head_size), dtype=dtype, device=device, @@ -1099,12 +1103,12 @@ class FlashCausalLM(Model): elif SYSTEM == "ipex" and device == torch.device("cpu"): self.kv_cache = [ ( - torch.empty( + torch.zeros( (num_blocks, num_heads, BLOCK_SIZE, head_size), dtype=dtype, device=device, ), - torch.empty( + torch.zeros( (num_blocks, num_heads, BLOCK_SIZE, head_size), dtype=dtype, device=device, @@ -1150,20 +1154,6 @@ class FlashCausalLM(Model): input_lengths=input_lengths, prefix_lens=prefix_lengths, ) - - self.cuda_graphs[bs] = { - "input_ids": input_ids, - "position_ids": position_ids, - "kv_cache": self.kv_cache, - "block_tables": block_tables, - "slots": slots, - "input_lengths": input_lengths_tensor, - "prefix_lengths": prefix_lengths_tensor, - } - graph = torch.cuda.CUDAGraph() - self.cuda_graphs[bs]["graph"] = graph - - if ATTENTION == "flashinfer": from text_generation_server.layers.attention.flashinfer import ( create_decode_state_cuda_graphs, ) @@ -1180,19 +1170,29 @@ class FlashCausalLM(Model): num_heads=self.num_heads, num_kv_heads=self.num_kv_heads, ) - self.cuda_graphs[bs]["state"] = state else: state = None + graph = torch.cuda.CUDAGraph() + self.cuda_graphs[bs] = { + "input_ids": input_ids, + "position_ids": position_ids, + "kv_cache": self.kv_cache, + "block_tables": block_tables, + "slots": slots, + "input_lengths": input_lengths_tensor, + "prefix_lengths": prefix_lengths_tensor, + "state": state, + "graph": graph, + } + torch.cuda.synchronize() # Run once outside to warmup with self._forward_context( block_tables=block_tables, cu_seqlen_prefill=None, - input_lengths=input_lengths, input_lengths_tensor=input_lengths_tensor, state=state, - prefix_lens=prefix_lengths, prefix_lens_tensor=prefix_lengths_tensor, ): seqlen = Seqlen( @@ -1214,6 +1214,7 @@ class FlashCausalLM(Model): prefill_cache_indices=None, lm_head_indices=None, ) + del seqlen torch.cuda.synchronize() @@ -1479,9 +1480,7 @@ class FlashCausalLM(Model): with self._forward_context( block_tables=block_tables, cu_seqlen_prefill=cu_seqlen_prefill, - input_lengths=batch.input_lengths, - input_lengths_tensor=input_lengths + prefix_lens_tensor, - prefix_lens=batch.prefix_lens, + input_lengths_tensor=input_lengths, prefix_lens_tensor=prefix_lens_tensor, ): max_k = (input_lengths + prefix_lens_tensor).max().item() @@ 
-1519,12 +1518,27 @@ class FlashCausalLM(Model): input_lengths=batch.input_lengths, prefix_lens=batch.prefix_lens, ) + # assert block_tables.shape[0] >= slots.shape[0] cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables - else: - cuda_graph["block_tables"][ - : block_tables.shape[0], : block_tables.shape[1] - ] = block_tables - cuda_graph["slots"].fill_(-1) + page_size = BLOCK_SIZE + indptr = torch.zeros( + input_lengths.shape[0] + 1, + device=input_lengths.device, + dtype=torch.int32, + ) + # Round up to page size and then calculate the cumulative sum to get + # the indices into the block table. + torch.add(input_lengths, page_size - 1, out=indptr[1:]) + indptr[1:].div_(page_size, rounding_mode="floor") + indptr[1:].cumsum_(-1) + # Get the lengths of the last page in a block. + last_page_len = torch.empty( + input_lengths.shape[0], dtype=torch.int32, device=input_lengths.device + ) + torch.sub(input_lengths, 1, out=last_page_len) + last_page_len.remainder_(page_size) + last_page_len += 1 + cuda_graph["slots"].fill_(0) cuda_graph["slots"][: slots.shape[0]] = slots cuda_graph["input_lengths"].zero_() cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths @@ -1534,11 +1548,9 @@ class FlashCausalLM(Model): with self._forward_context( block_tables=cuda_graph["block_tables"], cu_seqlen_prefill=None, - input_lengths=batch.input_lengths, input_lengths_tensor=cuda_graph["input_lengths"], - prefix_lens=batch.prefix_lens, prefix_lens_tensor=cuda_graph["prefix_lengths"], - state=cuda_graph.get("state"), + state=cuda_graph["state"], ): # Replay the graph cuda_graph["graph"].replay() @@ -1767,7 +1779,7 @@ class FlashCausalLM(Model): left = 0 if n_accepted_ids > 1: - log_master(logger.debug, f"Speculated ids {n_accepted_ids - 1}") + log_master(logger.debug, f"speculated ids {n_accepted_ids - 1}") current_stopped = False for j in range(index, index + n_accepted_ids): @@ -1886,6 +1898,8 @@ class FlashCausalLM(Model): top_tokens, ) + # assert all(n is not None for n in next_token_texts) + generations.append(generation) # accept each new token for this specific request since we may @@ -1922,9 +1936,7 @@ class FlashCausalLM(Model): *, block_tables: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], - input_lengths: List[int], input_lengths_tensor: torch.Tensor, - prefix_lens: List[int], prefix_lens_tensor: torch.Tensor, state: Optional[Any] = None, ) -> ContextManager: @@ -1950,7 +1962,7 @@ class FlashCausalLM(Model): # ), block_tables=block_tables, cu_seqlens=cu_seqlen_prefill, - input_lengths=input_lengths_tensor, + input_lengths=input_lengths_tensor + prefix_lens_tensor, num_heads=self.num_heads, num_kv_heads=self.num_kv_heads, head_size=self.head_size, @@ -1960,7 +1972,7 @@ class FlashCausalLM(Model): assert input_lengths_tensor is not None return use_decode_state( state=state if state is not None else self.decode_state, - input_lengths=input_lengths_tensor, + input_lengths=input_lengths_tensor + prefix_lens_tensor, block_tables=block_tables, num_heads=self.num_heads, num_kv_heads=self.num_kv_heads,
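
Note on the paging arithmetic added to flash_causal_lm.py above: `indptr` is an
exclusive prefix sum of per-sequence page counts (ceil of sequence length over
page size), so sequence i owns block-table rows indptr[i]..indptr[i+1], and
`last_page_len` maps lengths onto 1..page_size rather than 0..page_size-1,
which is what flashinfer-style layouts expect for the final, partially filled
page. A standalone sketch of the same arithmetic, assuming a page size of 16
and made-up sequence lengths (neither value comes from the patch):

    import torch

    page_size = 16
    input_lengths = torch.tensor([1, 16, 17, 40], dtype=torch.int32)

    # Pages needed per sequence: ceil(length / page_size).
    pages = (input_lengths + page_size - 1) // page_size    # [1, 1, 2, 3]

    # Exclusive prefix sum: sequence i uses block-table rows
    # indptr[i] .. indptr[i+1].
    indptr = torch.zeros(input_lengths.shape[0] + 1, dtype=torch.int32)
    indptr[1:] = torch.cumsum(pages, dim=0)                 # [0, 1, 2, 4, 7]

    # Tokens occupying each sequence's last page, in 1..page_size.
    last_page_len = (input_lengths - 1) % page_size + 1     # [1, 16, 1, 8]

The in-place variant in the patch (torch.add with out=, div_ with floor
rounding, cumsum_, remainder_) computes the same values without allocating
intermediates.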
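Note on the `_forward_context` signature change: with prefix caching, the
attention state has to be sized to the total KV length of each sequence,
cached prefix plus newly supplied tokens, which is presumably why the
redundant Python-list `input_lengths`/`prefix_lens` arguments are dropped and
the two tensors are summed at the point of use. A minimal sketch of the
invariant, with made-up values:

    import torch

    input_lengths_tensor = torch.tensor([5, 12])  # tokens fed in this pass
    prefix_lens_tensor = torch.tensor([32, 0])    # tokens already in the KV cache
    # What the attention state must cover per sequence:
    total_kv_lengths = input_lengths_tensor + prefix_lens_tensor  # [37, 12]

The switch from torch.empty to torch.zeros for the KV cache allocation looks
related: zero-filled blocks make reads of never-written slots deterministic,
which is easier to debug than uninitialized memory when a block-table entry
is wrong.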