From 55d984d730f2da4e736eb429f16a2aa06afabfb0 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Tue, 27 Aug 2024 15:00:22 +0200
Subject: [PATCH] Fixed flashinfer version.

---
 server/text_generation_server/models/flash_causal_lm.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index 265255dd1..22e0ada18 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -1469,8 +1469,6 @@ class FlashCausalLM(Model):
             cuda_graph = None
 
         if cu_seqlen_prefill is not None or cuda_graph is None:
-            # TODO
-            # input_lengths = input_lengths + prefix_lens_tensor
             if ATTENTION == "flashinfer":
                 block_tables = block_tables_to_ragged(
                     block_tables=block_tables,
@@ -1481,7 +1479,7 @@ class FlashCausalLM(Model):
                 block_tables=block_tables,
                 cu_seqlen_prefill=cu_seqlen_prefill,
                 input_lengths=batch.input_lengths,
-                input_lengths_tensor=input_lengths,
+                input_lengths_tensor=input_lengths + prefix_lens_tensor,
                 prefix_lens=batch.prefix_lens,
                 prefix_lens_tensor=prefix_lens_tensor,
             ):
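
For context on the change itself: the deleted TODO recorded that input_lengths still needed prefix_lens_tensor added before being handed to the attention forward context, and the one-line fix performs that sum directly at the call site. A minimal sketch of the arithmetic, assuming small illustrative tensors (the variable names mirror the diff; the values are made up):

import torch

# Hypothetical per-request lengths, mirroring the variables in the diff.
input_lengths = torch.tensor([5, 3, 7], dtype=torch.int32)        # new tokens this step
prefix_lens_tensor = torch.tensor([12, 0, 4], dtype=torch.int32)  # tokens already in the KV cache

# The fix: the forward context needs the *total* sequence length per
# request, so the cached prefix lengths are added to the input lengths.
input_lengths_tensor = input_lengths + prefix_lens_tensor
print(input_lengths_tensor)  # tensor([17,  3, 11], dtype=torch.int32)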