From 1c7c21d596a2904b205db5adeb67f5feee6096c0 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Mon, 1 Jul 2024 16:37:36 +0000
Subject: [PATCH] No need to recreate anything actually.

---
 server/text_generation_server/layers/attention/common.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/server/text_generation_server/layers/attention/common.py b/server/text_generation_server/layers/attention/common.py
index ce9f5f32..bd0717ce 100644
--- a/server/text_generation_server/layers/attention/common.py
+++ b/server/text_generation_server/layers/attention/common.py
@@ -31,7 +31,8 @@ if FLASH_DECODING:
             self.cu_seqlen_k = cu_seqlen_k
 
         def clamp(self, max):
-            return Seqlen(torch.clamp(self.input_lengths, max=max))
+            # Flash decoding doesn't need to clamp
+            return self
 
 else:
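
For context, here is a minimal standalone sketch contrasting the two clamp behaviors the hunk implies: the pre-patch path rebuilt a Seqlen with clamped lengths, while the post-patch flash-decoding path reuses the existing object unchanged. The class bodies below are assumptions for illustration (the real Seqlen in common.py also carries cu_seqlen_q/cu_seqlen_k buffers); only the two clamp implementations are taken from the diff itself.

from dataclasses import dataclass

import torch


@dataclass
class Seqlen:
    """Assumed skeleton of the non-flash-decoding Seqlen."""

    input_lengths: torch.Tensor

    def clamp(self, max):
        # Pre-patch behavior: allocate a new Seqlen whose per-sequence
        # lengths are capped at `max`.
        return Seqlen(torch.clamp(self.input_lengths, max=max))


@dataclass
class FlashDecodingSeqlen:
    """Assumed skeleton of the flash-decoding Seqlen after this patch."""

    input_lengths: torch.Tensor

    def clamp(self, max):
        # Post-patch behavior: flash decoding doesn't need to clamp, so the
        # same object (and any buffers it owns) is returned as-is instead of
        # being recreated.
        return self


lengths = torch.tensor([3, 17, 9], dtype=torch.int32)
print(Seqlen(lengths).clamp(max=8).input_lengths)  # tensor([3, 8, 8], dtype=torch.int32)
fd = FlashDecodingSeqlen(lengths)
print(fd.clamp(max=8) is fd)  # True: same object, lengths untouched

Returning self avoids reallocating the cumulative-sequence-length buffers on every clamp call, which matches the commit subject: nothing needs to be recreated on the flash-decoding path.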