From c36c7ec83be716aa33a156833400e3b922d8d6c3 Mon Sep 17 00:00:00 2001
From: fxmarty <9808326+fxmarty@users.noreply.github.com>
Date: Thu, 6 Jun 2024 13:53:43 +0000
Subject: [PATCH] fix bug where tunableop is bound to cuda graph even when cuda
 graph are disabled

---
 server/text_generation_server/models/flash_causal_lm.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index d8c8838c..b46707b3 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -907,8 +907,11 @@ class FlashCausalLM(Model):
                         int(val)
                         for val in os.environ["PYTORCH_TUNABLEOP_SEQLENS"].split(",")
                     ]
-                else:
+                elif CUDA_GRAPHS is not None:
                     tuning_sequences = CUDA_GRAPHS
+                else:
+                    # For seqlen = 1, we dispatch to LLMM1 kernel.
+                    tuning_sequences = [2, 3, 4, 5, 6, 7]
 
                 tunableop_filepath = os.path.join(
                     HUGGINGFACE_HUB_CACHE,