Fix bug where TunableOp is bound to CUDA graphs even when CUDA graphs are disabled

2025-09-11 20:34:54 +00:00 · 2024-06-06 13:53:43 +00:00 · 2024-06-06 13:53:43 +00:00 · c36c7ec83b
commit c36c7ec83b
parent 35d1946e67
1 changed files with 4 additions and 1 deletions
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -907,8 +907,11 @@ class FlashCausalLM(Model):
                        int(val)
                        for val in os.environ["PYTORCH_TUNABLEOP_SEQLENS"].split(",")
                    ]
-                else:
+                elif CUDA_GRAPHS is not None:
                    tuning_sequences = CUDA_GRAPHS
+                else:
+                    # For seqlen = 1, we dispatch to LLMM1 kernel.
+                    tuning_sequences = [2, 3, 4, 5, 6, 7]

                tunableop_filepath = os.path.join(
                    HUGGINGFACE_HUB_CACHE,