mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-12 04:44:52 +00:00
Fix bug where TunableOp is bound to CUDA graphs even when CUDA graphs are disabled
This commit is contained in:
parent
35d1946e67
commit
c36c7ec83b
@@ -907,8 +907,11 @@ class FlashCausalLM(Model):
                     int(val)
                     for val in os.environ["PYTORCH_TUNABLEOP_SEQLENS"].split(",")
                 ]
-            else:
+            elif CUDA_GRAPHS is not None:
                 tuning_sequences = CUDA_GRAPHS
+            else:
+                # For seqlen = 1, we dispatch to LLMM1 kernel.
+                tuning_sequences = [2, 3, 4, 5, 6, 7]
 
             tunableop_filepath = os.path.join(
                 HUGGINGFACE_HUB_CACHE,
Loading…
Reference in New Issue
Block a user