Reducing number of reps while autotuning.

2025-07-08 00:40:16 +00:00 · 2023-06-06 11:56:10 +00:00 · 2023-06-06 11:56:10 +00:00 · fb0840944c
commit fb0840944c
parent 7de104b7f6
1 changed file with 1 addition and 1 deletion
--- a/server/text_generation_server/quant/custom_autotune.py
+++ b/server/text_generation_server/quant/custom_autotune.py
@@ -69,7 +69,7 @@ class Autotuner(triton.KernelInterface):
        try:
            # In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses
            # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default
-            return triton.testing.do_bench(kernel_call, percentiles=(0.5, 0.2, 0.8), rep=40)
+            return triton.testing.do_bench(kernel_call, percentiles=(0.5, 0.2, 0.8), rep=10)
        except triton.compiler.OutOfResources:
            return (float('inf'), float('inf'), float('inf'))