diff --git a/server/text_generation_server/utils/gptq/custom_autotune.py b/server/text_generation_server/utils/gptq/custom_autotune.py index 589d89ef..1eb40f1e 100644 --- a/server/text_generation_server/utils/gptq/custom_autotune.py +++ b/server/text_generation_server/utils/gptq/custom_autotune.py @@ -88,7 +88,7 @@ class Autotuner(triton.KernelInterface): # In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default return triton.testing.do_bench( - kernel_call, percentiles=(0.5, 0.2, 0.8), rep=40 + kernel_call, quantiles=(0.5, 0.2, 0.8), rep=40 ) except triton.OutOfResources: return (float("inf"), float("inf"), float("inf"))