diff --git a/docs/source/basic_tutorials/monitoring.md b/docs/source/basic_tutorials/monitoring.md
index a24cf902..d6e50cfd 100644
--- a/docs/source/basic_tutorials/monitoring.md
+++ b/docs/source/basic_tutorials/monitoring.md
@@ -72,4 +72,4 @@ Once Prometheus data source is configured, we can finally create our dashboard!
 
 Community contributed dashboard templates are also available, for example [here](https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/) or [here](https://grafana.com/grafana/dashboards/20246-text-generation-inference/).
 
-Load your dashboard configuration, and your TGI dashboard should be ready to go!
\ No newline at end of file
+Load your dashboard configuration, and your TGI dashboard should be ready to go!
diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index 333efe33..e18b885f 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -827,7 +827,7 @@ class FlashCausalLM(Model):
             self.device,
         )
 
-        if SYSTEM == "rocm":
+        if SYSTEM == "rocm" and (self.speculate is None or self.speculate == 0):
             if (
                 os.environ.get("PYTORCH_TUNABLEOP_ENABLED") is None
                 or os.environ.get("PYTORCH_TUNABLEOP_ENABLED") == "1"
@@ -875,7 +875,11 @@ class FlashCausalLM(Model):
                 logger.info(f"Cuda Graphs are enabled for sizes {CUDA_GRAPHS}")
                 # Warmup cuda graphs
                 for bs in CUDA_GRAPHS:
-                    if self.speculate is None or self.speculate + 1 <= bs:
+                    if (
+                        self.speculate is None
+                        or self.speculate == 0
+                        or self.speculate + 1 <= bs
+                    ):
                         self.cuda_graph_warmup(bs, max_s, max_bt)
         except torch.cuda.OutOfMemoryError:
             logger.exception(f"Decode cuda graph warmup failed")
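
A note on the gating added in the first `flash_causal_lm.py` hunk: Python's `and` binds tighter than `or`, so a condition written without parentheses as `SYSTEM == "rocm" and self.speculate is None or self.speculate == 0` parses as `(SYSTEM == "rocm" and self.speculate is None) or (self.speculate == 0)`, which would take the TunableOp branch on non-ROCm systems whenever `speculate == 0`. The hunk above therefore parenthesizes the speculation check. The following minimal sketch is standalone, not TGI code; `SYSTEM` and `speculate` are plain variables standing in for the attributes used in the patch, with illustrative values:

```python
# Standalone illustration of the operator-precedence pitfall addressed above.
SYSTEM = "cuda"  # not a ROCm system
speculate = 0    # speculative decoding disabled

# Unparenthesized: `and` binds tighter than `or`, so this evaluates as
# (SYSTEM == "rocm" and speculate is None) or (speculate == 0) -> True
buggy = SYSTEM == "rocm" and speculate is None or speculate == 0

# Parenthesized, matching the intent: ROCm AND no speculation -> False here
fixed = SYSTEM == "rocm" and (speculate is None or speculate == 0)

print(buggy, fixed)  # True False: only the second form actually gates on ROCm
```

The second hunk is independent of this: adding the `self.speculate == 0` clause lets CUDA graph warmup run for every captured batch size when speculation is explicitly disabled, rather than relying solely on the `self.speculate + 1 <= bs` check.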