Adding assertion.

2025-09-11 12:24:53 +00:00 · 2024-12-02 19:40:57 +01:00 · 2024-12-02 19:40:57 +01:00 · 45eb84e4b6
commit 45eb84e4b6
parent b4c5ca5a58
1 changed file with 4 additions and 0 deletions
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -1416,6 +1416,10 @@ class FlashCausalLM(Model):
                    max_current_length=max_s,
                )
        else:
+            if bs > max_bs:
+                raise RuntimeError(
+                    "Cuda graphs should be generated in decreasing order size to reduce VRAM usage"
+                )
            input_ids = self.cuda_graphs[max_bs]["input_ids"][:bs]
            position_ids = self.cuda_graphs[max_bs]["position_ids"][:bs]
            if ATTENTION == "flashinfer":