mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 12:24:53 +00:00
Adding assertion.
This commit is contained in:
parent
b4c5ca5a58
commit
45eb84e4b6
@@ -1416,6 +1416,10 @@ class FlashCausalLM(Model):
                 max_current_length=max_s,
             )
         else:
+            if bs > max_bs:
+                raise RuntimeError(
+                    "Cuda graphs should be generated in decreasing order size to reduce VRAM usage"
+                )
             input_ids = self.cuda_graphs[max_bs]["input_ids"][:bs]
             position_ids = self.cuda_graphs[max_bs]["position_ids"][:bs]
             if ATTENTION == "flashinfer":
Loading…
Reference in New Issue
Block a user