From 15e5df1cc451dfd6e2fd45b25b97d9c148a513dc Mon Sep 17 00:00:00 2001 From: BaihuiJin Date: Tue, 16 Jul 2024 15:42:46 +0800 Subject: [PATCH] Round batch size up to BATCH_BUCKET_SIZE to avoid re-capturing the graph when the graph input does not change (#185) --- server/text_generation_server/models/causal_lm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py index ad2270ab..32c13daf 100644 --- a/server/text_generation_server/models/causal_lm.py +++ b/server/text_generation_server/models/causal_lm.py @@ -966,9 +966,9 @@ class CausalLM(Model): batch = batch.__class__.recombine([batch], self.tokenizer.pad_token_id) scenario = 'PREFILL' if prefill else 'GENERATE' - if self.enable_hpu_graph and batch.batch_size != self.prev_bs: + if self.enable_hpu_graph and self.limit_hpu_graph and round_up(batch.batch_size, BATCH_BUCKET_SIZE) != self.prev_bs: self.model.clear_cache() - self.prev_bs = batch.batch_size + self.prev_bs = round_up(batch.batch_size, BATCH_BUCKET_SIZE) dbg_trace( scenario, f'bs:{batch.batch_size} num_reqs:{len(batch.requests)} seq_len:{batch.seq_length} padding:{batch.right_padding}') assert batch.right_padding > 0, 'No more room for next token!'