diff --git a/README.md b/README.md
index 0e2b32db..f7208bf8 100644
--- a/README.md
+++ b/README.md
@@ -74,8 +74,9 @@ Environment Variables Added:
 | --------------------------- | :--------- | :--------------- | :------------------------------------------------------------------------------------------------------------------------------- | :--------------------------- |
 | MAX_TOTAL_TOKENS            | integer    | 0                | Control the padding of input                                                                                                       | add -e in docker run, such   |
 | ENABLE_HPU_GRAPH            | true/false | true             | Enable hpu graph or not                                                                                                            | add -e in docker run command |
-| PROF_WARMUPSTEP             | integer    | 0                | Enable/disable profile, control profile warmup step, 0 means disable profile                                                       | add -e in docker run command |
-| PROF_STEP                   | integer    | 5                | Control profile step                                                                                                               | add -e in docker run command |
+| PROF_WAITSTEP               | integer    | 0                | Control profile wait steps                                                                                                         | add -e in docker run command |
+| PROF_WARMUPSTEP             | integer    | 0                | Control profile warmup steps                                                                                                       | add -e in docker run command |
+| PROF_STEP                   | integer    | 0                | Enable/disable profile, control profile active steps                                                                               | add -e in docker run command |
 | PROF_PATH                   | string     | /tmp/hpu_profile | Define profile folder                                                                                                              | add -e in docker run command |
 | PROF_RANKS                  | string     | 0                | Comma-separated list of ranks to profile                                                                                           | add -e in docker run command |
 | PROF_RECORD_SHAPES          | true/false | false            | Control record_shapes option in the profiler                                                                                       | add -e in docker run command |
diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py
index d60a6144..ac7e3176 100644
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@@ -324,7 +324,8 @@ class CausalLMBatch(Batch):
         htorch.core.mark_step()
 
         dst_keys = [prepare_memory(new_bs * chunk_size, prev, inplace) for prev in src_keys[target_batch_idx]]
-        dst_keys = [move_data(dst_keys[layer_num], chunk_size, indices, [src[layer_num] for src in src_keys]) for layer_num in range(num_layers)]
+        dst_keys = [move_data(dst_keys[layer_num], chunk_size, indices, [src[layer_num]
+                              for src in src_keys]) for layer_num in range(num_layers)]
 
         src_values = [torch.stack(src) for src in src_values]
         htorch.core.mark_step()
@@ -334,7 +335,8 @@ class CausalLMBatch(Batch):
         htorch.core.mark_step()
 
         dst_values = [prepare_memory(new_bs * chunk_size, prev, inplace) for prev in src_values[target_batch_idx]]
-        dst_values = [move_data(dst_values[layer_num], chunk_size, indices, [src[layer_num] for src in src_values]) for layer_num in range(num_layers)]
+        dst_values = [move_data(dst_values[layer_num], chunk_size, indices, [src[layer_num]
+                                for src in src_values]) for layer_num in range(num_layers)]
 
         past_key_values = past_key_values_type(zip(dst_keys, dst_values))
 
@@ -626,18 +628,20 @@ class CausalLM(Model):
         )
         prof_ranks = [int(val) for val in os.getenv("PROF_RANKS", "0").split(',')]
         self.profiling_warmup_steps = int(os.getenv("PROF_WARMUPSTEP", "0")) if rank in prof_ranks else 0
-        self.profiling_steps = int(os.getenv("PROF_STEP", "5"))
+        self.profiling_steps = int(os.getenv("PROF_STEP", "0")) if rank in prof_ranks else 0
+        self.profiling_wait_steps = int(os.getenv("PROF_WAITSTEP", "0"))
         record_shapes = os.getenv("PROF_RECORD_SHAPES", "false").lower() == "true"
         output_dir = os.getenv("PROF_PATH", "/tmp/hpu_profile")
-        self.hb_profer = HabanaProfile(
-            warmup=self.profiling_warmup_steps, active=self.profiling_steps, output_dir=output_dir, record_shapes=record_shapes
-        )
-        if self.profiling_warmup_steps > 0:
-            self.hb_profer_started = True
-            self.hb_profer.start()
+        if self.profiling_steps > 0:
+            self.hb_profiler = HabanaProfile(
+                wait=self.profiling_wait_steps,
+                warmup=self.profiling_warmup_steps,
+                active=self.profiling_steps,
+                output_dir=output_dir, record_shapes=record_shapes
+            )
+            self.hb_profiler.start()
         else:
-            self.hb_profer = None
-            self.hb_profer_started = False
+            self.hb_profiler = None
         self.step = 0
 
     def setup_quantization(self, model):
@@ -979,10 +983,11 @@ class CausalLM(Model):
             req.prefix_offset = prefix_offset
             req.read_offset = read_offset
 
         htorch.core.mark_step()
         self.step = self.step + 1
-        if self.hb_profer_started == True and self.step > self.profiling_warmup_steps + self.profiling_steps:
-            self.hb_profer.stop()
-            self.hb_profer_started = False
-
-        return generations, batch if not stopped else None
\ No newline at end of file
+        if self.hb_profiler is not None:
+            if self.step > self.profiling_wait_steps + self.profiling_warmup_steps + self.profiling_steps:
+                self.hb_profiler.stop()
+            else:
+                self.hb_profiler.step()
+        return generations, batch if not stopped else None
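
The PROF_WAITSTEP / PROF_WARMUPSTEP / PROF_STEP triple introduced above follows the standard torch.profiler schedule semantics: the profiler idles for `wait` steps, warms up for `warmup` steps, records for `active` steps, and is stopped once the step counter passes their sum. Below is a minimal standalone sketch of the same gating and stepping logic, using the stock `torch.profiler` on CPU as a stand-in for `HabanaProfile`; the defaults, the dummy workload, and the ten-iteration loop are illustrative assumptions, not part of this patch:

```python
import os
import torch
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

# Illustrative defaults; in TGI these come from `docker run -e ...`.
wait_steps = int(os.getenv("PROF_WAITSTEP", "1"))
warmup_steps = int(os.getenv("PROF_WARMUPSTEP", "2"))
active_steps = int(os.getenv("PROF_STEP", "3"))
output_dir = os.getenv("PROF_PATH", "/tmp/hpu_profile")

# Same gating as the patch: PROF_STEP > 0 is the single on/off switch.
profiler = None
if active_steps > 0:
    profiler = profile(
        activities=[ProfilerActivity.CPU],  # HabanaProfile targets HPU instead
        schedule=schedule(wait=wait_steps, warmup=warmup_steps, active=active_steps),
        on_trace_ready=tensorboard_trace_handler(output_dir),
        record_shapes=os.getenv("PROF_RECORD_SHAPES", "false").lower() == "true",
    )
    profiler.start()

step = 0
for _ in range(10):  # stand-in for the generate_token() decode loop
    torch.matmul(torch.rand(64, 64), torch.rand(64, 64))  # dummy per-step work
    step += 1
    if profiler is not None:
        if step > wait_steps + warmup_steps + active_steps:
            profiler.stop()  # schedule exhausted: finalize the trace
            profiler = None  # drop the handle so stop() runs only once
        else:
            profiler.step()  # advance wait -> warmup -> active
```

Gating construction on `active_steps > 0` mirrors the updated README, where PROF_STEP now doubles as the enable/disable switch; the `profiler = None` guard is only needed here because the raw torch profiler, unlike a wrapper, may not tolerate a second `stop()`.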