Co-authored-by: Michal Szutenberg <37601244+szutenberg@users.noreply.github.com>
commit 9ad6086250
parent f7ef414e38
@@ -74,8 +74,10 @@ Environment Variables Added:
 | MAX_TOTAL_TOKENS | integer | 0 | Control the padding of input | add -e in docker run, such |
 | ENABLE_HPU_GRAPH | true/false | true | Enable hpu graph or not | add -e in docker run command |
 | PROF_WARMUPSTEP | integer | 0 | Enable/disable profile, control profile warmup step, 0 means disable profile | add -e in docker run command |
-| PROF_STEP | interger | 5 | Control profile step | add -e in docker run command |
-| PROF_PATH | string | /root/text-generation-inference | Define profile folder | add -e in docker run command |
+| PROF_STEP | integer | 5 | Control profile step | add -e in docker run command |
+| PROF_PATH | string | /tmp/hpu_profile | Define profile folder | add -e in docker run command |
+| PROF_RANKS | string | 0 | Comma-separated list of ranks to profile | add -e in docker run command |
+| PROF_RECORD_SHAPES | true/false | false | Control record_shapes option in the profiler | add -e in docker run command |
 | LIMIT_HPU_GRAPH | True/False | False | Skip HPU graph usage for prefill to save memory, set to `True` for large sequence/decoding lengths(e.g. 300/212) | add -e in docker run command |
 | BATCH_BUCKET_SIZE | integer | 8 | Batch size for decode operation will be rounded to the nearest multiple of this number. This limits the number of cached graphs | add -e in docker run command |
 | PREFILL_BATCH_BUCKET_SIZE | integer | 4 | Batch size for prefill operation will be rounded to the nearest multiple of this number. This limits the number of cached graphs | add -e in docker run command |
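Note: the "add -e in docker run" column means each knob is a plain environment variable. A minimal sketch of how the new profiling variables reach the server, assuming the names in the table above; the docker command and the rank value are hypothetical, and the actual parsing is the causal_lm.py change in the next hunk:

    import os

    # Hypothetical launch:
    #   docker run ... -e PROF_RANKS=0,1 -e PROF_WARMUPSTEP=3 -e PROF_STEP=5 \
    #       -e PROF_RECORD_SHAPES=true -e PROF_PATH=/tmp/hpu_profile ...
    prof_ranks = [int(val) for val in os.getenv("PROF_RANKS", "0").split(',')]
    warmup = int(os.getenv("PROF_WARMUPSTEP", "0"))    # 0 disables profiling
    active = int(os.getenv("PROF_STEP", "5"))          # steps captured per trace
    record_shapes = os.getenv("PROF_RECORD_SHAPES", "false").lower() == "true"
    output_dir = os.getenv("PROF_PATH", "/tmp/hpu_profile")

    rank = 0  # hypothetical: each server shard knows its own rank
    # Only ranks listed in PROF_RANKS keep a non-zero warmup, so only they profile.
    warmup = warmup if rank in prof_ranks else 0
    print(prof_ranks, warmup, active, record_shapes, output_dir)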
@@ -596,11 +596,13 @@ class CausalLM(Model):
             rank=rank,
             kwargs=kwargs,
         )
-        self.profiling_warmup_steps = int(os.getenv("PROF_WARMUPSTEP", "0"))
+        prof_ranks = [int(val) for val in os.getenv("PROF_RANKS", "0").split(',')]
+        self.profiling_warmup_steps = int(os.getenv("PROF_WARMUPSTEP", "0")) if rank in prof_ranks else 0
         self.profiling_steps = int(os.getenv("PROF_STEP", "5"))
+        record_shapes = os.getenv("PROF_RECORD_SHAPES", "false").lower() == "true"
         output_dir = os.getenv("PROF_PATH", "/tmp/hpu_profile")
         self.hb_profer = HabanaProfile(
-            warmup=self.profiling_warmup_steps, active=self.profiling_steps, output_dir=output_dir
+            warmup=self.profiling_warmup_steps, active=self.profiling_steps, output_dir=output_dir, record_shapes=record_shapes
         )
         if self.profiling_warmup_steps > 0:
             self.hb_profer_started = True
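HabanaProfile itself is not part of this hunk. As a rough illustration of the warmup/active/record_shapes semantics the call above relies on, here is a stand-in built on torch.profiler; TinyProfile is a hypothetical name and the torch.profiler-based body is an assumption for illustration, not the repo's implementation:

    import torch

    class TinyProfile:
        """Hypothetical stand-in for HabanaProfile: runs a warmup/active
        schedule and writes TensorBoard traces into output_dir."""

        def __init__(self, warmup=0, active=0, output_dir="/tmp/hpu_profile", record_shapes=False):
            self._prof = None
            # warmup == 0 means profiling is disabled (matches the table's
            # PROF_WARMUPSTEP semantics).
            if warmup > 0 and active > 0:
                self._prof = torch.profiler.profile(
                    schedule=torch.profiler.schedule(wait=0, warmup=warmup, active=active),
                    on_trace_ready=torch.profiler.tensorboard_trace_handler(output_dir),
                    record_shapes=record_shapes,
                )

        def start(self):
            if self._prof is not None:
                self._prof.start()

        def step(self):  # called once per prefill/decode step
            if self._prof is not None:
                self._prof.step()

        def stop(self):
            if self._prof is not None:
                self._prof.stop()

Because profiling_warmup_steps collapses to 0 for ranks not listed in PROF_RANKS, hb_profer_started stays False on those shards and they never drive the profiler.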