Mirror of https://github.com/huggingface/text-generation-inference.git
Set MAX_TOTAL_TOKENS automatically (#91)
Co-authored-by: Karol Damaszke <kdamaszke@habana.ai>
Parent: a5c788cfe4
Commit: 80ae9ead28
@@ -72,15 +72,14 @@ Environment Variables Added:

 | Name | Value(s) | Default | Description | Usage |
 | --------------------------- | :--------- | :--------------- | :------------------------------------------------------------------------------------------------------------------------------- | :--------------------------- |
-| MAX_TOTAL_TOKENS | integer | 0 | Control the padding of input | add -e in docker run, such |
-| ENABLE_HPU_GRAPH | true/false | true | Enable hpu graph or not | add -e in docker run command |
+| ENABLE_HPU_GRAPH | True/False | True | Enable hpu graph or not | add -e in docker run command |
-| LIMIT_HPU_GRAPH | True/False | False | Skip HPU graph usage for prefill to save memory, set to `True` for large sequence/decoding lengths(e.g. 300/212) | add -e in docker run command |
 | PROF_WAITSTEP | integer | 0 | Control profile wait steps | add -e in docker run command |
 | PROF_WARMUPSTEP | integer | 0 | Control profile warmup steps | add -e in docker run command |
 | PROF_STEP | integer | 0 | Enable/disable profile, control profile active steps | add -e in docker run command |
 | PROF_PATH | string | /tmp/hpu_profile | Define profile folder | add -e in docker run command |
 | PROF_RANKS | string | 0 | Comma-separated list of ranks to profile | add -e in docker run command |
-| PROF_RECORD_SHAPES | true/false | false | Control record_shapes option in the profiler | add -e in docker run command |
+| LIMIT_HPU_GRAPH | True/False | False | Skip HPU graph usage for prefill to save memory, set to `True` for large sequence/decoding lengths(e.g. 300/212) | add -e in docker run command |
+| PROF_RECORD_SHAPES | True/False | False | Control record_shapes option in the profiler | add -e in docker run command |
 | BATCH_BUCKET_SIZE | integer | 8 | Batch size for decode operation will be rounded to the nearest multiple of this number. This limits the number of cached graphs | add -e in docker run command |
 | PREFILL_BATCH_BUCKET_SIZE | integer | 4 | Batch size for prefill operation will be rounded to the nearest multiple of this number. This limits the number of cached graphs | add -e in docker run command |
 | PAD_SEQUENCE_TO_MULTIPLE_OF | integer | 128 | For prefill operation, sequences will be padded to a multiple of provided value. | add -e in docker run command |
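The three bucketing variables in this table all describe the same rounding rule: a runtime dimension is padded up to the nearest multiple of the configured value, so the server only ever sees a bounded set of shapes (and therefore caches a bounded number of graphs). A minimal sketch of that rule follows; the helper name `round_up` is illustrative and not taken from the repository.

```python
import os

def round_up(value: int, multiple: int) -> int:
    # Pad `value` up to the nearest multiple of `multiple`.
    return (value + multiple - 1) // multiple * multiple

# Same lookups as the server code, with the defaults from the table above.
batch_bucket = int(os.environ.get('BATCH_BUCKET_SIZE', 8))
prefill_bucket = int(os.environ.get('PREFILL_BATCH_BUCKET_SIZE', 4))
seq_multiple = int(os.environ.get('PAD_SEQUENCE_TO_MULTIPLE_OF', 128))

# With the defaults: a decode batch of 5 runs as a batch of 8, a prefill
# batch of 5 also as a batch of 8, and a 200-token prompt is padded to 256.
print(round_up(5, batch_bucket))     # 8
print(round_up(5, prefill_bucket))   # 8
print(round_up(200, seq_multiple))   # 256
```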
@@ -376,6 +376,7 @@ fn shard_manager(
     revision: Option<String>,
     quantize: Option<Quantization>,
     dtype: Option<Dtype>,
+    max_total_tokens: usize,
     trust_remote_code: bool,
     uds_path: String,
     rank: usize,
@@ -458,6 +459,9 @@ fn shard_manager(
     // Copy current process env
     let mut envs: Vec<(OsString, OsString)> = env::vars_os().collect();

+    // Max total tokens
+    envs.push(("MAX_TOTAL_TOKENS".into(), max_total_tokens.to_string().into()));
+
     // Torch Distributed Env vars
     if world_size == 1 {
         envs.push(("RANK".into(), rank.to_string().into()));
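The hunk above is the core of the change: the launcher copies its own environment and appends MAX_TOTAL_TOKENS before spawning each shard, which is how the Python server now receives the value without the user passing `-e MAX_TOTAL_TOKENS` manually. A rough Python analogue of this spawn-with-env pattern, where the command line and function name are illustrative rather than what the launcher actually builds:

```python
import os
import subprocess

def spawn_shard(max_total_tokens: int, rank: int) -> subprocess.Popen:
    # Copy the current process env, then append the launcher-computed
    # values, mirroring the envs.push(...) calls in shard_manager.
    envs = dict(os.environ)
    envs["MAX_TOTAL_TOKENS"] = str(max_total_tokens)
    envs["RANK"] = str(rank)
    # Illustrative command; the real launcher assembles a longer one.
    return subprocess.Popen(["text-generation-server", "serve"], env=envs)
```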
@@ -884,6 +888,7 @@ fn spawn_shards(
         let otlp_endpoint = args.otlp_endpoint.clone();
         let quantize = args.quantize;
         let dtype = args.dtype;
+        let max_total_tokens = args.max_total_tokens;
         let trust_remote_code = args.trust_remote_code;
         let master_port = args.master_port;
         let disable_custom_kernels = args.disable_custom_kernels;
@@ -898,6 +903,7 @@ fn spawn_shards(
             revision,
             quantize,
             dtype,
+            max_total_tokens,
             trust_remote_code,
             uds_path,
             rank,
@@ -42,7 +42,7 @@ from functools import wraps

 tracer = trace.get_tracer(__name__)

-MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS", "0"))
+MAX_TOTAL_TOKENS = int(os.environ.get('MAX_TOTAL_TOKENS', 2048))
 BATCH_BUCKET_SIZE = int(os.environ.get('BATCH_BUCKET_SIZE', 8))
 PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get('PAD_SEQUENCE_TO_MULTIPLE_OF', 128))
 PREFILL_BATCH_BUCKET_SIZE = int(os.environ.get('PREFILL_BATCH_BUCKET_SIZE', 4))
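With the launcher always exporting MAX_TOTAL_TOKENS, the server-side default of 2048 only takes effect when the Python server is started outside the launcher. A standalone sketch of that override behavior (not repository code):

```python
import os

def read_max_total_tokens() -> int:
    # Same lookup as the new server line: a launcher-exported value wins,
    # otherwise fall back to 2048.
    return int(os.environ.get('MAX_TOTAL_TOKENS', 2048))

os.environ.pop('MAX_TOTAL_TOKENS', None)
assert read_max_total_tokens() == 2048    # standalone: default applies

os.environ['MAX_TOTAL_TOKENS'] = '4096'   # set automatically by the launcher
assert read_max_total_tokens() == 4096
```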