Set MAX_TOTAL_TOKENS automatically (#91)

Co-authored-by: Karol Damaszke <kdamaszke@habana.ai>
Karol Damaszke authored on 2024-03-01 11:25:15 +01:00; committed by GitHub
parent a5c788cfe4
commit 80ae9ead28
3 changed files with 10 additions and 5 deletions

File 1 of 3 (Markdown)

@@ -72,15 +72,14 @@ Environment Variables Added:

 | Name                        | Value(s)   | Default          | Description | Usage |
 | --------------------------- | :--------- | :--------------- | :---------- | :---- |
-| MAX_TOTAL_TOKENS            | integer    | 0                | Control the padding of input | add -e in docker run, such |
-| ENABLE_HPU_GRAPH            | true/false | true             | Enable hpu graph or not | add -e in docker run command |
+| ENABLE_HPU_GRAPH            | True/False | True             | Enable hpu graph or not | add -e in docker run command |
+| LIMIT_HPU_GRAPH             | True/False | False            | Skip HPU graph usage for prefill to save memory, set to `True` for large sequence/decoding lengths(e.g. 300/212) | add -e in docker run command |
 | PROF_WAITSTEP               | integer    | 0                | Control profile wait steps | add -e in docker run command |
 | PROF_WARMUPSTEP             | integer    | 0                | Control profile warmup steps | add -e in docker run command |
 | PROF_STEP                   | integer    | 0                | Enable/disable profile, control profile active steps | add -e in docker run command |
 | PROF_PATH                   | string     | /tmp/hpu_profile | Define profile folder | add -e in docker run command |
 | PROF_RANKS                  | string     | 0                | Comma-separated list of ranks to profile | add -e in docker run command |
-| PROF_RECORD_SHAPES          | true/false | false            | Control record_shapes option in the profiler | add -e in docker run command |
-| LIMIT_HPU_GRAPH             | True/False | False            | Skip HPU graph usage for prefill to save memory, set to `True` for large sequence/decoding lengths(e.g. 300/212) | add -e in docker run command |
+| PROF_RECORD_SHAPES          | True/False | False            | Control record_shapes option in the profiler | add -e in docker run command |
 | BATCH_BUCKET_SIZE           | integer    | 8                | Batch size for decode operation will be rounded to the nearest multiple of this number. This limits the number of cached graphs | add -e in docker run command |
 | PREFILL_BATCH_BUCKET_SIZE   | integer    | 4                | Batch size for prefill operation will be rounded to the nearest multiple of this number. This limits the number of cached graphs | add -e in docker run command |
 | PAD_SEQUENCE_TO_MULTIPLE_OF | integer    | 128              | For prefill operation, sequences will be padded to a multiple of provided value. | add -e in docker run command |
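
For a concrete sense of the bucketing that the table above describes, here is a minimal Python sketch of the round-up arithmetic implied by BATCH_BUCKET_SIZE, PREFILL_BATCH_BUCKET_SIZE, and PAD_SEQUENCE_TO_MULTIPLE_OF. The round_up helper and the sample values are illustrative only; the server's actual rounding logic may differ in detail.

import math

def round_up(value: int, multiple: int) -> int:
    # Round `value` up to the nearest multiple of `multiple`.
    return math.ceil(value / multiple) * multiple

# With the defaults from the table above:
print(round_up(3, 4))      # 4   -> a prefill batch of 3 pads to PREFILL_BATCH_BUCKET_SIZE = 4
print(round_up(9, 8))      # 16  -> a decode batch of 9 lands in the next BATCH_BUCKET_SIZE bucket
print(round_up(200, 128))  # 256 -> a 200-token prompt pads to the next multiple of 128

Fewer distinct (batch size, sequence length) shapes means fewer cached graphs, which is the trade-off these variables tune.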

File 2 of 3 (Rust)

@@ -376,6 +376,7 @@ fn shard_manager(
     revision: Option<String>,
     quantize: Option<Quantization>,
     dtype: Option<Dtype>,
+    max_total_tokens: usize,
     trust_remote_code: bool,
     uds_path: String,
     rank: usize,
@@ -458,6 +459,9 @@
     // Copy current process env
     let mut envs: Vec<(OsString, OsString)> = env::vars_os().collect();
 
+    // Max total tokens
+    envs.push(("MAX_TOTAL_TOKENS".into(), max_total_tokens.to_string().into()));
+
     // Torch Distributed Env vars
     if world_size == 1 {
         envs.push(("RANK".into(), rank.to_string().into()));
@@ -884,6 +888,7 @@ fn spawn_shards(
     let otlp_endpoint = args.otlp_endpoint.clone();
     let quantize = args.quantize;
     let dtype = args.dtype;
+    let max_total_tokens = args.max_total_tokens;
     let trust_remote_code = args.trust_remote_code;
     let master_port = args.master_port;
     let disable_custom_kernels = args.disable_custom_kernels;
@@ -898,6 +903,7 @@
             revision,
             quantize,
             dtype,
+            max_total_tokens,
             trust_remote_code,
             uds_path,
             rank,

File 3 of 3 (Python)

@@ -42,7 +42,7 @@ from functools import wraps
 
 tracer = trace.get_tracer(__name__)
 
-MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS", "0"))
+MAX_TOTAL_TOKENS = int(os.environ.get('MAX_TOTAL_TOKENS', 2048))
 BATCH_BUCKET_SIZE = int(os.environ.get('BATCH_BUCKET_SIZE', 8))
 PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get('PAD_SEQUENCE_TO_MULTIPLE_OF', 128))
 PREFILL_BATCH_BUCKET_SIZE = int(os.environ.get('PREFILL_BATCH_BUCKET_SIZE', 4))
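
With the launcher now exporting the variable, the server-side default of 2048 is only a fallback for processes started outside the launcher. A quick sketch showing both paths (the one-liner mirrors the new line above; the 4096 value is arbitrary):

import os
import subprocess
import sys

code = "import os; print(int(os.environ.get('MAX_TOTAL_TOKENS', 2048)))"

# Variable absent: the hard-coded fallback applies.
env = {k: v for k, v in os.environ.items() if k != "MAX_TOTAL_TOKENS"}
print(subprocess.run([sys.executable, "-c", code], env=env,
                     capture_output=True, text=True).stdout)  # 2048

# Variable present (as the launcher now ensures): the injected value wins.
env["MAX_TOTAL_TOKENS"] = "4096"
print(subprocess.run([sys.executable, "-c", code], env=env,
                     capture_output=True, text=True).stdout)  # 4096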