Mirror of https://github.com/huggingface/text-generation-inference.git
Set MAX_TOTAL_TOKENS automatically (#91)
Co-authored-by: Karol Damaszke <kdamaszke@habana.ai>
Parent: a5c788cfe4
Commit: 80ae9ead28
@@ -72,15 +72,14 @@ Environment Variables Added:

 | Name | Value(s) | Default | Description | Usage |
 | --------------------------- | :--------- | :--------------- | :------------------------------------------------------------------------------------------------------------------------------- | :--------------------------- |
-| MAX_TOTAL_TOKENS | integer | 0 | Control the padding of input | add -e in docker run, such |
-| ENABLE_HPU_GRAPH | true/false | true | Enable hpu graph or not | add -e in docker run command |
+| ENABLE_HPU_GRAPH | True/False | True | Enable hpu graph or not | add -e in docker run command |
-| LIMIT_HPU_GRAPH | True/False | False | Skip HPU graph usage for prefill to save memory, set to `True` for large sequence/decoding lengths(e.g. 300/212) | add -e in docker run command |
 | PROF_WAITSTEP | integer | 0 | Control profile wait steps | add -e in docker run command |
 | PROF_WARMUPSTEP | integer | 0 | Control profile warmup steps | add -e in docker run command |
 | PROF_STEP | integer | 0 | Enable/disable profile, control profile active steps | add -e in docker run command |
 | PROF_PATH | string | /tmp/hpu_profile | Define profile folder | add -e in docker run command |
 | PROF_RANKS | string | 0 | Comma-separated list of ranks to profile | add -e in docker run command |
-| PROF_RECORD_SHAPES | true/false | false | Control record_shapes option in the profiler | add -e in docker run command |
+| LIMIT_HPU_GRAPH | True/False | False | Skip HPU graph usage for prefill to save memory, set to `True` for large sequence/decoding lengths(e.g. 300/212) | add -e in docker run command |
+| PROF_RECORD_SHAPES | True/False | False | Control record_shapes option in the profiler | add -e in docker run command |
 | BATCH_BUCKET_SIZE | integer | 8 | Batch size for decode operation will be rounded to the nearest multiple of this number. This limits the number of cached graphs | add -e in docker run command |
 | PREFILL_BATCH_BUCKET_SIZE | integer | 4 | Batch size for prefill operation will be rounded to the nearest multiple of this number. This limits the number of cached graphs | add -e in docker run command |
 | PAD_SEQUENCE_TO_MULTIPLE_OF | integer | 128 | For prefill operation, sequences will be padded to a multiple of provided value. | add -e in docker run command |
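The three bucketing variables in this table all describe the same rounding rule: a runtime dimension is padded up to the nearest multiple of the configured value, so the server only ever sees a bounded set of shapes (and therefore caches a bounded number of graphs). A minimal sketch of that rule follows; the helper name `round_up` is illustrative and not taken from the repository.

```python
import os

def round_up(value: int, multiple: int) -> int:
    # Pad `value` up to the nearest multiple of `multiple`.
    return (value + multiple - 1) // multiple * multiple

# Same lookups as the server code, with the defaults from the table above.
batch_bucket = int(os.environ.get('BATCH_BUCKET_SIZE', 8))
prefill_bucket = int(os.environ.get('PREFILL_BATCH_BUCKET_SIZE', 4))
seq_multiple = int(os.environ.get('PAD_SEQUENCE_TO_MULTIPLE_OF', 128))

# With the defaults: a decode batch of 5 runs as a batch of 8, a prefill
# batch of 5 also as a batch of 8, and a 200-token prompt is padded to 256.
print(round_up(5, batch_bucket))     # 8
print(round_up(5, prefill_bucket))   # 8
print(round_up(200, seq_multiple))   # 256
```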
@@ -376,6 +376,7 @@ fn shard_manager(
     revision: Option<String>,
     quantize: Option<Quantization>,
     dtype: Option<Dtype>,
+    max_total_tokens: usize,
     trust_remote_code: bool,
     uds_path: String,
     rank: usize,
@@ -458,6 +459,9 @@ fn shard_manager(
     // Copy current process env
     let mut envs: Vec<(OsString, OsString)> = env::vars_os().collect();

+    // Max total tokens
+    envs.push(("MAX_TOTAL_TOKENS".into(), max_total_tokens.to_string().into()));
+
     // Torch Distributed Env vars
     if world_size == 1 {
         envs.push(("RANK".into(), rank.to_string().into()));
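The hunk above is the core of the change: the launcher copies its own environment and appends MAX_TOTAL_TOKENS before spawning each shard, which is how the Python server now receives the value without the user passing `-e MAX_TOTAL_TOKENS` manually. A rough Python analogue of this spawn-with-env pattern, where the command line and function name are illustrative rather than what the launcher actually builds:

```python
import os
import subprocess

def spawn_shard(max_total_tokens: int, rank: int) -> subprocess.Popen:
    # Copy the current process env, then append the launcher-computed
    # values, mirroring the envs.push(...) calls in shard_manager.
    envs = dict(os.environ)
    envs["MAX_TOTAL_TOKENS"] = str(max_total_tokens)
    envs["RANK"] = str(rank)
    # Illustrative command; the real launcher assembles a longer one.
    return subprocess.Popen(["text-generation-server", "serve"], env=envs)
```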
@@ -884,6 +888,7 @@ fn spawn_shards(
         let otlp_endpoint = args.otlp_endpoint.clone();
         let quantize = args.quantize;
         let dtype = args.dtype;
+        let max_total_tokens = args.max_total_tokens;
         let trust_remote_code = args.trust_remote_code;
         let master_port = args.master_port;
         let disable_custom_kernels = args.disable_custom_kernels;
@@ -898,6 +903,7 @@ fn spawn_shards(
             revision,
             quantize,
             dtype,
+            max_total_tokens,
             trust_remote_code,
             uds_path,
             rank,
@@ -42,7 +42,7 @@ from functools import wraps

 tracer = trace.get_tracer(__name__)

-MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS", "0"))
+MAX_TOTAL_TOKENS = int(os.environ.get('MAX_TOTAL_TOKENS', 2048))
 BATCH_BUCKET_SIZE = int(os.environ.get('BATCH_BUCKET_SIZE', 8))
 PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get('PAD_SEQUENCE_TO_MULTIPLE_OF', 128))
 PREFILL_BATCH_BUCKET_SIZE = int(os.environ.get('PREFILL_BATCH_BUCKET_SIZE', 4))
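With the launcher always exporting MAX_TOTAL_TOKENS, the server-side default of 2048 only takes effect when the Python server is started outside the launcher. A standalone sketch of that override behavior (not repository code):

```python
import os

def read_max_total_tokens() -> int:
    # Same lookup as the new server line: a launcher-exported value wins,
    # otherwise fall back to 2048.
    return int(os.environ.get('MAX_TOTAL_TOKENS', 2048))

os.environ.pop('MAX_TOTAL_TOKENS', None)
assert read_max_total_tokens() == 2048    # standalone: default applies

os.environ['MAX_TOTAL_TOKENS'] = '4096'   # set automatically by the launcher
assert read_max_total_tokens() == 4096
```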