Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-22 15:32:08 +00:00)

Commit 80ae9ead28: Set MAX_TOTAL_TOKENS automatically (#91)
Parent: a5c788cfe4
Co-authored-by: Karol Damaszke <kdamaszke@habana.ai>
@@ -72,15 +72,14 @@ Environment Variables Added:

 | Name                        | Value(s)   | Default          | Description                                                                                                                       | Usage                        |
 | --------------------------- | :--------- | :--------------- | :-------------------------------------------------------------------------------------------------------------------------------- | :--------------------------- |
-| MAX_TOTAL_TOKENS            | integer    | 0                | Control the padding of input                                                                                                      | add -e in docker run, such   |
-| ENABLE_HPU_GRAPH            | True/False | True             | Enable hpu graph or not                                                                                                           | add -e in docker run command |
+| ENABLE_HPU_GRAPH            | true/false | true             | Enable hpu graph or not                                                                                                           | add -e in docker run command |
-| LIMIT_HPU_GRAPH             | True/False | False            | Skip HPU graph usage for prefill to save memory, set to `True` for large sequence/decoding lengths (e.g. 300/212)                 | add -e in docker run command |
 | PROF_WAITSTEP               | integer    | 0                | Control profile wait steps                                                                                                        | add -e in docker run command |
 | PROF_WARMUPSTEP             | integer    | 0                | Control profile warmup steps                                                                                                      | add -e in docker run command |
 | PROF_STEP                   | integer    | 0                | Enable/disable profile, control profile active steps                                                                              | add -e in docker run command |
 | PROF_PATH                   | string     | /tmp/hpu_profile | Define profile folder                                                                                                             | add -e in docker run command |
 | PROF_RANKS                  | string     | 0                | Comma-separated list of ranks to profile                                                                                          | add -e in docker run command |
-| PROF_RECORD_SHAPES          | true/false | false            | Control record_shapes option in the profiler                                                                                      | add -e in docker run command |
+| PROF_RECORD_SHAPES          | True/False | False            | Control record_shapes option in the profiler                                                                                      | add -e in docker run command |
+| LIMIT_HPU_GRAPH             | True/False | False            | Skip HPU graph usage for prefill to save memory, set to `True` for large sequence/decoding lengths (e.g. 300/212)                 | add -e in docker run command |
 | BATCH_BUCKET_SIZE           | integer    | 8                | Batch size for decode operation will be rounded to the nearest multiple of this number. This limits the number of cached graphs  | add -e in docker run command |
 | PREFILL_BATCH_BUCKET_SIZE   | integer    | 4                | Batch size for prefill operation will be rounded to the nearest multiple of this number. This limits the number of cached graphs | add -e in docker run command |
 | PAD_SEQUENCE_TO_MULTIPLE_OF | integer    | 128              | For prefill operation, sequences will be padded to a multiple of the provided value.                                              | add -e in docker run command |
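These variables are supplied at container start. A minimal illustrative invocation (the image tag and values here are placeholders, not taken from this commit):

docker run -e ENABLE_HPU_GRAPH=true -e PAD_SEQUENCE_TO_MULTIPLE_OF=128 \
    ghcr.io/huggingface/tgi-gaudi:latest --model-id <model>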
@@ -376,6 +376,7 @@ fn shard_manager(
     revision: Option<String>,
     quantize: Option<Quantization>,
     dtype: Option<Dtype>,
+    max_total_tokens: usize,
     trust_remote_code: bool,
     uds_path: String,
     rank: usize,
@@ -458,6 +459,9 @@ fn shard_manager(
     // Copy current process env
     let mut envs: Vec<(OsString, OsString)> = env::vars_os().collect();

+    // Max total tokens
+    envs.push(("MAX_TOTAL_TOKENS".into(), max_total_tokens.to_string().into()));
+
     // Torch Distributed Env vars
     if world_size == 1 {
         envs.push(("RANK".into(), rank.to_string().into()));
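The effect of the hunk above is that every shard process now inherits MAX_TOTAL_TOKENS from the launcher, so users no longer have to export it themselves. A minimal sketch of the mechanism, assuming a simplified stand-in for shard_manager (the binary name is a placeholder):

use std::env;
use std::ffi::OsString;
use std::process::{Child, Command};

// Sketch: copy the launcher's environment, append MAX_TOTAL_TOKENS,
// and pass the whole set to the spawned shard process.
fn spawn_shard_with_env(max_total_tokens: usize) -> std::io::Result<Child> {
    let mut envs: Vec<(OsString, OsString)> = env::vars_os().collect();
    envs.push((
        "MAX_TOTAL_TOKENS".into(),
        max_total_tokens.to_string().into(),
    ));
    Command::new("text-generation-server") // placeholder shard command
        .envs(envs) // the child sees MAX_TOTAL_TOKENS without any user action
        .spawn()
}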
@@ -884,6 +888,7 @@ fn spawn_shards(
        let otlp_endpoint = args.otlp_endpoint.clone();
        let quantize = args.quantize;
        let dtype = args.dtype;
+        let max_total_tokens = args.max_total_tokens;
        let trust_remote_code = args.trust_remote_code;
        let master_port = args.master_port;
        let disable_custom_kernels = args.disable_custom_kernels;
@@ -898,6 +903,7 @@ fn spawn_shards(
                revision,
                quantize,
                dtype,
+                max_total_tokens,
                trust_remote_code,
                uds_path,
                rank,
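The forwarded value originates from the launcher's --max-total-tokens argument. A hedged sketch of how such an argument is typically declared with clap in this codebase's style (the exact attributes and the 2048 default are assumptions, not part of this diff):

use clap::Parser;

// Assumed shape of the launcher arguments; only the relevant field is shown.
#[derive(Parser, Debug)]
struct Args {
    /// Upper bound on input + generated tokens per request; this commit
    /// forwards it to shards as the MAX_TOTAL_TOKENS environment variable.
    #[clap(default_value = "2048", long, env)]
    max_total_tokens: usize,
}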
@@ -42,7 +42,7 @@ from functools import wraps

 tracer = trace.get_tracer(__name__)

-MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS", "0"))
+MAX_TOTAL_TOKENS = int(os.environ.get('MAX_TOTAL_TOKENS', 2048))
 BATCH_BUCKET_SIZE = int(os.environ.get('BATCH_BUCKET_SIZE', 8))
 PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get('PAD_SEQUENCE_TO_MULTIPLE_OF', 128))
 PREFILL_BATCH_BUCKET_SIZE = int(os.environ.get('PREFILL_BATCH_BUCKET_SIZE', 4))
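Note that the Python-side fallback changes from 0 to 2048, presumably to mirror the launcher's default for --max-total-tokens; with the launcher now exporting MAX_TOTAL_TOKENS unconditionally, this fallback should only matter when the server is run outside the launcher.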