mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-24 08:22:07 +00:00
Smaller default for max_input_length.
This commit is contained in:
parent
1e5150f475
commit
e5955851b9
@ -221,7 +221,7 @@ struct Args {
|
|||||||
/// for users. The larger this value, the longer prompt users can send which
|
/// for users. The larger this value, the longer prompt users can send which
|
||||||
/// can impact the overall memory required to handle the load.
|
/// can impact the overall memory required to handle the load.
|
||||||
/// Please note that some models have a finite range of sequence lengths they can handle.
|
/// Please note that some models have a finite range of sequence lengths they can handle.
|
||||||
/// Default to min(max_position_embeddings - 1, 13383)
|
/// Default to min(max_position_embeddings - 1, 4095)
|
||||||
#[clap(long, env)]
|
#[clap(long, env)]
|
||||||
max_input_tokens: Option<usize>,
|
max_input_tokens: Option<usize>,
|
||||||
|
|
||||||
@ -237,7 +237,7 @@ struct Args {
|
|||||||
/// `1511` max_new_tokens.
|
/// `1511` max_new_tokens.
|
||||||
/// The larger this value, the more of your RAM each request will take up
|
/// The larger this value, the more of your RAM each request will take up
|
||||||
/// and the less effective batching can be.
|
/// and the less effective batching can be.
|
||||||
/// Default to min(max_position_embeddings, 16384)
|
/// Default to min(max_position_embeddings, 4096)
|
||||||
#[clap(long, env)]
|
#[clap(long, env)]
|
||||||
max_total_tokens: Option<usize>,
|
max_total_tokens: Option<usize>,
|
||||||
|
|
||||||
@ -257,7 +257,7 @@ struct Args {
|
|||||||
/// Limits the number of tokens for the prefill operation.
|
/// Limits the number of tokens for the prefill operation.
|
||||||
/// Since this operation takes the most memory and is compute bound, it is interesting
|
/// Since this operation takes the most memory and is compute bound, it is interesting
|
||||||
/// to limit the number of requests that can be sent.
|
/// to limit the number of requests that can be sent.
|
||||||
/// Default to min(max_input_length + 50, 16384) to give a bit of room.
|
/// Default to `max_input_length + 50` to give a bit of room.
|
||||||
#[clap(long, env)]
|
#[clap(long, env)]
|
||||||
max_batch_prefill_tokens: Option<u32>,
|
max_batch_prefill_tokens: Option<u32>,
|
||||||
|
|
||||||
@ -1294,11 +1294,7 @@ fn main() -> Result<(), LauncherError> {
|
|||||||
let config: Config = serde_json::from_str(&content)?;
|
let config: Config = serde_json::from_str(&content)?;
|
||||||
|
|
||||||
// Quantization usually means you're even more RAM constrained.
|
// Quantization usually means you're even more RAM constrained.
|
||||||
let max_default = if args.quantize.is_some() {
|
let max_default = 4096;
|
||||||
4096
|
|
||||||
} else {
|
|
||||||
2usize.pow(14)
|
|
||||||
};
|
|
||||||
|
|
||||||
let max_position_embeddings = match (config.max_position_embeddings, config.max_seq_len) {
|
let max_position_embeddings = match (config.max_position_embeddings, config.max_seq_len) {
|
||||||
(Some(max_position_embeddings), _) | (None, Some(max_position_embeddings)) => {
|
(Some(max_position_embeddings), _) | (None, Some(max_position_embeddings)) => {
|
||||||
|
Loading…
Reference in New Issue
Block a user