Mirror of https://github.com/huggingface/text-generation-inference.git
Fixing default for BNB + cuda graphs (they don't work together).
parent 289b0721c4
commit c4ebcea79c
@@ -168,7 +168,7 @@ Options:
 ## MAX_BATCH_PREFILL_TOKENS
 ```shell
 --max-batch-prefill-tokens <MAX_BATCH_PREFILL_TOKENS>
-          Limits the number of tokens for the prefill operation. Since this operation take the most memory and is compute bound, it is interesting to limit the number of requests that can be sent
+          Limits the number of tokens for the prefill operation. Since this operation take the most memory and is compute bound, it is interesting to limit the number of requests that can be sent. Default to min(max_input_length + 50, 16384) to give a bit of room
 
           [env: MAX_BATCH_PREFILL_TOKENS=]
 
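The description now documents a computed default instead of leaving it unspecified. A minimal sketch of that computation, assuming `max_input_length` has already been resolved (the helper name is ours, not the launcher's):

```rust
/// Illustrative helper (not in the diff) mirroring the documented default:
/// a little headroom over the longest allowed input, capped at 16384 so
/// very large context windows don't inflate prefill memory.
fn default_max_batch_prefill_tokens(max_input_length: u32) -> u32 {
    (max_input_length + 50).min(16384)
}
```

So a model served with `max_input_length = 1024` would default to 1074 prefill tokens, while a 32k-context model would be capped at 16384.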
@@ -215,10 +215,9 @@ Options:
 ## CUDA_GRAPHS
 ```shell
 --cuda-graphs <CUDA_GRAPHS>
-          Specify the batch sizes to compute cuda graphs for. Use "0" to disable
+          Specify the batch sizes to compute cuda graphs for. Use "0" to disable. Default = "1,2,4,8,16,32"
 
           [env: CUDA_GRAPHS=]
-          [default: 1,2,4,8,16,32,64,96,128]
 
 ```
 ## HOSTNAME
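For illustration, here is how clap handles the comma-delimited list and the `CUDA_GRAPHS` env fallback, in a standalone sketch mirroring the attributes this diff adds below (`Demo` is a stand-in, not the launcher's `Args`):

```rust
use clap::Parser; // clap with the "derive" and "env" features enabled

#[derive(Parser, Debug)]
struct Demo {
    /// Batch sizes to capture cuda graphs for; "0" disables them.
    #[clap(long, env, value_delimiter = ',')]
    cuda_graphs: Option<Vec<usize>>,
}

fn main() {
    // --cuda-graphs 1,2,4 parses to Some(vec![1, 2, 4]).
    let d = Demo::parse_from(["demo", "--cuda-graphs", "1,2,4"]);
    assert_eq!(d.cuda_graphs, Some(vec![1, 2, 4]));

    // Omitting the flag leaves None (assuming CUDA_GRAPHS is unset in the
    // environment), so main() can compute a default after parsing.
    let d = Demo::parse_from(["demo"]);
    assert_eq!(d.cuda_graphs, None);
}
```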
@@ -256,6 +256,7 @@ struct Args {
     /// Limits the number of tokens for the prefill operation.
     /// Since this operation take the most memory and is compute bound, it is interesting
     /// to limit the number of requests that can be sent.
+    /// Default to min(max_input_length + 50, 16384) to give a bit of room.
     #[clap(long, env)]
     max_batch_prefill_tokens: Option<u32>,
 
@@ -306,13 +307,9 @@ struct Args {
 
     /// Specify the batch sizes to compute cuda graphs for.
     /// Use "0" to disable.
-    #[clap(
-        long,
-        env,
-        value_delimiter = ',',
-        default_value = "1,2,4,8,16,32,64,96,128"
-    )]
-    cuda_graphs: Vec<usize>,
+    /// Default = "1,2,4,8,16,32"
+    #[clap(long, env, value_delimiter = ',')]
+    cuda_graphs: Option<Vec<usize>>,
 
     /// The IP address to listen on
     #[clap(default_value = "0.0.0.0", long, env)]
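The move from a static `default_value` to `Option<Vec<usize>>` is what makes the rest of the diff work: clap can only express a fixed default, but the desired default now depends on another flag (`--quantize`). Keeping the field optional preserves the distinction between "flag omitted" (`None`) and "flag set", so `main()` can resolve the default once the quantization scheme is known, in the match added later in this diff.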
@@ -956,6 +953,7 @@ fn download_convert_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), LauncherError> {
 fn spawn_shards(
     num_shard: usize,
     args: &Args,
+    cuda_graphs: Vec<usize>,
     shutdown: Arc<AtomicBool>,
     shutdown_receiver: &mpsc::Receiver<()>,
     shutdown_sender: mpsc::Sender<()>,
@@ -983,11 +981,7 @@ fn spawn_shards(
     let disable_custom_kernels = args.disable_custom_kernels;
     let watermark_gamma = args.watermark_gamma;
     let watermark_delta = args.watermark_delta;
-    let cuda_graphs: Vec<usize> = args
-        .cuda_graphs
-        .iter()
-        .filter_map(|&c| if c > 0 { Some(c) } else { None })
-        .collect();
+    let cuda_graphs_clone = cuda_graphs.clone();
     let cuda_memory_fraction = args.cuda_memory_fraction;
     let rope_scaling = args.rope_scaling;
     let rope_factor = args.rope_factor;
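The deleted `filter_map` is where the `"0"` sentinel used to be stripped before shards were spawned. Restated on its own (values illustrative):

```rust
// Old behavior of the removed lines: drop zeros, so `--cuda-graphs 0`
// yielded an empty list and cuda graph capture was skipped.
let cuda_graphs: Vec<usize> = vec![0, 2, 4]
    .into_iter()
    .filter(|&c| c > 0)
    .collect();
assert_eq!(cuda_graphs, vec![2, 4]);
```

After this change `spawn_shards` forwards the resolved list as-is; at least in the hunks shown here, nothing strips the sentinel any longer.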
@@ -1009,7 +1003,7 @@ fn spawn_shards(
         disable_custom_kernels,
         watermark_gamma,
         watermark_delta,
-        cuda_graphs,
+        cuda_graphs_clone,
         cuda_memory_fraction,
         rope_scaling,
         rope_factor,
@@ -1363,6 +1357,27 @@ fn main() -> Result<(), LauncherError> {
         )));
     }
 
+    let cuda_graphs = match (&args.cuda_graphs, &args.quantize) {
+        (Some(cuda_graphs), Some(_q)) => cuda_graphs.clone(),
+        #[allow(deprecated)]
+        (
+            None,
+            Some(
+                Quantization::Bitsandbytes
+                    | Quantization::BitsandbytesNF4
+                    | Quantization::BitsandbytesFP4,
+            ),
+        ) => {
+            tracing::info!("Bitsandbytes doesn't work with cuda graphs, deactivating them");
+            vec![]
+        }
+        _ => {
+            let cuda_graphs = vec![1, 2, 4, 8, 16, 32];
+            tracing::info!("Using default cuda graphs {cuda_graphs:?}");
+            cuda_graphs
+        }
+    };
+
     if args.validation_workers == 0 {
         return Err(LauncherError::ArgumentValidation(
             "`validation_workers` must be > 0".to_string(),
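Distilled, the added resolution behaves like the function below (`Quantization` is pared down to the variants the match names; the launcher's enum has more). One consequence of the arm order worth noting: an explicit `--cuda-graphs` without `--quantize` matches neither of the first two arms, so it falls through to the catch-all and the default replaces the user's list as this diff stands.

```rust
// Standalone restatement of the match above, for illustration only.
#[allow(dead_code)]
enum Quantization {
    Bitsandbytes,
    BitsandbytesNF4,
    BitsandbytesFP4,
}

fn resolve_cuda_graphs(
    cli: Option<Vec<usize>>,
    quantize: Option<Quantization>,
) -> Vec<usize> {
    match (cli, quantize) {
        // Explicit flag plus any quantization: the user's list wins.
        (Some(graphs), Some(_)) => graphs,
        // Flag omitted under bitsandbytes: disable cuda graphs entirely.
        (
            None,
            Some(
                Quantization::Bitsandbytes
                | Quantization::BitsandbytesNF4
                | Quantization::BitsandbytesFP4,
            ),
        ) => vec![],
        // Everything else, including (Some(_), None), gets the new default.
        _ => vec![1, 2, 4, 8, 16, 32],
    }
}
```

So `resolve_cuda_graphs(None, None)` yields `vec![1, 2, 4, 8, 16, 32]`, matching the "Using default cuda graphs" log line, and the bitsandbytes arm implements the commit title: the two features don't work together, so the safe default is no graphs at all.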
@@ -1437,6 +1452,7 @@ fn main() -> Result<(), LauncherError> {
     spawn_shards(
         num_shard,
         &args,
+        cuda_graphs,
         shutdown.clone(),
         &shutdown_receiver,
         shutdown_sender,