Mirror of https://github.com/huggingface/text-generation-inference.git

Small CI cleanup.

parent 51ee60da74
commit f54865e6da
@@ -1284,7 +1284,7 @@ fn main() -> Result<(), LauncherError> {
         tracing::info!("{}", env_runtime);
     }
 
-    tracing::info!("{:?}", args);
+    tracing::info!("{:#?}", args);
 
     let get_max_position_embeddings = || -> Result<usize, Box<dyn std::error::Error>> {
         let model_id = args.model_id.clone();
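The only change in this hunk is the Debug formatter used to log the parsed CLI arguments: `{:?}` prints the whole struct on one line, while `{:#?}` pretty-prints it one field per line, which is much easier to scan in startup logs. A minimal sketch of the difference; the `Args` fields and values below are illustrative, not the launcher's real struct:

    #[derive(Debug)]
    struct Args {
        model_id: String,
        max_input_tokens: Option<usize>,
    }

    fn main() {
        let args = Args {
            model_id: "some/model".into(),
            max_input_tokens: None,
        };
        // Compact form: Args { model_id: "some/model", max_input_tokens: None }
        println!("{:?}", args);
        // Pretty-printed form:
        // Args {
        //     model_id: "some/model",
        //     max_input_tokens: None,
        // }
        println!("{:#?}", args);
    }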
@@ -1317,7 +1317,12 @@ fn main() -> Result<(), LauncherError> {
             (Some(max_position_embeddings), _) | (None, Some(max_position_embeddings)) => {
                 if max_position_embeddings > max_default {
                     let max = max_position_embeddings;
-                    tracing::info!("Model supports up to {max} but tgi will now set its default to {max_default} instead. This is to save VRAM by refusing large prompts in order to allow more users on the same hardware. You can increase that size using `--max-batch-prefill-tokens={} --max-total-tokens={max} --max-input-tokens={}`.", max + 50, max - 1);
+                    if args.max_input_tokens.is_none()
+                        && args.max_total_tokens.is_none()
+                        && args.max_batch_prefill_tokens.is_none()
+                    {
+                        tracing::info!("Model supports up to {max} but tgi will now set its default to {max_default} instead. This is to save VRAM by refusing large prompts in order to allow more users on the same hardware. You can increase that size using `--max-batch-prefill-tokens={} --max-total-tokens={max} --max-input-tokens={}`.", max + 50, max - 1);
+                    }
                     max_default
                 } else {
                     max_position_embeddings
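The new guard means the "Model supports up to …" notice is only logged when the user left `--max-input-tokens`, `--max-total-tokens` and `--max-batch-prefill-tokens` all unset, i.e. when the launcher is actually about to apply its capped default; anyone who set those limits explicitly no longer gets the message. A standalone sketch of the same pattern under a simplified `Args` of my own, not the launcher's:

    struct Args {
        max_input_tokens: Option<usize>,
        max_total_tokens: Option<usize>,
        max_batch_prefill_tokens: Option<u32>,
    }

    /// Pick the effective context size, warning only when the cap silently takes effect.
    fn effective_max(args: &Args, max_position_embeddings: usize, max_default: usize) -> usize {
        if max_position_embeddings > max_default {
            let max = max_position_embeddings;
            // Warn only when none of the three limits were set on the command line,
            // because that is the only case where the capped default is what actually runs.
            if args.max_input_tokens.is_none()
                && args.max_total_tokens.is_none()
                && args.max_batch_prefill_tokens.is_none()
            {
                println!("Model supports up to {max} but the default is capped to {max_default}.");
            }
            max_default
        } else {
            max_position_embeddings
        }
    }

    fn main() {
        let unset = Args {
            max_input_tokens: None,
            max_total_tokens: None,
            max_batch_prefill_tokens: None,
        };
        // The cap applies and the warning fires.
        assert_eq!(effective_max(&unset, 32768, 4096), 4096);
    }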
@@ -1389,8 +1394,7 @@ fn main() -> Result<(), LauncherError> {
     }
 
     let cuda_graphs = match (&args.cuda_graphs, &args.quantize) {
-        (Some(cuda_graphs), Some(_q)) => cuda_graphs.clone(),
-        (Some(cuda_graphs), None) => cuda_graphs.clone(),
+        (Some(cuda_graphs), _) => cuda_graphs.iter().cloned().filter(|&c| c > 0).collect(),
         #[allow(deprecated)]
         (
             None,
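The two `Some(cuda_graphs)` arms (quantized and non-quantized) collapse into a single arm, and the user-supplied list is now filtered to strictly positive values, so passing `0` disables CUDA graphs by producing an empty list. A small sketch of just that filtering step, detached from the launcher's actual argument types:

    // Keep only strictly positive batch sizes; a requested `0` therefore
    // disables CUDA graphs entirely by yielding an empty list.
    fn filter_cuda_graphs(requested: &[usize]) -> Vec<usize> {
        requested.iter().cloned().filter(|&c| c > 0).collect()
    }

    fn main() {
        assert_eq!(filter_cuda_graphs(&[1, 2, 4, 8]), vec![1, 2, 4, 8]);
        assert_eq!(filter_cuda_graphs(&[0]), Vec::<usize>::new());
        assert_eq!(filter_cuda_graphs(&[0, 16]), vec![16]);
    }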
@@ -1004,7 +1004,7 @@ async fn chat_completions(
         ..
     } = req;
 
-    let repetition_penalty = presence_penalty.map(|x| x + 2.0);
+    let repetition_penalty = presence_penalty.map(|x| x + 1.0);
     let max_new_tokens = max_tokens.or(Some(100));
     let logprobs = logprobs.unwrap_or(false);
     let tool_prompt = tool_prompt.unwrap_or_default();
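This hunk changes the offset used to turn the OpenAI-style `presence_penalty` into TGI's multiplicative `repetition_penalty` from +2.0 to +1.0, so a request with `presence_penalty: 0.0` now maps to a repetition penalty of 1.0 instead of 2.0. A hedged sketch of that mapping in isolation; the function name is mine and the "1.0 is neutral" reading is my interpretation, not something stated in the diff:

    // Map an additive presence_penalty onto a multiplicative repetition_penalty
    // using the new +1.0 offset from this hunk.
    fn to_repetition_penalty(presence_penalty: Option<f32>) -> Option<f32> {
        presence_penalty.map(|x| x + 1.0)
    }

    fn main() {
        // No presence penalty requested: no repetition penalty is set at all.
        assert_eq!(to_repetition_penalty(None), None);
        // 0.0 now maps to 1.0; with the old +2.0 offset it mapped to 2.0.
        assert_eq!(to_repetition_penalty(Some(0.0)), Some(1.0));
        assert_eq!(to_repetition_penalty(Some(1.5)), Some(2.5));
    }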
@@ -4,7 +4,7 @@ import os
 MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
 # This is overridden by the cli
 cuda_graphs = os.getenv("CUDA_GRAPHS")
-if torch.cuda.is_available() and cuda_graphs is not None and cuda_graphs != "0":
+if cuda_graphs is not None:
     try:
         cuda_graphs = [int(item) for item in cuda_graphs.split(",")]
     except Exception as e:
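On the Python side, parsing of the `CUDA_GRAPHS` environment variable no longer depends on `torch.cuda.is_available()` and no longer treats the literal string `"0"` as a special off switch; any set value is parsed as a comma-separated list of ints, with the zero case presumably covered by the launcher-side filter above. A Rust sketch of the equivalent comma-separated parsing; the helper name and error handling are mine, not TGI's:

    use std::env;
    use std::num::ParseIntError;

    // Parse a comma-separated setting such as "1,2,4,8" into batch sizes.
    fn parse_cuda_graphs(raw: &str) -> Result<Vec<usize>, ParseIntError> {
        raw.split(',').map(|item| item.trim().parse::<usize>()).collect()
    }

    fn main() {
        match env::var("CUDA_GRAPHS") {
            // A set variable is parsed; a malformed value is reported instead of crashing.
            Ok(raw) => match parse_cuda_graphs(&raw) {
                Ok(sizes) => println!("cuda graph batch sizes: {sizes:?}"),
                Err(e) => eprintln!("could not parse CUDA_GRAPHS={raw}: {e}"),
            },
            // Unset simply means "no explicit setting", mirroring `cuda_graphs is not None`.
            Err(_) => println!("CUDA_GRAPHS not set"),
        }
    }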