Mirror of https://github.com/huggingface/text-generation-inference.git
Improving the logging system.

- Added a debug log for speculated ids (helps gauge the speculator's quality from the logs).
- Removed trailing newlines from child-process logs when re-emitting them in non-JSON mode.
- Made the standard level closer to what is expected (the level now applies only to our own binaries).
- Propagated that level correctly to the shard (it was previously forced to INFO).
Commit 767eb8b0f1 (parent: 2f243a1a15)
launcher/src/main.rs

@@ -17,7 +17,7 @@ use std::thread::sleep;
 use std::time::{Duration, Instant};
 use std::{fs, io};
 use thiserror::Error;
-use tracing_subscriber::EnvFilter;
+use tracing_subscriber::{filter::LevelFilter, EnvFilter};

 mod env_runtime;

@@ -452,6 +452,7 @@ fn shard_manager(
     max_total_tokens: usize,
     max_batch_size: Option<usize>,
     otlp_endpoint: Option<String>,
+    log_level: LevelFilter,
     status_sender: mpsc::Sender<ShardStatus>,
     shutdown: Arc<AtomicBool>,
     _shutdown_sender: mpsc::Sender<()>,
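Plumbing note: shard_manager now receives the resolved log_level, so the launcher's effective verbosity reaches each Python shard instead of a hardcoded INFO; spawn_shards (below) threads the launcher-wide max_log_level into this parameter.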
@@ -474,7 +475,7 @@ fn shard_manager(
         "--uds-path".to_string(),
         uds_path,
         "--logger-level".to_string(),
-        "INFO".to_string(),
+        log_level.to_string().to_uppercase(),
         "--json-output".to_string(),
     ];

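The hardcoded "INFO" becomes the propagated level. A minimal sketch of the conversion, assuming only that LevelFilter's Display output is lowercase (which is why the uppercasing is needed before handing the string to the shard's --logger-level flag); the helper name is hypothetical:

use tracing_subscriber::filter::LevelFilter;

// Hypothetical helper, not launcher code: render the Rust-side level as
// the uppercase string the Python shard's --logger-level flag expects.
fn logger_level_arg(level: LevelFilter) -> String {
    // LevelFilter's Display prints lowercase ("debug"), hence the uppercasing.
    level.to_string().to_uppercase() // e.g. LevelFilter::DEBUG -> "DEBUG"
}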
@@ -752,13 +753,13 @@ struct PythonLogMessage {
 impl PythonLogMessage {
     fn trace(&self) {
         match self.record.level.name {
-            PythonLogLevelEnum::Trace => tracing::trace!("{}", self.text),
-            PythonLogLevelEnum::Debug => tracing::debug!("{}", self.text),
-            PythonLogLevelEnum::Info => tracing::info!("{}", self.text),
-            PythonLogLevelEnum::Success => tracing::info!("{}", self.text),
-            PythonLogLevelEnum::Warning => tracing::warn!("{}", self.text),
-            PythonLogLevelEnum::Error => tracing::error!("{}", self.text),
-            PythonLogLevelEnum::Critical => tracing::error!("{}", self.text),
+            PythonLogLevelEnum::Trace => tracing::trace!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Debug => tracing::debug!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Info => tracing::info!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Success => tracing::info!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Warning => tracing::warn!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Error => tracing::error!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Critical => tracing::error!("{}", self.text.trim_end()),
         }
     }
 }
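Why trim_end(): each line read from the shard's stdout still carries its trailing newline, and the tracing formatter adds one of its own on re-emission, so non-JSON logs used to gain a blank line per record. A minimal sketch of the idea (reemit and its parameter are illustrative names, not the launcher's):

// Re-emit one child-process log line through tracing; trim_end() drops
// the '\n' the child already wrote so the formatter's newline is the
// only one.
fn reemit(line: &str) {
    tracing::info!("{}", line.trim_end());
}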
@@ -978,6 +979,7 @@ fn spawn_shards(
     args: &Args,
     cuda_graphs: Vec<usize>,
     max_total_tokens: usize,
+    max_log_level: LevelFilter,
     shutdown: Arc<AtomicBool>,
     shutdown_receiver: &mpsc::Receiver<()>,
     shutdown_sender: mpsc::Sender<()>,
@@ -1035,6 +1037,7 @@ fn spawn_shards(
                 max_total_tokens,
                 max_batch_size,
                 otlp_endpoint,
+                max_log_level,
                 status_sender,
                 shutdown,
                 shutdown_sender,
@@ -1265,8 +1268,22 @@ fn main() -> Result<(), LauncherError> {
     let args: Args = Args::parse();

     // Filter events with LOG_LEVEL
-    let env_filter =
-        EnvFilter::try_from_env("LOG_LEVEL").unwrap_or_else(|_| EnvFilter::new("info"));
+    let varname = "LOG_LEVEL";
+    let env_filter = if let Ok(log_level) = std::env::var(varname) {
+        // Override so that simple logs are not spammed with tokio-level information
+        let log_level = match &log_level[..] {
+            "warn" => "text_generation_launcher=warn,text_generation_router=warn",
+            "info" => "text_generation_launcher=info,text_generation_router=info",
+            "debug" => "text_generation_launcher=debug,text_generation_router=debug",
+            log_level => log_level,
+        };
+        EnvFilter::builder()
+            .with_default_directive(LevelFilter::INFO.into())
+            .parse_lossy(log_level)
+    } else {
+        EnvFilter::new("info")
+    };
+    let max_log_level = env_filter.max_level_hint().unwrap_or(LevelFilter::INFO);

     if args.json_output {
         tracing_subscriber::fmt()
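Two things happen here. A bare warn/info/debug in LOG_LEVEL is rewritten into directives scoped to the project's own crates, so simple logs are no longer flooded by dependencies, while any other value (e.g. an explicit directive string such as h2=off,info) is parsed as-is, and the unset case falls back to plain info. Then max_level_hint() recovers the most verbose level the filter can enable, which is the value forwarded to the shards. A small self-contained check of that mechanism, assuming tracing-subscriber with its env-filter feature (an illustration, not launcher code):

use tracing_subscriber::{filter::LevelFilter, EnvFilter};

fn main() {
    // Same construction as the launcher: a default of INFO plus
    // crate-scoped debug directives.
    let filter = EnvFilter::builder()
        .with_default_directive(LevelFilter::INFO.into())
        .parse_lossy("text_generation_launcher=debug,text_generation_router=debug");
    // The hint is the most verbose level any directive enables.
    assert_eq!(filter.max_level_hint(), Some(LevelFilter::DEBUG));
}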
@@ -1491,6 +1508,7 @@ fn main() -> Result<(), LauncherError> {
         &args,
         cuda_graphs,
         max_total_tokens,
+        max_log_level,
         shutdown.clone(),
         &shutdown_receiver,
         shutdown_sender,
router/src/main.rs

@@ -20,7 +20,7 @@ use tokenizers::Tokenizer;
 use tower_http::cors::AllowOrigin;
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::util::SubscriberInitExt;
-use tracing_subscriber::{EnvFilter, Layer};
+use tracing_subscriber::{filter::LevelFilter, EnvFilter, Layer};

 /// App Configuration
 #[derive(Parser, Debug)]
@@ -454,8 +454,21 @@ fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
     }

     // Filter events with LOG_LEVEL
-    let env_filter =
-        EnvFilter::try_from_env("LOG_LEVEL").unwrap_or_else(|_| EnvFilter::new("info"));
+    let varname = "LOG_LEVEL";
+    let env_filter = if let Ok(log_level) = std::env::var(varname) {
+        // Override so that simple logs are not spammed with tokio-level information
+        let log_level = match &log_level[..] {
+            "warn" => "text_generation_launcher=warn,text_generation_router=warn",
+            "info" => "text_generation_launcher=info,text_generation_router=info",
+            "debug" => "text_generation_launcher=debug,text_generation_router=debug",
+            log_level => log_level,
+        };
+        EnvFilter::builder()
+            .with_default_directive(LevelFilter::INFO.into())
+            .parse_lossy(log_level)
+    } else {
+        EnvFilter::new("info")
+    };

     tracing_subscriber::registry()
         .with(env_filter)
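The router applies the same LOG_LEVEL interpretation in init_logging, so launcher and router stay in agreement: bare levels are scoped to the text_generation_launcher and text_generation_router crates, explicit directive strings pass through verbatim, and an unset variable falls back to info.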
server/text_generation_server/models/flash_causal_lm.py

@@ -17,6 +17,7 @@ from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.models import Model
 from text_generation_server.utils.tokens import batch_top_tokens
+from text_generation_server.utils.dist import RANK
 from text_generation_server.utils.speculate import get_speculate
 from text_generation_server.models.types import (
     Batch,
@@ -1187,6 +1188,10 @@ class FlashCausalLM(Model):
             next_token_texts = []
             left = 0

+            if n_accepted_ids > 1:
+                if RANK == 0:
+                    logger.debug(f"Speculated ids {n_accepted_ids - 1}")
+
             current_stopped = False
             for j in range(index, index + n_accepted_ids):
                 # Generated token
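The RANK == 0 guard (RANK comes from text_generation_server.utils.dist, imported above) keeps the new debug line from being emitted once per GPU in sharded deployments; only the rank-0 process reports how many speculated ids were accepted. The count is n_accepted_ids - 1 because one accepted token is always the model's own next token rather than a speculation.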