Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-11 20:34:54 +00:00)
Improving the logging system.
- Added a debug log for speculated ids (helps assess the quality of a speculator from the logs).
- Removed trailing newlines from child-process logs when re-emitting them in non-JSON mode.
- Made the default log level closer to what is expected (it now applies only to our own binaries).
- Propagated that level correctly to the shard (it was previously forced to INFO).
This commit is contained in:
parent 2f243a1a15
commit 767eb8b0f1
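For orientation, the filter construction that the launcher and router hunks below introduce can be pulled out of the diff and sketched on its own. This is a minimal, compilable approximation, not the full code; the helper name build_env_filter is made up:

use tracing_subscriber::filter::LevelFilter;
use tracing_subscriber::EnvFilter;

/// Sketch of the new LOG_LEVEL handling (see the launcher/router hunks below).
/// A bare level such as "debug" is rewritten into per-crate directives so that
/// dependency crates do not flood plain-text logs.
fn build_env_filter() -> EnvFilter {
    if let Ok(log_level) = std::env::var("LOG_LEVEL") {
        let directives = match log_level.as_str() {
            "warn" => "text_generation_launcher=warn,text_generation_router=warn",
            "info" => "text_generation_launcher=info,text_generation_router=info",
            "debug" => "text_generation_launcher=debug,text_generation_router=debug",
            other => other,
        };
        // Everything not matched by a directive still defaults to INFO.
        EnvFilter::builder()
            .with_default_directive(LevelFilter::INFO.into())
            .parse_lossy(directives)
    } else {
        EnvFilter::new("info")
    }
}

Rewriting a bare level into per-crate directives is what keeps dependencies such as tokio from spamming the output when LOG_LEVEL=debug is set; the launcher then derives a single maximum level from this filter and forwards it to each shard.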
@@ -17,7 +17,7 @@ use std::thread::sleep;
 use std::time::{Duration, Instant};
 use std::{fs, io};
 use thiserror::Error;
-use tracing_subscriber::EnvFilter;
+use tracing_subscriber::{filter::LevelFilter, EnvFilter};
 
 mod env_runtime;
 
@@ -452,6 +452,7 @@ fn shard_manager(
     max_total_tokens: usize,
     max_batch_size: Option<usize>,
     otlp_endpoint: Option<String>,
+    log_level: LevelFilter,
     status_sender: mpsc::Sender<ShardStatus>,
     shutdown: Arc<AtomicBool>,
     _shutdown_sender: mpsc::Sender<()>,
@@ -474,7 +475,7 @@ fn shard_manager(
        "--uds-path".to_string(),
        uds_path,
        "--logger-level".to_string(),
-       "INFO".to_string(),
+       log_level.to_string().to_uppercase(),
        "--json-output".to_string(),
    ];
 
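In other words, the shard is no longer launched with a hard-coded --logger-level INFO; it now receives the launcher's effective level, uppercased, so LOG_LEVEL=debug should start the Python shard at DEBUG as well. This is the commit's last bullet point.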
@@ -752,13 +753,13 @@ struct PythonLogMessage
 impl PythonLogMessage {
     fn trace(&self) {
         match self.record.level.name {
-            PythonLogLevelEnum::Trace => tracing::trace!("{}", self.text),
-            PythonLogLevelEnum::Debug => tracing::debug!("{}", self.text),
-            PythonLogLevelEnum::Info => tracing::info!("{}", self.text),
-            PythonLogLevelEnum::Success => tracing::info!("{}", self.text),
-            PythonLogLevelEnum::Warning => tracing::warn!("{}", self.text),
-            PythonLogLevelEnum::Error => tracing::error!("{}", self.text),
-            PythonLogLevelEnum::Critical => tracing::error!("{}", self.text),
+            PythonLogLevelEnum::Trace => tracing::trace!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Debug => tracing::debug!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Info => tracing::info!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Success => tracing::info!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Warning => tracing::warn!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Error => tracing::error!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Critical => tracing::error!("{}", self.text.trim_end()),
        }
    }
 }
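The trim_end() calls implement the second bullet point: text relayed from a shard log record can keep its trailing newline, which in non-JSON mode renders as an empty line after every message. A small illustrative snippet (the message text is invented):

fn main() {
    tracing_subscriber::fmt().init();
    // Hypothetical text as it might arrive from a child-process log record,
    // still carrying the newline the child wrote.
    let text = "Shard ready on unix:///tmp/shard-0\n";
    tracing::info!("{}", text);            // emits the record followed by an extra blank line
    tracing::info!("{}", text.trim_end()); // emits exactly one line
}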
@@ -978,6 +979,7 @@ fn spawn_shards(
     args: &Args,
     cuda_graphs: Vec<usize>,
     max_total_tokens: usize,
+    max_log_level: LevelFilter,
     shutdown: Arc<AtomicBool>,
     shutdown_receiver: &mpsc::Receiver<()>,
     shutdown_sender: mpsc::Sender<()>,
@@ -1035,6 +1037,7 @@ fn spawn_shards(
            max_total_tokens,
            max_batch_size,
            otlp_endpoint,
+           max_log_level,
            status_sender,
            shutdown,
            shutdown_sender,
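These two spawn_shards hunks are plumbing: the max_log_level computed in main() (next hunk) is threaded through spawn_shards into every shard_manager call, where it feeds the --logger-level argument shown above.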
@@ -1265,8 +1268,22 @@ fn main() -> Result<(), LauncherError> {
     let args: Args = Args::parse();
 
     // Filter events with LOG_LEVEL
-    let env_filter =
-        EnvFilter::try_from_env("LOG_LEVEL").unwrap_or_else(|_| EnvFilter::new("info"));
+    let varname = "LOG_LEVEL";
+    let env_filter = if let Ok(log_level) = std::env::var(varname) {
+        // Override to avoid simple logs to be spammed with tokio level informations
+        let log_level = match &log_level[..] {
+            "warn" => "text_generation_launcher=warn,text_generation_router=warn",
+            "info" => "text_generation_launcher=info,text_generation_router=info",
+            "debug" => "text_generation_launcher=debug,text_generation_router=debug",
+            log_level => log_level,
+        };
+        EnvFilter::builder()
+            .with_default_directive(LevelFilter::INFO.into())
+            .parse_lossy(log_level)
+    } else {
+        EnvFilter::new("info")
+    };
+    let max_log_level = env_filter.max_level_hint().unwrap_or(LevelFilter::INFO);
 
     if args.json_output {
         tracing_subscriber::fmt()
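env_filter.max_level_hint() collapses the directives into the most verbose level they allow, falling back to INFO, and that single LevelFilter is what spawn_shards passes on; with LOG_LEVEL=debug this should come out as DEBUG for both the launcher's own output and the shards.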
@@ -1491,6 +1508,7 @@ fn main() -> Result<(), LauncherError> {
        &args,
        cuda_graphs,
        max_total_tokens,
+       max_log_level,
        shutdown.clone(),
        &shutdown_receiver,
        shutdown_sender,
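The next hunks give the router the same treatment: the tracing_subscriber import gains LevelFilter, and init_logging builds its EnvFilter with the identical LOG_LEVEL mapping, so both Rust binaries interpret the variable the same way.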
@@ -20,7 +20,7 @@ use tokenizers::Tokenizer;
 use tower_http::cors::AllowOrigin;
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::util::SubscriberInitExt;
-use tracing_subscriber::{EnvFilter, Layer};
+use tracing_subscriber::{filter::LevelFilter, EnvFilter, Layer};
 
 /// App Configuration
 #[derive(Parser, Debug)]
@@ -454,8 +454,21 @@ fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
     }
 
     // Filter events with LOG_LEVEL
-    let env_filter =
-        EnvFilter::try_from_env("LOG_LEVEL").unwrap_or_else(|_| EnvFilter::new("info"));
+    let varname = "LOG_LEVEL";
+    let env_filter = if let Ok(log_level) = std::env::var(varname) {
+        // Override to avoid simple logs to be spammed with tokio level informations
+        let log_level = match &log_level[..] {
+            "warn" => "text_generation_launcher=warn,text_generation_router=warn",
+            "info" => "text_generation_launcher=info,text_generation_router=info",
+            "debug" => "text_generation_launcher=debug,text_generation_router=debug",
+            log_level => log_level,
+        };
+        EnvFilter::builder()
+            .with_default_directive(LevelFilter::INFO.into())
+            .parse_lossy(log_level)
+    } else {
+        EnvFilter::new("info")
+    };
 
     tracing_subscriber::registry()
         .with(env_filter)
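The final hunks are in the Python server, in FlashCausalLM: when speculative decoding accepts extra tokens (n_accepted_ids > 1), a debug line with the number of speculated ids is emitted, guarded by RANK == 0 so only one rank logs it. This is the first bullet point of the commit message, and the LOG_LEVEL propagation above is what makes the line visible when running with LOG_LEVEL=debug.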
@@ -17,6 +17,7 @@ from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.models import Model
 from text_generation_server.utils.tokens import batch_top_tokens
+from text_generation_server.utils.dist import RANK
 from text_generation_server.utils.speculate import get_speculate
 from text_generation_server.models.types import (
     Batch,
@@ -1187,6 +1188,10 @@ class FlashCausalLM(Model):
             next_token_texts = []
             left = 0
 
+            if n_accepted_ids > 1:
+                if RANK == 0:
+                    logger.debug(f"Speculated ids {n_accepted_ids - 1}")
+
             current_stopped = False
             for j in range(index, index + n_accepted_ids):
                 # Generated token