mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-10 11:54:52 +00:00
cleanup
This commit is contained in:
parent
1e5a30990b
commit
f81f0828d7
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -2109,6 +2109,7 @@ dependencies = [
|
|||||||
"futures",
|
"futures",
|
||||||
"grpc-metadata",
|
"grpc-metadata",
|
||||||
"prost",
|
"prost",
|
||||||
|
"prost-build",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tonic",
|
"tonic",
|
||||||
|
@ -46,6 +46,7 @@ to power LLMs api-inference widgets.
|
|||||||
- Logits warpers (temperature scaling, topk, repetition penalty ...)
|
- Logits warpers (temperature scaling, topk, repetition penalty ...)
|
||||||
- Stop sequences
|
- Stop sequences
|
||||||
- Log probabilities
|
- Log probabilities
|
||||||
|
- Distributed tracing with Open Telemetry
|
||||||
|
|
||||||
## Officially supported models
|
## Officially supported models
|
||||||
|
|
||||||
|
@ -44,6 +44,8 @@ struct Args {
|
|||||||
master_port: usize,
|
master_port: usize,
|
||||||
#[clap(long, env)]
|
#[clap(long, env)]
|
||||||
json_output: bool,
|
json_output: bool,
|
||||||
|
#[clap(long, env)]
|
||||||
|
otlp_endpoint: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main() -> ExitCode {
|
fn main() -> ExitCode {
|
||||||
@ -62,6 +64,7 @@ fn main() -> ExitCode {
|
|||||||
master_addr,
|
master_addr,
|
||||||
master_port,
|
master_port,
|
||||||
json_output,
|
json_output,
|
||||||
|
otlp_endpoint,
|
||||||
} = Args::parse();
|
} = Args::parse();
|
||||||
|
|
||||||
if json_output {
|
if json_output {
|
||||||
@ -99,6 +102,7 @@ fn main() -> ExitCode {
|
|||||||
let status_sender = status_sender.clone();
|
let status_sender = status_sender.clone();
|
||||||
let shutdown = shutdown.clone();
|
let shutdown = shutdown.clone();
|
||||||
let shutdown_sender = shutdown_sender.clone();
|
let shutdown_sender = shutdown_sender.clone();
|
||||||
|
let otlp_endpoint = otlp_endpoint.clone();
|
||||||
thread::spawn(move || {
|
thread::spawn(move || {
|
||||||
shard_manager(
|
shard_manager(
|
||||||
model_id,
|
model_id,
|
||||||
@ -109,6 +113,7 @@ fn main() -> ExitCode {
|
|||||||
num_shard,
|
num_shard,
|
||||||
master_addr,
|
master_addr,
|
||||||
master_port,
|
master_port,
|
||||||
|
otlp_endpoint,
|
||||||
status_sender,
|
status_sender,
|
||||||
shutdown,
|
shutdown,
|
||||||
shutdown_sender,
|
shutdown_sender,
|
||||||
@ -174,6 +179,12 @@ fn main() -> ExitCode {
|
|||||||
argv.push("--json-output".to_string());
|
argv.push("--json-output".to_string());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// OpenTelemetry
|
||||||
|
if let Some(otlp_endpoint) = otlp_endpoint {
|
||||||
|
argv.push("--otlp-endpoint".to_string());
|
||||||
|
argv.push(otlp_endpoint);
|
||||||
|
}
|
||||||
|
|
||||||
let mut webserver = match Popen::create(
|
let mut webserver = match Popen::create(
|
||||||
&argv,
|
&argv,
|
||||||
PopenConfig {
|
PopenConfig {
|
||||||
@ -264,6 +275,7 @@ fn shard_manager(
|
|||||||
world_size: usize,
|
world_size: usize,
|
||||||
master_addr: String,
|
master_addr: String,
|
||||||
master_port: usize,
|
master_port: usize,
|
||||||
|
otlp_endpoint: Option<String>,
|
||||||
status_sender: mpsc::Sender<ShardStatus>,
|
status_sender: mpsc::Sender<ShardStatus>,
|
||||||
shutdown: Arc<Mutex<bool>>,
|
shutdown: Arc<Mutex<bool>>,
|
||||||
_shutdown_sender: mpsc::Sender<()>,
|
_shutdown_sender: mpsc::Sender<()>,
|
||||||
@ -286,6 +298,7 @@ fn shard_manager(
|
|||||||
"--json-output".to_string(),
|
"--json-output".to_string(),
|
||||||
];
|
];
|
||||||
|
|
||||||
|
// Activate tensor parallelism
|
||||||
if world_size > 1 {
|
if world_size > 1 {
|
||||||
shard_argv.push("--sharded".to_string());
|
shard_argv.push("--sharded".to_string());
|
||||||
}
|
}
|
||||||
@ -294,11 +307,18 @@ fn shard_manager(
|
|||||||
shard_argv.push("--quantize".to_string())
|
shard_argv.push("--quantize".to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Model optional revision
|
||||||
if let Some(revision) = revision {
|
if let Some(revision) = revision {
|
||||||
shard_argv.push("--revision".to_string());
|
shard_argv.push("--revision".to_string());
|
||||||
shard_argv.push(revision)
|
shard_argv.push(revision)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// OpenTelemetry
|
||||||
|
if let Some(otlp_endpoint) = otlp_endpoint {
|
||||||
|
shard_argv.push("--otlp-endpoint".to_string());
|
||||||
|
shard_argv.push(otlp_endpoint);
|
||||||
|
}
|
||||||
|
|
||||||
let mut env = vec![
|
let mut env = vec![
|
||||||
("RANK".into(), rank.to_string().into()),
|
("RANK".into(), rank.to_string().into()),
|
||||||
("WORLD_SIZE".into(), world_size.to_string().into()),
|
("WORLD_SIZE".into(), world_size.to_string().into()),
|
||||||
|
@ -15,4 +15,5 @@ tracing = "^0.1"
|
|||||||
tracing-error = "^0.2"
|
tracing-error = "^0.2"
|
||||||
|
|
||||||
[build-dependencies]
|
[build-dependencies]
|
||||||
tonic-build = "^0.8"
|
tonic-build = "0.8.4"
|
||||||
|
prost-build = "0.11.6"
|
||||||
|
@ -3,12 +3,16 @@ use std::fs;
|
|||||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
println!("cargo:rerun-if-changed=../../proto/generate.proto");
|
println!("cargo:rerun-if-changed=../../proto/generate.proto");
|
||||||
fs::create_dir("src/pb").unwrap_or(());
|
fs::create_dir("src/pb").unwrap_or(());
|
||||||
|
|
||||||
|
let mut config = prost_build::Config::new();
|
||||||
|
config.protoc_arg("--experimental_allow_proto3_optional");
|
||||||
|
|
||||||
tonic_build::configure()
|
tonic_build::configure()
|
||||||
.build_client(true)
|
.build_client(true)
|
||||||
.build_server(false)
|
.build_server(false)
|
||||||
.out_dir("src/pb")
|
.out_dir("src/pb")
|
||||||
.include_file("mod.rs")
|
.include_file("mod.rs")
|
||||||
.compile(&["../../proto/generate.proto"], &["../../proto"])
|
.compile_with_config(config, &["../../proto/generate.proto"], &["../../proto"])
|
||||||
.unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));
|
.unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -17,21 +17,25 @@ use tonic::Status;
|
|||||||
|
|
||||||
#[derive(Error, Debug, Clone)]
|
#[derive(Error, Debug, Clone)]
|
||||||
pub enum ClientError {
|
pub enum ClientError {
|
||||||
#[error("Could not connect to Text Generation server: {0:?}")]
|
#[error("Could not connect to Text Generation server: {0}")]
|
||||||
Connection(String),
|
Connection(String),
|
||||||
#[error("Server error: {0:?}")]
|
#[error("Server error: {0}")]
|
||||||
Generation(String),
|
Generation(String),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<Status> for ClientError {
|
impl From<Status> for ClientError {
|
||||||
fn from(err: Status) -> Self {
|
fn from(err: Status) -> Self {
|
||||||
Self::Generation(err.message().to_string())
|
let err = Self::Generation(err.message().to_string());
|
||||||
|
tracing::error!("{err}");
|
||||||
|
err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<transport::Error> for ClientError {
|
impl From<transport::Error> for ClientError {
|
||||||
fn from(err: transport::Error) -> Self {
|
fn from(err: transport::Error) -> Self {
|
||||||
Self::Connection(err.to_string())
|
let err = Self::Connection(err.to_string());
|
||||||
|
tracing::error!("{err}");
|
||||||
|
err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3,7 +3,6 @@ use crate::validation::{Validation, ValidationError};
|
|||||||
use crate::GenerateRequest;
|
use crate::GenerateRequest;
|
||||||
use crate::{Entry, Queue, Token};
|
use crate::{Entry, Queue, Token};
|
||||||
use nohash_hasher::IntMap;
|
use nohash_hasher::IntMap;
|
||||||
use opentelemetry::trace::TraceContextExt;
|
|
||||||
use std::future::Future;
|
use std::future::Future;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use text_generation_client::{
|
use text_generation_client::{
|
||||||
@ -14,8 +13,7 @@ use tokio::sync::{mpsc, Notify, Semaphore, TryAcquireError};
|
|||||||
use tokio::time::Instant;
|
use tokio::time::Instant;
|
||||||
use tokio_stream::wrappers::UnboundedReceiverStream;
|
use tokio_stream::wrappers::UnboundedReceiverStream;
|
||||||
use tokio_stream::StreamExt;
|
use tokio_stream::StreamExt;
|
||||||
use tracing::{info_span, instrument, Instrument};
|
use tracing::{info_span, instrument, Instrument, Span};
|
||||||
use tracing_opentelemetry::OpenTelemetrySpanExt;
|
|
||||||
|
|
||||||
/// Inference struct
|
/// Inference struct
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
@ -78,7 +76,14 @@ impl Infer {
|
|||||||
) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
|
) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
|
||||||
// Limit concurrent requests by acquiring a permit from the semaphore
|
// Limit concurrent requests by acquiring a permit from the semaphore
|
||||||
// This permit will live as long as Entry
|
// This permit will live as long as Entry
|
||||||
let permit = self.clone().limit_concurrent_requests.try_acquire_owned()?;
|
let permit = self
|
||||||
|
.clone()
|
||||||
|
.limit_concurrent_requests
|
||||||
|
.try_acquire_owned()
|
||||||
|
.map_err(|err| {
|
||||||
|
tracing::error!("{err}");
|
||||||
|
err
|
||||||
|
})?;
|
||||||
|
|
||||||
// Validate request
|
// Validate request
|
||||||
let valid_request = self.validation.validate(request).await?;
|
let valid_request = self.validation.validate(request).await?;
|
||||||
@ -90,9 +95,9 @@ impl Infer {
|
|||||||
self.queue.append(Entry {
|
self.queue.append(Entry {
|
||||||
request: valid_request,
|
request: valid_request,
|
||||||
response_tx,
|
response_tx,
|
||||||
parent_span: info_span!("entry"),
|
span: Span::current(),
|
||||||
batch_span: None,
|
batch_span: None,
|
||||||
time: Instant::now(),
|
queue_time: Instant::now(),
|
||||||
batch_time: None,
|
batch_time: None,
|
||||||
_permit: permit,
|
_permit: permit,
|
||||||
});
|
});
|
||||||
@ -166,7 +171,9 @@ impl Infer {
|
|||||||
start,
|
start,
|
||||||
})
|
})
|
||||||
} else {
|
} else {
|
||||||
Err(InferError::IncompleteGeneration)
|
let err = InferError::IncompleteGeneration;
|
||||||
|
tracing::error!("{err}");
|
||||||
|
Err(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -235,13 +242,13 @@ async fn batching_task(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let next_batch_span = info_span!("batch");
|
// Create span for this batch to add context to inference calls
|
||||||
|
let next_batch_span = info_span!(parent: None, "batch");
|
||||||
entries.iter_mut().for_each(|(_, entry)| {
|
entries.iter_mut().for_each(|(_, entry)| {
|
||||||
// Create a new span for this entry/batch tuple
|
// Create a new span to link the batch back to this entry
|
||||||
let entry_batch_span = info_span!(parent: &entry.parent_span, "infer");
|
let entry_batch_span = info_span!(parent: &entry.span, "infer");
|
||||||
// Add link to span
|
// Add relationship
|
||||||
entry_batch_span
|
entry_batch_span.follows_from(&next_batch_span);
|
||||||
.add_link(next_batch_span.context().span().span_context().clone());
|
|
||||||
// Update entry
|
// Update entry
|
||||||
entry.batch_span = Some(entry_batch_span);
|
entry.batch_span = Some(entry_batch_span);
|
||||||
});
|
});
|
||||||
@ -268,7 +275,7 @@ async fn wrap_future(
|
|||||||
}
|
}
|
||||||
// If we have an error, we discard the whole batch
|
// If we have an error, we discard the whole batch
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
send_error(err, entries);
|
send_errors(err, entries);
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -276,12 +283,17 @@ async fn wrap_future(
|
|||||||
|
|
||||||
/// Send errors to Infer for all `entries`
|
/// Send errors to Infer for all `entries`
|
||||||
#[instrument]
|
#[instrument]
|
||||||
fn send_error(error: ClientError, entries: &mut IntMap<u64, Entry>) {
|
fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
|
||||||
entries.drain().for_each(|(_, entry)| {
|
entries.drain().for_each(|(_, entry)| {
|
||||||
|
// Create and enter a span to link this function back to the entry
|
||||||
|
let _send_error_span = info_span!(parent: entry.batch_span.as_ref().expect("batch_span is None. This is a bug."), "send_error").entered();
|
||||||
|
let err = InferError::GenerationError(error.to_string());
|
||||||
|
tracing::error!("{err}");
|
||||||
|
|
||||||
// unwrap_or is valid here as we don't care if the receiver is gone.
|
// unwrap_or is valid here as we don't care if the receiver is gone.
|
||||||
entry
|
entry
|
||||||
.response_tx
|
.response_tx
|
||||||
.send(Err(InferError::GenerationError(error.to_string())))
|
.send(Err(err))
|
||||||
.unwrap_or(());
|
.unwrap_or(());
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -296,6 +308,7 @@ fn send_generations(generations: Vec<Generation>, entries: &mut IntMap<u64, Entr
|
|||||||
.get(&generation.request_id)
|
.get(&generation.request_id)
|
||||||
.expect("ID not found in entries. This is a bug.");
|
.expect("ID not found in entries. This is a bug.");
|
||||||
|
|
||||||
|
// Create and enter a span to link this function back to the entry
|
||||||
let _generation_span = info_span!(parent: entry.batch_span.as_ref().expect("batch_span is None. This is a bug."), "send_generation").entered();
|
let _generation_span = info_span!(parent: entry.batch_span.as_ref().expect("batch_span is None. This is a bug."), "send_generation").entered();
|
||||||
|
|
||||||
if let Some(prefill_tokens) = generation.prefill_tokens {
|
if let Some(prefill_tokens) = generation.prefill_tokens {
|
||||||
@ -328,7 +341,7 @@ fn send_generations(generations: Vec<Generation>, entries: &mut IntMap<u64, Entr
|
|||||||
.send(Ok(InferStreamResponse::End {
|
.send(Ok(InferStreamResponse::End {
|
||||||
token,
|
token,
|
||||||
generated_text,
|
generated_text,
|
||||||
queued: entry.time,
|
queued: entry.queue_time,
|
||||||
start: entry.batch_time.unwrap(),
|
start: entry.batch_time.unwrap(),
|
||||||
}))
|
}))
|
||||||
.unwrap_or(());
|
.unwrap_or(());
|
||||||
|
@ -2,14 +2,12 @@ use crate::infer::InferError;
|
|||||||
use crate::infer::InferStreamResponse;
|
use crate::infer::InferStreamResponse;
|
||||||
use crate::validation::ValidGenerateRequest;
|
use crate::validation::ValidGenerateRequest;
|
||||||
use nohash_hasher::{BuildNoHashHasher, IntMap};
|
use nohash_hasher::{BuildNoHashHasher, IntMap};
|
||||||
use opentelemetry::trace::TraceContextExt;
|
|
||||||
use std::cmp::min;
|
use std::cmp::min;
|
||||||
use text_generation_client::{Batch, Request};
|
use text_generation_client::{Batch, Request};
|
||||||
use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender};
|
use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender};
|
||||||
use tokio::sync::{mpsc, oneshot, OwnedSemaphorePermit};
|
use tokio::sync::{mpsc, oneshot, OwnedSemaphorePermit};
|
||||||
use tokio::time::Instant;
|
use tokio::time::Instant;
|
||||||
use tracing::{info_span, Span};
|
use tracing::{info_span, instrument, Span};
|
||||||
use tracing_opentelemetry::OpenTelemetrySpanExt;
|
|
||||||
|
|
||||||
/// Queue entry
|
/// Queue entry
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
@ -18,12 +16,13 @@ pub(crate) struct Entry {
|
|||||||
pub request: ValidGenerateRequest,
|
pub request: ValidGenerateRequest,
|
||||||
/// Response sender to communicate between the Infer struct and the batching_task
|
/// Response sender to communicate between the Infer struct and the batching_task
|
||||||
pub response_tx: UnboundedSender<Result<InferStreamResponse, InferError>>,
|
pub response_tx: UnboundedSender<Result<InferStreamResponse, InferError>>,
|
||||||
/// Request Span
|
/// Span that will live as long as entry
|
||||||
pub parent_span: Span,
|
pub span: Span,
|
||||||
/// Batch Span
|
/// Span for every inference batch
|
||||||
|
/// This span will only live as long as one prefill/decode
|
||||||
pub batch_span: Option<Span>,
|
pub batch_span: Option<Span>,
|
||||||
/// Instant when this entry was created
|
/// Instant when this entry was queued
|
||||||
pub time: Instant,
|
pub queue_time: Instant,
|
||||||
/// Instant when this entry was added to a batch
|
/// Instant when this entry was added to a batch
|
||||||
pub batch_time: Option<Instant>,
|
pub batch_time: Option<Instant>,
|
||||||
/// Permit
|
/// Permit
|
||||||
@ -49,6 +48,7 @@ impl Queue {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Append an entry to the queue
|
/// Append an entry to the queue
|
||||||
|
#[instrument(skip(self))]
|
||||||
pub(crate) fn append(&self, entry: Entry) {
|
pub(crate) fn append(&self, entry: Entry) {
|
||||||
// Send append command to the background task managing the state
|
// Send append command to the background task managing the state
|
||||||
// Unwrap is safe here
|
// Unwrap is safe here
|
||||||
@ -58,6 +58,7 @@ impl Queue {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Get the next batch
|
// Get the next batch
|
||||||
|
#[instrument(skip(self))]
|
||||||
pub(crate) async fn next_batch(
|
pub(crate) async fn next_batch(
|
||||||
&self,
|
&self,
|
||||||
min_size: Option<usize>,
|
min_size: Option<usize>,
|
||||||
@ -142,7 +143,10 @@ impl State {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let next_batch_span = info_span!("batch");
|
// Create span for this batch to add context to inference calls
|
||||||
|
let next_batch_span = info_span!(parent: None, "batch");
|
||||||
|
next_batch_span.follows_from(&Span::current());
|
||||||
|
|
||||||
let next_batch_size = min(self.entries.len(), max_size);
|
let next_batch_size = min(self.entries.len(), max_size);
|
||||||
|
|
||||||
let mut batch_requests = Vec::with_capacity(next_batch_size);
|
let mut batch_requests = Vec::with_capacity(next_batch_size);
|
||||||
@ -153,10 +157,10 @@ impl State {
|
|||||||
self.entries
|
self.entries
|
||||||
.drain(..next_batch_size)
|
.drain(..next_batch_size)
|
||||||
.for_each(|(id, mut entry)| {
|
.for_each(|(id, mut entry)| {
|
||||||
// Create a new span for this entry/batch tuple
|
// Create a new span to link the batch back to this entry
|
||||||
let entry_batch_span = info_span!(parent: &entry.parent_span, "infer");
|
let entry_batch_span = info_span!(parent: &entry.span, "infer");
|
||||||
// Add link to span
|
// Add relationship
|
||||||
entry_batch_span.add_link(next_batch_span.context().span().span_context().clone());
|
entry_batch_span.follows_from(&next_batch_span);
|
||||||
// Update entry
|
// Update entry
|
||||||
entry.batch_span = Some(entry_batch_span);
|
entry.batch_span = Some(entry_batch_span);
|
||||||
|
|
||||||
@ -229,9 +233,9 @@ mod tests {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
response_tx,
|
response_tx,
|
||||||
parent_span: info_span!("entry"),
|
span: info_span!("entry"),
|
||||||
batch_span: None,
|
batch_span: None,
|
||||||
time: Instant::now(),
|
queue_time: Instant::now(),
|
||||||
batch_time: None,
|
batch_time: None,
|
||||||
_permit: permit,
|
_permit: permit,
|
||||||
}
|
}
|
||||||
|
@ -88,10 +88,7 @@ async fn generate(
|
|||||||
|
|
||||||
// Inference
|
// Inference
|
||||||
let details = req.0.parameters.details;
|
let details = req.0.parameters.details;
|
||||||
let response = infer.generate(req.0).await.map_err(|err| {
|
let response = infer.generate(req.0).await?;
|
||||||
tracing::error!("{}", err.to_string());
|
|
||||||
err
|
|
||||||
})?;
|
|
||||||
|
|
||||||
// Token details
|
// Token details
|
||||||
let details = match details {
|
let details = match details {
|
||||||
@ -265,19 +262,17 @@ async fn generate_stream(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Trace and yield error
|
// yield error
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
error = true;
|
error = true;
|
||||||
tracing::error!("{}", err.to_string());
|
|
||||||
yield Ok(Event::from(err))
|
yield Ok(Event::from(err))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
// Trace and yield error
|
// yield error
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
error = true;
|
error = true;
|
||||||
tracing::error!("{}", err.to_string());
|
|
||||||
yield Ok(Event::from(err))
|
yield Ok(Event::from(err))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -285,7 +280,7 @@ async fn generate_stream(
|
|||||||
// Skip if we already sent an error
|
// Skip if we already sent an error
|
||||||
if !end_reached && !error {
|
if !end_reached && !error {
|
||||||
let err = InferError::IncompleteGeneration;
|
let err = InferError::IncompleteGeneration;
|
||||||
tracing::error!("{}", err.to_string());
|
tracing::error!("{err}");
|
||||||
yield Ok(Event::from(err))
|
yield Ok(Event::from(err))
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -105,7 +105,12 @@ fn validation_worker(
|
|||||||
while let Some((request, response_tx, parent_span)) = receiver.blocking_recv() {
|
while let Some((request, response_tx, parent_span)) = receiver.blocking_recv() {
|
||||||
parent_span.in_scope(|| {
|
parent_span.in_scope(|| {
|
||||||
response_tx
|
response_tx
|
||||||
.send(validate(request, &tokenizer, max_input_length, &mut rng))
|
.send(
|
||||||
|
validate(request, &tokenizer, max_input_length, &mut rng).map_err(|err| {
|
||||||
|
tracing::error!("{err}");
|
||||||
|
err
|
||||||
|
}),
|
||||||
|
)
|
||||||
.unwrap_or(())
|
.unwrap_or(())
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
gen-server:
|
gen-server:
|
||||||
# Compile protos
|
# Compile protos
|
||||||
pip install grpcio-tools==1.49.1 --no-cache-dir
|
pip install grpcio-tools==1.51.1 --no-cache-dir
|
||||||
mkdir text_generation/pb || true
|
mkdir text_generation/pb || true
|
||||||
python -m grpc_tools.protoc -I../proto --python_out=text_generation/pb --grpc_python_out=text_generation/pb ../proto/generate.proto
|
python -m grpc_tools.protoc -I../proto --python_out=text_generation/pb --grpc_python_out=text_generation/pb ../proto/generate.proto
|
||||||
find text_generation/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
|
find text_generation/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
|
||||||
|
66
server/poetry.lock
generated
66
server/poetry.lock
generated
@ -225,7 +225,7 @@ dev = ["Sphinx (>=4.1.1)", "black (>=19.10b0)", "colorama (>=0.3.4)", "docutils
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "numpy"
|
name = "numpy"
|
||||||
version = "1.24.1"
|
version = "1.24.2"
|
||||||
description = "Fundamental package for array computing in Python"
|
description = "Fundamental package for array computing in Python"
|
||||||
category = "main"
|
category = "main"
|
||||||
optional = false
|
optional = false
|
||||||
@ -511,7 +511,7 @@ torch = ["torch"]
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "setuptools"
|
name = "setuptools"
|
||||||
version = "67.0.0"
|
version = "67.2.0"
|
||||||
description = "Easily download, build, install, upgrade, and uninstall Python packages"
|
description = "Easily download, build, install, upgrade, and uninstall Python packages"
|
||||||
category = "main"
|
category = "main"
|
||||||
optional = false
|
optional = false
|
||||||
@ -622,7 +622,7 @@ bnb = ["bitsandbytes"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "1.1"
|
lock-version = "1.1"
|
||||||
python-versions = "^3.9"
|
python-versions = "^3.9"
|
||||||
content-hash = "1a1fe3fd152c3b1a48908f4e3302b92278a3180d6fe5f39c27dc2b7b34d30dbf"
|
content-hash = "f3cab6881b52045770a90ec9be7415a0ee499d9e980892d544f68073700cf321"
|
||||||
|
|
||||||
[metadata.files]
|
[metadata.files]
|
||||||
accelerate = [
|
accelerate = [
|
||||||
@ -874,34 +874,34 @@ loguru = [
|
|||||||
{file = "loguru-0.6.0.tar.gz", hash = "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c"},
|
{file = "loguru-0.6.0.tar.gz", hash = "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c"},
|
||||||
]
|
]
|
||||||
numpy = [
|
numpy = [
|
||||||
{file = "numpy-1.24.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:179a7ef0889ab769cc03573b6217f54c8bd8e16cef80aad369e1e8185f994cd7"},
|
{file = "numpy-1.24.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eef70b4fc1e872ebddc38cddacc87c19a3709c0e3e5d20bf3954c147b1dd941d"},
|
||||||
{file = "numpy-1.24.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b09804ff570b907da323b3d762e74432fb07955701b17b08ff1b5ebaa8cfe6a9"},
|
{file = "numpy-1.24.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e8d2859428712785e8a8b7d2b3ef0a1d1565892367b32f915c4a4df44d0e64f5"},
|
||||||
{file = "numpy-1.24.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1b739841821968798947d3afcefd386fa56da0caf97722a5de53e07c4ccedc7"},
|
{file = "numpy-1.24.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6524630f71631be2dabe0c541e7675db82651eb998496bbe16bc4f77f0772253"},
|
||||||
{file = "numpy-1.24.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e3463e6ac25313462e04aea3fb8a0a30fb906d5d300f58b3bc2c23da6a15398"},
|
{file = "numpy-1.24.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a51725a815a6188c662fb66fb32077709a9ca38053f0274640293a14fdd22978"},
|
||||||
{file = "numpy-1.24.1-cp310-cp310-win32.whl", hash = "sha256:b31da69ed0c18be8b77bfce48d234e55d040793cebb25398e2a7d84199fbc7e2"},
|
{file = "numpy-1.24.2-cp310-cp310-win32.whl", hash = "sha256:2620e8592136e073bd12ee4536149380695fbe9ebeae845b81237f986479ffc9"},
|
||||||
{file = "numpy-1.24.1-cp310-cp310-win_amd64.whl", hash = "sha256:b07b40f5fb4fa034120a5796288f24c1fe0e0580bbfff99897ba6267af42def2"},
|
{file = "numpy-1.24.2-cp310-cp310-win_amd64.whl", hash = "sha256:97cf27e51fa078078c649a51d7ade3c92d9e709ba2bfb97493007103c741f1d0"},
|
||||||
{file = "numpy-1.24.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7094891dcf79ccc6bc2a1f30428fa5edb1e6fb955411ffff3401fb4ea93780a8"},
|
{file = "numpy-1.24.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7de8fdde0003f4294655aa5d5f0a89c26b9f22c0a58790c38fae1ed392d44a5a"},
|
||||||
{file = "numpy-1.24.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:28e418681372520c992805bb723e29d69d6b7aa411065f48216d8329d02ba032"},
|
{file = "numpy-1.24.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4173bde9fa2a005c2c6e2ea8ac1618e2ed2c1c6ec8a7657237854d42094123a0"},
|
||||||
{file = "numpy-1.24.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e274f0f6c7efd0d577744f52032fdd24344f11c5ae668fe8d01aac0422611df1"},
|
{file = "numpy-1.24.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4cecaed30dc14123020f77b03601559fff3e6cd0c048f8b5289f4eeabb0eb281"},
|
||||||
{file = "numpy-1.24.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0044f7d944ee882400890f9ae955220d29b33d809a038923d88e4e01d652acd9"},
|
{file = "numpy-1.24.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a23f8440561a633204a67fb44617ce2a299beecf3295f0d13c495518908e910"},
|
||||||
{file = "numpy-1.24.1-cp311-cp311-win32.whl", hash = "sha256:442feb5e5bada8408e8fcd43f3360b78683ff12a4444670a7d9e9824c1817d36"},
|
{file = "numpy-1.24.2-cp311-cp311-win32.whl", hash = "sha256:e428c4fbfa085f947b536706a2fc349245d7baa8334f0c5723c56a10595f9b95"},
|
||||||
{file = "numpy-1.24.1-cp311-cp311-win_amd64.whl", hash = "sha256:de92efa737875329b052982e37bd4371d52cabf469f83e7b8be9bb7752d67e51"},
|
{file = "numpy-1.24.2-cp311-cp311-win_amd64.whl", hash = "sha256:557d42778a6869c2162deb40ad82612645e21d79e11c1dc62c6e82a2220ffb04"},
|
||||||
{file = "numpy-1.24.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b162ac10ca38850510caf8ea33f89edcb7b0bb0dfa5592d59909419986b72407"},
|
{file = "numpy-1.24.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d0a2db9d20117bf523dde15858398e7c0858aadca7c0f088ac0d6edd360e9ad2"},
|
||||||
{file = "numpy-1.24.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:26089487086f2648944f17adaa1a97ca6aee57f513ba5f1c0b7ebdabbe2b9954"},
|
{file = "numpy-1.24.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c72a6b2f4af1adfe193f7beb91ddf708ff867a3f977ef2ec53c0ffb8283ab9f5"},
|
||||||
{file = "numpy-1.24.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:caf65a396c0d1f9809596be2e444e3bd4190d86d5c1ce21f5fc4be60a3bc5b36"},
|
{file = "numpy-1.24.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c29e6bd0ec49a44d7690ecb623a8eac5ab8a923bce0bea6293953992edf3a76a"},
|
||||||
{file = "numpy-1.24.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b0677a52f5d896e84414761531947c7a330d1adc07c3a4372262f25d84af7bf7"},
|
{file = "numpy-1.24.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2eabd64ddb96a1239791da78fa5f4e1693ae2dadc82a76bc76a14cbb2b966e96"},
|
||||||
{file = "numpy-1.24.1-cp38-cp38-win32.whl", hash = "sha256:dae46bed2cb79a58d6496ff6d8da1e3b95ba09afeca2e277628171ca99b99db1"},
|
{file = "numpy-1.24.2-cp38-cp38-win32.whl", hash = "sha256:e3ab5d32784e843fc0dd3ab6dcafc67ef806e6b6828dc6af2f689be0eb4d781d"},
|
||||||
{file = "numpy-1.24.1-cp38-cp38-win_amd64.whl", hash = "sha256:6ec0c021cd9fe732e5bab6401adea5a409214ca5592cd92a114f7067febcba0c"},
|
{file = "numpy-1.24.2-cp38-cp38-win_amd64.whl", hash = "sha256:76807b4063f0002c8532cfeac47a3068a69561e9c8715efdad3c642eb27c0756"},
|
||||||
{file = "numpy-1.24.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:28bc9750ae1f75264ee0f10561709b1462d450a4808cd97c013046073ae64ab6"},
|
{file = "numpy-1.24.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4199e7cfc307a778f72d293372736223e39ec9ac096ff0a2e64853b866a8e18a"},
|
||||||
{file = "numpy-1.24.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:84e789a085aabef2f36c0515f45e459f02f570c4b4c4c108ac1179c34d475ed7"},
|
{file = "numpy-1.24.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:adbdce121896fd3a17a77ab0b0b5eedf05a9834a18699db6829a64e1dfccca7f"},
|
||||||
{file = "numpy-1.24.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e669fbdcdd1e945691079c2cae335f3e3a56554e06bbd45d7609a6cf568c700"},
|
{file = "numpy-1.24.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:889b2cc88b837d86eda1b17008ebeb679d82875022200c6e8e4ce6cf549b7acb"},
|
||||||
{file = "numpy-1.24.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef85cf1f693c88c1fd229ccd1055570cb41cdf4875873b7728b6301f12cd05bf"},
|
{file = "numpy-1.24.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f64bb98ac59b3ea3bf74b02f13836eb2e24e48e0ab0145bbda646295769bd780"},
|
||||||
{file = "numpy-1.24.1-cp39-cp39-win32.whl", hash = "sha256:87a118968fba001b248aac90e502c0b13606721b1343cdaddbc6e552e8dfb56f"},
|
{file = "numpy-1.24.2-cp39-cp39-win32.whl", hash = "sha256:63e45511ee4d9d976637d11e6c9864eae50e12dc9598f531c035265991910468"},
|
||||||
{file = "numpy-1.24.1-cp39-cp39-win_amd64.whl", hash = "sha256:ddc7ab52b322eb1e40521eb422c4e0a20716c271a306860979d450decbb51b8e"},
|
{file = "numpy-1.24.2-cp39-cp39-win_amd64.whl", hash = "sha256:a77d3e1163a7770164404607b7ba3967fb49b24782a6ef85d9b5f54126cc39e5"},
|
||||||
{file = "numpy-1.24.1-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed5fb71d79e771ec930566fae9c02626b939e37271ec285e9efaf1b5d4370e7d"},
|
{file = "numpy-1.24.2-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:92011118955724465fb6853def593cf397b4a1367495e0b59a7e69d40c4eb71d"},
|
||||||
{file = "numpy-1.24.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad2925567f43643f51255220424c23d204024ed428afc5aad0f86f3ffc080086"},
|
{file = "numpy-1.24.2-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9006288bcf4895917d02583cf3411f98631275bc67cce355a7f39f8c14338fa"},
|
||||||
{file = "numpy-1.24.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:cfa1161c6ac8f92dea03d625c2d0c05e084668f4a06568b77a25a89111621566"},
|
{file = "numpy-1.24.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:150947adbdfeceec4e5926d956a06865c1c690f2fd902efede4ca6fe2e657c3f"},
|
||||||
{file = "numpy-1.24.1.tar.gz", hash = "sha256:2386da9a471cc00a1f47845e27d916d5ec5346ae9696e01a8a34760858fe9dd2"},
|
{file = "numpy-1.24.2.tar.gz", hash = "sha256:003a9f530e880cb2cd177cba1af7220b9aa42def9c4afc2a2fc3ee6be7eb2b22"},
|
||||||
]
|
]
|
||||||
nvidia-cublas-cu11 = [
|
nvidia-cublas-cu11 = [
|
||||||
{file = "nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl", hash = "sha256:d32e4d75f94ddfb93ea0a5dda08389bcc65d8916a25cb9f37ac89edaeed3bded"},
|
{file = "nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl", hash = "sha256:d32e4d75f94ddfb93ea0a5dda08389bcc65d8916a25cb9f37ac89edaeed3bded"},
|
||||||
@ -1089,8 +1089,8 @@ safetensors = [
|
|||||||
{file = "safetensors-0.2.8.tar.gz", hash = "sha256:2720b20a6a38c799dca79bd76caeeac2f7df585a9d4f7d59fa7e28eff9ccb27f"},
|
{file = "safetensors-0.2.8.tar.gz", hash = "sha256:2720b20a6a38c799dca79bd76caeeac2f7df585a9d4f7d59fa7e28eff9ccb27f"},
|
||||||
]
|
]
|
||||||
setuptools = [
|
setuptools = [
|
||||||
{file = "setuptools-67.0.0-py3-none-any.whl", hash = "sha256:9d790961ba6219e9ff7d9557622d2fe136816a264dd01d5997cfc057d804853d"},
|
{file = "setuptools-67.2.0-py3-none-any.whl", hash = "sha256:16ccf598aab3b506593c17378473978908a2734d7336755a8769b480906bec1c"},
|
||||||
{file = "setuptools-67.0.0.tar.gz", hash = "sha256:883131c5b6efa70b9101c7ef30b2b7b780a4283d5fc1616383cdf22c83cbefe6"},
|
{file = "setuptools-67.2.0.tar.gz", hash = "sha256:b440ee5f7e607bb8c9de15259dba2583dd41a38879a7abc1d43a71c59524da48"},
|
||||||
]
|
]
|
||||||
tomli = [
|
tomli = [
|
||||||
{file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"},
|
{file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"},
|
||||||
|
@ -27,7 +27,7 @@ opentelemetry-instrumentation-grpc = "^0.36b0"
|
|||||||
bnb = ["bitsandbytes"]
|
bnb = ["bitsandbytes"]
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
grpcio-tools = "^1.49.1"
|
grpcio-tools = "^1.51.1"
|
||||||
pytest = "^7.2.0"
|
pytest = "^7.2.0"
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
|
@ -10,10 +10,7 @@ from opentelemetry.sdk.resources import Resource
|
|||||||
from opentelemetry.sdk.trace import TracerProvider
|
from opentelemetry.sdk.trace import TracerProvider
|
||||||
from opentelemetry.sdk.trace.export import (
|
from opentelemetry.sdk.trace.export import (
|
||||||
BatchSpanProcessor,
|
BatchSpanProcessor,
|
||||||
ConsoleSpanExporter,
|
|
||||||
SimpleSpanProcessor,
|
|
||||||
)
|
)
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
|
|
||||||
class UDSOpenTelemetryAioServerInterceptor(OpenTelemetryAioServerInterceptor):
|
class UDSOpenTelemetryAioServerInterceptor(OpenTelemetryAioServerInterceptor):
|
||||||
|
Loading…
Reference in New Issue
Block a user