Commit 678721bcf0 (parent ce85efa968)
https://github.com/huggingface/text-generation-inference.git

Stream options.

Adds OpenAI-style `stream_options` to the chat completions route: when a request sets `stream: true` together with `stream_options: {"include_usage": true}`, the streamed response reports token usage, with the final chunk carrying prompt, completion, and total token counts computed from the generation details.
Cargo.toml:

```diff
@@ -5,7 +5,8 @@ members = [
     "backends/grpc-metadata",
     "backends/trtllm",
     "backends/client",
-    "launcher"
+    "launcher",
+    "router"
 ]
 default-members = [
     "benchmark",
@@ -13,7 +14,8 @@ default-members = [
     "backends/grpc-metadata",
     # "backends/trtllm",
     "backends/client",
-    "launcher"
+    "launcher",
+    "router"
 ]
 resolver = "2"
 
```
router/src/lib.rs:

```diff
@@ -684,6 +684,7 @@ pub(crate) struct ChatCompletionChunk {
     pub model: String,
     pub system_fingerprint: String,
     pub choices: Vec<ChatCompletionChoice>,
+    pub usage: Option<Usage>,
 }
 
 #[derive(Clone, Serialize, ToSchema)]
@@ -732,6 +733,7 @@ impl ChatCompletionChunk {
         created: u64,
         logprobs: Option<ChatCompletionLogprobs>,
         finish_reason: Option<String>,
+        usage: Option<Usage>,
     ) -> Self {
         let delta = match (delta, tool_calls) {
             (Some(delta), _) => ChatCompletionDelta::Chat(TextMessage {
@@ -766,6 +768,7 @@ impl ChatCompletionChunk {
                 logprobs,
                 finish_reason,
             }],
+            usage,
        }
    }
}
```
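With `usage` carried on the chunk itself, the last streamed message can report token counts inline. A minimal sketch of what such a chunk could serialize to, assuming the OpenAI-style layout `ChatCompletionChunk` already uses (the `object` value, `choices` contents, and the numbers here are illustrative, not taken from this diff):

```rust
use serde_json::json;

fn main() {
    // Illustrative final chunk: `usage` is filled from the generation
    // details of the last token; non-final chunks would carry a null
    // `usage` since the field is an Option.
    let chunk = json!({
        "object": "chat.completion.chunk",
        "model": "tgi",
        "system_fingerprint": "",
        "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
        "usage": {
            "prompt_tokens": 5,
            "completion_tokens": 7,
            "total_tokens": 12
        }
    });
    println!("{}", serde_json::to_string_pretty(&chunk).unwrap());
}
```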
router/src/lib.rs (continued):

```diff
@@ -880,6 +883,18 @@ pub(crate) struct ChatRequest {
     #[serde(default)]
     #[schema(nullable = true, default = "null", example = "null")]
     pub guideline: Option<String>,
+
+    /// Options for streaming response. Only set this when you set stream: true.
+    #[serde(default)]
+    #[schema(nullable = true, default = "null", example = "null")]
+    pub stream_options: Option<StreamOptions>,
+}
+
+#[derive(Clone, Deserialize, ToSchema, Serialize)]
+struct StreamOptions {
+    /// If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.
+    #[schema(example = "true")]
+    include_usage: bool,
 }
 
 pub fn default_tool_prompt() -> String {
```
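A request that opts into usage reporting would look roughly like the following. This is a hedged sketch: the `stream_options` and `include_usage` names come straight from the diff above, while the model name and the `/v1/chat/completions` route are assumptions about the surrounding OpenAI-compatible API rather than part of this change:

```rust
use serde_json::json;

fn main() {
    // Body for POST /v1/chat/completions (route assumed); this is the
    // shape ChatRequest now deserializes via the new field above.
    let body = json!({
        "model": "tgi",
        "stream": true,
        "stream_options": { "include_usage": true },
        "messages": [
            { "role": "user", "content": "Hello" }
        ]
    });
    println!("{body}");
}
```

Because the field is `#[serde(default)]` and an `Option`, omitting `stream_options` leaves it as `None`, so existing clients are unaffected.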
router/src/lib.rs (continued):

```diff
@@ -1472,6 +1487,27 @@ mod tests {
         let textmsg: TextMessage = message.into();
         assert_eq!(textmsg.content, "Whats in this image?");
     }
+
+    #[test]
+    fn test_chat_stream_options() {
+        let json = json!({
+            "model": "",
+            "stream_options": {"include_usage": true},
+            "messages": [{
+                "role": "user",
+                "content": "Hello"
+            }]
+        });
+        let request: ChatRequest = serde_json::from_str(json.to_string().as_str()).unwrap();
+
+        assert!(matches!(
+            request.stream_options,
+            Some(StreamOptions {
+                include_usage: true
+            })
+        ));
+    }
+
     #[test]
     fn openai_output() {
         let message = OutputMessage::ChatMessage(TextMessage {
```
router/src/server.rs:

```diff
@@ -590,7 +590,7 @@ async fn generate_stream_internal(
                         let event = on_message_callback(stream_token);
                         yield Ok(event);
                     }
                     // Yield event for last token and compute timings
                     InferStreamResponse::End {
                         token,
                         generated_text,
@@ -1265,6 +1265,22 @@ async fn chat_completions(
                             (content, None)
                         };
 
+                        let (usage, finish_reason) = match stream_token.details {
+                            Some(details) => {
+                                let completion_tokens = details.generated_tokens;
+                                let prompt_tokens = details.input_length;
+                                let total_tokens = prompt_tokens + completion_tokens;
+                                (
+                                    Some(Usage {
+                                        completion_tokens,
+                                        prompt_tokens,
+                                        total_tokens,
+                                    }),
+                                    Some(details.finish_reason.format(true)),
+                                )
+                            }
+                            None => (None, None),
+                        };
                         event
                             .json_data(CompletionType::ChatCompletionChunk(
                                 ChatCompletionChunk::new(
@@ -1274,7 +1290,8 @@ async fn chat_completions(
                                     tool_calls,
                                     current_time,
                                     logprobs,
-                                    stream_token.details.map(|d| d.finish_reason.format(true)),
+                                    finish_reason,
+                                    usage,
                                 ),
                             ))
                             .unwrap_or_else(|e| {
```
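On the consuming side, the usage numbers arrive inside the SSE stream. A minimal sketch of pulling them out of a `data:` line, assuming each event carries one JSON chunk and using the `Usage` field names from the construction above (the helper and its line format handling are illustrative, not part of this commit):

```rust
use serde_json::Value;

/// Pull token counts out of one SSE `data:` line of the chat stream.
/// Returns (prompt_tokens, completion_tokens, total_tokens).
fn usage_from_sse_line(line: &str) -> Option<(u64, u64, u64)> {
    let payload = line.strip_prefix("data:")?.trim();
    if payload == "[DONE]" {
        return None; // end-of-stream marker, no JSON to parse
    }
    let chunk: Value = serde_json::from_str(payload).ok()?;
    // `usage` is null on non-final chunks; `.get` on Null yields None below.
    let usage = chunk.get("usage")?;
    Some((
        usage.get("prompt_tokens")?.as_u64()?,
        usage.get("completion_tokens")?.as_u64()?,
        usage.get("total_tokens")?.as_u64()?,
    ))
}

fn main() {
    let line = r#"data: {"choices":[],"usage":{"prompt_tokens":5,"completion_tokens":7,"total_tokens":12}}"#;
    assert_eq!(usage_from_sse_line(line), Some((5, 7, 12)));
    println!("usage parsed: {:?}", usage_from_sse_line(line));
}
```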