mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-24 00:12:08 +00:00
v1.4.5 (#1686)
This commit is contained in:
parent
dc1ab2001d
commit
0bf856dc60
20
Cargo.lock
generated
20
Cargo.lock
generated
@ -267,6 +267,12 @@ version = "0.21.7"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"
|
checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "base64"
|
||||||
|
version = "0.22.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9475866fec1451be56a3c2400fd081ff546538961565ccb5b7142cbd22bc7a51"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bit-set"
|
name = "bit-set"
|
||||||
version = "0.5.3"
|
version = "0.5.3"
|
||||||
@ -2960,7 +2966,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "text-generation-benchmark"
|
name = "text-generation-benchmark"
|
||||||
version = "1.4.4"
|
version = "1.4.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"average",
|
"average",
|
||||||
"clap",
|
"clap",
|
||||||
@ -2981,7 +2987,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "text-generation-client"
|
name = "text-generation-client"
|
||||||
version = "1.4.4"
|
version = "1.4.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"futures",
|
"futures",
|
||||||
"grpc-metadata",
|
"grpc-metadata",
|
||||||
@ -2998,7 +3004,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "text-generation-launcher"
|
name = "text-generation-launcher"
|
||||||
version = "1.4.4"
|
version = "1.4.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"ctrlc",
|
"ctrlc",
|
||||||
@ -3014,7 +3020,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "text-generation-router"
|
name = "text-generation-router"
|
||||||
version = "1.4.4"
|
version = "1.4.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-stream",
|
"async-stream",
|
||||||
"axum",
|
"axum",
|
||||||
@ -3601,11 +3607,11 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ureq"
|
name = "ureq"
|
||||||
version = "2.9.6"
|
version = "2.9.7"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "11f214ce18d8b2cbe84ed3aa6486ed3f5b285cf8d8fbdbce9f3f767a724adc35"
|
checksum = "d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"base64 0.21.7",
|
"base64 0.22.0",
|
||||||
"flate2",
|
"flate2",
|
||||||
"log",
|
"log",
|
||||||
"native-tls",
|
"native-tls",
|
||||||
|
@ -9,7 +9,7 @@ members = [
|
|||||||
resolver = "2"
|
resolver = "2"
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
version = "1.4.4"
|
version = "1.4.5"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
authors = ["Olivier Dehaene"]
|
authors = ["Olivier Dehaene"]
|
||||||
homepage = "https://github.com/huggingface/text-generation-inference"
|
homepage = "https://github.com/huggingface/text-generation-inference"
|
||||||
|
@ -444,7 +444,7 @@ fn progress_gauge(title: &str, label: String, progress: f64, color: Color) -> Ga
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Throughput paragraph
|
/// Throughput paragraph
|
||||||
fn throughput_paragraph<'a>(throughput: &Vec<f64>, name: &'static str) -> Paragraph<'a> {
|
fn throughput_paragraph<'a>(throughput: &[f64], name: &'static str) -> Paragraph<'a> {
|
||||||
// Throughput average/high/low texts
|
// Throughput average/high/low texts
|
||||||
let throughput_texts = statis_spans(throughput, "tokens/secs");
|
let throughput_texts = statis_spans(throughput, "tokens/secs");
|
||||||
|
|
||||||
@ -457,7 +457,7 @@ fn throughput_paragraph<'a>(throughput: &Vec<f64>, name: &'static str) -> Paragr
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Latency paragraph
|
/// Latency paragraph
|
||||||
fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragraph<'a> {
|
fn latency_paragraph<'a>(latency: &mut [f64], name: &'static str) -> Paragraph<'a> {
|
||||||
// Latency average/high/low texts
|
// Latency average/high/low texts
|
||||||
let mut latency_texts = statis_spans(latency, "ms");
|
let mut latency_texts = statis_spans(latency, "ms");
|
||||||
|
|
||||||
@ -483,7 +483,7 @@ fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragrap
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Average/High/Low spans
|
/// Average/High/Low spans
|
||||||
fn statis_spans<'a>(data: &Vec<f64>, unit: &'static str) -> Vec<Line<'a>> {
|
fn statis_spans<'a>(data: &[f64], unit: &'static str) -> Vec<Line<'a>> {
|
||||||
vec![
|
vec![
|
||||||
Line::from(vec![Span::styled(
|
Line::from(vec![Span::styled(
|
||||||
format!(
|
format!(
|
||||||
@ -543,7 +543,7 @@ fn latency_histogram<'a>(
|
|||||||
|
|
||||||
/// Latency/Throughput chart
|
/// Latency/Throughput chart
|
||||||
fn latency_throughput_chart<'a>(
|
fn latency_throughput_chart<'a>(
|
||||||
latency_throughput: &'a Vec<(f64, f64)>,
|
latency_throughput: &'a [(f64, f64)],
|
||||||
batch_sizes: &'a [u32],
|
batch_sizes: &'a [u32],
|
||||||
zoom: bool,
|
zoom: bool,
|
||||||
name: &'static str,
|
name: &'static str,
|
||||||
|
@ -151,7 +151,7 @@ fn add_throuhgputs(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn avg_min_max(data: &Vec<f64>) -> (f64, f64, f64) {
|
fn avg_min_max(data: &[f64]) -> (f64, f64, f64) {
|
||||||
let average = data.iter().sum::<f64>() / data.len() as f64;
|
let average = data.iter().sum::<f64>() / data.len() as f64;
|
||||||
let min = data
|
let min = data
|
||||||
.iter()
|
.iter()
|
||||||
@ -164,7 +164,7 @@ fn avg_min_max(data: &Vec<f64>) -> (f64, f64, f64) {
|
|||||||
(average, *min, *max)
|
(average, *min, *max)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn px(data: &Vec<f64>, p: u32) -> f64 {
|
fn px(data: &[f64], p: u32) -> f64 {
|
||||||
let i = (f64::from(p) / 100.0 * data.len() as f64) as usize;
|
let i = (f64::from(p) / 100.0 * data.len() as f64) as usize;
|
||||||
*data.get(i).unwrap_or(&std::f64::NAN)
|
*data.get(i).unwrap_or(&std::f64::NAN)
|
||||||
}
|
}
|
||||||
|
@ -10,7 +10,7 @@
|
|||||||
"name": "Apache 2.0",
|
"name": "Apache 2.0",
|
||||||
"url": "https://www.apache.org/licenses/LICENSE-2.0"
|
"url": "https://www.apache.org/licenses/LICENSE-2.0"
|
||||||
},
|
},
|
||||||
"version": "1.4.4"
|
"version": "1.4.5"
|
||||||
},
|
},
|
||||||
"paths": {
|
"paths": {
|
||||||
"/": {
|
"/": {
|
||||||
@ -471,6 +471,90 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
"/v1/completions": {
|
||||||
|
"post": {
|
||||||
|
"tags": [
|
||||||
|
"Text Generation Inference"
|
||||||
|
],
|
||||||
|
"summary": "Generate tokens",
|
||||||
|
"description": "Generate tokens",
|
||||||
|
"operationId": "completions",
|
||||||
|
"requestBody": {
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/CompletionRequest"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": true
|
||||||
|
},
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "Generated Text",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/ChatCompletionChunk"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"422": {
|
||||||
|
"description": "Input validation error",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/ErrorResponse"
|
||||||
|
},
|
||||||
|
"example": {
|
||||||
|
"error": "Input validation error"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"424": {
|
||||||
|
"description": "Generation Error",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/ErrorResponse"
|
||||||
|
},
|
||||||
|
"example": {
|
||||||
|
"error": "Request failed during generation"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"description": "Model is overloaded",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/ErrorResponse"
|
||||||
|
},
|
||||||
|
"example": {
|
||||||
|
"error": "Model is overloaded"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"description": "Incomplete generation",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/ErrorResponse"
|
||||||
|
},
|
||||||
|
"example": {
|
||||||
|
"error": "Incomplete generation"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"components": {
|
"components": {
|
||||||
@ -669,17 +753,25 @@
|
|||||||
"ChatCompletionDelta": {
|
"ChatCompletionDelta": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"required": [
|
"required": [
|
||||||
"role",
|
"role"
|
||||||
"content"
|
|
||||||
],
|
],
|
||||||
"properties": {
|
"properties": {
|
||||||
"content": {
|
"content": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"example": "What is Deep Learning?"
|
"example": "What is Deep Learning?",
|
||||||
|
"nullable": true
|
||||||
},
|
},
|
||||||
"role": {
|
"role": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"example": "user"
|
"example": "user"
|
||||||
|
},
|
||||||
|
"tool_calls": {
|
||||||
|
"allOf": [
|
||||||
|
{
|
||||||
|
"$ref": "#/components/schemas/DeltaToolCall"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"nullable": true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -739,7 +831,8 @@
|
|||||||
"ChatRequest": {
|
"ChatRequest": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"required": [
|
"required": [
|
||||||
"model"
|
"model",
|
||||||
|
"messages"
|
||||||
],
|
],
|
||||||
"properties": {
|
"properties": {
|
||||||
"frequency_penalty": {
|
"frequency_penalty": {
|
||||||
@ -777,11 +870,12 @@
|
|||||||
"items": {
|
"items": {
|
||||||
"$ref": "#/components/schemas/Message"
|
"$ref": "#/components/schemas/Message"
|
||||||
},
|
},
|
||||||
"description": "A list of messages comprising the conversation so far."
|
"description": "A list of messages comprising the conversation so far.",
|
||||||
|
"example": "[{\"role\": \"user\", \"content\": \"What is Deep Learning?\"}]"
|
||||||
},
|
},
|
||||||
"model": {
|
"model": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "UNUSED\nID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.",
|
"description": "[UNUSED] ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.",
|
||||||
"example": "mistralai/Mistral-7B-Instruct-v0.2"
|
"example": "mistralai/Mistral-7B-Instruct-v0.2"
|
||||||
},
|
},
|
||||||
"n": {
|
"n": {
|
||||||
@ -806,6 +900,15 @@
|
|||||||
"nullable": true,
|
"nullable": true,
|
||||||
"minimum": 0
|
"minimum": 0
|
||||||
},
|
},
|
||||||
|
"stop": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"description": "Up to 4 sequences where the API will stop generating further tokens.",
|
||||||
|
"example": "null",
|
||||||
|
"nullable": true
|
||||||
|
},
|
||||||
"stream": {
|
"stream": {
|
||||||
"type": "boolean"
|
"type": "boolean"
|
||||||
},
|
},
|
||||||
@ -816,6 +919,29 @@
|
|||||||
"example": 1.0,
|
"example": 1.0,
|
||||||
"nullable": true
|
"nullable": true
|
||||||
},
|
},
|
||||||
|
"tool_choice": {
|
||||||
|
"allOf": [
|
||||||
|
{
|
||||||
|
"$ref": "#/components/schemas/ToolType"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"nullable": true
|
||||||
|
},
|
||||||
|
"tool_prompt": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "A prompt to be appended before the tools",
|
||||||
|
"example": "\"Based on the conversation, please choose the most appropriate tool to use: \"",
|
||||||
|
"nullable": true
|
||||||
|
},
|
||||||
|
"tools": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/Tool"
|
||||||
|
},
|
||||||
|
"description": "A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of\nfunctions the model may generate JSON inputs for.",
|
||||||
|
"example": "null",
|
||||||
|
"nullable": true
|
||||||
|
},
|
||||||
"top_logprobs": {
|
"top_logprobs": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
"format": "int32",
|
"format": "int32",
|
||||||
@ -852,6 +978,164 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"CompletionComplete": {
|
||||||
|
"type": "object",
|
||||||
|
"required": [
|
||||||
|
"index",
|
||||||
|
"text",
|
||||||
|
"finish_reason"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"finish_reason": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"index": {
|
||||||
|
"type": "integer",
|
||||||
|
"format": "int32",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"logprobs": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "number",
|
||||||
|
"format": "float"
|
||||||
|
},
|
||||||
|
"nullable": true
|
||||||
|
},
|
||||||
|
"text": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"CompletionCompleteChunk": {
|
||||||
|
"type": "object",
|
||||||
|
"required": [
|
||||||
|
"id",
|
||||||
|
"object",
|
||||||
|
"created",
|
||||||
|
"choices",
|
||||||
|
"model",
|
||||||
|
"system_fingerprint"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"choices": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/CompletionComplete"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"created": {
|
||||||
|
"type": "integer",
|
||||||
|
"format": "int64",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"id": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"model": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"object": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"system_fingerprint": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"CompletionRequest": {
|
||||||
|
"type": "object",
|
||||||
|
"required": [
|
||||||
|
"model",
|
||||||
|
"prompt"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"frequency_penalty": {
|
||||||
|
"type": "number",
|
||||||
|
"format": "float",
|
||||||
|
"description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.",
|
||||||
|
"example": "1.0",
|
||||||
|
"nullable": true
|
||||||
|
},
|
||||||
|
"max_tokens": {
|
||||||
|
"type": "integer",
|
||||||
|
"format": "int32",
|
||||||
|
"description": "The maximum number of tokens that can be generated in the chat completion.",
|
||||||
|
"default": "32",
|
||||||
|
"nullable": true,
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"model": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "UNUSED\nID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.",
|
||||||
|
"example": "mistralai/Mistral-7B-Instruct-v0.2"
|
||||||
|
},
|
||||||
|
"prompt": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The prompt to generate completions for.",
|
||||||
|
"example": "What is Deep Learning?"
|
||||||
|
},
|
||||||
|
"repetition_penalty": {
|
||||||
|
"type": "number",
|
||||||
|
"format": "float",
|
||||||
|
"nullable": true
|
||||||
|
},
|
||||||
|
"seed": {
|
||||||
|
"type": "integer",
|
||||||
|
"format": "int64",
|
||||||
|
"example": 42,
|
||||||
|
"nullable": true,
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"stream": {
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
"suffix": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The text to append to the prompt. This is useful for completing sentences or generating a paragraph of text.\nplease see the completion_template field in the model's tokenizer_config.json file for completion template.",
|
||||||
|
"nullable": true
|
||||||
|
},
|
||||||
|
"temperature": {
|
||||||
|
"type": "number",
|
||||||
|
"format": "float",
|
||||||
|
"description": "What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while\nlower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or `top_p` but not both.",
|
||||||
|
"example": 1.0,
|
||||||
|
"nullable": true
|
||||||
|
},
|
||||||
|
"top_p": {
|
||||||
|
"type": "number",
|
||||||
|
"format": "float",
|
||||||
|
"description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.",
|
||||||
|
"example": 0.95,
|
||||||
|
"nullable": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"DeltaToolCall": {
|
||||||
|
"type": "object",
|
||||||
|
"required": [
|
||||||
|
"index",
|
||||||
|
"id",
|
||||||
|
"type",
|
||||||
|
"function"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"function": {
|
||||||
|
"$ref": "#/components/schemas/Function"
|
||||||
|
},
|
||||||
|
"id": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"index": {
|
||||||
|
"type": "integer",
|
||||||
|
"format": "int32",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"type": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
"Details": {
|
"Details": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"required": [
|
"required": [
|
||||||
@ -931,6 +1215,38 @@
|
|||||||
],
|
],
|
||||||
"example": "Length"
|
"example": "Length"
|
||||||
},
|
},
|
||||||
|
"Function": {
|
||||||
|
"type": "object",
|
||||||
|
"required": [
|
||||||
|
"arguments"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"arguments": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"type": "string",
|
||||||
|
"nullable": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"FunctionDefinition": {
|
||||||
|
"type": "object",
|
||||||
|
"required": [
|
||||||
|
"name",
|
||||||
|
"parameters"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"description": {
|
||||||
|
"type": "string",
|
||||||
|
"nullable": true
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"parameters": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
"GenerateParameters": {
|
"GenerateParameters": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
@ -1261,13 +1577,13 @@
|
|||||||
"Message": {
|
"Message": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"required": [
|
"required": [
|
||||||
"role",
|
"role"
|
||||||
"content"
|
|
||||||
],
|
],
|
||||||
"properties": {
|
"properties": {
|
||||||
"content": {
|
"content": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"example": "My name is David and I"
|
"example": "My name is David and I",
|
||||||
|
"nullable": true
|
||||||
},
|
},
|
||||||
"name": {
|
"name": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
@ -1277,6 +1593,13 @@
|
|||||||
"role": {
|
"role": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"example": "user"
|
"example": "user"
|
||||||
|
},
|
||||||
|
"tool_calls": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/ToolCall"
|
||||||
|
},
|
||||||
|
"nullable": true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -1437,6 +1760,64 @@
|
|||||||
"$ref": "#/components/schemas/SimpleToken"
|
"$ref": "#/components/schemas/SimpleToken"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"Tool": {
|
||||||
|
"type": "object",
|
||||||
|
"required": [
|
||||||
|
"type",
|
||||||
|
"function"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"function": {
|
||||||
|
"$ref": "#/components/schemas/FunctionDefinition"
|
||||||
|
},
|
||||||
|
"type": {
|
||||||
|
"type": "string",
|
||||||
|
"example": "function"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"ToolCall": {
|
||||||
|
"type": "object",
|
||||||
|
"required": [
|
||||||
|
"id",
|
||||||
|
"type",
|
||||||
|
"function"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"function": {
|
||||||
|
"$ref": "#/components/schemas/FunctionDefinition"
|
||||||
|
},
|
||||||
|
"id": {
|
||||||
|
"type": "integer",
|
||||||
|
"format": "int32",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"type": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"ToolType": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "object",
|
||||||
|
"required": [
|
||||||
|
"FunctionName"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"FunctionName": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "string",
|
||||||
|
"enum": [
|
||||||
|
"OneOf"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
"Usage": {
|
"Usage": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"required": [
|
"required": [
|
||||||
|
@ -17,7 +17,7 @@
|
|||||||
"id": "",
|
"id": "",
|
||||||
"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||||
"object": "text_completion",
|
"object": "text_completion",
|
||||||
"system_fingerprint": "1.4.4-native",
|
"system_fingerprint": "1.4.5-native",
|
||||||
"usage": {
|
"usage": {
|
||||||
"completion_tokens": 100,
|
"completion_tokens": 100,
|
||||||
"prompt_tokens": 60,
|
"prompt_tokens": 60,
|
||||||
|
@ -31,7 +31,7 @@
|
|||||||
"id": "",
|
"id": "",
|
||||||
"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||||
"object": "text_completion",
|
"object": "text_completion",
|
||||||
"system_fingerprint": "1.4.4-native",
|
"system_fingerprint": "1.4.5-native",
|
||||||
"usage": {
|
"usage": {
|
||||||
"completion_tokens": 29,
|
"completion_tokens": 29,
|
||||||
"prompt_tokens": 316,
|
"prompt_tokens": 316,
|
||||||
|
@ -31,7 +31,7 @@
|
|||||||
"id": "",
|
"id": "",
|
||||||
"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||||
"object": "text_completion",
|
"object": "text_completion",
|
||||||
"system_fingerprint": "1.4.4-native",
|
"system_fingerprint": "1.4.5-native",
|
||||||
"usage": {
|
"usage": {
|
||||||
"completion_tokens": 29,
|
"completion_tokens": 29,
|
||||||
"prompt_tokens": 316,
|
"prompt_tokens": 316,
|
||||||
|
@ -30,7 +30,7 @@
|
|||||||
"id": "",
|
"id": "",
|
||||||
"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||||
"object": "text_completion",
|
"object": "text_completion",
|
||||||
"system_fingerprint": "1.4.4-native",
|
"system_fingerprint": "1.4.5-native",
|
||||||
"usage": {
|
"usage": {
|
||||||
"completion_tokens": 21,
|
"completion_tokens": 21,
|
||||||
"prompt_tokens": 187,
|
"prompt_tokens": 187,
|
||||||
|
@ -23,5 +23,5 @@
|
|||||||
"id": "",
|
"id": "",
|
||||||
"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||||
"object": "text_completion",
|
"object": "text_completion",
|
||||||
"system_fingerprint": "1.4.4-native"
|
"system_fingerprint": "1.4.5-native"
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "text-generation-integration-tests"
|
name = "text-generation-integration-tests"
|
||||||
version = "1.4.4"
|
version = "1.4.5"
|
||||||
description = "Text Generation Inference integration tests"
|
description = "Text Generation Inference integration tests"
|
||||||
authors = ["Nicolas Patry <nicolas@huggingface.co>"]
|
authors = ["Nicolas Patry <nicolas@huggingface.co>"]
|
||||||
|
|
||||||
|
@ -612,7 +612,7 @@ fn shard_manager(
|
|||||||
// We read stderr in another thread as it seems that lines() can block in some cases
|
// We read stderr in another thread as it seems that lines() can block in some cases
|
||||||
let (err_sender, err_receiver) = mpsc::channel();
|
let (err_sender, err_receiver) = mpsc::channel();
|
||||||
thread::spawn(move || {
|
thread::spawn(move || {
|
||||||
for line in shard_stderr_reader.lines().flatten() {
|
for line in shard_stderr_reader.lines().map_while(Result::ok) {
|
||||||
err_sender.send(line).unwrap_or(());
|
err_sender.send(line).unwrap_or(());
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -730,7 +730,7 @@ impl TryFrom<&String> for PythonLogMessage {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn log_lines<S: Sized + BufRead>(lines: Lines<S>) {
|
fn log_lines<S: Sized + BufRead>(lines: Lines<S>) {
|
||||||
for line in lines.flatten() {
|
for line in lines.map_while(Result::ok) {
|
||||||
match PythonLogMessage::try_from(&line) {
|
match PythonLogMessage::try_from(&line) {
|
||||||
Ok(log) => log.trace(),
|
Ok(log) => log.trace(),
|
||||||
Err(_) => tracing::debug!("{line}"),
|
Err(_) => tracing::debug!("{line}"),
|
||||||
@ -882,7 +882,7 @@ fn download_convert_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), L
|
|||||||
// We read stderr in another thread as it seems that lines() can block in some cases
|
// We read stderr in another thread as it seems that lines() can block in some cases
|
||||||
let (err_sender, err_receiver) = mpsc::channel();
|
let (err_sender, err_receiver) = mpsc::channel();
|
||||||
thread::spawn(move || {
|
thread::spawn(move || {
|
||||||
for line in download_stderr.lines().flatten() {
|
for line in download_stderr.lines().map_while(Result::ok) {
|
||||||
err_sender.send(line).unwrap_or(());
|
err_sender.send(line).unwrap_or(());
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -14,7 +14,7 @@ use crate::{
|
|||||||
ChatCompletion, ChatCompletionChoice, ChatCompletionChunk, ChatCompletionComplete,
|
ChatCompletion, ChatCompletionChoice, ChatCompletionChunk, ChatCompletionComplete,
|
||||||
ChatCompletionDelta, ChatCompletionLogprob, ChatCompletionLogprobs, ChatCompletionTopLogprob,
|
ChatCompletionDelta, ChatCompletionLogprob, ChatCompletionLogprobs, ChatCompletionTopLogprob,
|
||||||
ChatRequest, CompatGenerateRequest, Completion, CompletionComplete, CompletionCompleteChunk,
|
ChatRequest, CompatGenerateRequest, Completion, CompletionComplete, CompletionCompleteChunk,
|
||||||
CompletionRequest, VertexRequest, VertexResponse,
|
CompletionRequest, DeltaToolCall, Function, Tool, VertexRequest, VertexResponse,
|
||||||
};
|
};
|
||||||
use crate::{FunctionDefinition, FunctionRef, FunctionsMap, Properties, ToolCall, ToolType, Tools};
|
use crate::{FunctionDefinition, FunctionRef, FunctionsMap, Properties, ToolCall, ToolType, Tools};
|
||||||
use axum::extract::Extension;
|
use axum::extract::Extension;
|
||||||
@ -1213,6 +1213,12 @@ pub async fn run(
|
|||||||
ErrorResponse,
|
ErrorResponse,
|
||||||
GrammarType,
|
GrammarType,
|
||||||
Usage,
|
Usage,
|
||||||
|
DeltaToolCall,
|
||||||
|
ToolType,
|
||||||
|
Tool,
|
||||||
|
ToolCall,
|
||||||
|
Function,
|
||||||
|
FunctionDefinition,
|
||||||
)
|
)
|
||||||
),
|
),
|
||||||
tags(
|
tags(
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "text-generation-server"
|
name = "text-generation-server"
|
||||||
version = "1.4.4"
|
version = "1.4.5"
|
||||||
description = "Text Generation Inference Python gRPC Server"
|
description = "Text Generation Inference Python gRPC Server"
|
||||||
authors = ["Olivier Dehaene <olivier@huggingface.co>"]
|
authors = ["Olivier Dehaene <olivier@huggingface.co>"]
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user