mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 20:34:54 +00:00

Revert "Reworked the implementation."

This reverts commit 7c3f29777f17411ae4ade57e2f88e73cde704ee5.

parent df72deac26 · commit 5d9613e0c5
@@ -14,7 +14,6 @@ use chat_template::ChatTemplate;
 use futures::future::try_join_all;
 use futures::Stream;
 use minijinja::ErrorKind;
-use serde::{Deserialize, Serialize};
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::Arc;
 use thiserror::Error;
@@ -374,25 +373,4 @@ impl InferError {
             InferError::StreamSerializationError(_) => "stream_serialization_error",
         }
     }
-
-    pub(crate) fn into_openai_event(self) -> Event {
-        let message = self.to_string();
-        Event::default().json_data(OpenaiErrorEvent {
-            error: APIError {
-                message,
-                http_status_code: 422,
-            },
-        })
-    }
-}
-
-#[derive(Serialize)]
-pub struct APIError {
-    message: String,
-    http_status_code: usize,
-}
-
-#[derive(Serialize)]
-pub struct OpenaiErrorEvent {
-    error: APIError,
 }
@@ -7,10 +7,6 @@ use crate::kserve::{
     kerve_server_metadata, kserve_health_live, kserve_health_ready, kserve_model_infer,
     kserve_model_metadata, kserve_model_metadata_ready,
 };
-use crate::sagemaker::{
-    sagemaker_compatibility, SagemakerRequest, SagemakerResponse, SagemakerStreamResponse,
-    __path_sagemaker_compatibility,
-};
 use crate::validation::ValidationError;
 use crate::vertex::vertex_compatibility;
 use crate::ChatTokenizeResponse;
@@ -19,8 +15,7 @@ use crate::{
     GenerateParameters, GenerateRequest, GenerateResponse, GrammarType, HubModelInfo,
     HubProcessorConfig, HubTokenizerConfig, Info, Message, MessageChunk, MessageContent,
     OutputMessage, PrefillToken, SimpleToken, StreamDetails, StreamOptions, StreamResponse,
-    TextMessage, Token, TokenizeResponse, Tokenizer, ToolCallDelta, ToolCallMessage, Url, Usage,
-    Validation,
+    TextMessage, Token, TokenizeResponse, ToolCallDelta, ToolCallMessage, Url, Usage, Validation,
 };
 use crate::{
     ChatCompletion, ChatCompletionChoice, ChatCompletionChunk, ChatCompletionComplete,
@@ -46,7 +41,6 @@ use hf_hub::api::tokio::{Api, ApiBuilder, ApiRepo};
 use hf_hub::{Cache, Repo, RepoType};
 use http::header::AUTHORIZATION;
 use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle};
-use pyo3::prelude::*;
 use pyo3::types::IntoPyDict;
 use regex::Regex;
 use serde_json::Value;
@@ -56,6 +50,7 @@ use std::io::BufReader;
 use std::net::{IpAddr, Ipv4Addr, SocketAddr};
 use std::path::{Path, PathBuf};
 use thiserror::Error;
+use tokenizers::Tokenizer;
 use tokio::select;
 use tokio::signal;
 use tokio::sync::oneshot;
@@ -65,41 +60,6 @@ use tracing::{info_span, instrument, Instrument};
 use utoipa::OpenApi;
 use utoipa_swagger_ui::SwaggerUi;
 
-fn encoding_to_tokens(encoding: &tokenizers::Encoding, input: &str) -> Vec<SimpleToken> {
-    let offsets = encoding.get_offsets();
-    let input_ids = encoding.get_ids();
-    if offsets.len() == input_ids.len() {
-        input_ids
-            .iter()
-            .zip(offsets)
-            .map(|(&id, &(start, stop))| {
-                let text = input
-                    .chars()
-                    .skip(start)
-                    .take(stop - start)
-                    .collect::<String>();
-                SimpleToken {
-                    id,
-                    text,
-                    start,
-                    stop,
-                }
-            })
-            .collect()
-    } else {
-        encoding
-            .get_ids()
-            .iter()
-            .map(|&id| SimpleToken {
-                id,
-                text: "".to_string(),
-                start: 0,
-                stop: 0,
-            })
-            .collect()
-    }
-}
-
 /// Generate tokens if `stream == false` or a stream of token if `stream == true`
 #[utoipa::path(
 post,
@@ -109,7 +69,7 @@ request_body = CompatGenerateRequest,
 responses(
 (status = 200, description = "Generated Text",
 content(
-("application/json" = Vec<GenerateResponse>),
+("application/json" = GenerateResponse),
 ("text/event-stream" = StreamResponse),
 )),
 (status = 424, description = "Generation Error", body = ErrorResponse,
@@ -123,7 +83,7 @@ example = json ! ({"error": "Incomplete generation"})),
 )
 )]
 #[instrument(skip(infer, req))]
-pub(crate) async fn compat_generate(
+async fn compat_generate(
     Extension(default_return_full_text): Extension<bool>,
     infer: Extension<Infer>,
     compute_type: Extension<ComputeType>,
@@ -181,16 +141,12 @@ async fn openai_get_model_info(info: Extension<Info>) -> Json<ModelsInfo> {
     })
 }
 
-/// Template and tokenize ChatRequest
 #[utoipa::path(
     post,
     tag = "Text Generation Inference",
     path = "/chat_tokenize",
     request_body = ChatRequest,
-    responses(
-        (status = 200, description = "Templated and tokenized ChatRequest", body = ChatTokenizeResponse),
-        (status = 404, description = "Failed to tokenize ChatRequest", body = ErrorResponse),
-    )
+    responses((status = 200, description = "Templated and tokenized ChatRequest", body = ChatTokenizeResponse))
 )]
 async fn get_chat_tokenize(
     Extension(infer): Extension<Infer>,
@@ -201,14 +157,40 @@ async fn get_chat_tokenize(
     let generate_request: GenerateRequest = chat.try_into_generate(&infer)?.0;
     let input = generate_request.inputs.clone();
     let encoding = infer.tokenize(generate_request).await?;
-
-    let tokens = encoding_to_tokens(&encoding, &input);
-
-    let resp = ChatTokenizeResponse {
-        tokenize_response: TokenizeResponse(tokens),
-        templated_text: input,
-    };
-    Ok((HeaderMap::new(), Json(resp)))
+    if let Some(encoding) = encoding {
+        let tokens: Vec<SimpleToken> = encoding
+            .get_ids()
+            .iter()
+            .zip(encoding.get_offsets())
+            .map(|(&id, &(start, stop))| {
+                let text = input
+                    .chars()
+                    .skip(start)
+                    .take(stop - start)
+                    .collect::<String>();
+                SimpleToken {
+                    id,
+                    text,
+                    start,
+                    stop,
+                }
+            })
+            .collect();
+
+        let resp = ChatTokenizeResponse {
+            tokenize_response: TokenizeResponse(tokens),
+            templated_text: input,
+        };
+        Ok((HeaderMap::new(), Json(resp)))
+    } else {
+        Err((
+            StatusCode::NOT_FOUND,
+            Json(ErrorResponse {
+                error: "No fast tokenizer or tokenizer.json for this model".to_string(),
+                error_type: "no fast tokenizer".to_string(),
+            }),
+        ))
+    }
 }
 
 #[utoipa::path(
@@ -696,7 +678,7 @@ time_per_token,
 seed,
 )
 )]
-pub(crate) async fn completions(
+async fn completions(
     Extension(infer): Extension<Infer>,
     Extension(compute_type): Extension<ComputeType>,
     Extension(info): Extension<Info>,
@@ -866,7 +848,14 @@ pub(crate) async fn completions(
 
                             yield Ok(event);
                         }
-                        Err(err) => yield Ok(err.into_openai_event()),
+                        Err(err) => {
+                            let event = Event::default()
+                                .json_data(ErrorEvent::into_api_error(err, 422))
+                                .unwrap_or_else(|e| InferError::StreamSerializationError(e.to_string()).into());
+                            println!("{:?}", event);
+                            yield Ok::<Event, Infallible>(event);
+                            break
+                        }
                     }
                 }
             };
@@ -1220,7 +1209,7 @@ time_per_token,
 seed,
 )
 )]
-pub(crate) async fn chat_completions(
+async fn chat_completions(
     Extension(infer): Extension<Infer>,
     Extension(compute_type): Extension<ComputeType>,
     Extension(info): Extension<Info>,
@@ -1274,102 +1263,107 @@ pub(crate) async fn chat_completions(
             };
             let mut response_as_tool = using_tools;
             while let Some(result) = response_stream.next().await {
-                match result{
-                    Ok(stream_tokens) => {
+                match result {
+                    Ok(stream_token) => {
                         let token_text = &stream_token.token.text.clone();
                         match state {
                             StreamState::Buffering => {
                                 json_buffer.push_str(&token_text.replace(" ", ""));
                                 buffer.push(stream_token);
                                 if let Some(captures) = function_regex.captures(&json_buffer) {
                                     let function_name = captures[1].to_string();
                                     if function_name == "no_tool" {
                                         state = StreamState::BufferTrailing;
                                         response_as_tool = false;
                                         buffer.clear();
                                         json_buffer.clear();
                                     } else {
                                         state = StreamState::Content {
                                             skip_close_quote: false,
                                         };
                                         // send all the buffered messages
                                         for stream_token in &buffer {
                                             let event = create_event_from_stream_token(
                                                 stream_token,
                                                 logprobs,
                                                 stream_options.clone(),
                                                 response_as_tool,
                                                 system_fingerprint.clone(),
                                                 model_id.clone(),
                                             );
                                             yield Ok::<Event, Infallible>(event);
+                                        }
                                     }
                                 }
                             }
-                            }
-                            // if we skipped sending the buffer we need to avoid sending the following json key and quotes
-                            StreamState::BufferTrailing => {
-                                let infix_text = "\"content\":\"";
-                                json_buffer.push_str(&token_text.replace(" ", ""));
-                                // keep capturing until we find the infix text
-                                match json_buffer.find(infix_text) {
-                                    Some(content_key_index) => {
-                                        json_buffer =
-                                            json_buffer[content_key_index + infix_text.len()..].to_string();
+                            // if we skipped sending the buffer we need to avoid sending the following json key and quotes
+                            StreamState::BufferTrailing => {
+                                let infix_text = "\"content\":\"";
+                                json_buffer.push_str(&token_text.replace(" ", ""));
+                                // keep capturing until we find the infix text
+                                match json_buffer.find(infix_text) {
+                                    Some(content_key_index) => {
+                                        json_buffer =
+                                            json_buffer[content_key_index + infix_text.len()..].to_string();
+                                    }
+                                    None => {
+                                        continue;
+                                    }
                                 }
-                                    None => {
-                                        continue;
+                                // if there is leftover text after removing the infix text, we need to send it
+                                if !json_buffer.is_empty() {
+                                    let event = Event::default();
+                                    let current_time = std::time::SystemTime::now()
+                                        .duration_since(std::time::UNIX_EPOCH)
+                                        .unwrap_or_else(|_| std::time::Duration::from_secs(0))
+                                        .as_secs();
+                                    let chat_complete =
+                                        CompletionType::ChatCompletionChunk(ChatCompletionChunk::new(
+                                            model_id.clone(),
+                                            system_fingerprint.clone(),
+                                            Some(json_buffer.clone()),
+                                            None,
+                                            current_time,
+                                            None,
+                                            None,
+                                            None,
+                                        ));
+                                    yield Ok(event.json_data(chat_complete).unwrap_or_else(|e| {
+                                        InferError::StreamSerializationError(e.to_string()).into()
+                                    }));
                                 }
+                                // cleanup the buffers
+                                buffer.clear();
+                                json_buffer.clear();
+                                state = StreamState::Content {
+                                    skip_close_quote: true,
+                                };
                             }
-                                // if there is leftover text after removing the infix text, we need to send it
-                                if !json_buffer.is_empty() {
-                                    let event = Event::default();
-                                    let current_time = std::time::SystemTime::now()
-                                        .duration_since(std::time::UNIX_EPOCH)
-                                        .unwrap_or_else(|_| std::time::Duration::from_secs(0))
-                                        .as_secs();
-                                    let chat_complete =
-                                        CompletionType::ChatCompletionChunk(ChatCompletionChunk::new(
-                                            model_id.clone(),
-                                            system_fingerprint.clone(),
-                                            Some(json_buffer.clone()),
-                                            None,
-                                            current_time,
-                                            None,
-                                            None,
-                                            None,
-                                        ));
-                                    yield Ok(event.json_data(chat_complete).unwrap_or_else(|e| {
-                                        InferError::StreamSerializationError(e.to_string()).into()
-                                    }));
-                                }
-                                // cleanup the buffers
-                                buffer.clear();
-                                json_buffer.clear();
-                                state = StreamState::Content {
-                                    skip_close_quote: true,
-                                };
-                            }
-                            StreamState::Content { skip_close_quote } => {
-                                if skip_close_quote && token_text.contains('"') {
-                                    break;
-                                }
-
-                                // send the content
-                                let event = create_event_from_stream_token(
-                                    &stream_token,
-                                    logprobs,
-                                    stream_options.clone(),
-                                    response_as_tool,
-                                    system_fingerprint.clone(),
-                                    model_id.clone(),
-                                );
-
-                                yield Ok::<Event, Infallible>(event);
+                            StreamState::Content { skip_close_quote } => {
+                                if skip_close_quote && token_text.contains('"') {
+                                    break;
+                                }
+                                // send the content
+                                let event = create_event_from_stream_token(
+                                    &stream_token,
+                                    logprobs,
+                                    stream_options.clone(),
+                                    response_as_tool,
+                                    system_fingerprint.clone(),
+                                    model_id.clone(),
+                                );
+                                yield Ok::<Event, Infallible>(event);
+                            }
                         }
                     }
-                    },
-                    Err(err) => yield Event::from_openai(err)
+                    Err(err) => {
+                        let event = Event::default()
+                            .json_data(ErrorEvent::into_api_error(err, 422))
+                            .unwrap_or_else(|e| InferError::StreamSerializationError(e.to_string()).into());
+                        yield Ok::<Event, Infallible>(event);
+                        break;
+                    }
                 }
             }
             yield Ok::<Event, Infallible>(Event::default().data("[DONE]"));
@@ -1475,8 +1469,35 @@ async fn tokenize(
 ) -> Result<Json<TokenizeResponse>, (StatusCode, Json<ErrorResponse>)> {
     let input = req.inputs.clone();
     let encoding = infer.tokenize(req).await?;
-    let tokens = encoding_to_tokens(&encoding, &input);
-    Ok(Json(TokenizeResponse(tokens)))
+    if let Some(encoding) = encoding {
+        let tokens: Vec<SimpleToken> = encoding
+            .get_ids()
+            .iter()
+            .zip(encoding.get_offsets())
+            .map(|(&id, &(start, stop))| {
+                let text = input
+                    .chars()
+                    .skip(start)
+                    .take(stop - start)
+                    .collect::<String>();
+                SimpleToken {
+                    id,
+                    text,
+                    start,
+                    stop,
+                }
+            })
+            .collect();
+        Ok(Json(TokenizeResponse(tokens)))
+    } else {
+        Err((
+            StatusCode::NOT_FOUND,
+            Json(ErrorResponse {
+                error: "No fast tokenizer or tokenizer.json for this model".to_string(),
+                error_type: "no fast tokenizer".to_string(),
+            }),
+        ))
+    }
 }
 
 /// Prometheus metrics scrape endpoint
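Editor's note: the restored `/tokenize` and `/chat_tokenize` handlers above rebuild `SimpleToken`s the same way, by zipping token ids with their offsets and slicing the input by character index. A minimal, self-contained sketch of that mapping (not taken from the diff: the `SimpleToken` struct and `tokens_from_offsets` helper here are local stand-ins, and the ids/offsets in `main` are made up):

```rust
// Illustrative sketch only: mirrors the id/offset -> SimpleToken mapping above.
#[derive(Debug)]
struct SimpleToken {
    id: u32,
    text: String,
    start: usize,
    stop: usize,
}

fn tokens_from_offsets(ids: &[u32], offsets: &[(usize, usize)], input: &str) -> Vec<SimpleToken> {
    ids.iter()
        .zip(offsets)
        .map(|(&id, &(start, stop))| SimpleToken {
            id,
            // Character-index slicing, as in the handler's `.chars().skip(start)` chain.
            text: input.chars().skip(start).take(stop - start).collect(),
            start,
            stop,
        })
        .collect()
}

fn main() {
    // Hypothetical ids/offsets for "hello world"; real values come from the tokenizer.
    let tokens = tokens_from_offsets(&[42, 7], &[(0, 5), (6, 11)], "hello world");
    for t in &tokens {
        println!("{:?}", t);
    }
}
```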
@@ -1507,14 +1528,11 @@ completions,
 tokenize,
 metrics,
 openai_get_model_info,
-sagemaker_compatibility,
-get_chat_tokenize,
 ),
 components(
 schemas(
 Info,
 CompatGenerateRequest,
-SagemakerRequest,
 GenerateRequest,
 GrammarType,
 ChatRequest,
@@ -1537,8 +1555,6 @@ ChatCompletionTopLogprob,
 ChatCompletion,
 CompletionRequest,
 CompletionComplete,
-SagemakerResponse,
-SagemakerStreamResponse,
 Chunk,
 Completion,
 CompletionFinal,
@@ -1566,7 +1582,6 @@ Function,
 FunctionDefinition,
 ToolChoice,
 ModelInfo,
-ChatTokenizeResponse,
 )
 ),
 tags(
@@ -1586,71 +1601,6 @@ pub fn schema() -> ApiDoc {
 ApiDoc
 }
 
-fn py_resolve_tokenizer(
-    py: pyo3::Python,
-    tokenizer_name: &str,
-    revision: Option<&str>,
-    trust_remote_code: bool,
-) -> pyo3::PyResult<()> {
-    let transformers = py.import_bound("transformers")?;
-    let auto = transformers.getattr("AutoTokenizer")?;
-    let from_pretrained = auto.getattr("from_pretrained")?;
-    let args = (tokenizer_name,);
-    let kwargs = if let Some(rev) = &revision {
-        [
-            ("revision", rev.to_string().into_py(py)),
-            ("trust_remote_code", trust_remote_code.into_py(py)),
-        ]
-        .into_py_dict_bound(py)
-    } else {
-        [("trust_remote_code", trust_remote_code.into_py(py))].into_py_dict_bound(py)
-    };
-    let tokenizer = from_pretrained.call(args, Some(&kwargs))?;
-    let save = tokenizer.getattr("save_pretrained")?;
-    let args = ("out".to_string(),);
-    save.call1(args)?;
-    Ok(())
-}
-
-fn legacy_tokenizer_handle(config_filename: Option<&PathBuf>) -> Option<()> {
-    // XXX Legacy case for FasterDecoding/medusa-vicuna-7b-v1.3
-    // and state-spaces/mamba-130m
-    tracing::warn!("Odd tokenizer detected, falling back on legacy tokenization");
-
-    #[derive(serde::Deserialize)]
-    struct FallbackConfig {
-        base_model_name_or_path: Option<String>,
-        model_type: Option<String>,
-        ssm_config: Option<serde_json::Value>,
-    }
-    config_filename.and_then(|filename| {
-        std::fs::read_to_string(filename)
-            .ok()
-            .as_ref()
-            .and_then(|c| {
-                let config: Result<FallbackConfig, _> = serde_json::from_str(c);
-                if let Ok(config) = config {
-                    if config.model_type.is_none() {
-                        if let Some(base) = config.base_model_name_or_path {
-                            pyo3::Python::with_gil(|py| -> PyResult<()> {
-                                py_resolve_tokenizer(py, &base, Some("main"), false)
-                            })
-                            .ok()?;
-                        }
-                    }
-                    if config.ssm_config.is_some() {
-                        // XXX Legacy mamba
-                        pyo3::Python::with_gil(|py| -> PyResult<()> {
-                            py_resolve_tokenizer(py, "EleutherAI/gpt-neox-20b", Some("main"), false)
-                        })
-                        .ok()?;
-                    }
-                }
-                Some(())
-            })
-    })
-}
-
 /// Serving method
 #[allow(clippy::too_many_arguments)]
 pub async fn run(
@@ -1666,13 +1616,13 @@ pub async fn run(
     tokenizer_name: String,
     tokenizer_config_path: Option<String>,
     revision: Option<String>,
-    trust_remote_code: bool,
     hostname: String,
     port: u16,
     cors_allow_origin: Option<Vec<String>>,
     ngrok: bool,
     _ngrok_authtoken: Option<String>,
     _ngrok_edge: Option<String>,
+    messages_api_enabled: bool,
     disable_grammar_support: bool,
     max_client_batch_size: usize,
     usage_stats_level: usage_stats::UsageStatsLevel,
@@ -1744,6 +1694,7 @@ pub async fn run(
 
     // Load tokenizer and model info
     let (
+        tokenizer_filename,
         config_filename,
         tokenizer_config_filename,
         preprocessor_config_filename,
@@ -1751,6 +1702,7 @@ pub async fn run(
         model_info,
     ) = match api {
         Type::None => (
+            Some(local_path.join("tokenizer.json")),
             Some(local_path.join("config.json")),
             Some(local_path.join("tokenizer_config.json")),
             Some(local_path.join("preprocessor_config.json")),
@@ -1764,6 +1716,10 @@ pub async fn run(
                 revision.clone().unwrap_or_else(|| "main".to_string()),
             ));
 
+            let tokenizer_filename = match api_repo.get("tokenizer.json").await {
+                Ok(tokenizer_filename) => Some(tokenizer_filename),
+                Err(_) => get_base_tokenizer(&api, &api_repo).await,
+            };
             let config_filename = api_repo.get("config.json").await.ok();
             let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok();
             let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok();
@@ -1776,6 +1732,7 @@ pub async fn run(
                 None
             };
             (
+                tokenizer_filename,
                 config_filename,
                 tokenizer_config_filename,
                 preprocessor_config_filename,
@@ -1790,6 +1747,7 @@ pub async fn run(
                 revision.clone().unwrap_or_else(|| "main".to_string()),
             ));
             (
+                repo.get("tokenizer.json"),
                 repo.get("config.json"),
                 repo.get("tokenizer_config.json"),
                 repo.get("preprocessor_config.json"),
@@ -1811,31 +1769,36 @@ pub async fn run(
         HubTokenizerConfig::default()
     });
 
-    let tokenizer: Tokenizer = {
+    let tokenizer: Option<Tokenizer> = tokenizer_filename.and_then(|filename| {
         use pyo3::prelude::*;
-        pyo3::Python::with_gil(|py| -> PyResult<()> {
-            py_resolve_tokenizer(py, &tokenizer_name, revision.as_deref(), trust_remote_code)?;
+        let convert = pyo3::Python::with_gil(|py| -> PyResult<()> {
+            let transformers = py.import_bound("transformers")?;
+            let auto = transformers.getattr("AutoTokenizer")?;
+            let from_pretrained = auto.getattr("from_pretrained")?;
+            let args = (tokenizer_name.to_string(),);
+            let kwargs = [(
+                "revision",
+                revision.clone().unwrap_or_else(|| "main".to_string()),
+            )]
+            .into_py_dict_bound(py);
+            let tokenizer = from_pretrained.call(args, Some(&kwargs))?;
+            let save = tokenizer.getattr("save_pretrained")?;
+            let args = ("out".to_string(),);
+            save.call1(args)?;
             Ok(())
         })
         .inspect_err(|err| {
             tracing::error!("Failed to import python tokenizer {err}");
-        })
-        .or_else(|err| {
-            let out = legacy_tokenizer_handle(config_filename.as_ref());
-            out.ok_or(err)
-        })
-        .expect("We cannot load a tokenizer");
-        let filename = "out/tokenizer.json";
-        if let Ok(tok) = tokenizers::Tokenizer::from_file(filename) {
-            Tokenizer::Rust(tok)
+        });
+        let filename = if convert.is_ok() {
+            // If we have correctly loaded and resaved with transformers
+            // We might have modified the tokenizer.json according to transformers
+            "out/tokenizer.json".into()
         } else {
-            Tokenizer::Python {
-                tokenizer_name: tokenizer_name.clone(),
-                revision: revision.clone(),
-                trust_remote_code,
-            }
-        }
-    };
+            filename
+        };
+        Tokenizer::from_file(filename).ok()
+    });
 
     let config: Option<Config> = config_filename.and_then(|filename| {
         std::fs::read_to_string(filename)
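Editor's note: both sides of this hunk rely on the same convert-and-reload idea: ask Python's transformers to load and re-save the tokenizer, then read the re-saved out/tokenizer.json with the Rust tokenizers crate. A hedged sketch of that pyo3 call pattern, assuming pyo3 0.21 (the `import_bound`/`into_py_dict_bound` APIs seen in the diff) with the `auto-initialize` feature and a Python environment that has transformers installed; the function name `convert_with_transformers` is hypothetical:

```rust
use pyo3::prelude::*;
use pyo3::types::IntoPyDict;

// Sketch of the convert step: AutoTokenizer.from_pretrained(name, revision=...)
// followed by save_pretrained("out"), so out/tokenizer.json can be loaded natively.
fn convert_with_transformers(name: &str, revision: &str) -> PyResult<()> {
    Python::with_gil(|py| {
        let transformers = py.import_bound("transformers")?;
        let auto = transformers.getattr("AutoTokenizer")?;
        let kwargs = [("revision", revision)].into_py_dict_bound(py);
        let tokenizer = auto.call_method("from_pretrained", (name,), Some(&kwargs))?;
        tokenizer.call_method1("save_pretrained", ("out",))?;
        Ok(())
    })
}
```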
@@ -1863,6 +1826,10 @@ pub async fn run(
         preprocessor_config_filename.and_then(HubPreprocessorConfig::from_file);
 
     tracing::info!("Using config {config:?}");
+    if tokenizer.is_none() {
+        tracing::warn!("Could not find a fast tokenizer implementation for {tokenizer_name}");
+        tracing::warn!("Rust input length validation and truncation is disabled");
+    }
 
     // Only send usage stats when TGI is run in container and the function returns Some
     let is_container = matches!(usage_stats::is_container(), Ok(true));
@@ -1884,6 +1851,7 @@ pub async fn run(
         // max_batch_size,
         revision.clone(),
         validation_workers,
+        messages_api_enabled,
         disable_grammar_support,
         max_client_batch_size,
         usage_stats_level,
@@ -1925,6 +1893,7 @@ pub async fn run(
         ngrok,
         _ngrok_authtoken,
         _ngrok_edge,
+        messages_api_enabled,
         disable_grammar_support,
         max_client_batch_size,
         model_info,
@@ -1977,13 +1946,14 @@ async fn start(
     validation_workers: usize,
     api_key: Option<String>,
     config: Option<Config>,
-    (tokenizer, tokenizer_config): (Tokenizer, HubTokenizerConfig),
+    (tokenizer, tokenizer_config): (Option<Tokenizer>, HubTokenizerConfig),
     (preprocessor_config, processor_config): (Option<HubPreprocessorConfig>, HubProcessorConfig),
     hostname: String,
     port: u16,
     ngrok: bool,
     _ngrok_authtoken: Option<String>,
     _ngrok_edge: Option<String>,
+    messages_api_enabled: bool,
     disable_grammar_support: bool,
     max_client_batch_size: usize,
     model_info: HubModelInfo,
@@ -2298,7 +2268,6 @@ async fn start(
         .route("/v1/chat/completions", post(chat_completions))
         .route("/v1/completions", post(completions))
         .route("/vertex", post(vertex_compatibility))
-        .route("/invocations", post(sagemaker_compatibility))
         .route("/tokenize", post(tokenize));
 
     if let Some(api_key) = api_key {
@@ -2334,6 +2303,13 @@ async fn start(
         .route("/metrics", get(metrics))
         .route("/v1/models", get(openai_get_model_info));
 
+    // Conditional AWS Sagemaker route
+    let aws_sagemaker_route = if messages_api_enabled {
+        Router::new().route("/invocations", post(chat_completions)) // Use 'chat_completions' for OAI_ENABLED
+    } else {
+        Router::new().route("/invocations", post(compat_generate)) // Use 'compat_generate' otherwise
+    };
+
     let compute_type =
         ComputeType(std::env::var("COMPUTE_TYPE").unwrap_or("gpu+optimized".to_string()));
 
@@ -2341,7 +2317,8 @@ async fn start(
     let mut app = Router::new()
         .merge(swagger_ui)
         .merge(base_routes)
-        .merge(info_routes);
+        .merge(info_routes)
+        .merge(aws_sagemaker_route);
 
     #[cfg(feature = "google")]
     {
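Editor's note: the restored `/invocations` wiring picks its handler once at startup rather than per request. A minimal axum sketch of that pattern (assumes axum 0.7; the string-returning handlers are stand-ins for the real `chat_completions` / `compat_generate`):

```rust
use axum::{routing::post, Router};

// Stand-in handlers for illustration only.
async fn chat_completions() -> &'static str {
    "chat"
}
async fn compat_generate() -> &'static str {
    "generate"
}

// Build the /invocations sub-router once, then merge it into the main app,
// mirroring the `aws_sagemaker_route` merged above.
fn aws_sagemaker_route(messages_api_enabled: bool) -> Router {
    if messages_api_enabled {
        Router::new().route("/invocations", post(chat_completions))
    } else {
        Router::new().route("/invocations", post(compat_generate))
    }
}
```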
@@ -2437,6 +2414,30 @@ pub async fn get_hub_model_info(api: &ApiRepo) -> Option<HubModelInfo> {
     }
 }
 
+/// get base tokenizer
+pub async fn get_base_tokenizer(api: &Api, api_repo: &ApiRepo) -> Option<PathBuf> {
+    let config_filename = api_repo.get("config.json").await.ok()?;
+
+    // Open the file in read-only mode with buffer.
+    let file = File::open(config_filename).ok()?;
+    let reader = BufReader::new(file);
+
+    // Read the JSON contents of the file as an instance of `User`.
+    let config: serde_json::Value = serde_json::from_reader(reader).ok()?;
+
+    if let Some(serde_json::Value::String(base_model_id)) = config.get("base_model_name_or_path") {
+        let api_base_repo = api.repo(Repo::with_revision(
+            base_model_id.to_string(),
+            RepoType::Model,
+            "main".to_string(),
+        ));
+
+        api_base_repo.get("tokenizer.json").await.ok()
+    } else {
+        None
+    }
+}
+
 /// get tokenizer_config from the Huggingface Hub
 pub async fn get_tokenizer_config(api_repo: &ApiRepo) -> Option<HubTokenizerConfig> {
     let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok()?;
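Editor's note: `get_base_tokenizer` only needs one field out of config.json, the pointer it follows to the base repo's tokenizer.json. A small self-contained sketch of that probe (serde_json only; `base_model_id` is a hypothetical helper and the model id in `main` is just an example):

```rust
use serde_json::Value;

// Extract base_model_name_or_path, the field get_base_tokenizer follows
// to fetch tokenizer.json from the base model's repository.
fn base_model_id(config_json: &str) -> Option<String> {
    let config: Value = serde_json::from_str(config_json).ok()?;
    match config.get("base_model_name_or_path") {
        Some(Value::String(id)) => Some(id.clone()),
        _ => None,
    }
}

fn main() {
    let cfg = r#"{"base_model_name_or_path": "org/base-model"}"#;
    assert_eq!(base_model_id(cfg).as_deref(), Some("org/base-model"));
}
```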
@@ -2520,6 +2521,28 @@ impl From<InferError> for Event {
     }
 }
 
+#[derive(serde::Serialize)]
+pub struct APIError {
+    message: String,
+    http_status_code: usize,
+}
+
+#[derive(serde::Serialize)]
+pub struct ErrorEvent {
+    error: APIError,
+}
+
+impl ErrorEvent {
+    fn into_api_error(err: InferError, http_status_code: usize) -> Self {
+        ErrorEvent {
+            error: APIError {
+                message: err.to_string(),
+                http_status_code,
+            },
+        }
+    }
+}
+
 #[derive(Debug, Error)]
 pub enum WebServerError {
     #[error("Axum error: {0}")]
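Editor's note: for reference, the JSON shape the restored `ErrorEvent` serializes onto the SSE `data:` line, reproduced here with plain serde so it runs on its own (assumes the serde and serde_json crates; the message text is just an example):

```rust
use serde::Serialize;

#[derive(Serialize)]
struct APIError {
    message: String,
    http_status_code: usize,
}

#[derive(Serialize)]
struct ErrorEvent {
    error: APIError,
}

fn main() {
    let event = ErrorEvent {
        error: APIError {
            message: "Incomplete generation".to_string(),
            http_status_code: 422,
        },
    };
    // Prints: {"error":{"message":"Incomplete generation","http_status_code":422}}
    println!("{}", serde_json::to_string(&event).unwrap());
}
```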
@@ -2579,11 +2602,10 @@ mod tests {
     use crate::TokenizerConfigToken;
     use crate::Tool;
 
-    use crate::tests::get_tokenizer;
     use serde_json::json;
 
-    #[tokio::test]
-    async fn test_prepare_chat_input() {
+    #[test]
+    fn test_prepare_chat_input() {
         // Mock Backend to avoid network requests
         struct MockBackend;
 
@@ -2624,11 +2646,9 @@ mod tests {
             ChatTemplateVersions::Single("{%- if messages[0][\"role\"] == \"system\" %}\n    {%- set system_message = messages[0][\"content\"] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n    {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n        {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n            {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n        {%- endif %}\n        {%- set ns.index = ns.index + 1 %}\n    {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if message[\"role\"] == \"user\" %}\n        {%- if tools is not none and (message == user_messages[-1]) %}\n            {{- \"[AVAILABLE_TOOLS] [\" }}\n            {%- for tool in tools %}\n                {%- set tool = tool.function %}\n                {{- '{\"type\": \"function\", \"function\": {' }}\n                {%- for key, val in tool.items() if key != \"return\" %}\n                    {%- if val is string %}\n                        {{- '\"' + key + '\": \"' + val + '\"' }}\n                    {%- else %}\n                        {{- '\"' + key + '\": ' + val|tojson }}\n                    {%- endif %}\n                    {%- if not loop.last %}\n                        {{- \", \" }}\n                    {%- endif %}\n                {%- endfor %}\n                {{- \"}}\" }}\n                {%- if not loop.last %}\n                    {{- \", \" }}\n                {%- else %}\n                    {{- \"]\" }}\n                {%- endif %}\n            {%- endfor %}\n            {{- \"[/AVAILABLE_TOOLS]\" }}\n        {%- endif %}\n        {%- if loop.last and system_message is defined %}\n            {{- \"[INST] \" + system_message + \"\\n\\n\" + message[\"content\"] + \"[/INST]\" }}\n        {%- else %}\n            {{- \"[INST] \" + message[\"content\"] + \"[/INST]\" }}\n        {%- endif %}\n    {%- elif message.tool_calls is defined and message.tool_calls is not none %}\n        {{- \"[TOOL_CALLS] [\" }}\n        {%- for tool_call in message.tool_calls %}\n            {%- set out = tool_call.function|tojson %}\n            {{- out[:-1] }}\n            {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n                {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n            {%- endif %}\n            {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n            {%- if not loop.last %}\n                {{- \", \" }}\n            {%- else %}\n                {{- \"]\" + eos_token }}\n            {%- endif %}\n        {%- endfor %}\n    {%- elif message[\"role\"] == \"assistant\" %}\n        {{- \" \" + message[\"content\"]|trim + eos_token}}\n    {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n        {%- if message.content is defined and message.content.content is defined %}\n            {%- set content = message.content.content %}\n        {%- else %}\n            {%- set content = message.content %}\n        {%- endif %}\n        {{- '[TOOL_RESULTS] {\"content\": ' + content|string + \", \" }}\n        {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n            {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n        {%- endif %}\n        {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n    {%- else %}\n        {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n    {%- endif %}\n{%- endfor %}\n".to_string())
         );
 
-        let tokenizer = get_tokenizer();
-
         let infer = Infer::new(
             backend,
-            Validation::new(1, tokenizer, None, None, 1, 1, 1, 1, 1, false),
+            Validation::new(1, None, None, None, 1, 1, 1, 1, 1, false),
             1,
             tokenizer_config,
             HubProcessorConfig::default(),