Mirror of https://github.com/huggingface/text-generation-inference.git
Synced 2025-09-12 04:44:52 +00:00
feat: add guideline to chat request and template
This commit is contained in:
parent 6e127dcc96
commit 3b25cd3213
@@ -2080,4 +2080,4 @@
       "description": "Hugging Face Text Generation Inference API"
     }
   ]
-}
+}
@@ -9,7 +9,7 @@ We recommend using the official quantization scripts for creating your quants:
 2. [GPTQ/ Marlin](https://github.com/AutoGPTQ/AutoGPTQ/blob/main/examples/quantization/basic_usage.py)
 3. [EXL2](https://github.com/turboderp/exllamav2/blob/master/doc/convert.md)

-For on-the-fly quantization you simply need to pass one of the supported quantization types and TGI takes care of the rest.
+For on-the-fly quantization you simply need to pass one of the supported quantization types and TGI takes care of the rest.

 ## Quantization with bitsandbytes, EETQ & fp8
@@ -69,4 +69,4 @@ text-generation-launcher --model-id /data/falcon-40b-gptq/ --sharded true --num-
 You can learn more about the quantization options by running `text-generation-server quantize --help`.

 If you wish to do more with GPTQ models (e.g. train an adapter on top), you can read about transformers GPTQ integration [here](https://huggingface.co/blog/gptq-integration).
-You can learn more about GPTQ from the [paper](https://arxiv.org/pdf/2210.17323.pdf).
+You can learn more about GPTQ from the [paper](https://arxiv.org/pdf/2210.17323.pdf).
@@ -48,6 +48,7 @@ impl ChatTemplate {

     pub(crate) fn apply(
         &self,
+        guideline: Option<&str>,
         mut messages: Vec<Message>,
         grammar_with_prompt: Option<(GrammarType, String)>,
     ) -> Result<String, InferError> {
@@ -65,6 +66,7 @@ impl ChatTemplate {

         self.template
             .render(ChatTemplateInputs {
+                guideline,
                 messages,
                 bos_token: self.bos_token.as_deref(),
                 eos_token: self.eos_token.as_deref(),
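TGI's router renders chat templates with minijinja, and the new `guideline` value is simply one more variable exposed to the template. A minimal standalone sketch of that idea (the template string and message shape below are illustrative assumptions, not TGI's real model-supplied template):

```rust
use minijinja::{context, Environment};

fn main() {
    let mut env = Environment::new();
    // Hypothetical template: real chat templates ship with the model.
    env.add_template(
        "chat",
        "{% if guideline %}Guideline: {{ guideline }}\n{% endif %}\
         {% for m in messages %}{{ m.role }}: {{ m.content }}\n{% endfor %}",
    )
    .unwrap();

    let rendered = env
        .get_template("chat")
        .unwrap()
        .render(context! {
            guideline => "Answer politely.", // upstream this is an Option<&str>
            messages => vec![context! { role => "user", content => "Hi!" }],
        })
        .unwrap();
    assert!(rendered.starts_with("Guideline: Answer politely."));
    println!("{rendered}");
}
```

Because the `{% if guideline %}` guard only fires when the variable is set, templates that ignore the field keep rendering exactly as before.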
@@ -138,13 +138,14 @@ impl Infer {
     #[instrument(skip_all)]
     pub(crate) fn apply_chat_template(
         &self,
+        guideline: Option<String>,
         messages: Vec<Message>,
         grammar_with_prompt: Option<(GrammarType, String)>,
     ) -> Result<String, InferError> {
         self.chat_template
             .as_ref()
             .ok_or_else(|| InferError::TemplateError(ErrorKind::TemplateNotFound.into()))?
-            .apply(messages, grammar_with_prompt)
+            .apply(guideline.as_deref(), messages, grammar_with_prompt)
             .map_err(|e| {
                 metrics::counter!("tgi_request_failure", "err" => "template").increment(1);
                 tracing::error!("{e}");
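The only wrinkle at this call site is the ownership handoff: `Infer` owns an `Option<String>` while `ChatTemplate::apply` takes `Option<&str>`, and `Option::as_deref` bridges the two without cloning. A tiny self-contained illustration:

```rust
fn main() {
    let guideline: Option<String> = Some("Stay in character.".to_string());

    // as_deref borrows the inner String as &str, leaving `guideline` usable.
    let borrowed: Option<&str> = guideline.as_deref();
    assert_eq!(borrowed, Some("Stay in character."));

    // None passes through unchanged.
    let empty: Option<String> = None;
    assert_eq!(empty.as_deref(), None::<&str>);
}
```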
@@ -829,6 +829,11 @@ pub(crate) struct ChatRequest {
     #[serde(default)]
     #[schema(nullable = true, default = "null", example = "null")]
     pub response_format: Option<GrammarType>,
+
+    /// A guideline to be used in the chat_template
+    #[serde(default)]
+    #[schema(nullable = true, default = "null", example = "null")]
+    pub guideline: Option<String>,
 }

 fn default_tool_prompt() -> Option<String> {
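Since the field is both `Option` and `#[serde(default)]`, existing clients that never send `guideline` keep working. A runnable sketch against a stripped-down stand-in for `ChatRequest` (the struct below is hypothetical, reduced to the one new field):

```rust
use serde::Deserialize;

// Hypothetical stand-in for ChatRequest with only the new field.
#[derive(Deserialize, Debug)]
struct ChatRequestLite {
    #[serde(default)]
    guideline: Option<String>,
}

fn main() {
    // Omitted field -> None, so old request bodies still deserialize.
    let old: ChatRequestLite = serde_json::from_str("{}").unwrap();
    assert!(old.guideline.is_none());

    let new: ChatRequestLite =
        serde_json::from_str(r#"{"guideline": "Cite your sources."}"#).unwrap();
    assert_eq!(new.guideline.as_deref(), Some("Cite your sources."));
}
```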
@@ -936,6 +941,7 @@ pub(crate) struct ChatTemplateInputs<'a> {
     add_generation_prompt: bool,
     tools: Option<&'a str>,
     tools_prompt: Option<&'a str>,
+    guideline: Option<&'a str>,
 }

 #[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug, PartialEq)]
@@ -141,6 +141,7 @@ async fn get_chat_tokenize(
         tool_prompt,
         temperature,
         response_format,
+        guideline,
         ..
     } = req;

@@ -151,6 +152,7 @@ async fn get_chat_tokenize(
         tools,
         tool_choice,
         &tool_prompt,
+        guideline,
         messages,
     )?;
@@ -1123,6 +1125,7 @@ async fn chat_completions(
         tool_prompt,
         temperature,
         response_format,
+        guideline,
         ..
     } = req;

@@ -1142,6 +1145,7 @@ async fn chat_completions(
         tools,
         tool_choice,
         &tool_prompt,
+        guideline,
         messages,
     )?;
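Both handlers extract the new field the same way: destructure the request, take the fields they need, and discard the rest with `..`. A compact standalone version (the `Req` struct is a made-up stand-in):

```rust
// Made-up stand-in for the request struct.
#[allow(dead_code)]
struct Req {
    temperature: Option<f32>,
    guideline: Option<String>,
    seed: Option<u64>,
}

fn main() {
    let req = Req {
        temperature: Some(0.7),
        guideline: Some("No spoilers.".into()),
        seed: None,
    };

    // `..` ignores every field we don't name, so adding more fields
    // to the request later does not break this pattern.
    let Req { guideline, .. } = req;
    assert_eq!(guideline.as_deref(), Some("No spoilers."));
}
```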
@@ -2402,6 +2406,7 @@ fn prepare_chat_input(
     tools: Option<Vec<Tool>>,
     tool_choice: ToolChoice,
     tool_prompt: &str,
+    guideline: Option<String>,
     messages: Vec<Message>,
 ) -> Result<PreparedInput, InferError> {
     if response_format.is_some() && tools.is_some() {

@@ -2411,7 +2416,7 @@ fn prepare_chat_input(
     }

     if let Some(format) = response_format {
-        let inputs = infer.apply_chat_template(messages, None)?;
+        let inputs = infer.apply_chat_template(guideline, messages, None)?;
         return Ok((inputs, Some(format), None));
     }

@@ -2423,6 +2428,6 @@ fn prepare_chat_input(
     let tools_grammar_prompt = tool_grammar
         .as_ref()
         .map(|t| (GrammarType::Json(serde_json::json!(t)), tool_prompt.into()));
-    let inputs = infer.apply_chat_template(messages, tools_grammar_prompt)?;
+    let inputs = infer.apply_chat_template(guideline, messages, tools_grammar_prompt)?;
     Ok((inputs, grammar, tool_grammar))
 }
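Taken together, `prepare_chat_input` now threads the guideline into the template on both branches: the `response_format` fast path and the tool-grammar path. A condensed sketch of that control flow with simplified stand-in types (everything below is illustrative, not the real router types):

```rust
// Simplified stand-ins for the real router types.
type Grammar = String;

fn apply_chat_template(
    guideline: Option<&str>,
    messages: &[&str],
    grammar_with_prompt: Option<(&Grammar, &str)>,
) -> String {
    // Stub: record what reached the template layer.
    format!(
        "guideline={guideline:?} messages={messages:?} has_tool_prompt={}",
        grammar_with_prompt.is_some()
    )
}

fn prepare_chat_input(
    response_format: Option<Grammar>,
    tool_grammar: Option<Grammar>,
    tool_prompt: &str,
    guideline: Option<&str>,
    messages: &[&str],
) -> (String, Option<Grammar>) {
    // Branch 1: an explicit response_format wins; no tool prompt is used.
    if let Some(format) = response_format {
        return (apply_chat_template(guideline, messages, None), Some(format));
    }
    // Branch 2: a tool grammar (if any) rides along with its prompt.
    let inputs = apply_chat_template(
        guideline,
        messages,
        tool_grammar.as_ref().map(|t| (t, tool_prompt)),
    );
    (inputs, tool_grammar)
}

fn main() {
    let out = prepare_chat_input(None, None, "use tools", Some("Be brief."), &["hi"]);
    println!("{}", out.0);
}
```

Either way the guideline reaches `apply_chat_template`, which is why a single optional request field is enough to cover every chat entry point.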