From d805612cc0eb66be7c98150f464db445856de825 Mon Sep 17 00:00:00 2001
From: EndlessReform
Date: Mon, 22 Jan 2024 22:35:37 -0600
Subject: [PATCH] Transparently pass through temp and top_p

---
 router/src/lib.rs    | 12 ++++++++++++
 router/src/server.rs |  4 ++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/router/src/lib.rs b/router/src/lib.rs
index 983079d6..894ab466 100644
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -365,6 +365,18 @@ pub(crate) struct ChatRequest {

     #[schema(nullable = true, example = 42)]
     pub seed: Option<u64>,
+
+    /// What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while
+    /// lower values like 0.2 will make it more focused and deterministic.
+    ///
+    /// We generally recommend altering this or `top_p` but not both.
+    #[serde(default)]
+    pub temperature: Option<f32>,
+
+    /// An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the
+    /// tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
+    #[serde(default)]
+    pub top_p: Option<f32>,
 }
 
 #[derive(Clone, Serialize, Deserialize)]
diff --git a/router/src/server.rs b/router/src/server.rs
index 0cd80ae1..aa1ad202 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -592,10 +592,10 @@ async fn chat_completions(
             inputs: inputs.to_string(),
             parameters: GenerateParameters {
                 best_of: None,
-                temperature: None,
+                temperature: req.temperature,
                 repetition_penalty,
                 top_k: None,
-                top_p: None,
+                top_p: req.top_p,
                 typical_p: None,
                 do_sample: true,
                 max_new_tokens,
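
For reviewers who want to sanity-check the deserialization behavior, below is a minimal, self-contained sketch (not part of the patch) of how the two new optional fields behave under serde. The struct mirrors only the added fields, not the router's full ChatRequest, and it assumes serde (with the derive feature) and serde_json as dependencies.

use serde::Deserialize;

// Trimmed-down stand-in for the patched ChatRequest: only the two new fields.
#[derive(Deserialize, Debug)]
struct ChatRequestSketch {
    #[serde(default)]
    temperature: Option<f32>,
    #[serde(default)]
    top_p: Option<f32>,
}

fn main() {
    // A client that sets temperature gets it parsed as Some(..), which
    // server.rs now forwards into GenerateParameters instead of hardcoding None.
    let req: ChatRequestSketch = serde_json::from_str(r#"{"temperature": 0.8}"#).unwrap();
    assert_eq!(req.temperature, Some(0.8));
    assert_eq!(req.top_p, None);

    // A client that omits both fields still deserializes cleanly: the fields
    // fall back to None, so existing callers see the same behavior as before
    // this patch.
    let old_style: ChatRequestSketch = serde_json::from_str("{}").unwrap();
    assert_eq!(old_style.temperature, None);
    assert_eq!(old_style.top_p, None);
    println!("parsed: {:?} / {:?}", req, old_style);
}

Note that `#[serde(default)]` is technically redundant for `Option` fields (serde's derive already treats a missing field as None), but keeping it on the new fields makes the optionality explicit.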