From d805612cc0eb66be7c98150f464db445856de825 Mon Sep 17 00:00:00 2001
From: EndlessReform
Date: Mon, 22 Jan 2024 22:35:37 -0600
Subject: [PATCH] Transparently pass through temp and top_p

---
 router/src/lib.rs    | 12 ++++++++++++
 router/src/server.rs |  4 ++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/router/src/lib.rs b/router/src/lib.rs
index 983079d6..894ab466 100644
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -365,6 +365,18 @@ pub(crate) struct ChatRequest {

     #[schema(nullable = true, example = 42)]
     pub seed: Option<u64>,
+
+    /// What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while
+    /// lower values like 0.2 will make it more focused and deterministic.
+    ///
+    /// We generally recommend altering this or `top_p` but not both.
+    #[serde(default)]
+    pub temperature: Option<f32>,
+
+    /// An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the
+    /// tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
+    #[serde(default)]
+    pub top_p: Option<f32>,
 }
 
 #[derive(Clone, Serialize, Deserialize)]
diff --git a/router/src/server.rs b/router/src/server.rs
index 0cd80ae1..aa1ad202 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -592,10 +592,10 @@ async fn chat_completions(
             inputs: inputs.to_string(),
             parameters: GenerateParameters {
                 best_of: None,
-                temperature: None,
+                temperature: req.temperature,
                 repetition_penalty,
                 top_k: None,
-                top_p: None,
+                top_p: req.top_p,
                 typical_p: None,
                 do_sample: true,
                 max_new_tokens,
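
For reviewers who want to sanity-check the deserialization behavior, below is a minimal, self-contained sketch (not part of the patch) of how the two new optional fields behave under serde. The struct mirrors only the added fields, not the router's full ChatRequest, and it assumes serde (with the derive feature) and serde_json as dependencies.

use serde::Deserialize;

// Trimmed-down stand-in for the patched ChatRequest: only the two new fields.
#[derive(Deserialize, Debug)]
struct ChatRequestSketch {
    #[serde(default)]
    temperature: Option<f32>,
    #[serde(default)]
    top_p: Option<f32>,
}

fn main() {
    // A client that sets temperature gets it parsed as Some(..), which
    // server.rs now forwards into GenerateParameters instead of hardcoding None.
    let req: ChatRequestSketch = serde_json::from_str(r#"{"temperature": 0.8}"#).unwrap();
    assert_eq!(req.temperature, Some(0.8));
    assert_eq!(req.top_p, None);

    // A client that omits both fields still deserializes cleanly: the fields
    // fall back to None, so existing callers see the same behavior as before
    // this patch.
    let old_style: ChatRequestSketch = serde_json::from_str("{}").unwrap();
    assert_eq!(old_style.temperature, None);
    assert_eq!(old_style.top_p, None);
    println!("parsed: {:?} / {:?}", req, old_style);
}

Note that `#[serde(default)]` is technically redundant for `Option` fields (serde's derive already treats a missing field as None), but keeping it on the new fields makes the optionality explicit.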