From 9a8d0462e1737238752aa6a7d9c526db87ad6e20 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 9 Apr 2025 18:42:25 +0200 Subject: [PATCH] =?UTF-8?q?Fixing=20tokenization=20like=20https://github.c?= =?UTF-8?q?om/huggingface/text-embeddin=E2=80=A6=20(#3156)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixing tokenization like https://github.com/huggingface/text-embeddings-inference/issues/525 --- router/src/server.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/router/src/server.rs b/router/src/server.rs index 45d2b9f3c..077c41022 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -74,11 +74,8 @@ fn encoding_to_tokens(encoding: &tokenizers::Encoding, input: &str) -> Vec(); + let text: Vec<u8> = input.bytes().skip(start).take(stop - start).collect(); + let text: String = String::from_utf8_lossy(&text).to_string(); SimpleToken { id, text,