From 0eb4bdc909c815d1974d9ac66c4e62dda5c3130a Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 9 Apr 2025 15:22:49 +0200 Subject: [PATCH] Fixing tokenization like https://github.com/huggingface/text-embeddings-inference/issues/525 --- router/src/server.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/router/src/server.rs b/router/src/server.rs index 45d2b9f3..077c4102 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -74,11 +74,8 @@ fn encoding_to_tokens(encoding: &tokenizers::Encoding, input: &str) -> Vec(); + let text: Vec<u8> = input.bytes().skip(start).take(stop - start).collect(); + let text: String = String::from_utf8_lossy(&text).to_string(); SimpleToken { id, text,