Fix token text extraction: slice the input by byte offsets instead of char offsets (see https://github.com/huggingface/text-embeddings-inference/issues/525)
This commit is contained in:
Nicolas Patry 2025-04-09 18:42:25 +02:00 committed by GitHub
parent 5861da1ad7
commit 9a8d0462e1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -74,11 +74,8 @@ fn encoding_to_tokens(encoding: &tokenizers::Encoding, input: &str) -> Vec<Simpl
.iter()
.zip(offsets)
.map(|(&id, &(start, stop))| {
let text = input
.chars()
.skip(start)
.take(stop - start)
.collect::<String>();
let text: Vec<u8> = input.bytes().skip(start).take(stop - start).collect();
let text: String = String::from_utf8_lossy(&text).to_string();
SimpleToken {
id,
text,