Fixing a tokenization issue like the one in https://github.com/huggingface/text-embeddings-inference/issues/525
This commit is contained in:
Nicolas Patry 2025-04-09 18:42:25 +02:00 committed by GitHub
parent 5861da1ad7
commit 9a8d0462e1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -74,11 +74,8 @@ fn encoding_to_tokens(encoding: &tokenizers::Encoding, input: &str) -> Vec<Simpl
.iter() .iter()
.zip(offsets) .zip(offsets)
.map(|(&id, &(start, stop))| { .map(|(&id, &(start, stop))| {
let text = input let text: Vec<u8> = input.bytes().skip(start).take(stop - start).collect();
.chars() let text: String = String::from_utf8_lossy(&text).to_string();
.skip(start)
.take(stop - start)
.collect::<String>();
SimpleToken { SimpleToken {
id, id,
text, text,