mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-22 15:32:08 +00:00
Fixing tokenization, similar to the issue reported in https://github.com/huggingface/text-embeddings-inference/issues/525
This commit is contained in:
parent
0b28aabb94
commit
0eb4bdc909
@@ -74,11 +74,8 @@ fn encoding_to_tokens(encoding: &tokenizers::Encoding, input: &str) -> Vec<SimpleToken> {
         .iter()
         .zip(offsets)
         .map(|(&id, &(start, stop))| {
-            let text = input
-                .chars()
-                .skip(start)
-                .take(stop - start)
-                .collect::<String>();
+            let text: Vec<u8> = input.bytes().skip(start).take(stop - start).collect();
+            let text: String = String::from_utf8_lossy(&text).to_string();
             SimpleToken {
                 id,
                 text,
Loading…
Reference in New Issue
Block a user