From b94f30215fab97c6ecbff2f647f911fa6ac38bd6 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 3 Jan 2023 11:07:05 +0100 Subject: [PATCH] fix(server): Use clean_up_tokenization_spaces=False for lossless decoding (#13) Fixes #12 in the easiest way I could think of. --- server/text_generation/models/causal_lm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/text_generation/models/causal_lm.py b/server/text_generation/models/causal_lm.py index 3ad362191..b352eb6bb 100644 --- a/server/text_generation/models/causal_lm.py +++ b/server/text_generation/models/causal_lm.py @@ -354,7 +354,8 @@ class CausalLM(Model): if stop: # Decode all tokens output_text = self.tokenizer.decode( - all_input_ids.squeeze(-1), skip_special_tokens=True + all_input_ids.squeeze(-1), skip_special_tokens=True, + clean_up_tokenization_spaces=False ) # Slice with input_length to remove padding token_ids = all_input_ids[-new_input_length:]