mirror of https://github.com/huggingface/text-generation-inference.git
add clean_up_tokenization_spaces
This commit is contained in:
parent 6a56f945c0
commit 56d23753bb
@@ -36,6 +36,8 @@ class Model(ABC):
     def decode_token(self, token_id: int) -> str:
         """Hack to hopefully support generate_stream for the maximum number of tokenizers"""
         # append token to special decode token and decode both
-        result = self.tokenizer.decode([self.special_decode_token_id, token_id], skip_special_tokens=False)
+        result = self.tokenizer.decode(
+            [self.special_decode_token_id, token_id], skip_special_tokens=False
+        )
         # slice to remove special decode token
-        return result[self.special_decode_token_length:]
+        return result[self.special_decode_token_length :]
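This hunk only re-wraps the call for line length, but the hack it touches is worth unpacking: decoding a single token id in isolation can drop leading whitespace with some tokenizers, so a known special token is prepended and its decoded text is sliced off afterwards. Below is a minimal sketch of that idea, assuming a Hugging Face tokenizer; the gpt2 checkpoint and the bos_token_id sentinel are illustrative stand-ins, not this repository's actual special_decode_token values.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Decode the sentinel once so its rendered length is known up front.
sentinel_id = tokenizer.bos_token_id  # stand-in for special_decode_token_id
sentinel_text = tokenizer.decode([sentinel_id], skip_special_tokens=False)

def decode_token(token_id: int) -> str:
    # Decode the sentinel and the new token together, so the tokenizer
    # keeps any leading space it would drop for a lone token id.
    result = tokenizer.decode([sentinel_id, token_id], skip_special_tokens=False)
    # Slice the sentinel's text off the front, keeping only the new token.
    return result[len(sentinel_text):]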
@@ -342,7 +342,9 @@ class Seq2SeqLM(Model):
         return Seq2SeqLMBatch

     def decode(self, decoder_ids: List[int]) -> str:
-        return self.tokenizer.decode(decoder_ids, skip_special_tokens=True)
+        return self.tokenizer.decode(
+            decoder_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )

     def forward(
         self,
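The substantive change of this commit is clean_up_tokenization_spaces=False: by default, transformers post-processes decoded text, for example collapsing the space before punctuation, which can make decoded output disagree with the raw token stream. A hedged illustration of the difference, assuming the t5-small checkpoint (any Seq2Seq tokenizer would do; the exact strings depend on the model):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
ids = tokenizer("Hello , world !").input_ids

# Default cleanup rewrites " ," -> "," and " !" -> "!", yielding
# something like "Hello, world!".
print(tokenizer.decode(ids, skip_special_tokens=True))

# With cleanup disabled, the raw spacing survives: "Hello , world !".
print(
    tokenizer.decode(
        ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
)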