diff --git a/server/text_generation_server/models/bloom.py b/server/text_generation_server/models/bloom.py
index 0d83abe2..ce3895ca 100644
--- a/server/text_generation_server/models/bloom.py
+++ b/server/text_generation_server/models/bloom.py
@@ -68,7 +68,7 @@ class BLOOMSharded(BLOOM):
             dtype = torch.float32

         tokenizer = AutoTokenizer.from_pretrained(
-            model_id, revision=revision, padding_side="left"
+            model_id, revision=revision, padding_side="left", truncation_side="left"
         )

         config = AutoConfig.from_pretrained(
diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py
index 2e077ca6..8c092d6a 100644
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@@ -303,7 +303,7 @@ class CausalLM(Model):
             dtype = torch.float32

         tokenizer = AutoTokenizer.from_pretrained(
-            model_id, revision=revision, padding_side="left"
+            model_id, revision=revision, padding_side="left", truncation_side="left"
         )
         self.model = AutoModelForCausalLM.from_pretrained(
             model_id,
diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index c8934547..3801ed24 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -224,7 +224,7 @@ class FlashCausalLM(Model):
             raise NotImplementedError("FlashCausalLM does not support quantization")

         tokenizer = AutoTokenizer.from_pretrained(
-            model_id, revision=revision, padding_side="left"
+            model_id, revision=revision, padding_side="left", truncation_side="left"
         )
         self.model = (
             model_cls.from_pretrained(
diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py
index e58dab61..063910f4 100644
--- a/server/text_generation_server/models/flash_llama.py
+++ b/server/text_generation_server/models/flash_llama.py
@@ -42,6 +42,7 @@ class FlashLlama(FlashCausalLM):
             model_id,
             revision=revision,
             padding_side="left",
+            truncation_side="left",
         )

         config = AutoConfig.from_pretrained(
@@ -160,6 +161,7 @@ class FlashLlamaSharded(FlashLlama):
             model_id,
             revision=revision,
             padding_side="left",
+            truncation_side="left",
         )

         config = AutoConfig.from_pretrained(
diff --git a/server/text_generation_server/models/flash_neox.py b/server/text_generation_server/models/flash_neox.py
index ecf68442..b93d9f70 100644
--- a/server/text_generation_server/models/flash_neox.py
+++ b/server/text_generation_server/models/flash_neox.py
@@ -45,7 +45,7 @@ class FlashNeoXSharded(FlashNeoX):
             raise NotImplementedError("FlashNeoX does not support quantization")

         tokenizer = AutoTokenizer.from_pretrained(
-            model_id, revision=revision, padding_side="left"
+            model_id, revision=revision, padding_side="left", truncation_side="left"
         )

         config = AutoConfig.from_pretrained(
diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py
index 106195c2..2f680995 100644
--- a/server/text_generation_server/models/flash_santacoder.py
+++ b/server/text_generation_server/models/flash_santacoder.py
@@ -33,7 +33,7 @@ class FlashSantacoder(FlashCausalLM):
             raise NotImplementedError("FlashSantacoder does not support quantization")

         tokenizer = AutoTokenizer.from_pretrained(
-            model_id, revision=revision, padding_side="left"
+            model_id, revision=revision, padding_side="left", truncation_side="left"
         )

         config = AutoConfig.from_pretrained(
diff --git a/server/text_generation_server/models/galactica.py b/server/text_generation_server/models/galactica.py
index f1090f63..f7fbb2ad 100644
--- a/server/text_generation_server/models/galactica.py
+++ b/server/text_generation_server/models/galactica.py
@@ -198,7 +198,7 @@ class GalacticaSharded(Galactica):
             dtype = torch.float32

         tokenizer = AutoTokenizer.from_pretrained(
-            model_id, revision=revision, padding_side="left"
+            model_id, revision=revision, padding_side="left", truncation_side="left"
         )

         config = AutoConfig.from_pretrained(
diff --git a/server/text_generation_server/models/gpt_neox.py b/server/text_generation_server/models/gpt_neox.py
index 8fabefe3..b81976da 100644
--- a/server/text_generation_server/models/gpt_neox.py
+++ b/server/text_generation_server/models/gpt_neox.py
@@ -44,7 +44,7 @@ class GPTNeoxSharded(CausalLM):
             dtype = torch.float32

         tokenizer = AutoTokenizer.from_pretrained(
-            model_id, revision=revision, padding_side="left"
+            model_id, revision=revision, padding_side="left", truncation_side="left"
         )
         tokenizer.pad_token = tokenizer.eos_token

diff --git a/server/text_generation_server/models/santacoder.py b/server/text_generation_server/models/santacoder.py
index fe15cde0..58361a8d 100644
--- a/server/text_generation_server/models/santacoder.py
+++ b/server/text_generation_server/models/santacoder.py
@@ -26,7 +26,7 @@ class SantaCoder(CausalLM):
             dtype = torch.float32

         tokenizer = AutoTokenizer.from_pretrained(
-            model_id, revision=revision, padding_side="left"
+            model_id, revision=revision, padding_side="left", truncation_side="left"
         )
         tokenizer.add_special_tokens(
             {
diff --git a/server/text_generation_server/models/seq2seq_lm.py b/server/text_generation_server/models/seq2seq_lm.py
index 134ea681..13eafd62 100644
--- a/server/text_generation_server/models/seq2seq_lm.py
+++ b/server/text_generation_server/models/seq2seq_lm.py
@@ -349,7 +349,7 @@ class Seq2SeqLM(Model):
             load_in_8bit=quantize,
         ).eval()
         tokenizer = AutoTokenizer.from_pretrained(
-            model_id, revision=revision, padding_side="left"
+            model_id, revision=revision, padding_side="left", truncation_side="left"
         )
         tokenizer.bos_token_id = self.model.config.decoder_start_token_id

diff --git a/server/text_generation_server/models/t5.py b/server/text_generation_server/models/t5.py
index cb4f7f22..300b376e 100644
--- a/server/text_generation_server/models/t5.py
+++ b/server/text_generation_server/models/t5.py
@@ -44,7 +44,7 @@ class T5Sharded(Seq2SeqLM):
             dtype = torch.float32

         tokenizer = AutoTokenizer.from_pretrained(
-            model_id, revision=revision, padding_side="left"
+            model_id, revision=revision, padding_side="left", truncation_side="left"
         )

         config = AutoConfig.from_pretrained(
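The change is uniform across all model classes: every `AutoTokenizer.from_pretrained(...)` call now passes `truncation_side="left"` alongside the existing `padding_side="left"`, so that a prompt longer than `max_length` loses tokens from its beginning rather than its end, keeping the most recent context next to the generation point. A minimal sketch of what the flag changes, using only the standard `transformers` tokenizer API; the `gpt2` checkpoint, sample text, and `max_length=4` below are arbitrary illustration choices, not values used by the server:

```python
# Illustrative sketch only: "gpt2", the sample text, and max_length=4 are
# hypothetical choices to demonstrate the flag, not part of this diff.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "gpt2", padding_side="left", truncation_side="left"
)

text = "one two three four five six seven eight"

# truncation_side="left": tokens are dropped from the start of the prompt,
# preserving the end (the context closest to where generation continues).
enc = tokenizer(text, truncation=True, max_length=4)
print(tokenizer.decode(enc["input_ids"]))  # keeps the final words

# The transformers default is truncation_side="right", which keeps the start
# of the prompt and discards the most recent context instead.
tokenizer.truncation_side = "right"
enc = tokenizer(text, truncation=True, max_length=4)
print(tokenizer.decode(enc["input_ids"]))  # keeps the opening words
```

For left-padded decoder-only serving this pairing is the consistent choice: padding and truncation both operate on the left edge of the prompt, so the tokens immediately preceding generation are always the ones preserved.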