diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index a383cc88..a28ef381 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -1267,15 +1267,6 @@ class FlashCausalLM(Model):
 
         prefix = None
         model = model_class(prefix, config, weights)
-
-        if model.config.vocab_size != tokenizer.vocab_size:
-            logger.warning(
-                f"Tokenizer vocab size {tokenizer.vocab_size} does not match model vocab size {model.config.vocab_size}. Updating tokenizer vocab size."
-            )
-            # TODO: HUGE HACK! This is a workaround for the fact that Qwen2TokenizerFast
-            # returns the incorrect vocab size for the 2B model.
-            tokenizer._vocab_size = model.config.vocab_size
-
         torch.distributed.barrier(group=self.process_group)
 
         # VLM models define the config we care about in their text_config
diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py
index 2b1e01df..42588d3b 100644
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@@ -414,6 +414,15 @@ class VlmCausalLM(FlashCausalLM):
             **kwargs,
         )
 
+        if self.config.vocab_size != self.tokenizer.vocab_size:
+            logger.warning(
+                f"Tokenizer vocab size {self.tokenizer.vocab_size} does not match model vocab size {self.config.vocab_size}. Updating tokenizer vocab size."
+            )
+            # TODO: HUGE HACK! This is a workaround to update the vocab size
+            # in the tokenizer. When the tokenizer is updated within the model
+            # the vocab size is not updated in the tokenizer.
+            self.tokenizer._vocab_size = self.config.vocab_size
+
     @property
     def batch_type(self) -> Type[VlmCausalLMBatch]:
         return self.batch_class
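
For readers following the move: the relocated block only forces the tokenizer's cached vocab size to agree with the model config, so token ids emitted by the model stay within the range the tokenizer reports. Below is a minimal, self-contained sketch of that sync logic; `FakeConfig`/`FakeTokenizer` are hypothetical stand-ins (not the actual TGI or HF classes), the vocab sizes are example values only, and the standard `logging` module is used in place of TGI's logger.

```python
import logging
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class FakeConfig:
    # Stand-in for the model config; only vocab_size matters here.
    vocab_size: int


@dataclass
class FakeTokenizer:
    # Stand-in for a fast tokenizer that caches its vocab size privately.
    _vocab_size: int

    @property
    def vocab_size(self) -> int:
        return self._vocab_size


def sync_vocab_size(config: FakeConfig, tokenizer: FakeTokenizer) -> None:
    # Same idea as the block moved into VlmCausalLM.__init__: if the
    # tokenizer disagrees with the model config, warn and trust the config.
    if config.vocab_size != tokenizer.vocab_size:
        logger.warning(
            "Tokenizer vocab size %s does not match model vocab size %s. "
            "Updating tokenizer vocab size.",
            tokenizer.vocab_size,
            config.vocab_size,
        )
        tokenizer._vocab_size = config.vocab_size


if __name__ == "__main__":
    config = FakeConfig(vocab_size=257_216)         # example value only
    tokenizer = FakeTokenizer(_vocab_size=257_152)  # example value only
    sync_vocab_size(config, tokenizer)
    assert tokenizer.vocab_size == config.vocab_size
```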