Fixing codellama loads by using AutoTokenizer only.

- The slow-tokenizer default dates back to when Llama 1 was introduced and
  not all of its flags were supported in `tokenizers` yet; the fast
  `AutoTokenizer` path now covers them (see the sketch below).

- Fixes #1891
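
As a minimal sketch of the behaviour this change relies on (the model id below is only an illustrative CodeLlama checkpoint, not part of this diff), `AutoTokenizer.from_pretrained` resolves directly to the fast `tokenizers`-backed implementation and accepts the same padding/truncation flags the server passes:

```python
from transformers import AutoTokenizer

# Illustrative checkpoint; any Llama/CodeLlama model id behaves the same way.
tokenizer = AutoTokenizer.from_pretrained(
    "codellama/CodeLlama-7b-hf",
    padding_side="left",      # decoder-only models are padded on the left for batching
    truncation_side="left",   # keep the most recent tokens when the prompt is too long
)

# With `tokenizers` installed this returns the fast (Rust-backed) tokenizer,
# e.g. CodeLlamaTokenizerFast, so the slow LlamaTokenizer fallback is unnecessary.
print(type(tokenizer).__name__)
```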
Nicolas Patry 2024-05-24 10:02:35 +00:00
parent f41d644a90
commit ad0b36bd28


@@ -3,7 +3,6 @@ import torch.distributed
 
 from opentelemetry import trace
 from transformers import AutoConfig, AutoTokenizer, GenerationConfig
-from transformers.models.llama import LlamaTokenizer
 from typing import Optional
 
 from text_generation_server.models import FlashCausalLM
@@ -41,22 +40,13 @@ class FlashLlama(FlashCausalLM):
         else:
             raise NotImplementedError("FlashLlama is only available on GPU")
 
-        try:
-            tokenizer = LlamaTokenizer.from_pretrained(
-                model_id,
-                revision=revision,
-                padding_side="left",
-                truncation_side="left",
-                trust_remote_code=trust_remote_code,
-            )
-        except Exception:
-            tokenizer = AutoTokenizer.from_pretrained(
-                model_id,
-                revision=revision,
-                padding_side="left",
-                truncation_side="left",
-                trust_remote_code=trust_remote_code,
-            )
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id,
+            revision=revision,
+            padding_side="left",
+            truncation_side="left",
+            trust_remote_code=trust_remote_code,
+        )
 
         try:
             generation_config = GenerationConfig.from_pretrained(
                 model_id, revision=revision, trust_remote_code=trust_remote_code