Fixing CodeLlama loads by relying purely on `AutoTokenizer`.

- The slow-tokenizer default dates back to the Llama 1 release, when
  `tokenizers` did not yet support all of the required flags (see the
  sketch after this list).

- Fixes #1891
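- A minimal sketch (not part of this commit) of what the change relies on:
  with a recent `transformers`/`tokenizers` install, `AutoTokenizer` resolves
  CodeLlama checkpoints to the fast, Rust-backed tokenizer directly, so the
  old `LlamaTokenizer` fallback is unnecessary. The checkpoint id below is
  illustrative and not taken from the commit.

      from transformers import AutoTokenizer

      # Illustrative CodeLlama checkpoint; any CodeLlama repo id behaves the same way.
      tokenizer = AutoTokenizer.from_pretrained(
          "codellama/CodeLlama-7b-hf",
          padding_side="left",
          truncation_side="left",
      )
      assert tokenizer.is_fast  # the fast tokenizer now handles the flags above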
Nicolas Patry 2024-05-24 10:02:35 +00:00
parent f41d644a90
commit ad0b36bd28


@@ -3,7 +3,6 @@ import torch.distributed
 
 from opentelemetry import trace
 from transformers import AutoConfig, AutoTokenizer, GenerationConfig
-from transformers.models.llama import LlamaTokenizer
 from typing import Optional
 
 from text_generation_server.models import FlashCausalLM
@@ -41,22 +40,13 @@ class FlashLlama(FlashCausalLM):
         else:
             raise NotImplementedError("FlashLlama is only available on GPU")
 
-        try:
-            tokenizer = LlamaTokenizer.from_pretrained(
-                model_id,
-                revision=revision,
-                padding_side="left",
-                truncation_side="left",
-                trust_remote_code=trust_remote_code,
-            )
-        except Exception:
-            tokenizer = AutoTokenizer.from_pretrained(
-                model_id,
-                revision=revision,
-                padding_side="left",
-                truncation_side="left",
-                trust_remote_code=trust_remote_code,
-            )
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id,
+            revision=revision,
+            padding_side="left",
+            truncation_side="left",
+            trust_remote_code=trust_remote_code,
+        )
         try:
             generation_config = GenerationConfig.from_pretrained(
                 model_id, revision=revision, trust_remote_code=trust_remote_code