Fixing codellama loads by using AutoTokenizer only.

- The slow-tokenizer default dates back to when Llama 1 was introduced and
  not all of its flags were supported in `tokenizers` yet; the fast
  `AutoTokenizer` path now covers them (see the sketch below).

- Fixes #1891
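
As a minimal sketch of the behaviour this change relies on (the model id below is only an illustrative CodeLlama checkpoint, not part of this diff), `AutoTokenizer.from_pretrained` resolves directly to the fast `tokenizers`-backed implementation and accepts the same padding/truncation flags the server passes:

```python
from transformers import AutoTokenizer

# Illustrative checkpoint; any Llama/CodeLlama model id behaves the same way.
tokenizer = AutoTokenizer.from_pretrained(
    "codellama/CodeLlama-7b-hf",
    padding_side="left",      # decoder-only models are padded on the left for batching
    truncation_side="left",   # keep the most recent tokens when the prompt is too long
)

# With `tokenizers` installed this returns the fast (Rust-backed) tokenizer,
# e.g. CodeLlamaTokenizerFast, so the slow LlamaTokenizer fallback is unnecessary.
print(type(tokenizer).__name__)
```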
Nicolas Patry 2024-05-24 10:02:35 +00:00
parent f41d644a90
commit ad0b36bd28


@@ -3,7 +3,6 @@ import torch.distributed
 
 from opentelemetry import trace
 from transformers import AutoConfig, AutoTokenizer, GenerationConfig
-from transformers.models.llama import LlamaTokenizer
 from typing import Optional
 
 from text_generation_server.models import FlashCausalLM
@@ -41,22 +40,13 @@ class FlashLlama(FlashCausalLM):
         else:
             raise NotImplementedError("FlashLlama is only available on GPU")
 
-        try:
-            tokenizer = LlamaTokenizer.from_pretrained(
-                model_id,
-                revision=revision,
-                padding_side="left",
-                truncation_side="left",
-                trust_remote_code=trust_remote_code,
-            )
-        except Exception:
-            tokenizer = AutoTokenizer.from_pretrained(
-                model_id,
-                revision=revision,
-                padding_side="left",
-                truncation_side="left",
-                trust_remote_code=trust_remote_code,
-            )
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id,
+            revision=revision,
+            padding_side="left",
+            truncation_side="left",
+            trust_remote_code=trust_remote_code,
+        )
 
         try:
             generation_config = GenerationConfig.from_pretrained(
                 model_id, revision=revision, trust_remote_code=trust_remote_code