diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
index f7980d2d..3e8e67ab 100644
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -424,7 +424,7 @@ class FlashLlamaModel(torch.nn.Module):
                 FlashLlamaLayer(
                     index=0,
                     prefix=(
-                        "model.layers.0" if not prefix else "{prefix}.model.layers.0"
+                        "model.layers.0" if not prefix else f"{prefix}.model.layers.0"
                     ),
                     config=config,
                     weights=weights,
diff --git a/server/text_generation_server/models/custom_modeling/idefics2.py b/server/text_generation_server/models/custom_modeling/idefics2.py
index 735c3899..f540b7af 100644
--- a/server/text_generation_server/models/custom_modeling/idefics2.py
+++ b/server/text_generation_server/models/custom_modeling/idefics2.py
@@ -832,6 +832,7 @@ class Idefics2ForConditionalGeneration(nn.Module):
             max_s=max_s,
             true_max_s=max_s,
             prefill_cache_indices=None,
+            adapter_data=adapter_data,
         )
         if lm_head_indices is not None:
             hidden_states = hidden_states[lm_head_indices]
diff --git a/server/text_generation_server/models/custom_modeling/llava_next.py b/server/text_generation_server/models/custom_modeling/llava_next.py
index 567131ef..e154d805 100644
--- a/server/text_generation_server/models/custom_modeling/llava_next.py
+++ b/server/text_generation_server/models/custom_modeling/llava_next.py
@@ -280,6 +280,7 @@ class LlavaNextForConditionalGeneration(nn.Module):
             max_s=max_s,
             true_max_s=max_s,
             prefill_cache_indices=None,
+            adapter_data=adapter_data,
         )
         if lm_head_indices is not None:
             hidden_states = hidden_states[lm_head_indices]
diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py
index 308d5a3d..7de54aa4 100644
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@@ -14,6 +14,7 @@ from text_generation_server.models.flash_causal_lm import (
 )
 from text_generation_server.utils.log import log_master
 from transformers import AutoProcessor
+from text_generation_server.layers.attention import Seqlen

 tracer = trace.get_tracer(__name__)

@@ -348,6 +349,7 @@ class VlmCausalLM(FlashCausalLM):
         else:
             cuda_graph = None
         if cu_seqlen_prefill is not None or cuda_graph is None:
+            input_lengths = Seqlen(input_lengths=input_lengths)
            logits, speculative_logits = self.model.forward(
                input_ids=input_ids,
                position_ids=position_ids,
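
A note on the first hunk: without the `f` prefix, `"{prefix}.model.layers.0"` is a plain string literal, so the weight prefix is never interpolated and the layer would be looked up under the literal key `{prefix}.model.layers.0`. A minimal standalone sketch of the difference (the `prefix` value here is hypothetical, chosen only for illustration):

```python
prefix = "language_model"  # hypothetical prefix, for illustration only

# Before the fix: a plain string literal, "{prefix}" is never substituted.
broken = "model.layers.0" if not prefix else "{prefix}.model.layers.0"
print(broken)  # -> {prefix}.model.layers.0

# After the fix: an f-string interpolates the runtime value of `prefix`.
fixed = "model.layers.0" if not prefix else f"{prefix}.model.layers.0"
print(fixed)  # -> language_model.model.layers.0
```

The remaining hunks keep the VLM paths in step with the text-model forward signature: `adapter_data` is now forwarded through `Idefics2ForConditionalGeneration` and `LlavaNextForConditionalGeneration`, and `VlmCausalLM` wraps `input_lengths` in `Seqlen` before calling `self.model.forward`, mirroring how the diff itself uses `Seqlen(input_lengths=input_lengths)`.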