diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py
index 92f3c51c..4b506532 100644
--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -380,6 +380,8 @@ def get_model(
     transformers_model_class = getattr(transformers, modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[model_type])
     if transformers_model_class.is_backend_compatible():
         transformers_causal_lm_class = TransformersFlashCausalLM
+        if not FLASH_ATTENTION and lora_adapter_ids is not None and len(lora_adapter_ids) > 0:
+            raise ValueError("Transformers backend AutoModel does not support `lora_adapter_ids`.")
 
     quantization_config = config_dict.get("quantization_config", None)
     if quantization_config is None:
diff --git a/server/text_generation_server/models/transformers_flash_causal_lm.py b/server/text_generation_server/models/transformers_flash_causal_lm.py
index 42ec1b3f..98fbf9a2 100644
--- a/server/text_generation_server/models/transformers_flash_causal_lm.py
+++ b/server/text_generation_server/models/transformers_flash_causal_lm.py
@@ -48,7 +48,7 @@ def tgi_flash_attention_forward(
     softmax_scale: Optional[float] = None,
     sliding_window: Optional[int] = None,
     softcap: Optional[float] = None,
-    **kwargs,  # This is needed to "absorb" other args passed by Transformers modeling
+    **_kwargs,  # This is needed to "absorb" other args passed by Transformers modeling
 ):
     kv_cache = kv_cache[module.layer_idx]