Fix: don't apply post layernorm in SiglipVisionTransformer
This fixes a bug with LLaVA Next when using Siglip as the vision model. LLaVA Next expects the output of the vision model to be the encoder outputs before the post layernorm (see the original transformers implementation: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_next/modeling_llava_next.py#L813). This also makes Siglip consistent with the existing Clip implementation: https://github.com/huggingface/text-generation-inference/blob/main/server/text_generation_server/models/custom_modeling/clip.py#L613
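For context, the consumer-side feature selection in the linked transformers implementation looks roughly like the sketch below (the helper name select_vision_features is illustrative, not code from this diff; vision_feature_layer and the strategy names follow the upstream config). LLaVA Next selects an intermediate encoder hidden state, and none of the collected hidden states have the post layernorm applied, so a vision tower that normalizes the features it returns would feed the multi-modal projector activations it was never trained on.

import torch

def select_vision_features(
    hidden_states: list[torch.Tensor],
    vision_feature_layer: int = -2,
    vision_feature_select_strategy: str = "default",
) -> torch.Tensor:
    # Sketch of the transformers LLaVA Next selection logic: pick an
    # intermediate encoder hidden state (default: second-to-last layer).
    # None of these tensors have the post layernorm applied.
    selected = hidden_states[vision_feature_layer]
    if vision_feature_select_strategy == "default":
        # CLIP-style towers prepend a class token; drop it here.
        # (Siglip has no class token, so "full" is used instead.)
        selected = selected[:, 1:]
    return selected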
parent f3c5d7d92f
commit b84303e2e9
@@ -386,11 +386,11 @@ class SiglipVisionTransformer(nn.Module):
         self.encoder = SiglipEncoder(
             prefix=f"{prefix}.encoder", config=config, weights=weights
         )
-        self.post_layernorm = nn.LayerNorm.load(
-            prefix=f"{prefix}.post_layernorm",
-            weights=weights,
-            eps=config.layer_norm_eps,
-        )
+        # self.post_layernorm = nn.LayerNorm.load(
+        #     prefix=f"{prefix}.post_layernorm",
+        #     weights=weights,
+        #     eps=config.layer_norm_eps,
+        # )
 
     def forward(
         self,
@@ -412,10 +412,10 @@ class SiglipVisionTransformer(nn.Module):
             inputs_embeds=hidden_states,
         )
         last_hidden_state = encoder_outputs
-        post_last_hidden_state = self.post_layernorm(last_hidden_state)
+        # post_last_hidden_state = self.post_layernorm(last_hidden_state)
 
         return BaseModelOutputWithPooling(
-            last_hidden_state=post_last_hidden_state,
+            last_hidden_state=last_hidden_state,
             # pooler_output=pooled_output,
             # hidden_states=encoder_outputs,
         )
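A quick way to see the difference on the reference model (a hypothetical check, not part of this commit; the checkpoint name and dummy input are placeholders):

import torch
from transformers import SiglipVisionModel

model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
pixel_values = torch.randn(1, 3, 224, 224)  # dummy image batch
with torch.no_grad():
    out = model(pixel_values, output_hidden_states=True)

# In transformers, last_hidden_state = post_layernorm(hidden_states[-1]).
# After this fix, TGI's SiglipVisionTransformer returns the pre-layernorm
# tensor, i.e. the equivalent of hidden_states[-1], not last_hidden_state.
assert not torch.allclose(out.hidden_states[-1], out.last_hidden_state)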