commit 5016e38f5a
Author: drbh, 2025-06-13 14:58:24 +02:00 (committed by GitHub)
6 changed files with 94 additions and 63 deletions

View File

@@ -9,61 +9,61 @@
   "tokens": [
     {
       "id": 13,
-      "logprob": -0.007621765,
+      "logprob": -0.052612305,
       "special": false,
       "text": "\n"
     },
     {
       "id": 13,
-      "logprob": -0.20812988,
+      "logprob": -0.079589844,
       "special": false,
       "text": "\n"
     },
     {
       "id": 16114,
-      "logprob": -1.2587891,
+      "logprob": -1.6865234,
       "special": false,
       "text": "Once"
     },
     {
       "id": 3714,
-      "logprob": -0.20825195,
+      "logprob": -0.20983887,
       "special": false,
       "text": " upon"
     },
     {
       "id": 264,
-      "logprob": -0.0017709732,
+      "logprob": -0.0014019012,
       "special": false,
       "text": " a"
     },
     {
       "id": 727,
-      "logprob": -0.011932373,
+      "logprob": -0.0121154785,
       "special": false,
       "text": " time"
     },
     {
       "id": 28725,
-      "logprob": -0.17297363,
+      "logprob": -0.15405273,
       "special": false,
       "text": ","
     },
     {
       "id": 736,
-      "logprob": -0.9057617,
+      "logprob": -0.4802246,
       "special": false,
       "text": " there"
     },
     {
       "id": 403,
-      "logprob": -0.05758667,
+      "logprob": -0.03289795,
       "special": false,
       "text": " was"
     },
     {
       "id": 264,
-      "logprob": -0.00970459,
+      "logprob": -0.01423645,
       "special": false,
       "text": " a"
     }
@@ -82,61 +82,61 @@
   "tokens": [
     {
       "id": 13,
-      "logprob": -0.007621765,
+      "logprob": -0.052612305,
       "special": false,
       "text": "\n"
     },
     {
       "id": 13,
-      "logprob": -0.20275879,
+      "logprob": -0.07946777,
       "special": false,
       "text": "\n"
     },
     {
       "id": 16114,
-      "logprob": -1.2578125,
+      "logprob": -1.6914062,
       "special": false,
       "text": "Once"
     },
     {
       "id": 3714,
-      "logprob": -0.2084961,
+      "logprob": -0.21020508,
       "special": false,
       "text": " upon"
     },
     {
       "id": 264,
-      "logprob": -0.0017738342,
+      "logprob": -0.0014238358,
       "special": false,
       "text": " a"
     },
     {
       "id": 727,
-      "logprob": -0.011932373,
+      "logprob": -0.012138367,
       "special": false,
       "text": " time"
     },
     {
       "id": 28725,
-      "logprob": -0.17297363,
+      "logprob": -0.15625,
       "special": false,
       "text": ","
     },
     {
       "id": 736,
-      "logprob": -0.9057617,
+      "logprob": -0.47827148,
       "special": false,
       "text": " there"
     },
     {
       "id": 403,
-      "logprob": -0.05758667,
+      "logprob": -0.03289795,
       "special": false,
       "text": " was"
     },
     {
       "id": 264,
-      "logprob": -0.00970459,
+      "logprob": -0.01423645,
       "special": false,
       "text": " a"
     }
@@ -155,61 +155,61 @@
   "tokens": [
     {
       "id": 13,
-      "logprob": -0.007621765,
+      "logprob": -0.052246094,
       "special": false,
       "text": "\n"
     },
     {
       "id": 13,
-      "logprob": -0.20275879,
+      "logprob": -0.07739258,
       "special": false,
       "text": "\n"
     },
     {
       "id": 16114,
-      "logprob": -1.2578125,
+      "logprob": -1.6875,
       "special": false,
       "text": "Once"
     },
     {
       "id": 3714,
-      "logprob": -0.2084961,
+      "logprob": -0.20922852,
       "special": false,
       "text": " upon"
     },
     {
       "id": 264,
-      "logprob": -0.0017738342,
+      "logprob": -0.0014228821,
       "special": false,
       "text": " a"
     },
     {
       "id": 727,
-      "logprob": -0.011932373,
+      "logprob": -0.012130737,
       "special": false,
       "text": " time"
     },
     {
       "id": 28725,
-      "logprob": -0.17297363,
+      "logprob": -0.15612793,
       "special": false,
       "text": ","
     },
     {
       "id": 736,
-      "logprob": -0.9057617,
+      "logprob": -0.47827148,
       "special": false,
       "text": " there"
     },
     {
       "id": 403,
-      "logprob": -0.05758667,
+      "logprob": -0.032928467,
       "special": false,
       "text": " was"
     },
     {
       "id": 264,
-      "logprob": -0.00970459,
+      "logprob": -0.014144897,
       "special": false,
       "text": " a"
     }
@@ -228,61 +228,61 @@
   "tokens": [
     {
       "id": 13,
-      "logprob": -0.007621765,
+      "logprob": -0.052978516,
       "special": false,
       "text": "\n"
     },
     {
       "id": 13,
-      "logprob": -0.20812988,
+      "logprob": -0.080444336,
       "special": false,
       "text": "\n"
     },
     {
       "id": 16114,
-      "logprob": -1.2587891,
+      "logprob": -1.6826172,
       "special": false,
       "text": "Once"
     },
     {
       "id": 3714,
-      "logprob": -0.20825195,
+      "logprob": -0.21044922,
       "special": false,
       "text": " upon"
     },
     {
       "id": 264,
-      "logprob": -0.0017709732,
+      "logprob": -0.0014238358,
       "special": false,
       "text": " a"
     },
     {
       "id": 727,
-      "logprob": -0.011932373,
+      "logprob": -0.012107849,
       "special": false,
       "text": " time"
     },
     {
       "id": 28725,
-      "logprob": -0.17297363,
+      "logprob": -0.15405273,
       "special": false,
       "text": ","
     },
     {
       "id": 736,
-      "logprob": -0.9057617,
+      "logprob": -0.47875977,
       "special": false,
       "text": " there"
     },
     {
       "id": 403,
-      "logprob": -0.05758667,
+      "logprob": -0.03289795,
       "special": false,
       "text": " was"
     },
     {
       "id": 264,
-      "logprob": -0.00970459,
+      "logprob": -0.01423645,
       "special": false,
       "text": " a"
     }

View File

@@ -8,61 +8,61 @@
   "tokens": [
     {
       "id": 13,
-      "logprob": -0.00756073,
+      "logprob": -0.052612305,
       "special": false,
       "text": "\n"
     },
     {
       "id": 13,
-      "logprob": -0.20117188,
+      "logprob": -0.07739258,
       "special": false,
       "text": "\n"
     },
     {
       "id": 16114,
-      "logprob": -1.2597656,
+      "logprob": -1.6914062,
       "special": false,
       "text": "Once"
     },
     {
       "id": 3714,
-      "logprob": -0.20825195,
+      "logprob": -0.21020508,
       "special": false,
       "text": " upon"
     },
     {
       "id": 264,
-      "logprob": -0.00178051,
+      "logprob": -0.0014228821,
       "special": false,
       "text": " a"
     },
     {
       "id": 727,
-      "logprob": -0.011955261,
+      "logprob": -0.012123108,
       "special": false,
       "text": " time"
     },
     {
       "id": 28725,
-      "logprob": -0.17541504,
+      "logprob": -0.15625,
       "special": false,
       "text": ","
     },
     {
       "id": 736,
-      "logprob": -0.91308594,
+      "logprob": -0.47875977,
       "special": false,
       "text": " there"
     },
     {
       "id": 403,
-      "logprob": -0.058410645,
+      "logprob": -0.033416748,
       "special": false,
       "text": " was"
     },
     {
       "id": 264,
-      "logprob": -0.009689331,
+      "logprob": -0.014137268,
       "special": false,
       "text": " a"
     }

View File

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Llava-NeXT model."""
+"""PyTorch Llava-NeXT model."""

 from typing import List, Optional, Tuple
@@ -115,12 +115,29 @@ class LlavaNextForConditionalGeneration(nn.Module):
         super().__init__()
         config.vision_config.quantize = config.quantize
         vision_config = config.vision_config
-        # Instead of selecting in hidden_states[-2].
-        # Instead compute only the n -2 + 1 layers and don't pool
-        if config.vision_feature_layer < 0:
-            vision_config.num_hidden_layers += config.vision_feature_layer + 1
-        else:
-            vision_config.num_hidden_layers = config.vision_feature_layer + 1
+        vision_feature_layer = []
+        # If the vision_feature_layer is an int, we assume it is the number of layers
+        if isinstance(config.vision_feature_layer, int):
+            # Instead of selecting in hidden_states[-2].
+            # Instead compute only the n -2 + 1 layers and don't pool
+            if config.vision_feature_layer < 0:
+                # vision_config.num_hidden_layers += config.vision_feature_layer + 1
+                num = vision_config.num_hidden_layers + config.vision_feature_layer + 1
+                vision_feature_layer = [num]
+            else:
+                # vision_config.num_hidden_layers = config.vision_feature_layer + 1
+                num_hidden_layers = [config.vision_feature_layer + 1]
+        elif isinstance(config.vision_feature_layer, list):
+            # If the vision_feature_layer is a list, we assume it is a list of layer indices
+            # and we select the hidden states at those layers
+            vision_feature_layer = config.vision_feature_layer
+        else:
+            vision_feature_layer = [vision_config.num_hidden_layers - 1]
+        self.vision_feature_layer = vision_feature_layer
         self.vision_tower = load_vision_model(
             prefix="vision_tower" if not prefix else f"{prefix}.vision_tower",
             config=config.vision_config,
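The block above folds every supported form of config.vision_feature_layer (negative int, non-negative int, list, or unset) into one list of layer indices kept on self.vision_feature_layer. Below is a minimal sketch of that normalization in isolation, assuming the non-negative int case is meant to pick a single layer just like the negative case; normalize_vision_feature_layer is a hypothetical name, not something this commit adds.

# Sketch only: hypothetical helper mirroring the constructor logic above.
from typing import List, Optional, Union


def normalize_vision_feature_layer(
    feature_layer: Optional[Union[int, List[int]]], num_hidden_layers: int
) -> List[int]:
    if isinstance(feature_layer, int):
        if feature_layer < 0:
            # Negative values count back from the end, mirroring hidden_states[-2]
            return [num_hidden_layers + feature_layer + 1]
        # Assumed intent: a non-negative int also selects a single layer
        return [feature_layer + 1]
    if isinstance(feature_layer, list):
        # An explicit list of layer indices is used as-is
        return feature_layer
    # Nothing configured: fall back to the final layer
    return [num_hidden_layers - 1]


print(normalize_vision_feature_layer(-2, 27))              # [26]
print(normalize_vision_feature_layer([3, 7, 15, 26], 27))  # [3, 7, 15, 26]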
@@ -194,6 +211,14 @@ class LlavaNextForConditionalGeneration(nn.Module):
                 f"Strategy `{self.config.vision_feature_select_strategy}` is not supported/valid."
             )

+        if image_features.hidden_states is not None:
+            # vision_feature_layer is a list of layer indices, we select the hidden states at those layers
+            hs_pool = [
+                image_features.hidden_states[layer_idx]
+                for layer_idx in self.vision_feature_layer
+            ]
+            selected_image_feature = torch.cat(hs_pool, dim=-1)
+
         image_features = self.multi_modal_projector(selected_image_feature)

         # split up image_features for each of the individual images
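With vision_feature_layer stored as a list, the forward pass now gathers the hidden state of each listed layer and concatenates them along the last dimension before the multi-modal projector. A standalone illustration with dummy tensors, just to show the shape effect; the sizes are illustrative and not taken from any particular checkpoint.

import torch

num_layers = 27
# One hidden state per encoder layer: (batch, patches, hidden) = (2, 576, 1024)
hidden_states = [torch.randn(2, 576, 1024) for _ in range(num_layers)]
vision_feature_layer = [3, 7, 15, 26]

hs_pool = [hidden_states[layer_idx] for layer_idx in vision_feature_layer]
selected_image_feature = torch.cat(hs_pool, dim=-1)

# Concatenating k layers multiplies the feature width by k, so the
# multi-modal projector's input size has to account for that.
print(selected_image_feature.shape)  # torch.Size([2, 576, 4096])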

View File

@@ -358,6 +358,8 @@ class SiglipEncoder(nn.Module):
                 for i in range(config.num_hidden_layers)
             ]
         )
+        # Pre-allocate reusable list to avoid memory allocation during forward pass
+        self._hidden_states_buffer = [None] * config.num_hidden_layers

     def forward(
         self,
@@ -365,13 +367,15 @@
         attention_mask: Optional[torch.Tensor] = None,
     ):
         hidden_states = inputs_embeds

         for idx, encoder_layer in enumerate(self.layers):
             hidden_states, _ = encoder_layer(
                 hidden_states,
                 attention_mask,
             )
+            self._hidden_states_buffer[idx] = hidden_states

-        return hidden_states
+        return self._hidden_states_buffer


 class SiglipVisionTransformer(nn.Module):
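The encoder now returns one tensor per layer instead of only the final hidden state, filling a list that was sized once in __init__. A toy version of the same pattern, with made-up module names and sizes rather than the actual Siglip classes:

import torch
from torch import nn


class ToyEncoder(nn.Module):
    def __init__(self, num_layers: int = 4, dim: int = 8):
        super().__init__()
        self.layers = nn.ModuleList(nn.Linear(dim, dim) for _ in range(num_layers))
        # Entry i holds the output of layer i after forward()
        self._hidden_states_buffer = [None] * num_layers

    def forward(self, inputs_embeds: torch.Tensor):
        hidden_states = inputs_embeds
        for idx, layer in enumerate(self.layers):
            hidden_states = layer(hidden_states)
            self._hidden_states_buffer[idx] = hidden_states
        return self._hidden_states_buffer


outputs = ToyEncoder()(torch.randn(2, 8))
print(len(outputs), outputs[-1].shape)  # 4 torch.Size([2, 8])

Since the same list object is reused on every call, a caller that needs to keep per-layer tensors across calls should copy the list (or the tensors) rather than hold on to the buffer itself.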
@@ -393,18 +397,22 @@ class SiglipVisionTransformer(nn.Module):
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")

+        # make sure the pixel values are the correct dtype
+        pixel_values = pixel_values.to(
+            dtype=self.embeddings.patch_embedding.weight.dtype
+        )
         hidden_states = self.embeddings(pixel_values)

         # NOTE: up until this point, the code logits are exactly
         # the same as the transformers code. The values evaulate
         # slightly differently in our encoder layer.
-        encoder_outputs = self.encoder(
+        all_encoder_outputs = self.encoder(
             inputs_embeds=hidden_states,
         )
-        last_hidden_state = encoder_outputs
+        last_hidden_state = all_encoder_outputs[-1]

         return BaseModelOutputWithPooling(
             last_hidden_state=last_hidden_state,
             # pooler_output=pooled_output,
-            # hidden_states=encoder_outputs,
+            hidden_states=all_encoder_outputs,
         )
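The added cast makes the pixel values follow the dtype of the patch embedding weights, which matters because convolution layers require input and weight dtypes to match once the vision tower is loaded in half precision. A small CPU-safe illustration (no half-precision compute is actually run); the layer and image sizes are arbitrary.

import torch
from torch import nn

patch_embedding = nn.Conv2d(3, 16, kernel_size=14, stride=14).to(torch.float16)
pixel_values = torch.randn(1, 3, 224, 224)  # image processors typically emit float32

print(pixel_values.dtype, patch_embedding.weight.dtype)  # torch.float32 torch.float16

# Same cast as above: follow the weight dtype before running the convolution
pixel_values = pixel_values.to(dtype=patch_embedding.weight.dtype)
print(pixel_values.dtype)  # torch.float16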

View File

@@ -1,5 +1,5 @@
 def load_text_model(prefix, config, weights, name=None):
-    if config.model_type == "llama":
+    if config.model_type == "llama" or config.model_type == "granite":
         from text_generation_server.models.custom_modeling.flash_llama_modeling import (
             FlashLlamaForCausalLM,
         )

View File

@@ -247,8 +247,6 @@ def get_number_of_features(height: int, width: int, config) -> int:
     image_size = config.vision_config.image_size
     patch_size = config.vision_config.patch_size

-    assert image_size % patch_size == 0
-
     npatches = image_size // patch_size

     # Dimensions are intentionally swapped to be bug-compatible with
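The dropped assertion required the tower's image size to be an exact multiple of the patch size; npatches now simply floors the division, which presumably accommodates vision configs where the division is not exact (for example SigLIP-style towers with patch size 14 and image size 384). A quick arithmetic check with such illustrative values:

image_size, patch_size = 384, 14

print(image_size % patch_size == 0)  # False, the removed assert would have failed
print(image_size // patch_size)      # 27 patches per side after flooring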