Merge 2204f91f32 into 3752143b39

2025-10-10 23:45:23 +00:00 · 2025-06-13 14:58:24 +02:00 · 2025-06-13 14:58:24 +02:00 · 5016e38f5a
commit 5016e38f5a
parent 3752143b39 2204f91f32
6 changed files with 94 additions and 63 deletions
--- a/integration-tests/models/snapshots/test_llava_next/test_flash_llava_next_load.json
+++ b/integration-tests/models/snapshots/test_llava_next/test_flash_llava_next_load.json
@ -9,61 +9,61 @@
      "tokens": [
        {
          "id": 13,
-          "logprob": -0.007621765,
+          "logprob": -0.052612305,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
-          "logprob": -0.20812988,
+          "logprob": -0.079589844,
          "special": false,
          "text": "\n"
        },
        {
          "id": 16114,
-          "logprob": -1.2587891,
+          "logprob": -1.6865234,
          "special": false,
          "text": "Once"
        },
        {
          "id": 3714,
-          "logprob": -0.20825195,
+          "logprob": -0.20983887,
          "special": false,
          "text": " upon"
        },
        {
          "id": 264,
-          "logprob": -0.0017709732,
+          "logprob": -0.0014019012,
          "special": false,
          "text": " a"
        },
        {
          "id": 727,
-          "logprob": -0.011932373,
+          "logprob": -0.0121154785,
          "special": false,
          "text": " time"
        },
        {
          "id": 28725,
-          "logprob": -0.17297363,
+          "logprob": -0.15405273,
          "special": false,
          "text": ","
        },
        {
          "id": 736,
-          "logprob": -0.9057617,
+          "logprob": -0.4802246,
          "special": false,
          "text": " there"
        },
        {
          "id": 403,
-          "logprob": -0.05758667,
+          "logprob": -0.03289795,
          "special": false,
          "text": " was"
        },
        {
          "id": 264,
-          "logprob": -0.00970459,
+          "logprob": -0.01423645,
          "special": false,
          "text": " a"
        }
@ -82,61 +82,61 @@
      "tokens": [
        {
          "id": 13,
-          "logprob": -0.007621765,
+          "logprob": -0.052612305,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
-          "logprob": -0.20275879,
+          "logprob": -0.07946777,
          "special": false,
          "text": "\n"
        },
        {
          "id": 16114,
-          "logprob": -1.2578125,
+          "logprob": -1.6914062,
          "special": false,
          "text": "Once"
        },
        {
          "id": 3714,
-          "logprob": -0.2084961,
+          "logprob": -0.21020508,
          "special": false,
          "text": " upon"
        },
        {
          "id": 264,
-          "logprob": -0.0017738342,
+          "logprob": -0.0014238358,
          "special": false,
          "text": " a"
        },
        {
          "id": 727,
-          "logprob": -0.011932373,
+          "logprob": -0.012138367,
          "special": false,
          "text": " time"
        },
        {
          "id": 28725,
-          "logprob": -0.17297363,
+          "logprob": -0.15625,
          "special": false,
          "text": ","
        },
        {
          "id": 736,
-          "logprob": -0.9057617,
+          "logprob": -0.47827148,
          "special": false,
          "text": " there"
        },
        {
          "id": 403,
-          "logprob": -0.05758667,
+          "logprob": -0.03289795,
          "special": false,
          "text": " was"
        },
        {
          "id": 264,
-          "logprob": -0.00970459,
+          "logprob": -0.01423645,
          "special": false,
          "text": " a"
        }
@ -155,61 +155,61 @@
      "tokens": [
        {
          "id": 13,
-          "logprob": -0.007621765,
+          "logprob": -0.052246094,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
-          "logprob": -0.20275879,
+          "logprob": -0.07739258,
          "special": false,
          "text": "\n"
        },
        {
          "id": 16114,
-          "logprob": -1.2578125,
+          "logprob": -1.6875,
          "special": false,
          "text": "Once"
        },
        {
          "id": 3714,
-          "logprob": -0.2084961,
+          "logprob": -0.20922852,
          "special": false,
          "text": " upon"
        },
        {
          "id": 264,
-          "logprob": -0.0017738342,
+          "logprob": -0.0014228821,
          "special": false,
          "text": " a"
        },
        {
          "id": 727,
-          "logprob": -0.011932373,
+          "logprob": -0.012130737,
          "special": false,
          "text": " time"
        },
        {
          "id": 28725,
-          "logprob": -0.17297363,
+          "logprob": -0.15612793,
          "special": false,
          "text": ","
        },
        {
          "id": 736,
-          "logprob": -0.9057617,
+          "logprob": -0.47827148,
          "special": false,
          "text": " there"
        },
        {
          "id": 403,
-          "logprob": -0.05758667,
+          "logprob": -0.032928467,
          "special": false,
          "text": " was"
        },
        {
          "id": 264,
-          "logprob": -0.00970459,
+          "logprob": -0.014144897,
          "special": false,
          "text": " a"
        }
@ -228,61 +228,61 @@
      "tokens": [
        {
          "id": 13,
-          "logprob": -0.007621765,
+          "logprob": -0.052978516,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
-          "logprob": -0.20812988,
+          "logprob": -0.080444336,
          "special": false,
          "text": "\n"
        },
        {
          "id": 16114,
-          "logprob": -1.2587891,
+          "logprob": -1.6826172,
          "special": false,
          "text": "Once"
        },
        {
          "id": 3714,
-          "logprob": -0.20825195,
+          "logprob": -0.21044922,
          "special": false,
          "text": " upon"
        },
        {
          "id": 264,
-          "logprob": -0.0017709732,
+          "logprob": -0.0014238358,
          "special": false,
          "text": " a"
        },
        {
          "id": 727,
-          "logprob": -0.011932373,
+          "logprob": -0.012107849,
          "special": false,
          "text": " time"
        },
        {
          "id": 28725,
-          "logprob": -0.17297363,
+          "logprob": -0.15405273,
          "special": false,
          "text": ","
        },
        {
          "id": 736,
-          "logprob": -0.9057617,
+          "logprob": -0.47875977,
          "special": false,
          "text": " there"
        },
        {
          "id": 403,
-          "logprob": -0.05758667,
+          "logprob": -0.03289795,
          "special": false,
          "text": " was"
        },
        {
          "id": 264,
-          "logprob": -0.00970459,
+          "logprob": -0.01423645,
          "special": false,
          "text": " a"
        }
--- a/integration-tests/models/snapshots/test_llava_next/test_flash_llava_next_simple.json
+++ b/integration-tests/models/snapshots/test_llava_next/test_flash_llava_next_simple.json
@ -8,61 +8,61 @@
    "tokens": [
      {
        "id": 13,
-        "logprob": -0.00756073,
+        "logprob": -0.052612305,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
-        "logprob": -0.20117188,
+        "logprob": -0.07739258,
        "special": false,
        "text": "\n"
      },
      {
        "id": 16114,
-        "logprob": -1.2597656,
+        "logprob": -1.6914062,
        "special": false,
        "text": "Once"
      },
      {
        "id": 3714,
-        "logprob": -0.20825195,
+        "logprob": -0.21020508,
        "special": false,
        "text": " upon"
      },
      {
        "id": 264,
-        "logprob": -0.00178051,
+        "logprob": -0.0014228821,
        "special": false,
        "text": " a"
      },
      {
        "id": 727,
-        "logprob": -0.011955261,
+        "logprob": -0.012123108,
        "special": false,
        "text": " time"
      },
      {
        "id": 28725,
-        "logprob": -0.17541504,
+        "logprob": -0.15625,
        "special": false,
        "text": ","
      },
      {
        "id": 736,
-        "logprob": -0.91308594,
+        "logprob": -0.47875977,
        "special": false,
        "text": " there"
      },
      {
        "id": 403,
-        "logprob": -0.058410645,
+        "logprob": -0.033416748,
        "special": false,
        "text": " was"
      },
      {
        "id": 264,
-        "logprob": -0.009689331,
+        "logprob": -0.014137268,
        "special": false,
        "text": " a"
      }
--- a/server/text_generation_server/models/custom_modeling/llava_next.py
+++ b/server/text_generation_server/models/custom_modeling/llava_next.py
@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Llava-NeXT model."""
+"""PyTorch Llava-NeXT model."""

 from typing import List, Optional, Tuple

@ -115,12 +115,29 @@ class LlavaNextForConditionalGeneration(nn.Module):
        super().__init__()
        config.vision_config.quantize = config.quantize
        vision_config = config.vision_config
-        # Instead of selecting in hidden_states[-2].
-        # Instead compute only the n -2 + 1 layers and don't pool
-        if config.vision_feature_layer < 0:
-            vision_config.num_hidden_layers += config.vision_feature_layer + 1
+
+        vision_feature_layer = []
+        # If vision_feature_layer is an int, treat it as a single layer index (negative values are offset from num_hidden_layers)
+        if isinstance(config.vision_feature_layer, int):
+            # Instead of selecting in hidden_states[-2].
+            # Instead compute only the n -2 + 1 layers and don't pool
+            if config.vision_feature_layer < 0:
+                # vision_config.num_hidden_layers += config.vision_feature_layer + 1
+                num = vision_config.num_hidden_layers + config.vision_feature_layer + 1
+                vision_feature_layer = [num]
+            else:
+                # vision_config.num_hidden_layers = config.vision_feature_layer + 1
+                num_hidden_layers = [config.vision_feature_layer + 1]
+        elif isinstance(config.vision_feature_layer, list):
+            # If the vision_feature_layer is a list, we assume it is a list of layer indices
+            # and we select the hidden states at those layers
+
+            vision_feature_layer = config.vision_feature_layer
        else:
-            vision_config.num_hidden_layers = config.vision_feature_layer + 1
+            vision_feature_layer = [vision_config.num_hidden_layers - 1]
+
+        self.vision_feature_layer = vision_feature_layer
+
        self.vision_tower = load_vision_model(
            prefix="vision_tower" if not prefix else f"{prefix}.vision_tower",
            config=config.vision_config,
@ -194,6 +211,14 @@ class LlavaNextForConditionalGeneration(nn.Module):
                f"Strategy `{self.config.vision_feature_select_strategy}` is not supported/valid."
            )

+        if image_features.hidden_states is not None:
+            # vision_feature_layer is a list of layer indices, we select the hidden states at those layers
+            hs_pool = [
+                image_features.hidden_states[layer_idx]
+                for layer_idx in self.vision_feature_layer
+            ]
+            selected_image_feature = torch.cat(hs_pool, dim=-1)
+
        image_features = self.multi_modal_projector(selected_image_feature)

        # split up image_features for each of the individual images
--- a/server/text_generation_server/models/custom_modeling/siglip.py
+++ b/server/text_generation_server/models/custom_modeling/siglip.py
@ -358,6 +358,8 @@ class SiglipEncoder(nn.Module):
                for i in range(config.num_hidden_layers)
            ]
        )
+        # Pre-allocate reusable list to avoid memory allocation during forward pass
+        self._hidden_states_buffer = [None] * config.num_hidden_layers

    def forward(
        self,
@ -365,13 +367,15 @@ class SiglipEncoder(nn.Module):
        attention_mask: Optional[torch.Tensor] = None,
    ):
        hidden_states = inputs_embeds
+
        for idx, encoder_layer in enumerate(self.layers):
            hidden_states, _ = encoder_layer(
                hidden_states,
                attention_mask,
            )
+            self._hidden_states_buffer[idx] = hidden_states

-        return hidden_states
+        return self._hidden_states_buffer


 class SiglipVisionTransformer(nn.Module):
@ -393,18 +397,22 @@ class SiglipVisionTransformer(nn.Module):
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

+        # make sure the pixel values are the correct dtype
+        pixel_values = pixel_values.to(
+            dtype=self.embeddings.patch_embedding.weight.dtype
+        )
        hidden_states = self.embeddings(pixel_values)

        # NOTE: up until this point, the code logits are exactly
        # the same as the transformers code. The values evaluate
        # slightly differently in our encoder layer.
-        encoder_outputs = self.encoder(
+        all_encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
        )
-        last_hidden_state = encoder_outputs
+        last_hidden_state = all_encoder_outputs[-1]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            # pooler_output=pooled_output,
-            # hidden_states=encoder_outputs,
+            hidden_states=all_encoder_outputs,
        )
--- a/server/text_generation_server/models/custom_modeling/vlm.py
+++ b/server/text_generation_server/models/custom_modeling/vlm.py
@ -1,5 +1,5 @@
 def load_text_model(prefix, config, weights, name=None):
-    if config.model_type == "llama":
+    if config.model_type == "llama" or config.model_type == "granite":
        from text_generation_server.models.custom_modeling.flash_llama_modeling import (
            FlashLlamaForCausalLM,
        )
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@ -247,8 +247,6 @@ def get_number_of_features(height: int, width: int, config) -> int:
    image_size = config.vision_config.image_size
    patch_size = config.vision_config.patch_size

-    assert image_size % patch_size == 0
-
    npatches = image_size // patch_size

    # Dimensions are intentionally swapped to be bug-compatible with