Merge branch 'main' into add_vlm_chunking_optimized

2025-09-09 19:34:53 +00:00 · 2025-04-24 09:50:44 +00:00 · 2025-04-24 09:50:44 +00:00 · 8015f5f258
commit 8015f5f258
parent d58ec388bf 375802948d
16 changed files with 1042 additions and 575 deletions
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@ -45,7 +45,7 @@ jobs:
        uses: actions/github-script@v7
        with:
          script: |
-            core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || '');
+            core.exportVariable('ACTIONS_RESULTS_URL', process.env.ACTIONS_RESULTS_URL || '');
            core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');

      - name: Extract TensorRT-LLM version
@ -223,7 +223,7 @@ jobs:
            PLATFORM=${{ env.PLATFORM }}
            build_type=${{ env.BUILD_TYPE }}
            sccache_gha_enabled=on
-            actions_cache_url=${{ env.ACTIONS_CACHE_URL }}
+            actions_results_url=${{ env.ACTIONS_RESULTS_URL }}
            actions_runtime_token=${{ env.ACTIONS_RUNTIME_TOKEN }}
          target: ${{ env.TARGET }}
          tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
--- a/9
+++ b/9
@ -3,10 +3,9 @@ ARG cuda_base=12.8.0
 ARG build_type=release
 ARG ompi_version=4.1.7
 ARG sccache_gha_enabled=off
-ARG actions_cache_url=""
+ARG actions_results_url=""
 ARG actions_runtime_token=""

-
 # CUDA dependent dependencies resolver stage
 FROM nvidia/cuda:${cuda_base}-cudnn-devel-ubuntu24.04 AS cuda-builder

@ -66,7 +65,7 @@ WORKDIR /usr/src/text-generation-inference
 ARG cuda_arch_list
 ARG build_type
 ARG sccache_gha_enabled
-ARG actions_cache_url
+ARG actions_results_url
 ARG actions_runtime_token

 # Install Rust
@ -74,7 +73,7 @@ ENV PATH="/root/.cargo/bin:$PATH"
 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.85.1 --profile minimal -y && \
    chmod -R a+w /root/.rustup && \
    chmod -R a+w /root/.cargo && \
-    cargo install sccache --locked
+    cargo install sccache --version ">=0.10.0" --locked

 ENV LD_LIBRARY_PATH="/usr/local/mpi/lib:$LD_LIBRARY_PATH"
 ENV PKG_CONFIG_PATH="/usr/local/mpi/lib/pkgconfig"
@ -85,7 +84,7 @@ ENV CUDA_ARCH_LIST=${cuda_arch_list}

 # SCCACHE Specifics args - before finding a better, more generic, way...
 ENV SCCACHE_GHA_ENABLED=${sccache_gha_enabled}
-ENV ACTIONS_CACHE_URL=${actions_cache_url}
+ENV ACTIONS_RESULTS_URL=${actions_results_url}
 ENV ACTIONS_RUNTIME_TOKEN=${actions_runtime_token}

 COPY Cargo.lock Cargo.lock
--- a/backends/gaudi/server/text_generation_server/layers/attention/common.py
+++ b/backends/gaudi/server/text_generation_server/layers/attention/common.py
@ -13,7 +13,6 @@ class HPUPagedAttentionMetadata:
    block_list: Optional[torch.Tensor]
    block_mapping: Optional[torch.Tensor]
    block_usage: Optional[torch.Tensor]
-    block_scales: Optional[torch.Tensor]
    block_groups: Optional[torch.Tensor]
    attn_bias: Optional[torch.Tensor]

@ -66,7 +65,6 @@ def trim_attn_metadata(metadata: HPUPagedAttentionMetadata) -> object:
            "block_list",
            "block_mapping",
            "block_usage",
-            "block_scales",
            "block_groups",
            "attn_bias",
        ],
--- a/backends/gaudi/server/text_generation_server/layers/attention/hpu.py
+++ b/backends/gaudi/server/text_generation_server/layers/attention/hpu.py
@ -74,7 +74,6 @@ def paged_attention(
        block_list=hpu_attention_meta.block_list,
        block_mapping=hpu_attention_meta.block_mapping,
        block_bias=hpu_attention_meta.attn_bias,
-        block_scales=hpu_attention_meta.block_scales,
        block_groups=hpu_attention_meta.block_groups,
        scale=softmax_scale,
        matmul_qk_op=Matmul(),
--- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mllama.py
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mllama.py
@ -681,11 +681,10 @@ class MllamaTextCrossAttention(nn.Module):
        # bsz, q_len, _ = hidden_states.size()
        (
            cross_attention_states,
-            cu_seqlen_q,
-            cu_seqlen_k,
+            cross_attention_len,
            indices,
        ) = cross_attention_states
-        bs = cu_seqlen_q.size(0) - 1
+        bs = cross_attention_len.size(0)
        query_states = self.q_proj(hidden_states)
        query_states = query_states.view(bs, -1, self.num_heads, self.head_size)
        query_states = self.q_norm(query_states)
@ -814,8 +813,6 @@ class FlashLlamaCrossLayer(torch.nn.Module):

        indices = cross_attention_states[-1]
        out_hidden_states = hidden_states[:]
-        if len(indices) > 0:
-            assert max(indices) < hidden_states.shape[0]
        hidden_states = hidden_states[indices]
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
@ -914,59 +911,14 @@ class FlashMllamaForConditionalGeneration(nn.Module):
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor],
        adapter_data: Optional[torch.Tensor] = None,
-        # XXX: Putting these as optional so that the cuda warmup calls can go through.
        cross_attention_states: Optional[torch.Tensor] = None,
-        image_indices=None,
+        indices=None,
+        cross_attention_len: Optional[torch.Tensor] = None,
    ):
        if cross_attention_states is not None:
-            seqlen_q = len(image_indices)
-            n_images = cross_attention_states.shape[0]
-            seqlen_k = cross_attention_states.shape[1]
-            device = cross_attention_states.device
-            if cu_seqlen_prefill is not None:
-                offset = 0
-                cu_q = []
-                indices = []
-                for index in image_indices:
-                    cu_q.append(offset)
-                    length = seqlen.input_lengths[index].item()
-                    assert index < seqlen.cu_seqlen_q.shape[0]
-                    input_ids_offset = seqlen.cu_seqlen_q[index]
-                    indices.extend(range(input_ids_offset, input_ids_offset + length))
-                    offset += length
-                cu_q.append(offset)
-                cu_seqlen_q = torch.Tensor(cu_q).to(device=device, dtype=torch.int32)
-
-                assert max(indices) < input_ids.shape[0]
-
-                cu_seqlen_k = (
-                    torch.arange(
-                        n_images + 1,
-                        device=device,
-                        dtype=torch.int32,
-                    )
-                    * seqlen_k
-                )
-            else:
-                cu_seqlen_q = torch.arange(
-                    seqlen_q + 1, device=device, dtype=torch.int32
-                )
-                seqlen_k = cross_attention_states.shape[1]
-                n_images = cross_attention_states.shape[0]
-                cu_seqlen_k = (
-                    torch.arange(
-                        n_images + 1,
-                        device=device,
-                        dtype=torch.int32,
-                    )
-                    * seqlen_k
-                )
-                indices = image_indices[:]
-
            cross_attention_states = (
                cross_attention_states,
-                cu_seqlen_q,
-                cu_seqlen_k,
+                cross_attention_len,
                indices,
            )

--- a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
--- a/backends/gaudi/server/text_generation_server/models/flash_vlm_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/flash_vlm_causal_lm.py
@ -11,13 +11,18 @@ from text_generation_server.pb import generate_pb2
 from text_generation_server.models.flash_causal_lm import (
    FlashCausalLMBatch,
    FlashCausalLM,
+    prepare_for_decode,
 )
-from text_generation_server.models.globals import PREFIX_CACHING
+from text_generation_server.models.globals import PREFIX_CACHING, BLOCK_SIZE
 from loguru import logger
 from text_generation_server.utils.log import log_master
 from transformers import AutoProcessor
 from text_generation_server.layers.attention import Seqlen, trim_seqlen_metadata
 import habana_frameworks.torch as htorch
+from text_generation_server.utils.import_utils import (
+    synchronize,
+)
+import torch.nn.functional as F

 tracer = trace.get_tracer(__name__)

@ -375,6 +380,91 @@ class FlashVlmCausalLM(FlashCausalLM):
    def max_past(self) -> Optional[int]:
        return getattr(self.model.text_model, "max_past", None)

+    def warmup_decode(
+        self, batch_size: int, block_num: int, batch: FlashVlmCausalLMBatch
+    ):
+        input_ids = torch.zeros(
+            batch_size, dtype=batch.input_ids.dtype, device=self.device
+        )
+        position_ids = torch.arange(
+            batch_size, dtype=batch.position_ids.dtype, device=self.device
+        )
+        if batch.position_ids is not None and batch.position_ids.dim() == 2:
+            # qwen2_vl and qwen2_5_vl case
+            position_ids = position_ids.unsqueeze(-1).repeat(
+                (1, batch.position_ids.shape[-1])
+            )
+        blocks = [block_num // batch_size for _ in range(batch_size)]
+        blocks[0] += block_num % batch_size
+        past_len = []
+        block_tables = []
+        slots = []
+        start_idx = 0
+
+        # fetch the last blocked to warmup block num
+        for i in range(batch_size):
+            block_array = list(range(start_idx, start_idx + blocks[i]))
+            slots.append(BLOCK_SIZE * block_array[-1] + BLOCK_SIZE - 1)
+            block_tables.append(block_array)
+            past_len.append(blocks[i] * BLOCK_SIZE - 1)
+            start_idx += blocks[i]
+        input_lengths = torch.ones(batch_size, dtype=torch.int32, device=self.device)
+        cache_lengths_tensor = torch.tensor(
+            past_len, dtype=torch.int32, device=self.device
+        )
+        cu_seqlen_prefill = torch.zeros(
+            batch_size + 1, device=self.device, dtype=torch.int32
+        )
+        torch.cumsum(input_lengths, -1, out=cu_seqlen_prefill[1:])
+
+        seqlen = Seqlen(
+            input_lengths=input_lengths,
+            cache_lengths=cache_lengths_tensor,
+            cu_seqlen_q=cu_seqlen_prefill,
+        )
+
+        hpu_attention_meta = prepare_for_decode(
+            self.dtype,
+            self.use_contiguous_pa,
+            self.device,
+            slots,
+            block_tables,
+            batch_size,
+            bucketing_ctx=None,
+        )
+        slots_tensor = torch.tensor(slots, dtype=batch.slots.dtype, device=self.device)
+        # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation.
+        self.model.forward(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            cu_seqlen_prefill=None,
+            kv_cache=self.kv_cache,
+            slots=slots_tensor,
+            seqlen=trim_seqlen_metadata(seqlen),
+            hpu_attention_meta=hpu_attention_meta,
+            lm_head_indices=None,
+            pixel_values=None,
+            pixel_attention_mask=None,
+            image_sizes=None,
+            image_grid_thw=None,
+        )
+
+    def warmup_hpu_graph(self, batch: FlashVlmCausalLMBatch):
+        warmup_times = 3
+        # only warmup decode, for prefill, image pixal size may change, make the warmup useless
+        self.bucketing_ctx.generate_decode_buckets(self.bucketing_ctx.num_hpu_blocks)
+        for i, (batch_size, block_num) in enumerate(
+            reversed(self.bucketing_ctx.decode_buckets)
+        ):
+            if batch_size > block_num:
+                continue
+            log_master(
+                logger.info, f"warmup decode bs {batch_size} block_num {block_num}"
+            )
+            for index in range(warmup_times):
+                self.warmup_decode(batch_size, block_num, batch)
+        synchronize(self.device)
+
    def forward(
        self,
        batch: FlashVlmCausalLMBatch,
@ -450,17 +540,75 @@ class FlashVlmCausalLM(FlashCausalLM):

        kwargs = {}
        if htorch.utils.internal.is_lazy():
-            kwargs["bypass_hpu_graphs"] = False
+            kwargs["bypass_hpu_graphs"] = batch.prefilling

+        if batch.prefill_cache_indices is not None:
+            slots_pad = torch.zeros_like(input_ids)
+            slots_pad[batch.prefill_cache_indices] = slots
+            slots = slots_pad
+        if self.bucketing_ctx is not None:
+            if batch.prefilling:
+                padded_bs = self.bucketing_ctx.get_padded_prompt_batch_size(
+                    input_lengths.shape[0]
+                )
+            else:
+                padded_bs = self.bucketing_ctx.get_padded_decode_batch_size(
+                    input_lengths.shape[0]
+                )
+        else:
+            padded_bs = input_lengths.shape[0]
+        orig_bs = input_lengths.shape[0]
+        if padded_bs != input_lengths.shape[0]:
+            padded_input_lengths = F.pad(
+                input_lengths,
+                (0, padded_bs - orig_bs),
+                value=0,
+            )
+            padded_cache_lengths_tensor = F.pad(
+                cache_lengths_tensor,
+                (0, padded_bs - orig_bs),
+                value=0,
+            )
+            if cu_seqlen_prefill is not None:
+                cu_seqlen_prefill = torch.zeros(
+                    padded_bs + 1, device=self.device, dtype=torch.int32
+                )
+                torch.cumsum(padded_input_lengths, -1, out=cu_seqlen_prefill[1:])
+            seqlen = Seqlen(
+                input_lengths=padded_input_lengths,
+                cache_lengths=padded_cache_lengths_tensor,
+                cu_seqlen_q=cu_seqlen_prefill,
+            )
+            input_seq = input_ids.view(orig_bs, -1)
+            input_ids = F.pad(
+                input_ids, (0, (padded_bs - orig_bs) * input_seq.shape[-1]), value=0
+            )
+            if position_ids.dim() == 2:
+                # qwen2_vl and qwen2_5_vl case
+                position_ids = F.pad(
+                    position_ids,
+                    (0, 0, 0, (padded_bs - orig_bs) * input_seq.shape[-1]),
+                    value=1,
+                )
+            else:
+                position_ids = F.pad(
+                    position_ids,
+                    (0, (padded_bs - orig_bs) * input_seq.shape[-1]),
+                    value=1,
+                )
+            slots = F.pad(
+                slots, (0, (padded_bs - orig_bs) * input_seq.shape[-1]), value=0
+            )
+            if lm_head_indices is not None:
+                lm_head_indices = F.pad(
+                    lm_head_indices, (0, padded_bs - orig_bs), value=0
+                )
+        else:
            seqlen = Seqlen(
                input_lengths=input_lengths,
                cache_lengths=cache_lengths_tensor,
                cu_seqlen_q=cu_seqlen_prefill,
            )
-        if batch.prefill_cache_indices is not None:
-            slots_pad = torch.zeros_like(input_ids)
-            slots_pad[batch.prefill_cache_indices] = slots
-            slots = slots_pad
        logits, speculative_logits = self.model.forward(
            input_ids=input_ids,
            position_ids=position_ids,
@ -476,8 +624,6 @@ class FlashVlmCausalLM(FlashCausalLM):
            image_grid_thw=batch.image_grid_thw,
            **kwargs,
        )
-        if batch.prefill_cache_indices is not None:
-            batch.prefill_cache_indices = None
        if batch.pixel_values is not None:
            batch.pixel_values = None
        if batch.pixel_attention_mask is not None:
@ -486,4 +632,6 @@ class FlashVlmCausalLM(FlashCausalLM):
            batch.image_sizes = None
        if batch.image_grid_thw is not None:
            batch.image_grid_thw = None
-        return logits, speculative_logits
+        return logits[:orig_bs], (
+            speculative_logits[:orig_bs] if speculative_logits is not None else None
+        )
--- a/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py
@ -11,7 +11,9 @@ from opentelemetry import trace
 from transformers import (
    PreTrainedTokenizerBase,
 )
-
+from text_generation_server.models.flash_causal_lm import (
+    prepare_for_decode,
+)
 from text_generation_server.models.flash_vlm_causal_lm import (
    FlashVlmCausalLMBatch,
    FlashVlmCausalLM,
@ -19,6 +21,13 @@ from text_generation_server.models.flash_vlm_causal_lm import (
 from text_generation_server.pb import generate_pb2
 from text_generation_server.layers.attention import Seqlen, trim_seqlen_metadata
 import habana_frameworks.torch as htorch
+from loguru import logger
+from text_generation_server.models.globals import BLOCK_SIZE
+from text_generation_server.utils.import_utils import (
+    synchronize,
+)
+import torch.nn.functional as F
+from text_generation_server.utils.log import log_master

 tracer = trace.get_tracer(__name__)

@ -196,7 +205,178 @@ class FlashMllamaCausalLMBatch(FlashVlmCausalLMBatch):
        return batch


+def generate_cross_attention_states(
+    cross_attention_states, image_indices, seqlen, pad_seq_len, prefilling
+):
+    if cross_attention_states is None:
+        return None, None, None
+    device = cross_attention_states.device
+    indices_list = []
+    if prefilling:
+        for i in image_indices:
+            indices_list.append(
+                torch.arange(pad_seq_len * i, pad_seq_len * (i + 1), device=device)
+            )
+        indices = torch.cat(indices_list, dim=0)
+    else:
+        indices = image_indices[:]
+    return indices, seqlen.input_lengths.index_select(0, image_indices)
+
+
 class FlashMllamaCausalLM(FlashVlmCausalLM):
+    def warmup_decode(
+        self, batch_size: int, block_num: int, batch: FlashMllamaCausalLMBatch
+    ):
+        input_ids = torch.zeros(
+            batch_size, dtype=batch.input_ids.dtype, device=self.device
+        )
+        position_ids = torch.arange(
+            batch_size, dtype=batch.position_ids.dtype, device=self.device
+        )
+        blocks = [block_num // batch_size for _ in range(batch_size)]
+        blocks[0] += block_num % batch_size
+        past_len = []
+        block_tables = []
+        slots = []
+        start_idx = 0
+
+        # fetch the last blocked to warmup block num
+        for i in range(batch_size):
+            block_array = list(range(start_idx, start_idx + blocks[i]))
+            slots.append(BLOCK_SIZE * block_array[-1] + BLOCK_SIZE - 1)
+            block_tables.append(block_array)
+            past_len.append(blocks[i] * BLOCK_SIZE - 1)
+            start_idx += blocks[i]
+        input_lengths = torch.ones(batch_size, dtype=torch.int32, device=self.device)
+        cache_lengths_tensor = torch.tensor(
+            past_len, dtype=torch.int32, device=self.device
+        )
+        cu_seqlen_prefill = torch.zeros(
+            batch_size + 1, device=self.device, dtype=torch.int32
+        )
+        torch.cumsum(input_lengths, -1, out=cu_seqlen_prefill[1:])
+
+        seqlen = Seqlen(
+            input_lengths=input_lengths,
+            cache_lengths=cache_lengths_tensor,
+            cu_seqlen_q=cu_seqlen_prefill,
+        )
+
+        hpu_attention_meta = prepare_for_decode(
+            self.dtype,
+            self.use_contiguous_pa,
+            self.device,
+            slots,
+            block_tables,
+            batch_size,
+            bucketing_ctx=None,
+        )
+        # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation.
+        image_indices = torch.tensor(batch.image_indices, device=self.device)
+        image_indices = image_indices.repeat(batch_size)
+        cross_attention_states = batch.cross_attention_states.repeat(batch_size, 1, 1)
+        indices, cross_attention_len = generate_cross_attention_states(
+            cross_attention_states, image_indices, seqlen, 1, False
+        )
+        slots_tensor = torch.tensor(slots, dtype=batch.slots.dtype, device=self.device)
+        self.model.forward(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            cu_seqlen_prefill=None,
+            kv_cache=self.kv_cache,
+            slots=slots_tensor,
+            seqlen=trim_seqlen_metadata(seqlen),
+            hpu_attention_meta=hpu_attention_meta,
+            lm_head_indices=None,
+            adapter_data=None,
+            cross_attention_states=cross_attention_states,
+            indices=indices,
+            cross_attention_len=cross_attention_len,
+        )
+
+    def warmup_prefill(
+        self, prompt_len: int, batch_size: int, batch: FlashMllamaCausalLMBatch
+    ):
+        input_ids = torch.zeros(
+            prompt_len, dtype=batch.input_ids.dtype, device=self.device
+        ).repeat(batch_size)
+        position_ids = torch.arange(
+            prompt_len, dtype=batch.position_ids.dtype, device=self.device
+        ).repeat(batch_size)
+        max_bt = (prompt_len // BLOCK_SIZE + 1) * batch_size
+        block_tables = torch.arange(
+            max_bt, dtype=torch.int32, device=self.device
+        ).reshape(batch_size, -1)
+        slot_acc = []
+        for i in range(batch_size):
+            slots = []
+            for b in block_tables[i]:
+                slots.extend(range(b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE))
+            slot_acc.extend(slots[:prompt_len])
+        slots = torch.tensor(slot_acc, dtype=batch.slots.dtype, device=self.device)
+
+        input_lengths = (
+            torch.ones(batch_size, dtype=torch.int32, device=self.device) * prompt_len
+        )
+        cache_lengths_tensor = torch.zeros(
+            batch_size, dtype=torch.int32, device=self.device
+        )
+        cu_seqlen_prefill = torch.zeros(
+            batch_size + 1, device=self.device, dtype=torch.int32
+        )
+        torch.cumsum(input_lengths, -1, out=cu_seqlen_prefill[1:])
+
+        seqlen = Seqlen(
+            input_lengths=input_lengths,
+            cache_lengths=cache_lengths_tensor,
+            cu_seqlen_q=cu_seqlen_prefill,
+        )
+        lm_head_indices = input_lengths - 1
+
+        # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation.
+        image_indices = torch.tensor(batch.image_indices, device=self.device)
+        image_indices = image_indices.repeat(batch_size)
+        cross_attention_states = batch.cross_attention_states.repeat(batch_size, 1, 1)
+        indices, cross_attention_len = generate_cross_attention_states(
+            cross_attention_states, image_indices, seqlen, prompt_len, True
+        )
+        self.model.forward(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            cu_seqlen_prefill=cu_seqlen_prefill,
+            kv_cache=self.kv_cache,
+            slots=slots,
+            seqlen=trim_seqlen_metadata(seqlen),
+            hpu_attention_meta=None,
+            lm_head_indices=lm_head_indices,
+            adapter_data=None,
+            cross_attention_states=cross_attention_states,
+            indices=indices,
+            cross_attention_len=cross_attention_len,
+        )
+
+    def warmup_hpu_graph(self, batch: FlashMllamaCausalLMBatch):
+        warmup_times = 3
+        self.bucketing_ctx.generate_prompt_buckets()
+        for i, (batch_size, seq_len) in enumerate(
+            reversed(self.bucketing_ctx.prompt_buckets)
+        ):
+            log_master(logger.info, f"warmup prefill seq {seq_len} bs {batch_size}")
+            for index in range(warmup_times):
+                self.warmup_prefill(seq_len, batch_size, batch)
+        self.bucketing_ctx.generate_decode_buckets(self.bucketing_ctx.num_hpu_blocks)
+        for i, (batch_size, block_num) in enumerate(
+            reversed(self.bucketing_ctx.decode_buckets)
+        ):
+            if batch_size > block_num:
+                continue
+            log_master(
+                logger.info, f"warmup decode bs {batch_size} block_num {block_num}"
+            )
+            for index in range(warmup_times):
+                self.warmup_decode(batch_size, block_num, batch)
+        synchronize(self.device)
+
    def forward(
        self,
        batch: FlashMllamaCausalLMBatch,
@ -263,12 +443,6 @@ class FlashMllamaCausalLM(FlashVlmCausalLM):
            # This makes sure the max_s for the decode pass is correct.
            max_s = min(self.max_past(), max_s)

-        seqlen = Seqlen(
-            input_lengths=input_lengths,
-            cache_lengths=cache_lengths_tensor,
-            cu_seqlen_q=cu_seqlen_prefill,
-        )
-
        if batch.pixel_values is not None:
            cross_attention_states = self.model.vision_forward(
                pixel_values=batch.pixel_values,
@ -281,11 +455,82 @@ class FlashMllamaCausalLM(FlashVlmCausalLM):

        kwargs = {}
        if htorch.utils.internal.is_lazy():
-            kwargs["bypass_hpu_graphs"] = False
+            kwargs["bypass_hpu_graphs"] = batch.prefilling
        if batch.prefill_cache_indices is not None:
            slots_pad = torch.zeros_like(input_ids)
            slots_pad[batch.prefill_cache_indices] = slots
            slots = slots_pad
+
+        if self.bucketing_ctx is not None:
+            if batch.prefilling:
+                padded_bs = self.bucketing_ctx.get_padded_prompt_batch_size(
+                    input_lengths.shape[0]
+                )
+            else:
+                padded_bs = self.bucketing_ctx.get_padded_decode_batch_size(
+                    input_lengths.shape[0]
+                )
+        else:
+            padded_bs = input_lengths.shape[0]
+        orig_bs = input_lengths.shape[0]
+        padded_input_len = input_ids.view(orig_bs, -1).shape[-1]
+        image_indices = torch.tensor(batch.image_indices, device=self.device)
+        if padded_bs != input_lengths.shape[0]:
+            padded_input_lengths = F.pad(
+                input_lengths,
+                (0, padded_bs - orig_bs),
+                value=0,
+            )
+            padded_cache_lengths_tensor = F.pad(
+                cache_lengths_tensor,
+                (0, padded_bs - orig_bs),
+                value=0,
+            )
+            if cu_seqlen_prefill is not None:
+                cu_seqlen_prefill = torch.zeros(
+                    padded_bs + 1, device=self.device, dtype=torch.int32
+                )
+                torch.cumsum(padded_input_lengths, -1, out=cu_seqlen_prefill[1:])
+            seqlen = Seqlen(
+                input_lengths=padded_input_lengths,
+                cache_lengths=padded_cache_lengths_tensor,
+                cu_seqlen_q=cu_seqlen_prefill,
+            )
+
+            input_ids = F.pad(
+                input_ids, (0, (padded_bs - orig_bs) * padded_input_len), value=0
+            )
+            position_ids = F.pad(
+                position_ids, (0, (padded_bs - orig_bs) * padded_input_len), value=1
+            )
+            slots = F.pad(slots, (0, (padded_bs - orig_bs) * padded_input_len), value=0)
+            if lm_head_indices is not None:
+                lm_head_indices = F.pad(
+                    lm_head_indices, (0, padded_bs - orig_bs), value=0
+                )
+            if cross_attention_states is not None:
+                cross_attention_states = F.pad(
+                    cross_attention_states,
+                    (0, 0, 0, 0, 0, (padded_bs - orig_bs)),
+                    value=0,
+                )
+            if len(image_indices) != 0:
+                pad_indices = torch.arange(orig_bs, padded_bs, device=self.device)
+                image_indices = torch.cat((image_indices, pad_indices), dim=0)
+        else:
+            seqlen = Seqlen(
+                input_lengths=input_lengths,
+                cache_lengths=cache_lengths_tensor,
+                cu_seqlen_q=cu_seqlen_prefill,
+            )
+
+        indices, cross_attention_len = generate_cross_attention_states(
+            cross_attention_states,
+            image_indices,
+            seqlen,
+            padded_input_len,
+            batch.prefilling,
+        )
        logits, speculative_logits = self.model.forward(
            input_ids=input_ids,
            position_ids=position_ids,
@ -295,14 +540,15 @@ class FlashMllamaCausalLM(FlashVlmCausalLM):
            seqlen=trim_seqlen_metadata(seqlen),
            hpu_attention_meta=batch.hpu_attn_meta,
            lm_head_indices=lm_head_indices,
-            cross_attention_states=cross_attention_states,
            # TODO list
            adapter_data=None,
-            image_indices=batch.image_indices[:],
+            cross_attention_states=cross_attention_states,
+            indices=indices,
+            cross_attention_len=cross_attention_len,
            **kwargs,
        )
-        if batch.prefill_cache_indices is not None:
-            batch.prefill_cache_indices = None
        if batch.pixel_values is not None:
            batch.pixel_values = None
-        return logits, speculative_logits
+        return logits[:orig_bs], (
+            speculative_logits[:orig_bs] if speculative_logits is not None else None
+        )
--- a/backends/llamacpp/src/main.rs
+++ b/backends/llamacpp/src/main.rs
@ -119,6 +119,9 @@ struct Args {
    #[clap(default_value = "3000", long, short, env)]
    port: u16,

+    #[clap(default_value = "9000", long, short, env)]
+    prometheus_port: u16,
+
    /// Enable JSON output format.
    #[clap(long, env)]
    json_output: bool,
@ -317,6 +320,7 @@ async fn main() -> Result<(), RouterError> {
        args.max_client_batch_size,
        args.usage_stats,
        args.payload_limit,
+        args.prometheus_port,
    )
    .await?;
    Ok(())
--- a/backends/trtllm/src/main.rs
+++ b/backends/trtllm/src/main.rs
@ -37,6 +37,8 @@ struct Args {
    hostname: String,
    #[clap(default_value = "3000", long, short, env)]
    port: u16,
+    #[clap(default_value = "9000", long, short, env)]
+    prometheus_port: u16,
    #[clap(long, env, required = true)]
    tokenizer_name: String,
    #[clap(long, env)]
@ -227,6 +229,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
        max_batch_total_tokens,
        hostname,
        port,
+        prometheus_port,
        tokenizer_name,
        tokenizer_config_path,
        revision,
@ -322,6 +325,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
                max_client_batch_size,
                usage_stats,
                payload_limit,
+                prometheus_port,
            )
            .await?;
            Ok(())
--- a/backends/v3/src/block_allocator.rs
+++ b/backends/v3/src/block_allocator.rs
@ -177,7 +177,7 @@ impl Allocator for SimpleAllocator {
            (required_blocks, repeats)
        };

-        let tokens = tokens as usize;
+        let mut tokens = tokens as usize;
        if required_blocks > self.free_blocks.len() as u32 {
            None
        } else {
@ -189,6 +189,8 @@ impl Allocator for SimpleAllocator {
                .split_off(self.free_blocks.len() - required_blocks as usize);
            if self.is_hpu_device {
                blocks.sort();
+                // need 1 slot for ping-pong optimization
+                tokens += 1;
            }
            let mut slots =
                Vec::with_capacity((required_blocks * self.block_size * repeats as u32) as usize);
--- a/docs/source/backends/trtllm.md
+++ b/docs/source/backends/trtllm.md
@ -163,7 +163,7 @@ WORKDIR /usr/src/text-generation-inference
 ARG cuda_arch_list
 ARG build_type
 ARG sccache_gha_enabled
-ARG actions_cache_url
+ARG actions_results_url
 ARG actions_runtime_token

 # Install Rust
@ -171,7 +171,7 @@ ENV PATH="/root/.cargo/bin:$PATH"
 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \
    chmod -R a+w /root/.rustup && \
    chmod -R a+w /root/.cargo && \
-    cargo install sccache --locked
+    cargo install sccache --version ">=0.10.0" --locked

 ENV LD_LIBRARY_PATH="/usr/local/mpi/lib:$LD_LIBRARY_PATH"
 ENV PKG_CONFIG_PATH="/usr/local/mpi/lib/pkgconfig"
--- a/router/src/logging.rs
+++ b/router/src/logging.rs
@ -1,13 +1,68 @@
+use axum::{extract::Request, middleware::Next, response::Response};
 use opentelemetry::sdk::propagation::TraceContextPropagator;
 use opentelemetry::sdk::trace;
 use opentelemetry::sdk::trace::Sampler;
 use opentelemetry::sdk::Resource;
+use opentelemetry::trace::{SpanContext, SpanId, TraceContextExt, TraceFlags, TraceId};
+use opentelemetry::Context;
 use opentelemetry::{global, KeyValue};
 use opentelemetry_otlp::WithExportConfig;
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::util::SubscriberInitExt;
 use tracing_subscriber::{filter::LevelFilter, EnvFilter, Layer};

+struct TraceParent {
+    #[allow(dead_code)]
+    version: u8,
+    trace_id: TraceId,
+    parent_id: SpanId,
+    trace_flags: TraceFlags,
+}
+
+fn parse_traceparent(header_value: &str) -> Option<TraceParent> {
+    let parts: Vec<&str> = header_value.split('-').collect();
+    if parts.len() != 4 {
+        return None;
+    }
+
+    let version = u8::from_str_radix(parts[0], 16).ok()?;
+    if version == 0xff {
+        return None;
+    }
+
+    let trace_id = TraceId::from_hex(parts[1]).ok()?;
+    let parent_id = SpanId::from_hex(parts[2]).ok()?;
+    let trace_flags = u8::from_str_radix(parts[3], 16).ok()?;
+
+    Some(TraceParent {
+        version,
+        trace_id,
+        parent_id,
+        trace_flags: TraceFlags::new(trace_flags),
+    })
+}
+
+pub async fn trace_context_middleware(mut request: Request, next: Next) -> Response {
+    let context = request
+        .headers()
+        .get("traceparent")
+        .and_then(|v| v.to_str().ok())
+        .and_then(parse_traceparent)
+        .map(|traceparent| {
+            Context::new().with_remote_span_context(SpanContext::new(
+                traceparent.trace_id,
+                traceparent.parent_id,
+                traceparent.trace_flags,
+                true,
+                Default::default(),
+            ))
+        });
+
+    request.extensions_mut().insert(context);
+
+    next.run(request).await
+}
+
 /// Init logging using env variables LOG_LEVEL and LOG_FORMAT:
 ///     - otlp_endpoint is an optional URL to an Open Telemetry collector
 ///     - otlp_service_name service name to appear in APM
--- a/router/src/sagemaker.rs
+++ b/router/src/sagemaker.rs
@ -67,16 +67,26 @@ pub(crate) async fn sagemaker_compatibility(
    default_return_full_text: Extension<bool>,
    infer: Extension<Infer>,
    compute_type: Extension<ComputeType>,
+    context: Extension<Option<opentelemetry::Context>>,
    info: Extension<Info>,
    Json(req): Json<SagemakerRequest>,
 ) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
    match req {
        SagemakerRequest::Generate(req) => {
-            compat_generate(default_return_full_text, infer, compute_type, Json(req)).await
+            compat_generate(
+                default_return_full_text,
+                infer,
+                compute_type,
+                context,
+                Json(req),
+            )
+            .await
+        }
+        SagemakerRequest::Chat(req) => {
+            chat_completions(infer, compute_type, info, context, Json(req)).await
        }
-        SagemakerRequest::Chat(req) => chat_completions(infer, compute_type, info, Json(req)).await,
        SagemakerRequest::Completion(req) => {
-            completions(infer, compute_type, info, Json(req)).await
+            completions(infer, compute_type, info, context, Json(req)).await
        }
    }
 }
--- a/router/src/server.rs
+++ b/router/src/server.rs
@ -7,6 +7,7 @@ use crate::kserve::{
    kerve_server_metadata, kserve_health_live, kserve_health_ready, kserve_model_infer,
    kserve_model_metadata, kserve_model_metadata_ready,
 };
+use crate::logging::trace_context_middleware;
 use crate::sagemaker::{
    sagemaker_compatibility, SagemakerRequest, SagemakerResponse, SagemakerStreamResponse,
    __path_sagemaker_compatibility,
@ -63,6 +64,7 @@ use tokio::sync::oneshot;
 use tokio::time::Instant;
 use tower_http::cors::{AllowOrigin, CorsLayer};
 use tracing::{info_span, instrument, Instrument};
+use tracing_opentelemetry::OpenTelemetrySpanExt;
 use utoipa::OpenApi;
 use utoipa_swagger_ui::SwaggerUi;

@ -125,6 +127,7 @@ pub(crate) async fn compat_generate(
    Extension(default_return_full_text): Extension<bool>,
    infer: Extension<Infer>,
    compute_type: Extension<ComputeType>,
+    context: Extension<Option<opentelemetry::Context>>,
    Json(mut req): Json<CompatGenerateRequest>,
 ) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
    // default return_full_text given the pipeline_tag
@ -134,11 +137,14 @@ pub(crate) async fn compat_generate(

    // switch on stream
    if req.stream {
-        Ok(generate_stream(infer, compute_type, Json(req.into()))
+        Ok(
+            generate_stream(infer, compute_type, context, Json(req.into()))
                .await
-            .into_response())
+                .into_response(),
+        )
    } else {
-        let (headers, Json(generation)) = generate(infer, compute_type, Json(req.into())).await?;
+        let (headers, Json(generation)) =
+            generate(infer, compute_type, context, Json(req.into())).await?;
        // wrap generation inside a Vec to match api-inference
        Ok((headers, Json(vec![generation])).into_response())
    }
@ -267,9 +273,14 @@ seed,
 async fn generate(
    infer: Extension<Infer>,
    Extension(ComputeType(compute_type)): Extension<ComputeType>,
+    Extension(context): Extension<Option<opentelemetry::Context>>,
    Json(req): Json<GenerateRequest>,
 ) -> Result<(HeaderMap, Json<GenerateResponse>), (StatusCode, Json<ErrorResponse>)> {
    let span = tracing::Span::current();
+    if let Some(context) = context {
+        span.set_parent(context);
+    }
+
    let (headers, _, response) =
        generate_internal(infer, ComputeType(compute_type), Json(req), span).await?;
    Ok((headers, response))
@ -465,12 +476,17 @@ seed,
 async fn generate_stream(
    Extension(infer): Extension<Infer>,
    Extension(compute_type): Extension<ComputeType>,
+    Extension(context): Extension<Option<opentelemetry::Context>>,
    Json(req): Json<GenerateRequest>,
 ) -> (
    HeaderMap,
    Sse<impl Stream<Item = Result<Event, Infallible>>>,
 ) {
    let span = tracing::Span::current();
+    if let Some(context) = context {
+        span.set_parent(context);
+    }
+
    let (headers, response_stream) =
        generate_stream_internal(infer, compute_type, Json(req), span).await;

@ -700,9 +716,14 @@ pub(crate) async fn completions(
    Extension(infer): Extension<Infer>,
    Extension(compute_type): Extension<ComputeType>,
    Extension(info): Extension<Info>,
+    Extension(context): Extension<Option<opentelemetry::Context>>,
    Json(req): Json<CompletionRequest>,
 ) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
    let span = tracing::Span::current();
+    if let Some(context) = context {
+        span.set_parent(context);
+    }
+
    metrics::counter!("tgi_request_count").increment(1);

    let CompletionRequest {
@ -1148,9 +1169,14 @@ pub(crate) async fn chat_completions(
    Extension(infer): Extension<Infer>,
    Extension(compute_type): Extension<ComputeType>,
    Extension(info): Extension<Info>,
+    Extension(context): Extension<Option<opentelemetry::Context>>,
    Json(mut chat): Json<ChatRequest>,
 ) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
    let span = tracing::Span::current();
+    if let Some(context) = context {
+        span.set_parent(context);
+    }
+
    metrics::counter!("tgi_request_count").increment(1);
    let ChatRequest {
        model,
@ -2265,6 +2291,7 @@ async fn start(
        .layer(Extension(prom_handle.clone()))
        .layer(OtelAxumLayer::default())
        .layer(DefaultBodyLimit::max(payload_limit))
+        .layer(axum::middleware::from_fn(trace_context_middleware))
        .layer(cors_layer);

    tracing::info!("Connected");
--- a/router/src/vertex.rs
+++ b/router/src/vertex.rs
@ -7,6 +7,7 @@ use axum::response::{IntoResponse, Response};
 use axum::Json;
 use serde::{Deserialize, Serialize};
 use tracing::instrument;
+use tracing_opentelemetry::OpenTelemetrySpanExt;
 use utoipa::ToSchema;

 #[derive(Clone, Deserialize, ToSchema)]
@ -70,9 +71,14 @@ example = json ! ({"error": "Incomplete generation"})),
 pub(crate) async fn vertex_compatibility(
    Extension(infer): Extension<Infer>,
    Extension(compute_type): Extension<ComputeType>,
+    Extension(context): Extension<Option<opentelemetry::Context>>,
    Json(req): Json<VertexRequest>,
 ) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
    let span = tracing::Span::current();
+    if let Some(context) = context {
+        span.set_parent(context);
+    }
+
    metrics::counter!("tgi_request_count").increment(1);

    // check that theres at least one instance