Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-11 20:34:54 +00:00)

Commit b7a1280f25 ("fix tests"), parent f85a308ef1.
@@ -158,8 +158,8 @@ impl Client {
                 // Blocks and slots will be set on the server side if we use paged attention
                 blocks: vec![],
                 slots: vec![],
-                prefix_len: 0,
-                postfix_len: truncate,
+                cache_len: 0,
+                chunk_len: None,
                 // Set sampling parameters to also take these ops into account in the max memory
                 parameters: Some(NextTokenChooserParameters {
                     temperature: 0.9,
@@ -246,8 +246,8 @@ impl Health for ShardedClient {
            // Block 0 is reserved for health checks
            blocks: vec![0],
            slots: (0..16).collect(),
-           prefix_len: 0,
-           postfix_len: 1,
+           cache_len: 0,
+           chunk_len: None,
            adapter_id: None,
        };
        let batch = Batch {
@@ -158,8 +158,8 @@ impl Client {
                 // Blocks and slots will be set on the server side if we use paged attention
                 blocks: vec![],
                 slots: vec![],
-                prefix_len: 0,
-                postfix_len: truncate,
+                cache_len: 0,
+                chunk_len: None,
                 // Set sampling parameters to also take these ops into account in the max memory
                 parameters: Some(NextTokenChooserParameters {
                     temperature: 0.9,
@@ -235,9 +235,9 @@ impl Health for ShardedClient {
            // Block 0 is reserved for health checks
            blocks: vec![0],
            slots: (0..16).collect(),
-           prefix_len: 0,
+           cache_len: 0,
            adapter_id: None,
-           postfix_len: 1,
+           chunk_len: None,
        };
        let batch = Batch {
            id: u64::MAX,
@@ -280,7 +280,7 @@ impl State {
                continue;
            }

-           let (block_allocation, postfix_len) = match &self.block_allocator {
+           let block_allocation = match &self.block_allocator {
                None => {
                    // We pad to max input length in the Python shards
                    // We need to take these padding tokens into the equation
@@ -297,7 +297,7 @@ impl State {
                        self.entries.push_front((id, entry));
                        break 'entry_loop;
                    }
-                   (None, entry.request.input_length)
+                   None
                }
                Some(block_allocator) => {
                    // If users wants the prefill logprobs, we cannot reuse the cache.
@@ -337,7 +337,7 @@ impl State {
                        }
                    };

-                   let mut postfix_len = entry.request.input_length - block_allocation.prefix_len;
+                   let postfix_len = entry.request.input_length - block_allocation.prefix_len;

                    // Check equality too as if we don't we might end up with a postfix_len = 0
                    // in the next iteration of the loop
@@ -345,9 +345,9 @@ impl State {
                        // Entry is over budget
                        if self.support_chunking {
                            // We support chunking, just set postfix_len to exactly match prefill_token_budget
-                           postfix_len = prefill_token_budget - prefill_tokens;
+                           let chunk_len = prefill_token_budget - prefill_tokens;
                            // Push this entry inside the batch
-                           batch.push((id, entry, Some(block_allocation), postfix_len));
+                           batch.push((id, entry, Some(block_allocation), Some(chunk_len)));
                            break 'entry_loop;
                        } else {
                            // We don't support chunking, this entry needs to go back to the buffer
@@ -363,10 +363,10 @@ impl State {

                    prefill_tokens += postfix_len;

-                   (Some(block_allocation), postfix_len)
+                   Some(block_allocation)
                }
            };
-           batch.push((id, entry, block_allocation, postfix_len));
+           batch.push((id, entry, block_allocation, None));
            if Some(batch.len()) == max_size {
                break;
            }
@@ -395,7 +395,7 @@ impl State {
        let mut batch_entries =
            IntMap::with_capacity_and_hasher(self.entries.len(), BuildNoHashHasher::default());

-       for (id, mut entry, block_allocation, postfix_len) in batch {
+       for (id, mut entry, block_allocation, chunk_len) in batch {
            // Create a new span to link the batch back to this entry
            let entry_batch_span = info_span!(parent: &entry.span, "infer");
            // Add relationships
@@ -447,9 +447,9 @@ impl State {
                top_n_tokens: entry.request.top_n_tokens,
                blocks,
                slots,
-               prefix_len,
+               cache_len: prefix_len,
                adapter_id: entry.request.adapter_id.clone(),
-               postfix_len,
+               chunk_len,
            });
            // Set batch_time
            entry.batch_time = Some(Instant::now());
@@ -582,7 +582,7 @@ mod tests {

    #[tokio::test]
    async fn test_append() {
-       let mut state = State::new(false, 1, false, None, 0, 16);
+       let mut state = State::new(false, 1, false, None, 0, 16, false);
        let (entry, _guard) = default_entry();

        assert_eq!(state.next_id, 0);
@@ -598,7 +598,7 @@ mod tests {

    #[tokio::test]
    async fn test_next_batch_empty() {
-       let mut state = State::new(false, 1, false, None, 0, 16);
+       let mut state = State::new(false, 1, false, None, 0, 16, false);

        assert!(state.next_batch(None, None, 1, 1).await.is_none());
        assert!(state.next_batch(Some(1), None, 1, 1).await.is_none());
@@ -606,7 +606,7 @@ mod tests {

    #[tokio::test]
    async fn test_next_batch_min_size() {
-       let mut state = State::new(false, 1, false, None, 0, 16);
+       let mut state = State::new(false, 1, false, None, 0, 16, false);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        state.append(entry1);
@@ -638,7 +638,7 @@ mod tests {

    #[tokio::test]
    async fn test_next_batch_max_size() {
-       let mut state = State::new(false, 1, false, None, 0, 16);
+       let mut state = State::new(false, 1, false, None, 0, 16, false);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        state.append(entry1);
@@ -658,7 +658,7 @@ mod tests {

    #[tokio::test]
    async fn test_next_batch_token_budget() {
-       let mut state = State::new(false, 1, false, None, 0, 2);
+       let mut state = State::new(false, 1, false, None, 0, 2, false);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        state.append(entry1);
@@ -691,14 +691,14 @@ mod tests {

    #[tokio::test]
    async fn test_queue_append() {
-       let queue = Queue::new(false, 1, false, None, 0, 16);
+       let queue = Queue::new(false, 1, false, None, 0, 16, false);
        let (entry, _guard) = default_entry();
        queue.append(entry);
    }

    #[tokio::test]
    async fn test_queue_next_batch_empty() {
-       let queue = Queue::new(false, 1, false, None, 0, 16);
+       let queue = Queue::new(false, 1, false, None, 0, 16, false);

        assert!(queue.next_batch(None, None, 1, 1).await.is_none());
        assert!(queue.next_batch(Some(1), None, 1, 1).await.is_none());
@@ -706,7 +706,7 @@ mod tests {

    #[tokio::test]
    async fn test_queue_next_batch_min_size() {
-       let queue = Queue::new(false, 1, false, None, 0, 16);
+       let queue = Queue::new(false, 1, false, None, 0, 16, false);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        queue.append(entry1);
@@ -739,7 +739,7 @@ mod tests {

    #[tokio::test]
    async fn test_queue_next_batch_max_size() {
-       let queue = Queue::new(false, 1, false, None, 0, 16);
+       let queue = Queue::new(false, 1, false, None, 0, 16, false);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        queue.append(entry1);
@@ -755,7 +755,7 @@ mod tests {

    #[tokio::test]
    async fn test_queue_next_batch_token_budget() {
-       let queue = Queue::new(false, 1, false, None, 0, 16);
+       let queue = Queue::new(false, 1, false, None, 0, 16, false);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        queue.append(entry1);
@@ -780,7 +780,7 @@ mod tests {

    #[tokio::test]
    async fn test_queue_next_batch_token_speculate() {
-       let queue = Queue::new(false, 1, false, None, 2, 16);
+       let queue = Queue::new(false, 1, false, None, 2, 16, false);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        queue.append(entry1);
@@ -799,7 +799,7 @@ mod tests {

    #[tokio::test]
    async fn test_queue_next_batch_dropped_receiver() {
-       let queue = Queue::new(false, 1, false, None, 0, 16);
+       let queue = Queue::new(false, 1, false, None, 0, 16, false);
        let (entry, _) = default_entry();
        queue.append(entry);

@@ -158,8 +158,8 @@ async fn prefill(
            top_n_tokens: top_n_tokens.unwrap_or(0),
            blocks: vec![],
            slots: vec![],
-           prefix_len: 0,
-           postfix_len: sequence_length,
+           cache_len: 0,
+           chunk_len: None,
            adapter_id: None,
        })
        .collect();
@@ -9,13 +9,16 @@ import subprocess
 import sys
 import tempfile
 import time
-from typing import Dict, List, Optional
 
 import docker
 import pytest
+import base64
+
+from pathlib import Path
+from typing import Dict, List, Optional
 from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
 from docker.errors import NotFound
 from syrupy.extensions.json import JSONSnapshotExtension
 
 from text_generation import AsyncClient
 from text_generation.types import (
     BestOfSequence,
@@ -639,3 +642,22 @@ def generate_multi():
         return responses
 
     return generate_load_inner
+
+
+# TODO fix the server parsser to count inline image tokens correctly
+@pytest.fixture
+def chicken():
+    path = Path(__file__).parent / "images" / "chicken_on_money.png"
+
+    with open(path, "rb") as image_file:
+        encoded_string = base64.b64encode(image_file.read())
+    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
+
+
+@pytest.fixture
+def cow_beach():
+    path = Path(__file__).parent / "images" / "cow_beach.png"
+
+    with open(path, "rb") as image_file:
+        encoded_string = base64.b64encode(image_file.read())
+    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
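
The model tests consume these fixtures through pytest's dependency injection instead of the old module-level helpers. A minimal sketch of the pattern (the `some_vlm` fixture and the prompt wording are placeholders, not part of this commit):

    import pytest


    @pytest.mark.asyncio
    async def test_uses_image_fixture(some_vlm, response_snapshot, chicken):
        # `chicken` is provided by the fixture above and is already a
        # base64 data URI, ready to embed in the model's prompt markup.
        response = await some_vlm.generate(
            f"User:{chicken}Describe the image.",
            max_new_tokens=10,
        )
        assert response == response_snapshot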
@@ -1,5 +1,4 @@
 import pytest
-import base64
 
 
 @pytest.fixture(scope="module")
@@ -20,24 +19,11 @@ async def flash_pali_gemma(flash_pali_gemma_handle):
     return flash_pali_gemma_handle.client
 
 
-def get_chicken():
-    with open("integration-tests/images/chicken_on_money.png", "rb") as image_file:
-        encoded_string = base64.b64encode(image_file.read())
-    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
-
-
-def get_cow_beach():
-    with open("integration-tests/images/cow_beach.png", "rb") as image_file:
-        encoded_string = base64.b64encode(image_file.read())
-    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
-
-
 @pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot):
-    cow = get_cow_beach()
-    inputs = f"Where is the cow standing?\n"
+async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot, cow_beach):
+    inputs = f"Where is the cow standing?\n"
     response = await flash_pali_gemma.generate(inputs, max_new_tokens=20)
 
     assert response.generated_text == "beach"
@@ -47,9 +33,9 @@ async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot):
 @pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_pali_gemma_two_images(flash_pali_gemma, response_snapshot):
-    chicken = get_chicken()
-    cow_beach = get_cow_beach()
+async def test_flash_pali_gemma_two_images(
+    flash_pali_gemma, response_snapshot, chicken, cow_beach
+):
     response = await flash_pali_gemma.generate(
         f"caption\n",
         max_new_tokens=20,
@@ -1,5 +1,4 @@
 import pytest
-import base64
 
 
 @pytest.fixture(scope="module")
@@ -16,22 +15,8 @@ async def idefics(idefics_handle):
     return idefics_handle.client
 
 
-# TODO fix the server parsser to count inline image tokens correctly
-def get_chicken():
-    with open("integration-tests/images/chicken_on_money.png", "rb") as image_file:
-        encoded_string = base64.b64encode(image_file.read())
-    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
-
-
-def get_cow_beach():
-    with open("integration-tests/images/cow_beach.png", "rb") as image_file:
-        encoded_string = base64.b64encode(image_file.read())
-    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
-
-
 @pytest.mark.asyncio
-async def test_idefics(idefics, response_snapshot):
-    chicken = get_chicken()
+async def test_idefics(idefics, response_snapshot, chicken):
     response = await idefics.generate(
         f"User:Can you tell me a very short story based on the image?",
         max_new_tokens=10,
@@ -48,9 +33,7 @@ async def test_idefics(idefics, response_snapshot):
 @pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_idefics_two_images(idefics, response_snapshot):
-    chicken = get_chicken()
-    cow_beach = get_cow_beach()
+async def test_idefics_two_images(idefics, response_snapshot, chicken, cow_beach):
     response = await idefics.generate(
         f"User:Where are the cow and chicken?<end_of_utterance> \nAssistant:",
         max_new_tokens=20,
@@ -63,8 +46,7 @@ async def test_idefics_two_images(idefics, response_snapshot):
 
 @pytest.mark.release
 @pytest.mark.asyncio
-async def test_idefics_load(idefics, generate_load, response_snapshot):
-    chicken = get_chicken()
+async def test_idefics_load(idefics, generate_load, response_snapshot, chicken):
     responses = await generate_load(
         idefics,
         f"User:Can you tell me a very short story based on the image?",
@@ -1,18 +1,4 @@
 import pytest
-import base64
-
-
-# TODO fix the server parsser to count inline image tokens correctly
-def get_chicken():
-    with open("integration-tests/images/chicken_on_money.png", "rb") as image_file:
-        encoded_string = base64.b64encode(image_file.read())
-    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
-
-
-def get_cow_beach():
-    with open("integration-tests/images/cow_beach.png", "rb") as image_file:
-        encoded_string = base64.b64encode(image_file.read())
-    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
 
 
 @pytest.fixture(scope="module")
@@ -31,8 +17,9 @@ async def flash_idefics2_next(flash_idefics2_next_handle):
 
 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_idefics2_next_simple(flash_idefics2_next, response_snapshot):
-    chicken = get_chicken()
+async def test_flash_idefics2_next_simple(
+    flash_idefics2_next, response_snapshot, chicken
+):
     response = await flash_idefics2_next.generate(
         f"User:Write me a short story<end_of_utterance> \nAssistant:",
         max_new_tokens=10,
@@ -46,9 +33,9 @@ async def test_flash_idefics2_next_simple(flash_idefics2_next, response_snapshot
 
 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_idefics2_two_images(flash_idefics2_next, response_snapshot):
-    chicken = get_chicken()
-    cow_beach = get_cow_beach()
+async def test_flash_idefics2_two_images(
+    flash_idefics2_next, response_snapshot, chicken, cow_beach
+):
     response = await flash_idefics2_next.generate(
         f"User:Where are the cow and chicken?<end_of_utterance> \nAssistant:",
         max_new_tokens=20,
@@ -87,9 +74,8 @@ async def test_flash_idefics2_next_all_params(flash_idefics2_next, response_snap
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_idefics2_next_load(
-    flash_idefics2_next, generate_load, response_snapshot
+    flash_idefics2_next, generate_load, response_snapshot, chicken
 ):
-    chicken = get_chicken()
     responses = await generate_load(
         flash_idefics2_next,
         f"User:Write me a short story<end_of_utterance> \nAssistant:",
@@ -1,12 +1,4 @@
 import pytest
-import base64
-
-
-# TODO fix the server parsser to count inline image tokens correctly
-def get_chicken():
-    with open("integration-tests/images/chicken_on_money.png", "rb") as image_file:
-        encoded_string = base64.b64encode(image_file.read())
-    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
 
 
 @pytest.fixture(scope="module")
@@ -29,8 +21,7 @@ async def flash_llava_next(flash_llava_next_handle):
 @pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_llava_next_simple(flash_llava_next, response_snapshot):
-    chicken = get_chicken()
+async def test_flash_llava_next_simple(flash_llava_next, response_snapshot, chicken):
     response = await flash_llava_next.generate(
         f"User:Can you tell me a very short story based on the image?",
         max_new_tokens=10,
@@ -70,9 +61,8 @@ async def test_flash_llava_next_all_params(flash_llava_next, response_snapshot):
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llava_next_load(
-    flash_llava_next, generate_load, response_snapshot
+    flash_llava_next, generate_load, response_snapshot, chicken
 ):
-    chicken = get_chicken()
     responses = await generate_load(
         flash_llava_next,
         f"User:Can you tell me a very short story based on the image?",
@@ -1,5 +1,4 @@
 import pytest
-import base64
 import asyncio
 
 
@@ -15,22 +14,8 @@ async def mllama(mllama_handle):
     return mllama_handle.client
 
 
-# TODO fix the server parsser to count inline image tokens correctly
-def get_chicken():
-    with open("integration-tests/images/chicken_on_money.png", "rb") as image_file:
-        encoded_string = base64.b64encode(image_file.read())
-    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
-
-
-def get_cow_beach():
-    with open("integration-tests/images/cow_beach.png", "rb") as image_file:
-        encoded_string = base64.b64encode(image_file.read())
-    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
-
-
 @pytest.mark.asyncio
 async def test_mllama_simpl(mllama, response_snapshot):
-    # chicken = get_chicken()
     response = await mllama.chat(
         max_tokens=10,
         temperature=0.0,
@@ -139,12 +139,14 @@ message Request {
     repeated uint32 slots = 10;
     /// LORA adapter index
     optional string adapter_id = 11;
-    /// Prefix length that can be retrieved from the KV cache.
-    uint32 prefix_len = 12;
+    /// Tokens that can be retrieved from the KV cache.
+    /// This value is set for the first prefill and never reset
+    uint32 cache_len = 12;
     /// Context truncation
     bool add_special_tokens = 13;
-    /// Postfix length for prefill chunking
-    uint32 postfix_len = 14;
+    /// Chunk of tokens that must be computed for the first prefill
+    /// This value is set for the first prefill and never reset
+    optional uint32 chunk_len = 14;
 }
 
 message Batch {
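
For illustration, a minimal sketch (assuming the Python stubs are regenerated from this proto; the values are made up) of how the renamed fields are read on the server side, mirroring the flash_causal_lm change below:

    from text_generation_server.pb import generate_pb2

    # `cache_len` replaces `prefix_len`; `chunk_len` replaces `postfix_len` and is
    # now optional, so field presence (not a sentinel value) signals a chunked prefill.
    req = generate_pb2.Request(cache_len=0)

    if req.HasField("chunk_len"):
        input_length = req.chunk_len  # prefill only this chunk of the prompt
    else:
        input_length = None  # prefill everything that is not already cached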
@@ -280,24 +280,36 @@ class FlashCausalLMBatch(Batch):
             prompt_length = len(tokenized_input)
             prompt_lengths.append(prompt_length)
 
-            cache_length = r.prefix_len
-            input_length = r.postfix_len
+            cache_length = r.cache_len
             assert (
                 cache_length <= prompt_length
             ), f"Prefix {cache_length} vs input {prompt_length}"
             if cache_length == prompt_length:
                 assert False, "unreachable"
-            if cache_length + input_length < prompt_length:
-                # FIXME: speculate is not supported for context chunking at the moment
-                assert speculate == 0
-                assert get_support_chunking()
-                assert input_length > 0
-
-            postfix_ids = tokenized_input[cache_length : cache_length + input_length]
-            assert (
-                len(postfix_ids) == input_length
-            ), "Rust and Python tokenizers are not aligned"
+            # `chunk_len` is an optional field in the protobuf
+            # It is only set if the model support chunking
+            if r.HasField("chunk_len"):
+                input_length = r.chunk_len
+
+                if cache_length + input_length < prompt_length:
+                    # FIXME: speculate is not supported for context chunking at the moment
+                    assert speculate == 0
+                    assert get_support_chunking()
+                    assert input_length > 0
+
+                postfix_ids = tokenized_input[
+                    cache_length : cache_length + input_length
+                ]
+                assert (
+                    len(postfix_ids) == input_length
+                ), "Rust and Python tokenizers are not aligned"
+            else:
+                # Use all the remaining ids
+                postfix_ids = tokenized_input[cache_length:]
+                input_length = len(postfix_ids)
+
             input_lengths.append(input_length)
 
             prefix_offsets.append(prompt_length - 5)
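
To make the new slicing concrete, a small self-contained sketch with made-up numbers (not part of the commit):

    tokenized_input = list(range(30))  # 30 prompt token ids (illustrative)
    cache_length = 10                  # tokens already present in the KV cache

    # Chunked prefill: the request carries chunk_len, so only that many ids are fed.
    chunk_len = 4
    postfix_ids = tokenized_input[cache_length : cache_length + chunk_len]
    assert postfix_ids == [10, 11, 12, 13]

    # No chunk_len: everything that is not cached is prefilled in one go.
    postfix_ids = tokenized_input[cache_length:]
    assert len(postfix_ids) == 20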
@@ -1097,6 +1109,7 @@ class FlashCausalLM(Model):
         head_size: Optional[int] = None,
         skip_special_tokens: bool = True,
         kv_cache_dtype: Optional[torch.dtype] = None,
+        support_chunking: bool = True,
     ):
         self.quantize = quantize
         self.process_group, rank, world_size = initialize_torch_distributed()
@@ -1224,7 +1237,7 @@ class FlashCausalLM(Model):
             rank=rank,
             world_size=world_size,
             sliding_window=config.sliding_window,
-            support_chunking=True,
+            support_chunking=support_chunking,
         )
 
     @property
@@ -1,14 +1,17 @@
-from io import BytesIO
-from PIL import Image
 import torch
 
+import numpy as np
+
 from typing import Iterable, Optional, Tuple, List, Dict
 from text_generation_server.pb.generate_pb2 import Request
+from io import BytesIO
+from PIL import Image
 from dataclasses import dataclass
 from opentelemetry import trace
 from transformers import (
     PreTrainedTokenizerBase,
 )
 
 from text_generation_server.models.vlm_causal_lm import VlmCausalLMBatch, VlmCausalLM
 from text_generation_server.pb import generate_pb2
 from text_generation_server.models.flash_causal_lm import (
@@ -167,6 +170,13 @@ class MllamaCausalLMBatch(VlmCausalLMBatch):
             batch.all_input_ids_tensor = batch.all_input_ids_tensor.clamp(
                 max=config.text_config.vocab_size - 1
             )
+            if isinstance(batch.input_ids, list):
+                if len(batch) > 1:
+                    input_ids = np.concatenate(batch.input_ids, dtype=np.int64)
+                else:
+                    input_ids = batch.input_ids[0]
+                batch.input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device)
+
             batch.input_ids = batch.input_ids.clamp(max=config.text_config.vocab_size - 1)
 
             if image_inputs is not None:
@@ -190,7 +200,7 @@ class MllamaCausalLMBatch(VlmCausalLMBatch):
 class MllamaCausalLM(VlmCausalLM):
     def forward(
         self,
-        batch: VlmCausalLMBatch,
+        batch: MllamaCausalLMBatch,
         adapter_data: Optional[Dict[str, torch.Tensor]] = None,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         # Model Forward
@@ -202,7 +212,7 @@ class MllamaCausalLM(VlmCausalLM):
             block_tables = batch.block_tables_tensor
             slots = batch.slots[batch.slot_indices]
             input_lengths = batch.input_lengths_tensor
-            max_s = batch.max_seqlen
+            max_s = batch.max_current_length
             lm_head_indices = batch.prefill_head_indices
 
             speculative_ids = batch.speculative_ids
@@ -221,8 +231,8 @@ class MllamaCausalLM(VlmCausalLM):
                 input_lengths = (
                     input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int
                 ).view(-1)
-                prefix_lens_tensor = (
-                    batch.prefix_lens_tensor.unsqueeze(-1).expand(B, new_length)
+                cache_lengths_tensor = (
+                    batch.cache_lengths_tensor.unsqueeze(-1).expand(B, new_length)
                 ).reshape(-1)
 
                 # Add Copy the block tables for all members
@@ -244,8 +254,8 @@ class MllamaCausalLM(VlmCausalLM):
             block_tables = batch.block_tables_tensor
             slots = batch.slots[batch.slot_indices]
             input_lengths = batch.input_lengths_tensor
-            prefix_lens_tensor = batch.prefix_lens_tensor
-            max_s = batch.max_seqlen
+            cache_lengths_tensor = batch.cache_lengths_tensor
+            max_s = batch.max_current_length
             lm_head_indices = batch.prefill_head_indices
 
         if cu_seqlen_prefill is None and self.max_past() is not None:
@@ -254,7 +264,6 @@ class MllamaCausalLM(VlmCausalLM):
             # This makes sure the max_s for the decode pass is correct.
             max_s = min(self.max_past(), max_s)
 
-        bs = input_ids.shape[0]
         # Try to find an associated cuda graph
         bs = input_ids.shape[0]
         sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs])
@@ -269,38 +278,25 @@ class MllamaCausalLM(VlmCausalLM):
             # Only run cuda graphs when there's no images.
             or batch.cross_attention_states is not None
         ):
-            input_lengths = input_lengths + prefix_lens_tensor
             if PREFIX_CACHING:
                 block_tables = block_tables_to_ragged(
                     block_tables=block_tables,
                     input_lengths=batch.input_lengths,
-                    prefix_lens=batch.prefix_lens,
+                    cache_lengths=batch.cache_lengths,
                 )
             with self._forward_context(
                 block_tables=block_tables,
                 cu_seqlen_prefill=cu_seqlen_prefill,
                 input_lengths_tensor=input_lengths,
-                prefix_lens_tensor=prefix_lens_tensor,
+                cache_lengths_tensor=cache_lengths_tensor,
             ):
-                max_k = (input_lengths + prefix_lens_tensor).max().item()
                 seqlen = Seqlen(
                     input_lengths=input_lengths,
-                    cache_lengths=prefix_lens_tensor,
+                    cache_lengths=cache_lengths_tensor,
                     cu_seqlen_q=cu_seqlen_prefill,
-                    max_q=max_s,
-                    max_k=max_k,
+                    max_q=batch.max_input_length,
+                    max_k=batch.max_current_length,
                 )
 
-                if batch.pixel_values is not None:
-                    cross_attention_states = self.model.vision_forward(
-                        pixel_values=batch.pixel_values,
-                        aspect_ratio_ids=batch.aspect_ratio_ids,
-                        aspect_ratio_mask=batch.aspect_ratio_mask,
-                    )
-                    batch.cross_attention_states = cross_attention_states
-
-                cross_attention_states = batch.cross_attention_states
-
                 logits, speculative_logits = self.model.forward(
                     input_ids=input_ids,
                     position_ids=position_ids,
@@ -312,14 +308,18 @@ class MllamaCausalLM(VlmCausalLM):
                     max_s=max_s,
                     prefill_cache_indices=batch.prefill_cache_indices,
                     lm_head_indices=lm_head_indices,
-                    cross_attention_states=cross_attention_states,
-                    adapter_data=adapter_data,
-                    image_indices=batch.image_indices[:],
+                    pixel_values=batch.pixel_values,
+                    pixel_attention_mask=batch.pixel_attention_mask,
+                    image_sizes=batch.image_sizes,
                 )
                 if batch.prefill_cache_indices is not None:
                     batch.prefill_cache_indices = None
                 if batch.pixel_values is not None:
                     batch.pixel_values = None
+                if batch.pixel_attention_mask is not None:
+                    batch.pixel_attention_mask = None
+                if batch.image_sizes is not None:
+                    batch.image_sizes = None
                 return logits, speculative_logits
 
             # Copy inputs to the static inputs of the cuda graph
@@ -330,22 +330,34 @@ class MllamaCausalLM(VlmCausalLM):
                 block_tables = block_tables_to_ragged(
                     block_tables=block_tables,
                     input_lengths=batch.input_lengths,
-                    prefix_lens=batch.prefix_lens,
+                    cache_lengths=batch.cache_lengths,
                 )
                 cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables
             else:
                 cuda_graph["block_tables"][
                     : block_tables.shape[0], : block_tables.shape[1]
                 ] = block_tables
 
+            # XXX: This is working only because block 0 is reserved for the healthcheck
+            # so it doesn't matter if we override it with bogus values.
             cuda_graph["slots"].fill_(0)
             cuda_graph["slots"][: slots.shape[0]] = slots
             cuda_graph["input_lengths"].zero_()
-            cuda_graph["input_lengths"][: input_lengths.shape[0]] = (
-                input_lengths + prefix_lens_tensor
-            )
+            cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths
+            cuda_graph["cache_lengths"].zero_()
+            cuda_graph["cache_lengths"][
+                : cache_lengths_tensor.shape[0]
+            ] = cache_lengths_tensor
 
-            # Replay the graph
-            cuda_graph["graph"].replay()
+            with self._forward_context(
+                block_tables=cuda_graph["block_tables"],
+                cu_seqlen_prefill=None,
+                input_lengths_tensor=cuda_graph["input_lengths"],
+                cache_lengths_tensor=cuda_graph["cache_lengths"],
+                state=cuda_graph["state"],
+            ):
+                # Replay the graph
+                cuda_graph["graph"].replay()
 
             # Slice output to the correct shape
             speculative_logits = (
@@ -271,6 +271,8 @@ class VlmCausalLM(FlashCausalLM):
             model_id=model_id,
             revision=revision,
             trust_remote_code=trust_remote_code,
+            # FIXME: VLM do not work with context chunking yet
+            support_chunking=False,
             **kwargs,
         )
 
@@ -356,7 +358,7 @@ class VlmCausalLM(FlashCausalLM):
         else:
             cuda_graph = None
         if cu_seqlen_prefill is not None or cuda_graph is None:
-            if PREFIX_CACHING:
+            if ATTENTION == "flashinfer":
                 block_tables = block_tables_to_ragged(
                     block_tables=block_tables,
                     input_lengths=batch.input_lengths,
@@ -368,13 +370,12 @@ class VlmCausalLM(FlashCausalLM):
                 input_lengths_tensor=input_lengths,
                 cache_lengths_tensor=cache_lengths_tensor,
             ):
-                max_k = (input_lengths + cache_lengths_tensor).max().item()
                 seqlen = Seqlen(
                     input_lengths=input_lengths,
                     cache_lengths=cache_lengths_tensor,
                     cu_seqlen_q=cu_seqlen_prefill,
-                    max_q=max_s,
-                    max_k=max_k,
+                    max_q=batch.max_input_length,
+                    max_k=batch.max_current_length,
                 )
                 logits, speculative_logits = self.model.forward(
                     input_ids=input_ids,
@@ -416,7 +417,10 @@ class VlmCausalLM(FlashCausalLM):
             cuda_graph["block_tables"][
                 : block_tables.shape[0], : block_tables.shape[1]
             ] = block_tables
-            cuda_graph["slots"].fill_(-1)
+            # XXX: This is working only because block 0 is reserved for the healthcheck
+            # so it doesn't matter if we override it with bogus values.
+            cuda_graph["slots"].fill_(0)
             cuda_graph["slots"][: slots.shape[0]] = slots
             cuda_graph["input_lengths"].zero_()
             cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths