Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-09-11 20:34:54 +00:00
commit 258ace7cd3
parent eb6a02a0f1

    fix
@@ -111,11 +111,11 @@ async fn block_allocator_task(
                     'slots: for block_id in blocks.repeat(repeats).iter() {
                         for s in (block_id * block_size)..((block_id + 1) * block_size) {
                             slots.push(s);
-                        }
-                        if slots.len() == tokens {
-                            break 'slots;
+                            if slots.len() == tokens {
+                                break 'slots;
+                            }
                         }
                     }
                     Some((blocks, slots))
                 };
                 response_sender.send(allocation).unwrap();
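The pre-fix loop only tested `slots.len()` after a whole block had been expanded, so an allocation could overshoot `tokens` by up to `block_size - 1` slots; moving the check inside the inner loop stops at exactly `tokens`. A minimal Python sketch of the fixed control flow (the names `blocks`, `block_size`, `repeats`, and `tokens` mirror the Rust locals; the function itself is illustrative, not the router's code):

def expand_blocks_to_slots(blocks, block_size, repeats, tokens):
    # Expand each block id into its contiguous slot range, stopping as soon
    # as exactly `tokens` slots have been collected (the fixed behaviour:
    # the length check now sits inside the inner loop).
    slots = []
    for block_id in blocks * repeats:  # list repetition stands in for Vec::repeat
        for s in range(block_id * block_size, (block_id + 1) * block_size):
            slots.append(s)
            if len(slots) == tokens:
                return slots  # plays the role of `break 'slots`
    return slots


# With the check placed after the inner loop (the pre-fix layout), a request
# for 5 tokens over blocks [7, 9] with block_size=4 would have collected all
# 8 slots of both blocks; the fixed version stops at exactly 5.
print(expand_blocks_to_slots([7, 9], block_size=4, repeats=1, tokens=5))
# [28, 29, 30, 31, 36]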
@@ -180,7 +180,7 @@ impl State {
         speculate: u32,
         max_batch_total_tokens: u32,
     ) -> Self {
-        let block_allocator = requires_padding
+        let block_allocator = (!requires_padding)
             .then(|| BlockAllocator::new(max_batch_total_tokens, block_size, window_size));

         Self {
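Rust's `bool::then` yields `Some(value)` only when the bool is true, so the old expression built a `BlockAllocator` precisely for the padded models that do not use paged KV-cache blocks, and skipped it for the flash-attention models that do. Negating the flag restores the intended condition. A sketch of the equivalent conditional in Python, with a stand-in allocator whose constructor shape is assumed from the diff:

from typing import Optional


class BlockAllocator:
    # Stand-in with an assumed shape, mirroring
    # BlockAllocator::new(max_batch_total_tokens, block_size, window_size).
    def __init__(self, max_batch_total_tokens: int, block_size: int,
                 window_size: Optional[int]):
        self.free_blocks = list(range(max_batch_total_tokens // block_size))
        self.window_size = window_size


def make_block_allocator(
    requires_padding: bool,
    max_batch_total_tokens: int,
    block_size: int,
    window_size: Optional[int],
) -> Optional[BlockAllocator]:
    # Equivalent of `(!requires_padding).then(|| BlockAllocator::new(...))`:
    # padded (non-flash) models manage their KV cache themselves, so only
    # non-padded batches get a paged-block allocator.
    if requires_padding:
        return None
    return BlockAllocator(max_batch_total_tokens, block_size, window_size)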
@@ -183,7 +183,6 @@ class FlashCausalLMBatch(Batch):

         block_tables = []
         slots = []
-        cumulative_blocks = 0

         # Parse batch
         for i, (r, tokenized_input) in enumerate(
@@ -233,15 +232,13 @@ class FlashCausalLMBatch(Batch):
             if not r.blocks:
                 needed_blocks = math.ceil(total_tokens / BLOCK_SIZE)
                 request_blocks = [
-                    b
-                    for b in range(cumulative_blocks, cumulative_blocks + needed_blocks)
+                    b for b in range(num_blocks, num_blocks + needed_blocks)
                 ]
                 request_slots = [
                     s
                     for b in request_blocks
                     for s in range(b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE)
                 ]
-                cumulative_blocks += needed_blocks
             else:
                 request_blocks = r.blocks
                 request_slots = r.slots
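These two hunks are one change: the batch-local `cumulative_blocks` counter is dropped, and the fallback path (taken only when a request carries no router-assigned `r.blocks`) numbers fresh blocks from `num_blocks`, which presumably tracks blocks already in use and is maintained outside this hunk. A runnable sketch of the fallback arithmetic, with an assumed BLOCK_SIZE value:

import math

BLOCK_SIZE = 16  # assumed here; the real value comes from the serving config


def fallback_blocks_and_slots(total_tokens: int, num_blocks: int):
    # Mirrors the fixed fallback path: allocate ceil(total_tokens / BLOCK_SIZE)
    # fresh blocks, numbered right after the `num_blocks` already in use.
    needed_blocks = math.ceil(total_tokens / BLOCK_SIZE)
    request_blocks = [b for b in range(num_blocks, num_blocks + needed_blocks)]
    request_slots = [
        s
        for b in request_blocks
        for s in range(b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE)
    ]
    return request_blocks, request_slots


blocks, slots = fallback_blocks_and_slots(total_tokens=20, num_blocks=3)
print(blocks)      # [3, 4]  (two blocks cover 20 tokens at BLOCK_SIZE=16)
print(len(slots))  # 32     (slots always span whole blocks)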
@@ -358,7 +355,7 @@ class FlashCausalLMBatch(Batch):

         slots = torch.tensor(slots, dtype=torch.int64, device=device)
         block_tables_tensor = torch.zeros(
-            (len(block_tables), max_blocks), dtype=torch.int64, device="cpu"
+            (len(block_tables), max_blocks), dtype=torch.int32, device="cpu"
         )
         for i, request_blocks in enumerate(block_tables):
             block_tables_tensor[i, : len(request_blocks)] = torch.tensor(request_blocks)
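Paged-attention kernels typically expect the block table as an int32 tensor, so building it as int64 meant a wider table and a cast further down; the fix narrows the dtype at construction. A small sketch of the padded block-table layout with example data:

import torch

# Example per-request block ids; in the batch code these come either from
# the router (r.blocks) or from the fallback path above.
block_tables = [[3, 4], [7], [1, 2, 5]]
max_blocks = max(len(b) for b in block_tables)

# Zero-padded (num_requests, max_blocks) table, built as int32 up front.
block_tables_tensor = torch.zeros(
    (len(block_tables), max_blocks), dtype=torch.int32, device="cpu"
)
for i, request_blocks in enumerate(block_tables):
    block_tables_tensor[i, : len(request_blocks)] = torch.tensor(request_blocks)

print(block_tables_tensor)
# tensor([[3, 4, 0],
#         [7, 0, 0],
#         [1, 2, 5]], dtype=torch.int32)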