mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-20 22:32:07 +00:00
Fix the free algorithm to handle cases where the common prefix is
shorter than the allocation's cached prefix.
This commit is contained in:
parent
9c839ca5df
commit
bef2f6bdaa
@ -172,7 +172,6 @@ impl Allocator for RadixAllocator {
|
|||||||
)
|
)
|
||||||
// Unwrap, failing is a programming error.
|
// Unwrap, failing is a programming error.
|
||||||
.expect("Failed to store prefill tokens");
|
.expect("Failed to store prefill tokens");
|
||||||
|
|
||||||
// We can have a prefill with the following structure:
|
// We can have a prefill with the following structure:
|
||||||
//
|
//
|
||||||
// |---| From the prefix cache.
|
// |---| From the prefix cache.
|
||||||
@ -182,12 +181,14 @@ impl Allocator for RadixAllocator {
|
|||||||
// This means that while processing this request there was a
|
// This means that while processing this request there was a
|
||||||
// partially overlapping request that had A..=E in its
|
// partially overlapping request that had A..=E in its
|
||||||
// prefill. In this case we need to free the blocks D E.
|
// prefill. In this case we need to free the blocks D E.
|
||||||
|
if prefix_len > allocation.cached_prefix_len {
|
||||||
self.free_blocks.extend(
|
self.free_blocks.extend(
|
||||||
&blocks[allocation.cached_prefix_len / self.block_size as usize
|
&blocks[allocation.cached_prefix_len / self.block_size as usize
|
||||||
..prefix_len / self.block_size as usize],
|
..prefix_len / self.block_size as usize],
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Free non-prefill blocks.
|
// Free non-prefill blocks.
|
||||||
self.free_blocks
|
self.free_blocks
|
||||||
|
Loading…
Reference in New Issue
Block a user