Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-07-07 00:10:17 +00:00)
Trying to fix non chunking targets.
This commit is contained in:
parent a31db04709
commit 0a01dde986
@@ -1398,6 +1398,7 @@ class FlashCausalLM(Model):
         total_cache_size = self.num_layers * cache_block_size * 2 * dtype_size

         if max_total_tokens is None:
+            if get_support_chunking():
                 model_max_length = self.tokenizer.model_max_length
                 free_memory = get_free_memory(self.device, MEMORY_FRACTION)
                 spare_blocks = (
@@ -1411,9 +1412,18 @@ class FlashCausalLM(Model):
                 batch.num_blocks = available_blocks
                 batch.max_blocks = available_blocks
                 max_input_tokens = (
-                    available_blocks - 1 if max_input_tokens is None else max_input_tokens
+                    available_blocks - 1
+                    if max_input_tokens is None
+                    else max_input_tokens
                 )
                 max_total_tokens = available_blocks
+            else:
+                max_total_tokens = batch.num_blocks
+                max_input_tokens = (
+                    batch.num_blocks - 1
+                    if max_input_tokens is None
+                    else max_input_tokens
+                )

         try:
             self.init_kv_cache(
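As far as these hunks show, the change keeps the existing sizing path when chunking is supported and adds a fallback that derives the token limits from the already-allocated batch.num_blocks when it is not. The sketch below only illustrates that branching; resolve_token_limits, supports_chunking, available_blocks, and warmup_blocks are hypothetical stand-in names for illustration, not the upstream FlashCausalLM internals or the real get_support_chunking() API.

from typing import Optional, Tuple


def resolve_token_limits(
    supports_chunking: bool,
    available_blocks: int,
    warmup_blocks: int,
    max_input_tokens: Optional[int] = None,
    max_total_tokens: Optional[int] = None,
) -> Tuple[Optional[int], Optional[int]]:
    """Illustrative sketch of the branch this commit adds (names are hypothetical)."""
    if max_total_tokens is None:
        if supports_chunking:
            # Chunking path (pre-existing): size the limits from the block
            # budget computed out of free memory.
            if max_input_tokens is None:
                max_input_tokens = available_blocks - 1
            max_total_tokens = available_blocks
        else:
            # Non-chunking path (added by this commit): fall back to the
            # blocks already allocated for the warmup batch.
            max_total_tokens = warmup_blocks
            if max_input_tokens is None:
                max_input_tokens = warmup_blocks - 1
    return max_input_tokens, max_total_tokens


# Non-chunking target: limits come from the warmup batch's block count.
print(resolve_token_limits(False, available_blocks=1024, warmup_blocks=512))
# (511, 512)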