mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 20:34:54 +00:00
WhaT?
This commit is contained in:
parent
ee47973a2f
commit
66b2015586
@ -511,33 +511,18 @@ class BaseFlashMistral(FlashCausalLM):
|
|||||||
cuda_graph = self.cuda_graphs.get(padded_bs, None)
|
cuda_graph = self.cuda_graphs.get(padded_bs, None)
|
||||||
|
|
||||||
if cu_seqlen_prefill is not None or cuda_graph is None:
|
if cu_seqlen_prefill is not None or cuda_graph is None:
|
||||||
|
logits, speculative_logits = self.model.forward(
|
||||||
if cu_seqlen_prefill is None:
|
input_ids=input_ids,
|
||||||
logits, speculative_logits = self.compiled_model(
|
position_ids=position_ids,
|
||||||
input_ids=input_ids,
|
cu_seqlen_prefill=cu_seqlen_prefill,
|
||||||
position_ids=position_ids,
|
kv_cache=kv_cache,
|
||||||
cu_seqlen_prefill=cu_seqlen_prefill,
|
block_tables=block_tables,
|
||||||
kv_cache=kv_cache,
|
slots=slots,
|
||||||
block_tables=block_tables,
|
input_lengths=input_lengths,
|
||||||
slots=slots,
|
max_s=max_s,
|
||||||
input_lengths=input_lengths,
|
prefill_cache_indices=batch.prefill_cache_indices,
|
||||||
max_s=max_s,
|
lm_head_indices=lm_head_indices,
|
||||||
prefill_cache_indices=batch.prefill_cache_indices,
|
)
|
||||||
lm_head_indices=lm_head_indices,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logits, speculative_logits = self.model.forward(
|
|
||||||
input_ids=input_ids,
|
|
||||||
position_ids=position_ids,
|
|
||||||
cu_seqlen_prefill=cu_seqlen_prefill,
|
|
||||||
kv_cache=kv_cache,
|
|
||||||
block_tables=block_tables,
|
|
||||||
slots=slots,
|
|
||||||
input_lengths=input_lengths,
|
|
||||||
max_s=max_s,
|
|
||||||
prefill_cache_indices=batch.prefill_cache_indices,
|
|
||||||
lm_head_indices=lm_head_indices,
|
|
||||||
)
|
|
||||||
if batch.prefill_cache_indices is not None:
|
if batch.prefill_cache_indices is not None:
|
||||||
batch.prefill_cache_indices = None
|
batch.prefill_cache_indices = None
|
||||||
return logits, speculative_logits
|
return logits, speculative_logits
|
||||||
|
Loading…
Reference in New Issue
Block a user