match the latest vllm_extension ops

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Author: Wang, Yi A <yi.a.wang@intel.com>
Date:   2025-04-10 19:32:32 -07:00
Commit: 4cdc34ec4d
Parent: 610dd200e5
2 changed files with 2 additions and 2 deletions


@@ -68,7 +68,7 @@ def paged_attention(
 ):
     batch_size, head_num, head_size = query.shape
     output = ops.flat_pa(
-        query=query,
+        query=query.view(batch_size, 1, head_num * head_size),
         key_cache=kv_cache.key,
         value_cache=kv_cache.value,
         block_list=hpu_attention_meta.block_list,
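
The reshape above suggests that the updated flat_pa op expects the decode-time query flattened into a single hidden dimension rather than split per head. A minimal sketch of the shape change; the concrete sizes below are made up purely for illustration and are not taken from this commit:

import torch

# Hypothetical sizes, chosen only to illustrate the view() in the hunk above.
batch_size, head_num, head_size = 4, 32, 128
query = torch.randn(batch_size, head_num, head_size)

# Collapse the per-head layout into (batch, 1, head_num * head_size),
# which is the shape now passed to ops.flat_pa as `query`.
query_flat = query.view(batch_size, 1, head_num * head_size)
assert query_flat.shape == (batch_size, 1, head_num * head_size)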


@@ -11,7 +11,7 @@ from text_generation_server.utils.weights import (
 )
 from vllm_hpu_extension.ops import scaled_fp8_quant
-from vllm_hpu_extension.ops import get_hpu_gaudi2_scale_factor, is_hpu_gaudi2
+from vllm_hpu_extension.scales import get_hpu_gaudi2_scale_factor, is_hpu_gaudi2
 import habana_frameworks.torch.utils.experimental as htexp
 w8a8_block_fp8_matmul = None
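
The hunk above only relocates the import: get_hpu_gaudi2_scale_factor and is_hpu_gaudi2 now come from vllm_hpu_extension.scales instead of vllm_hpu_extension.ops. A rough, illustrative sketch of how such helpers are commonly applied when adjusting fp8 scales on Gaudi2; adjust_fp8_scale is a hypothetical name and the exact usage in the surrounding file may differ:

from vllm_hpu_extension.scales import get_hpu_gaudi2_scale_factor, is_hpu_gaudi2

def adjust_fp8_scale(scale):
    # Hypothetical helper: on Gaudi2 the fp8 format has a narrower range,
    # so dequantization scales are typically multiplied by a
    # hardware-specific correction factor on that device.
    if is_hpu_gaudi2():
        scale = scale * get_hpu_gaudi2_scale_factor()
    return scale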