Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-20 22:32:07 +00:00)
match the latest vllm_extension ops
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Parent: 610dd200e5
Commit: 4cdc34ec4d
@@ -68,7 +68,7 @@ def paged_attention(
 ):
     batch_size, head_num, head_size = query.shape
     output = ops.flat_pa(
-        query=query,
+        query=query.view(batch_size, 1, head_num * head_size),
         key_cache=kv_cache.key,
         value_cache=kv_cache.value,
         block_list=hpu_attention_meta.block_list,
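
The functional change in this hunk is the query reshape: the updated flat_pa op in vllm_hpu_extension evidently takes the query as (batch, seq=1, hidden) rather than (batch, head_num, head_size). A minimal sketch of that reshape, with illustrative shapes assumed for the example (flat_pa itself and the cache arguments are not reproduced here):

import torch

# Illustrative decode-time shapes; one token per sequence.
batch_size, head_num, head_size = 4, 32, 128
query = torch.randn(batch_size, head_num, head_size)

# Old call: query passed as-is, shape (batch, head_num, head_size).
# New call: heads flattened into one hidden dim, with an explicit
# sequence-length-1 axis, shape (batch, 1, head_num * head_size).
query_3d = query.view(batch_size, 1, head_num * head_size)
assert query_3d.shape == (4, 1, 32 * 128)

Note that .view() requires a contiguous tensor; .reshape() is the safe alternative when contiguity is not guaranteed.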
@@ -11,7 +11,7 @@ from text_generation_server.utils.weights import (
 )
 
 from vllm_hpu_extension.ops import scaled_fp8_quant
-from vllm_hpu_extension.ops import get_hpu_gaudi2_scale_factor, is_hpu_gaudi2
+from vllm_hpu_extension.scales import get_hpu_gaudi2_scale_factor, is_hpu_gaudi2
 import habana_frameworks.torch.utils.experimental as htexp
 
 w8a8_block_fp8_matmul = None
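
This hunk only tracks an upstream relocation: get_hpu_gaudi2_scale_factor and is_hpu_gaudi2 moved from vllm_hpu_extension.ops to vllm_hpu_extension.scales. Code that must import cleanly against both old and new releases of the extension could hedge with a fallback import; a sketch, not part of this commit:

# Prefer the new vllm_hpu_extension.scales location, falling back to
# the pre-move vllm_hpu_extension.ops location for older releases.
try:
    from vllm_hpu_extension.scales import (
        get_hpu_gaudi2_scale_factor,
        is_hpu_gaudi2,
    )
except ImportError:
    from vllm_hpu_extension.ops import (
        get_hpu_gaudi2_scale_factor,
        is_hpu_gaudi2,
    )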