Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-20 22:32:07 +00:00)
match the latest vllm_extension ops
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Parent: 610dd200e5
Commit: 4cdc34ec4d
@@ -68,7 +68,7 @@ def paged_attention(
 ):
     batch_size, head_num, head_size = query.shape
     output = ops.flat_pa(
-        query=query,
+        query=query.view(batch_size, 1, head_num * head_size),
         key_cache=kv_cache.key,
         value_cache=kv_cache.value,
         block_list=hpu_attention_meta.block_list,
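
The functional change in this hunk is the query reshape: the updated flat_pa op in vllm_hpu_extension evidently takes the query as (batch, seq=1, hidden) rather than (batch, head_num, head_size). A minimal sketch of that reshape, with illustrative shapes assumed for the example (flat_pa itself and the cache arguments are not reproduced here):

import torch

# Illustrative decode-time shapes; one token per sequence.
batch_size, head_num, head_size = 4, 32, 128
query = torch.randn(batch_size, head_num, head_size)

# Old call: query passed as-is, shape (batch, head_num, head_size).
# New call: heads flattened into one hidden dim, with an explicit
# sequence-length-1 axis, shape (batch, 1, head_num * head_size).
query_3d = query.view(batch_size, 1, head_num * head_size)
assert query_3d.shape == (4, 1, 32 * 128)

Note that .view() requires a contiguous tensor; .reshape() is the safe alternative when contiguity is not guaranteed.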
@@ -11,7 +11,7 @@ from text_generation_server.utils.weights import (
 )
 
 from vllm_hpu_extension.ops import scaled_fp8_quant
-from vllm_hpu_extension.ops import get_hpu_gaudi2_scale_factor, is_hpu_gaudi2
+from vllm_hpu_extension.scales import get_hpu_gaudi2_scale_factor, is_hpu_gaudi2
 import habana_frameworks.torch.utils.experimental as htexp
 
 w8a8_block_fp8_matmul = None
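
This hunk only tracks an upstream relocation: get_hpu_gaudi2_scale_factor and is_hpu_gaudi2 moved from vllm_hpu_extension.ops to vllm_hpu_extension.scales. Code that must import cleanly against both old and new releases of the extension could hedge with a fallback import; a sketch, not part of this commit:

# Prefer the new vllm_hpu_extension.scales location, falling back to
# the pre-move vllm_hpu_extension.ops location for older releases.
try:
    from vllm_hpu_extension.scales import (
        get_hpu_gaudi2_scale_factor,
        is_hpu_gaudi2,
    )
except ImportError:
    from vllm_hpu_extension.ops import (
        get_hpu_gaudi2_scale_factor,
        is_hpu_gaudi2,
    )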