diff --git a/backends/gaudi/server/text_generation_server/layers/attention/hpu.py b/backends/gaudi/server/text_generation_server/layers/attention/hpu.py
index 5aec87c2..8cca7a29 100644
--- a/backends/gaudi/server/text_generation_server/layers/attention/hpu.py
+++ b/backends/gaudi/server/text_generation_server/layers/attention/hpu.py
@@ -88,6 +88,7 @@ def attention(
     _, kv_head_num, head_size = key.shape
     query = query.view(bs, -1, head_num, head_size).transpose(1, 2)
     key = key.view(bs, -1, kv_head_num, head_size).transpose(1, 2)
+    value = value.view(bs, -1, kv_head_num, head_size).transpose(1, 2)
     attn_output = fsdpa_op(
         query,
         key,
diff --git a/backends/gaudi/server/text_generation_server/layers/tensor_parallel.py b/backends/gaudi/server/text_generation_server/layers/tensor_parallel.py
index ae60e7aa..8f19174f 100644
--- a/backends/gaudi/server/text_generation_server/layers/tensor_parallel.py
+++ b/backends/gaudi/server/text_generation_server/layers/tensor_parallel.py
@@ -155,7 +155,6 @@ class TensorParallelColumnLinear(SuperLayer):

     @classmethod
     def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int):
-        print(f"bias: {bias}")
         if config.quantize == "exl2":
             linears = []
             for prefix in prefixes:
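
Note on the first hunk: the patch adds the same view/transpose for value that query and key already receive before the fused SDPA call. The sketch below is not the TGI/HPU code itself; it substitutes torch.nn.functional.scaled_dot_product_attention for the HPU-specific fsdpa_op, and the shapes and names are assumptions chosen only to illustrate why all three tensors need the same [bs, heads, seq_len, head_size] layout.

    # Minimal sketch, assuming a flat [bs * seq_len, heads, head_size] input layout
    # and using torch's scaled_dot_product_attention as a stand-in for fsdpa_op.
    import torch
    import torch.nn.functional as F

    bs, seq_len, head_num, kv_head_num, head_size = 2, 16, 8, 8, 64

    query = torch.randn(bs * seq_len, head_num, head_size)
    key = torch.randn(bs * seq_len, kv_head_num, head_size)
    value = torch.randn(bs * seq_len, kv_head_num, head_size)

    # Reshape all three tensors to [bs, heads, seq_len, head_size].
    query = query.view(bs, -1, head_num, head_size).transpose(1, 2)
    key = key.view(bs, -1, kv_head_num, head_size).transpose(1, 2)
    value = value.view(bs, -1, kv_head_num, head_size).transpose(1, 2)  # the line the patch adds

    # Without the value reshape, value would still be 3-D while query/key are 4-D,
    # and the fused attention kernel would see mismatched layouts.
    attn_output = F.scaled_dot_product_attention(query, key, value)
    print(attn_output.shape)  # torch.Size([2, 8, 16, 64])

The second hunk only removes a leftover debug print from TensorParallelColumnLinear.load_multi and needs no further illustration.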