Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-11 20:34:54 +00:00)
Fixing dtype + AMD, Ipex targets.
This commit is contained in:
parent
4fa4da3cb6
commit
fa491e730b
@@ -66,6 +66,7 @@ def paged_attention(
     softcap: Optional[float] = None,
 ):
     out = torch.empty_like(query)
+    input_lengths = seqlen.input_lengths + seqlen.cache_lengths
     ipex.llm.modules.PagedAttention.single_query_cached_kv_attention(
         out,
         query,
@@ -74,7 +75,7 @@ def paged_attention(
         kv_head_mapping,
         softmax_scale,
         block_tables,
-        seqlen.input_lengths,
+        input_lengths,
         BLOCK_SIZE,
         max_s,
         None,
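The two hunks above change the IPEX paged-attention path so the per-sequence length handed to single_query_cached_kv_attention covers tokens already resident in the KV cache as well as the ones added in this step. A minimal sketch of that arithmetic, assuming a stand-in Seqlen dataclass (TGI's real Seqlen carries more fields than this):

```python
from dataclasses import dataclass

import torch


@dataclass
class Seqlen:
    input_lengths: torch.Tensor  # tokens added in this forward pass
    cache_lengths: torch.Tensor  # tokens already stored in the KV cache


seqlen = Seqlen(
    input_lengths=torch.tensor([1, 1], dtype=torch.int32),    # decode: 1 new token each
    cache_lengths=torch.tensor([17, 42], dtype=torch.int32),  # previously cached prefix
)

# Total number of KV entries each query must attend over.
input_lengths = seqlen.input_lengths + seqlen.cache_lengths
print(input_lengths)  # tensor([18, 43], dtype=torch.int32)
```

Passing only seqlen.input_lengths would make the kernel skip the cached prefix entirely, which is why the backend now uses the summed value.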
@@ -104,7 +104,7 @@ def paged_attention(
         _PARTITION_SIZE = _PARTITION_SIZE_CUSTOM

     max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
-    input_lengths = seqlen.input_lengths
+    input_lengths = seqlen.input_lengths + seqlen.cache_lengths

     out = torch.empty_like(query)
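This hunk applies the same length fix to the other paged_attention implementation (the AMD/ROCm backend, going by the commit title). The unchanged context line above it picks the number of attention partitions via integer ceiling division; a tiny self-contained check of that idiom, using an illustrative partition size rather than the backend's real constant:

```python
# Illustrative only: 512 is a placeholder, not the backend's actual
# _PARTITION_SIZE constant.
_PARTITION_SIZE = 512


def max_num_partitions(max_s: int) -> int:
    # (a + b - 1) // b is ceiling division: it rounds max_s / _PARTITION_SIZE
    # up, so a final partially filled partition is still scheduled.
    return (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE


assert max_num_partitions(512) == 1  # exactly one full partition
assert max_num_partitions(513) == 2  # one extra token forces a second partition
```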
@@ -1917,7 +1917,7 @@ class FlashCausalLM(Model):
             batch.speculative_ids = speculative_ids
             batch.position_ids = next_position_ids + accepted_ids
             batch.cache_lengths_tensor += batch.input_lengths_tensor
-            batch.input_lengths_tensor = accepted_ids
+            batch.input_lengths_tensor = accepted_ids.to(dtype=torch.int32)
             batch.slot_indices += accepted_ids
             batch.adapter_meta.adapter_indices = next_adapter_indices
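This last hunk is the dtype part of the commit title. accepted_ids comes out of speculative decoding as a default-dtype (int64) tensor, so plain assignment rebinds batch.input_lengths_tensor to an int64 tensor, while the length tensors consumed by the attention backends are presumably expected to stay int32, hence the explicit cast. A standalone reproduction of the silent dtype switch (not TGI code):

```python
import torch

# Length tensors are created as int32, the dtype the attention kernels consume.
input_lengths_tensor = torch.tensor([5, 7], dtype=torch.int32)

# accepted_ids from speculative decoding uses torch's default integer dtype.
accepted_ids = torch.tensor([2, 1])
print(accepted_ids.dtype)  # torch.int64

# Plain assignment rebinds the name to the int64 tensor...
input_lengths_tensor = accepted_ids
print(input_lengths_tensor.dtype)  # torch.int64

# ...while the commit's fix keeps the expected dtype.
input_lengths_tensor = accepted_ids.to(dtype=torch.int32)
print(input_lengths_tensor.dtype)  # torch.int32
```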