flashinfer: remove contiguous calls

This commit is contained in:
Daniël de Kok 2025-01-06 16:07:58 +00:00
parent 02e3dc49be
commit 6defe57d7a

View File

@@ -60,8 +60,7 @@ def paged_attention(
         from text_generation_server.layers.attention.flashinfer import decode_state
         return decode_state.get().forward(
-            # TODO: remove `contiguous` call once https://github.com/flashinfer-ai/flashinfer/pull/553 is merged.
-            query.contiguous(),
+            query,
             paged_kv_cache=(kv_cache.key, kv_cache.value),
             logits_soft_cap=softcap,
             sm_scale=softmax_scale,
@@ -231,8 +230,7 @@ def attention(
         softcap = 0.0
     return prefill_with_paged_kv_state.get().forward(
-        # TODO: remove `contiguous` call once https://github.com/flashinfer-ai/flashinfer/pull/553 is merged.
-        query.contiguous(),
+        query,
         causal=causal,
         paged_kv_cache=(kv_cache.key, kv_cache.value),
         logits_soft_cap=softcap,