Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-11 20:34:54 +00:00)
lint fix.
parent 5c8c5ac81a
commit fc41f0784a
@@ -1,6 +1,4 @@
 from dataclasses import dataclass
-from text_generation_server.utils.import_utils import SYSTEM
-from text_generation_server.models.globals import ATTENTION
 import torch
 from typing import Optional
 
@@ -52,53 +50,3 @@ class Seqlen:
     def clamp(self, max):
         # Flash decoding doesn't need to clamp
         return self
-=======
-if ATTENTION in {"flashinfer", "flashdecoding"}:
-
-    @dataclass
-    class Seqlen:
-        input_lengths: torch.Tensor
-        prefix_lengths: torch.Tensor
-        cu_seqlen_q: Optional[torch.Tensor]
-        cu_seqlen_k: Optional[torch.Tensor]
-        max_q: int
-        max_k: int
-
-        def __init__(
-            self,
-            input_lengths,
-            prefix_lengths,
-            cu_seqlen_q=None,
-            max_q=None,
-            max_k=None,
-        ):
-            self.input_lengths = input_lengths
-            self.prefix_lengths = prefix_lengths
-            device = self.input_lengths.device
-            shape = self.input_lengths.shape
-            if cu_seqlen_q is None:
-                cu_seqlen_q = torch.arange(
-                    shape[0] + 1,
-                    device=device,
-                    dtype=torch.int32,
-                )
-                max_q = 1
-            else:
-                assert max_q is not None
-                assert max_k is not None
-            cu_seqlen_k = torch.zeros(shape[-1] + 1, device=device, dtype=torch.int32)
-
-            # cuda graphs don't like this and this is necessary to clamp within mistral
-            # Although FA2 might not want the clamping
-            # cu_seqlen_k[0] = 0
-            total = self.input_lengths + self.prefix_lengths
-            torch.cumsum(total, -1, out=cu_seqlen_k[1:])
-
-            self.cu_seqlen_q = cu_seqlen_q
-            self.cu_seqlen_k = cu_seqlen_k
-            self.max_q = max_q
-            self.max_k = max_k
-
-        def clamp(self, max):
-            # Flash decoding doesn't need to clamp
-            return self
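The block removed by this hunk is everything after the stray `=======` conflict marker: a second, conditionally defined copy of Seqlen apparently left behind by an earlier merge, which this lint pass deletes together with the now-unused SYSTEM and ATTENTION imports in the first hunk. For reference, the deleted constructor builds the cu_seqlen_q / cu_seqlen_k offset tensors used by the variable-length attention paths; below is a minimal standalone sketch of that computation with made-up batch lengths (only the tensor names are taken from the removed code).

import torch

# Illustrative decode batch of 3 requests: one new token each, plus cached prefixes.
input_lengths = torch.tensor([1, 1, 1], dtype=torch.int32)
prefix_lengths = torch.tensor([7, 12, 3], dtype=torch.int32)

batch_size = input_lengths.shape[0]

# Query offsets: with a single new token per request this is simply 0..batch_size.
cu_seqlen_q = torch.arange(batch_size + 1, dtype=torch.int32)

# Key offsets: cumulative sum of the total (cached + new) length per request,
# written into positions 1.., exactly as the removed __init__ does with cumsum.
total = input_lengths + prefix_lengths
cu_seqlen_k = torch.zeros(batch_size + 1, dtype=torch.int32)
torch.cumsum(total, -1, out=cu_seqlen_k[1:])

print(cu_seqlen_q)  # tensor([0, 1, 2, 3], dtype=torch.int32)
print(cu_seqlen_k)  # tensor([ 0,  8, 21, 25], dtype=torch.int32)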
@@ -243,6 +243,7 @@ if ATTENTION == "flashinfer":
             sm_scale=softmax_scale,
             window_left=window_size_left,
         )
+
 elif ATTENTION == "flashdecoding":
     if V2:
 
@@ -351,6 +352,7 @@ elif ATTENTION == "flashdecoding":
                 None,
             )
             return out
+
 elif ATTENTION == "paged":
     if V2:
 
@@ -459,6 +461,7 @@ elif ATTENTION == "paged":
                 None,
             )
             return out
+
 else:
     raise RuntimeError(f"Unknwon attention {ATTENTION}")
 
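The three single-line hunks above touch the module-level backend dispatch: each adds one blank line before the next elif/else branch, a pure formatting change. For context, the shape of that dispatch is sketched below with stub bodies and a placeholder signature; everything except the if/elif/else structure is an assumption, and ATTENTION here is a plain variable rather than the value imported from text_generation_server.models.globals.

import os

# Stand-in for the real ATTENTION global selected at startup.
ATTENTION = os.environ.get("ATTENTION", "paged")

if ATTENTION == "flashinfer":

    def attention(q, k, v, softmax_scale, window_size_left=-1):
        raise NotImplementedError("flashinfer stub")

elif ATTENTION == "flashdecoding":

    def attention(q, k, v, softmax_scale, window_size_left=-1):
        raise NotImplementedError("flashdecoding stub")

elif ATTENTION == "paged":

    def attention(q, k, v, softmax_scale, window_size_left=-1):
        raise NotImplementedError("paged stub")

else:
    raise RuntimeError(f"Unknown attention {ATTENTION}")

Because the branch is taken once at import time, callers always see a single attention function and pay no per-call backend check.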
@@ -1679,9 +1679,9 @@ class FlashCausalLM(Model):
         cuda_graph["input_lengths"].zero_()
         cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths
         cuda_graph["cache_lengths"].zero_()
-        cuda_graph["cache_lengths"][: cache_lengths_tensor.shape[0]] = (
-            cache_lengths_tensor
-        )
+        cuda_graph["cache_lengths"][
+            : cache_lengths_tensor.shape[0]
+        ] = cache_lengths_tensor
 
         with self._forward_context(
             block_tables=cuda_graph["block_tables"],