feat: prefer triton ops and batch conv

2025-09-11 04:14:52 +00:00 · 2024-02-06 20:38:28 +00:00 · 2024-02-06 20:38:28 +00:00 · 5e102183d8
commit 5e102183d8
parent 8319e854c8
2 changed files with 15 additions and 24 deletions
--- a/server/text_generation_server/models/custom_modeling/mamba_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/mamba_modeling.py
@ -1,6 +1,7 @@
 import torch
 import torch.distributed

+from mamba_ssm.ops.triton.selective_state_update import selective_state_update
 from mamba_ssm.ops.selective_scan_interface import selective_scan_fn
 from mamba_ssm.utils.generation import InferenceParams
 from torch import nn
@ -14,7 +15,7 @@ from text_generation_server.utils.layers import (
    FastLinear,
 )

-from einops import rearrange, repeat
+from einops import rearrange
 from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
 import math

@ -118,35 +119,29 @@ class MambaBlock(nn.Module):
        _xz = self.in_proj(hidden_states)
        _x, _z = _xz.chunk(2, dim=-1)  # (B D)
        conv_state_new = torch.cat([conv_state, _x.transpose(1,2)], dim=-1)
-        conv_out = causal_conv1d_fn( x=conv_state_new, weight=self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2)), bias=self.conv1d.bias, activation=self.activation)
+        conv_out = causal_conv1d_fn( 
+            x=conv_state_new, 
+            weight=self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2)), 
+            bias=self.conv1d.bias, 
+            activation=self.activation
+        )
        conv_state = conv_state_new[:, :, 1:]
-
        bsz, seqlen, dim = hidden_states.shape
-        # empty output tensor for the loop
        output_tensor = torch.zeros(
            (bsz, seqlen, dim),
            device=hidden_states.device, 
            dtype=hidden_states.dtype
        )
-
        for i in range(0, bsz):
            x = conv_out[i:i+1,:,-1]
            z = _z[i:i+1, -1, :]
            x_db = self.x_proj(x)
            dt, B, C = torch.split(x_db, [self.dt_rank, self.d_state, self.d_state], dim=-1)
-            dt = self.dt_proj_no_bias(dt)
-            dt = F.softplus(dt + self.dt_proj.bias).view((dt.size(1), -1))
-            dA = torch.exp(dt * self.negA)
-            dB = dt * B.view(-1, B.size(-1))
-            x_shape = (-1, x.size(-1), 1)
-            ssm_state[i] = (ssm_state[i] * dA + dB * x.view(x_shape)) 
-            c_shape = (C.size(0), C.size(1), -1)
-            out_mm_shape = (C.size(0), -1)
-            out = torch.matmul(ssm_state[i].to(C.dtype), C.view(c_shape)).view(out_mm_shape)
-            # in-place ops
-            out.add_((x * self.D).to(out.dtype))
-            out.mul_(F.silu(z))
-            out = self.out_proj(out)
+            df = self.dt_proj_no_bias(x)
+            y = selective_state_update(
+                ssm_state[i:i+1,:,:], x, dt, self.negA, B, C, self.D, z=z, dt_bias=self.dt_proj.bias, dt_softplus=True
+            )
+            out = self.out_proj(y)
            output_tensor[i] = out

        return output_tensor, conv_state, ssm_state
--- a/server/text_generation_server/models/mamba.py
+++ b/server/text_generation_server/models/mamba.py
@ -344,12 +344,8 @@ class MambaBatch(Batch):
            for i in range(n_blocks):
                conv_state, ssm_state = batch.inference_params.key_value_memory_dict[i]
                batch_size = batch.inference_params.max_batch_size
-                try:
-                    inference_params.key_value_memory_dict[i][0][current_batch:current_batch + batch_size] = conv_state
-                    inference_params.key_value_memory_dict[i][1][current_batch:current_batch + batch_size] = ssm_state
-                except Exception:
-                    import ipdb;ipdb.set_trace()
-                    pass
+                inference_params.key_value_memory_dict[i][0][current_batch:current_batch + batch_size] = conv_state
+                inference_params.key_value_memory_dict[i][1][current_batch:current_batch + batch_size] = ssm_state
                inference_params.lengths_per_sample[current_batch: current_batch + batch_size] = batch.inference_params.lengths_per_sample
            current_batch += batch_size