From 004e164bdb1c8676abf0c59e9661f42daa616d8f Mon Sep 17 00:00:00 2001
From: ldh2020 <62470572+ldh2020@users.noreply.github.com>
Date: Sun, 21 Dec 2025 11:18:00 +0800
Subject: [PATCH] [Kernel] Optimize the recurrent op

---
 vllm_kunlun/ops/fla/fused_recurrent.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm_kunlun/ops/fla/fused_recurrent.py b/vllm_kunlun/ops/fla/fused_recurrent.py
index 143b6a0..3902bee 100644
--- a/vllm_kunlun/ops/fla/fused_recurrent.py
+++ b/vllm_kunlun/ops/fla/fused_recurrent.py
@@ -44,6 +44,7 @@ class FusedRecurrentFunction(torch.autograd.Function):
             h0_indices=ssm_state_indices,
             num_accepted_tokens=num_accepted_tokens,
             use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel,
+            is_h0_transposed=True
         )
         return o, final_state
 
@@ -150,4 +151,4 @@ def fused_recurrent_gated_delta_rule(
         num_accepted_tokens,
         use_qk_l2norm_in_kernel,
     )
-    return o, final_state
\ No newline at end of file
+    return o, final_state