From 004e164bdb1c8676abf0c59e9661f42daa616d8f Mon Sep 17 00:00:00 2001 From: ldh2020 <62470572+ldh2020@users.noreply.github.com> Date: Sun, 21 Dec 2025 11:18:00 +0800 Subject: [PATCH] [Kernel] Optimize the recurrent op --- vllm_kunlun/ops/fla/fused_recurrent.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_kunlun/ops/fla/fused_recurrent.py b/vllm_kunlun/ops/fla/fused_recurrent.py index 143b6a0..3902bee 100644 --- a/vllm_kunlun/ops/fla/fused_recurrent.py +++ b/vllm_kunlun/ops/fla/fused_recurrent.py @@ -44,6 +44,7 @@ class FusedRecurrentFunction(torch.autograd.Function): h0_indices=ssm_state_indices, num_accepted_tokens=num_accepted_tokens, use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel, + is_h0_transposed=True ) return o, final_state @@ -150,4 +151,4 @@ def fused_recurrent_gated_delta_rule( num_accepted_tokens, use_qk_l2norm_in_kernel, ) - return o, final_state \ No newline at end of file + return o, final_state