Merge pull request #40 from ldh2020/v0.11.0dev

[Kernel] Optimize the performance of Qwen3-Next
This commit is contained in:
Xinyu Dong
2025-12-22 21:50:27 +08:00
committed by GitHub
3 changed files with 32 additions and 15 deletions

View File

@@ -44,6 +44,7 @@ class FusedRecurrentFunction(torch.autograd.Function):
h0_indices=ssm_state_indices,
num_accepted_tokens=num_accepted_tokens,
use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel,
is_h0_transposed=True
)
return o, final_state
@@ -150,4 +151,4 @@ def fused_recurrent_gated_delta_rule(
num_accepted_tokens,
use_qk_l2norm_in_kernel,
)
return o, final_state
return o, final_state