[2/N][Refactor][Qwen3-Next] remove redundant methods and patch methods in Qwen3NextGatedDeltaNet (#3082)

### What this PR does / why we need it?
Remove redundant methods and patched methods in `Qwen3NextGatedDeltaNet`.
Involved methods: `causal_conv1d_fn`, `causal_conv1d_update_npu`, `fused_gdn_gating`,
`fused_recurrent_gated_delta_rule`, `torch_chunk_gated_delta_rule`,
`RMSNormGated`.

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
```python
from vllm import LLM, SamplingParams


def main():
    prompts = [
        "The future of AI is",
    ]

    # Create a sampling params object.
    sampling_params = SamplingParams(max_tokens=100, temperature=0.6, top_k=40, top_p=0.95)
    # Create an LLM.
    llm = LLM(
        model="Qwen/Qwen3-Next-80B-A3B-Instruct",
        tensor_parallel_size=4,
        enforce_eager=True,
        trust_remote_code=True,
        max_model_len=256,
        gpu_memory_utilization=0.7,
        block_size=64,
    )
    # Generate texts from the prompts.
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
    main()
```
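For context, `causal_conv1d_fn` (one of the methods this PR de-duplicates) implements a causal depthwise 1-D convolution over the sequence dimension. A naive PyTorch equivalent is sketched below, purely for illustration; it is not the NPU-optimized path this PR touches, and the function name `naive_causal_conv1d` is made up here:

```python
import torch
import torch.nn.functional as F


def naive_causal_conv1d(x, weight, bias=None):
    # x: [B, C, T] activations; weight: [C, W] one depthwise filter per channel
    C, W = weight.shape
    # left-pad by W-1 so output at position t only sees inputs <= t (causality)
    x = F.pad(x, (W - 1, 0))
    # groups=C makes the convolution depthwise (channel-independent)
    return F.conv1d(x, weight[:, None, :], bias=bias, groups=C)
```

Changing a future input must leave all earlier outputs unchanged, which is a quick sanity check for the causal padding.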

CI passed with newly added and existing tests.


- vLLM version: v0.10.2
- vLLM main:
5aeb925452

---------

Signed-off-by: Icey <1790571317@qq.com>
This commit is contained in:
Icey
2025-09-24 11:25:42 +08:00
committed by GitHub
parent eb205d9f35
commit e7618d9414
6 changed files with 667 additions and 980 deletions


```diff
@@ -97,16 +97,6 @@ def fused_recurrent_gated_delta_rule_fwd_kernel(
     o_k = i_k * BK + tl.arange(0, BK)
     o_v = i_v * BV + tl.arange(0, BV)
-    # p_q = q + (bos * H + i_h) * K + o_k
-    # p_k = k + (bos * H + i_h) * K + o_k
-    # p_v = v + (bos * HV + i_hv) * V + o_v
-    # if IS_BETA_HEADWISE:
-    #     p_beta = beta + (bos * HV + i_hv) * V + o_v
-    # else:
-    #     p_beta = beta + bos * HV + i_hv
-    # p_g = g + bos * HV + i_hv
-    # p_o = o + ((i_k * all + bos) * HV + i_hv) * V + o_v
     mask_k = o_k < K
     mask_v = o_v < V
     mask_h = mask_k[:, None] & mask_v[None, :]
@@ -170,13 +160,6 @@ def fused_recurrent_gated_delta_rule_fwd_kernel(
         p_ht = p_ht + i_hv * K * V + o_k[:, None] * V + o_v[None, :]
         tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), mask=mask_h)
-        # p_q += H * K
-        # p_k += H * K
-        # p_o += HV * V
-        # p_v += HV * V
-        # p_g += HV
-        # p_beta += HV * (V if IS_BETA_HEADWISE else 1)


 def fused_recurrent_gated_delta_rule_fwd(
     q: torch.Tensor,
@@ -342,13 +325,11 @@ def fused_recurrent_gated_delta_rule(
             Indices to map the input sequences to the initial/final states.
         num_accepted_tokens (Optional[torch.Tensor]):
             Number of accepted tokens for each sequence during decoding.
     Returns:
         o (torch.Tensor):
             Outputs of shape `[B, T, HV, V]`.
         final_state (torch.Tensor):
             Final state of shape `[N, HV, K, V]`.
     Examples::
         >>> import torch
         >>> import torch.nn.functional as F
@@ -400,4 +381,4 @@ def fused_recurrent_gated_delta_rule(
         num_accepted_tokens,
         use_qk_l2norm_in_kernel,
     )
-    return o, final_state
+    return o, final_state
```
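For reference, the recurrence that `fused_recurrent_gated_delta_rule` fuses can be written out naively in PyTorch. The sketch below is illustrative only (shapes follow the docstring above; the function name and the zero-initialized state are assumptions, and the real kernel additionally handles varlen batching, initial states, and QK L2-norm):

```python
import torch


def naive_recurrent_gated_delta_rule(q, k, v, g, beta):
    # q, k: [B, T, H, K]; v: [B, T, H, V]; g (log decay), beta: [B, T, H]
    B, T, H, K = q.shape
    V = v.shape[-1]
    S = q.new_zeros(B, H, K, V)  # recurrent state, one K x V matrix per head
    outputs = []
    for t in range(T):
        q_t, k_t, v_t = q[:, t], k[:, t], v[:, t]
        # decay the state by the gate
        S = S * g[:, t].exp()[..., None, None]
        # delta rule: nudge the state toward the new (k, v) association
        v_pred = torch.einsum('bhk,bhkv->bhv', k_t, S)
        delta = (v_t - v_pred) * beta[:, t][..., None]
        S = S + torch.einsum('bhk,bhv->bhkv', k_t, delta)
        # read out with the query
        outputs.append(torch.einsum('bhk,bhkv->bhv', q_t, S))
    return torch.stack(outputs, dim=1), S  # o: [B, T, H, V]; state: [B, H, K, V]
```

A naive loop like this is useful as a correctness oracle when refactoring the fused Triton path.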