support qwen3-next full_decode_only mode. (#3949)
### What this PR does / why we need it?
support qwen3-next full_decode_only mode.
Benchmark with bs=1, max_token=1024:

| branch | tps | e2e time |
| --- | --- | --- |
| piecewise | 3.06 | 8.15 |
| full_decode_only | 7.2 | 3.47 |

full_decode_only gives roughly a 2.35x improvement over piecewise in both tps and e2e time.
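The speedup above can be checked directly from the table values (a minimal sketch; the variable names are illustrative, not from the PR):

```python
# Derive the full_decode_only vs. piecewise speedup from the benchmark
# table (bs=1, max_token=1024). Units are as reported in the table.
piecewise = {"tps": 3.06, "e2e": 8.15}
full_decode_only = {"tps": 7.2, "e2e": 3.47}

# Higher tps is better; lower e2e time is better.
tps_speedup = full_decode_only["tps"] / piecewise["tps"]
e2e_speedup = piecewise["e2e"] / full_decode_only["e2e"]

print(f"tps speedup: {tps_speedup:.2f}x, e2e speedup: {e2e_speedup:.2f}x")
```

Both ratios come out to about 2.35x, so the throughput and latency gains are consistent with each other.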
- vLLM version: v0.11.0
- vLLM main: 83f478bb19
Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
```diff
@@ -192,8 +192,10 @@ class ACLGraphWrapper:

 def update_attn_params(update_stream, forward_context, runtime_shape):
     graph_params = get_graph_params()
     # FIXME: Behold! We are using a temporary hack here to update the args
     # for each layer's attention op in the graph.
     # For Qwen3-next, since the kv_cache_config has already categorized
     # linear_attn and self_attn, the attn_metadata is first arranged with
     # self_attn followed by linear_attn. Therefore, using zip directly
     # filters out the update operations for linear_attn.
     with torch.npu.stream(update_stream):
         for key, param, handle, event in zip(
             forward_context.attn_metadata,

@@ -289,9 +291,9 @@ def update_mla_attn_params(update_stream, forward_context, runtime_shape,

 def update_attn_dcp_pcp_params(update_stream, forward_context, runtime_shape):
     graph_params = get_graph_params()
     # FIXME: Behold! We are using a temporary hack here to update the args
     # for each layer's attention op in the graph.
     graph_params = get_graph_params()
     with torch.npu.stream(update_stream):
         for key, param, handle, event in zip(
             forward_context.attn_metadata,
```
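The zip-based filtering that the FIXME comment describes relies on plain Python `zip` truncation: `attn_metadata` lists self_attn layers first, then linear_attn layers, while the graph params only cover the self_attn layers, so `zip` stops at the shorter sequence. A minimal sketch (the layer names and per-layer tuples here are hypothetical, not the actual vllm-ascend structures):

```python
# Illustration of the zip truncation the comment relies on:
# metadata is ordered self_attn first, then linear_attn, and the
# params list only has entries for the self_attn layers, so zip
# silently skips the trailing linear_attn layers.
attn_metadata = ["self_attn.0", "self_attn.1", "linear_attn.0", "linear_attn.1"]
graph_params = [("param0",), ("param1",)]  # hypothetical per-layer entries

updated = [key for key, _param in zip(attn_metadata, graph_params)]
print(updated)  # only the self_attn layers receive updates
```

Because `zip` stops at the shortest iterable, no explicit filter on the layer type is needed; the ordering guaranteed by `kv_cache_config` does the filtering implicitly.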