Support qwen3-next full_decode_only mode (#3949)

### What this PR does / why we need it?
Support the `full_decode_only` graph mode for qwen3-next.

Benchmark (batch size = 1, max tokens = 1024):
| Branch | TPS | E2E time |
| --- | --- | --- |
| piecewise | 3.06 | 8.15 |
| full_decode_only | 7.2 | 3.47 |
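
For context, here is a minimal sketch of how a full decode-only graph mode can be requested through vLLM's compilation config. The model id is a hypothetical placeholder, and the `cudagraph_mode` key is assumed from upstream vLLM v0.11.0 rather than taken from this PR:

```python
# Sketch, not part of this PR: request the FULL_DECODE_ONLY graph mode
# via vLLM's compilation config (key assumed from vLLM v0.11.0).
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen3-Next-80B-A3B-Instruct",  # hypothetical model id
    compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
)
outputs = llm.generate(["Hello"], SamplingParams(max_tokens=1024))
print(outputs[0].outputs[0].text)
```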

- vLLM version: v0.11.0
- vLLM main: 83f478bb19

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Author: XiaoxinWang (committed by GitHub)
Date: 2025-11-05 08:46:05 +08:00
Commit: 738bf2b720 (parent: 5f08e07208)
4 changed files with 66 additions and 9 deletions


```diff
@@ -31,6 +31,7 @@ from vllm.distributed.parallel_state import get_dp_group
 from vllm.forward_context import get_forward_context
 from vllm.logger import logger
+import numpy as np
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.platform import NPUPlatform
@@ -178,6 +179,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
         num_reqs: int,
         num_tokens: int,
         max_query_len: int,
+        num_scheduled_tokens: np.ndarray,
         aclgraph_runtime_mode: Optional[CUDAGraphMode] = None,
         force_attention: bool = False,
     ) -> Optional[dict[str, Any]]:
@@ -186,7 +188,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
         if with_prefill or self.enable_shared_expert_dp:
             attn_metadata = super()._build_dummy_attn_metadata(
                 with_prefill, num_reqs, num_tokens, max_query_len,
-                aclgraph_runtime_mode, force_attention)
+                num_scheduled_tokens, aclgraph_runtime_mode, force_attention)
         else:
             common_attn_metadata = TorchairCommonAttentionMetadata(
                 num_reqs=num_reqs,
```
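
The diff threads a per-request `num_scheduled_tokens` array into the dummy attention-metadata builder. As a rough illustration of what such an array could look like during a dummy run, here is a sketch; the helper name and the token-splitting policy are assumptions for illustration, not the PR's actual logic:

```python
import numpy as np

# Sketch (helper name and splitting policy are assumed): spread
# `num_tokens` evenly across `num_reqs` requests, capped at
# `max_query_len`, giving any remainder to the last request so the
# array sums to `num_tokens`.
def make_dummy_num_scheduled_tokens(
    num_reqs: int, num_tokens: int, max_query_len: int
) -> np.ndarray:
    base = min(max_query_len, num_tokens // num_reqs)
    tokens = np.full(num_reqs, base, dtype=np.int32)
    tokens[-1] += num_tokens - int(tokens.sum())
    return tokens

# e.g. 1024 tokens over 4 requests, max_query_len=256 -> [256 256 256 256]
print(make_dummy_num_scheduled_tokens(4, 1024, 256))
```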