Support qwen3-next full_decode_only mode (#3949)

### What this PR does / why we need it?
Support the `full_decode_only` graph mode for qwen3-next.

Benchmark (batch size = 1, max tokens = 1024):
| Branch | TPS | E2E time |
| --- | --- | --- |
| piecewise | 3.06 | 8.15 |
| full_decode_only | 7.2 | 3.47 |
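
For context, here is a minimal sketch of how a full decode-only graph mode can be requested through vLLM's compilation config. The model id is a hypothetical placeholder, and the `cudagraph_mode` key is assumed from upstream vLLM v0.11.0 rather than taken from this PR:

```python
# Sketch, not part of this PR: request the FULL_DECODE_ONLY graph mode
# via vLLM's compilation config (key assumed from vLLM v0.11.0).
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen3-Next-80B-A3B-Instruct",  # hypothetical model id
    compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
)
outputs = llm.generate(["Hello"], SamplingParams(max_tokens=1024))
print(outputs[0].outputs[0].text)
```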

- vLLM version: v0.11.0
- vLLM main: 83f478bb19

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Author: XiaoxinWang (committed by GitHub)
Date: 2025-11-05 08:46:05 +08:00
Commit: 738bf2b720 (parent: 5f08e07208)
4 changed files with 66 additions and 9 deletions


```diff
@@ -31,6 +31,7 @@ from vllm.distributed.parallel_state import get_dp_group
 from vllm.forward_context import get_forward_context
 from vllm.logger import logger
+import numpy as np
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.platform import NPUPlatform
@@ -178,6 +179,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
         num_reqs: int,
         num_tokens: int,
         max_query_len: int,
+        num_scheduled_tokens: np.ndarray,
         aclgraph_runtime_mode: Optional[CUDAGraphMode] = None,
         force_attention: bool = False,
     ) -> Optional[dict[str, Any]]:
@@ -186,7 +188,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
         if with_prefill or self.enable_shared_expert_dp:
             attn_metadata = super()._build_dummy_attn_metadata(
                 with_prefill, num_reqs, num_tokens, max_query_len,
-                aclgraph_runtime_mode, force_attention)
+                num_scheduled_tokens, aclgraph_runtime_mode, force_attention)
         else:
             common_attn_metadata = TorchairCommonAttentionMetadata(
                 num_reqs=num_reqs,
```
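
The diff threads a per-request `num_scheduled_tokens` array into the dummy attention-metadata builder. As a rough illustration of what such an array could look like during a dummy run, here is a sketch; the helper name and the token-splitting policy are assumptions for illustration, not the PR's actual logic:

```python
import numpy as np

# Sketch (helper name and splitting policy are assumed): spread
# `num_tokens` evenly across `num_reqs` requests, capped at
# `max_query_len`, giving any remainder to the last request so the
# array sums to `num_tokens`.
def make_dummy_num_scheduled_tokens(
    num_reqs: int, num_tokens: int, max_query_len: int
) -> np.ndarray:
    base = min(max_query_len, num_tokens // num_reqs)
    tokens = np.full(num_reqs, base, dtype=np.int32)
    tokens[-1] += num_tokens - int(tokens.sum())
    return tokens

# e.g. 1024 tokens over 4 requests, max_query_len=256 -> [256 256 256 256]
print(make_dummy_num_scheduled_tokens(4, 1024, 256))
```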