From 490ddf536fd751b60344a155b4d757164d357c3e Mon Sep 17 00:00:00 2001
From: linfeng-yuan <1102311262@qq.com>
Date: Wed, 10 Dec 2025 22:31:47 +0800
Subject: [PATCH] [perf][dsv3.2][async_scheduling] improve dsv3.2 performance
 by eliminating HD synchronization (#4805)

### What this PR does / why we need it?
This PR eliminates the implicit host-device (HD) synchronization in the SFA
backend as well as in `_build_dummy_attn_metadata` and `dummy_run` of the MTP
proposer, significantly improving DSv3.2 performance in low-latency scenarios.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Performance improvements were observed in E2E serving benchmarks
(P: DP4TP8EP32, D: DP8TP4EP32) with `num_speculative_tokens=3`.

DSV3.2-W8A8-EXP:
TPOT: 41.67ms -> 23.36ms
ITL: 85.93ms -> 55.96ms

DSV3.2-W8A8 (released in December):
TPOT: 18.11ms
ITL: 56.13ms

- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

Signed-off-by: linfeng-yuan <1102311262@qq.com>
---
 vllm_ascend/attention/sfa_v1.py         |  6 +++---
 vllm_ascend/spec_decode/mtp_proposer.py | 14 +++++++++++---
 vllm_ascend/worker/model_runner_v1.py   |  3 ++-
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/vllm_ascend/attention/sfa_v1.py b/vllm_ascend/attention/sfa_v1.py
index cbf5833b..8f14aa3d 100644
--- a/vllm_ascend/attention/sfa_v1.py
+++ b/vllm_ascend/attention/sfa_v1.py
@@ -170,9 +170,9 @@ class AscendSFAMetadataBuilder:
         input_positions = common_attn_metadata.positions[:
                                                          num_input_tokens].long(
                                                          )
-        query_start_loc = common_attn_metadata.query_start_loc
-        query_lens = query_start_loc[1:] - query_start_loc[:-1]
-        has_prefill = any(query_lens > self.decode_threshold)
+        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
+        query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
+        has_prefill = any(query_lens_cpu > self.decode_threshold)
 
         if self.cos_cache is None:
             self.cos_cache = model.model.layers[
diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py
index 8ff325ff..caf3f601 100644
--- a/vllm_ascend/spec_decode/mtp_proposer.py
+++ b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -233,7 +233,13 @@ class MtpProposer(Proposer):
             num_tokens_across_dp,
             with_prefill,
         ) = self.runner._sync_metadata_across_dp(num_tokens, with_prefill)
-
+        if self.use_async_scheduling:
+            # there is synchronization between mtp steps when enabling aclgraph,
+            # disable aclgraph when use async scheduling to avoid the
+            # synchronization overhead.
+            # NOTE: we need to set aclgraph_runtime_mode to None in both dummy_run
+            # and _propose.
+            aclgraph_runtime_mode = CUDAGraphMode.NONE
         moe_comm_type = self.runner._select_moe_comm_method(num_tokens)
         # TODO: remove this after moe_comm_type selection logic is finalized
         moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
@@ -742,9 +748,11 @@ class MtpProposer(Proposer):
             aclgraph_runtime_mode, batch_descriptor = \
                 self.runner.aclgraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
         if self.use_async_scheduling:
-            # there is synchronize between mtp steps when enable aclgraph,
+            # there is synchronization between mtp steps when enabling aclgraph,
             # disable aclgraph when use async scheduling to avoid the
-            # synchronize overhead.
+            # synchronization overhead.
+            # NOTE: we need to set aclgraph_runtime_mode to None in both dummy_run
+            # and _propose.
             aclgraph_runtime_mode = CUDAGraphMode.NONE
 
         if self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs(
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index bfb9a510..705634db 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -2923,9 +2923,10 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
 
         cu_num_tokens, arange = self._get_cumsum_and_arange(
             num_scheduled_tokens)
-        self.query_start_loc[1:num_reqs + 1] = torch.Tensor(cu_num_tokens)
         self.query_start_loc_cpu[1:num_reqs + 1] = torch.Tensor(cu_num_tokens)
+        self.query_start_loc = self.query_start_loc_cpu.pin_memory().to(
+            self.device, non_blocking=True)
         self.query_lens = torch.from_numpy(num_scheduled_tokens)
 
         self.attn_mask = self.attn_mask_builder.get_splitfuse_attn_mask()
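
Editor's note (illustrative, not part of the patch): the sfa_v1.py hunk works
because reducing a device tensor with Python's `any()` forces a blocking
device-to-host copy, while the same arithmetic on `query_start_loc_cpu` stays
entirely on the host. A minimal sketch, assuming a CUDA device stands in for
the Ascend NPU and `decode_threshold=1` (both illustrative assumptions):

```python
import torch

# Assumptions for illustration: CUDA stands in for the Ascend NPU, and
# decode_threshold is 1 as a typical decode-only cutoff.
device = "cuda" if torch.cuda.is_available() else "cpu"
decode_threshold = 1

query_start_loc = torch.tensor([0, 1, 2, 6], device=device)
query_start_loc_cpu = query_start_loc.cpu()

# Before: any() iterates the device tensor and calls bool() on each element;
# every bool() blocks until the device stream catches up (implicit HD sync).
query_lens = query_start_loc[1:] - query_start_loc[:-1]
has_prefill = any(query_lens > decode_threshold)

# After: identical logic on the CPU-resident copy, with no device sync at all.
query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
has_prefill = any(query_lens_cpu > decode_threshold)
```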
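The model_runner_v1.py hunk applies the complementary pattern in the other
direction: fill the persistent CPU buffer once, then push it to the device
through pinned memory with `non_blocking=True` so the H2D copy can overlap
with compute instead of stalling it. A sketch under the same assumptions
(CUDA in place of the NPU; the token counts are made up):

```python
import numpy as np
import torch

num_scheduled_tokens = np.array([1, 1, 4], dtype=np.int64)  # illustrative batch
num_reqs = len(num_scheduled_tokens)
cu_num_tokens = np.cumsum(num_scheduled_tokens)

# Write the cumulative sums into the CPU-side buffer first.
query_start_loc_cpu = torch.zeros(num_reqs + 1, dtype=torch.int64)
query_start_loc_cpu[1:num_reqs + 1] = torch.from_numpy(cu_num_tokens)

# Stage through pinned (page-locked) memory so the copy engine can run the
# H2D transfer asynchronously; a plain slice assignment into a device tensor
# would serialize against the compute stream instead.
if torch.cuda.is_available():
    query_start_loc = query_start_loc_cpu.pin_memory().to(
        "cuda", non_blocking=True)
else:
    query_start_loc = query_start_loc_cpu  # no accelerator: keep the host tensor
```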