[perf][dsv3.2][async_scheduling] improve dsv3.2 performance by eliminating HD synchronization (#4805)

### What this PR does / why we need it? This PR eliminates the implicit HD synchronization in sfa backend, and _build_dummy_attn_metadata and dummy_run in mtp_proposer, significantly improving dsv3.2 performance in low-latency scenarios. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Performance improvements are observed with E2E performance serving (P: DP4TP8EP32 D: DP8TP4EP32) with `num_speculative_tokens=3`. DSV3.2-W8A8-EXP: TPOT: 41.67ms -> 23.36ms ITL: 85.93ms -> 55.96ms DSV3.2-W8A8 (released in December): TPOT: 18.11ms ITL: 56.13ms - vLLM version: v0.12.0 - vLLM main: ad32e3e19c Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-12-10 22:31:47 +08:00
parent dd622aa6a6
commit 490ddf536f
3 changed files with 16 additions and 7 deletions
--- a/vllm_ascend/spec_decode/mtp_proposer.py
+++ b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -233,7 +233,13 @@ class MtpProposer(Proposer):
            num_tokens_across_dp,
            with_prefill,
        ) = self.runner._sync_metadata_across_dp(num_tokens, with_prefill)
-
+        if self.use_async_scheduling:
+            # there is synchronization between mtp steps when enabling aclgraph,
+            # disable aclgraph when use async scheduling to avoid the
+            # synchronization overhead.
+            # NOTE: we need to set aclgraph_runtime_mode to None in both dummy_run
+            # and _propose.
+            aclgraph_runtime_mode = CUDAGraphMode.NONE
        moe_comm_type = self.runner._select_moe_comm_method(num_tokens)
        # TODO: remove this after moe_comm_type selection logic is finalized
        moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
@@ -742,9 +748,11 @@ class MtpProposer(Proposer):
        aclgraph_runtime_mode, batch_descriptor = \
            self.runner.aclgraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
        if self.use_async_scheduling:
-            # there is synchronize between mtp steps when enable aclgraph,
+            # there is synchronization between mtp steps when enabling aclgraph,
            # disable aclgraph when use async scheduling to avoid the
-            # synchronize overhead.
+            # synchronization overhead.
+            # NOTE: we need to set aclgraph_runtime_mode to None in both dummy_run
+            # and _propose.
            aclgraph_runtime_mode = CUDAGraphMode.NONE

        if self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs(