From 490ddf536fd751b60344a155b4d757164d357c3e Mon Sep 17 00:00:00 2001
From: linfeng-yuan <1102311262@qq.com>
Date: Wed, 10 Dec 2025 22:31:47 +0800
Subject: [PATCH] [perf][dsv3.2][async_scheduling] improve dsv3.2 performance
 by eliminating HD synchronization (#4805)

### What this PR does / why we need it?
This PR eliminates the implicit host-device (HD) synchronization in the SFA
backend as well as in `_build_dummy_attn_metadata` and `dummy_run` of the MTP
proposer, significantly improving DSv3.2 performance in low-latency scenarios.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Performance improvements were observed in E2E serving benchmarks
(P: DP4TP8EP32, D: DP8TP4EP32) with `num_speculative_tokens=3`.

DSV3.2-W8A8-EXP:
TPOT: 41.67ms -> 23.36ms
ITL: 85.93ms -> 55.96ms

DSV3.2-W8A8 (released in December):
TPOT: 18.11ms
ITL: 56.13ms

- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

Signed-off-by: linfeng-yuan <1102311262@qq.com>
---
 vllm_ascend/attention/sfa_v1.py         |  6 +++---
 vllm_ascend/spec_decode/mtp_proposer.py | 14 +++++++++++---
 vllm_ascend/worker/model_runner_v1.py   |  3 ++-
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/vllm_ascend/attention/sfa_v1.py b/vllm_ascend/attention/sfa_v1.py
index cbf5833b..8f14aa3d 100644
--- a/vllm_ascend/attention/sfa_v1.py
+++ b/vllm_ascend/attention/sfa_v1.py
@@ -170,9 +170,9 @@ class AscendSFAMetadataBuilder:
         input_positions = common_attn_metadata.positions[:
                                                          num_input_tokens].long(
                                                          )
-        query_start_loc = common_attn_metadata.query_start_loc
-        query_lens = query_start_loc[1:] - query_start_loc[:-1]
-        has_prefill = any(query_lens > self.decode_threshold)
+        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
+        query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
+        has_prefill = any(query_lens_cpu > self.decode_threshold)
 
         if self.cos_cache is None:
             self.cos_cache = model.model.layers[
diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py
index 8ff325ff..caf3f601 100644
--- a/vllm_ascend/spec_decode/mtp_proposer.py
+++ b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -233,7 +233,13 @@ class MtpProposer(Proposer):
             num_tokens_across_dp,
             with_prefill,
         ) = self.runner._sync_metadata_across_dp(num_tokens, with_prefill)
-
+        if self.use_async_scheduling:
+            # there is synchronization between mtp steps when enabling aclgraph,
+            # disable aclgraph when use async scheduling to avoid the
+            # synchronization overhead.
+            # NOTE: we need to set aclgraph_runtime_mode to None in both dummy_run
+            # and _propose.
+            aclgraph_runtime_mode = CUDAGraphMode.NONE
         moe_comm_type = self.runner._select_moe_comm_method(num_tokens)
         # TODO: remove this after moe_comm_type selection logic is finalized
         moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
@@ -742,9 +748,11 @@ class MtpProposer(Proposer):
             aclgraph_runtime_mode, batch_descriptor = \
                 self.runner.aclgraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
         if self.use_async_scheduling:
-            # there is synchronize between mtp steps when enable aclgraph,
+            # there is synchronization between mtp steps when enabling aclgraph,
             # disable aclgraph when use async scheduling to avoid the
-            # synchronize overhead.
+            # synchronization overhead.
+            # NOTE: we need to set aclgraph_runtime_mode to None in both dummy_run
+            # and _propose.
             aclgraph_runtime_mode = CUDAGraphMode.NONE
 
         if self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs(
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index bfb9a510..705634db 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -2923,9 +2923,10 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
 
         cu_num_tokens, arange = self._get_cumsum_and_arange(
             num_scheduled_tokens)
-        self.query_start_loc[1:num_reqs + 1] = torch.Tensor(cu_num_tokens)
         self.query_start_loc_cpu[1:num_reqs + 1] = torch.Tensor(cu_num_tokens)
+        self.query_start_loc = self.query_start_loc_cpu.pin_memory().to(
+            self.device, non_blocking=True)
         self.query_lens = torch.from_numpy(num_scheduled_tokens)
 
         self.attn_mask = self.attn_mask_builder.get_splitfuse_attn_mask()
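
Editor's note (illustrative, not part of the patch): the sfa_v1.py hunk works
because reducing a device tensor with Python's `any()` forces a blocking
device-to-host copy, while the same arithmetic on `query_start_loc_cpu` stays
entirely on the host. A minimal sketch, assuming a CUDA device stands in for
the Ascend NPU and `decode_threshold=1` (both illustrative assumptions):

```python
import torch

# Assumptions for illustration: CUDA stands in for the Ascend NPU, and
# decode_threshold is 1 as a typical decode-only cutoff.
device = "cuda" if torch.cuda.is_available() else "cpu"
decode_threshold = 1

query_start_loc = torch.tensor([0, 1, 2, 6], device=device)
query_start_loc_cpu = query_start_loc.cpu()

# Before: any() iterates the device tensor and calls bool() on each element;
# every bool() blocks until the device stream catches up (implicit HD sync).
query_lens = query_start_loc[1:] - query_start_loc[:-1]
has_prefill = any(query_lens > decode_threshold)

# After: identical logic on the CPU-resident copy, with no device sync at all.
query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
has_prefill = any(query_lens_cpu > decode_threshold)
```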
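The model_runner_v1.py hunk applies the complementary pattern in the other
direction: fill the persistent CPU buffer once, then push it to the device
through pinned memory with `non_blocking=True` so the H2D copy can overlap
with compute instead of stalling it. A sketch under the same assumptions
(CUDA in place of the NPU; the token counts are made up):

```python
import numpy as np
import torch

num_scheduled_tokens = np.array([1, 1, 4], dtype=np.int64)  # illustrative batch
num_reqs = len(num_scheduled_tokens)
cu_num_tokens = np.cumsum(num_scheduled_tokens)

# Write the cumulative sums into the CPU-side buffer first.
query_start_loc_cpu = torch.zeros(num_reqs + 1, dtype=torch.int64)
query_start_loc_cpu[1:num_reqs + 1] = torch.from_numpy(cu_num_tokens)

# Stage through pinned (page-locked) memory so the copy engine can run the
# H2D transfer asynchronously; a plain slice assignment into a device tensor
# would serialize against the compute stream instead.
if torch.cuda.is_available():
    query_start_loc = query_start_loc_cpu.pin_memory().to(
        "cuda", non_blocking=True)
else:
    query_start_loc = query_start_loc_cpu  # no accelerator: keep the host tensor
```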