[perf][dsv3.2][async_scheduling] improve dsv3.2 performance by eliminating HD synchronization (#4805)

### What this PR does / why we need it?
This PR eliminates the implicit host-device (HD) synchronization in the SFA backend and in `_build_dummy_attn_metadata` and `dummy_run` of `mtp_proposer`, significantly improving dsv3.2 performance in low-latency scenarios.
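For illustration, here is a minimal sketch of the kind of implicit synchronization being removed; the tensor values are made up and `"cpu"` stands in for the NPU device so the snippet runs anywhere:

```python
import torch

device = "cpu"  # stand-in for the NPU device used by vllm-ascend

lens = torch.tensor([1, 1, 8], device=device)

# On a real device tensor, each of these stalls the host until the
# device stream drains:
flag = bool((lens > 1).any())   # truthiness of a device tensor -> implicit sync
n = int(lens.sum().item())      # .item() copies device -> host -> implicit sync

# The fix is to answer such scheduling questions from CPU-side copies of
# the metadata, so the device stream is never drained.
lens_cpu = torch.tensor([1, 1, 8])   # CPU copy kept alongside the device tensor
flag = bool((lens_cpu > 1).any())    # pure host work, no device stall
```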
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Performance improvements are observed in E2E serving (P: DP4TP8EP32, D: DP8TP4EP32) with `num_speculative_tokens=3`.

DSV3.2-W8A8-EXP:
TPOT: 41.67ms -> 23.36ms
ITL: 85.93ms -> 55.96ms

DSV3.2-W8A8 (released in December):
TPOT: 18.11ms
ITL: 56.13ms
 

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

Signed-off-by: linfeng-yuan <1102311262@qq.com>
Author: linfeng-yuan
Date: 2025-12-10 22:31:47 +08:00
Committed by: GitHub
Parent: dd622aa6a6
Commit: 490ddf536f
3 changed files with 16 additions and 7 deletions


```diff
@@ -170,9 +170,9 @@ class AscendSFAMetadataBuilder:
         input_positions = common_attn_metadata.positions[:
             num_input_tokens].long(
             )
-        query_start_loc = common_attn_metadata.query_start_loc
-        query_lens = query_start_loc[1:] - query_start_loc[:-1]
-        has_prefill = any(query_lens > self.decode_threshold)
+        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
+        query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
+        has_prefill = any(query_lens_cpu > self.decode_threshold)
         if self.cos_cache is None:
             self.cos_cache = model.model.layers[
```
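To make the semantics of the rewritten check concrete, a small self-contained sketch follows; the `decode_threshold` value and the offsets are hypothetical, not taken from the real builder:

```python
import torch

# Hypothetical decode_threshold: decode steps schedule at most one token
# per request; anything longer is treated as a prefill.
decode_threshold = 1

# query_start_loc_cpu holds cumulative token offsets per request on the CPU:
# request 0 -> 1 token, request 1 -> 1 token, request 2 -> 8 tokens.
query_start_loc_cpu = torch.tensor([0, 1, 2, 10])
query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]

# Both the comparison and Python's any() stay entirely on the host, so the
# builder no longer blocks on the NPU to answer a scheduling question.
has_prefill = any(query_lens_cpu > decode_threshold)
print(has_prefill)  # True, because of the 8-token request
```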


```diff
@@ -233,7 +233,13 @@ class MtpProposer(Proposer):
             num_tokens_across_dp,
             with_prefill,
         ) = self.runner._sync_metadata_across_dp(num_tokens, with_prefill)
+        if self.use_async_scheduling:
+            # there is synchronization between mtp steps when enabling aclgraph,
+            # disable aclgraph when use async scheduling to avoid the
+            # synchronization overhead.
+            # NOTE: we need to set aclgraph_runtime_mode to None in both dummy_run
+            # and _propose.
+            aclgraph_runtime_mode = CUDAGraphMode.NONE
         moe_comm_type = self.runner._select_moe_comm_method(num_tokens)
         # TODO: remove this after moe_comm_type selection logic is finalized
         moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
@@ -742,9 +748,11 @@ class MtpProposer(Proposer):
         aclgraph_runtime_mode, batch_descriptor = \
             self.runner.aclgraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
         if self.use_async_scheduling:
-            # there is synchronize between mtp steps when enable aclgraph,
-            # disable aclgraph when use async scheduling to avoid the
-            # synchronize overhead.
+            # there is synchronization between mtp steps when enabling aclgraph,
+            # disable aclgraph when use async scheduling to avoid the
+            # synchronization overhead.
+            # NOTE: we need to set aclgraph_runtime_mode to None in both dummy_run
+            # and _propose.
             aclgraph_runtime_mode = CUDAGraphMode.NONE
         if self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs(
```
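The override added in both `dummy_run` and `_propose` follows the same shape; a minimal sketch with stand-in types (the `CUDAGraphMode` Enum and `choose_runtime_mode` helper here are local illustrations, not the vLLM imports):

```python
from enum import Enum

class CUDAGraphMode(Enum):
    NONE = 0
    FULL = 1

def choose_runtime_mode(dispatched_mode: CUDAGraphMode,
                        use_async_scheduling: bool) -> CUDAGraphMode:
    # Replaying a captured graph between MTP steps synchronizes the stream;
    # under async scheduling that stall defeats the overlap the scheduler
    # is trying to achieve, so fall back to eager execution.
    if use_async_scheduling:
        return CUDAGraphMode.NONE
    return dispatched_mode

# e.g. the dispatcher picked FULL, but async scheduling is on -> run eager.
assert choose_runtime_mode(CUDAGraphMode.FULL, True) is CUDAGraphMode.NONE
```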


```diff
@@ -2923,9 +2923,10 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
         cu_num_tokens, arange = self._get_cumsum_and_arange(
             num_scheduled_tokens)
-        self.query_start_loc[1:num_reqs + 1] = torch.Tensor(cu_num_tokens)
         self.query_start_loc_cpu[1:num_reqs +
                                  1] = torch.Tensor(cu_num_tokens)
+        self.query_start_loc = self.query_start_loc_cpu.pin_memory().to(
+            self.device, non_blocking=True)
         self.query_lens = torch.from_numpy(num_scheduled_tokens)
         self.attn_mask = self.attn_mask_builder.get_splitfuse_attn_mask()
```
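The removed indexed write into the device-resident `query_start_loc` implied a synchronizing host-to-device copy on every step; the new pattern does all writes on the CPU buffer and issues one asynchronous upload. A minimal sketch of that pattern, using CUDA as a stand-in for the NPU and hypothetical buffer sizes:

```python
import torch

def upload_query_start_loc(query_start_loc_cpu: torch.Tensor,
                           device: torch.device) -> torch.Tensor:
    # All indexed writes happen on the CPU tensor first (cheap, no device
    # traffic); then a single pinned, non-blocking copy moves the buffer.
    # Because the source is pinned, .to(..., non_blocking=True) returns
    # immediately and the transfer overlaps with subsequent host work.
    return query_start_loc_cpu.pin_memory().to(device, non_blocking=True)

if torch.cuda.is_available():
    cpu_buf = torch.zeros(1025, dtype=torch.int32)        # hypothetical size
    cpu_buf[1:4] = torch.tensor([1, 2, 10], dtype=torch.int32)
    dev_buf = upload_query_start_loc(cpu_buf, torch.device("cuda"))
```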