[perf][dsv3.2][async_scheduling] improve dsv3.2 performance by eliminating HD synchronization (#4805)
### What this PR does / why we need it?
This PR eliminates the implicit HD synchronization in the SFA backend, and
in _build_dummy_attn_metadata and dummy_run in mtp_proposer, significantly
improving dsv3.2 performance in low-latency scenarios.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Performance improvements are observed with E2E performance serving (P:
DP4TP8EP32 D: DP8TP4EP32) with `num_speculative_tokens=3`.
DSV3.2-W8A8-EXP:
TPOT: 41.67ms -> 23.36ms
ITL: 85.93ms -> 55.96ms
DSV3.2-W8A8 (released in December):
TPOT: 18.11ms
ITL: 56.13ms
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
Signed-off-by: linfeng-yuan <1102311262@qq.com>
This commit is contained in:
@@ -170,9 +170,9 @@ class AscendSFAMetadataBuilder:
|
||||
input_positions = common_attn_metadata.positions[:
|
||||
num_input_tokens].long(
|
||||
)
|
||||
query_start_loc = common_attn_metadata.query_start_loc
|
||||
query_lens = query_start_loc[1:] - query_start_loc[:-1]
|
||||
has_prefill = any(query_lens > self.decode_threshold)
|
||||
query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
|
||||
query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
|
||||
has_prefill = any(query_lens_cpu > self.decode_threshold)
|
||||
|
||||
if self.cos_cache is None:
|
||||
self.cos_cache = model.model.layers[
|
||||
|
||||
@@ -233,7 +233,13 @@ class MtpProposer(Proposer):
|
||||
num_tokens_across_dp,
|
||||
with_prefill,
|
||||
) = self.runner._sync_metadata_across_dp(num_tokens, with_prefill)
|
||||
|
||||
if self.use_async_scheduling:
|
||||
# there is synchronization between mtp steps when enabling aclgraph,
|
||||
# disable aclgraph when use async scheduling to avoid the
|
||||
# synchronization overhead.
|
||||
# NOTE: we need to set aclgraph_runtime_mode to None in both dummy_run
|
||||
# and _propose.
|
||||
aclgraph_runtime_mode = CUDAGraphMode.NONE
|
||||
moe_comm_type = self.runner._select_moe_comm_method(num_tokens)
|
||||
# TODO: remove this after moe_comm_type selection logic is finalized
|
||||
moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
|
||||
@@ -742,9 +748,11 @@ class MtpProposer(Proposer):
|
||||
aclgraph_runtime_mode, batch_descriptor = \
|
||||
self.runner.aclgraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
|
||||
if self.use_async_scheduling:
|
||||
# there is synchronize between mtp steps when enable aclgraph,
|
||||
# there is synchronization between mtp steps when enabling aclgraph,
|
||||
# disable aclgraph when use async scheduling to avoid the
|
||||
# synchronize overhead.
|
||||
# synchronization overhead.
|
||||
# NOTE: we need to set aclgraph_runtime_mode to None in both dummy_run
|
||||
# and _propose.
|
||||
aclgraph_runtime_mode = CUDAGraphMode.NONE
|
||||
|
||||
if self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs(
|
||||
|
||||
@@ -2923,9 +2923,10 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
|
||||
cu_num_tokens, arange = self._get_cumsum_and_arange(
|
||||
num_scheduled_tokens)
|
||||
|
||||
self.query_start_loc[1:num_reqs + 1] = torch.Tensor(cu_num_tokens)
|
||||
self.query_start_loc_cpu[1:num_reqs +
|
||||
1] = torch.Tensor(cu_num_tokens)
|
||||
self.query_start_loc = self.query_start_loc_cpu.pin_memory().to(
|
||||
self.device, non_blocking=True)
|
||||
self.query_lens = torch.from_numpy(num_scheduled_tokens)
|
||||
self.attn_mask = self.attn_mask_builder.get_splitfuse_attn_mask()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user