From 05a700d370a21e07d4c0fb7978debb3a490751a3 Mon Sep 17 00:00:00 2001
From: sdmyzlp <117554856+sdmyzlp@users.noreply.github.com>
Date: Fri, 19 Sep 2025 14:05:36 +0800
Subject: [PATCH] [Bugfix] Fix async copy bug under single expert scenario
 (#3005)

Add a missing barrier for the case where no implicit synchronization via
`repeat_interleave` is available. Otherwise, the `non_blocking=True` copy of
`output_splits` and `input_splits` from the NPU may fail to complete before a
later `async_all_to_all` uses them.

### What this PR does / why we need it?

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main: https://github.com/vllm-project/vllm/commit/ef7eefe17a7dc212ddb8a8aabd7760218a10e25e

Signed-off-by: sdmyzlp
---
 vllm_ascend/ops/moe/token_dispatcher.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vllm_ascend/ops/moe/token_dispatcher.py b/vllm_ascend/ops/moe/token_dispatcher.py
index c1a16d0..c57cc1c 100644
--- a/vllm_ascend/ops/moe/token_dispatcher.py
+++ b/vllm_ascend/ops/moe/token_dispatcher.py
@@ -639,6 +639,10 @@ class TokenDispatcherWithAll2AllV(MoETokenDispatcher):
             self.global_input_tokens_local_experts_indices = torch.repeat_interleave(
                 self.expert_ids_per_ep_rank,
                 self.num_global_tokens_per_local_expert.ravel())
+        else:
+            # TODO: This full synchronization can be a performance bottleneck.
+            # A more granular sync (e.g., blocking D2H copies) should be investigated.
+            torch.npu.synchronize()
 
         return num_tokens_per_local_expert