From 05a700d370a21e07d4c0fb7978debb3a490751a3 Mon Sep 17 00:00:00 2001
From: sdmyzlp <117554856+sdmyzlp@users.noreply.github.com>
Date: Fri, 19 Sep 2025 14:05:36 +0800
Subject: [PATCH] [Bugfix] Fix async copy bug under single expert scenario
 (#3005)

Add a missing barrier for the case where no implicit synchronization via
`repeat_interleave` is available. Otherwise, the `non_blocking=True` copy of
`output_splits` and `input_splits` from the NPU may fail to complete before a
later `async_all_to_all` uses them.

### What this PR does / why we need it?

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main: https://github.com/vllm-project/vllm/commit/ef7eefe17a7dc212ddb8a8aabd7760218a10e25e

Signed-off-by: sdmyzlp
---
 vllm_ascend/ops/moe/token_dispatcher.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vllm_ascend/ops/moe/token_dispatcher.py b/vllm_ascend/ops/moe/token_dispatcher.py
index c1a16d0..c57cc1c 100644
--- a/vllm_ascend/ops/moe/token_dispatcher.py
+++ b/vllm_ascend/ops/moe/token_dispatcher.py
@@ -639,6 +639,10 @@ class TokenDispatcherWithAll2AllV(MoETokenDispatcher):
             self.global_input_tokens_local_experts_indices = torch.repeat_interleave(
                 self.expert_ids_per_ep_rank,
                 self.num_global_tokens_per_local_expert.ravel())
+        else:
+            # TODO: This full synchronization can be a performance bottleneck.
+            # A more granular sync (e.g., blocking D2H copies) should be investigated.
+            torch.npu.synchronize()
 
         return num_tokens_per_local_expert