From 39f8af9d96914e2659239046796b13a31215a2e9 Mon Sep 17 00:00:00 2001
From: whx <56632993+whx-sjtu@users.noreply.github.com>
Date: Thu, 29 Jan 2026 08:47:20 +0800
Subject: [PATCH] [Main2Main][BugFix] Add shared_experts check for AscendSharedFusedMoE (#6335)

### What this PR does / why we need it?
PR https://github.com/vllm-project/vllm/pull/32082 in vLLM makes Qwen3-Moe models also go into `SharedFusedMoE`, while the current implementation of our `AscendSharedFusedMoE` assumes shared experts always exist. This PR adds checks to `multistream_overlap_shared_expert` and `multistream_overlap_gate` so that these features are only enabled when shared experts exist.
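As a minimal standalone sketch of the guard pattern this PR applies (`DummyConfig` and `DummyMoELayer` below are simplified stand-ins, not the real `vllm_ascend` classes; see the actual diff further down):

```python
from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class DummyConfig:
    # Stand-in for the fields read from get_ascend_config().
    multistream_overlap_shared_expert: bool = True
    multistream_overlap_gate: bool = True


class DummyMoELayer:
    def __init__(self, config: DummyConfig,
                 shared_experts: Optional[Any]) -> None:
        self._shared_experts = shared_experts
        # Only enable the overlap features when shared experts actually
        # exist; models such as Qwen3-Moe may reach this layer without them.
        self.multistream_overlap_shared_expert = (
            config.multistream_overlap_shared_expert
            and self._shared_experts is not None)
        self.multistream_overlap_gate = (
            config.multistream_overlap_gate
            and self._shared_experts is not None)

    def _forward_shared_experts(self, hidden_states: Any) -> Optional[Any]:
        # Early-out mirrors the new check in the diff below.
        if self._shared_experts is None:
            return None
        return self._shared_experts(hidden_states)


# A layer built without shared experts now degrades gracefully.
layer = DummyMoELayer(DummyConfig(), shared_experts=None)
assert layer.multistream_overlap_shared_expert is False
assert layer._forward_shared_experts(object()) is None
```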
### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
All CI passed.

- vLLM version: v0.14.1
- vLLM main: https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd

Signed-off-by: whx-sjtu <2952154980@qq.com>
---
 vllm_ascend/ops/fused_moe/fused_moe.py | 33 ++++++++++++++++----------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py
index 12d6a63e..50618ae8 100644
--- a/vllm_ascend/ops/fused_moe/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe/fused_moe.py
@@ -415,8 +415,12 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
         self.use_overlapped = use_overlapped
         self.shared_expert_stream = None
         ascend_config = get_ascend_config()
-        self.multistream_overlap_shared_expert = ascend_config.multistream_overlap_shared_expert
-        self.multistream_overlap_gate = ascend_config.multistream_overlap_gate
+        self.multistream_overlap_shared_expert = \
+            ascend_config.multistream_overlap_shared_expert and \
+            self._shared_experts is not None
+        self.multistream_overlap_gate = \
+            ascend_config.multistream_overlap_gate and \
+            self._shared_experts is not None
         if enable_sp():
             logger.info_once(
                 "Sequence parallelism is enabled, shared experts are replicated for best performance."
@@ -424,19 +428,20 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
 
         self._gate = gate
 
-        # Wrap the quant_method's process_weights_after_loading to validate that
-        # splitting shared expert computation (gate_up projection + activation,
-        # then down projection) yields identical results to integrated
-        # computation after weight loading.
-        original_process_weights = self.quant_method.process_weights_after_loading
+        if self.multistream_overlap_shared_expert:
+            # Wrap the quant_method's process_weights_after_loading to validate that
+            # splitting shared expert computation (gate_up projection + activation,
+            # then down projection) yields identical results to integrated
+            # computation after weight loading.
+            original_process_weights = self.quant_method.process_weights_after_loading
 
-        @wraps(original_process_weights)
-        def wrapped_process_weights(*args, **kwargs):
-            result = original_process_weights(*args, **kwargs)
-            self._validate_shared_expert_consistency()
-            return result
+            @wraps(original_process_weights)
+            def wrapped_process_weights(*args, **kwargs):
+                result = original_process_weights(*args, **kwargs)
+                self._validate_shared_expert_consistency()
+                return result
 
-        self.quant_method.process_weights_after_loading = wrapped_process_weights  # type: ignore
+            self.quant_method.process_weights_after_loading = wrapped_process_weights  # type: ignore
 
     def _shared_experts_part1(self, hidden_states: torch.Tensor):
         shared_gate_up, _ = self._shared_experts.gate_up_proj(
@@ -516,6 +521,8 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
 
     def _forward_shared_experts(self, hidden_states: torch.Tensor,
                                 fused_moe_evts: FusedMoEEvents):
+        if self._shared_experts is None:
+            return None
 
         def maybe_wait_event(evt: torch.npu.Event | None):
             if evt is not None:
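For reviewers unfamiliar with the wrapping in the second hunk, this is a minimal standalone sketch of the same instance-level post-load hook pattern (`DummyQuantMethod` is a hypothetical stand-in, not the real quant method class):

```python
from functools import wraps


class DummyQuantMethod:
    # Illustrative stand-in for the real quant_method object.
    def process_weights_after_loading(self, layer: object) -> None:
        print("weights processed")


quant_method = DummyQuantMethod()
original = quant_method.process_weights_after_loading  # bound method


@wraps(original)
def wrapped(*args, **kwargs):
    result = original(*args, **kwargs)
    # Post-hook runs exactly once, right after weight processing,
    # mirroring _validate_shared_expert_consistency() in the diff above.
    print("consistency check")
    return result


# Rebinding on the instance shadows the class attribute, so only this
# object gets the hook; this matches the monkey-patch in __init__, which
# the PR now performs only when multistream_overlap_shared_expert is set.
quant_method.process_weights_after_loading = wrapped
quant_method.process_weights_after_loading(object())
```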