[Lint]Style: Convert vllm-ascend/ to ruff format(Batch #11) (#6176)

### What this PR does / why we need it?

**Scope of Changes**:

| File Path |
| :--- |
| `vllm_ascend/ops/fused_moe/comm_utils.py` |
| `vllm_ascend/ops/fused_moe/experts_selector.py` |
| `vllm_ascend/ops/fused_moe/fused_moe.py` |
| `vllm_ascend/ops/fused_moe/moe_comm_method.py` |
| `vllm_ascend/ops/fused_moe/moe_mlp.py` |
| `vllm_ascend/ops/fused_moe/prepare_finalize.py` |
| `vllm_ascend/ops/fused_moe/token_dispatcher.py` |

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.14.0
- vLLM main: d68209402d

Signed-off-by: MrZ20 <2609716663@qq.com>
Signed-off-by: SILONG ZENG <2609716663@qq.com>
2026-02-06 15:28:49 +08:00
parent 4fb3d5e1b2
commit 65b7f716e6
8 changed files with 694 additions and 784 deletions
--- a/vllm_ascend/ops/fused_moe/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe/fused_moe.py
@@ -14,40 +14,37 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+from collections.abc import Callable
 from dataclasses import dataclass, field
 from functools import wraps
-from typing import Callable, Optional

 import torch
 import torch.nn.functional as F
 from vllm.config import get_current_vllm_config
-from vllm.distributed import (get_dp_group, get_ep_group, get_tp_group,
-                              tensor_model_parallel_all_reduce)
+from vllm.distributed import get_dp_group, get_ep_group, get_tp_group, tensor_model_parallel_all_reduce
 from vllm.forward_context import get_forward_context
 from vllm.logger import logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
-from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map)
-from vllm.model_executor.layers.fused_moe.shared_fused_moe import \
-    SharedFusedMoE
+from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map
+from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE

 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.distributed.parallel_state import get_mc2_group
 from vllm_ascend.eplb.core.eplb_utils import init_eplb_config
-from vllm_ascend.flash_common3_context import (get_flash_common3_context,
-                                               set_flash_common3_context)
-from vllm_ascend.ops.fused_moe.experts_selector import (select_experts,
-                                                        zero_experts_compute)
-from vllm_ascend.ops.fused_moe.moe_comm_method import (AllGatherCommImpl,
-                                                       FusedExpertsResult,
-                                                       setup_moe_comm_method)
+from vllm_ascend.flash_common3_context import get_flash_common3_context, set_flash_common3_context
+from vllm_ascend.ops.fused_moe.experts_selector import select_experts, zero_experts_compute
+from vllm_ascend.ops.fused_moe.moe_comm_method import AllGatherCommImpl, FusedExpertsResult, setup_moe_comm_method
 from vllm_ascend.ops.fused_moe.prepare_finalize import QuantType
-from vllm_ascend.utils import (AscendDeviceType, enable_sp,
-                               get_ascend_device_type, maybe_trans_nz,
-                               npu_stream_switch, shared_expert_dp_enabled,
-                               shared_experts_calculation_stream,
-                               vllm_version_is)
+from vllm_ascend.utils import (
+    enable_sp,
+    maybe_trans_nz,
+    npu_stream_switch,
+    shared_expert_dp_enabled,
+    shared_experts_calculation_stream,
+    vllm_version_is,
+)
+

@dataclass
 class FusedMoEResult:
@@ -64,46 +61,43 @@ class FusedMoEEvents:


 class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
-
    def __init__(self, moe: FusedMoEConfig = None):
-
        super().__init__(moe=moe)
        self.dynamic_eplb = get_ascend_config().eplb_config.dynamic_eplb

    def process_weights_after_loading(self, layer):
-        super(UnquantizedFusedMoEMethod,
-              self).process_weights_after_loading(layer)
+        super(UnquantizedFusedMoEMethod, self).process_weights_after_loading(layer)

-        w13_data = self._maybe_pad_weight(layer.w13_weight.data).transpose(
-            1, 2).contiguous()
+        w13_data = self._maybe_pad_weight(layer.w13_weight.data).transpose(1, 2).contiguous()
        layer.w13_weight = torch.nn.Parameter(w13_data, requires_grad=False)

-        w2_data = self._maybe_pad_weight(layer.w2_weight.data).transpose(
-            1, 2).contiguous()
+        w2_data = self._maybe_pad_weight(layer.w2_weight.data).transpose(1, 2).contiguous()
        layer.w2_weight = torch.nn.Parameter(w2_data, requires_grad=False)

        layer.w13_weight.data = maybe_trans_nz(layer.w13_weight.data)
        layer.w2_weight.data = maybe_trans_nz(layer.w2_weight.data)

-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              use_grouped_topk: bool,
-              top_k: int,
-              router_logits: torch.Tensor,
-              renormalize: bool,
-              topk_group: Optional[int] = None,
-              num_expert_group: Optional[int] = None,
-              custom_routing_function: Optional[Callable] = None,
-              scoring_func: str = "softmax",
-              routed_scaling_factor: float = 1.0,
-              e_score_correction_bias: Optional[torch.Tensor] = None,
-              global_num_experts: int = -1,
-              expert_map: Optional[torch.Tensor] = None,
-              apply_router_weight_on_input: bool = False,
-              enable_force_load_balance: bool = False,
-              log2phy: torch.Tensor = None,
-              **kwargs) -> torch.Tensor:
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
+        custom_routing_function: Callable | None = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        e_score_correction_bias: torch.Tensor | None = None,
+        global_num_experts: int = -1,
+        expert_map: torch.Tensor | None = None,
+        apply_router_weight_on_input: bool = False,
+        enable_force_load_balance: bool = False,
+        log2phy: torch.Tensor = None,
+        **kwargs,
+    ) -> torch.Tensor:
        zero_expert_num = getattr(layer, "zero_expert_num", 0)
        zero_expert_type = getattr(layer, "zero_expert_type", None)
        topk_weights, topk_ids = select_experts(
@@ -118,7 +112,8 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
            scoring_func=scoring_func,
            routed_scaling_factor=routed_scaling_factor,
            e_score_correction_bias=e_score_correction_bias,
-            global_num_experts=global_num_experts)
+            global_num_experts=global_num_experts,
+        )

        if zero_expert_num > 0 and zero_expert_type is not None:
            topk_ids, topk_weights, zero_expert_result = zero_experts_compute(
@@ -134,11 +129,8 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
        # to avoid accumulating too much tokens on a single rank.
        # currently it is only activated when doing profile runs.
        if enable_force_load_balance:
-            random_matrix = torch.rand(topk_ids.size(0),
-                                       global_num_experts,
-                                       device=topk_ids.device)
-            topk_ids = torch.argsort(
-                random_matrix, dim=1)[:, :topk_ids.size(1)].to(topk_ids.dtype)
+            random_matrix = torch.rand(topk_ids.size(0), global_num_experts, device=topk_ids.device)
+            topk_ids = torch.argsort(random_matrix, dim=1)[:, : topk_ids.size(1)].to(topk_ids.dtype)

        moe_comm_method = get_forward_context().moe_comm_method
        final_hidden_states = moe_comm_method.fused_experts(
@@ -151,7 +143,8 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
            apply_router_weight_on_input=apply_router_weight_on_input,
            dynamic_eplb=self.dynamic_eplb,
            log2phy=log2phy,
-            mc2_mask=kwargs.get("mc2_mask", None))
+            mc2_mask=kwargs.get("mc2_mask"),
+        )
        if zero_expert_num > 0 and zero_expert_type is not None:
            final_hidden_states += zero_expert_result
        return final_hidden_states
@@ -159,7 +152,7 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):

 class AscendFusedMoE(FusedMoE):
    moe_counter = -1
-    gate_stream: Optional[torch.npu.Stream] = None
+    gate_stream: torch.npu.Stream | None = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
@@ -174,11 +167,9 @@ class AscendFusedMoE(FusedMoE):
        self.log2phy = None

        if self.quant_config is None:
-            self.quant_method = AscendUnquantizedFusedMoEMethod(
-                self.moe_config)
+            self.quant_method = AscendUnquantizedFusedMoEMethod(self.moe_config)
        else:
-            self.quant_method = self.quant_config.get_quant_method(
-                self, self.layer_name)
+            self.quant_method = self.quant_config.get_quant_method(self, self.layer_name)

        assert self.quant_method is not None

@@ -195,28 +186,32 @@ class AscendFusedMoE(FusedMoE):
        if self.custom_routing_function is None and self.e_score_correction_bias is not None:
            vllm_config = get_current_vllm_config()
            self.e_score_correction_bias.data = self.e_score_correction_bias.data.to(
-                dtype=vllm_config.model_config.dtype)
+                dtype=vllm_config.model_config.dtype
+            )

        # init moe
        eplb_config = ascend_config.eplb_config
        self.global_expert_map, self._expert_map, self.log2phy, self.global_redundant_expert_num = init_eplb_config(
-            eplb_config, self.moe_instance_id, self.moe_config)
+            eplb_config, self.moe_instance_id, self.moe_config
+        )
        self.global_num_experts = num_experts + self.global_redundant_expert_num
-        self.dynamic_eplb = eplb_config.dynamic_eplb and (self.log2phy
-                                                          is not None)
-        self.local_num_experts = (torch.sum(
-            self._expert_map != -1).item() if self._expert_map is not None else
-                                  self.global_num_experts)
+        self.dynamic_eplb = eplb_config.dynamic_eplb and (self.log2phy is not None)
+        self.local_num_experts = (
+            torch.sum(self._expert_map != -1).item() if self._expert_map is not None else self.global_num_experts
+        )
        if self._expert_map is not None:
            logger.info_once(
                "[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
                " number of experts: %s/%s. Experts local to global index map:"
-                " %s.", self.ep_rank, self.ep_size, self.local_num_experts,
+                " %s.",
+                self.ep_rank,
+                self.ep_size,
+                self.local_num_experts,
                self.global_num_experts,
-                get_compressed_expert_map(self._expert_map))
+                get_compressed_expert_map(self._expert_map),
+            )
        if self.dynamic_eplb:
-            self.moe_load = torch.zeros(self.local_num_experts,
-                                        dtype=torch.int64).npu()
+            self.moe_load = torch.zeros(self.local_num_experts, dtype=torch.int64).npu()

        self.moe_config.num_experts = self.global_num_experts
        self.moe_config.num_local_experts = self.local_num_experts
@@ -225,14 +220,12 @@ class AscendFusedMoE(FusedMoE):
        moe_quant_params = {
            "num_experts": self.local_num_experts,
            "hidden_size": self.hidden_size,
-            "intermediate_size_per_partition":
-            self.intermediate_size_per_partition,
+            "intermediate_size_per_partition": self.intermediate_size_per_partition,
            "params_dtype": self.params_dtype,
            "weight_loader": self.weight_loader,
        }
        # need full intermediate size pre-sharding for WNA16 act order
-        if (self.quant_method.__class__.__name__
-                in ("GPTQMarlinMoEMethod", "CompressedTensorsWNA16MoEMethod")):
+        if self.quant_method.__class__.__name__ in ("GPTQMarlinMoEMethod", "CompressedTensorsWNA16MoEMethod"):
            moe_quant_params["intermediate_size_full"] = intermediate_size
        self.quant_method.create_weights(layer=self, **moe_quant_params)

@@ -243,15 +236,14 @@ class AscendFusedMoE(FusedMoE):

    def _get_quant_type(self) -> QuantType:
        quant_method = self.quant_method
-        if not hasattr(quant_method,
-                       "quant_method") or quant_method.quant_method is None:
+        if not hasattr(quant_method, "quant_method") or quant_method.quant_method is None:
            return QuantType.NONE

        method = quant_method.quant_method

        if hasattr(method, "quant_type"):
-            from vllm_ascend.quantization.methods.base import \
-                QuantType as SchemeQuantType
+            from vllm_ascend.quantization.methods.base import QuantType as SchemeQuantType
+
            scheme_quant_type = method.quant_type
            if scheme_quant_type == SchemeQuantType.W8A8:
                return QuantType.W8A8
@@ -270,22 +262,18 @@ class AscendFusedMoE(FusedMoE):
        if self.moe_load is not None:
            self.moe_load.zero_()

-    def maybe_all_reduce_tensor_model_parallel(
-            self, final_hidden_states: torch.Tensor):
+    def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tensor):
        """NOTE(Yizhou): This is to override the parent class method. In `mc2commimpl`,
        and `alltoallcommimpl`, we do not need to all-reduce the final outputs since
        the outputs are already aggregated across tensor parallel ranks in the
        `finalize` function. In `allgathercommimpl`, we still need to all-reduce the
        outputs since each rank only has partial outputs.
        """
-        return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(
-            final_hidden_states)
+        return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(final_hidden_states)

    def forward_impl(  # type: ignore[override]
-            self,
-            hidden_states: torch.Tensor,
-            router_logits: torch.Tensor,
-            return_with_event: bool = False) -> torch.Tensor | FusedMoEResult:
+        self, hidden_states: torch.Tensor, router_logits: torch.Tensor, return_with_event: bool = False
+    ) -> torch.Tensor | FusedMoEResult:
        assert self.quant_method is not None

        forward_context = get_forward_context()
@@ -301,15 +289,16 @@ class AscendFusedMoE(FusedMoE):
            fc3_context = get_flash_common3_context()
            assert fc3_context is not None
            AscendFusedMoE.gate_stream.wait_stream(torch.npu.current_stream())
-            with npu_stream_switch(AscendFusedMoE.gate_stream,
-                                   enabled=self.multistream_overlap_gate):
+            with npu_stream_switch(AscendFusedMoE.gate_stream, enabled=self.multistream_overlap_gate):
                # share_expert
                assert fc3_context.shared_experts is not None
                shared_out = fc3_context.shared_experts(hidden_states)
                # NOTE: This is exactly the opposite of `maybe_all_reduce_tensor_model_parallel`
                moe_comm_type = forward_context.moe_comm_type
-                if moe_comm_type in {MoECommType.ALLTOALL, MoECommType.MC2, MoECommType.FUSED_MC2} \
-                        and not shared_expert_dp_enabled():
+                if (
+                    moe_comm_type in {MoECommType.ALLTOALL, MoECommType.MC2, MoECommType.FUSED_MC2}
+                    and not shared_expert_dp_enabled()
+                ):
                    shared_out = tensor_model_parallel_all_reduce(shared_out)
                set_flash_common3_context(shared_out=shared_out)

@@ -325,24 +314,22 @@ class AscendFusedMoE(FusedMoE):
                    scoring_func=self.scoring_func,
                    routed_scaling_factor=self.routed_scaling_factor,
                    e_score_correction_bias=self.e_score_correction_bias,
-                    global_num_experts=self.global_num_experts)
+                    global_num_experts=self.global_num_experts,
+                )

-                if isinstance(forward_context.moe_comm_method,
-                              AllGatherCommImpl):
-                    topk_weights = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(
-                        topk_weights, True, True)
-                    topk_ids = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(
-                        topk_ids, True, True)
+                if isinstance(forward_context.moe_comm_method, AllGatherCommImpl):
+                    topk_weights = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(topk_weights, True, True)
+                    topk_ids = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(topk_ids, True, True)

-                set_flash_common3_context(topk_weights=topk_weights,
-                                          topk_ids=topk_ids)
+                set_flash_common3_context(topk_weights=topk_weights, topk_ids=topk_ids)

        hidden_states, router_logits, mc2_mask, context_metadata = forward_context.moe_comm_method.prepare(
            hidden_states=hidden_states,
            router_logits=router_logits,
            replace_allreduce=forward_context.sp_enabled,
            enable_shared_expert_dp=self.enable_shared_expert_dp,
-            quant_type=self.quant_type)
+            quant_type=self.quant_type,
+        )

        # Make sure the default stream waits for the gate stream to finish.
        if self.multistream_overlap_gate:
@@ -375,39 +362,45 @@ class AscendFusedMoE(FusedMoE):
            enable_force_load_balance=enable_force_load_balance,
            log2phy=self.log2phy,
            global_redundant_expert_num=self.global_redundant_expert_num,
-            mc2_mask=mc2_mask)
+            mc2_mask=mc2_mask,
+        )

        if self.dynamic_eplb:
            expert_tokens = fused_experts_results.expert_tokens
            group_list_type = fused_experts_results.group_list_type
-            assert expert_tokens is not None and group_list_type is not None, \
+            assert expert_tokens is not None and group_list_type is not None, (
                "expert_tokens and group_list_type should not be None when dynamic_eplb is enabled."
-            local_load = expert_tokens if group_list_type == 1 else \
-                torch.cat([expert_tokens[:1], expert_tokens[1:] - expert_tokens[:-1]])
+            )
+            local_load = (
+                expert_tokens
+                if group_list_type == 1
+                else torch.cat([expert_tokens[:1], expert_tokens[1:] - expert_tokens[:-1]])
+            )
            self.moe_load.add_(local_load)
        routed_out = forward_context.moe_comm_method.finalize(
            hidden_states=fused_experts_results.routed_out,
            reduce_results=self.reduce_results,
-            context_metadata=context_metadata)
+            context_metadata=context_metadata,
+        )

        if return_with_event:
            return FusedMoEResult(
                routed_out=routed_out,
                before_dispatch_evt=fused_experts_results.before_dispatch_evt,
-                before_combine_evt=fused_experts_results.before_combine_evt)
+                before_combine_evt=fused_experts_results.before_combine_evt,
+            )
        else:
            # The vLLM FusedMoE forward_impl does not return events.
            return routed_out


 class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
-
    def __init__(
        self,
        shared_experts: torch.nn.Module,
-        gate: Optional[torch.nn.Module] = None,
+        gate: torch.nn.Module | None = None,
        use_overlapped: bool = True,
-        routed_input_transform: Optional[torch.nn.Module] = None,
+        routed_input_transform: torch.nn.Module | None = None,
        **kwargs,
    ):
        AscendFusedMoE.__init__(self, **kwargs)
@@ -418,16 +411,12 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
        self.use_overlapped = use_overlapped
        self.shared_expert_stream = None
        ascend_config = get_ascend_config()
-        self.multistream_overlap_shared_expert = \
-            ascend_config.multistream_overlap_shared_expert and \
-            self._shared_experts is not None
-        self.multistream_overlap_gate = \
-            ascend_config.multistream_overlap_gate and \
-            self._shared_experts is not None
+        self.multistream_overlap_shared_expert = (
+            ascend_config.multistream_overlap_shared_expert and self._shared_experts is not None
+        )
+        self.multistream_overlap_gate = ascend_config.multistream_overlap_gate and self._shared_experts is not None
        if enable_sp():
-            logger.info_once(
-                "Sequence parallelism is enabled, shared experts are replicated for best performance."
-            )
+            logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.")

        self._gate = gate

@@ -447,20 +436,15 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
            self.quant_method.process_weights_after_loading = wrapped_process_weights  # type: ignore

    def _shared_experts_part1(self, hidden_states: torch.Tensor):
-        shared_gate_up, _ = self._shared_experts.gate_up_proj(
-            hidden_states)  # type: ignore
+        shared_gate_up, _ = self._shared_experts.gate_up_proj(hidden_states)  # type: ignore
        return shared_gate_up

-    def _shared_experts_part2(self, hidden_states: torch.Tensor,
-                              shared_gate_up: torch.Tensor):
-        shared_act = self._shared_experts.act_fn(
-            shared_gate_up)  # type: ignore
-        shared_out, _ = self._shared_experts.down_proj(
-            shared_act)  # type: ignore
+    def _shared_experts_part2(self, hidden_states: torch.Tensor, shared_gate_up: torch.Tensor):
+        shared_act = self._shared_experts.act_fn(shared_gate_up)  # type: ignore
+        shared_out, _ = self._shared_experts.down_proj(shared_act)  # type: ignore

        # Qwen3-Next specific gating mechanism
-        if hasattr(self._shared_experts, "expert_gate") and \
-            self._shared_experts.expert_gate is not None:
+        if hasattr(self._shared_experts, "expert_gate") and self._shared_experts.expert_gate is not None:
            gate_out, _ = self._shared_experts.expert_gate(hidden_states)  # type: ignore
            shared_out = F.sigmoid(gate_out) * shared_out
        return shared_out
@@ -468,9 +452,9 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
    def _validate_shared_expert_consistency(self):
        """Validate that split shared expert computation matches integrated
        computation."""
-        test_input = torch.rand(
-            10, self.hidden_size, device='npu', dtype=self.moe_config.in_dtype
-        ) * 2 - 1  # Random input for testing, scoped to [-1, 1]
+        test_input = (
+            torch.rand(10, self.hidden_size, device="npu", dtype=self.moe_config.in_dtype) * 2 - 1
+        )  # Random input for testing, scoped to [-1, 1]

        integrated_out = self._shared_experts(test_input)
        part1_out = self._shared_experts_part1(test_input)
@@ -478,25 +462,19 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):

        if not torch.allclose(integrated_out, split_out):
            diff = (integrated_out - split_out).abs()
-            logger.error(
-                "SharedFusedMoE shared experts split computation does not "
-                "match the integrated computation.")
+            logger.error("SharedFusedMoE shared experts split computation does not match the integrated computation.")
            logger.error(f"Max absolute difference: {diff.max().item()}")
-            logger.error("Integrated output - sum: %s, norm: %s",
-                         integrated_out.sum().item(),
-                         integrated_out.norm().item())
-            logger.error("Split output - sum: %s, norm: %s",
-                         split_out.sum().item(),
-                         split_out.norm().item())
+            logger.error(
+                "Integrated output - sum: %s, norm: %s", integrated_out.sum().item(), integrated_out.norm().item()
+            )
+            logger.error("Split output - sum: %s, norm: %s", split_out.sum().item(), split_out.norm().item())
            raise ValueError(
-                "SharedFusedMoE shared experts split computation does not "
-                "match the integrated computation.")
-        logger.info_once(
-            "SharedFusedMoE shared experts split computation matches the "
-            "integrated computation.")
+                "SharedFusedMoE shared experts split computation does not match the integrated computation."
+            )
+        logger.info_once("SharedFusedMoE shared experts split computation matches the integrated computation.")

    @property
-    def gate(self) -> Optional[torch.nn.Module]:
+    def gate(self) -> torch.nn.Module | None:
        return self._gate if self.use_overlapped else None

    @property
@@ -530,8 +508,7 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
        )
        return shared_out, fused_out

-    def _forward_shared_experts(self, hidden_states: torch.Tensor,
-                                fused_moe_evts: FusedMoEEvents):
+    def _forward_shared_experts(self, hidden_states: torch.Tensor, fused_moe_evts: FusedMoEEvents):
        if self._shared_experts is None:
            return None

@@ -539,11 +516,9 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
            if evt is not None:
                torch.npu.current_stream().wait_event(evt)

-        with npu_stream_switch(shared_experts_calculation_stream(),
-                               enabled=self.multistream_overlap_shared_expert):
+        with npu_stream_switch(shared_experts_calculation_stream(), enabled=self.multistream_overlap_shared_expert):
            # Ensure the shared experts wait for hidden_states to be ready.
-            torch.npu.current_stream().wait_event(
-                fused_moe_evts.before_routed_experts)
+            torch.npu.current_stream().wait_event(fused_moe_evts.before_routed_experts)
            # Execute the gate projection and activation concurrently with the
            # dispatch communication.
            maybe_wait_event(fused_moe_evts.before_dispatch)
@@ -556,20 +531,22 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
        # Make sure the default stream waits for the shared experts stream to
        # finish.
        if self.multistream_overlap_shared_expert:
-            torch.npu.current_stream().wait_stream(
-                shared_experts_calculation_stream())
+            torch.npu.current_stream().wait_stream(shared_experts_calculation_stream())

        # NOTE: This is exactly the opposite of
        # `maybe_all_reduce_tensor_model_parallel`
        forward_context = get_forward_context()
        moe_comm_type = forward_context.moe_comm_type
-        if moe_comm_type in {MoECommType.ALLTOALL, MoECommType.MC2, MoECommType.FUSED_MC2} \
-                and not shared_expert_dp_enabled():
+        if (
+            moe_comm_type in {MoECommType.ALLTOALL, MoECommType.MC2, MoECommType.FUSED_MC2}
+            and not shared_expert_dp_enabled()
+        ):
            shared_out = tensor_model_parallel_all_reduce(shared_out)
        return shared_out

    def forward_impl(  # type: ignore[override]
-            self, hidden_states: torch.Tensor, router_logits: torch.Tensor):
+        self, hidden_states: torch.Tensor, router_logits: torch.Tensor
+    ):
        if self.multistream_overlap_gate:
            set_flash_common3_context(shared_experts=self._shared_experts)

@@ -596,6 +573,7 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
                    before_routed_experts=before_routed_experts,
                    before_dispatch=fused_moe_results.before_dispatch_evt,
                    before_combine=fused_moe_results.before_combine_evt,
-                ))
+                ),
+            )

        return shared_out, routed_out