init v0.11.0rc0

2025-10-14 10:38:28 +08:00
parent 67afd0ea78
commit 66dc16f966
278 changed files with 28130 additions and 11708 deletions
--- a/vllm_ascend/ops/init.py
+++ b/vllm_ascend/ops/init.py
@@ -20,6 +20,7 @@ import torch
 import vllm_ascend.ops.common_fused_moe  # noqa
 import vllm_ascend.ops.fused_moe  # noqa
 import vllm_ascend.ops.layernorm  # noqa
+import vllm_ascend.ops.register_custom_ops  # noqa
 import vllm_ascend.ops.vocab_parallel_embedding  # noqa
 from vllm_ascend.ops.activation import AscendQuickGELU, AscendSiluAndMul
 from vllm_ascend.ops.rotary_embedding import (
@@ -34,19 +35,20 @@ class dummyFusionOp:


 def register_dummy_fusion_op() -> None:
-    torch.ops._C.rms_norm = dummyFusionOp(name="rms_norm")
-    torch.ops._C.fused_add_rms_norm = dummyFusionOp(name="fused_add_rms_norm")
-    torch.ops._C.static_scaled_fp8_quant = dummyFusionOp(
+    torch.ops._C_ascend.rms_norm = dummyFusionOp(name="rms_norm")
+    torch.ops._C_ascend.fused_add_rms_norm = dummyFusionOp(
+        name="fused_add_rms_norm")
+    torch.ops._C_ascend.static_scaled_fp8_quant = dummyFusionOp(
        name="static_scaled_fp8_quant")
-    torch.ops._C.dynamic_scaled_fp8_quant = dummyFusionOp(
+    torch.ops._C_ascend.dynamic_scaled_fp8_quant = dummyFusionOp(
        name="dynamic_scaled_fp8_quant")
-    torch.ops._C.dynamic_per_token_scaled_fp8_quant = dummyFusionOp(
+    torch.ops._C_ascend.dynamic_per_token_scaled_fp8_quant = dummyFusionOp(
        name="dynamic_per_token_scaled_fp8_quant")
-    torch.ops._C.rms_norm_static_fp8_quant = dummyFusionOp(
+    torch.ops._C_ascend.rms_norm_static_fp8_quant = dummyFusionOp(
        name="rms_norm_static_fp8_quant")
-    torch.ops._C.fused_add_rms_norm_static_fp8_quant = dummyFusionOp(
+    torch.ops._C_ascend.fused_add_rms_norm_static_fp8_quant = dummyFusionOp(
        name="fused_add_rms_norm_static_fp8_quant")
-    torch.ops._C.rms_norm_dynamic_per_token_quant = dummyFusionOp(
+    torch.ops._C_ascend.rms_norm_dynamic_per_token_quant = dummyFusionOp(
        name="rms_norm_dynamic_per_token_quant")


--- a/vllm_ascend/ops/activation.py
+++ b/vllm_ascend/ops/activation.py
@@ -35,8 +35,10 @@ class AscendSiluAndMul(SiluAndMul):

        from vllm_ascend.utils import is_310p

+        torch.ops.vllm.maybe_prefetch_mlp_down_proj(x)
        if is_310p():
            out = torch_npu.npu_swiglu(x.to(torch.float32)).to(torch.float16)
        else:
            out = torch_npu.npu_swiglu(x)
+        torch.ops.vllm.maybe_wait_prefetch_done(out)
        return out
--- a/vllm_ascend/ops/casual_conv1d.py
+++ b/vllm_ascend/ops/casual_conv1d.py
@@ -0,0 +1,539 @@
+# Adapted from vllm/model_executor/layers/mamba/ops/causal_conv1d.py
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright (c) 2024, Tri Dao.
+# Adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/causal_conv1d/causal_conv1d_interface.py
+# and https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
+# mypy: ignore-errors
+
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+import triton
+import triton.language as tl
+
+# Sentinel cache index for padded batch entries: it is the default value of
+# the `pad_slot_id` arguments in this module, and entries whose cache index
+# equals it are skipped.
+PAD_SLOT_ID = -1
+
+
+def causal_conv1d_ref(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    initial_states: Optional[torch.Tensor] = None,
+    return_final_states: bool = False,
+    final_states_out: Optional[torch.Tensor] = None,
+    activation: Optional[str] = "silu",
+):
+    """
+    Reference causal depthwise 1-D convolution implemented with ``F.conv1d``.
+
+    x: (batch, dim, seqlen)
+    weight: (dim, width)
+    bias: (dim,)
+    initial_states: (batch, dim, width - 1)
+        if given, it is prepended to ``x`` along the last dim instead of
+        zero-padding the input
+    return_final_states: whether the trailing ``width - 1`` input columns
+        are returned (and written to ``final_states_out`` if provided)
+    final_states_out: (batch, dim, width - 1)
+        updated in place via ``copy_`` when provided
+    activation: None, "silu" or "swish"; both names apply ``F.silu``
+    out: (batch, dim, seqlen)
+
+    Returns ``(out, None)`` when ``return_final_states`` is False, otherwise
+    ``(out, final_states_out)``.
+
+    Raises NotImplementedError for any other ``activation`` value.
+    """
+    if activation not in [None, "silu", "swish"]:
+        raise NotImplementedError("activation must be None, silu, or swish")
+    # Compute in the weight dtype; results are cast back to dtype_in below.
+    dtype_in = x.dtype
+    x = x.to(weight.dtype)
+    seqlen = x.shape[-1]
+    dim, width = weight.shape
+
+    if initial_states is None:
+        # groups=dim makes this a depthwise conv (one filter per channel).
+        # Padding of width - 1 plus the truncation to seqlen below keeps only
+        # outputs that depend on current and past positions.
+        out = F.conv1d(x,
+                       weight.unsqueeze(1),
+                       bias,
+                       padding=width - 1,
+                       groups=dim)
+    else:
+        # The carried-over state plays the role of the left padding. Note
+        # that x now includes the state columns, which the final-state
+        # extraction below relies on.
+        x = torch.cat([initial_states, x], dim=-1)
+        out = F.conv1d(x, weight.unsqueeze(1), bias, padding=0, groups=dim)
+    out = out[..., :seqlen]
+    if return_final_states:
+        # Left pad amount is (width - 1) - len(x): negative values crop from
+        # the left, positive values zero-fill, so the result always has
+        # width - 1 columns.
+        final_states = F.pad(x, (width - 1 - x.shape[-1], 0)).to(
+            dtype_in)  # (batch, dim, width - 1)
+        if final_states_out is not None:
+            final_states_out.copy_(final_states)
+        else:
+            final_states_out = final_states
+    out = (out if activation is None else F.silu(out)).to(dtype=dtype_in)
+    return (out, None) if not return_final_states else (out, final_states_out)
+
+
+def causal_conv1d_fn(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    query_start_loc: Optional[torch.Tensor] = None,
+    cache_indices: Optional[torch.Tensor] = None,
+    has_initial_state: Optional[torch.Tensor] = None,
+    conv_states: Optional[torch.Tensor] = None,
+    activation: Optional[str] = "silu",
+    pad_slot_id: int = PAD_SLOT_ID,
+):
+    """
+    Variable-length causal conv1d that runs ``causal_conv1d_ref`` once per
+    sequence and concatenates the results.
+
+    Although annotated as Optional, ``query_start_loc``, ``cache_indices``,
+    ``has_initial_state`` and ``conv_states`` are all indexed unconditionally
+    and therefore must be provided.
+
+    x: (dim, cu_seq_len)
+        sequences are concatenated from left to right
+    weight: (dim, width)
+    bias: (dim,)
+    query_start_loc: (batch + 1) int32
+        The cumulative sequence lengths of the sequences in
+        the batch, used to index into sequence. prepended by 0.
+        for example: query_start_loc = torch.Tensor([0,10,16,17]),
+        x.shape=(dim,17)
+    cache_indices: (batch)  int32
+        indicates the corresponding state index,
+        like so: conv_state = conv_states[cache_indices[batch_id]]
+    has_initial_state: (batch) bool
+        indicates whether the current state should be used as the initial
+        state for the calculation
+    conv_states: (...,dim,width - 1) itype
+        updated inplace with the final state of each processed sequence
+    activation: either None or "silu" or "swish"
+    pad_slot_id: int
+            lets the function identify padded entries that will not be
+            processed,
+            for example: cache_indices = [pad_slot_id, 1, 20, pad_slot_id]
+            in this case, entries at indices 0 and 3 are skipped
+    out: the per-sequence outputs concatenated along the last dim; padded
+        sequences are skipped and contribute no columns
+
+    Raises NotImplementedError for an unsupported ``activation``.
+    """
+    if activation not in [None, "silu", "swish"]:
+        raise NotImplementedError("activation must be None, silu, or swish")
+    if x.stride(-1) != 1:
+        x = x.contiguous()
+    bias = bias.contiguous() if bias is not None else None
+
+    # Per-sequence lengths recovered from the cumulative offsets.
+    seqlens = (query_start_loc[1:] - query_start_loc[:-1]).tolist()
+    splits = torch.split(x, seqlens, dim=-1)
+
+    outputs = []
+    for i, x_s in enumerate(splits):
+        state_idx = cache_indices[i]
+        # Honor the caller-supplied sentinel instead of the module constant,
+        # so a non-default pad_slot_id is respected.
+        if state_idx == pad_slot_id:
+            continue
+        out, _ = causal_conv1d_ref(
+            x_s,
+            weight,
+            bias,
+            activation=activation,
+            return_final_states=True,
+            # View into conv_states: the reference implementation writes the
+            # new state back into the cache through copy_.
+            final_states_out=conv_states[state_idx].unsqueeze(0),
+            initial_states=conv_states[state_idx]
+            if has_initial_state[i] else None)
+        outputs.append(out)
+    return torch.cat(outputs, dim=-1)
+
+
+# Decode-time causal conv1d update kernel.
+#
+# Launch grid: axis 0 indexes the sequence in the batch, axis 1 indexes a
+# block of BLOCK_N feature channels. Each program
+#   1. reads the prior conv-state columns for its channels,
+#   2. shifts the stored conv state and appends the new tokens from x,
+#   3. computes the convolution output for each of the `seqlen` tokens and
+#      stores it to o_ptr, optionally saving the per-token window.
+#
+# Only KERNEL_WIDTH values 2, 3 and 4 have compute branches in STEP 5.
+# `cache_seqlens_ptr` is accepted but never referenced in the body.
+@triton.jit()
+def _causal_conv1d_update_kernel(
+    # Pointers to matrices
+    x_ptr,  # (batch, dim, seqlen)
+    w_ptr,  # (dim, width)
+    bias_ptr,
+    conv_state_ptr,
+    cache_seqlens_ptr,  # circular buffer
+    conv_state_indices_ptr,
+    num_accepted_tokens_ptr,
+    intermediate_conv_window_ptr,
+    o_ptr,  # (batch, dim, seqlen)
+    # Matrix dimensions
+    batch: int,
+    dim: tl.constexpr,
+    seqlen: tl.constexpr,
+    state_len: tl.constexpr,
+    num_cache_lines: tl.constexpr,  # added to support vLLM larger cache lines
+    # Strides
+    stride_x_seq: tl.constexpr,
+    stride_x_dim: tl.constexpr,
+    stride_x_token: tl.constexpr,
+    stride_w_dim: tl.constexpr,
+    stride_w_width: tl.constexpr,
+    stride_conv_state_seq: tl.constexpr,
+    stride_conv_state_dim: tl.constexpr,
+    stride_conv_state_tok: tl.constexpr,
+    stride_state_indices: tl.constexpr,
+    stride_inter_seq: tl.constexpr,
+    stride_inter_step: tl.constexpr,
+    stride_inter_dim: tl.constexpr,
+    stride_inter_win: tl.constexpr,
+    stride_o_seq: tl.constexpr,
+    stride_o_dim: tl.constexpr,
+    stride_o_token: tl.constexpr,
+    # others
+    pad_slot_id: tl.constexpr,
+    # Meta-parameters
+    HAS_BIAS: tl.constexpr,
+    KERNEL_WIDTH: tl.constexpr,
+    SILU_ACTIVATION: tl.constexpr,
+    IS_CONTINUOUS_BATCHING: tl.constexpr,
+    IS_SPEC_DECODING: tl.constexpr,
+    NP2_STATELEN: tl.constexpr,
+    USE_PAD_SLOT: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    SAVE_INTERMEDIATE: tl.constexpr,
+):
+    # ruff: noqa: E501
+    idx_seq = tl.program_id(0)
+    if idx_seq >= batch:
+        return
+
+    # [BLOCK_N,] elements along the feature-dimension (channel)
+    idx_feats = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)
+
+    if IS_CONTINUOUS_BATCHING:
+        # Look up which cache line holds this sequence's conv state.
+        # mask = idx_seq < batch
+        conv_state_batch_coord = tl.load(conv_state_indices_ptr +
+                                         idx_seq * stride_state_indices).to(
+                                             tl.int64)
+    else:
+        # Without an index tensor, sequence i uses cache line i.
+        conv_state_batch_coord = idx_seq
+    if USE_PAD_SLOT:  # noqa
+        if conv_state_batch_coord == pad_slot_id:
+            # not processing as this is not the actual sequence
+            return
+
+    if IS_SPEC_DECODING:
+        # The rolling of conv state:
+        #
+        # Before forward, the conv_state is:
+        # [history1, history2, ..., historyM].
+        #
+        # After forward, the conv_state becomes:
+        # [history2, ..., historyM, draft1, draft2, ..., draftN].
+        #
+        # After acceptance, it becomes:
+        #
+        # - accept 1 tokens: [history2, ..., historyM, draft1]
+        # - accept 2 tokens: [history3, ..., historyM, draft1, draft2]
+        # - and so on.
+        conv_state_token_offset = tl.load(num_accepted_tokens_ptr +
+                                          idx_seq) - 1
+    else:
+        conv_state_token_offset = 0
+
+    # STEP 1: READ init_state data
+    conv_states_base = (conv_state_ptr +
+                        (conv_state_batch_coord * stride_conv_state_seq) +
+                        (idx_feats * stride_conv_state_dim))
+    mask_w = idx_feats < dim
+
+    # col0..col2 hold the up-to-three previous inputs per channel, starting
+    # at the (possibly offset) first state token.
+    prior_tokens = conv_states_base + conv_state_token_offset * stride_conv_state_tok
+    if KERNEL_WIDTH >= 2:
+        conv_states_ptrs = prior_tokens  # [BLOCK_N]
+        col0 = tl.load(conv_states_ptrs, mask_w, 0.0)
+    if KERNEL_WIDTH >= 3:
+        conv_states_ptrs = prior_tokens + 1 * stride_conv_state_tok  # [BLOCK_N]
+        col1 = tl.load(conv_states_ptrs, mask_w, 0.0)
+    if KERNEL_WIDTH >= 4:
+        conv_states_ptrs = prior_tokens + 2 * stride_conv_state_tok  # [BLOCK_N]
+        col2 = tl.load(conv_states_ptrs, mask_w, 0.0)
+    if KERNEL_WIDTH == 5:
+        # NOTE(review): the pointer is computed but the load is commented
+        # out, and STEP 5 has no width-5 branch, so width 5 appears to be
+        # unsupported here - confirm.
+        conv_states_ptrs = prior_tokens + 3 * stride_conv_state_tok  # [BLOCK_N]
+        #col3 = tl.load(conv_states_ptrs, mask_w, 0.0)
+
+    # STEP 2: assume state_len > seqlen
+    idx_tokens = tl.arange(0, NP2_STATELEN)  # [BLOCK_M]
+
+    # The conv_state is updated in a sliding-window manner: at each forward
+    # pass the tokens are shifted by 1, so we load starting from
+    # idx_tokens + 1.
+    conv_state_ptrs_source = (
+        conv_state_ptr + (conv_state_batch_coord * stride_conv_state_seq) +
+        conv_state_token_offset * stride_conv_state_tok +
+        (idx_feats * stride_conv_state_dim)[None, :] +
+        ((idx_tokens + 1) * stride_conv_state_tok)[:, None]
+    )  # [BLOCK_M, BLOCK_N]
+    # Rows selected by this mask keep (shifted) old state; the remaining rows
+    # are filled from x below.
+    mask = ((conv_state_batch_coord < num_cache_lines)
+            & ((idx_tokens + seqlen) < state_len)[:, None]
+            & (idx_feats < dim)[None, :])
+    conv_state = tl.load(conv_state_ptrs_source, mask, other=0.0)
+
+    # Number of state rows that are not overwritten by the new tokens.
+    VAL = state_len - seqlen
+    x_base = x_ptr + (idx_seq * stride_x_seq) + (idx_feats * stride_x_dim
+                                                 )  # [BLOCK_N]
+
+    x_ptrs = (x_base[None, :] + ((idx_tokens - VAL) * stride_x_token)[:, None]
+              )  # [BLOCK_M, BLOCK_N]
+
+    mask_x = ((idx_tokens - VAL >= 0)[:, None]
+              & (idx_tokens - VAL < seqlen)[:, None]
+              & (idx_feats < dim)[None, :]
+              )  # token-index  # token-index  # feature-index
+    loaded_x = tl.load(x_ptrs, mask_x, 0.0)
+    # Presumably ensures the loads of the old state have completed before the
+    # in-place store to the same buffer below - TODO confirm.
+    tl.debug_barrier()
+
+    new_conv_state = tl.where(mask, conv_state, loaded_x)
+
+    # The updated state is written back starting at token 0 of the cache
+    # line (no conv_state_token_offset on the store side).
+    conv_state_base = (conv_state_ptr +
+                       (conv_state_batch_coord * stride_conv_state_seq) +
+                       (idx_feats * stride_conv_state_dim))  # [BLOCK_N,]
+    conv_state_ptrs_target = (conv_state_base +
+                              (idx_tokens * stride_conv_state_tok)[:, None]
+                              )  # [BLOCK_M, BLOCK_N]
+    mask = (idx_tokens < state_len)[:, None] & (idx_feats < dim)[None, :]
+    tl.store(conv_state_ptrs_target, new_conv_state, mask)
+
+    # STEP 3: init accumulator
+    if HAS_BIAS:
+        bias = bias_ptr + idx_feats
+        mask_bias = idx_feats < dim
+        acc_preload = tl.load(bias, mask=mask_bias,
+                              other=0.0).to(tl.float32)  # [BLOCK_N]
+    else:
+        acc_preload = tl.zeros((BLOCK_N, ), dtype=tl.float32)
+
+    # STEP 4:
+    # PRE-LOAD WEIGHTS
+    # first kernel column, configured for weights to handle BLOCK_N features in range
+    w_base = w_ptr + (idx_feats * stride_w_dim)  # [BLOCK_N,]
+    mask_w = idx_feats < dim
+    if KERNEL_WIDTH >= 2:
+        w_ptrs = w_base + (0 * stride_w_width)  # [BLOCK_N] tensor
+        w_col0 = tl.load(w_ptrs, mask_w, other=0.0)
+        w_ptrs = w_base + (1 * stride_w_width)  # [BLOCK_N] tensor
+        w_col1 = tl.load(w_ptrs, mask_w, other=0.0)
+    if KERNEL_WIDTH >= 3:
+        w_ptrs = w_base + (2 * stride_w_width)  # [BLOCK_N] tensor
+        w_col2 = tl.load(w_ptrs, mask_w, other=0.0)
+    if KERNEL_WIDTH >= 4:
+        w_ptrs = w_base + (3 * stride_w_width)  # [BLOCK_N] tensor
+        w_col3 = tl.load(w_ptrs, mask_w, other=0.0)
+
+    x_base_1d = x_base  # starting of chunk [BLOCK_N]
+    mask_x_1d = idx_feats < dim
+
+    # STEP 5: compute each token
+    for idx_token in tl.static_range(seqlen):
+        acc = acc_preload
+
+        # j == 0 always uses (w_col0, col0); later taps pick the matching
+        # state column, and the last tap reads the current token from x.
+        matrix_w = w_col0
+        matrix_x = col0
+        for j in tl.static_range(KERNEL_WIDTH):
+            if KERNEL_WIDTH == 2:
+                if j == 1:  # KERNEL_WIDTH-1:
+                    matrix_w = w_col1
+                    x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
+                    matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
+            elif KERNEL_WIDTH == 3:
+                if j == 1:
+                    matrix_w = w_col1
+                    matrix_x = col1
+                elif j == 2:
+                    matrix_w = w_col2
+                    x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
+                    matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
+            elif KERNEL_WIDTH == 4:
+                if j == 1:
+                    matrix_w = w_col1
+                    matrix_x = col1
+                elif j == 2:
+                    matrix_w = w_col2
+                    matrix_x = col2
+                elif j == 3:
+                    matrix_w = w_col3
+                    x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
+                    matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
+
+            acc += matrix_x * matrix_w  # [BLOCK_N]
+
+        # Slide the window by one token: after the inner loop matrix_x holds
+        # the current token's input, which becomes the newest column.
+        if KERNEL_WIDTH == 2:
+            col0 = matrix_x
+        elif KERNEL_WIDTH == 3:
+            col0 = col1
+            col1 = matrix_x
+        elif KERNEL_WIDTH == 4:
+            col0 = col1
+            col1 = col2
+            col2 = matrix_x
+
+        if SILU_ACTIVATION:
+            # SiLU written out explicitly: x * sigmoid(x) == x / (1 + exp(-x)).
+            acc = acc / (1 + tl.exp(-acc))
+        # mask_1d = (idx_token < seqlen) & (
+        #     idx_feats < dim
+        # )  # token-index  # feature-index
+        # Same mask as the commented-out expression above, built with
+        # tl.where from a feature mask and an all-False mask.
+        maskL = idx_feats < dim
+        maskR = tl.full(maskL.shape, False, tl.int1)
+        mask_1d = tl.where(idx_token < seqlen, maskL, maskR)
+
+        o_ptrs = (o_ptr + (idx_seq) * stride_o_seq +
+                  idx_token * stride_o_token + (idx_feats * stride_o_dim))
+
+        tl.store(o_ptrs, acc, mask=mask_1d)
+
+        if SAVE_INTERMEDIATE:
+            # Save the window state after consuming this token
+            # Layout: [seq(cache line), step, dim, win(K-1)]
+            base_ptr = (intermediate_conv_window_ptr +
+                        conv_state_batch_coord * stride_inter_seq +
+                        idx_token * stride_inter_step +
+                        idx_feats * stride_inter_dim)
+            if KERNEL_WIDTH >= 2:
+                tl.store(base_ptr + 0 * stride_inter_win, col0, mask=mask_w)
+            if KERNEL_WIDTH >= 3:
+                tl.store(base_ptr + 1 * stride_inter_win, col1, mask=mask_w)
+            if KERNEL_WIDTH >= 4:
+                tl.store(base_ptr + 2 * stride_inter_win, col2, mask=mask_w)
+
+
+def causal_conv1d_update_npu(
+    x: torch.Tensor,
+    conv_state: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    activation: Union[bool, str, None] = None,
+    cache_seqlens: Optional[torch.Tensor] = None,
+    conv_state_indices: Optional[torch.Tensor] = None,
+    num_accepted_tokens: Optional[torch.Tensor] = None,
+    intermediate_conv_window: Optional[torch.Tensor] = None,
+    pad_slot_id: int = PAD_SLOT_ID,
+    metadata=None,
+    validate_data=False,
+):
+    """
+    Launch the Triton conv1d update kernel for one decode step.
+
+    The result is written in place into ``x`` (``out`` aliases ``x``) and
+    ``conv_state`` is passed to the kernel, which updates it in place.
+
+    x: (batch, dim) or (batch, dim, seqlen)
+        [shape=2: single token prediction]
+        [shape=3: single or multiple tokens prediction]
+    conv_state: (..., dim, state_len), where state_len >= width - 1
+        must be 3-D here, since its size is unpacked into three values
+    weight: (dim, width)
+    bias: (dim,)
+    activation: True / "silu" / "swish" enable SiLU; False / None disable it
+    cache_seqlens: (batch,), dtype int32.
+        If not None, the conv_state is treated as a circular buffer.
+        The conv_state will be updated by copying x to the conv_state
+        starting at the index
+        @cache_seqlens % state_len.
+        Not implemented: asserted to be None when validate_data is True.
+    conv_state_indices: (batch,), dtype int32
+        If not None, the conv_state is a larger tensor along the batch dim,
+        and we are selecting the batch coords specified by conv_state_indices.
+        Useful for a continuous batching scenario.
+    num_accepted_tokens: if not None, enables the kernel's
+        speculative-decoding path (IS_SPEC_DECODING)
+    intermediate_conv_window: if not None, a 4-D buffer the kernel saves
+        per-token window state into (SAVE_INTERMEDIATE)
+    pad_slot_id: int
+            if cache_indices is passed, lets the kernel identify padded
+            entries that will not be processed,
+            for example: cache_indices = [pad_slot_id, 1 ,20 ,pad_slot_id]
+            in this case, the kernel will not process entries at
+            indices 0 and 3
+    metadata: accepted but not used by this function
+    validate_data: when True, run the shape/stride assertions below
+    out: (batch, dim) or (batch, dim, seqlen)
+    """
+    if validate_data:
+        assert cache_seqlens is None  # not implemented yet - ok for vLLM
+        assert pad_slot_id is not None
+        assert x.stride(1) == 1
+    # Normalize `activation` to either "silu" or None.
+    if isinstance(activation, bool):
+        activation = "silu" if activation is True else None
+    elif activation is not None:
+        assert activation in ["silu", "swish"]
+    unsqueeze = x.dim() == 2
+    if unsqueeze:
+        # make it (batch, dim, seqlen) with seqlen == 1
+        x = x.unsqueeze(-1)
+    batch, dim, seqlen = x.shape
+    _, width = weight.shape
+    # conv_state: (..., dim, state_len), where state_len >= width - 1
+    num_cache_lines, _, state_len = conv_state.size()
+
+    if validate_data:
+        assert dim == weight.size(0)
+        assert (
+            conv_state.stride(-2) == 1
+        ), f"ERROR: expect contiguous along feat-dim of conv_state (currently stride={conv_state.stride()})"
+        assert state_len >= width - 1
+        # when above happens, we don't shift-left to keep any records in conv_state
+        assert dim == conv_state.size(1)
+        if conv_state_indices is None:
+            assert conv_state.size(0) >= batch
+        else:
+            assert (batch, ) == conv_state_indices.shape
+
+        assert num_cache_lines >= batch
+        assert weight.stride(1) == 1  # Need this
+        assert cache_seqlens is None  # not needed for vLLM - circular buffer
+
+    # adopt the strategy in vLLM that overwrite on 'x' directly, rather than creating a new tensor 'o'
+    out = x
+    stride_w_dim, stride_w_width = weight.stride()
+
+    stride_x_seq, stride_x_dim, stride_x_token = x.stride(
+    )  # X (batch, dim, seqlen)
+
+    stride_o_seq, stride_o_dim, stride_o_token = out.stride()
+    stride_istate_seq, stride_istate_dim, stride_istate_token = conv_state.stride(
+    )
+    stride_state_indices = (conv_state_indices.stride(0)
+                            if conv_state_indices is not None else 0)
+    # This replaces the state_len unpacked from conv_state.size() above; the
+    # kernel only receives this effective value.
+    state_len = width - 1 + (seqlen - 1)  # effective state_len needed
+    # tl.arange in the kernel is sized by this power-of-two value.
+    np2_statelen = triton.next_power_of_2(state_len)
+
+    # One program per (sequence, block of BLOCK_N channels).
+    def grid(META):
+        return (
+            batch,
+            triton.cdiv(dim, META["BLOCK_N"]),
+        )
+
+    # prepare intermediate buffer strides if provided
+    if intermediate_conv_window is not None:
+        stride_inter_seq, stride_inter_step, stride_inter_dim, stride_inter_win = (
+            intermediate_conv_window.stride(0),
+            intermediate_conv_window.stride(1),
+            intermediate_conv_window.stride(2),
+            intermediate_conv_window.stride(3),
+        )
+    else:
+        stride_inter_seq = stride_inter_step = stride_inter_dim = stride_inter_win = 0
+
+    _causal_conv1d_update_kernel[grid](
+        # Pointers to matrices
+        x,
+        weight,
+        bias,
+        conv_state,
+        cache_seqlens,
+        conv_state_indices,
+        num_accepted_tokens,
+        # x is passed as a placeholder pointer when no buffer is given;
+        # SAVE_INTERMEDIATE is False in that case.
+        intermediate_conv_window
+        if intermediate_conv_window is not None else x,
+        out,
+        # Matrix dimensions
+        batch,
+        dim,
+        seqlen,
+        state_len,
+        num_cache_lines,
+        # stride
+        stride_x_seq,
+        stride_x_dim,
+        stride_x_token,
+        stride_w_dim,
+        stride_w_width,
+        stride_istate_seq,
+        stride_istate_dim,
+        stride_istate_token,
+        stride_state_indices,
+        stride_inter_seq,
+        stride_inter_step,
+        stride_inter_dim,
+        stride_inter_win,
+        stride_o_seq,
+        stride_o_dim,
+        stride_o_token,
+        # others
+        pad_slot_id,
+        # META
+        HAS_BIAS=bias is not None,
+        KERNEL_WIDTH=width,
+        SILU_ACTIVATION=activation in ["silu", "swish"],
+        IS_CONTINUOUS_BATCHING=conv_state_indices is not None,
+        IS_SPEC_DECODING=num_accepted_tokens is not None,
+        NP2_STATELEN=np2_statelen,
+        USE_PAD_SLOT=pad_slot_id is not None,
+        BLOCK_N=128,
+        SAVE_INTERMEDIATE=intermediate_conv_window is not None,
+    )
+    if unsqueeze:
+        # Restore the caller's original 2-D (batch, dim) layout.
+        out = out.squeeze(-1)
+    return out
--- a/vllm_ascend/ops/common_fused_moe.py
+++ b/vllm_ascend/ops/common_fused_moe.py
@@ -14,212 +14,32 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
-from typing import Any, Callable, Optional
+import os.path
+from typing import Callable, Optional

 import torch
 import torch_npu
 from vllm.config import CompilationLevel, get_current_vllm_config
-from vllm.distributed import get_dp_group, get_ep_group, get_tp_group
+from vllm.distributed import (get_dp_group, get_ep_group, get_tp_group,
+                              tensor_model_parallel_all_reduce)
 from vllm.forward_context import get_forward_context
-from vllm.model_executor.layers.fused_moe.config import \
-    FusedMoEParallelConfig  # isort: skip
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, UnquantizedFusedMoEMethod)
+    FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
+from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE

 from vllm_ascend.ascend_config import get_ascend_config
-from vllm_ascend.distributed.moe_comm_method import (AllGatherCommImpl,
-                                                     AlltoAllCommImpl,
-                                                     MC2CommImpl)
+from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.distributed.parallel_state import get_mc2_group
-from vllm_ascend.ops.layers.experts_selector import select_experts
-from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
-    setup_token_dispatchers
-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p, vllm_version_is
+from vllm_ascend.eplb.core.eplb_utils import (determine_default_expert_map,
+                                              determine_default_log2phy_map)
+from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
+from vllm_ascend.ops.moe.experts_selector import select_experts
+from vllm_ascend.ops.moe.moe_comm_method import setup_moe_comm_method
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p, npu_stream_switch

 original_unquantized_fused_moe_init_func = UnquantizedFusedMoEMethod.__init__


-def fused_experts(
-    hidden_states: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    activation: str = "silu",
-    apply_router_weight_on_input: bool = False,
-    use_int8_w8a8: bool = False,
-    use_int4_w4a8: bool = False,
-    global_num_experts: Optional[int] = None,
-    expert_map: Optional[torch.Tensor] = None,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    w1_scale_bias: torch.Tensor = None,
-    w2_scale_bias: torch.Tensor = None,
-    # For TorchAir graph
-    is_torchair: bool = False,
-    # For Cube/Vector parallel
-    shared_experts: Optional[Any] = None,
-    quantized_x_for_share: Optional[Any] = None,
-    dynamic_scale_for_share: Optional[Any] = None,
-    # For load balance
-    log2phy: torch.Tensor = None,
-    global_redundant_expert_num: int = 0,
-) -> torch.Tensor:
-    # Check constraints
-    assert hidden_states.shape[1] == w1.shape[1], (
-        f"Hidden size mismatch {hidden_states.shape[1]} != {w1.shape[1]}")
-    assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
-    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
-    assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
-    assert w2.stride(-1) == 1, "Stride of last dimension must be 1"
-    assert hidden_states.dtype in [
-        torch.float32, torch.float16, torch.bfloat16
-    ]
-    if (use_int8_w8a8 or use_int4_w4a8):
-        assert w1_scale is not None and w2_scale is not None, \
-            "INT8 quantization requires weight scales."
-
-        w1_scale = w1_scale.to(torch.float32)
-        down_scale = [w2_scale]
-        down_output_dtype = w2_scale.dtype
-    else:
-        down_scale = None
-        down_output_dtype = None
-
-    moe_comm_method = get_forward_context().moe_comm_method
-    assert moe_comm_method is not None, "Missing communication context"
-
-    num_experts = w1.shape[0]
-
-    permuted_hidden_states, expert_tokens, dynamic_scale, group_list_type = moe_comm_method.permute(
-        hidden_states, topk_ids, topk_weights, expert_map, num_experts,
-        use_int8_w8a8 or use_int4_w4a8)
-
-    gate_up_output = torch_npu.npu_grouped_matmul(
-        x=[permuted_hidden_states],
-        weight=[w1],
-        split_item=2,
-        group_list_type=group_list_type,
-        group_type=0,
-        group_list=expert_tokens,
-        output_dtype=torch.int32 if use_int8_w8a8 else None,
-    )[0]
-
-    if (use_int8_w8a8 or use_int4_w4a8):
-        activated_output, activated_output_scale = torch_npu.npu_dequant_swiglu_quant(
-            x=gate_up_output,
-            weight_scale=w1_scale,
-            activation_scale=dynamic_scale,
-            bias=None,
-            quant_scale=None,
-            quant_offset=None,
-            group_index=expert_tokens,
-            activate_left=True,
-            quant_mode=1,
-        )
-        activated_output_scale = [activated_output_scale]
-    else:
-        activated_output = torch_npu.npu_swiglu(gate_up_output)
-        activated_output_scale = None
-
-    down_output = torch_npu.npu_grouped_matmul(
-        x=[activated_output],
-        weight=[w2],
-        scale=down_scale,
-        per_token_scale=activated_output_scale,
-        split_item=2,
-        group_list_type=group_list_type,
-        group_type=0,
-        group_list=expert_tokens,
-        output_dtype=down_output_dtype,
-    )[0]
-
-    moe_comm_method.unpermute(down_output, hidden_states)
-
-    return hidden_states
-
-
-def fused_experts_moge(
-    hidden_states: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    moe_parallel_config: FusedMoEParallelConfig,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    top_k: int,
-    global_num_experts: int,
-    expert_map: torch.Tensor = None,
-    apply_router_weight_on_input: bool = False,
-) -> torch.Tensor:
-    """
-
-    Args:
-        hidden_states: Hidden states of shape (num_tokens, hidden_size).
-        w1: Expert weights1 of shape (num_experts, intermediate_size * 2, hidden_size).
-        w2: Expert weights2 of shape (num_experts, hidden_size, intermediate_size).
-        topk_weights: Routing weights of shape (num_tokens, top_k).
-        topk_ids: Selected expert IDs of shape (num_tokens, top_k).
-        top_k: Number of experts to select.
-        expert_map: Expert mapping of shape (num_experts,).
-
-    Returns:
-        hidden_states: Hidden states after routing.
-    """
-    ep_size = moe_parallel_config.ep_size
-    local_num_experts = global_num_experts // ep_size
-    local_num_group = top_k // ep_size
-
-    bsz, _ = hidden_states.shape
-    flatten_topk_ids = topk_ids.view(-1)
-    sorted_topk_ids = torch.argsort(flatten_topk_ids.float())
-    sorted_topk_ids = sorted_topk_ids.to(torch.int32)
-    sorted_hidden_states = hidden_states.index_select(
-        0, sorted_topk_ids // local_num_group)
-
-    experts_id = torch.arange(0,
-                              local_num_experts,
-                              dtype=topk_ids.dtype,
-                              device=topk_ids.device)
-    num_tokens_per_expert = (flatten_topk_ids.unsqueeze(-1) == experts_id).to(
-        torch.float32).sum(0)
-    topk_scales = topk_weights.view(-1).index_select(
-        0, sorted_topk_ids).unsqueeze(-1)
-    group_list = num_tokens_per_expert.cumsum(dim=0).to(torch.int64)
-
-    gate_up_out = torch_npu.npu_grouped_matmul(
-        x=[sorted_hidden_states],
-        weight=[w1],
-        split_item=2,
-        group_list_type=0,
-        group_type=0,
-        group_list=group_list,
-    )[0]
-
-    if is_310p():
-        gate_up_out = torch_npu.npu_swiglu(gate_up_out.to(torch.float32)).to(
-            torch.float16)
-    else:
-        gate_up_out = torch_npu.npu_swiglu(gate_up_out)
-    gate_up_out *= topk_scales
-
-    down_out_list = torch_npu.npu_grouped_matmul(
-        x=[gate_up_out],
-        weight=[w2],
-        split_item=2,
-        group_list_type=0,
-        group_type=0,
-        group_list=group_list,
-    )[0]
-
-    unsorted_topk_ids = torch.argsort(sorted_topk_ids.float()).to(torch.int32)
-    unsorted_hidden_states = down_out_list.index_select(0, unsorted_topk_ids)
-    final_hidden_states = unsorted_hidden_states.reshape(
-        bsz, top_k // ep_size, -1).sum(1)
-
-    return final_hidden_states
-
-
 def unquantized_fused_moe_init_func(self, *args, **kwargs):
    original_unquantized_fused_moe_init_func(self, *args, **kwargs)

@@ -235,67 +55,7 @@ def unquantized_fused_moe_init_func(self, *args, **kwargs):
        self.use_aclgraph = (vllm_config.compilation_config.level
                             == CompilationLevel.PIECEWISE
                             and not vllm_config.model_config.enforce_eager)
-
-
-def forward_oot_v01011(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        use_grouped_topk: bool,
-        top_k: int,
-        router_logits: torch.Tensor,
-        renormalize: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
-        custom_routing_function: Optional[Callable] = None,
-        scoring_func: str = "softmax",
-        e_score_correction_bias: Optional[torch.Tensor] = None,
-        global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None) -> torch.Tensor:
-
-    topk_weights, topk_ids, _ = select_experts(
-        hidden_states=x,
-        router_logits=router_logits,
-        top_k=top_k,
-        use_grouped_topk=use_grouped_topk,
-        renormalize=renormalize,
-        topk_group=topk_group,
-        num_expert_group=num_expert_group,
-        custom_routing_function=custom_routing_function,
-        scoring_func=scoring_func,
-        routed_scaling_factor=1.0,
-        e_score_correction_bias=e_score_correction_bias,
-        global_num_experts=global_num_experts)
-
-    if topk_ids.shape[1] < top_k or is_310p():
-        assert global_num_experts is not None
-        return fused_experts_moge(
-            hidden_states=x,
-            w1=layer.w13_weight,
-            w2=layer.w2_weight,
-            moe_parallel_config=self.moe.moe_parallel_config,
-            topk_weights=topk_weights,
-            topk_ids=topk_ids,
-            top_k=top_k,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
-            apply_router_weight_on_input=apply_router_weight_on_input)
-
-    return fused_experts(
-        hidden_states=x,
-        w1=layer.w13_weight,
-        w2=layer.w2_weight,
-        topk_weights=topk_weights,
-        topk_ids=topk_ids,
-        global_num_experts=global_num_experts,
-        expert_map=expert_map,
-    )
+    self.transpose = True


 def forward_oot(
@@ -321,7 +81,7 @@ def forward_oot(
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None) -> torch.Tensor:

-    topk_weights, topk_ids, _ = select_experts(
+    topk_weights, topk_ids, row_idx = select_experts(
        hidden_states=x,
        router_logits=router_logits,
        top_k=top_k,
@@ -335,40 +95,35 @@ def forward_oot(
        e_score_correction_bias=e_score_correction_bias,
        global_num_experts=global_num_experts)

-    if topk_ids.shape[1] < top_k or is_310p():
-        assert global_num_experts is not None
-        return fused_experts_moge(
-            hidden_states=x,
-            w1=layer.w13_weight,
-            w2=layer.w2_weight,
-            moe_parallel_config=self.moe.moe_parallel_config,
-            topk_weights=topk_weights,
-            topk_ids=topk_ids,
-            top_k=top_k,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
-            apply_router_weight_on_input=apply_router_weight_on_input)
-
-    return fused_experts(
-        hidden_states=x,
-        w1=layer.w13_weight,
-        w2=layer.w2_weight,
-        topk_weights=topk_weights,
-        topk_ids=topk_ids,
-        global_num_experts=global_num_experts,
-        expert_map=expert_map,
-    )
+    moe_comm_method = get_forward_context().moe_comm_method
+    return moe_comm_method.fused_experts(hidden_states=x,
+                                         w1=layer.w13_weight,
+                                         w2=layer.w2_weight,
+                                         topk_weights=topk_weights,
+                                         topk_ids=topk_ids,
+                                         row_idx=row_idx,
+                                         global_num_experts=global_num_experts,
+                                         expert_map=expert_map)


 def process_weights_after_loading(self, layer):
    super(UnquantizedFusedMoEMethod, self).process_weights_after_loading(layer)
-    w13_data = self._maybe_pad_weight(layer.w13_weight.data).transpose(
-        1, 2).contiguous()
-    layer.w13_weight = torch.nn.Parameter(w13_data, requires_grad=False)
+    if self.transpose:
+        w13_data = self._maybe_pad_weight(layer.w13_weight.data).transpose(
+            1, 2).contiguous()
+        layer.w13_weight = torch.nn.Parameter(w13_data, requires_grad=False)

-    w2_data = self._maybe_pad_weight(layer.w2_weight.data).transpose(
-        1, 2).contiguous()
-    layer.w2_weight = torch.nn.Parameter(w2_data, requires_grad=False)
+        w2_data = self._maybe_pad_weight(layer.w2_weight.data).transpose(
+            1, 2).contiguous()
+        layer.w2_weight = torch.nn.Parameter(w2_data, requires_grad=False)
+
+        self.transpose = False
+    else:
+        w13_data = self._maybe_pad_weight(layer.w13_weight.data)
+        layer.w13_weight = torch.nn.Parameter(w13_data, requires_grad=False)
+
+        w2_data = self._maybe_pad_weight(layer.w2_weight.data)
+        layer.w2_weight = torch.nn.Parameter(w2_data, requires_grad=False)

    if not is_310p():
        layer.w13_weight.data = torch_npu.npu_format_cast(
@@ -378,119 +133,88 @@ def process_weights_after_loading(self, layer):


 class AscendFusedMoE(FusedMoE):
+    moe_counter = -1

-    def __init__(
-        self,
-        num_experts,
-        top_k,
-        hidden_size,
-        intermediate_size,
-        params_dtype=None,
-        reduce_results=False,
-        renormalize=True,
-        use_grouped_topk=False,
-        num_expert_group=None,
-        topk_group=None,
-        quant_config=None,
-        tp_size=None,
-        ep_size=None,
-        dp_size=None,
-        prefix="",
-        custom_routing_function=None,
-        scoring_func="softmax",
-        routed_scaling_fator: float = 1.0,
-        e_score_correction_bias=None,
-        apply_router_weight_on_input=False,
-        activation="silu",
-        enable_eplb=False,
-        num_redundant_experts=0,
-        has_bias=False,
-    ):
-        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
-            super().__init__(
-                num_experts,
-                top_k,
-                hidden_size,
-                intermediate_size,
-                params_dtype,
-                reduce_results,
-                renormalize,
-                use_grouped_topk,
-                num_expert_group,
-                topk_group,
-                quant_config,
-                tp_size,
-                ep_size,
-                dp_size,
-                prefix,
-                custom_routing_function,
-                scoring_func,
-                e_score_correction_bias,
-                apply_router_weight_on_input,
-                activation,
-                enable_eplb,
-                num_redundant_experts,
-                has_bias,
-            )
-        else:
-            super().__init__(
-                num_experts,
-                top_k,
-                hidden_size,
-                intermediate_size,
-                params_dtype,
-                reduce_results,
-                renormalize,
-                use_grouped_topk,
-                num_expert_group,
-                topk_group,
-                quant_config,
-                tp_size,
-                ep_size,
-                dp_size,
-                prefix,
-                custom_routing_function,
-                scoring_func,
-                routed_scaling_fator,
-                e_score_correction_bias,
-                apply_router_weight_on_input,
-                activation,
-                enable_eplb,
-                num_redundant_experts,
-                has_bias,
-            )
-
-        setup_token_dispatchers(self.moe_config.ep_size,
-                                top_k=self.top_k,
-                                num_experts=self.global_num_experts,
-                                num_local_experts=self.local_num_experts)
+    def __init__(self, *args, **kwargs):
+        """Build the layer via the parent constructor, then add Ascend-specific state.
+
+        All positional and keyword arguments are forwarded unchanged to the
+        parent ``__init__``. Afterwards this assigns a per-instance id, stores
+        the TP/DP/EP/MC2 groups on ``moe_config``, resolves the expert map
+        (from a readable map file, from the default layout when dynamic EPLB
+        is on, or from ``determine_expert_map``) and finally calls
+        ``setup_moe_comm_method``.
+        """
+        super().__init__(*args, **kwargs)

+        # The class-level counter starts at -1, so instance ids are 0-based
+        # and follow construction order.
+        AscendFusedMoE.moe_counter += 1
+        self.moe_instance_id = AscendFusedMoE.moe_counter
        self.moe_config.tp_group = get_tp_group()
        self.moe_config.dp_group = get_dp_group()
        self.moe_config.ep_group = get_ep_group()
        self.moe_config.mc2_group = get_mc2_group()
+        ascend_config = get_ascend_config()
+        self.dynamic_eplb = ascend_config.dynamic_eplb
+        self.expert_map_path = ascend_config.expert_map_path
+        self.global_redundant_expert_num = ascend_config.init_redundancy_expert
+        # Static EPLB: taken only when expert_map_path is set, exists and is readable.
+        if self.expert_map_path and os.path.exists(
+                self.expert_map_path) and os.access(self.expert_map_path,
+                                                    os.R_OK):
+            self.expert_load_balancer = ExpertLoadBalancer(
+                self.expert_map_path, self.global_num_experts)
+            # Placement and log2phy maps are looked up by (instance id, EP rank);
+            # presumably the instance id selects this layer's entry — TODO confirm.
+            self.local_num_experts, self.expert_map = (
+                self.expert_load_balancer.get_rank_placement_map(
+                    self.moe_instance_id, self.ep_rank))
+            self.log2phy = self.expert_load_balancer.get_rank_log2phy_map(
+                self.moe_instance_id, self.ep_rank).npu()
+            # The balancer's redundant-expert count replaces the config value set above.
+            self.global_redundant_expert_num = (
+                self.expert_load_balancer.get_global_redundant_expert_num())
+        else:
+            # No usable map file: start from determine_expert_map for this EP rank.
+            self.local_num_experts, self.expert_map = determine_expert_map(
+                self.ep_size, self.ep_rank, self.global_num_experts)
+            # Dynamic EPLB without a map file: override with the default
+            # expert and log2phy maps built from the redundant-expert count.
+            if self.dynamic_eplb:
+                # Re-assigns the same config value that was stored before the branch.
+                self.global_redundant_expert_num = ascend_config.init_redundancy_expert
+                self.local_num_experts, self.expert_map = determine_default_expert_map(
+                    self.global_num_experts, self.ep_size, self.ep_rank,
+                    self.global_redundant_expert_num)
+                self.log2phy = determine_default_log2phy_map(
+                    self.global_num_experts, self.ep_size, self.ep_rank,
+                    self.global_redundant_expert_num)
+        # Local variable (distinct from self.local_num_experts): the number of
+        # expert_map entries that are not -1, or the global expert count when
+        # there is no map. With a map present this is the result of torch.sum.
+        local_num_experts = (torch.sum(
+            self.expert_map != -1) if self.expert_map is not None else
+                             self.global_num_experts)
+        # moe_load is only created when dynamic EPLB is enabled.
+        if self.dynamic_eplb:
+            self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64)

-        for method in {AllGatherCommImpl, AlltoAllCommImpl, MC2CommImpl}:
-            setattr(
-                self, method.__name__.lower(),
-                method(moe_config=self.moe_config))  # type: ignore[abstract]
+        setup_moe_comm_method(self.moe_config)
+
+    def update_expert_map(self, new_expert_map):
+        """Replace ``self.expert_map`` with ``new_expert_map``."""
+        self.expert_map = new_expert_map
+
+    def get_map(self):
+        """Return the current ``self.expert_map``."""
+        return self.expert_map
+
+    def get_log2phy_map(self):
+        """Return ``self.logical_to_physical_map``.
+
+        NOTE(review): the constructor stores its mapping in ``self.log2phy``;
+        confirm that ``logical_to_physical_map`` is the attribute intended here.
+        """
+        return self.logical_to_physical_map
+
+    def clear_moe_load(self):
+        """Zero the accumulated expert-load counters in place, if they exist.
+
+        ``moe_load`` is only created by the constructor when dynamic EPLB is
+        enabled, so the attribute may be missing altogether. A missing or
+        ``None`` ``moe_load`` makes this a no-op instead of raising
+        ``AttributeError``.
+        """
+        moe_load = getattr(self, "moe_load", None)
+        if moe_load is not None:
+            moe_load.zero_()
+
+    def maybe_all_reduce_tensor_model_parallel(
+            self, final_hidden_states: torch.Tensor):
+        """Pass ``final_hidden_states`` to the ``torch.ops.vllm`` op of the same name.
+
+        NOTE(Yizhou): This overrides the parent class method. With
+        `mc2commimpl` and `alltoallcommimpl` the final outputs need no
+        all-reduce, because the `finalize` function has already aggregated
+        them across tensor parallel ranks. With `allgathercommimpl` each rank
+        only has partial outputs, so the all-reduce is still needed.
+        """
+        return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(
+            final_hidden_states)

    def forward_impl(self, hidden_states: torch.Tensor,
                     router_logits: torch.Tensor):
        assert self.quant_method is not None

        forward_context = get_forward_context()
-        moe_comm_method_name = forward_context.moe_comm_method_name
-
-        # TODO: Can we refactor this logic to model_runner?
-        # TODO: Adjusted logic to differentiate between A2 and A3, we check ep_size here since mc2 only support ep_size >= 16 on A3 now
-        if self.moe_config.ep_size < 16:
-            moe_comm_method_name = "allgathercommimpl"
-
-        forward_context.moe_comm_method = getattr(self, moe_comm_method_name)
-
        hidden_states, router_logits = forward_context.moe_comm_method.prepare(
-            hidden_states=hidden_states, router_logits=router_logits)
+            hidden_states=hidden_states,
+            router_logits=router_logits,
+            replace_allreduce=forward_context.sp_enabled)

        # Matrix multiply.
        final_hidden_states = self.quant_method.apply(
@@ -514,6 +238,12 @@ class AscendFusedMoE(FusedMoE):
            logical_to_physical_map=self.logical_to_physical_map,
            logical_replica_count=self.logical_replica_count,
        )
+        if isinstance(final_hidden_states, tuple):
+            final_hidden_states, group_list_type, expert_tokens = final_hidden_states
+
+        if self.dynamic_eplb:
+            self.moe_load += expert_tokens if group_list_type else \
+                torch.cat([expert_tokens[:1], expert_tokens[1:] - expert_tokens[:-1]])

        final_hidden_states = forward_context.moe_comm_method.finalize(
            hidden_states=final_hidden_states,
@@ -521,11 +251,118 @@ class AscendFusedMoE(FusedMoE):

        return final_hidden_states

+    def transpose_weight(self, loaded_weight, expert_data, shard_dim):
+        """Transpose ``loaded_weight`` when neither leading dim matches ``expert_data``.
+
+        Keeps training and inference weight shapes aligned during RL weight
+        updates. Returns ``(weight, shard_dim)``: when both dim 0 and dim 1
+        differ from ``expert_data``, the weight is returned transposed (and
+        contiguous) with ``shard_dim`` flipped via ``int(not shard_dim)``;
+        otherwise both inputs are returned untouched.
+        """
+        rows_differ = loaded_weight.shape[0] != expert_data.shape[0]
+        cols_differ = loaded_weight.shape[1] != expert_data.shape[1]
+        if not (cols_differ and rows_differ):
+            return loaded_weight, shard_dim
+        flipped = loaded_weight.transpose(0, 1).contiguous()
+        return flipped, int(not shard_dim)
+
+    def _load_w13(self,
+                  expert_data: torch.Tensor,
+                  shard_dim: int,
+                  shard_id: str,
+                  loaded_weight: torch.Tensor,
+                  tp_rank: int,
+                  load_full: bool = False):
+        """Copy one half ("w1" or "w3") of the merged w13 weight into ``expert_data``.
+
+        Args:
+            expert_data: Destination tensor holding the merged w13 weight.
+            shard_dim: Dimension along which the weight is split; may be
+                flipped by ``transpose_weight`` before use.
+            shard_id: ``"w1"`` writes the first half along ``shard_dim``;
+                ``"w3"`` writes the second half. Any other value trips the assert.
+            loaded_weight: Source tensor to load.
+            tp_rank: Picks the slice of ``loaded_weight`` at offset
+                ``shard_size * tp_rank``.
+            load_full: When true, ``loaded_weight`` is not sliced by ``tp_rank``.
+        """
+        # Index the loaded weight for tp sharding.
+        # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim
+        loaded_weight, shard_dim = self.transpose_weight(
+            loaded_weight, expert_data, shard_dim)
+        # expert_data holds both halves, so each logical weight spans half of shard_dim.
+        shard_size = expert_data.shape[shard_dim] // 2
+        if not load_full:
+            loaded_weight = loaded_weight.narrow(shard_dim,
+                                                 shard_size * tp_rank,
+                                                 shard_size)
+        # Narrow parameter and load.
+        # w1, gate_proj: Load into first logical weight of w13.
+        if shard_id == "w1":
+            expert_data = expert_data.narrow(shard_dim, 0, shard_size)
+        # w3, up_proj: Load into second logical weight of w13.
+        else:
+            assert shard_id == "w3"
+            expert_data = expert_data.narrow(shard_dim, shard_size, shard_size)
+        # narrow() returns a view, so copy_ writes into the caller's tensor.
+        expert_data.copy_(loaded_weight)
+
+    def _load_w2(self,
+                 expert_data: torch.Tensor,
+                 shard_dim: int,
+                 loaded_weight: torch.Tensor,
+                 tp_rank: int,
+                 load_full: bool = False):
+        """Copy the w2 (down_proj) weight into ``expert_data``.
+
+        Args:
+            expert_data: Destination tensor for the w2 weight.
+            shard_dim: Dimension along which the weight is split; may be
+                flipped by ``transpose_weight`` before use.
+            loaded_weight: Source tensor to load.
+            tp_rank: Picks the slice of ``loaded_weight`` at offset
+                ``shard_size * tp_rank``.
+            load_full: When true, ``loaded_weight`` is not sliced by ``tp_rank``.
+        """
+        # Index the loaded weight for tp sharding.
+        # down_proj: "RowParallel" so tp sharding on input_dim
+        # Narrow parameter and load.
+        loaded_weight, shard_dim = self.transpose_weight(
+            loaded_weight, expert_data, shard_dim)
+        # The destination's full extent along shard_dim is this rank's shard size.
+        shard_size = expert_data.shape[shard_dim]
+        if not load_full:
+            loaded_weight = loaded_weight.narrow(shard_dim,
+                                                 shard_size * tp_rank,
+                                                 shard_size)
+        # w2, down_proj: Load into only logical weight of w2.
+        expert_data.copy_(loaded_weight)
+
+
+class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
+    """MoE layer that runs a shared-experts module next to the routed experts.
+
+    ``forward`` and ``forward_impl`` both return ``(shared_out, fused_out)``.
+    When ``multistream_overlap_shared_expert`` is set in the Ascend config,
+    the shared experts run on a dedicated NPU stream.
+    """
+
+    def __init__(
+        self,
+        shared_experts: torch.nn.Module,
+        use_overlapped: bool = True,
+        **kwargs,
+    ):
+        """Initialise via ``AscendFusedMoE.__init__`` and keep the shared-experts module.
+
+        Args:
+            shared_experts: Module called on the hidden states in ``forward_impl``.
+            use_overlapped: Stored on the instance; not read in this class.
+            **kwargs: Forwarded to ``AscendFusedMoE.__init__``.
+        """
+        # Only AscendFusedMoE.__init__ is called explicitly here.
+        AscendFusedMoE.__init__(self, **kwargs)
+        self._shared_experts = shared_experts
+        self.use_overlapped = use_overlapped
+        # Stays None unless multistream overlap is enabled below.
+        self.shared_expert_stream = None
+        ascend_config = get_ascend_config()
+        self.multistream_overlap_shared_expert = ascend_config.multistream_overlap_shared_expert
+        if self.multistream_overlap_shared_expert:
+            self.shared_expert_stream = torch.npu.Stream()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Call ``AscendFusedMoE.forward`` and return its ``(shared_out, fused_out)`` pair."""
+        shared_out, fused_out = AscendFusedMoE.forward(
+            self,
+            hidden_states=hidden_states,
+            router_logits=router_logits,
+        )
+        return shared_out, fused_out
+
+    def forward_impl(self, hidden_states: torch.Tensor,
+                     router_logits: torch.Tensor):
+        """Run the shared experts, then the routed experts, on ``hidden_states``.
+
+        Returns:
+            ``(shared_out, fused_output)``: the shared-experts output and the
+            result of ``AscendFusedMoE.forward_impl``.
+        """
+        # Make sure the shared experts stream begins after hidden_states are ready.
+        if self.multistream_overlap_shared_expert:
+            self.shared_expert_stream.wait_stream(  # type: ignore
+                torch.npu.current_stream())
+        with npu_stream_switch(self.shared_expert_stream,
+                               enabled=self.multistream_overlap_shared_expert):
+            # Use a separate stream to run shared experts.
+            shared_out = self._shared_experts(hidden_states)
+
+            # NOTE: This is exactly the opposite of `maybe_all_reduce_tensor_model_parallel`
+            # The shared output is all-reduced here only for the ALLTOALL and MC2 comm types.
+            forward_context = get_forward_context()
+            moe_comm_type = forward_context.moe_comm_type
+            if moe_comm_type in {MoECommType.ALLTOALL, MoECommType.MC2}:
+                shared_out = tensor_model_parallel_all_reduce(shared_out)
+        # The routed experts are issued outside the stream-switch block.
+        fused_output = AscendFusedMoE.forward_impl(
+            self,
+            hidden_states=hidden_states,
+            router_logits=router_logits,
+        )
+        # Make sure the default stream waits for the shared experts stream to finish.
+        if self.multistream_overlap_shared_expert:
+            torch.npu.current_stream().wait_stream(self.shared_expert_stream)
+        return shared_out, fused_output
+

 UnquantizedFusedMoEMethod.__init__ = unquantized_fused_moe_init_func
 UnquantizedFusedMoEMethod.process_weights_after_loading = process_weights_after_loading
-
-if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
-    UnquantizedFusedMoEMethod.forward_oot = forward_oot_v01011
-else:
-    UnquantizedFusedMoEMethod.forward_oot = forward_oot
+UnquantizedFusedMoEMethod.forward_oot = forward_oot
--- a/vllm_ascend/ops/fla.py
+++ b/vllm_ascend/ops/fla.py
@@ -0,0 +1,218 @@
+# Adapted from https://github.com/fla-org/flash-linear-attention/blob/main/fla/modules/layernorm_gated.py
+# Copyright (c) 2024, Tri Dao.
+# Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
+# For the backward pass, we keep weight_grad and bias_grad in registers and accumulate.
+# This backward pass is faster for dimensions up to 8k, but after that it's much slower due to register spilling.
+# The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
+# mypy: ignore-errors
+
+import torch
+import torch.nn.functional as F
+import triton
+from vllm.model_executor.layers.fla.ops.layernorm_guard import \
+    layer_norm_fwd_kernel
+
+
+def _layer_norm_fwd(
+    x,
+    weight,
+    bias,
+    eps,
+    z=None,
+    out=None,
+    group_size=None,
+    norm_before_gate=True,
+    is_rms_norm=False,
+):
+    """Validate the inputs and launch ``layer_norm_fwd_kernel`` on a 2-D input.
+
+    Args:
+        x: 2-D tensor of shape ``(M, N)`` with unit stride on the last dim.
+        weight: Tensor of shape ``(N,)``.
+        bias: Optional tensor of shape ``(N,)``.
+        eps: Epsilon passed through to the kernel.
+        z: Optional gate tensor of shape ``(M, N)``.
+        out: Optional preallocated output with the same shape as ``x``;
+            allocated with ``torch.empty_like(x)`` when omitted.
+        group_size: Group width; defaults to ``N`` and must divide ``N``.
+        norm_before_gate: Forwarded to the kernel as ``NORM_BEFORE_GATE``.
+        is_rms_norm: Forwarded to the kernel as ``IS_RMS_NORM``; when true no
+            ``mean`` buffer is allocated.
+
+    Returns:
+        ``(out, mean, rstd)``; ``mean`` is ``None`` when ``is_rms_norm`` is true.
+
+    Raises:
+        RuntimeError: If ``group_size`` is larger than the block size that
+            fits in 64KB for ``x``'s element size.
+    """
+    M, N = x.shape
+    if group_size is None:
+        group_size = N
+    assert N % group_size == 0
+    ngroups = N // group_size
+    assert x.stride(-1) == 1
+    if z is not None:
+        assert z.stride(-1) == 1
+        assert z.shape == (M, N)
+    assert weight.shape == (N, )
+    assert weight.stride(-1) == 1
+    if bias is not None:
+        assert bias.stride(-1) == 1
+        assert bias.shape == (N, )
+    # allocate output
+    if out is not None:
+        assert out.shape == x.shape
+    else:
+        out = torch.empty_like(x)
+    assert out.stride(-1) == 1
+    # One float32 statistic per (row, group) pair.
+    mean = (torch.empty((ngroups * M, ), dtype=torch.float32, device=x.device)
+            if not is_rms_norm else None)
+    rstd = torch.empty((ngroups * M, ), dtype=torch.float32, device=x.device)
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size))
+    if group_size > BLOCK_N:
+        raise RuntimeError(
+            "This layer norm doesn't support feature dim >= 64KB.")
+    # heuristics for number of warps
+    # (BLOCK_N // 256, clamped to the range 1..8)
+    num_warps = min(max(BLOCK_N // 256, 1), 8)
+    # Launch grid of M x ngroups programs.
+    grid = (M, ngroups)
+    # Launch with x's device index selected through torch.npu.device.
+    with torch.npu.device(x.device.index):
+        layer_norm_fwd_kernel[grid](
+            x,
+            out,
+            weight,
+            bias,
+            z,
+            mean,
+            rstd,
+            x.stride(0),
+            out.stride(0),
+            z.stride(0) if z is not None else 0,
+            M,
+            group_size,
+            eps,
+            BLOCK_N=BLOCK_N,
+            NORM_BEFORE_GATE=norm_before_gate,
+            IS_RMS_NORM=is_rms_norm,
+            num_warps=num_warps,
+        )
+    return out, mean, rstd
+
+
+class LayerNormFn(torch.autograd.Function):
+    """``autograd.Function`` wrapper around ``_layer_norm_fwd``.
+
+    Only ``forward`` is defined in this class; ``ctx`` is not used and the
+    ``mean``/``rstd`` statistics are not saved.
+    """
+
+    @staticmethod
+    def forward(
+        ctx,
+        x,
+        weight,
+        bias,
+        z=None,
+        eps=1e-6,
+        group_size=None,
+        norm_before_gate=True,
+        is_rms_norm=False,
+    ):
+        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))
+
+        ``x`` (and ``z``, which must have the same shape) is flattened to 2-D
+        over its last dimension before the call to ``_layer_norm_fwd``; the
+        result is reshaped back to the original shape of ``x``.
+        """
+
+        x_shape_og = x.shape
+        # reshape input data into 2D tensor
+        x = x.reshape(-1, x.shape[-1])
+        # _layer_norm_fwd asserts unit stride on the last dim.
+        if x.stride(-1) != 1:
+            x = x.contiguous()
+        if z is not None:
+            assert z.shape == x_shape_og
+            z = z.reshape(-1, z.shape[-1])
+            if z.stride(-1) != 1:
+                z = z.contiguous()
+        weight = weight.contiguous()
+        if bias is not None:
+            bias = bias.contiguous()
+        # mean and rstd are returned by the helper but not used afterwards.
+        y, mean, rstd = _layer_norm_fwd(
+            x,
+            weight,
+            bias,
+            eps,
+            z=z,
+            group_size=group_size,
+            norm_before_gate=norm_before_gate,
+            is_rms_norm=is_rms_norm,
+        )
+        return y.reshape(x_shape_og)
+
+
+def torch_chunk_gated_delta_rule(
+    query,
+    key,
+    value,
+    g,
+    beta,
+    chunk_size=64,
+    initial_state=None,
+    output_final_state=False,
+    use_qk_l2norm_in_kernel=False,
+):
+    """Chunk-wise gated delta rule written with plain PyTorch ops.
+
+    All five tensor inputs have dims 1 and 2 swapped and are cast to float32
+    for the computation; the output is swapped back and cast to the dtype of
+    ``query``.
+
+    Args:
+        query, key, value: 4-D tensors (``key.shape`` is unpacked into four
+            names below). NOTE(review): query and key are repeated 2x along
+            dim 1 after the swap, so value presumably carries twice as many
+            entries on that dim — confirm against the caller.
+        g: Per-position log-decay term; it is cumulatively summed per chunk
+            and exponentiated.
+        beta: Per-position factor multiplied into ``key`` and ``value``.
+        chunk_size: Chunk length; dim 2 (after the swap) is zero-padded up to
+            a multiple of it.
+        initial_state: Optional starting recurrent state; zeros when omitted.
+        output_final_state: When false, the returned state is ``None``.
+        use_qk_l2norm_in_kernel: When true, ``query`` and ``key`` are
+            L2-normalised over the last dim first.
+
+    Returns:
+        ``(core_attn_out, last_recurrent_state)``.
+    """
+    initial_dtype = query.dtype
+    if use_qk_l2norm_in_kernel:
+        query = F.normalize(query, p=2, dim=-1)
+        key = F.normalize(key, p=2, dim=-1)
+    query, key, value, beta, g = [
+        x.transpose(1, 2).contiguous().to(torch.float32)
+        for x in (query, key, value, beta, g)
+    ]
+
+    # NOTE(review): these names follow the unpack order of key.shape *after*
+    # the transpose above. `num_heads` is the dim that gets padded and chunked
+    # below, and `sequence_length` is dim 1 of the recurrent state — confirm
+    # the names against the caller's layout.
+    batch_size, sequence_length, num_heads, k_head_dim = key.shape
+    v_head_dim = value.shape[-1]
+    # Zero-pad the chunked dim (second-to-last for q/k/v, last for beta/g) to
+    # a multiple of chunk_size.
+    pad_size = (chunk_size - num_heads % chunk_size) % chunk_size
+    # Hard-coded 2x repeat of query/key along dim 1 (see the docstring note).
+    query = F.pad(query, (0, 0, 0, pad_size)).repeat_interleave(2, dim=1)
+    key = F.pad(key, (0, 0, 0, pad_size)).repeat_interleave(2, dim=1)
+    value = F.pad(value, (0, 0, 0, pad_size))
+    beta = F.pad(beta, (0, pad_size))
+    g = F.pad(g, (0, pad_size))
+    tot_heads = num_heads + pad_size
+    # Scale query by 1 / sqrt(last-dim size).
+    scale = 1 / (query.shape[-1]**0.5)
+    query = query * scale
+
+    v_beta = value * beta.unsqueeze(-1)
+    k_beta = key * beta.unsqueeze(-1)
+    # reshape to chunks
+    query, key, value, k_beta, v_beta = [
+        x.reshape(x.shape[0], x.shape[1], -1, chunk_size, x.shape[-1])
+        for x in (query, key, value, k_beta, v_beta)
+    ]
+    g = g.reshape(g.shape[0], g.shape[1], -1, chunk_size)
+    # Upper triangle including the diagonal: those entries are zeroed below.
+    mask = torch.triu(torch.ones(chunk_size,
+                                 chunk_size,
+                                 dtype=torch.bool,
+                                 device=query.device),
+                      diagonal=0)
+
+    # chunk decay
+    g = g.cumsum(dim=-1)
+    decay_mask = ((g.unsqueeze(-1) -
+                   g.unsqueeze(-2)).tril().exp().float()).tril()
+    attn = -(
+        (k_beta @ key.transpose(-1, -2)) * decay_mask).masked_fill(mask, 0)
+    # In-place, row-by-row update of the strictly lower triangle: row i reads
+    # rows < i that were already updated, so the iteration order matters.
+    for i in range(1, chunk_size):
+        row = attn[..., i, :i].clone()
+        sub = attn[..., :i, :i].clone()
+        attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
+    attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
+    value = attn @ v_beta
+    k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1))
+
+    # State of shape (dim 0, dim 1, k_head_dim, v_head_dim); .to(value) matches
+    # value's dtype and device.
+    last_recurrent_state = (torch.zeros(batch_size, sequence_length,
+                                        k_head_dim, v_head_dim).to(value) if
+                            initial_state is None else initial_state.to(value))
+
+    core_attn_out = torch.zeros_like(value)
+    # Strictly upper triangle this time: the diagonal is kept.
+    mask = torch.triu(torch.ones(chunk_size,
+                                 chunk_size,
+                                 dtype=torch.bool,
+                                 device=query.device),
+                      diagonal=1)
+
+    # for each chunk
+    for i in range(0, tot_heads // chunk_size):
+        q_i, k_i, v_i = query[:, :, i], key[:, :, i], value[:, :, i]
+        attn = (q_i @ k_i.transpose(-1, -2) *
+                decay_mask[:, :, i]).masked_fill_(mask, 0)
+        # Subtract the part of v_i derived from the carried-over state.
+        v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
+        v_new = v_i - v_prime
+        attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
+        core_attn_out[:, :, i] = attn_inter + attn @ v_new
+        # Decay the state by the chunk's final cumulative g, then add this
+        # chunk's contribution.
+        last_recurrent_state = (
+            last_recurrent_state * g[:, :, i, -1, None, None].exp() +
+            (k_i *
+             (g[:, :, i, -1, None] - g[:, :, i]).exp()[..., None]).transpose(
+                 -1, -2) @ v_new)
+
+    if not output_final_state:
+        last_recurrent_state = None
+    # Merge the chunk dims back together, then drop the padded tail.
+    core_attn_out = core_attn_out.reshape(core_attn_out.shape[0],
+                                          core_attn_out.shape[1], -1,
+                                          core_attn_out.shape[-1])
+    core_attn_out = core_attn_out[:, :, :num_heads]
+    core_attn_out = core_attn_out.transpose(1,
+                                            2).contiguous().to(initial_dtype)
+    return core_attn_out, last_recurrent_state
--- a/vllm_ascend/ops/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe.py
@@ -19,13 +19,9 @@ import os
 from typing import Any, Callable, Optional

 import torch
-import torch.distributed as dist
 import torch_npu
-from torch import nn
 from vllm.config import get_current_vllm_config
-from vllm.distributed import (get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce)
+from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import (get_dp_group, get_ep_group,
                                             get_tp_group)
 from vllm.forward_context import get_forward_context
@@ -39,70 +35,16 @@ from vllm.model_executor.layers.quantization.base_config import \
    QuantizationConfig

 from vllm_ascend.ascend_config import get_ascend_config
-from vllm_ascend.ascend_forward_context import FusedMoEState
-from vllm_ascend.distributed.communication_op import \
-    data_parallel_reduce_scatter
 from vllm_ascend.distributed.parallel_state import get_mc2_group
+from vllm_ascend.eplb.core.eplb_utils import (determine_default_expert_map,
+                                              determine_default_log2phy_map)
 from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
-from vllm_ascend.ops.layers.experts_selector import select_experts
-from vllm_ascend.ops.layers.moe_mlp import unified_apply_mlp
-from vllm_ascend.ops.sequence_parallel import MetadataForPadding
-from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, dispose_tensor,
+from vllm_ascend.ops.moe.experts_selector import select_experts
+from vllm_ascend.ops.moe.moe_comm_method import setup_moe_comm_method
+from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ,
                               get_all_reduce_merge_state,
-                               get_rm_router_logits_state, is_310p)
-
-
-def unified_fused_experts_eager(hidden_states: torch.Tensor,
-                                w1: torch.Tensor,
-                                w2: torch.Tensor,
-                                topk_weights: torch.Tensor,
-                                topk_ids: torch.Tensor,
-                                row_idx: torch.Tensor,
-                                expert_map: Optional[torch.Tensor] = None,
-                                log2phy: Optional[torch.Tensor] = None,
-                                global_redundant_expert_num: int = 0,
-                                w1_scale: Optional[torch.Tensor] = None,
-                                w1_scale_bias: Optional[torch.Tensor] = None,
-                                w2_scale: Optional[torch.Tensor] = None,
-                                w2_scale_bias: Optional[torch.Tensor] = None,
-                                shared_experts: Optional[torch.Tensor] = None,
-                                shared_gate_up: Optional[Any] = None,
-                                shared_dequant_scale: Optional[Any] = None,
-                                mc2_mask: Optional[torch.Tensor] = None,
-                                apply_router_weight_on_input: bool = False,
-                                with_quant: bool = False):
-    token_dispatcher = get_forward_context().token_dispatcher
-
-    results = token_dispatcher.token_dispatch(
-        hidden_states=hidden_states,
-        topk_weights=topk_weights,
-        topk_ids=topk_ids,
-        row_idx=row_idx,
-        expert_map=expert_map,
-        log2phy=log2phy,
-        global_redundant_expert_num=global_redundant_expert_num,
-        shared_experts=shared_experts,
-        shared_gate_up=shared_gate_up,
-        shared_dequant_scale=shared_dequant_scale,
-        mc2_mask=mc2_mask,
-        apply_router_weight_on_input=apply_router_weight_on_input,
-        with_quant=with_quant)
-
-    expert_output = unified_apply_mlp(
-        hidden_states=results["hidden_states"],
-        w1=w1,
-        w1_scale=w1_scale,
-        w2=w2,
-        w2_scale=w2_scale,
-        group_list=results["group_list"],
-        dynamic_scale=results.get("dynamic_scale"),
-        group_list_type=results.get("group_list_type"),
-        w1_scale_bias=w1_scale_bias,
-        w2_scale_bias=w2_scale_bias,
-        topk_scales=results.get("topk_scales"),
-        with_quant=with_quant)
-    final_hidden_states = token_dispatcher.token_combine(expert_output)
-    return final_hidden_states
+                               get_rm_router_logits_state, is_310p,
+                               vllm_version_is)


 class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
@@ -115,6 +57,7 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
        self.global_batch_size = vllm_config.scheduler_config.max_num_seqs
        self.max_model_len = vllm_config.model_config.max_model_len
        get_ascend_config()
+        self.dynamic_eplb = get_ascend_config().dynamic_eplb

        try:
            device_group = get_mc2_group().device_group
@@ -182,17 +125,19 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
        if enable_force_load_balance and not self.use_aclgraph:
            topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)

-        return unified_fused_experts_eager(hidden_states=x,
-                                           w1=layer.w13_weight,
-                                           w2=layer.w2_weight,
-                                           topk_weights=topk_weights,
-                                           topk_ids=topk_ids,
-                                           row_idx=row_idx,
-                                           expert_map=expert_map,
-                                           shared_experts=shared_experts,
-                                           mc2_mask=kwargs.get(
-                                               "mc2_mask", None),
-                                           with_quant=False)
+        moe_comm_method = get_forward_context().moe_comm_method
+        return moe_comm_method.fused_experts(
+            hidden_states=x,
+            w1=layer.w13_weight,
+            w2=layer.w2_weight,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            row_idx=row_idx,
+            global_num_experts=global_num_experts,
+            expert_map=expert_map,
+            shared_experts=shared_experts,
+            need_trans=True,
+            dynamic_eplb=self.dynamic_eplb)


 class AscendFusedMoE(FusedMoE):
@@ -290,42 +235,67 @@ class AscendFusedMoE(FusedMoE):
            self.moe_parallel_config.ep_size, is_deepseek_v3_r1)

        ascend_config = get_ascend_config()
-        expert_map_path = ascend_config.expert_map_path
-        if expert_map_path and os.path.exists(expert_map_path):
-            # moe expert load balance
-            expert_load_balancer = ExpertLoadBalancer(expert_map_path,
-                                                      self.global_num_experts)
-            self.local_num_experts, self.expert_map = \
-                                expert_load_balancer.get_rank_placement_map(
-                                                self.moe_instance_id,
-                                                get_ep_group().rank_in_group)
-            self.log2phy = expert_load_balancer.get_rank_log2phy_map(
-                self.moe_instance_id,
-                get_ep_group().rank_in_group)
-            self.global_redundant_expert_num = \
-                        expert_load_balancer.get_global_redundant_expert_num()
+        self.dynamic_eplb = ascend_config.dynamic_eplb
+        self.expert_map_path = ascend_config.expert_map_path
+        self.global_redundant_expert_num = ascend_config.init_redundancy_expert
+        self.global_num_experts = num_experts + self.global_redundant_expert_num
+        # static eplb: initialize from the expert map file at expert_map_path
+        if self.expert_map_path and os.path.exists(
+                self.expert_map_path) and os.access(self.expert_map_path,
+                                                    os.R_OK):
+            self.expert_load_balancer = ExpertLoadBalancer(
+                self.expert_map_path, self.global_num_experts)
+            self.local_num_experts, self.expert_map = (
+                self.expert_load_balancer.get_rank_placement_map(
+                    self.moe_instance_id, self.ep_rank))
+            self.log2phy = self.expert_load_balancer.get_rank_log2phy_map(
+                self.moe_instance_id, self.ep_rank).npu()
+            self.global_redundant_expert_num = (
+                self.expert_load_balancer.get_global_redundant_expert_num())
        else:
-            # Create a tensor of size num_experts filled with -1
+            # no readable expert map file: derive the map from ep_size/ep_rank
            self.local_num_experts, self.expert_map = determine_expert_map(
-                self.ep_size,
-                get_ep_group().rank_in_group, self.global_num_experts)
+                self.ep_size, self.ep_rank, self.global_num_experts)
+            # dynamic eplb: initialize default maps when no expert_map_path is used
+            if self.dynamic_eplb:
+                self.global_redundant_expert_num = ascend_config.init_redundancy_expert
+                self.local_num_experts, self.expert_map = determine_default_expert_map(
+                    self.global_num_experts, self.ep_size, self.ep_rank,
+                    self.global_redundant_expert_num)
+                self.log2phy = determine_default_log2phy_map(
+                    self.global_num_experts, self.ep_size, self.ep_rank,
+                    self.global_redundant_expert_num)
+        local_num_experts = (torch.sum(self.expert_map != -1)
+                             if self.expert_map is not None else num_experts)
+        if self.dynamic_eplb:
+            self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64)

        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp

        if self.scoring_func != "softmax" and not self.use_grouped_topk:
            raise ValueError("Only softmax scoring function is supported for "
                             "non-grouped topk.")
-        moe = FusedMoEConfig.make(
-            num_experts=self.global_num_experts,
-            experts_per_token=top_k,
-            hidden_dim=hidden_size,
-            num_local_experts=self.local_num_experts,
-            moe_parallel_config=self.moe_parallel_config,
-            # TODO (bnell): this needs to be fixed for quantized types.
-            in_dtype=params_dtype,
-            quant_config=quant_config)
-
+        if vllm_version_is("0.10.2"):
+            moe = FusedMoEConfig.make(
+                num_experts=self.global_num_experts,
+                experts_per_token=top_k,
+                hidden_dim=hidden_size,
+                num_local_experts=self.local_num_experts,
+                moe_parallel_config=self.moe_parallel_config,
+                # TODO (bnell): this needs to be fixed for quantized types.
+                in_dtype=params_dtype,
+                quant_config=quant_config)
+        else:
+            moe = FusedMoEConfig(
+                num_experts=self.global_num_experts,
+                experts_per_token=top_k,
+                hidden_dim=hidden_size,
+                num_local_experts=self.local_num_experts,
+                moe_parallel_config=self.moe_parallel_config,
+                in_dtype=params_dtype,
+            )
        self.moe_config = moe
+        # TODO: self.moe_config.tp_size is not correct here; fix this soon

        if quant_config is None:
            self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
@@ -337,6 +307,11 @@ class AscendFusedMoE(FusedMoE):
        local_num_experts = torch.sum(self.expert_map != -1) \
            if self.expert_map is not None else num_experts

+        self.moe_load = None
+
+        if self.dynamic_eplb:
+            self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64)
+
        moe_quant_params = {
            "num_experts": local_num_experts,
            "hidden_size": hidden_size,
@@ -354,34 +329,27 @@ class AscendFusedMoE(FusedMoE):
        # NOTE: self.tp_group is not expert_tp_group
        self.tp_group = get_tp_group().device_group
        self.quant_method.create_weights(layer=self, **moe_quant_params)
-        self.token_dispatcher = None

-        ep_size = (get_ep_group().world_size if
-                   vllm_config.parallel_config.enable_expert_parallel else 1)
-        from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
-            setup_token_dispatchers
-        setup_token_dispatchers(
-            ep_size,
-            top_k=self.top_k,
-            num_experts=self.global_num_experts,
-            num_global_redundant_experts=self.global_redundant_expert_num,
-            num_local_experts=self.local_num_experts)
+        self.moe_config.tp_group = get_tp_group()
+        self.moe_config.dp_group = get_dp_group()
+        self.moe_config.ep_group = get_ep_group()
+        self.moe_config.mc2_group = get_mc2_group()
+        self.moe_config.num_global_redundant_experts = self.global_redundant_expert_num

-    def naive_multicast(self, x: torch.Tensor,
-                        cu_tokens_across_dp_cpu: torch.Tensor):
-        assert (len(x.shape) == 2)
-        buffer = torch.empty((cu_tokens_across_dp_cpu[-1], x.size(1)),
-                             device=x.device,
-                             dtype=x.dtype)
-        start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[
-            self.dp_rank - 1]
-        end = cu_tokens_across_dp_cpu[self.dp_rank]
-        buffer[start:end, :].copy_(x)
-        for idx in range(self.dp_size):
-            start = 0 if idx == 0 else cu_tokens_across_dp_cpu[idx - 1]
-            end = cu_tokens_across_dp_cpu[idx]
-            get_dp_group().broadcast(buffer[start:end, :], idx)
-        return buffer
+        setup_moe_comm_method(self.moe_config)
+
+    def update_expert_map(self, new_expert_map):
+        self.expert_map = new_expert_map
+
+    def get_map(self):
+        return self.expert_map
+
+    def get_log2phy_map(self):
+        return self.logical_to_physical_map
+
+    def clear_moe_load(self):
+        if self.moe_load is not None:
+            self.moe_load.zero_()

    def forward(self,
                hidden_states: torch.Tensor,
@@ -391,8 +359,7 @@ class AscendFusedMoE(FusedMoE):
                top_k: Optional[int] = None,
                shared_experts: Optional[Any] = None,
                gate=None,
-                replace_allreduce: bool = False,
-                _metadata_for_padding: Optional[MetadataForPadding] = None):
+                replace_allreduce: bool = False):

        assert self.quant_method is not None

@@ -401,10 +368,7 @@ class AscendFusedMoE(FusedMoE):
        else:
            real_top_k = self.top_k

-        num_tokens, hidden_size = hidden_states.shape
-
        forward_context = get_forward_context()
-        fused_moe_state = forward_context.fused_moe_state
        mc2_mask = forward_context.mc2_mask
        # For w8a8 dynamic we can do npu_dynamic_quant and gate in parallel.
        quantized_x_for_share, dynamic_scale_for_share = None, None
@@ -413,74 +377,16 @@ class AscendFusedMoE(FusedMoE):
            # When all_reduce_merge is in progress, shared_experts does not do all_reduce in mlp, but waits until shared_experts+router_experts are completed before doing all_reduce
            shared_hidden_states = shared_experts(hidden_states)

-        mc2_mask = forward_context.mc2_mask
-
-        enable_sp = _metadata_for_padding is not None and _metadata_for_padding.not_dummy_and_is_prefill
-        tp_size = get_tensor_model_parallel_world_size()
-        if enable_sp:
-            tp_rank = get_tensor_model_parallel_rank()
-            mc2_mask_sp = _metadata_for_padding.mc2_mask if _metadata_for_padding is not None else forward_context.mc2_mask
-            chunk_mc2_mask = torch.tensor_split(mc2_mask_sp, tp_size, dim=0)
-            mc2_mask = chunk_mc2_mask[tp_rank]
+        if forward_context.sp_enabled:
            replace_allreduce = True

-        if (fused_moe_state not in [
-                FusedMoEState.AllGather, FusedMoEState.AllGatherEP,
-                FusedMoEState.NaiveMulticast
-        ] and not replace_allreduce):
-            if fused_moe_state in {FusedMoEState.MC2}:
-                padding_size = forward_context.padded_num_tokens
-            else:
-                # TODO: Determine if we can remove the padding
-                padding_size = tp_size
-            if num_tokens < padding_size and not self.enable_shared_expert_dp:
-                hidden_states = nn.functional.pad(
-                    hidden_states, (0, 0, 0, padding_size - num_tokens))
-                router_logits = nn.functional.pad(
-                    router_logits, (0, 0, 0, padding_size - num_tokens))
-            if tp_size > 1:
-                tp_rank = get_tensor_model_parallel_rank()
-                if not self.enable_shared_expert_dp:
-                    chunk_hidden_states = torch.tensor_split(hidden_states,
-                                                             tp_size,
-                                                             dim=0)
-                    chunk_router_logits = torch.tensor_split(router_logits,
-                                                             tp_size,
-                                                             dim=0)
-                    hidden_states = chunk_hidden_states[tp_rank]
-                    router_logits = chunk_router_logits[tp_rank]
-
-                chunk_mc2_mask = torch.tensor_split(mc2_mask, tp_size, dim=0)
-                mc2_mask = chunk_mc2_mask[tp_rank]
-
-        if self.dp_size > 1:
-            if fused_moe_state == FusedMoEState.AllGather:
-                # NOTE: When in torchair graph, it has been padded in model_runner_v1
-                max_tokens_across_dp = forward_context.max_tokens_across_dp
-                if num_tokens < max_tokens_across_dp:
-                    hidden_states = nn.functional.pad(
-                        hidden_states,
-                        (0, 0, 0, max_tokens_across_dp - num_tokens))
-                    if not self.rm_router_logits:
-                        router_logits = nn.functional.pad(
-                            router_logits,
-                            (0, 0, 0, max_tokens_across_dp - num_tokens))
-                hidden_states = get_dp_group().all_gather(hidden_states, 0)
-                if self.rm_router_logits:
-                    router_logits, _ = gate(hidden_states)
-                else:
-                    router_logits = get_dp_group().all_gather(router_logits, 0)
-
-            elif fused_moe_state == FusedMoEState.NaiveMulticast:
-                cu_tokens_across_dp_cpu = get_forward_context(
-                ).dp_metadata.cu_tokens_across_dp_cpu
-                hidden_states = self.naive_multicast(hidden_states,
-                                                     cu_tokens_across_dp_cpu)
-                if self.rm_router_logits:
-                    router_logits, _ = gate(hidden_states)
-                else:
-                    router_logits = self.naive_multicast(
-                        router_logits, cu_tokens_across_dp_cpu)
+        hidden_states, router_logits = forward_context.moe_comm_method.prepare(
+            hidden_states=hidden_states,
+            router_logits=router_logits,
+            enable_shared_expert_dp=self.enable_shared_expert_dp,
+            rm_router_logits=self.rm_router_logits,
+            replace_allreduce=replace_allreduce,
+            gate=gate)

        # Matrix multiply.
        e_hidden_states = self.quant_method.apply(
@@ -503,53 +409,27 @@ class AscendFusedMoE(FusedMoE):
            global_redundant_expert_num=self.global_redundant_expert_num,
            shared_experts=None,
            mc2_mask=mc2_mask,
-            token_dispatcher=self.token_dispatcher,
            quantized_x_for_share=quantized_x_for_share,
            dynamic_scale_for_share=dynamic_scale_for_share,
        )

+        group_list_type = None
+
        if shared_experts:
-            if isinstance(e_hidden_states, tuple):
+            if isinstance(e_hidden_states,
+                          tuple) and len(e_hidden_states) == 2:
                e_hidden_states, shared_hidden_states = e_hidden_states

-        if (fused_moe_state not in [
-                FusedMoEState.AllGather, FusedMoEState.AllGatherEP,
-                FusedMoEState.NaiveMulticast
-        ] and not replace_allreduce and not self.enable_shared_expert_dp):
-            if tp_size > 1:
-                dist.all_gather(list(chunk_hidden_states), e_hidden_states,
-                                self.tp_group)
-                final_hidden_states = torch.cat(chunk_hidden_states, dim=0)
-                dispose_tensor(e_hidden_states)
-            else:
-                final_hidden_states = e_hidden_states
-            if num_tokens < padding_size:
-                final_hidden_states = final_hidden_states[:num_tokens]
-        elif self.dp_size > 1 and not self.enable_shared_expert_dp:
-            if fused_moe_state == FusedMoEState.NaiveMulticast:
-                start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[
-                    self.dp_rank - 1]
-                end = cu_tokens_across_dp_cpu[self.dp_rank]
-                final_hidden_states = get_dp_group().all_reduce(
-                    e_hidden_states)
-                final_hidden_states = final_hidden_states[start:end, :]
-                dispose_tensor(e_hidden_states)
-            elif fused_moe_state == FusedMoEState.AllGather:
-                final_hidden_states = data_parallel_reduce_scatter(
-                    e_hidden_states, dim=0)
-                final_hidden_states = final_hidden_states[:num_tokens]
-                dispose_tensor(e_hidden_states)
-            else:
-                final_hidden_states = e_hidden_states
-        else:
-            final_hidden_states = e_hidden_states
+        if isinstance(e_hidden_states, tuple) and len(e_hidden_states) == 3:
+            e_hidden_states, group_list_type, expert_tokens = e_hidden_states

-        if tp_size > 1 and not self.all_reduce_merge and fused_moe_state in [
-                FusedMoEState.AllGather, FusedMoEState.AllGatherEP,
-                FusedMoEState.NaiveMulticast
-        ]:
-            final_hidden_states = tensor_model_parallel_all_reduce(
-                final_hidden_states)
+        if self.dynamic_eplb and group_list_type is not None:
+            self.moe_load += expert_tokens if group_list_type else \
+                torch.cat([expert_tokens[:1], expert_tokens[1:] - expert_tokens[:-1]])
+
+        final_hidden_states = forward_context.moe_comm_method.finalize(
+            hidden_states=e_hidden_states,
+            reduce_results=(not self.all_reduce_merge))

        if shared_experts:
            return final_hidden_states, shared_hidden_states
--- a/vllm_ascend/ops/layernorm.py
+++ b/vllm_ascend/ops/layernorm.py
@@ -15,50 +15,124 @@
 # This file is a part of the vllm-ascend project.
 #

-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple, Union, cast

 import torch
-from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.forward_context import get_forward_context
+from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm


-class AddRMSNormW8A8Quant(RMSNorm):
-    # Fuse AddRmsNorm and W8A8 quantization ops together
+def _addrmsnorm_forward_oot(
+    self,
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    layer: Optional[torch.nn.Module] = None,
+) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+    import torch_npu
+
+    from vllm_ascend.utils import is_310p
+
+    if layer is not None and not is_310p():
+        x, _, residual = torch_npu.npu_add_rms_norm_quant(
+            x,
+            residual,
+            self.weight,
+            layer.aclnn_input_scale,
+            layer.aclnn_input_offset,
+            epsilon=self.variance_epsilon)
+    else:
+        if is_310p():
+            orig_dtype = residual.dtype
+            x = x + residual.to(x.dtype)
+            residual = x.to(orig_dtype)
+            x, _ = torch_npu.npu_rms_norm(x, self.weight,
+                                          self.variance_epsilon)
+        else:
+            x, _, residual = torch_npu.npu_add_rms_norm(
+                x, residual, self.weight, self.variance_epsilon)
+    torch.ops.vllm.maybe_wait_prefetch_done(x)
+    return x, residual
+
+
+class AscendRMSNorm(RMSNorm):
+
+    def forward_oot(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        import torch_npu
+
+        if residual is not None:
+            residual = torch.ops.vllm.maybe_chunk_residual(x, residual)
+            assert x.size(0) == residual.size(0)
+            x, residual = _addrmsnorm_forward_oot(
+                self, x, residual, self.next_need_quant_fusion_linear)
+            return x, residual
+        x, residual = torch_npu.npu_rms_norm(x, self.weight,
+                                             self.variance_epsilon)
+        return x
+
+    @property
+    def next_need_quant_fusion_linear(self):
+        try:
+            forward_context = get_forward_context()
+            if not forward_context.addrmsnorm_quant_fusion_enabled or \
+                forward_context.layer_idx == forward_context.num_hidden_layers:
+                return None
+        except AssertionError:
+            return None
+
+        next_linear = None
+        model_instance = forward_context.model_instance
+        layer_idx = forward_context.layer_idx
+        fusion_linear = forward_context.fusion_linear
+        next_linear = None
+        if fusion_linear == "qkv_dense":
+            next_linear = model_instance.model.layers[
+                layer_idx].self_attn.qkv_proj
+            forward_context.fusion_linear = "gate_up_dense"
+        elif fusion_linear == "gate_up_dense":
+            next_linear = model_instance.model.layers[
+                layer_idx].mlp.gate_up_proj
+            forward_context.fusion_linear = "qkv_dense"
+            # if prefetch_mlp_weight is enabled, the layer_idx increment below
+            # does not need to be repeated
+            if not forward_context.prefetch_mlp_enabled:
+                forward_context.layer_idx += 1
+        from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
+        if next_linear is not None and \
+            not isinstance(next_linear.quant_method.quant_method, AscendW8A8LinearMethod):
+            next_linear = None
+        return next_linear
+
+
+class AscendQuantRMSNorm(AscendRMSNorm):

    def __init__(
        self,
        hidden_size: int,
-        layer: torch.nn.Module,
        eps: float = 1e-6,
        var_hidden_size: Optional[int] = None,
        has_weight: bool = True,
        dtype: Optional[torch.dtype] = None,
    ) -> None:
        super().__init__(hidden_size, eps, var_hidden_size, has_weight, dtype)
-        self.layer = layer
+        self.bias = torch.nn.Parameter(torch.zeros(hidden_size),
+                                       requires_grad=False)

-    def forward(
+    def forward_oot(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        import torch_npu
-
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        if residual is not None:
-            x, _, residual = torch_npu.npu_add_rms_norm_quant(
-                x,
-                residual,
-                self.weight,
-                self.layer.aclnn_input_scale,
-                self.layer.aclnn_input_offset,
-                epsilon=self.variance_epsilon)
-            return x, residual
-
-        x, residual = torch_npu.npu_rms_norm(x, self.weight,
-                                             self.variance_epsilon)
-        return x
+            x, residual = super().forward_oot(x, residual)
+            return x.add_(self.bias), residual
+        return cast(torch.Tensor, super().forward_oot(x)).add_(self.bias)


-class AscendRMSNorm(RMSNorm):
+class AscendGemmaRMSNorm(GemmaRMSNorm):

    def forward_oot(
        self,
@@ -73,13 +147,13 @@ class AscendRMSNorm(RMSNorm):
                orig_dtype = residual.dtype
                x = x + residual.to(x.dtype)
                residual = x.to(orig_dtype)
-                x, _ = torch_npu.npu_rms_norm(x, self.weight,
+                x, _ = torch_npu.npu_rms_norm(x, 1.0 + self.weight,
                                              self.variance_epsilon)
            else:
                x, _, residual = torch_npu.npu_add_rms_norm(
-                    x, residual, self.weight, self.variance_epsilon)
+                    x, residual, 1.0 + self.weight, self.variance_epsilon)
            return x, residual

-        x, residual = torch_npu.npu_rms_norm(x, self.weight,
-                                             self.variance_epsilon)
+        x, _ = torch_npu.npu_rms_norm(x, 1.0 + self.weight,
+                                      self.variance_epsilon)
        return x
--- a/vllm_ascend/ops/linear.py
+++ b/vllm_ascend/ops/linear.py
@@ -1,45 +1,159 @@
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
-Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-This file is a part of the vllm-ascend project.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
+To customize the communication groups or the forward pass of the linear classes
+in this file, add new linear operations in linear_op.py.
+The classes in this file should not be modified, including AscendQKVParallelLinear,
+AscendMergedColumnParallelLinear, AscendRowParallelLinear and
+AscendColumnParallelLinear.
 """

 from typing import Optional, Union

 import torch
+import torch.nn as nn
 from torch.nn.parameter import Parameter
-from vllm.distributed import (divide, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              split_tensor_along_last_dim,
-                              tensor_model_parallel_all_gather,
-                              tensor_model_parallel_all_reduce)
-from vllm.model_executor.layers.linear import (WEIGHT_LOADER_V2_SUPPORTED,
-                                               ColumnParallelLinear,
-                                               LinearBase,
-                                               MergedColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.distributed import divide
+from vllm.model_executor.layers.linear import (  # noqa
+    WEIGHT_LOADER_V2_SUPPORTED, ColumnParallelLinear, LinearBase,
+    MergedColumnParallelLinear, QKVParallelLinear, QuantizeMethodBase,
+    RowParallelLinear, UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization.base_config import \
    QuantizationConfig
 from vllm.model_executor.utils import set_weight_attrs

-from vllm_ascend.distributed.parallel_state import (
-    get_mlp_tensor_model_parallel_rank,
-    get_mlp_tensor_model_parallel_world_size, get_mlp_tp_group)
+from vllm_ascend.ops.linear_op import (get_column_parallel_op,
+                                       get_row_parallel_op)


-class AscendMlpColumnParallelLinear(ColumnParallelLinear):
-    """Linear layer with column parallelism.
+# TODO(realliujiaxu): Remove this class after linear of vllm supports custom comm group
+class AscendLinearBase(LinearBase):
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        *,
+        return_bias: bool = True,
+        disable_tp: bool = False,
+    ):
+        nn.Module.__init__(self)
+
+        # Keep input parameters
+        self.input_size = input_size
+        self.output_size = output_size
+        self.skip_bias_add = skip_bias_add
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.params_dtype = params_dtype
+        self.quant_config = quant_config
+        self.prefix = prefix
+        if quant_config is None:
+            self.quant_method: Optional[
+                QuantizeMethodBase] = UnquantizedLinearMethod()
+        else:
+            self.quant_method = quant_config.get_quant_method(self,
+                                                              prefix=prefix)
+        self.return_bias = return_bias
+        self.disable_tp = disable_tp
+
+
+class AscendQKVParallelLinear(QKVParallelLinear):
+    """Linear layers for the attention's QKV transformation.
+
+    Linear layers for the linear transformation of the query, key, and value
+    vectors in the attention layer. The weight matrix is concatenated along
+    the output dimension. The layer is parallelized along the head dimension.
+    When the number of key/value heads is smaller than the number of query
+    heads (e.g., multi-query/grouped-query attention), the key/value head may
+    be replicated while the query heads are partitioned.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        head_size: int,
+        total_num_heads: int,
+        total_num_kv_heads: Optional[int] = None,
+        bias: bool = True,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        *,
+        return_bias: bool = True,
+        disable_tp: bool = False,
+    ):
+        self.custom_op, _, tp_size = get_column_parallel_op(
+            disable_tp, prefix, self)
+        # TODO(realliujiaxu): Replace the initialization code below with super().__init__ after linear of vllm supports custom comm group
+        self.hidden_size = hidden_size
+        self.head_size = head_size
+        self.total_num_heads = total_num_heads
+        if total_num_kv_heads is None:
+            total_num_kv_heads = total_num_heads
+        self.total_num_kv_heads = total_num_kv_heads
+        # Divide the weight matrix along the last dimension.
+        self.num_heads = divide(self.total_num_heads, tp_size)
+        if tp_size >= self.total_num_kv_heads:
+            self.num_kv_heads = 1
+            self.num_kv_head_replicas = divide(tp_size,
+                                               self.total_num_kv_heads)
+        else:
+            self.num_kv_heads = divide(self.total_num_kv_heads, tp_size)
+            self.num_kv_head_replicas = 1
+        input_size = self.hidden_size
+        output_size = (self.num_heads +
+                       2 * self.num_kv_heads) * tp_size * self.head_size
+        self.output_sizes = [
+            self.num_heads * self.head_size * tp_size,  # q_proj
+            self.num_kv_heads * self.head_size * tp_size,  # k_proj
+            self.num_kv_heads * self.head_size * tp_size,  # v_proj
+        ]
+        AscendColumnParallelLinear.__init__(self,
+                                            input_size=input_size,
+                                            output_size=output_size,
+                                            bias=bias,
+                                            gather_output=False,
+                                            skip_bias_add=skip_bias_add,
+                                            params_dtype=params_dtype,
+                                            quant_config=quant_config,
+                                            prefix=prefix,
+                                            return_bias=return_bias,
+                                            disable_tp=disable_tp)
+
+    def forward(
+        self,
+        input_,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+        if self.custom_op is not None:
+            return self.custom_op.apply(input_)
+
+        return super().forward(input_)
+
+
+class AscendMergedColumnParallelLinear(MergedColumnParallelLinear):
+    """Packed linear layers with column parallelism.
+
+    Similar to ColumnParallelLinear, but the weight matrix is concatenated
+    along the output dimension. When the weight matrix is loaded, the
+    different partitions are sharded separately.

    Use the MLP tensor parallelism group in the MLP module,
    and the original TP group in other modules.
@@ -48,73 +162,46 @@ class AscendMlpColumnParallelLinear(ColumnParallelLinear):
    def __init__(
        self,
        input_size: int,
-        output_size: int,
+        output_sizes: list[int],
        bias: bool = True,
        gather_output: bool = False,
        skip_bias_add: bool = False,
        params_dtype: Optional[torch.dtype] = None,
        quant_config: Optional[QuantizationConfig] = None,
-        output_sizes: Optional[list[int]] = None,
        prefix: str = "",
        *,
        return_bias: bool = True,
+        disable_tp: bool = False,
    ):
-        # Divide the weight matrix along the last dimension.
-        if prefix.find("gate_up_proj") != -1:
-            self.tp_size = get_mlp_tensor_model_parallel_world_size()
-            self.tp_rank = get_mlp_tensor_model_parallel_rank()
-            self.enable_mlp_optimze = True
-        else:
-            self.tp_size = get_tensor_model_parallel_world_size()
-            self.tp_rank = get_tensor_model_parallel_rank()
-            self.enable_mlp_optimze = False
-        self.input_size_per_partition = input_size
-        self.output_size_per_partition = divide(output_size, self.tp_size)
-        self.output_partition_sizes = [self.output_size_per_partition]
-        # If QKV or MergedColumn, use output size of each partition.
-        if hasattr(self, "output_sizes"):
-            self.output_partition_sizes = [
-                divide(output_size, self.tp_size)
-                for output_size in self.output_sizes
-            ]
-        LinearBase.__init__(self,
-                            input_size,
-                            output_size,
-                            skip_bias_add,
-                            params_dtype,
-                            quant_config,
-                            prefix,
-                            return_bias=return_bias)
+        self.custom_op, self.tp_rank, self.tp_size = get_column_parallel_op(
+            disable_tp, prefix, self)
+        # TODO(realliujiaxu): Replace the initialization code below with super().__init__ after linear of vllm supports custom comm group
+        self.output_sizes = output_sizes
+        assert all(output_size % self.tp_size == 0
+                   for output_size in output_sizes)
+        AscendColumnParallelLinear.__init__(self,
+                                            input_size=input_size,
+                                            output_size=sum(output_sizes),
+                                            bias=bias,
+                                            gather_output=gather_output,
+                                            skip_bias_add=skip_bias_add,
+                                            params_dtype=params_dtype,
+                                            quant_config=quant_config,
+                                            prefix=prefix,
+                                            return_bias=return_bias,
+                                            disable_tp=disable_tp)

-        self.gather_output = gather_output
+    def forward(
+        self,
+        input_,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+        if self.custom_op is not None:
+            return self.custom_op.apply(input_)

-        if output_sizes is None:
-            output_sizes = [output_size]
-
-        assert self.quant_method is not None
-        self.quant_method.create_weights(
-            layer=self,
-            input_size_per_partition=self.input_size_per_partition,
-            output_partition_sizes=self.output_partition_sizes,
-            input_size=self.input_size,
-            output_size=self.output_size,
-            params_dtype=self.params_dtype,
-            weight_loader=(
-                self.weight_loader_v2 if self.quant_method.__class__.__name__
-                in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader))
-        if bias:
-            self.bias = Parameter(
-                torch.empty(self.output_size_per_partition,
-                            dtype=params_dtype))
-            set_weight_attrs(self.bias, {
-                "output_dim": 0,
-                "weight_loader": self.weight_loader,
-            })
-        else:
-            self.register_parameter("bias", None)
+        return super().forward(input_)


-class AscendMlpRowParallelLinear(RowParallelLinear):
+class AscendRowParallelLinear(RowParallelLinear):
    """Linear layer with row parallelism.
    Use the MLP tensor parallelism group in the MLP module,
    and the original TP group in other modules.
@@ -133,28 +220,25 @@ class AscendMlpRowParallelLinear(RowParallelLinear):
        prefix: str = "",
        *,
        return_bias: bool = True,
+        disable_tp: bool = False,
    ):
-        if prefix.find("down_proj") != -1:
-            self.tp_size = get_mlp_tensor_model_parallel_world_size()
-            self.tp_rank = get_mlp_tensor_model_parallel_rank()
-            self.enable_mlp_optimze = True
-        else:
-            self.tp_size = get_tensor_model_parallel_world_size()
-            self.tp_rank = get_tensor_model_parallel_rank()
-            self.enable_mlp_optimze = False
+        self.custom_op, self.tp_rank, self.tp_size = get_row_parallel_op(
+            disable_tp, prefix, self)
+        # TODO(realliujiaxu): Replace the initialization code below with super().__init__ after linear of vllm supports custom comm group
        # Divide the weight matrix along the first dimension.
        self.input_size_per_partition = divide(input_size, self.tp_size)
        self.output_size_per_partition = output_size
        self.output_partition_sizes = [output_size]

-        LinearBase.__init__(self,
-                            input_size,
-                            output_size,
-                            skip_bias_add,
-                            params_dtype,
-                            quant_config,
-                            prefix,
-                            return_bias=return_bias)
+        AscendLinearBase.__init__(self,
+                                  input_size,
+                                  output_size,
+                                  skip_bias_add,
+                                  params_dtype,
+                                  quant_config,
+                                  prefix,
+                                  return_bias=return_bias,
+                                  disable_tp=disable_tp)

        self.input_is_parallel = input_is_parallel
        self.reduce_results = reduce_results
@@ -184,66 +268,22 @@ class AscendMlpRowParallelLinear(RowParallelLinear):
        else:
            self.register_parameter("bias", None)

+        if self.custom_op is not None:
+            self.custom_op.update_attrs()
+
    def forward(
        self,
        input_,
+        is_prefill: bool = True,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
-        if self.enable_mlp_optimze:
-            tp_rank = get_mlp_tensor_model_parallel_rank()
-            if self.input_is_parallel:
-                input_parallel = input_
-            else:
-                tp_rank = get_mlp_tensor_model_parallel_rank()
-                splitted_input = split_tensor_along_last_dim(
-                    input_, num_partitions=self.tp_size)
-                input_parallel = splitted_input[tp_rank].contiguous()
-            # Matrix multiply.
-            assert self.quant_method is not None
-            # Only fuse bias add into GEMM for rank 0 (this ensures that
-            # bias will not get added more than once in TP>1 case)
-            bias_ = None if (self.tp_rank > 0
-                             or self.skip_bias_add) else self.bias
-            output_parallel = self.quant_method.apply(self,
-                                                      input_parallel,
-                                                      bias=bias_)
-            output = get_mlp_tp_group().reduce_scatter(output_parallel, 0)
-            # output = output[:num_tokens,:]
-            # dispose_tensor(output_parallel)
-        else:
-            if self.input_is_parallel:
-                input_parallel = input_
-            else:
-                tp_rank = get_tensor_model_parallel_rank()
-                splitted_input = split_tensor_along_last_dim(
-                    input_, num_partitions=self.tp_size)
-                input_parallel = splitted_input[tp_rank].contiguous()
+        if self.custom_op is not None:
+            return self.custom_op.apply(input_)

-            # Matrix multiply.
-            assert self.quant_method is not None
-            # Only fuse bias add into GEMM for rank 0 (this ensures that
-            # bias will not get added more than once in TP>1 case)
-            bias_ = None if (self.tp_rank > 0
-                             or self.skip_bias_add) else self.bias
-            output_parallel = self.quant_method.apply(self,
-                                                      input_parallel,
-                                                      bias=bias_)
-            if self.reduce_results and self.tp_size > 1:
-                output = tensor_model_parallel_all_reduce(output_parallel)
-            else:
-                output = output_parallel
-        output_bias = self.bias if self.skip_bias_add else None
-
-        if not self.return_bias:
-            return output
-        return output, output_bias
+        return super().forward(input_)


-class AscendMlpMergedColumnParallelLinear(MergedColumnParallelLinear):
-    """Packed linear layers with column parallelism.
-
-    Similar to ColumnParallelLinear, but the weight matrix is concatenated
-    along the output dimension. When the weight matrix is loaded, the
-    different partitions are sharded separately.
+class AscendColumnParallelLinear(ColumnParallelLinear):
+    """Linear layer with column parallelism.

    Use the MLP tensor parallelism group in the MLP module,
    and the original TP group in other modules.
@@ -252,58 +292,76 @@ class AscendMlpMergedColumnParallelLinear(MergedColumnParallelLinear):
    def __init__(
        self,
        input_size: int,
-        output_sizes: list[int],
+        output_size: int,
        bias: bool = True,
        gather_output: bool = False,
        skip_bias_add: bool = False,
        params_dtype: Optional[torch.dtype] = None,
        quant_config: Optional[QuantizationConfig] = None,
+        output_sizes: Optional[list[int]] = None,
        prefix: str = "",
        *,
        return_bias: bool = True,
+        disable_tp: bool = False,
    ):
-        self.output_sizes = output_sizes
-        if prefix.find("gate_up_proj") != -1:
-            self.tp_size = get_mlp_tensor_model_parallel_world_size()
-            self.tp_rank = get_mlp_tensor_model_parallel_rank()
-            self.enable_mlp_optimze = True
+        self.custom_op, self.tp_rank, self.tp_size = get_column_parallel_op(
+            disable_tp, prefix, self)
+        # TODO(realliujiaxu): Replace the initialization code below with super().__init__ after linear of vllm supports custom comm group
+        self.input_size_per_partition = input_size
+        self.output_size_per_partition = divide(output_size, self.tp_size)
+        self.output_partition_sizes = [self.output_size_per_partition]
+        # If QKV or MergedColumn, use output size of each partition.
+        if hasattr(self, "output_sizes"):
+            self.output_partition_sizes = [
+                divide(output_size, self.tp_size)
+                for output_size in self.output_sizes
+            ]
+
+        AscendLinearBase.__init__(self,
+                                  input_size,
+                                  output_size,
+                                  skip_bias_add,
+                                  params_dtype,
+                                  quant_config,
+                                  prefix,
+                                  return_bias=return_bias,
+                                  disable_tp=disable_tp)
+
+        self.gather_output = gather_output
+
+        if output_sizes is None:
+            output_sizes = [output_size]
+
+        assert self.quant_method is not None
+        self.quant_method.create_weights(
+            layer=self,
+            input_size_per_partition=self.input_size_per_partition,
+            output_partition_sizes=self.output_partition_sizes,
+            input_size=self.input_size,
+            output_size=self.output_size,
+            params_dtype=self.params_dtype,
+            weight_loader=(
+                self.weight_loader_v2 if self.quant_method.__class__.__name__
+                in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader))
+        if bias:
+            self.bias = Parameter(
+                torch.empty(self.output_size_per_partition,
+                            dtype=params_dtype))
+            set_weight_attrs(self.bias, {
+                "output_dim": 0,
+                "weight_loader": self.weight_loader,
+            })
        else:
-            self.tp_size = get_tensor_model_parallel_world_size()
-            self.tp_rank = get_tensor_model_parallel_rank()
-            self.enable_mlp_optimze = False
-        assert all(output_size % self.tp_size == 0
-                   for output_size in output_sizes)
-        AscendMlpColumnParallelLinear.__init__(self,
-                                               input_size=input_size,
-                                               output_size=sum(output_sizes),
-                                               bias=bias,
-                                               gather_output=gather_output,
-                                               skip_bias_add=skip_bias_add,
-                                               params_dtype=params_dtype,
-                                               quant_config=quant_config,
-                                               prefix=prefix,
-                                               return_bias=return_bias)
+            self.register_parameter("bias", None)
+
+        if self.custom_op is not None:
+            self.custom_op.update_attrs()

    def forward(
        self,
        input_,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
-        bias = self.bias if not self.skip_bias_add else None
-        # self.global_batch_size = vllm_config.scheduler_config.max_num_seqs
-        # Matrix multiply.
-        assert self.quant_method is not None
-        if self.enable_mlp_optimze:
-            input2_ = get_mlp_tp_group().all_gather(input_, 0)
-            output = self.quant_method.apply(self, input2_, bias)
-        else:
-            output_parallel = self.quant_method.apply(self, input_, bias)
-            if self.gather_output:
-                # All-gather across the partitions.
-                output = tensor_model_parallel_all_gather(output_parallel)
-            else:
-                output = output_parallel
+        if self.custom_op is not None:
+            return self.custom_op.apply(input_)

-        output_bias = self.bias if self.skip_bias_add else None
-        if not self.return_bias:
-            return output
-        return output, output_bias
+        return super().forward(input_)
--- a/vllm_ascend/ops/linear_op.py
+++ b/vllm_ascend/ops/linear_op.py
@@ -0,0 +1,459 @@
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This file extends the functionality of linear operations by encapsulating custom
+communication groups and forward functions into classes (linear ops).
+
+Current class inheritance structure:
+CustomTensorParallelOp
+├── CustomColumnParallelOp
+│   ├── MLPColumnParallelOp
+│   ├── SequenceMergedColumnParallelOp
+│   └── SequenceQKVParallelOp
+└── CustomRowParallelOp
+    ├── MLPRowParallelOp
+    ├── OProjRowParallelOp
+    ├── MatmulAllreduceRowParallelOp
+    └── SequenceRowParallelOp
+
+How to add a new linear op? Taking a column parallel op as an example:
+1. Inherit from CustomColumnParallelOp and create a new class MyColumnParallelOp
+2. [Optional] The default communication group is the TP group. If a custom communication group is needed, override the comm_group property
+3. Override the apply_impl method according to requirements; apply() wraps it and replaces the original linear.forward
+4. Add selection logic for MyColumnParallelOp in the get_column_parallel_op function, typically based on the prefix and configuration
+A row parallel op follows a similar approach - inherit from CustomRowParallelOp and register the new class in get_row_parallel_op.
+"""
+
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.distributed as dist
+import torch_npu
+from torch.distributed import ProcessGroup
+from torch.nn.parameter import Parameter
+from vllm.distributed import split_tensor_along_last_dim
+from vllm.distributed.parallel_state import get_tp_group
+
+from vllm_ascend.distributed.parallel_state import (get_mlp_tp_group,
+                                                    get_otp_group)
+from vllm_ascend.utils import (dense_optim_enable, enable_sp,
+                               matmul_allreduce_enable, mlp_tp_enable,
+                               oproj_tp_enable)
+
+
+class CustomTensorParallelOp:
+    """Base class for custom tensor-parallel linear ops.
+
+    An op wraps a linear ``layer``, exposes the communication group (and the
+    rank/world size derived from it) and provides ``apply`` as a replacement
+    for the layer's forward. Subclasses implement ``apply_impl``.
+    """
+
+    def __init__(self, layer):
+        self.layer = layer
+        # The attributes below mirror the layer and are filled in by
+        # update_attrs() once the layer has finished initializing.
+        self.bias = None
+        self.skip_bias_add = None
+        self.return_bias = None
+        self.quant_method = None
+        # Initialized here as well so that reading it before update_attrs()
+        # yields None like the other mirrored attributes instead of raising.
+        self.prefix = None
+
+    # Custom communication group, while determining weight sharding
+    @property
+    def comm_group(self):
+        """Communication group used by this op; the TP group by default."""
+        return get_tp_group()
+
+    @property
+    def tp_rank(self):
+        """Rank of this process inside ``comm_group``."""
+        return self.comm_group.rank_in_group
+
+    @property
+    def tp_size(self):
+        """World size of ``comm_group``."""
+        return self.comm_group.world_size
+
+    # Update the attributes required by apply(), obtaining them from the layer.
+    # Call this after the layer completes its initialization, specifically at
+    # the end of layer.__init__().
+    def update_attrs(self):
+        """Copy the attributes needed by ``apply`` from the wrapped layer."""
+        # The layer may not define ``bias``; keep the current value then.
+        if hasattr(self.layer, "bias"):
+            self.bias = self.layer.bias
+        self.skip_bias_add = self.layer.skip_bias_add
+        self.return_bias = self.layer.return_bias
+        self.quant_method = self.layer.quant_method
+        self.prefix = self.layer.prefix
+
+    def apply_impl(self, input_):
+        """Compute ``(output, output_bias)``; must be provided by subclasses."""
+        raise NotImplementedError
+
+    # Replace layer.forward to customize the layer computation process.
+    def apply(self, input_):
+        """Run ``apply_impl`` and shape the result according to ``return_bias``.
+
+        Returns only ``output`` when ``return_bias`` is falsy, otherwise the
+        ``(output, output_bias)`` pair.
+        """
+        output, output_bias = self.apply_impl(input_)
+        if not self.return_bias:
+            return output
+        return output, output_bias
+
+
+class CustomColumnParallelOp(CustomTensorParallelOp):
+    """Base class for column-parallel custom ops; also mirrors gather_output."""
+
+    def __init__(self, layer):
+        super().__init__(layer)
+        # Filled in from the layer by update_attrs().
+        self.gather_output = None
+
+    def update_attrs(self):
+        """Refresh the base attributes, then copy gather_output from the layer."""
+        super().update_attrs()
+        self.gather_output = self.layer.gather_output
+
+
+class CustomRowParallelOp(CustomTensorParallelOp):
+    """Base class for row-parallel custom ops.
+
+    Mirrors the row-specific layer attributes and, when dense optimization is
+    enabled, issues the gate_up_proj prefetch op after computing the output.
+    """
+
+    def __init__(self, layer):
+        super().__init__(layer)
+        # Filled in from the layer by update_attrs().
+        self.reduce_results = None
+        self.input_is_parallel = None
+        self.input_size_per_partition = None
+
+    def update_attrs(self):
+        """Refresh the base attributes plus the row-parallel specific ones."""
+        super().update_attrs()
+        layer = self.layer
+        self.input_is_parallel = layer.input_is_parallel
+        self.reduce_results = layer.reduce_results
+        self.input_size_per_partition = layer.input_size_per_partition
+
+    def apply(self, input_):
+        """Run ``apply_impl``, optionally prefetch, and shape the result.
+
+        Returns ``(output, output_bias)`` when ``return_bias`` is truthy,
+        otherwise only ``output``.
+        """
+        output, output_bias = self.apply_impl(input_)
+        if dense_optim_enable():
+            torch.ops.vllm.maybe_prefetch_mlp_gate_up_proj(output, self.prefix)
+        if self.return_bias:
+            return output, output_bias
+        return output
+
+
+class MLPColumnParallelOp(CustomColumnParallelOp):
+    """Column-parallel op that communicates over the MLP TP group."""
+
+    @property
+    def comm_group(self):
+        """Use the MLP tensor-parallel group instead of the default TP group."""
+        return get_mlp_tp_group()
+
+    def apply_impl(
+        self,
+        input_: torch.Tensor,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+        """All-gather the input over ``comm_group`` and apply the quant method.
+
+        Returns ``(output, output_bias)``; the bias is either fused into the
+        matmul or handed back to the caller, depending on ``skip_bias_add``.
+        """
+        fused_bias = None if self.skip_bias_add else self.bias
+        # Matrix multiply.
+        assert self.quant_method is not None
+        gathered_input = self.comm_group.all_gather(input_, 0)
+        output = self.quant_method.apply(self.layer, gathered_input,
+                                         fused_bias)
+
+        returned_bias = self.bias if self.skip_bias_add else None
+        return output, returned_bias
+
+
+class SequenceMergedColumnParallelOp(CustomColumnParallelOp):
+    """Column-parallel op for merged projections on the sequence-parallel path."""
+
+    def apply_impl(
+        self, input_: torch.Tensor
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+        """Linear layer with column parallelism.
+
+        Implemented multiple optimization projects for dense models, such as FlashComm and
+        communication-computation fusion.
+
+        Returns ``(output, output_bias)``.
+        """
+        fused_bias = None if self.skip_bias_add else self.bias
+
+        # Matrix multiply.
+        assert self.quant_method is not None
+
+        input_ = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(input_, True)
+        output = self.quant_method.apply(self.layer, input_, fused_bias)
+
+        if self.gather_output:
+            # All-gather across the partitions.
+            output = self.comm_group.all_gather(output)
+
+        returned_bias = self.bias if self.skip_bias_add else None
+        return output, returned_bias
+
+
+class SequenceQKVParallelOp(CustomColumnParallelOp):
+    """Column-parallel op used for the remaining column layers when SP is on."""
+
+    def __init__(self, layer, prefix):
+        super().__init__(layer)
+        self.prefix = prefix
+
+    def apply_impl(
+        self, input_: torch.Tensor
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+        """Linear layer with column parallelism.
+
+        Implemented multiple optimization projects for dense models, such as FlashComm and
+        communication-computation fusion.
+
+        Returns ``(output, output_bias)``.
+        """
+        fused_bias = None if self.skip_bias_add else self.bias
+
+        # Matrix multiply.
+        assert self.quant_method is not None
+
+        # The third dot-separated component of the prefix is compared with
+        # '0'; the flag passed to the gather op is False only in that case.
+        layer_num = self.prefix.split('.')[2]
+        gather_flag = layer_num != '0'
+
+        input_ = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(
+            input_, gather_flag)
+        output = self.quant_method.apply(self.layer, input_, fused_bias)
+
+        if self.gather_output:
+            # All-gather across the partitions.
+            output = self.comm_group.all_gather(output)
+
+        returned_bias = self.bias if self.skip_bias_add else None
+        return output, returned_bias
+
+
+class MLPRowParallelOp(CustomRowParallelOp):
+    """Row-parallel op that communicates over the MLP TP group."""
+
+    @property
+    def comm_group(self):
+        """Use the MLP tensor-parallel group instead of the default TP group."""
+        return get_mlp_tp_group()
+
+    def apply_impl(
+        self, input_: torch.Tensor
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+        """Apply the quant method to this rank's input shard, then reduce-scatter.
+
+        Returns ``(output, output_bias)``.
+        """
+        if not self.input_is_parallel:
+            # The input still holds every partition: keep only this rank's.
+            shards = split_tensor_along_last_dim(input_,
+                                                 num_partitions=self.tp_size)
+            input_parallel = shards[self.tp_rank].contiguous()
+        else:
+            input_parallel = input_
+
+        assert self.quant_method is not None
+        # The bias is only fused on rank 0, and only when it is not returned
+        # separately to the caller.
+        omit_fused_bias = self.tp_rank > 0 or self.skip_bias_add
+        bias_ = None if omit_fused_bias else self.layer.bias
+        partial_output = self.quant_method.apply(self.layer,
+                                                 input_parallel,
+                                                 bias=bias_)
+        output = self.comm_group.reduce_scatter(partial_output, 0)
+
+        returned_bias = self.bias if self.skip_bias_add else None
+        return output, returned_bias
+
+
+class OProjRowParallelOp(CustomRowParallelOp):
+    """Row-parallel op that communicates over the group from get_otp_group()."""
+
+    def __init__(self, layer):
+        super().__init__(layer)
+
+    @property
+    def comm_group(self):
+        """Use the group returned by get_otp_group() instead of the TP group."""
+        return get_otp_group()
+
+    def apply_impl(
+        self,
+        input_: torch.Tensor,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+        """Exchange input chunks via all-to-all, matmul, then reduce-scatter.
+
+        Returns ``(output, output_bias)``; ``output`` is viewed as
+        ``(input_.shape[0], layer.output_size)``.
+        """
+
+        if self.input_is_parallel:
+            input_parallel = input_
+        else:
+            # The input still holds every partition: keep only this rank's.
+            splitted_input = split_tensor_along_last_dim(
+                input_, num_partitions=self.tp_size)
+            input_parallel = splitted_input[self.tp_rank].contiguous()
+
+        # Prepare tensors for all-to-all communication
+        local_batch_size = input_parallel.size(0)
+        chunk_size = self.input_size_per_partition
+        total_batch_size = local_batch_size * self.tp_size
+
+        # Reshape tensor for efficient cross-device transfer:
+        # [batch, dim] -> [tp_size, batch, chunk] -> flattened
+        send_buf = (input_parallel.reshape(-1,
+                                           self.tp_size, chunk_size).transpose(
+                                               0, 1).contiguous().view(-1))
+
+        # Create receive buffer
+        recv_buf = torch.empty(total_batch_size * chunk_size,
+                               dtype=input_parallel.dtype,
+                               device=input_parallel.device)
+
+        # Perform all-to-all communication
+        # (no split sizes are passed to all_to_all_single)
+        dist.all_to_all_single(recv_buf,
+                               send_buf,
+                               group=self.comm_group.device_group)
+        input_parallel = recv_buf.view(total_batch_size, chunk_size)
+
+        # Only fuse bias add for rank 0 to avoid duplicate bias addition in TP>1
+        bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
+        assert self.quant_method is not None
+        output_parallel = self.quant_method.apply(self.layer,
+                                                  input_parallel,
+                                                  bias=bias_)
+
+        # otp-specific: Combine partial results across devices
+        output = self.comm_group.reduce_scatter(output_parallel, dim=0)
+        output = output.view(input_.shape[0], self.layer.output_size)
+
+        # Handle bias return based on configuration
+        output_bias = self.bias if self.skip_bias_add else None
+        return output, output_bias
+
+    def update_attrs(self):
+        """Refresh the mirrored layer attributes used by apply_impl."""
+        super().update_attrs()
+        # NOTE(review): CustomRowParallelOp.update_attrs() already assigns
+        # both attributes below, so these re-assignments look redundant.
+        self.input_is_parallel = self.layer.input_is_parallel
+        self.input_size_per_partition = self.layer.input_size_per_partition
+
+
+class MatmulAllreduceRowParallelOp(CustomRowParallelOp):
+    """Row-parallel op that uses ``torch_npu.npu_mm_all_reduce_base``.
+
+    The fused kernel is used when results must be reduced and ``tp_size > 1``;
+    otherwise the layer's quant method is applied without any reduction.
+    """
+    # HCCL communicator name, cached on the class by get_hcomm_info().
+    _HCOMM_INFO = None
+
+    def __init__(self, layer):
+        super().__init__(layer)
+        self.hcomm_info = self.get_hcomm_info(self.comm_group.device_group)
+        # Transposed layer weight; filled in by update_attrs().
+        self.weight_t = None
+
+    def apply_impl(
+        self, input_: torch.Tensor
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+        """Calculate the output tensor of forward by considering
+        fusing communication and computation.
+
+        Returns ``(output, output_bias)``.
+        """
+        if self.input_is_parallel:
+            input_parallel = input_
+        else:
+            # The input still holds every partition: keep only this rank's.
+            splitted_input = split_tensor_along_last_dim(
+                input_, num_partitions=self.tp_size)
+            input_parallel = splitted_input[self.tp_rank].contiguous()
+
+        # The bias is only fused on rank 0, and only when it is not returned
+        # separately to the caller.
+        bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
+        if self.reduce_results and self.tp_size > 1:
+            output = torch_npu.npu_mm_all_reduce_base(input_parallel,
+                                                      self.weight_t,
+                                                      self.hcomm_info,
+                                                      bias=bias_)
+        else:
+            assert self.quant_method is not None
+            output = self.quant_method.apply(self.layer,
+                                             input_parallel,
+                                             bias=bias_)
+
+        output_bias = self.bias if self.skip_bias_add else None
+        return output, output_bias
+
+    @classmethod
+    def get_hcomm_info(cls, group: ProcessGroup) -> str:
+        """Get the HCCL communication information for the given group."""
+        # NOTE(review): the cache is class-wide and not keyed by ``group``, so
+        # the name resolved for the first group is returned for any later one.
+        if cls._HCOMM_INFO is not None:
+            return cls._HCOMM_INFO
+
+        rank = torch.distributed.get_rank(group)
+        if torch.__version__ > "2.0":
+            global_rank = torch.distributed.get_global_rank(group, rank)
+            cls._HCOMM_INFO = group._get_backend(
+                torch.device("npu")).get_hccl_comm_name(global_rank)
+        else:
+            cls._HCOMM_INFO = group.get_hccl_comm_name(rank)
+        return cls._HCOMM_INFO
+
+    def update_attrs(self):
+        """Refresh the mirrored attributes and the transposed layer weight."""
+        super().update_attrs()
+        self.weight_t = self.layer.weight.t()
+
+
+class SequenceRowParallelOp(CustomRowParallelOp):
+    """Row-parallel op that reduces its output via ``maybe_pad_and_reduce``."""
+
+    def __init__(self, layer, prefix):
+        super().__init__(layer)
+        self.prefix = prefix
+
+    def apply_impl(
+        self, input_: torch.Tensor
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+        """Linear layer with row parallelism.
+
+        Implemented multiple optimization projects for dense models, such as FlashComm and
+        communication-computation fusion.
+
+        Returns ``(output, output_bias)``.
+        """
+
+        if self.input_is_parallel:
+            input_parallel = input_
+        else:
+            # The input still holds every partition: keep only this rank's.
+            splitted_input = split_tensor_along_last_dim(
+                input_, num_partitions=self.tp_size)
+            input_parallel = splitted_input[self.tp_rank].contiguous()
+
+        assert self.quant_method is not None
+        # The bias is only fused on rank 0, and only when it is not returned
+        # separately to the caller.
+        bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
+
+        # The matmul is the same in both cases; only the reduction differs.
+        output = self.quant_method.apply(self.layer,
+                                         input_parallel,
+                                         bias=bias_)
+        if self.tp_size != 1 and self.reduce_results:
+            output = torch.ops.vllm.maybe_pad_and_reduce(output)
+
+        output_bias = self.bias if self.skip_bias_add else None
+        return output, output_bias
+
+    def update_attrs(self):
+        """Refresh the mirrored layer attributes used by apply_impl."""
+        super().update_attrs()
+        self.input_is_parallel = self.layer.input_is_parallel
+        self.reduce_results = self.layer.reduce_results
+
+
+def get_column_parallel_op(
+    disable_tp, prefix, layer
+) -> Tuple[Optional[Union[MLPColumnParallelOp, SequenceMergedColumnParallelOp,
+                          SequenceQKVParallelOp]], int, int]:
+    """Select the custom column-parallel op for ``layer``, if any.
+
+    Returns ``(op, tp_rank, tp_size)``. With ``disable_tp`` the result is
+    ``(None, 0, 1)``. Otherwise the op is chosen from ``prefix`` and the
+    enabled features; when nothing matches, ``op`` is None and the rank and
+    size come from the default TP group.
+    """
+    if disable_tp:
+        return None, 0, 1
+
+    selected: Optional[Union[
+        MLPColumnParallelOp,
+        SequenceMergedColumnParallelOp,
+        SequenceQKVParallelOp,
+    ]]
+    if "gate_up_proj" in prefix and mlp_tp_enable():
+        selected = MLPColumnParallelOp(layer)
+    elif "gate_up_proj" in prefix and enable_sp():
+        selected = SequenceMergedColumnParallelOp(layer)
+    elif enable_sp():
+        selected = SequenceQKVParallelOp(layer, prefix)
+    else:
+        selected = None
+
+    if selected is None:
+        # No custom op applies: use the default TP group's rank and size.
+        tp_group = get_tp_group()
+        return None, tp_group.rank_in_group, tp_group.world_size
+
+    return selected, selected.tp_rank, selected.tp_size
+
+
+def get_row_parallel_op(
+    disable_tp, prefix, layer
+) -> Tuple[Optional[Union[MLPRowParallelOp, OProjRowParallelOp,
+                          MatmulAllreduceRowParallelOp,
+                          SequenceRowParallelOp]], int, int]:
+    """Select the custom row-parallel op for ``layer``, if any.
+
+    Returns ``(op, tp_rank, tp_size)``. With ``disable_tp`` the result is
+    ``(None, 0, 1)``. Otherwise the op is chosen from ``prefix`` and the
+    enabled features; when nothing matches, ``op`` is None and the rank and
+    size come from the default TP group.
+    """
+    if disable_tp:
+        return None, 0, 1
+
+    selected: Optional[Union[MLPRowParallelOp, OProjRowParallelOp,
+                             MatmulAllreduceRowParallelOp,
+                             SequenceRowParallelOp]]
+    if "down_proj" in prefix and mlp_tp_enable():
+        selected = MLPRowParallelOp(layer)
+    elif "o_proj" in prefix and oproj_tp_enable():
+        selected = OProjRowParallelOp(layer)
+    elif matmul_allreduce_enable():
+        selected = MatmulAllreduceRowParallelOp(layer)
+    elif enable_sp():
+        selected = SequenceRowParallelOp(layer, prefix)
+    else:
+        selected = None
+
+    if selected is None:
+        # No custom op applies: use the default TP group's rank and size.
+        tp_group = get_tp_group()
+        return None, tp_group.rank_in_group, tp_group.world_size
+
+    return selected, selected.tp_rank, selected.tp_size
--- a/vllm_ascend/ops/layers/init.py
+++ b/vllm_ascend/ops/layers/init.py
--- a/vllm_ascend/ops/moe/comm_utils.py
+++ b/vllm_ascend/ops/moe/comm_utils.py
@@ -1,5 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
 # Copyright 2023 The vLLM team.
+# This file is a part of the vllm-ascend project.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +14,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# This file is a part of the vllm-ascend project.
+#
 import torch
 import torch.distributed
 import torch.distributed as dist
@@ -60,3 +62,52 @@ def async_all_to_all(input_,
                                        group=group,
                                        async_op=True)
    return input_, a2a_out, handle
+
+
+def _gather_along_first_dim(input_, group, output_split_sizes=None):
+    """All-gather ``input_`` across ``group`` and concatenate along dim 0.
+
+    Args:
+        input_ (torch.Tensor):
+            Local tensor to be gathered.
+        group:
+            Process group to gather across.
+        output_split_sizes (List[int], optional):
+            Sizes of the per-rank pieces along the first dimension of the
+            gathered output. If None, every rank is assumed to contribute
+            the same number of rows. Default: None.
+
+    Returns:
+        torch.Tensor: ``input_`` itself when the group has a single rank,
+        otherwise a newly allocated tensor on the current NPU device that
+        holds the data of all ranks.
+    """
+    world_size = torch.distributed.get_world_size(group)
+    # Nothing to gather when the group contains a single device.
+    if world_size == 1:
+        return input_
+
+    equal_splits = output_split_sizes is None
+
+    # Only the leading dimension grows; trailing dimensions are kept.
+    out_shape = list(input_.size())
+    if equal_splits:
+        out_shape[0] = out_shape[0] * world_size
+    else:
+        out_shape[0] = sum(output_split_sizes)
+
+    output = torch.empty(out_shape,
+                         dtype=input_.dtype,
+                         device=torch.npu.current_device())
+
+    if equal_splits:
+        torch.distributed.all_gather_into_tensor(output,
+                                                 input_.contiguous(),
+                                                 group=group)
+    else:
+        # The views from torch.split alias `output`, so the collective
+        # writes each rank's piece directly into the result buffer.
+        per_rank_views = list(torch.split(output, output_split_sizes, dim=0))
+        torch.distributed.all_gather(per_rank_views, input_, group=group)
+
+    return output
+
+
+def gather_from_sequence_parallel_region(
+    input_,
+    group,
+    output_split_sizes=None,
+):
+    """All-gather ``input_`` along the first dimension across ``group``.
+
+    Thin wrapper that forwards all three arguments unchanged to
+    ``_gather_along_first_dim`` and returns its result. Only the forward
+    gather is performed here; no custom autograd function (and therefore no
+    reduce-scatter backward) is defined by this wrapper.
+    """
+    return _gather_along_first_dim(input_, group, output_split_sizes)
--- a/vllm_ascend/ops/layers/experts_selector.py
+++ b/vllm_ascend/ops/layers/experts_selector.py
--- a/vllm_ascend/ops/moe/fused_moe_prepare_and_finalize.py
+++ b/vllm_ascend/ops/moe/fused_moe_prepare_and_finalize.py
@@ -0,0 +1,459 @@
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+
+from abc import ABC, abstractmethod
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from vllm.distributed import tensor_model_parallel_all_reduce
+from vllm.distributed.parallel_state import (
+    get_dp_group, get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size)
+from vllm.forward_context import get_forward_context
+from vllm.model_executor.layers.fused_moe import FusedMoEConfig
+
+from vllm_ascend.utils import vllm_version_is
+
+
+class FusedMoEPrepareAndFinalize(ABC):
+    """
+    Abstract base class for MoE (Mixture-of-Experts) tensor preparation and finalization
+    in distributed environments. Subclasses implement specific communication strategies
+    (e.g., AllGather, All2All, MC2, Naive Multicast) to handle tensor padding, slicing,
+    broadcasting, and reduction across TP/DP/EP groups.
+
+    `prepare` is called before the expert computation and `finalize` after it;
+    subclasses typically stash state on `self` in `prepare` (for example the
+    original token count) and read it back in `finalize`.
+
+    Attributes:
+        moe_config (FusedMoEConfig): Configuration object containing TP/DP/EP group info,
+                                     sizes, ranks, and communication settings.
+    """
+
+    def __init__(self, moe_config: FusedMoEConfig):
+        # Only stores the config; no communication is set up here.
+        self.moe_config = moe_config
+
+    @abstractmethod
+    def prepare(self,
+                hidden_states: torch.Tensor,
+                router_logits: torch.Tensor,
+                enable_shared_expert_dp: bool = False,
+                rm_router_logits: bool = False,
+                replace_allreduce: bool = False,
+                gate=None) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Prepare tensors before MoE computation. May involve:
+          - Padding to align communication boundaries
+          - Slicing across tensor-parallel ranks
+          - Broadcasting across data-parallel ranks
+          - Recomputing router logits if needed
+
+        Args:
+            hidden_states (torch.Tensor): Input features, shape [num_tokens, hidden_size]
+            router_logits (torch.Tensor): Router outputs, shape [num_tokens, num_experts]
+            enable_shared_expert_dp (bool): Skip DP communication for shared experts
+            rm_router_logits (bool): Discard input router_logits and recompute via gate
+            replace_allreduce (bool): Bypass default all-reduce behavior
+            gate (nn.Module, optional): Gate network to recompute router_logits if needed
+
+        Returns:
+            Tuple of:
+                - processed hidden_states (may be padded/sliced/broadcasted)
+                - processed router_logits (may be recomputed or broadcasted)
+                - optional communication mask (e.g., mc2_mask for sparse ops);
+                  implementations that use no mask return None here
+
+        Raises:
+            NotImplementedError: Always, in this base implementation.
+        """
+        raise NotImplementedError("Prepare not implemented.")
+
+    # NOTE: unlike `prepare`, this method is not decorated with
+    # @abstractmethod; the default implementation simply raises.
+    def finalize(self, hidden_states: torch.Tensor,
+                 reduce_results: bool) -> torch.Tensor:
+        """
+        Finalize MoE output. May involve:
+          - Gathering sliced tensors across TP ranks
+          - Reducing or scattering across DP ranks
+          - Unpadding to original token count
+          - Applying all-reduce across TP/EP if requested
+
+        Args:
+            hidden_states (torch.Tensor): MoE layer output, possibly padded or sliced
+            reduce_results (bool): Whether to apply all-reduce across TP/EP groups
+
+        Returns:
+            torch.Tensor: Final output with shape [original_num_tokens, hidden_size]
+
+        Raises:
+            NotImplementedError: Always, in this base implementation.
+        """
+        raise NotImplementedError("Finalize function not implemented.")
+
+
+class FusedMoEPrepareAndFinalizeWithMC2(FusedMoEPrepareAndFinalize):
+    """
+    MoE prepare/finalize strategy for the MC2 communication path.
+    NOTE(review): the expansion of the acronym "MC2" is not established in
+    this file - confirm before spelling it out.
+    Relies on `mc2_mask` and `padded_num_tokens` from forward_context for alignment.
+
+    `prepare` stores `replace_allreduce`, `enable_shared_expert_dp`,
+    `num_tokens` and (when TP slicing happens) `split_hidden_states` on the
+    instance; `finalize` reads them back, so the two calls must be paired.
+    """
+
+    def __init__(self, moe_config: FusedMoEConfig):
+        super().__init__(moe_config)
+        self._restore_tp_across_dp()
+
+    def _restore_tp_across_dp(self):
+        """
+        Cache the tensor-parallel world size and rank on the instance
+        (`self.tp_size`, `self.tp_rank`) for token slicing.
+
+        NOTE(review): the method name suggests the TP values in `moe_config`
+        may be flattened with DP; here they are simply read from
+        `get_tensor_model_parallel_world_size()` / `..._rank()` - confirm.
+        """
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+    def prepare(self,
+                hidden_states: torch.Tensor,
+                router_logits: torch.Tensor,
+                enable_shared_expert_dp: bool = False,
+                rm_router_logits: bool = False,
+                replace_allreduce: bool = False,
+                gate=None) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Preparation steps:
+          1. Fetch `mc2_mask` from the forward context and, if TP > 1, keep
+             only the current TP rank's slice (done regardless of the flags).
+          2. Unless `replace_allreduce` is True:
+               a. Record the incoming token count in `self.num_tokens`.
+               b. Pad `hidden_states` and `router_logits` up to
+                  `forward_context.padded_num_tokens` (skipped when
+                  `enable_shared_expert_dp` is True).
+               c. If TP > 1, split both tensors along the token dimension and
+                  keep the current TP rank's slice (skipped when
+                  `enable_shared_expert_dp` is True); the hidden-state splits
+                  are saved for `finalize`.
+
+        `rm_router_logits` and `gate` are accepted for interface
+        compatibility and are not read by this implementation.
+
+        Returns:
+            Tuple of (hidden_states, router_logits, mc2_mask), possibly sliced/padded.
+        """
+        self.replace_allreduce = replace_allreduce
+        self.enable_shared_expert_dp = enable_shared_expert_dp
+        forward_context = get_forward_context()
+        mc2_mask = forward_context.mc2_mask
+        if self.tp_size > 1:
+            # Also slice mc2_mask
+            split_mc2_mask = torch.tensor_split(mc2_mask, self.tp_size, dim=0)
+            mc2_mask = split_mc2_mask[self.tp_rank]
+
+        if not self.replace_allreduce:
+            self.num_tokens, _ = hidden_states.shape
+            target_pad_length = forward_context.padded_num_tokens
+            pad_size = target_pad_length - self.num_tokens
+
+            # Pad if necessary (unless shared expert DP is enabled)
+            if pad_size > 0 and not self.enable_shared_expert_dp:
+                # (0, 0, 0, pad_size): leave the last dim untouched and
+                # append `pad_size` rows at the end of the token dim.
+                hidden_states = nn.functional.pad(hidden_states,
+                                                  (0, 0, 0, pad_size))
+                router_logits = nn.functional.pad(router_logits,
+                                                  (0, 0, 0, pad_size))
+
+            # Slice across TP ranks
+            if self.tp_size > 1 and not self.enable_shared_expert_dp:
+                split_hidden_states = torch.tensor_split(hidden_states,
+                                                         self.tp_size,
+                                                         dim=0)
+                split_router_logits = torch.tensor_split(router_logits,
+                                                         self.tp_size,
+                                                         dim=0)
+                hidden_states = split_hidden_states[self.tp_rank]
+                router_logits = split_router_logits[self.tp_rank]
+                self.split_hidden_states = split_hidden_states  # Save for finalize
+
+        return hidden_states, router_logits, mc2_mask
+
+    def finalize(self, hidden_states: torch.Tensor,
+                 reduce_results: bool) -> torch.Tensor:
+        """
+        Finalization steps:
+          1. If TP > 1, all-gather slices from all TP ranks to reconstruct full tensor.
+          2. Unpad to original token count if padding was applied.
+          3. Return tensor with shape [original_num_tokens, hidden_size].
+
+        Skips communication and unpadding if `enable_shared_expert_dp` or `replace_allreduce` is True.
+        `reduce_results` is not read by this implementation.
+        """
+        if not (self.enable_shared_expert_dp or self.replace_allreduce):
+            if self.tp_size > 1:
+                # All-gather across TP group
+                # The saved splits are reused as the receive buffers.
+                dist.all_gather(list(self.split_hidden_states), hidden_states,
+                                self.moe_config.tp_group.device_group)
+                hidden_states = torch.cat(self.split_hidden_states, dim=0)
+
+                # TODO: It is a quick bugfix for the memory explosion issue in eager mode.
+                # If the cache is not cleared after `self.split_hidden_states` is created,
+                # it can lead to the memory explosion in eager mode.
+                del self.split_hidden_states
+
+            # Unpad if necessary
+            if self.num_tokens < hidden_states.shape[0]:
+                hidden_states = hidden_states[:self.num_tokens]
+
+        return hidden_states
+
+
+class FusedMoEPrepareAndFinalizeWithAll2All(FusedMoEPrepareAndFinalize):
+    """
+    MoE communication strategy using All-to-All style slicing.
+    Similar to MC2 but does not use mc2_mask; instead pads the token dimension
+    to a multiple of the TP size so that every TP rank gets an equal slice.
+    Will be used when num_tokens exceed mc2's limitation (512 tokens/rank).
+
+    `prepare` stores `replace_allreduce`, `enable_shared_expert_dp`,
+    `num_tokens` and (when TP > 1) `split_hidden_states` on the instance;
+    `finalize` reads them back, so the two calls must be paired.
+    """
+
+    def __init__(self, moe_config: FusedMoEConfig):
+        super().__init__(moe_config)
+        self._restore_tp_across_dp()
+
+    def _restore_tp_across_dp(self):
+        """Cache the TP world size and rank on the instance (same as MC2)."""
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+    def prepare(self,
+                hidden_states: torch.Tensor,
+                router_logits: torch.Tensor,
+                enable_shared_expert_dp: bool = False,
+                rm_router_logits: bool = False,
+                replace_allreduce: bool = False,
+                gate=None) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Preparation steps:
+          1. Pad hidden_states and router_logits to the next multiple of TP size
+             (and to at least TP size rows).
+          2. If TP > 1, split along token dim and select current TP rank's slice.
+          3. Save splits for later all-gather in finalize.
+
+        Skips if `enable_shared_expert_dp` or `replace_allreduce` is True.
+        `rm_router_logits` and `gate` are accepted for interface
+        compatibility and are not read by this implementation.
+
+        Returns:
+            Tuple of (hidden_states, router_logits, None) — no mask used in All2All.
+        """
+        self.replace_allreduce = replace_allreduce
+        self.enable_shared_expert_dp = enable_shared_expert_dp
+
+        if not (self.replace_allreduce or self.enable_shared_expert_dp):
+            self.num_tokens, _ = hidden_states.shape
+
+            # Round the token count up to a multiple of tp_size, keeping at
+            # least tp_size rows. The previous `tp_size - num_tokens` only
+            # padded when there were fewer tokens than ranks, which left
+            # unevenly sized slices (and therefore unevenly sized all-gather
+            # buffers in `finalize`) whenever num_tokens > tp_size and
+            # num_tokens % tp_size != 0. For num_tokens <= tp_size the
+            # result is identical to the previous behaviour.
+            padded_num_tokens = max(
+                self.tp_size,
+                -(-self.num_tokens // self.tp_size) * self.tp_size)
+            pad_size = padded_num_tokens - self.num_tokens
+
+            if pad_size > 0:
+                # (0, 0, 0, pad_size): leave the last dim untouched and
+                # append `pad_size` rows at the end of the token dim.
+                hidden_states = nn.functional.pad(hidden_states,
+                                                  (0, 0, 0, pad_size))
+                router_logits = nn.functional.pad(router_logits,
+                                                  (0, 0, 0, pad_size))
+
+            if self.tp_size > 1:
+                split_hidden_states = torch.tensor_split(hidden_states,
+                                                         self.tp_size,
+                                                         dim=0)
+                split_router_logits = torch.tensor_split(router_logits,
+                                                         self.tp_size,
+                                                         dim=0)
+                self.split_hidden_states = split_hidden_states  # Save for finalize
+
+                hidden_states = split_hidden_states[self.tp_rank]
+                router_logits = split_router_logits[self.tp_rank]
+
+        return hidden_states, router_logits, None
+
+    def finalize(self, hidden_states: torch.Tensor,
+                 reduce_results: bool) -> torch.Tensor:
+        """
+        Finalization steps:
+          1. If TP > 1, all-gather slices to reconstruct full tensor.
+          2. Unpad to original token count.
+          3. Return [original_num_tokens, hidden_size] tensor.
+
+        Skips if `enable_shared_expert_dp` or `replace_allreduce` is True.
+        `reduce_results` is not read by this implementation.
+        """
+        if not (self.enable_shared_expert_dp or self.replace_allreduce):
+            if self.tp_size > 1:
+                # The saved splits are reused as the receive buffers.
+                dist.all_gather(list(self.split_hidden_states), hidden_states,
+                                self.moe_config.tp_group.device_group)
+                hidden_states = torch.cat(self.split_hidden_states, dim=0)
+
+                # TODO: It is a quick bugfix for the memory explosion issue in eager mode.
+                # If the cache is not cleared after `self.split_hidden_states` is created,
+                # it can lead to the memory explosion in eager mode.
+                del self.split_hidden_states
+
+            # Drop the rows that were appended as padding in `prepare`.
+            if self.num_tokens < hidden_states.shape[0]:
+                hidden_states = hidden_states[:self.num_tokens]
+
+        return hidden_states
+
+
+class FusedMoEPrepareAndFinalizeWithAllGather(FusedMoEPrepareAndFinalize):
+    """
+    MoE communication strategy built on All-Gather + Reduce-Scatter.
+    When DP > 1 the inputs of all DP ranks are gathered before the MoE
+    computation and the outputs are reduce-scattered back afterwards.
+    `max_tokens_across_dp` from the forward context gives the padded length.
+    """
+
+    def prepare(self,
+                hidden_states: torch.Tensor,
+                router_logits: torch.Tensor,
+                enable_shared_expert_dp: bool = False,
+                rm_router_logits: bool = False,
+                replace_allreduce: bool = False,
+                gate=None) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Build the DP-global inputs for the MoE layer.
+
+        With DP == 1 the inputs are returned untouched. Otherwise:
+          1. Remember the local token count for `finalize`.
+          2. Pad the local tensors up to `max_tokens_across_dp`.
+          3. All-gather hidden_states across the DP group.
+          4. Either recompute router_logits from the gathered hidden_states
+             via `gate` (`rm_router_logits=True`) or all-gather them as well.
+
+        Returns:
+            Tuple of (hidden_states, router_logits, None)
+        """
+        self.enable_shared_expert_dp = enable_shared_expert_dp
+
+        if self.moe_config.dp_size <= 1:
+            return hidden_states, router_logits, None
+
+        padded_len = get_forward_context().max_tokens_across_dp
+        self.num_tokens = hidden_states.shape[0]
+        rows_to_add = padded_len - self.num_tokens
+        must_pad = rows_to_add > 0
+
+        if must_pad:
+            hidden_states = nn.functional.pad(hidden_states,
+                                              (0, 0, 0, rows_to_add))
+        hidden_states = self.moe_config.dp_group.all_gather(hidden_states, 0)
+
+        if rm_router_logits:
+            # Router logits are derived from the gathered activations.
+            router_logits, _ = gate(hidden_states)
+            return hidden_states, router_logits, None
+
+        if must_pad:
+            router_logits = nn.functional.pad(router_logits,
+                                              (0, 0, 0, rows_to_add))
+        router_logits = self.moe_config.dp_group.all_gather(router_logits, 0)
+
+        return hidden_states, router_logits, None
+
+    def finalize(self, hidden_states: torch.Tensor,
+                 reduce_results: bool) -> torch.Tensor:
+        """
+        Turn the DP-global MoE output back into the local output.
+
+          1. If DP > 1 and shared-expert DP is off, reduce-scatter across the
+             DP group and keep the first `num_tokens` rows.
+          2. If `reduce_results` is set and TP or EP > 1, apply
+             tensor_model_parallel_all_reduce.
+
+        Returns:
+            The resulting hidden_states tensor.
+        """
+        uses_dp = self.moe_config.dp_size > 1
+        if uses_dp and not self.enable_shared_expert_dp:
+            scattered = get_dp_group().reduce_scatter(hidden_states, 0)
+            hidden_states = scattered[:self.num_tokens]
+
+        if not reduce_results:
+            return hidden_states
+
+        if self.moe_config.tp_size > 1 or self.moe_config.ep_size > 1:
+            return tensor_model_parallel_all_reduce(hidden_states)
+
+        return hidden_states
+
+
+class FusedMoEPrepareAndFinalizeWithNaiveMulticast(FusedMoEPrepareAndFinalize):
+    """
+    MoE communication strategy using Naive Multicast (one broadcast per DP rank).
+    Will be used in prefill when using allgather in decode. Every DP rank
+    broadcasts its own rows to all other ranks.
+    The cumulative token counts (`cu_tokens_across_dp_cpu`) mark where each
+    rank's rows live inside the global buffer.
+    """
+
+    def _naive_multicast(self, x: torch.Tensor,
+                         cu_tokens_across_dp_cpu: torch.Tensor):
+        """
+        Assemble the DP-global tensor from every rank's local rows.
+
+          1. Allocate a buffer with `cu_tokens_across_dp_cpu[-1]` rows.
+          2. Copy the local rows into this rank's region of the buffer.
+          3. Broadcast each rank's region from its owner, one rank at a time.
+
+        Args:
+            x (torch.Tensor): Local 2D tensor [local_tokens, features]
+            cu_tokens_across_dp_cpu (torch.Tensor): Cumulative token counts per DP rank
+
+        Returns:
+            torch.Tensor: Global tensor [total_tokens, features]
+        """
+        assert len(x.shape) == 2, "Input must be 2D [tokens, features]"
+
+        def row_range(rank):
+            # Rows owned by `rank`: [cu[rank - 1], cu[rank]), starting at 0
+            # for the first rank.
+            lower = 0 if rank == 0 else cu_tokens_across_dp_cpu[rank - 1]
+            return lower, cu_tokens_across_dp_cpu[rank]
+
+        total_rows = cu_tokens_across_dp_cpu[-1]
+        gathered = torch.empty((total_rows, x.size(1)),
+                               device=x.device,
+                               dtype=x.dtype)
+
+        # Place the local rows into their slot.
+        own_lo, own_hi = row_range(self.moe_config.dp_rank)
+        gathered[own_lo:own_hi, :].copy_(x)
+
+        # Every rank takes a turn as the broadcast source for its own slot.
+        for src_rank in range(self.moe_config.dp_size):
+            lo, hi = row_range(src_rank)
+            get_dp_group().broadcast(gathered[lo:hi, :], src_rank)
+        return gathered
+
+    def prepare(self,
+                hidden_states: torch.Tensor,
+                router_logits: torch.Tensor,
+                enable_shared_expert_dp: bool = False,
+                rm_router_logits: bool = False,
+                replace_allreduce: bool = False,
+                gate=None) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Build the DP-global inputs for the MoE layer.
+
+        With DP == 1 the inputs are returned untouched. Otherwise:
+          1. Read the cumulative token boundaries from the forward context
+             (the accessor depends on the vLLM version) and keep them for
+             `finalize`.
+          2. Multicast hidden_states into a global tensor.
+          3. Either recompute router_logits from the global hidden_states via
+             `gate` (`rm_router_logits=True`) or multicast them as well.
+
+        Returns:
+            Tuple of (hidden_states, router_logits, None)
+        """
+        self.enable_shared_expert_dp = enable_shared_expert_dp
+
+        if self.moe_config.dp_size <= 1:
+            return hidden_states, router_logits, None
+
+        legacy_metadata = vllm_version_is("0.10.2")
+        dp_metadata = get_forward_context().dp_metadata
+        if legacy_metadata:
+            self.cu_tokens_across_dp_cpu = dp_metadata.cu_tokens_across_dp_cpu
+        else:
+            self.cu_tokens_across_dp_cpu = dp_metadata.cu_tokens_across_sp(1)
+
+        hidden_states = self._naive_multicast(hidden_states,
+                                              self.cu_tokens_across_dp_cpu)
+        if rm_router_logits:
+            router_logits, _ = gate(hidden_states)
+        else:
+            router_logits = self._naive_multicast(
+                router_logits, self.cu_tokens_across_dp_cpu)
+
+        return hidden_states, router_logits, None
+
+    def finalize(self, hidden_states: torch.Tensor,
+                 reduce_results: bool) -> torch.Tensor:
+        """
+        Turn the DP-global MoE output back into the local output.
+
+          1. If DP > 1 and shared-expert DP is off:
+               - all-reduce across the DP group
+               - keep this rank's rows as given by cu_tokens_across_dp_cpu
+          2. If `reduce_results` is set and TP or EP > 1, apply
+             tensor_model_parallel_all_reduce.
+
+        Returns:
+            The resulting hidden_states tensor.
+        """
+        uses_dp = self.moe_config.dp_size > 1
+        if uses_dp and not self.enable_shared_expert_dp:
+            rank = self.moe_config.dp_rank
+            first_row = (self.cu_tokens_across_dp_cpu[rank - 1]
+                         if rank != 0 else 0)
+            last_row = self.cu_tokens_across_dp_cpu[rank]
+            summed = get_dp_group().all_reduce(hidden_states)  # Sum across DP
+            hidden_states = summed[first_row:last_row, :]
+
+        if not reduce_results:
+            return hidden_states
+
+        if self.moe_config.tp_size > 1 or self.moe_config.ep_size > 1:
+            return tensor_model_parallel_all_reduce(hidden_states)
+
+        return hidden_states
--- a/vllm_ascend/ops/moe/moe_comm_method.py
+++ b/vllm_ascend/ops/moe/moe_comm_method.py
@@ -0,0 +1,273 @@
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional
+
+import torch
+from vllm.config import get_current_vllm_config
+from vllm.forward_context import get_forward_context
+from vllm.model_executor.layers.fused_moe import FusedMoEConfig
+
+from vllm_ascend.ascend_forward_context import MoECommType
+from vllm_ascend.ops.moe.fused_moe_prepare_and_finalize import (
+    FusedMoEPrepareAndFinalizeWithAll2All,
+    FusedMoEPrepareAndFinalizeWithAllGather, FusedMoEPrepareAndFinalizeWithMC2,
+    FusedMoEPrepareAndFinalizeWithNaiveMulticast)
+from vllm_ascend.ops.moe.moe_mlp import unified_apply_mlp
+from vllm_ascend.ops.moe.token_dispatcher import (TokenDispatcherWithAll2AllV,
+                                                  TokenDispatcherWithAllGather,
+                                                  TokenDispatcherWithMC2,
+                                                  TokenDispatcherWithMoge)
+
+# Module-level registry mapping a communication type to its implementation
+# instance. It starts empty and is filled by `setup_moe_comm_method`.
+_MoECommMethods: Dict[Optional[MoECommType], MoECommMethod] = {}
+
+
+def get_moe_comm_method(
+        moe_comm_type: Optional[MoECommType]) -> Optional[MoECommMethod]:
+    """Return the registered implementation for `moe_comm_type`.
+
+    Returns None when nothing is registered under that key (for example
+    before `setup_moe_comm_method` has run).
+    """
+    return _MoECommMethods.get(moe_comm_type)
+
+
+def setup_moe_comm_method(moe_config):
+    """Instantiate every MoE communication method and register it.
+
+    Each implementation is constructed with `moe_config` and stored in the
+    module-level `_MoECommMethods` registry under its `MoECommType`,
+    replacing any entry already present for that type.
+    """
+    implementations = (
+        (MoECommType.ALLTOALL, AlltoAllCommImpl),
+        (MoECommType.ALLGATHER, AllGatherCommImpl),
+        (MoECommType.MC2, MC2CommImpl),
+        (MoECommType.NAIVE_MULTICAST, NaiveMulticastCommImpl),
+    )
+    for comm_type, impl_cls in implementations:
+        _MoECommMethods[comm_type] = impl_cls(moe_config)
+
+
+class MoECommMethod(ABC):
+    """Base class for MoE communication methods.
+
+    An instance bundles two collaborators chosen by the subclass:
+      - `token_dispatcher`: dispatches tokens to experts and combines the
+        expert outputs (see `fused_experts`).
+      - `fused_moe_prepare_finalize`: prepares the inputs before and
+        finalizes the outputs after the expert computation.
+
+    `prepare` caches the mask returned by the prepare step in `self.mc2_mask`
+    so that a following `fused_experts` call can pass it to the dispatcher.
+    """
+
+    def __init__(self, moe_config: FusedMoEConfig):
+        # The model type is read from the current vLLM config; subclasses may
+        # use it to pick a dispatcher.
+        self.model_type = get_current_vllm_config(
+        ).model_config.hf_config.model_type
+        self.moe_config = moe_config
+        self.mc2_mask = None
+
+        # Both factory hooks are implemented by the concrete subclass.
+        self.token_dispatcher = self._get_token_dispatcher()
+        self.fused_moe_prepare_finalize = self._get_fused_moe_prepare_finalize(
+        )
+
+    def prepare(self,
+                hidden_states: torch.Tensor,
+                router_logits: torch.Tensor,
+                enable_shared_expert_dp: bool = False,
+                rm_router_logits: bool = False,
+                replace_allreduce: bool = False,
+                gate=None) -> tuple[torch.Tensor, torch.Tensor]:
+        """Run the prepare step and remember the returned mask.
+
+        All arguments are forwarded positionally to
+        `self.fused_moe_prepare_finalize.prepare`. The third element of its
+        result is stored in `self.mc2_mask`; only the first two are returned.
+
+        Returns:
+            Tuple of (hidden_states, router_logits) as produced by the
+            prepare step.
+        """
+        hidden_states, router_logits, mc2_mask = self.fused_moe_prepare_finalize.prepare(
+            hidden_states, router_logits, enable_shared_expert_dp,
+            rm_router_logits, replace_allreduce, gate)
+        self.mc2_mask = mc2_mask
+        return hidden_states, router_logits
+
+    def finalize(self, hidden_states: torch.Tensor,
+                 reduce_results: bool) -> torch.Tensor:
+        """Delegate to `self.fused_moe_prepare_finalize.finalize` and return its result."""
+        hidden_states = self.fused_moe_prepare_finalize.finalize(
+            hidden_states, reduce_results)
+        return hidden_states
+
+    def fused_experts(
+            self,
+            hidden_states: torch.Tensor,
+            w1: torch.Tensor,
+            w2: torch.Tensor,
+            topk_weights: torch.Tensor,
+            topk_ids: torch.Tensor,
+            row_idx: torch.Tensor,
+            activation: str = "silu",
+            apply_router_weight_on_input: bool = False,
+            use_int8_w8a8: bool = False,
+            use_int4_w4a8: bool = False,
+            global_num_experts: Optional[int] = None,
+            expert_map: Optional[torch.Tensor] = None,
+            w1_scale: Optional[torch.Tensor] = None,
+            w2_scale: Optional[torch.Tensor] = None,
+            w1_scale_bias: torch.Tensor = None,
+            w2_scale_bias: torch.Tensor = None,
+            # For TorchAir graph
+            is_torchair: bool = False,
+            # For Cube/Vector parallel
+            shared_experts: Optional[Any] = None,
+            quantized_x_for_share: Optional[Any] = None,
+            dynamic_scale_for_share: Optional[Any] = None,
+            # For load balance
+            log2phy: torch.Tensor = None,
+            global_redundant_expert_num: int = 0,
+            need_trans: bool = False,
+            dynamic_eplb: bool = False):
+        """Run the expert computation: dispatch -> MLP -> combine.
+
+        Steps:
+          1. `token_dispatcher.token_dispatch` routes the tokens; the mask
+             cached by `prepare` is passed as `mc2_mask`.
+          2. `unified_apply_mlp` applies the expert weights (`w1`, `w2` and
+             the optional scales/biases) to the dispatched tokens.
+          3. `token_dispatcher.token_combine` merges the expert outputs.
+
+        Quantized execution is requested when either `use_int8_w8a8` or
+        `use_int4_w4a8` is set; `fusion` is enabled only for `use_int8_w8a8`.
+
+        `activation`, `global_num_experts` and `is_torchair` are accepted
+        but not read in this method.
+
+        Returns:
+            The combined hidden states, or the tuple
+            `(final_hidden_states, group_list_type, expert_tokens)` when
+            `dynamic_eplb` is True.
+
+        Raises:
+            AssertionError: If `hidden_states` is not float32/float16/bfloat16
+                or the forward context has no `moe_comm_method`.
+        """
+        # Check constraints
+        assert hidden_states.dtype in [
+            torch.float32, torch.float16, torch.bfloat16
+        ]
+
+        # Only used as a sanity check that a communication method was
+        # selected for this forward pass.
+        moe_comm_method = get_forward_context().moe_comm_method
+        assert moe_comm_method is not None, "Missing communication context"
+
+        results = self.token_dispatcher.token_dispatch(
+            hidden_states=hidden_states,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            row_idx=row_idx,
+            expert_map=expert_map,
+            log2phy=log2phy,
+            global_redundant_expert_num=global_redundant_expert_num,
+            shared_experts=shared_experts,
+            quantized_x_for_share=quantized_x_for_share,
+            dynamic_scale_for_share=dynamic_scale_for_share,
+            mc2_mask=self.mc2_mask,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            with_quant=use_int8_w8a8 or use_int4_w4a8)
+
+        # "dynamic_scale" and "topk_scales" are optional keys of the dispatch
+        # result (None when absent); the other three are required.
+        permuted_hidden_states, expert_tokens, dynamic_scale, group_list_type, topk_scales = \
+            results["hidden_states"], results["group_list"], results.get("dynamic_scale"), results["group_list_type"], results.get("topk_scales")
+
+        mlp_output = unified_apply_mlp(hidden_states=permuted_hidden_states,
+                                       w1=w1,
+                                       w1_scale=w1_scale,
+                                       w2=w2,
+                                       w2_scale=w2_scale,
+                                       group_list=expert_tokens,
+                                       dynamic_scale=dynamic_scale,
+                                       group_list_type=group_list_type,
+                                       w1_scale_bias=w1_scale_bias,
+                                       w2_scale_bias=w2_scale_bias,
+                                       topk_scales=topk_scales,
+                                       with_quant=use_int8_w8a8
+                                       or use_int4_w4a8,
+                                       fusion=use_int8_w8a8,
+                                       need_trans=need_trans)
+
+        final_hidden_states = self.token_dispatcher.token_combine(
+            hidden_states=mlp_output)
+
+        # With dynamic EPLB the caller also receives the per-expert token
+        # information from the dispatch step.
+        if dynamic_eplb:
+            return (final_hidden_states, group_list_type, expert_tokens)
+
+        return final_hidden_states
+
+    @abstractmethod
+    def _get_token_dispatcher(self):
+        """Return the token dispatcher used by `fused_experts` (subclass hook)."""
+        raise NotImplementedError(
+            "_get_token_dispatcher function not implemented.")
+
+    @abstractmethod
+    def _get_fused_moe_prepare_finalize(self):
+        """Return the prepare/finalize helper used by `prepare` and `finalize` (subclass hook)."""
+        raise NotImplementedError(
+            "_get_fused_moe_prepare_finalize function not implemented.")
+
+
+class AllGatherCommImpl(MoECommMethod):
+    """All-gather based MoE communication using NPU-specific ops.
+
+    Functionally the same as NativeAllGatherCommImpl, but relies on
+    NPU-specific ops for better performance. It should be compatible with
+    all scenarios and is therefore the default MoE communication method.
+    Pre-processing uses `torch_npu.npu_moe_init_routing_v2` and
+    post-processing uses `torch_npu.npu_moe_token_unpermute` to handle the
+    token-to-expert mapping and communication efficiently.
+
+    NOTE(Yizhou): The natural pairings would be
+    `torch_npu.npu_moe_init_routing_v2` with `torch_npu.npu_moe_finalize_routing`,
+    or `torch_npu.npu_moe_token_permute` with `torch_npu.npu_moe_token_unpermute`.
+    However `npu_moe_finalize_routing` leads to accuracy issues, so
+    `torch_npu.npu_moe_token_unpermute` is used for post-processing instead.
+    This is a workaround and should be removed after the issue is fixed.
+    """
+
+    def _get_token_dispatcher(self):
+        """Build the dispatcher: the Moge variant for PanguProMoE, all-gather otherwise."""
+        if self.model_type == "PanguProMoE":
+            dispatcher_cls = TokenDispatcherWithMoge
+        else:
+            dispatcher_cls = TokenDispatcherWithAllGather
+
+        # Both dispatcher variants take the same constructor arguments.
+        return dispatcher_cls(
+            top_k=self.moe_config.experts_per_token,
+            num_experts=self.moe_config.num_experts,
+            num_local_experts=self.moe_config.num_local_experts)
+
+    def _get_fused_moe_prepare_finalize(self):
+        """Build the all-gather prepare/finalize helper for this config."""
+        return FusedMoEPrepareAndFinalizeWithAllGather(self.moe_config)
+
+
+class MC2CommImpl(MoECommMethod):
+    """This implementation is for the scenarios listed below:
+    1. `enable_expert_parallel=True` (`enable_expert_parallel=False` is not
+       supported).
+    2. `npu_moe_distribute_dispatch` and `npu_moe_distribute_combine` are available.
+
+    This implementation uses the MC2 communication method, which is optimized for
+    Communication and Computation parallelism on Ascend devices.
+    """
+
+    def _get_token_dispatcher(self):
+        return TokenDispatcherWithMC2()
+
+    def _get_fused_moe_prepare_finalize(self):
+        return FusedMoEPrepareAndFinalizeWithMC2(self.moe_config)
+
+
+class AlltoAllCommImpl(MoECommMethod):
+    """This implementation is for the scenarios listed below:
+    1. `enable_expert_parallel=True`.
+    2. `npu_grouped_matmul` is available.
+
+    This implementation uses all-to-all communication to exchange tokens
+    between data parallel ranks before and after the MLP computation. It should
+    have better performance than AllGatherCommImpl when DP size > 1.
+    """
+
+    def _get_token_dispatcher(self):
+        return TokenDispatcherWithAll2AllV(
+            top_k=self.moe_config.experts_per_token,
+            num_experts=self.moe_config.num_experts,
+            num_local_experts=self.moe_config.num_local_experts)
+
+    def _get_fused_moe_prepare_finalize(self):
+        return FusedMoEPrepareAndFinalizeWithAll2All(self.moe_config)
+
+
+class NaiveMulticastCommImpl(MoECommMethod):
+    """This implementation uses `TokenDispatcherWithAllGather` for token
+    dispatch/combine and `FusedMoEPrepareAndFinalizeWithNaiveMulticast` for
+    the prepare/finalize steps.
+
+    NOTE(review): this docstring was originally copied verbatim from
+    AllGatherCommImpl; confirm the scenarios this method is intended for.
+    The dispatcher uses `torch_npu.npu_moe_init_routing_v2` for pre-processing
+    and `torch_npu.npu_moe_token_unpermute` for post-processing.
+
+    NOTE(Yizhou): TBH, it is really weird that we were supposed to use
+    `torch_npu.npu_moe_init_routing_v2` and `torch_npu.npu_moe_finalize_routing`
+    or `torch_npu.npu_moe_token_permute` and `torch_npu.npu_moe_token_unpermute`
+    for pre-processing and post-processing, respectively.
+    But `npu_moe_finalize_routing` will lead to accuracy issues so we have to
+    use `torch_npu.npu_moe_token_unpermute` instead.
+    This is a workaround and should be removed after the issue is fixed.
+    """
+
+    def _get_token_dispatcher(self):
+        return TokenDispatcherWithAllGather(
+            top_k=self.moe_config.experts_per_token,
+            num_experts=self.moe_config.num_experts,
+            num_local_experts=self.moe_config.num_local_experts)
+
+    def _get_fused_moe_prepare_finalize(self):
+        return FusedMoEPrepareAndFinalizeWithNaiveMulticast(self.moe_config)
--- a/vllm_ascend/ops/layers/moe_mlp.py
+++ b/vllm_ascend/ops/layers/moe_mlp.py
@@ -18,22 +18,52 @@ from typing import Optional

 import torch
 import torch_npu
+from torch.nn.functional import pad
 from vllm.forward_context import get_forward_context

-from vllm_ascend.ascend_forward_context import FusedMoEState
+from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.utils import dispose_tensor, is_310p


+def cumsum_group_list(group_list: torch.Tensor,
+                      group_list_type: int,
+                      active_num: int = 0,
+                      expert_num: int = 0) -> torch.Tensor:
+    if group_list_type not in [0, 1, 2]:
+        raise ValueError(
+            f"group_list_type should be in [0, 1, 2], but received {group_list_type}"
+        )
+
+    if group_list_type == 0:
+        return group_list
+    if group_list_type == 1:
+        return group_list.cumsum(dim=0)
+
+    experts = pad(group_list[:, 0], (1, 0))
+    tokens = pad(group_list[:, 1].cumsum(dim=0), (1, 0))
+    cumsum_group_list = torch.full(size=(expert_num, ),
+                                   fill_value=active_num,
+                                   dtype=group_list.dtype,
+                                   device=group_list.device)
+
+    for i, (start, end) in enumerate(zip(experts[:-1], experts[1:])):
+        if end > start:
+            cumsum_group_list[start:end] = tokens[i]
+
+    return cumsum_group_list
+
+
 def quant_apply_mlp(hidden_states: torch.Tensor,
                    w1: torch.Tensor,
                    w1_scale: torch.Tensor,
                    w2: torch.Tensor,
                    w2_scale: torch.Tensor,
                    group_list: torch.Tensor,
-                    dynamic_scale: torch.Tensor = None,
                    group_list_type: int = 1,
+                    dynamic_scale: torch.Tensor = None,
                    w1_scale_bias: torch.Tensor = None,
-                    w2_scale_bias: torch.Tensor = None) -> torch.Tensor:
+                    w2_scale_bias: torch.Tensor = None,
+                    fusion: bool = False) -> torch.Tensor:
    if dynamic_scale is None:
        unquantized_hidden_states = hidden_states
        hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant(
@@ -47,33 +77,40 @@ def quant_apply_mlp(hidden_states: torch.Tensor,
    bias1, bias2 = None, None
    _output_dtype = w2_scale.dtype

-    is_mc2 = get_forward_context().fused_moe_state == FusedMoEState.MC2
+    is_mc2 = get_forward_context().moe_comm_type == MoECommType.MC2
    if w1_scale_bias is None and is_mc2:
-        w1_scale = w1_scale.to(torch.float32)
-
-        # gmm1: gate_up_proj
-        hidden_states = torch_npu.npu_grouped_matmul(
-            x=[hidden_states],
-            weight=[w1],
-            split_item=3,
-            group_list_type=group_list_type,
-            group_type=0,
-            group_list=group_list,
-            output_dtype=torch.int32)[0]
-
-        # act_fn: swiglu
-        hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant(
-            x=hidden_states,
-            weight_scale=w1_scale,
-            activation_scale=pertoken_scale,
-            bias=None,
-            quant_scale=None,
-            quant_offset=None,
-            group_index=group_list,
-            activate_left=True,
-            quant_mode=1,
-        )
-
+        if fusion:
+            # gmm1: gate_up_proj & act_fn: swiglu
+            hidden_states, swiglu_out_scale, _ = torch_npu.npu_grouped_matmul_swiglu_quant(
+                x=hidden_states,
+                weight=w1,
+                group_list=cumsum_group_list(group_list, group_list_type),
+                weight_scale=w1_scale,
+                x_scale=pertoken_scale)
+        else:
+            if w1_scale.dtype != torch.float32:
+                w1_scale = w1_scale.to(torch.float32)
+            # gmm1: gate_up_proj
+            hidden_states = torch_npu.npu_grouped_matmul(
+                x=[hidden_states],
+                weight=[w1],
+                split_item=3,
+                group_list_type=group_list_type,
+                group_type=0,
+                group_list=group_list,
+                output_dtype=torch.int32)[0]
+            # act_fn: swiglu
+            hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant(
+                x=hidden_states,
+                weight_scale=w1_scale,
+                activation_scale=pertoken_scale,
+                bias=None,
+                quant_scale=None,
+                quant_offset=None,
+                group_index=group_list,
+                activate_left=True,
+                quant_mode=1,
+            )
        # gmm2: down_proj
        hidden_states = torch_npu.npu_grouped_matmul(
            x=[hidden_states],
@@ -92,29 +129,37 @@ def quant_apply_mlp(hidden_states: torch.Tensor,
                    [group_list[:1],
                     torch.diff(group_list, dim=0)])
                group_list_type = 1
-            bias1 = [w1_scale_bias]
+            bias1 = [w1_scale_bias] if not fusion else w1_scale_bias
            bias2 = [w2_scale_bias]
            # TODO w4a8 scene: dynamic acquisition of dtype in the future
            _output_dtype = torch.bfloat16

-        # gmm1: gate_up_proj
-        hidden_states = torch_npu.npu_grouped_matmul(
-            x=[hidden_states],
-            weight=[w1],
-            scale=[w1_scale],
-            bias=bias1,
-            per_token_scale=[pertoken_scale],
-            split_item=2,
-            group_list_type=group_list_type,
-            group_type=0,
-            group_list=group_list,
-            output_dtype=_output_dtype)[0]
-
-        # act_fn: swiglu
-        hidden_states = torch_npu.npu_swiglu(hidden_states)
-        hidden_states, swiglu_out_scale = torch_npu.npu_dynamic_quant(
-            hidden_states)
-
+        if fusion:
+            # gmm1: gate_up_proj & act_fn: swiglu
+            hidden_states, swiglu_out_scale, _ = torch_npu.npu_grouped_matmul_swiglu_quant(
+                x=hidden_states,
+                weight=w1,
+                bias=bias1,
+                group_list=cumsum_group_list(group_list, group_list_type),
+                weight_scale=w1_scale,
+                x_scale=pertoken_scale)
+        else:
+            # gmm1: gate_up_proj
+            hidden_states = torch_npu.npu_grouped_matmul(
+                x=[hidden_states],
+                weight=[w1],
+                scale=[w1_scale.to(w2_scale.dtype)],
+                bias=bias1,
+                per_token_scale=[pertoken_scale],
+                split_item=2,
+                group_list_type=group_list_type,
+                group_type=0,
+                group_list=group_list,
+                output_dtype=_output_dtype)[0]
+            # act_fn: swiglu
+            hidden_states = torch_npu.npu_swiglu(hidden_states)
+            hidden_states, swiglu_out_scale = torch_npu.npu_dynamic_quant(
+                hidden_states)
        # gmm2: down_proj
        hidden_states = torch_npu.npu_grouped_matmul(
            x=[hidden_states],
@@ -127,17 +172,22 @@ def quant_apply_mlp(hidden_states: torch.Tensor,
            group_type=0,
            group_list=group_list,
            output_dtype=_output_dtype)[0]
+
    return hidden_states


-def unquant_apply_mlp(
-        hidden_states: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        group_list: torch.Tensor,
-        group_list_type: int = 1,
-        topk_scales: Optional[torch.Tensor] = None) -> torch.Tensor:
-    w1 = w1.transpose(1, 2)
+def unquant_apply_mlp(hidden_states: torch.Tensor,
+                      w1: torch.Tensor,
+                      w2: torch.Tensor,
+                      group_list: torch.Tensor,
+                      group_list_type: int = 1,
+                      topk_scales: Optional[torch.Tensor] = None,
+                      need_trans: bool = True) -> torch.Tensor:
+
+    if need_trans:
+        w1 = w1.transpose(1, 2)
+        w2 = w2.transpose(1, 2)
+
    gate_up_out = torch_npu.npu_grouped_matmul(
        x=[hidden_states],
        weight=[w1],
@@ -155,7 +205,6 @@ def unquant_apply_mlp(
    if topk_scales is not None:
        gate_up_out *= topk_scales

-    w2 = w2.transpose(1, 2)
    hidden_states = torch_npu.npu_grouped_matmul(
        x=[gate_up_out],
        weight=[w2],
@@ -178,7 +227,9 @@ def unified_apply_mlp(hidden_states: torch.Tensor,
                      w1_scale_bias: torch.Tensor = None,
                      w2_scale_bias: torch.Tensor = None,
                      topk_scales: Optional[torch.Tensor] = None,
-                      with_quant: bool = False) -> torch.Tensor:
+                      with_quant: bool = False,
+                      fusion: bool = False,
+                      need_trans: bool = True) -> torch.Tensor:
    if with_quant:
        return quant_apply_mlp(hidden_states=hidden_states,
                               w1=w1,
@@ -189,11 +240,13 @@ def unified_apply_mlp(hidden_states: torch.Tensor,
                               dynamic_scale=dynamic_scale,
                               group_list_type=group_list_type,
                               w1_scale_bias=w1_scale_bias,
-                               w2_scale_bias=w2_scale_bias)
+                               w2_scale_bias=w2_scale_bias,
+                               fusion=fusion)
    else:
        return unquant_apply_mlp(hidden_states=hidden_states,
                                 w1=w1,
                                 w2=w2,
                                 group_list=group_list,
                                 group_list_type=group_list_type,
-                                 topk_scales=topk_scales)
+                                 topk_scales=topk_scales,
+                                 need_trans=need_trans)
--- a/vllm_ascend/ops/moe_dispatcher/token_dispatcher.py
+++ b/vllm_ascend/ops/moe_dispatcher/token_dispatcher.py
@@ -22,42 +22,17 @@
 # limitations under the License.

 from abc import ABC, abstractmethod
-from typing import Any, Dict, Optional
+from typing import Any, Optional

 import torch
 import torch_npu
 from vllm.distributed.parallel_state import get_ep_group

 from vllm_ascend.distributed.parallel_state import get_mc2_group
-from vllm_ascend.distributed.tensor_parallel import \
-    gather_from_sequence_parallel_region
-from vllm_ascend.ops.comm_utils import async_all_to_all
+from vllm_ascend.ops.moe.comm_utils import (
+    async_all_to_all, gather_from_sequence_parallel_region)
 from vllm_ascend.utils import AscendSocVersion, get_ascend_soc_version

-_Dispatchers: Dict[str, Any] = {}
-
-
-def _register_token_dispatcher(dispatcher: Any):
-    _Dispatchers[dispatcher.__class__.__name__] = dispatcher
-
-
-def get_token_dispatcher(name: str):
-    return _Dispatchers.get(name)
-
-
-def setup_token_dispatchers(ep_size: int, **kwargs):
-    existing_dispatchers = set(_Dispatchers.keys())
-
-    if ep_size == 1 and "TokenDispatcherWithAllGather" not in existing_dispatchers:
-        _register_token_dispatcher(TokenDispatcherWithAllGather(**kwargs))
-    elif ep_size < 16 and "TokenDispatcherWithAll2AllV" not in existing_dispatchers:
-        _register_token_dispatcher(TokenDispatcherWithAll2AllV(**kwargs))
-    elif ep_size >= 16:
-        if "TokenDispatcherWithAll2AllV" not in existing_dispatchers:
-            _register_token_dispatcher(TokenDispatcherWithAll2AllV(**kwargs))
-        if "TokenDispatcherWithMC2" not in existing_dispatchers:
-            _register_token_dispatcher(TokenDispatcherWithMC2(**kwargs))
-

 class MoETokenDispatcher(ABC):

@@ -90,9 +65,9 @@ class MoETokenDispatcher(ABC):
                       expert_map: Optional[torch.Tensor] = None,
                       log2phy: Optional[torch.Tensor] = None,
                       global_redundant_expert_num: int = 0,
-                       shared_experts: Optional[torch.Tensor] = None,
-                       shared_gate_up: Optional[torch.Tensor] = None,
-                       shared_dequant_scale: Optional[torch.Tensor] = None,
+                       shared_experts: Optional[Any] = None,
+                       quantized_x_for_share: Optional[Any] = None,
+                       dynamic_scale_for_share: Optional[Any] = None,
                       mc2_mask: Optional[torch.Tensor] = None,
                       apply_router_weight_on_input: bool = False,
                       with_quant: bool = False):
@@ -158,6 +133,7 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
            "shared_expert_rank_num": 0,
            "moe_expert_num": moe_expert_num,
            "global_bs": 0,
+            "expert_token_nums_type": 0,
        }

        stage1_kwargs = {
@@ -189,9 +165,9 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
                       expert_map: Optional[torch.Tensor] = None,
                       log2phy: Optional[torch.Tensor] = None,
                       global_redundant_expert_num: int = 0,
-                       shared_experts: Optional[torch.Tensor] = None,
-                       shared_gate_up: Optional[torch.Tensor] = None,
-                       shared_dequant_scale: Optional[torch.Tensor] = None,
+                       shared_experts: Optional[Any] = None,
+                       quantized_x_for_share: Optional[Any] = None,
+                       dynamic_scale_for_share: Optional[Any] = None,
                       mc2_mask: Optional[torch.Tensor] = None,
                       apply_router_weight_on_input: bool = False,
                       with_quant: bool = False):
@@ -215,6 +191,11 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):

        if self.with_quant:
            if shared_experts is not None:
+                share_up_out, _ = shared_experts.gate_up_proj(
+                    (quantized_x_for_share, dynamic_scale_for_share))
+                shared_gate_up, shared_dequant_scale = share_up_out[
+                    0], share_up_out[1]
+
                shared_act_out = shared_experts.act_fn(
                    (shared_gate_up, shared_dequant_scale))
                self.shared_act, self.swiglu_out_scale = \
@@ -224,7 +205,7 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
            if shared_experts is not None:
                shared_gate_up, _ = shared_experts.gate_up_proj(hidden_states)
                self.shared_act = shared_experts.act_fn(shared_gate_up)
-        group_list_type = 1
+        group_list_type = 0
        return {
            "group_list_type": group_list_type,
            "hidden_states": expand_x,
@@ -291,6 +272,16 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
            **kwargs_mc2
        ) if self.enable_dispatch_v2 else torch_npu.npu_moe_distribute_combine(
            **kwargs_mc2)
+
+        # these values are no longer used, so they need to be set to None for memory release.
+        self.output = None
+        self.assist_info_for_combine = None
+        self.ep_recv_counts = None
+        self.topk_ids = None
+        self.topk_weights = None
+        self.mc2_mask = None
+        self.expert_map = None
+
        if self.shared_experts is None:
            return hidden_states
        else:
@@ -300,6 +291,9 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
            else:
                shared_hidden_states, _ = self.shared_experts.down_proj(
                    self.shared_act)
+            self.shared_act = None
+            self.shared_experts = None
+            self.swiglu_out_scale = None
            return hidden_states, shared_hidden_states


@@ -328,9 +322,9 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher):
                       expert_map: Optional[torch.Tensor] = None,
                       log2phy: Optional[torch.Tensor] = None,
                       global_redundant_expert_num: int = 0,
-                       shared_experts: Optional[torch.Tensor] = None,
-                       shared_gate_up: Optional[torch.Tensor] = None,
-                       shared_dequant_scale: Optional[torch.Tensor] = None,
+                       shared_experts: Optional[Any] = None,
+                       quantized_x_for_share: Optional[Any] = None,
+                       dynamic_scale_for_share: Optional[Any] = None,
                       mc2_mask: Optional[torch.Tensor] = None,
                       apply_router_weight_on_input: bool = False,
                       with_quant: bool = False):
@@ -338,8 +332,6 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher):
        self.original_shape = hidden_states.shape

        num_tokens = hidden_states.shape[:-1].numel()
-        dtype = hidden_states.dtype
-        device = hidden_states.device
        self.expert_map = expert_map
        self.topk_weights = topk_weights
        self.topk_ids = topk_ids
@@ -353,144 +345,65 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher):
            ), "Only support topk=1 when `apply_router_weight_on_input` is True"
            hidden_states = hidden_states * \
                topk_weights.to(hidden_states.dtype)
-
        if expert_map is not None:
-            # Generate token indices and flatten
-            token_indices = (torch.arange(
-                num_tokens, device=device,
-                dtype=torch.int64).unsqueeze(1).expand(-1,
-                                                       self.top_k).reshape(-1))
-
-            # Flatten token-to-expert mappings and map to local experts
-            weights_flat = topk_weights.view(-1)
-            experts_flat = topk_ids.view(-1)
-            local_experts_flat = expert_map[experts_flat]
-
-            # Filter valid token-expert pairs
-            self.mask = local_experts_flat != -1
-            filtered_weights = torch.where(
-                self.mask, weights_flat,
-                torch.zeros_like(weights_flat)).to(dtype)
-            filtered_experts = torch.where(
-                self.mask, local_experts_flat,
-                torch.full_like(local_experts_flat,
-                                self.num_experts_local)).to(topk_ids.dtype)
-
-            # Sort by local expert IDs
-            sort_indices = torch.argsort(filtered_experts.view(torch.float32))
-            self.sorted_token_indices = token_indices[sort_indices]
-            self.sorted_weights = filtered_weights[sort_indices]
-
-            # Compute token counts with minlength of num_experts
-            # This is equivalent to but faster than:
-            # >>> token_counts = torch.bincount(filtered_experts, minlength=num_experts)[:-1]
-            token_counts = torch.zeros(self.num_experts_local + 1,
-                                       device=device,
-                                       dtype=torch.int64)
-            ones = torch.ones_like(filtered_experts, dtype=torch.int64)
-            token_counts.scatter_add_(0, filtered_experts.to(torch.int64),
-                                      ones)
-            token_counts = token_counts[:self.num_experts_local]
-
-            # Rearrange hidden_states
-            sorted_hidden_states = hidden_states[self.sorted_token_indices]
-            if self.with_quant:
-                group_list_type = 1
-                expert_tokens = token_counts
-            else:
-                expert_tokens = torch.cumsum(token_counts,
-                                             dim=0,
-                                             dtype=torch.int64)
-                group_list_type = 0
+            global_num_experts = len(expert_map)
+            mask = (expert_map[topk_ids] != -1)
+            self.topk_weights = topk_weights * mask
+            first_expert_idx = get_ep_group(
+            ).rank_in_group * self.num_experts_local
+            last_expert_idx = first_expert_idx + self.num_experts_local
        else:
-            active_num = self.max_num_tokens if self.max_num_tokens is not None else num_tokens
-            sorted_hidden_states, self.expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing(
-                hidden_states,
-                row_idx=row_idx,
-                expert_idx=topk_ids,
-                active_num=active_num)
+            first_expert_idx = 0
+            last_expert_idx = self.num_experts_local
+            global_num_experts = self.num_experts_local

-            expert_tokens = torch_npu.npu_moe_compute_expert_tokens(
-                expanded_expert_idx, self.num_experts_local)
-            expert_tokens = expert_tokens.to(torch.int64)
-            group_list_type = 0
+        sorted_hidden_states, self.expanded_row_idx, expert_tokens, pertoken_scale = (
+            torch_npu.npu_moe_init_routing_v2(
+                hidden_states,
+                topk_ids,
+                active_num=num_tokens * self.top_k,
+                expert_num=global_num_experts,
+                expert_tokens_num_type=1,
+                expert_tokens_num_flag=True,
+                active_expert_range=[first_expert_idx, last_expert_idx],
+                quant_mode=1 if self.with_quant else -1,
+            ))
+        expert_tokens = expert_tokens.to(torch.int64)
+        group_list_type = 1  # `count` mode
        return {
            "group_list_type": group_list_type,
            "hidden_states": sorted_hidden_states,
            "group_list": expert_tokens,
+            "dynamic_scale": pertoken_scale if self.with_quant else None,
        }

    def token_combine(self,
                      hidden_states: torch.Tensor,
                      bias: torch.Tensor = None):
        assert self.original_shape is not None
-        dtype = hidden_states.dtype
-        device = hidden_states.device
-        if self.expert_map is not None:
-            assert self.mask is not None
-            assert self.sorted_token_indices is not None
-            assert self.sorted_weights is not None
+        final_hidden_states = torch_npu.npu_moe_token_unpermute(
+            permuted_tokens=hidden_states,
+            sorted_indices=self.expanded_row_idx,
+            probs=self.topk_weights)
+        if len(self.original_shape) == 3:
+            final_hidden_states = final_hidden_states.view(self.original_shape)

-            weighted_down_out = hidden_states * \
-                self.sorted_weights.unsqueeze(1)
-
-            final_hidden_states = torch.zeros(*self.original_shape,
-                                              device=hidden_states.device,
-                                              dtype=hidden_states.dtype)
-
-            # TODO: npu_grouped_matmul output random values at [num_valid_tokens:, ...]
-            # This created multiple NaN and index_add_ will mix them up which harms accuracy
-            # remove this mask and filter after it being fixed
-            num_valid_tokens = self.mask.sum()
-            valid_token_mask = torch.arange(
-                0, self.sorted_token_indices.shape[0],
-                device=device).unsqueeze(1) < num_valid_tokens
-            valid_output = torch.where(
-                valid_token_mask, weighted_down_out,
-                torch.zeros_like(weighted_down_out)).to(dtype)
-            final_hidden_states.index_add_(0, self.sorted_token_indices,
-                                           valid_output)
-        else:
-            if self.with_quant:
-                final_hidden_states = torch_npu.npu_moe_finalize_routing(
-                    hidden_states,
-                    skip1=None,
-                    skip2=None,
-                    bias=None,
-                    scales=self.topk_weights,
-                    expanded_src_to_dst_row=self.expanded_row_idx,
-                    export_for_source_row=self.topk_ids,
-                )
-                if len(self.original_shape) == 3:
-                    final_hidden_states = final_hidden_states.view(
-                        self.original_shape)
-            else:
-                scales = torch.ones_like(
-                    self.topk_weights
-                ) if self.apply_router_weight_on_input else self.topk_weights
-                # TODO: Reorder device memory 2 times here, replace the current
-                # implementation here when suitable operators become available.
-                final_hidden_states = torch_npu.npu_moe_finalize_routing(
-                    hidden_states,
-                    skip1=None,
-                    skip2=None,
-                    bias=None,
-                    scales=scales,
-                    expanded_src_to_dst_row=self.expanded_row_idx,
-                    export_for_source_row=self.topk_ids,
-                )
+        # these values are no longer used, so they need to be set to None for memory release.
+        self.expert_map = None
+        self.topk_weights = None
+        self.topk_ids = None
+        self.expanded_row_idx = None
        return final_hidden_states


 # mypy: disable-error-code="override"
-class UnquantizedTokenDispatcherWithFusedExpertsMoge(MoETokenDispatcher):
+class TokenDispatcherWithMoge(MoETokenDispatcher):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.apply_router_weight_on_input = False
-        self.local_ep = 1
-        self.local_num_experts = self.num_experts // self.local_ep
-        self.local_num_group = self.top_k // self.local_ep
+        self.local_num_experts = self.num_experts // self.ep_size
+        self.local_num_group = self.top_k // self.ep_size
        self.bsz = None

    def token_dispatch(self,
@@ -501,23 +414,12 @@ class UnquantizedTokenDispatcherWithFusedExpertsMoge(MoETokenDispatcher):
                       expert_map: Optional[torch.Tensor] = None,
                       log2phy: Optional[torch.Tensor] = None,
                       global_redundant_expert_num: int = 0,
-                       shared_experts: Optional[torch.Tensor] = None,
-                       shared_gate_up: Optional[torch.Tensor] = None,
-                       shared_dequant_scale: Optional[torch.Tensor] = None,
+                       shared_experts: Optional[Any] = None,
+                       quantized_x_for_share: Optional[Any] = None,
+                       dynamic_scale_for_share: Optional[Any] = None,
                       mc2_mask: Optional[torch.Tensor] = None,
                       apply_router_weight_on_input: bool = False,
                       with_quant: bool = False):
-        self.apply_router_weight_on_input = apply_router_weight_on_input
-        if self.apply_router_weight_on_input:
-            assert (topk_weights.dim() == 2
-                    ), "`topk_weights` should be in shape (num_tokens, topk)"
-            _, topk = topk_weights.shape
-            assert (
-                topk == 1
-            ), "Only support topk=1 when `apply_router_weight_on_input` is True"
-            hidden_states = hidden_states * \
-                topk_weights.to(hidden_states.dtype)
-
        self.bsz, _ = hidden_states.shape
        flatten_topk_ids = topk_ids.view(-1)
        self.sorted_topk_ids = torch.argsort(flatten_topk_ids.float())
@@ -551,7 +453,7 @@ class UnquantizedTokenDispatcherWithFusedExpertsMoge(MoETokenDispatcher):
        unsorted_hidden_states = hidden_states.index_select(
            0, unsorted_topk_ids)
        final_hidden_states = unsorted_hidden_states.reshape(
-            self.bsz, self.top_k // self.local_ep, -1).sum(1)
+            self.bsz, self.top_k // self.ep_size, -1).sum(1)
        return final_hidden_states


@@ -613,9 +515,9 @@ class TokenDispatcherWithAll2AllV(MoETokenDispatcher):
                       expert_map: Optional[torch.Tensor] = None,
                       log2phy: Optional[torch.Tensor] = None,
                       global_redundant_expert_num: int = 0,
-                       shared_experts: Optional[torch.Tensor] = None,
-                       shared_gate_up: Optional[torch.Tensor] = None,
-                       shared_dequant_scale: Optional[torch.Tensor] = None,
+                       shared_experts: Optional[Any] = None,
+                       quantized_x_for_share: Optional[Any] = None,
+                       dynamic_scale_for_share: Optional[Any] = None,
                       mc2_mask: Optional[torch.Tensor] = None,
                       apply_router_weight_on_input: bool = False,
                       with_quant: bool = False):
@@ -681,9 +583,14 @@ class TokenDispatcherWithAll2AllV(MoETokenDispatcher):

        output = self._combine_postprocess(permutated_local_input_tokens)

+        # These values are no longer needed; set them to None so their memory can be released.
        self.input_splits = None
        self.output_splits = None
        self.num_global_tokens_per_local_expert = None
+        self.topk_weights = None
+        self.reversed_local_input_permutation_mapping = None
+        self.reversed_global_input_permutation_mapping = None
+        self.global_input_tokens_local_experts_indices = None

        return output

@@ -745,6 +652,10 @@ class TokenDispatcherWithAll2AllV(MoETokenDispatcher):
            self.global_input_tokens_local_experts_indices = torch.repeat_interleave(
                self.expert_ids_per_ep_rank,
                self.num_global_tokens_per_local_expert.ravel())
+        else:
+            # TODO: This full synchronization can be a performance bottleneck.
+            # A more granular sync (e.g., blocking D2H copies) should be investigated.
+            torch.npu.synchronize()

        return num_tokens_per_local_expert

--- a/vllm_ascend/ops/moe_dispatcher/init.py
+++ b/vllm_ascend/ops/moe_dispatcher/init.py
--- a/vllm_ascend/ops/register_custom_ops.py
+++ b/vllm_ascend/ops/register_custom_ops.py
@@ -0,0 +1,201 @@
+import torch
+import torch.nn.functional as F
+import torch_npu
+from vllm.distributed import (get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size,
+                              tensor_model_parallel_all_gather,
+                              tensor_model_parallel_all_reduce,
+                              tensor_model_parallel_reduce_scatter)
+from vllm.forward_context import get_forward_context
+from vllm.utils import direct_register_custom_op
+
+import vllm_ascend.envs as envs_ascend
+from vllm_ascend.ascend_forward_context import MoECommType
+
+
+def _maybe_chunk_residual_impl(x: torch.Tensor,
+                               residual: torch.Tensor) -> torch.Tensor:
+    try:
+        forward_context = get_forward_context()
+    except AssertionError:
+        return residual
+
+    if x.size(0) != residual.size(0):
+        sp_enabled = forward_context.sp_enabled
+        assert sp_enabled is True, ("Currently, this situation only occurs "
+                                    "when sp is enabled")
+        pad_size = forward_context.pad_size
+        if pad_size > 0:
+            residual = F.pad(residual, (0, 0, 0, pad_size))
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        residual = torch.chunk(residual, tp_size, dim=0)[tp_rank]
+
+    return residual
+
+
+def _maybe_all_gather_and_maybe_unpad_impl(x: torch.Tensor,
+                                           label: bool) -> torch.Tensor:
+    try:
+        forward_context = get_forward_context()
+    except AssertionError:
+        return x
+
+    sp_enabled = forward_context.sp_enabled
+    if sp_enabled and label:
+        x = tensor_model_parallel_all_gather(x, 0)
+        pad_size = forward_context.pad_size
+        if pad_size > 0:
+            x = x[:-pad_size, :]
+    return x
+
+
+def _maybe_pad_and_reduce_impl(x: torch.Tensor) -> torch.Tensor:
+    try:
+        forward_context = get_forward_context()
+    except AssertionError:
+        return tensor_model_parallel_all_reduce(x)
+
+    sp_enabled = forward_context.sp_enabled
+    if sp_enabled:
+        pad_size = forward_context.pad_size
+        if pad_size > 0:
+            x = F.pad(x, (0, 0, 0, pad_size))
+        return tensor_model_parallel_reduce_scatter(x, 0)
+    else:
+        return tensor_model_parallel_all_reduce(x)
+
+
+def _maybe_prefetch_mlp_gate_up_proj_impl(x_dependency: torch.Tensor,
+                                          prefix: str) -> None:
+    try:
+        forward_context = get_forward_context()
+    except AssertionError:
+        return
+
+    if not forward_context.prefetch_mlp_enabled:
+        return
+    model_instance = forward_context.model_instance
+    prefetch_stream = forward_context.prefetch_stream
+    layer_idx = int(prefix.split('.')[2])
+
+    # start point of gate_up_proj weight prefetch
+    if prefix.split('.')[-2] == "self_attn":
+        forward_context.prefetch_mlp_gate_up_proj = True
+    if forward_context.prefetch_mlp_gate_up_proj:
+        prefetch_stream.wait_stream(torch.npu.current_stream())
+
+        with torch.npu.stream(prefetch_stream):
+            mlp_gate_up_prefetch_size = envs_ascend.VLLM_ASCEND_MLP_GATE_UP_PREFETCH_SIZE
+            torch_npu.npu_prefetch(model_instance.model.layers[layer_idx].mlp.gate_up_proj.weight, \
+                                x_dependency, mlp_gate_up_prefetch_size)
+    return
+
+
+def _maybe_prefetch_mlp_gate_up_proj_impl_fake(x_dependency: torch.Tensor,
+                                               prefix: str) -> None:
+    return
+
+
+def _maybe_prefetch_mlp_down_proj_impl(x_dependency: torch.Tensor) -> None:
+    try:
+        forward_context = get_forward_context()
+    except AssertionError:
+        return
+
+    if not forward_context.prefetch_mlp_enabled:
+        return
+    forward_context.prefetch_mlp_down_proj = True
+    model_instance = forward_context.model_instance
+    prefetch_stream = forward_context.prefetch_stream
+    layer_idx = forward_context.layer_idx
+
+    # start point of down_proj weight prefetch
+    prefetch_stream.wait_stream(torch.npu.current_stream())
+
+    with torch.npu.stream(prefetch_stream):
+        mlp_down_prefetch_size = envs_ascend.VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE
+        torch_npu.npu_prefetch(model_instance.model.layers[layer_idx].mlp.down_proj.weight, \
+                            x_dependency, mlp_down_prefetch_size)
+    forward_context.layer_idx += 1
+    return
+
+
+def _maybe_prefetch_mlp_down_proj_impl_fake(
+        x_dependency: torch.Tensor) -> None:
+    return
+
+
+def _maybe_wait_prefetch_done_impl(x: torch.Tensor) -> None:
+    try:
+        forward_context = get_forward_context()
+    except AssertionError:
+        return
+
+    if not forward_context.prefetch_mlp_enabled:
+        return
+    if forward_context.prefetch_mlp_gate_up_proj or \
+        forward_context.prefetch_mlp_down_proj:
+        prefetch_stream = forward_context.prefetch_stream
+        # wait until prefetch done
+        torch.npu.current_stream().wait_stream(prefetch_stream)
+        forward_context.prefetch_mlp_gate_up_proj = False
+        forward_context.prefetch_mlp_down_proj = False
+    return
+
+
+def _maybe_wait_prefetch_done_impl_fake(x: torch.Tensor) -> None:
+    return
+
+
+def _maybe_all_reduce_tensor_model_parallel_impl(
+        final_hidden_states: torch.Tensor) -> torch.Tensor:
+    forward_context = get_forward_context()
+    moe_comm_type = forward_context.moe_comm_type
+    if moe_comm_type in {MoECommType.ALLTOALL, MoECommType.MC2}:
+        return final_hidden_states
+    else:
+        return tensor_model_parallel_all_reduce(final_hidden_states)
+
+
+direct_register_custom_op(op_name="maybe_chunk_residual",
+                          op_func=_maybe_chunk_residual_impl,
+                          fake_impl=lambda x, residual: residual,
+                          mutates_args=[],
+                          dispatch_key="PrivateUse1")
+
+direct_register_custom_op(op_name="maybe_all_gather_and_maybe_unpad",
+                          op_func=_maybe_all_gather_and_maybe_unpad_impl,
+                          fake_impl=lambda x, label: x,
+                          mutates_args=[],
+                          dispatch_key="PrivateUse1")
+
+direct_register_custom_op(op_name="maybe_pad_and_reduce",
+                          op_func=_maybe_pad_and_reduce_impl,
+                          fake_impl=lambda x: x,
+                          mutates_args=[],
+                          dispatch_key="PrivateUse1")
+
+direct_register_custom_op(op_name="maybe_prefetch_mlp_gate_up_proj",
+                          op_func=_maybe_prefetch_mlp_gate_up_proj_impl,
+                          fake_impl=_maybe_prefetch_mlp_gate_up_proj_impl_fake,
+                          mutates_args=[],
+                          dispatch_key="PrivateUse1")
+
+direct_register_custom_op(op_name="maybe_prefetch_mlp_down_proj",
+                          op_func=_maybe_prefetch_mlp_down_proj_impl,
+                          fake_impl=_maybe_prefetch_mlp_down_proj_impl_fake,
+                          mutates_args=[],
+                          dispatch_key="PrivateUse1")
+
+direct_register_custom_op(op_name="maybe_wait_prefetch_done",
+                          op_func=_maybe_wait_prefetch_done_impl,
+                          fake_impl=_maybe_wait_prefetch_done_impl_fake,
+                          mutates_args=[],
+                          dispatch_key="PrivateUse1")
+
+direct_register_custom_op(op_name="maybe_all_reduce_tensor_model_parallel",
+                          op_func=_maybe_all_reduce_tensor_model_parallel_impl,
+                          fake_impl=lambda x: x,
+                          mutates_args=[],
+                          dispatch_key="PrivateUse1")
--- a/vllm_ascend/ops/rotary_embedding.py
+++ b/vllm_ascend/ops/rotary_embedding.py
@@ -20,6 +20,7 @@ from typing import Optional, Tuple

 import torch
 import torch_npu
+from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.rotary_embedding import (
    DeepseekScalingRotaryEmbedding, RotaryEmbedding)

@@ -37,34 +38,39 @@ def _rope_forward_oot(
    positions: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
-    offsets: Optional[torch.Tensor] = None,
-    is_neox_style_override: Optional[bool] = None,
+    is_neox_style: bool,
+    offsets: Optional[torch.Tensor] = None
 ) -> Tuple[torch.Tensor, torch.Tensor]:
    query_shape, key_shape = query.shape, key.shape
    if self.cos_sin_cache.device != query.device:
        self.cos_sin_cache = self.cos_sin_cache.to(query.device)
    if self.cos_sin_cache.dtype != query.dtype:
        self.cos_sin_cache = self.cos_sin_cache.to(query.dtype)
-    neox_style = self.is_neox_style
-    if is_neox_style_override is not None:
-        neox_style = is_neox_style_override
    # adopt custom kernel path for rotary_embedding
-    if _custom_rotary_embedding_enabled(query, neox_style,
+    if _custom_rotary_embedding_enabled(query, is_neox_style,
                                        self.head_size) and not is_310p():
-        query, key = torch.ops._C.rotary_embedding(
+        query, key = torch.ops._C_ascend.rotary_embedding(
            positions,
            query,
            key,
            self.head_size,
            self.cos_sin_cache,
-            neox_style,
+            is_neox_style,
        )
        return query.view(query_shape), key.view(key_shape)
    if offsets is not None:
        raise NotImplementedError(
            "Batched rotary embedding is currently not supported on NPU.")
    else:
-        if self.rotary_dim < self.head_size:
+        if self.cos is not None and \
+            self.sin is not None:
+            # If cos and sin were generated outside the layers, use npu_apply_rotary_pos_emb to avoid redundant calculation.
+            # This path requires head_size and rotary_dim to both equal 128 and is_neox_style to be True.
+            query = query.contiguous().view(1, query.shape[0], -1,
+                                            self.head_size)
+            key = key.contiguous().view(1, key.shape[0], -1, self.head_size)
+            torch_npu.npu_apply_rotary_pos_emb(query, key, self.cos, self.sin)
+        elif self.rotary_dim < self.head_size:
            num_tokens = query.shape[0]
            query = query.view(num_tokens, -1, self.head_size)
            key = key.view(num_tokens, -1, self.head_size)
@@ -80,25 +86,26 @@ def _rope_forward_oot(
                k_rot,
                self.head_size,
                self.cos_sin_cache,
-                neox_style,
+                is_neox_style,
            )
            q_rot = q_rot.view(num_tokens, -1, self.rotary_dim)
            k_rot = k_rot.view(num_tokens, -1, self.rotary_dim)
            q = torch.cat((q_rot, q_pass), dim=-1).reshape(query_shape)
            k = torch.cat((k_rot, k_pass), dim=-1).reshape(key_shape)
            return q, k
-        # TODO: Remove the contiguous in the future.
-        query = query.contiguous().view(query.shape[0], -1)
-        key = key.contiguous().view(key.shape[0], -1)
-        torch_npu._npu_rotary_embedding(
-            positions,
-            query,
-            key,
-            self.head_size,
-            self.cos_sin_cache,
-            neox_style,
-        )
-    return query.view(query_shape), key.view(key_shape)
+        else:
+            # TODO: Remove the contiguous in the future.
+            query = query.contiguous().view(query.shape[0], -1)
+            key = key.contiguous().view(key.shape[0], -1)
+            torch_npu._npu_rotary_embedding(
+                positions,
+                query,
+                key,
+                self.head_size,
+                self.cos_sin_cache,
+                is_neox_style,
+            )
+        return query.view(query_shape), key.view(key_shape)


 class AscendRotaryEmbedding(RotaryEmbedding):
@@ -112,6 +119,8 @@ class AscendRotaryEmbedding(RotaryEmbedding):
        is_neox_style: bool,
        dtype: torch.dtype,
    ) -> None:
+        self.cos = None
+        self.sin = None
        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
                         is_neox_style, dtype)

@@ -123,14 +132,25 @@ class AscendRotaryEmbedding(RotaryEmbedding):
        offsets: Optional[torch.Tensor] = None,
        is_neox_style_override: Optional[bool] = None,
    ):
-        return _rope_forward_oot(
-            self,
-            positions,
-            query,
-            key,
-            offsets,
-            is_neox_style_override,
-        )
+        is_neox_style = self.is_neox_style
+        if is_neox_style_override is not None:
+            is_neox_style = is_neox_style_override
+        forward_context = get_forward_context()
+        is_first_layer = forward_context.is_first_layer
+        # Generate cos and sin outside layers to avoid repeated calculation.
+        if is_neox_style and self.head_size == 128 and self.cos_sin_cache.shape[
+                -1] == 128:
+            if is_first_layer:
+                cos_sin = self.cos_sin_cache.index_select(0, positions)
+                last_dim = cos_sin.size()[-1]
+                cos, sin = cos_sin.reshape(-1, 2, last_dim // 2).repeat(
+                    1, 1, 2).chunk(2, dim=-2)
+                # BSNH
+                self.cos = cos.view(1, -1, 1, last_dim).contiguous()
+                self.sin = sin.view(1, -1, 1, last_dim).contiguous()
+                forward_context.is_first_layer = False
+        return _rope_forward_oot(self, positions, query, key, is_neox_style,
+                                 offsets)


 class AscendDeepseekScalingRotaryEmbedding(DeepseekScalingRotaryEmbedding):
@@ -168,8 +188,10 @@ class AscendDeepseekScalingRotaryEmbedding(DeepseekScalingRotaryEmbedding):
        super(DeepseekScalingRotaryEmbedding,
              self).__init__(head_size, rotary_dim, max_position_embeddings,
                             base, is_neox_style, dtype)
-        self.max_seq_len = max_position_embeddings
-        self._set_cos_sin_cache(seq_len=max_position_embeddings,
+
+        # NOTE: Reorder the sin and cos cache for Ascend-friendly computation.
+        self.max_seq_len = math.ceil(max_position_embeddings * scaling_factor)
+        self._set_cos_sin_cache(self.max_seq_len,
                                device=NPUPlatform.device_type,
                                dtype=dtype)

@@ -275,8 +297,7 @@ class AscendDeepseekScalingRotaryEmbedding(DeepseekScalingRotaryEmbedding):

        return q_embed, k_embed

-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
+    def _set_cos_sin_cache(self, max_seq_len, device, dtype):
        dim = self.rotary_dim

        freq_extra = 1.0 / (self.base**(
@@ -297,9 +318,7 @@ class AscendDeepseekScalingRotaryEmbedding(DeepseekScalingRotaryEmbedding):
                                 inv_freq_mask) + freq_extra * inv_freq_mask
        self.register_buffer("inv_freq", inv_freq, persistent=False)

-        t = torch.arange(seq_len * self.scaling_factor,
-                         device=device,
-                         dtype=torch.float32)
+        t = torch.arange(max_seq_len, device=device, dtype=torch.float32)

        freqs = torch.outer(t, inv_freq)
        cos_cached = torch.cat([freqs, freqs], dim=-1).cos() * self.mscale
@@ -317,16 +336,13 @@ class AscendDeepseekScalingRotaryEmbedding(DeepseekScalingRotaryEmbedding):
                positions: torch.Tensor,
                query: torch.Tensor,
                key: torch.Tensor,
-                offsets: Optional[torch.Tensor] = None,
-                max_seq_len: Optional[int] = None):
-        if max_seq_len is not None and max_seq_len > self.max_seq_len:
-            self._set_cos_sin_cache(max_seq_len, query.device, query.dtype)
+                offsets: Optional[torch.Tensor] = None):
        if len(key.shape) == 2:
            key = key[:, None, :]
        # Note: we implement the non neox_style method with shuffle the last dim and neox style
        # calculation method which is also more compute friendly to the ascend machine
        # https://huggingface.co/deepseek-ai/DeepSeek-V3-0324/blob/main/modeling_deepseek.py
-        neox_style = True
+        is_neox_style = True
        if self.is_neox_style is False:
            b, h_q, d = query.shape
            query = query.view(b, h_q, d // 2,
@@ -334,6 +350,6 @@ class AscendDeepseekScalingRotaryEmbedding(DeepseekScalingRotaryEmbedding):
            b, h_k, d = key.shape
            key = key.view(b, h_k, d // 2, 2).transpose(3,
                                                        2).reshape(b, h_k, d)
-        q_pe, k_pe = _rope_forward_oot(self, positions, query, key, offsets,
-                                       neox_style)
+        q_pe, k_pe = _rope_forward_oot(self, positions, query, key,
+                                       is_neox_style, offsets)
        return q_pe, k_pe
--- a/vllm_ascend/ops/sequence_parallel.py
+++ b/vllm_ascend/ops/sequence_parallel.py
@@ -1,120 +0,0 @@
-import torch
-from torch.nn import functional as F
-from vllm.distributed import (get_tensor_model_parallel_world_size,
-                              get_tp_group, tensor_model_parallel_all_gather,
-                              tensor_model_parallel_reduce_scatter)
-from vllm.forward_context import get_forward_context
-
-from vllm_ascend.platform import NPUPlatform
-
-
-class MetadataForPadding:
-
-    def __init__(self,
-                 padding_flag=False,
-                 lengths_sum_padding=0,
-                 lengths_sum_unpadding=0,
-                 pad_size=0,
-                 not_dummy_and_is_prefill=False):
-        self.padding_flag = padding_flag
-        self.not_dummy_and_is_prefill = not_dummy_and_is_prefill
-
-        self.lengths_sum_padding = lengths_sum_padding
-        self.lengths_sum_unpadding = lengths_sum_unpadding
-        self.pad_size = pad_size
-
-        self.tp_size = get_tp_group().world_size
-        self.tp_rank_in_group = get_tp_group().rank_in_group
-
-        assert self.lengths_sum_padding % self.tp_size == 0
-        self.slice_size = self.lengths_sum_padding // self.tp_size
-
-        self.mc2_mask = torch.zeros(
-            self.lengths_sum_padding,
-            dtype=torch.bool,
-            device=NPUPlatform.device_type,
-        )
-        self.mc2_mask[:lengths_sum_unpadding] = True
-
-    def padding_aligned_reduce_scatter(self,
-                                       data: torch.Tensor) -> torch.Tensor:
-        if self.padding_flag:
-            pad_size = self.pad_size
-            padded_data = F.pad(data, (0, 0, 0, pad_size))
-        else:
-            padded_data = data
-        padded_data_reduce_scatter = tensor_model_parallel_reduce_scatter(
-            padded_data, 0)
-
-        return padded_data_reduce_scatter
-
-    def allgather_unpadding_aligned(self,
-                                    padded_data: torch.Tensor) -> torch.Tensor:
-        padded_data_allgather = tensor_model_parallel_all_gather(
-            padded_data, 0)
-        if self.padding_flag:
-            lengths_sum_unpadding = self.lengths_sum_unpadding
-            unpadding_data = padded_data_allgather[:lengths_sum_unpadding]
-        else:
-            unpadding_data = padded_data_allgather
-        return unpadding_data
-
-    def padding_slice(self, data: torch.Tensor) -> torch.Tensor:
-
-        padded_data = F.pad(data, (0, 0, 0, self.pad_size))
-        start = self.tp_rank_in_group * self.slice_size
-        end = start + self.slice_size
-        slice_data = padded_data[start:end]
-
-        return slice_data
-
-    def padding_aligned_scatter(self, data: torch.Tensor) -> torch.Tensor:
-        if self.padding_flag:
-            pad_size = self.pad_size
-            padded_data = F.pad(data, (0, 0, 0, pad_size))
-        else:
-            padded_data = data
-        # padded_data = data
-        padded_data = torch.tensor_split(padded_data, self.tp_size, dim=0)
-
-        padded_data_reduce_scatter = padded_data[self.tp_rank_in_group]
-
-        return padded_data_reduce_scatter
-
-
-def init_metadata_for_sp(input_ids, enable_sequence_parallelism):
-    if not enable_sequence_parallelism:
-        return MetadataForPadding(padding_flag=False,
-                                  not_dummy_and_is_prefill=False)
-
-    is_perifll = 0
-    attn_metadata = get_forward_context().attn_metadata
-    tp_size = get_tensor_model_parallel_world_size()
-    if attn_metadata is not None:
-        if hasattr(attn_metadata,
-                   'is_only_prefill') and attn_metadata.is_only_prefill:
-            is_perifll = 1
-        if hasattr(attn_metadata,
-                   'num_prefills') and attn_metadata.num_prefills > 0:
-            is_perifll = 1
-
-        if is_perifll:
-            lengths_sum_unpadding = input_ids.shape[0]
-            lengths_sum_padding = (
-                (lengths_sum_unpadding + tp_size - 1) // tp_size) * tp_size
-            if lengths_sum_unpadding == lengths_sum_padding:
-                padding_flag = False
-            else:
-                padding_flag = True
-            pad_size = lengths_sum_padding - lengths_sum_unpadding
-            _metadata_for_padding = MetadataForPadding(
-                lengths_sum_unpadding=lengths_sum_unpadding,
-                lengths_sum_padding=lengths_sum_padding,
-                padding_flag=padding_flag,
-                pad_size=pad_size,
-                not_dummy_and_is_prefill=True)
-
-            return _metadata_for_padding
-
-    return MetadataForPadding(padding_flag=False,
-                              not_dummy_and_is_prefill=False)
--- a/vllm_ascend/ops/sigmoid_gating.py
+++ b/vllm_ascend/ops/sigmoid_gating.py
@@ -0,0 +1,384 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+# ruff: noqa: E501
+# mypy: ignore-errors
+
+import os
+from typing import Optional
+
+import torch
+from vllm.triton_utils import tl, tldevice, triton
+
+if os.environ.get('FLA_USE_FAST_OPS', '0') == '1':
+    div = tldevice.fast_dividef
+    exp = tldevice.fast_expf
+    log = tldevice.fast_logf
+    log2 = tldevice.fast_log2f
+else:
+
+    @triton.jit
+    def div_normal(x, y):
+        return x / y
+
+    div = div_normal
+    exp = tl.exp
+    log = tl.log
+    log2 = tl.log2
+
+
+@triton.heuristics({
+    'USE_INITIAL_STATE':
+    lambda args: args['h0'] is not None,
+    'IS_VARLEN':
+    lambda args: args['cu_seqlens'] is not None,
+    "IS_CONTINUOUS_BATCHING":
+    lambda args: args['ssm_state_indices'] is not None,
+    "IS_SPEC_DECODING":
+    lambda args: args['num_accepted_tokens'] is not None,
+})
+@triton.jit(do_not_specialize=['N', 'T'])
+def fused_recurrent_gated_delta_rule_fwd_kernel(
+    q,
+    k,
+    v,
+    g,
+    beta,
+    o,
+    h0,
+    ht,
+    cu_seqlens,
+    ssm_state_indices,
+    num_accepted_tokens,
+    scale,
+    N: tl.constexpr,  # num of sequences
+    T: tl.constexpr,  # num of tokens
+    B: tl.constexpr,
+    H: tl.constexpr,
+    HV: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    stride_init_state_token: tl.constexpr,
+    stride_final_state_token: tl.constexpr,
+    stride_indices_seq: tl.constexpr,
+    stride_indices_tok: tl.constexpr,
+    USE_INITIAL_STATE: tl.constexpr,  # whether to use initial state
+    INPLACE_FINAL_STATE: tl.constexpr,  # whether to store final state inplace
+    IS_BETA_HEADWISE: tl.
+    constexpr,  # whether beta is headwise vector or scalar,
+    USE_QK_L2NORM_IN_KERNEL: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    IS_CONTINUOUS_BATCHING: tl.constexpr,
+    IS_SPEC_DECODING: tl.constexpr,
+):
+    i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_n, i_hv = i_nh // HV, i_nh % HV
+    i_h = i_hv // (HV // H)
+    if IS_VARLEN:
+        bos, eos = tl.load(cu_seqlens + i_n).to(
+            tl.int64), tl.load(cu_seqlens + i_n + 1).to(tl.int64)
+        all = T
+        T = eos - bos
+    else:
+        bos, eos = i_n * T, i_n * T + T
+        all = B * T
+
+    if T == 0:
+        # no tokens to process for this sequence
+        return
+
+    o_k = i_k * BK + tl.arange(0, BK)
+    o_v = i_v * BV + tl.arange(0, BV)
+
+    mask_k = o_k < K
+    mask_v = o_v < V
+    mask_h = mask_k[:, None] & mask_v[None, :]
+
+    b_h = tl.zeros([BK, BV], dtype=tl.float32)
+    if USE_INITIAL_STATE:
+        if IS_CONTINUOUS_BATCHING:
+            if IS_SPEC_DECODING:
+                i_t = tl.load(num_accepted_tokens + i_n).to(tl.int64) - 1
+            else:
+                i_t = 0
+            p_h0 = h0 + tl.load(ssm_state_indices + i_n * stride_indices_seq +
+                                i_t).to(tl.int64) * stride_init_state_token
+        else:
+            p_h0 = h0 + bos * HV * K * V
+        p_h0 = p_h0 + i_hv * K * V + o_k[:, None] * V + o_v[None, :]
+        b_h += tl.load(p_h0, mask=mask_h, other=0).to(tl.float32)
+
+    for i_t in range(0, T):
+        p_q = q + (bos * H + i_h) * K + o_k + H * K * i_t
+        p_k = k + (bos * H + i_h) * K + o_k + H * K * i_t
+        p_v = v + (bos * HV + i_hv) * V + o_v + HV * V * i_t
+        if IS_BETA_HEADWISE:
+            p_beta = beta + (bos * HV + i_hv) * V + o_v + HV * V * i_t
+        else:
+            p_beta = beta + bos * HV + i_hv + HV * i_t
+        p_g = g + bos * HV + i_hv + HV * i_t
+        p_o = o + ((i_k * all + bos) * HV + i_hv) * V + o_v + HV * V * i_t
+
+        b_q = tl.load(p_q, mask=mask_k, other=0).to(tl.float32)
+        b_k = tl.load(p_k, mask=mask_k, other=0).to(tl.float32)
+        b_v = tl.load(p_v, mask=mask_v, other=0).to(tl.float32)
+        b_g = tl.load(p_g).to(tl.float32)
+
+        if USE_QK_L2NORM_IN_KERNEL:
+            b_q = b_q / tl.sqrt(tl.sum(b_q * b_q) + 1e-6)
+            b_k = b_k / tl.sqrt(tl.sum(b_k * b_k) + 1e-6)
+        b_q = b_q * scale
+        # [BK, BV]
+        # b_h *= tl.exp(b_g)
+        b_h *= exp(b_g)
+        # [BV]
+        b_v -= tl.sum(b_h * b_k[:, None], 0)
+        if IS_BETA_HEADWISE:
+            b_beta = tl.load(p_beta, mask=mask_v, other=0).to(tl.float32)
+        else:
+            b_beta = tl.load(p_beta).to(tl.float32)
+        b_v *= b_beta
+        # [BK, BV]
+        b_h += b_k[:, None] * b_v[None, :]
+        # [BV]
+        b_o = tl.sum(b_h * b_q[:, None], 0)
+        tl.store(p_o, b_o.to(p_o.dtype.element_ty), mask=mask_v)
+
+        # keep the states for multi-query tokens
+        if INPLACE_FINAL_STATE:
+            p_ht = ht + tl.load(ssm_state_indices + i_n * stride_indices_seq +
+                                i_t).to(tl.int64) * stride_final_state_token
+        else:
+            p_ht = ht + (bos + i_t) * stride_final_state_token
+        p_ht = p_ht + i_hv * K * V + o_k[:, None] * V + o_v[None, :]
+        tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), mask=mask_h)
+
+
+def fused_recurrent_gated_delta_rule_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float,
+    initial_state: torch.Tensor,
+    inplace_final_state: bool = True,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+    ssm_state_indices: Optional[torch.Tensor] = None,
+    num_accepted_tokens: Optional[torch.Tensor] = None,
+    use_qk_l2norm_in_kernel: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    B, T, H, K, V = *k.shape, v.shape[-1]
+    HV = v.shape[2]
+    N = B if cu_seqlens is None else len(cu_seqlens) - 1
+    BK, BV = triton.next_power_of_2(K), min(triton.next_power_of_2(V), 8)
+    NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
+    assert NK == 1, "NK > 1 is not supported yet"
+    num_stages = 3
+    num_warps = 1
+
+    o = q.new_empty(NK, *v.shape)
+    if inplace_final_state:
+        final_state = initial_state
+    else:
+        final_state = q.new_empty(T, HV, K, V, dtype=initial_state.dtype)
+
+    stride_init_state_token = initial_state.stride(0)
+    stride_final_state_token = final_state.stride(0)
+
+    if ssm_state_indices is None:
+        stride_indices_seq, stride_indices_tok = 1, 1
+    elif ssm_state_indices.ndim == 1:
+        stride_indices_seq, stride_indices_tok = ssm_state_indices.stride(0), 1
+    else:
+        stride_indices_seq, stride_indices_tok = ssm_state_indices.stride()
+
+    # print("N: ", N)
+    # print("T: ", T)
+    # print("B: ", B)
+    # print("H: ", H)
+    # print("HV: ", HV)
+    # print("K: ", K)
+    # print("V: ", V)
+    # print("BK: ", BK)
+    # print("BV: ", BV)
+
+    grid = (NK, NV, N * HV)
+    fused_recurrent_gated_delta_rule_fwd_kernel[grid](
+        q=q,
+        k=k,
+        v=v,
+        g=g,
+        beta=beta,
+        o=o,
+        h0=initial_state,
+        ht=final_state,
+        cu_seqlens=cu_seqlens,
+        ssm_state_indices=ssm_state_indices,
+        num_accepted_tokens=num_accepted_tokens,
+        scale=scale,
+        N=N,
+        T=T,
+        B=B,
+        H=H,
+        HV=HV,
+        K=K,
+        V=V,
+        BK=BK,
+        BV=BV,
+        stride_init_state_token=stride_init_state_token,
+        stride_final_state_token=stride_final_state_token,
+        stride_indices_seq=stride_indices_seq,
+        stride_indices_tok=stride_indices_tok,
+        IS_BETA_HEADWISE=beta.ndim == v.ndim,
+        USE_QK_L2NORM_IN_KERNEL=use_qk_l2norm_in_kernel,
+        INPLACE_FINAL_STATE=inplace_final_state,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    o = o.squeeze(0)
+    return o, final_state
+
+
+class FusedRecurrentFunction(torch.autograd.Function):
+    """Forward-only wrapper for `fused_recurrent_gated_delta_rule_fwd`."""
+    @staticmethod
+    def forward(ctx,
+                q: torch.Tensor,
+                k: torch.Tensor,
+                v: torch.Tensor,
+                g: torch.Tensor,
+                beta: torch.Tensor,
+                scale: float,
+                initial_state: torch.Tensor,
+                inplace_final_state: bool = True,
+                cu_seqlens: Optional[torch.LongTensor] = None,
+                ssm_state_indices: Optional[torch.Tensor] = None,
+                num_accepted_tokens: Optional[torch.Tensor] = None,
+                use_qk_l2norm_in_kernel: bool = False):
+        o, final_state = fused_recurrent_gated_delta_rule_fwd(
+            q=q.contiguous(),  # q/k/v/g/beta are made contiguous first
+            k=k.contiguous(),
+            v=v.contiguous(),
+            g=g.contiguous(),
+            beta=beta.contiguous(),
+            scale=scale,
+            initial_state=initial_state,  # passed through as-is
+            inplace_final_state=inplace_final_state,
+            cu_seqlens=cu_seqlens,
+            ssm_state_indices=ssm_state_indices,
+            num_accepted_tokens=num_accepted_tokens,
+            use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel,
+        )
+        # ctx is unused: no backward is defined, so nothing is saved.
+        return o, final_state
+
+
+def fused_recurrent_gated_delta_rule(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: Optional[torch.Tensor] = None,
+    scale: Optional[float] = None,
+    initial_state: Optional[torch.Tensor] = None,
+    inplace_final_state: bool = True,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+    ssm_state_indices: Optional[torch.Tensor] = None,
+    num_accepted_tokens: Optional[torch.Tensor] = None,
+    use_qk_l2norm_in_kernel: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    r"""Validate arguments, fill defaults, and run `FusedRecurrentFunction`.
+    Args:
+        q (torch.Tensor):
+            queries of shape `[B, T, H, K]`.
+        k (torch.Tensor):
+            keys of shape `[B, T, H, K]`.
+        v (torch.Tensor):
+            values of shape `[B, T, HV, V]`.
+            GVA is applied if `HV > H`.
+        g (torch.Tensor):
+            g (decays) of shape `[B, T, HV]`.
+        beta (Optional[torch.Tensor]):
+            betas of shape `[B, T, HV]`. Defaults to ones shaped like `q[..., 0]`.
+        scale (Optional[float]):
+            Positive scale factor forwarded to the kernel.
+            If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
+        initial_state (Optional[torch.Tensor]):
+            Initial state of shape `[N, HV, K, V]` for `N` input sequences.
+            For equal-length input sequences, `N` equals the batch size `B`.
+            Default: `None` (NOTE(review): `None` may be unsupported - confirm).
+        inplace_final_state (bool):
+            Whether to store the final state in-place to save memory.
+            Default: `True`.
+        cu_seqlens (torch.LongTensor):
+            Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
+            consistent with the FlashAttention API.
+        ssm_state_indices (Optional[torch.Tensor]):
+            Indices to map the input sequences to the initial/final states.
+        num_accepted_tokens (Optional[torch.Tensor]):
+            Number of accepted tokens for each sequence during decoding.
+    Returns:
+        o (torch.Tensor):
+            Outputs of shape `[B, T, HV, V]`.
+        final_state (torch.Tensor):
+            Final state of shape `[N, HV, K, V]`.
+    Examples::
+        >>> import torch
+        >>> import torch.nn.functional as F
+        >>> from einops import rearrange
+        >>> from fla.ops.gated_delta_rule import fused_recurrent_gated_delta_rule
+        # inputs with equal lengths
+        >>> B, T, H, HV, K, V = 4, 2048, 4, 8, 512, 512
+        >>> q = torch.randn(B, T, H, K, device='cuda')
+        >>> k = F.normalize(torch.randn(B, T, H, K, device='cuda'), p=2, dim=-1)
+        >>> v = torch.randn(B, T, HV, V, device='cuda')
+        >>> g = F.logsigmoid(torch.rand(B, T, HV, device='cuda'))
+        >>> beta = torch.rand(B, T, HV, device='cuda').sigmoid()
+        >>> h0 = torch.randn(B, HV, K, V, device='cuda')
+        >>> o, ht = fused_recurrent_gated_delta_rule(
+            q, k, v, g, beta,
+            initial_state=h0,
+        )
+        # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required
+        >>> q, k, v, g, beta = map(lambda x: rearrange(x, 'b t ... -> 1 (b t) ...'), (q, k, v, g, beta))
+        # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected
+        >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long)
+        >>> o_var, ht_var = fused_recurrent_gated_delta_rule(
+            q, k, v, g, beta,
+            initial_state=h0,
+            cu_seqlens=cu_seqlens
+        )
+    """
+    if cu_seqlens is not None and q.shape[0] != 1:
+        raise ValueError(
+            f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`. "
+            f"Please flatten variable-length inputs before processing.")
+    if scale is None:
+        scale = k.shape[-1]**-0.5
+    else:
+        assert scale > 0, "scale must be positive"
+    if beta is None:
+        beta = torch.ones_like(q[..., 0])
+    o, final_state = FusedRecurrentFunction.apply(
+        q,
+        k,
+        v,
+        g,
+        beta,
+        scale,
+        initial_state,
+        inplace_final_state,
+        cu_seqlens,
+        ssm_state_indices,
+        num_accepted_tokens,
+        use_qk_l2norm_in_kernel,
+    )
+    return o, final_state
--- a/vllm_ascend/ops/vocab_parallel_embedding.py
+++ b/vllm_ascend/ops/vocab_parallel_embedding.py
@@ -97,6 +97,7 @@ class AscendVocabParallelEmbedding(VocabParallelEmbedding):

        if params_dtype is None:
            params_dtype = torch.get_default_dtype()
+        self.params_dtype = params_dtype
        # Divide the weight matrix along the vocaburaly dimension.
        self.num_added_embeddings = self.num_embeddings - self.org_vocab_size
        self.num_embeddings_per_partition = divide(self.num_embeddings_padded,
@@ -252,3 +253,16 @@ class AscendLogitsProcessor(LogitsProcessor):
            logits = logits[..., :self.org_vocab_size]

        return logits
+
+    def forward(
+        self,
+        lm_head: VocabParallelEmbedding,
+        hidden_states: torch.Tensor,
+        # kept for version compatibility; not forwarded to the base class
+        sampling_metadata=None,  # type: ignore
+        embedding_bias: Optional[torch.Tensor] = None,
+    ) -> Optional[torch.Tensor]:
+        return LogitsProcessor.forward(self,
+                                       lm_head,
+                                       hidden_states,
+                                       embedding_bias=embedding_bias)