init v0.11.0rc0

2025-10-14 10:38:28 +08:00
parent 67afd0ea78
commit 66dc16f966
278 changed files with 28130 additions and 11708 deletions
--- a/vllm_ascend/torchair/models/qwen2.py
+++ b/vllm_ascend/torchair/models/qwen2.py
@@ -40,7 +40,6 @@ from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM  # noqa: F401
 from vllm.model_executor.models.qwen2 import Qwen2MLP, Qwen2Model
 from vllm.model_executor.models.utils import (AutoWeightsLoader,
                                              PPMissingLayer, maybe_prefix)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

 from vllm_ascend.ascend_config import get_ascend_config
@@ -343,9 +342,9 @@ class CustomQwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
        return hidden_states

    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
+            self,
+            hidden_states: torch.Tensor,
+            sampling_metadata=None,  # type: ignore
    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
--- a/vllm_ascend/torchair/models/qwen3_moe.py
+++ b/vllm_ascend/torchair/models/qwen3_moe.py
@@ -54,8 +54,9 @@ from vllm.sequence import IntermediateTensors
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.ops.fused_moe import AscendFusedMoE
-from vllm_ascend.ops.sequence_parallel import (MetadataForPadding,
-                                               init_metadata_for_sp)
+from vllm_ascend.torchair.ops.sequence_parallel import (MetadataForPadding,
+                                                        init_metadata_for_sp)
+from vllm_ascend.utils import vllm_version_is


 class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
@@ -311,9 +312,14 @@ class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer):
                                                quant_config=quant_config,
                                                prefix=f"{prefix}.mlp")
            else:
-                self.mlp = Qwen3MoeSparseMoeBlock(config=config,
-                                                  quant_config=quant_config,
-                                                  prefix=f"{prefix}.mlp")
+                if vllm_version_is("0.10.2"):
+                    self.mlp = Qwen3MoeSparseMoeBlock(
+                        config=config,
+                        quant_config=quant_config,
+                        prefix=f"{prefix}.mlp")
+                else:
+                    self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
+                                                      prefix=f"{prefix}.mlp")
        else:
            self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size,
                                   intermediate_size=config.intermediate_size,
@@ -394,7 +400,8 @@ class CustomQwen3MoeModel(Qwen3MoeModel):
        quant_config = vllm_config.quant_config

        parallel_config = vllm_config.parallel_config
-        self.num_redundant_experts = parallel_config.num_redundant_experts
+        eplb_config = parallel_config.eplb_config
+        self.num_redundant_experts = eplb_config.num_redundant_experts
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.config = config
--- a/vllm_ascend/torchair/models/torchair_deepseek_mtp.py
+++ b/vllm_ascend/torchair/models/torchair_deepseek_mtp.py
@@ -27,14 +27,12 @@ from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.models.deepseek_mtp import (
    DeepSeekMTP, DeepSeekMultiTokenPredictor, DeepSeekMultiTokenPredictorLayer,
    SharedHead)
 from vllm.model_executor.models.utils import maybe_prefix
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

 from vllm_ascend.torchair.models.torchair_deepseek_v2 import \
@@ -172,7 +170,7 @@ class TorchairDeepSeekMultiTokenPredictor(DeepSeekMultiTokenPredictor):
    def compute_logits(
        self,
        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
+        sampling_metadata=None,  # type: ignore
        spec_step_idx: int = 0,
    ) -> torch.Tensor:
        current_step_idx = (spec_step_idx % self.num_mtp_layers)
@@ -199,8 +197,6 @@ class TorchairDeepSeekMTP(DeepSeekMTP):
        self.model = TorchairDeepSeekMultiTokenPredictor(
            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model"))

-        self.sampler = get_sampler()
-
    def forward(
        self,
        input_ids: torch.Tensor,
--- a/vllm_ascend/torchair/models/torchair_deepseek_v2.py
+++ b/vllm_ascend/torchair/models/torchair_deepseek_v2.py
@@ -32,8 +32,7 @@ import torch_npu
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.attention import Attention, AttentionMetadata
-from vllm.config import (CacheConfig, ModelConfig, VllmConfig,
-                         get_current_vllm_config)
+from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
                              get_tensor_model_parallel_world_size,
                              get_tp_group, split_tensor_along_last_dim,
@@ -52,7 +51,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -69,12 +67,14 @@ from vllm.model_executor.models.utils import (
    make_empty_intermediate_tensors_factory, make_layers, maybe_prefix)
 from vllm.sequence import IntermediateTensors

+from vllm_ascend import envs
 from vllm_ascend.ascend_config import get_ascend_config
+from vllm_ascend.models.layers.sfa import Indexer
 from vllm_ascend.quantization.quant_config import AscendLinearMethod
 from vllm_ascend.torchair.ops.torchair_fused_moe import TorchairAscendFusedMoE
 from vllm_ascend.torchair.quantization.torchair_w8a8_dynamic import \
    TorchairAscendW8A8DynamicLinearMethod
-from vllm_ascend.utils import dispose_tensor, npu_prefetch
+from vllm_ascend.utils import dispose_tensor, npu_prefetch, oproj_tp_enable


 class TorchairDeepseekV2SiluAndMul(SiluAndMul):
@@ -322,8 +322,8 @@ class TorchairDeepseekV2MoE(nn.Module):

        ascend_config = get_ascend_config()
        self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
-        self.enable_multistream_moe = \
-            ascend_config.torchair_graph_config.enable_multistream_moe and \
+        self.multistream_overlap_shared_expert = \
+            ascend_config.multistream_overlap_shared_expert and \
            self.torchair_graph_enabled

        self.gate = ReplicatedLinear(config.hidden_size,
@@ -364,7 +364,7 @@ class TorchairDeepseekV2MoE(nn.Module):
                hidden_act=config.hidden_act,
                quant_config=quant_config,
                reduce_results=reduce_results,
-                force_replicate=self.enable_multistream_moe
+                force_replicate=self.multistream_overlap_shared_expert
                or enable_shared_expert_dp,
                prefix=f"{prefix}.shared_experts",
            )
@@ -377,10 +377,6 @@ class TorchairDeepseekV2MoE(nn.Module):
        self.tp_group = get_tp_group().device_group
        self.tp_rank = get_tp_group().rank_in_group
        self.ep_group = get_ep_group()
-        self.kv_consumer = None
-        transfer_config = get_current_vllm_config().kv_transfer_config
-        if transfer_config is not None:
-            self.kv_consumer = transfer_config.kv_role == "kv_consumer"

        self.params_dtype = torch.get_default_dtype()
        self.rm_router_logits = self.experts.rm_router_logits
@@ -398,15 +394,9 @@ class TorchairDeepseekV2MoE(nn.Module):

        is_prefill = forward_context.with_prefill

-        # If this node is kv_consumer, we force the moe always runs in decode path to make sure
-        # the behaviour aligned between dummy_run and normal model_execute.
-        if self.kv_consumer:
-            is_prefill = False
-            enable_force_load_balance = False
-
        # router_logits: (num_tokens, n_experts)
        router_logits = None
-        if not self.rm_router_logits and not self.enable_multistream_moe:
+        if not self.rm_router_logits and not self.multistream_overlap_shared_expert:
            router_logits, _ = self.gate(hidden_states)

        experts_hidden_states = self.experts(
@@ -447,6 +437,7 @@ class TorchairDeepseekV2MLAAttention(DeepseekV2MLAAttention):
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
+        decoder_layer=None,
    ) -> None:
        nn.Module.__init__(self)
        self.hidden_size = hidden_size
@@ -514,11 +505,18 @@ class TorchairDeepseekV2MLAAttention(DeepseekV2MLAAttention):
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.kv_b_proj")
-        if (config.n_routed_experts is not None
-                and self.debug_layer_idx >= config.first_k_dense_replace
-                and self.debug_layer_idx % config.moe_layer_freq == 0
-                and (ascend_config.torchair_graph_config.enable_multistream_moe
-                     or self.enable_shared_expert_dp)):
+
+        if oproj_tp_enable():
+            self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim,
+                                            self.hidden_size,
+                                            bias=False,
+                                            quant_config=quant_config,
+                                            prefix=f"{prefix}.o_proj")
+        elif (config.n_routed_experts is not None
+              and self.debug_layer_idx >= config.first_k_dense_replace
+              and self.debug_layer_idx % config.moe_layer_freq == 0
+              and (ascend_config.multistream_overlap_shared_expert
+                   or self.enable_shared_expert_dp)):
            self.o_proj = TorchairDeepseekV2RowParallelLinearReplaceAllreduce(
                self.num_heads * self.v_head_dim,
                self.hidden_size,
@@ -635,6 +633,225 @@ class TorchairDeepseekV2MLAAttention(DeepseekV2MLAAttention):
                                 output_shape=output_shape)


+class TorchairDeepseekV2SFAAttention(DeepseekV2MLAAttention):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: Optional[int],
+        kv_lora_rank: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        decoder_layer=None,
+    ) -> None:
+        nn.Module.__init__(self)
+        self.hidden_size = hidden_size
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+
+        self.num_heads = num_heads
+        self.tp_size = get_tensor_model_parallel_world_size()
+        assert num_heads % self.tp_size == 0
+        self.num_local_heads = num_heads // self.tp_size
+        self.layers = config.num_hidden_layers
+        self.first_k_dense_replace = config.first_k_dense_replace
+
+        self.scaling = self.qk_head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.prefix = prefix
+        self.debug_layer_idx = int(self.prefix.split(".")[-2])
+
+        ascend_config = get_ascend_config()
+        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
+        self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
+
+        if self.q_lora_rank is not None:
+            self.q_a_proj = ReplicatedLinear(
+                self.hidden_size,
+                self.q_lora_rank,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_a_proj",
+                return_bias=False,
+            )
+            self.q_a_layernorm = RMSNorm(self.q_lora_rank,
+                                         eps=config.rms_norm_eps)
+            self.q_b_proj = ColumnParallelLinear(
+                q_lora_rank,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_b_proj",
+                return_bias=False,
+            )
+        else:
+            self.q_proj = ColumnParallelLinear(
+                self.hidden_size,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_proj",
+                return_bias=False,
+            )
+
+        self.kv_a_proj_with_mqa = ReplicatedLinear(
+            self.hidden_size,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.kv_a_proj_with_mqa",
+            return_bias=False,
+        )
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank,
+                                      eps=config.rms_norm_eps)
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.kv_b_proj",
+            return_bias=False,
+        )
+        if (config.n_routed_experts is not None
+                and self.debug_layer_idx >= config.first_k_dense_replace
+                and self.debug_layer_idx % config.moe_layer_freq == 0
+                and (ascend_config.multistream_overlap_shared_expert
+                     or self.enable_shared_expert_dp)):
+            self.o_proj = TorchairDeepseekV2RowParallelLinearReplaceAllreduce(
+                self.num_heads * self.v_head_dim,
+                self.hidden_size,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.o_proj",
+                return_bias=False,
+            )
+        else:
+            self.o_proj = TorchairDeepseekV2RowParallelLinear(
+                self.num_heads * self.v_head_dim,
+                self.hidden_size,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.o_proj",
+                return_bias=False,
+            )
+
+        if rope_scaling:
+            rope_scaling["rope_type"] = 'deepseek_yarn'
+        self.rotary_emb = get_rope(qk_rope_head_dim,
+                                   rotary_dim=qk_rope_head_dim,
+                                   max_position=max_position_embeddings,
+                                   base=rope_theta,
+                                   rope_scaling=rope_scaling,
+                                   is_neox_style=False)
+        if rope_scaling:
+            mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
+            scaling_factor = rope_scaling["factor"]
+            mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+            self.scaling = self.scaling * mscale * mscale
+
+        self.dim: int = config.hidden_size  # 7168
+        # TODO(zzzzwwjj): wait for transformers to add these params
+        self.n_heads: int = 64  # 64
+        self.head_dim: int = 128  # 128
+        self.index_topk: int = 2048  # 2048
+        self.indexer = Indexer(
+            config,
+            quant_config=quant_config,
+            dim=self.dim,
+            n_heads=self.n_heads,
+            head_dim=self.head_dim,
+            index_topk=self.index_topk,
+            prefix=f"{prefix}.indexer",
+        )
+
+        self.sfa_attn = Attention(
+            num_heads=self.num_local_heads,
+            head_size=self.kv_lora_rank + self.qk_rope_head_dim,
+            scale=self.scaling,
+            num_kv_heads=1,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+            use_mla=True,
+            use_sfa=True,
+            # SFA Args
+            q_lora_rank=self.q_lora_rank,
+            kv_lora_rank=self.kv_lora_rank,
+            qk_nope_head_dim=self.qk_nope_head_dim,
+            qk_rope_head_dim=self.qk_rope_head_dim,
+            qk_head_dim=self.qk_head_dim,
+            v_head_dim=self.v_head_dim,
+            rotary_emb=self.rotary_emb,
+            q_a_proj=self.q_a_proj if self.q_lora_rank is not None else None,
+            q_a_layernorm=self.q_a_layernorm
+            if self.q_lora_rank is not None else None,
+            q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj,
+            kv_a_proj_with_mqa=self.kv_a_proj_with_mqa,
+            kv_a_layernorm=self.kv_a_layernorm,
+            kv_b_proj=self.kv_b_proj,
+            o_proj=self.o_proj,
+            indexer=self.indexer,
+            decoder_layer=decoder_layer,
+        )
+
+    def forward(
+            self,
+            positions: torch.Tensor,
+            hidden_states: torch.Tensor,
+            kv_cache: Optional[torch.Tensor] = None,
+            attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor:
+        forward_context = get_forward_context()
+        if not self.torchair_graph_enabled:
+            if forward_context.attn_metadata is not None and isinstance(
+                    forward_context.attn_metadata, dict):
+                attn_metadata = next(
+                    iter(forward_context.attn_metadata.values()), None)
+            else:
+                attn_metadata = forward_context.attn_metadata
+            if kv_cache is None:
+                kv_cache = self.sfa_attn.kv_cache[
+                    forward_context.virtual_engine]
+
+        num_tokens = hidden_states.shape[0]
+        need_gather_q_kv = False
+        # if self.enable_shared_expert_dp and self.debug_layer_idx > self.first_k_dense_replace and self.debug_layer_idx < self.layers:
+        #     # Simulate all gather to calculate output shape
+        #     num_tokens = num_tokens * self.tp_size
+        #     need_gather_q_kv = True
+        if not self.enable_shared_expert_dp or self.debug_layer_idx != self.first_k_dense_replace:
+            output_shape = hidden_states.shape
+        if self.enable_shared_expert_dp and (
+                self.debug_layer_idx == self.first_k_dense_replace
+                or self.debug_layer_idx == self.layers):
+            rows = num_tokens // self.tp_size
+            if num_tokens % self.tp_size:
+                rows += 1
+            output_shape = (rows, hidden_states.shape[1])
+        output = torch.empty(output_shape,
+                             dtype=hidden_states.dtype,
+                             device=hidden_states.device)
+        self.sfa_attn.impl.forward(hidden_states, kv_cache, attn_metadata,
+                                   need_gather_q_kv, output)
+        output = output.view(-1, output_shape[-1])
+        return output
+
+
 class TorchairDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):

    def __init__(
@@ -659,9 +876,16 @@ class TorchairDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):
        self.tp_size = get_tensor_model_parallel_world_size()
        self.tp_rank = get_tp_group().rank_in_group
        ascend_config = get_ascend_config()
+        self.use_mla = False
+        self.use_sfa = False
        # TODO: enable mla in vllm-ascend
        if model_config.use_mla:
-            attn_cls = TorchairDeepseekV2MLAAttention
+            if ascend_config.use_sfa:
+                attn_cls = TorchairDeepseekV2SFAAttention
+                self.use_sfa = True
+            else:
+                attn_cls = TorchairDeepseekV2MLAAttention  # type: ignore[assignment]
+            self.use_mla = True
        else:
            attn_cls = DeepseekV2Attention
        self.self_attn = attn_cls(
@@ -680,6 +904,7 @@ class TorchairDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.self_attn",
+            decoder_layer=self,
        )

        if (config.n_routed_experts is not None
@@ -690,7 +915,7 @@ class TorchairDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):
                quant_config=quant_config,
                prefix=f"{prefix}.mlp",
            )
-            self.mla_moe_communication = ascend_config.torchair_graph_config.enable_multistream_moe \
+            self.mla_moe_communication = ascend_config.multistream_overlap_shared_expert \
                and model_config.use_mla and self.tp_size > 1
        else:
            self.mlp = TorchairDeepseekV2MLP(
@@ -720,21 +945,34 @@ class TorchairDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):
        replace_allreduce: bool = False,
    ) -> torch.Tensor:
        # Self Attention
-        if attn_metadata is not None and attn_metadata.num_decodes > 0:
-            mla_moe_communication = self.mla_moe_communication and replace_allreduce
+        if attn_metadata is not None:
+            decoding_condition_met = (
+                not attn_metadata.is_prefill if self.use_sfa else
+                attn_metadata.num_decodes > 0 if self.use_mla else False)
+            mla_moe_communication = decoding_condition_met and self.mla_moe_communication and replace_allreduce
        else:
            mla_moe_communication = False
-        if residual is None:
+
+        forward_context = get_forward_context()
+        if (envs.VLLM_ASCEND_ENABLE_MLAPO
+                and isinstance(self.self_attn, TorchairDeepseekV2SFAAttention)
+                and attn_metadata is not None
+                and not forward_context.with_prefill):
+            if residual is not None:
+                hidden_states = hidden_states + residual
            residual = hidden_states
-            hidden_states = self.input_layernorm(hidden_states)
        else:
-            previous_hidden_states, previous_residual = hidden_states, residual
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
-            # Dispose hidden_states and residual from the previous layer
-            # to save npu memory because they're no longer used.
-            dispose_tensor(previous_hidden_states)
-            dispose_tensor(previous_residual)
+            if residual is None:
+                residual = hidden_states
+                hidden_states = self.input_layernorm(hidden_states)
+            else:
+                previous_hidden_states, previous_residual = hidden_states, residual
+                hidden_states, residual = self.input_layernorm(
+                    hidden_states, residual)
+                # Dispose hidden_states and residual from the previous layer
+                # to save npu memory because they're no longer used.
+                dispose_tensor(previous_hidden_states)
+                dispose_tensor(previous_residual)
        if mla_moe_communication and self.layer_idx > self.first_k_dense_replace:
            hidden_states = tensor_model_parallel_all_gather(hidden_states,
                                                             dim=0)
@@ -806,6 +1044,8 @@ class TorchairDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):
            residual = get_tp_group().all_gather(residual, 0)

            attn_metadata = get_forward_context().attn_metadata
+            if attn_metadata is not None and isinstance(attn_metadata, dict):
+                attn_metadata = next(iter(attn_metadata.values()), None)
            if attn_metadata is not None:
                num_tokens = attn_metadata.num_actual_tokens
            else:
@@ -921,6 +1161,8 @@ class TorchairDeepseekV2ForCausalLM(DeepseekV2ForCausalLM):
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        self.config = config
+        self.num_dense_layers = self.config.first_k_dense_replace
+        self.num_moe_layers = self.config.num_hidden_layers - self.num_dense_layers
        self.quant_config = quant_config
        self.model = TorchairDeepseekV2Model(vllm_config=vllm_config,
                                             prefix=maybe_prefix(
@@ -934,7 +1176,6 @@ class TorchairDeepseekV2ForCausalLM(DeepseekV2ForCausalLM):
        else:
            self.lm_head = PPMissingLayer()
        self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

--- a/vllm_ascend/torchair/models/torchair_pangu_moe.py
+++ b/vllm_ascend/torchair/models/torchair_pangu_moe.py
@@ -45,7 +45,6 @@ from vllm.model_executor.layers.linear import (LinearBase,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -53,9 +52,9 @@ from vllm.model_executor.models.interfaces import SupportsPP
 from vllm.model_executor.models.utils import (
    extract_layer_index, is_pp_missing_parameter,
    make_empty_intermediate_tensors_factory, make_layers, maybe_prefix)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.sequence import IntermediateTensors
+from vllm.v1.sample.sampler import Sampler

 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p
@@ -913,7 +912,7 @@ class PanguProMoEForCausalLM(nn.Module, SupportsPP):
        if self.config.tie_word_embeddings:
            self.lm_head.weight = self.model.embed_tokens.weight
        self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
+        self.sampler = Sampler()
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

@@ -935,19 +934,19 @@ class PanguProMoEForCausalLM(nn.Module, SupportsPP):
        return hidden_states

    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
+            self,
+            hidden_states: torch.Tensor,
+            sampling_metadata=None,  # type: ignore
    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

    def sample(
-        self,
-        logits: Optional[torch.Tensor],
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
+            self,
+            logits: Optional[torch.Tensor],
+            sampling_metadata,  # type: ignore
+    ):
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens

--- a/vllm_ascend/torchair/ops/sequence_parallel.py
+++ b/vllm_ascend/torchair/ops/sequence_parallel.py
@@ -0,0 +1,120 @@
+import torch
+from torch.nn import functional as F
+from vllm.distributed import (get_tensor_model_parallel_world_size,
+                              get_tp_group, tensor_model_parallel_all_gather,
+                              tensor_model_parallel_reduce_scatter)
+from vllm.forward_context import get_forward_context
+
+from vllm_ascend.platform import NPUPlatform
+
+
+class MetadataForPadding:
+
+    def __init__(self,
+                 padding_flag=False,
+                 lengths_sum_padding=0,
+                 lengths_sum_unpadding=0,
+                 pad_size=0,
+                 not_dummy_and_is_prefill=False):
+        self.padding_flag = padding_flag
+        self.not_dummy_and_is_prefill = not_dummy_and_is_prefill
+
+        self.lengths_sum_padding = lengths_sum_padding
+        self.lengths_sum_unpadding = lengths_sum_unpadding
+        self.pad_size = pad_size
+
+        self.tp_size = get_tp_group().world_size
+        self.tp_rank_in_group = get_tp_group().rank_in_group
+
+        assert self.lengths_sum_padding % self.tp_size == 0
+        self.slice_size = self.lengths_sum_padding // self.tp_size
+
+        self.mc2_mask = torch.zeros(
+            self.lengths_sum_padding,
+            dtype=torch.bool,
+            device=NPUPlatform.device_type,
+        )
+        self.mc2_mask[:lengths_sum_unpadding] = True
+
+    def padding_aligned_reduce_scatter(self,
+                                       data: torch.Tensor) -> torch.Tensor:
+        if self.padding_flag:
+            pad_size = self.pad_size
+            padded_data = F.pad(data, (0, 0, 0, pad_size))
+        else:
+            padded_data = data
+        padded_data_reduce_scatter = tensor_model_parallel_reduce_scatter(
+            padded_data, 0)
+
+        return padded_data_reduce_scatter
+
+    def allgather_unpadding_aligned(self,
+                                    padded_data: torch.Tensor) -> torch.Tensor:
+        padded_data_allgather = tensor_model_parallel_all_gather(
+            padded_data, 0)
+        if self.padding_flag:
+            lengths_sum_unpadding = self.lengths_sum_unpadding
+            unpadding_data = padded_data_allgather[:lengths_sum_unpadding]
+        else:
+            unpadding_data = padded_data_allgather
+        return unpadding_data
+
+    def padding_slice(self, data: torch.Tensor) -> torch.Tensor:
+
+        padded_data = F.pad(data, (0, 0, 0, self.pad_size))
+        start = self.tp_rank_in_group * self.slice_size
+        end = start + self.slice_size
+        slice_data = padded_data[start:end]
+
+        return slice_data
+
+    def padding_aligned_scatter(self, data: torch.Tensor) -> torch.Tensor:
+        if self.padding_flag:
+            pad_size = self.pad_size
+            padded_data = F.pad(data, (0, 0, 0, pad_size))
+        else:
+            padded_data = data
+        # Split into tp_size chunks along dim 0 and keep this rank's chunk.
+        padded_data = torch.tensor_split(padded_data, self.tp_size, dim=0)
+
+        padded_data_reduce_scatter = padded_data[self.tp_rank_in_group]
+
+        return padded_data_reduce_scatter
+
+
+def init_metadata_for_sp(input_ids, enable_sequence_parallelism):
+    if not enable_sequence_parallelism:
+        return MetadataForPadding(padding_flag=False,
+                                  not_dummy_and_is_prefill=False)
+
+    is_perifll = 0
+    attn_metadata = get_forward_context().attn_metadata
+    tp_size = get_tensor_model_parallel_world_size()
+    if attn_metadata is not None:
+        if hasattr(attn_metadata,
+                   'is_only_prefill') and attn_metadata.is_only_prefill:
+            is_perifll = 1
+        if hasattr(attn_metadata,
+                   'num_prefills') and attn_metadata.num_prefills > 0:
+            is_perifll = 1
+
+        if is_perifll:
+            lengths_sum_unpadding = input_ids.shape[0]
+            lengths_sum_padding = (
+                (lengths_sum_unpadding + tp_size - 1) // tp_size) * tp_size
+            if lengths_sum_unpadding == lengths_sum_padding:
+                padding_flag = False
+            else:
+                padding_flag = True
+            pad_size = lengths_sum_padding - lengths_sum_unpadding
+            _metadata_for_padding = MetadataForPadding(
+                lengths_sum_unpadding=lengths_sum_unpadding,
+                lengths_sum_padding=lengths_sum_padding,
+                padding_flag=padding_flag,
+                pad_size=pad_size,
+                not_dummy_and_is_prefill=True)
+
+            return _metadata_for_padding
+
+    return MetadataForPadding(padding_flag=False,
+                              not_dummy_and_is_prefill=False)
--- a/vllm_ascend/torchair/ops/shared_weight_layer.py
+++ b/vllm_ascend/torchair/ops/shared_weight_layer.py
@@ -0,0 +1,245 @@
+from dataclasses import dataclass
+from typing import Callable, Optional
+
+import torch
+import torch.distributed as dist
+from vllm.distributed.parallel_state import GroupCoordinator
+from vllm.model_executor.layers.linear import LinearBase
+
+
+def dispose_tensor(x: torch.Tensor) -> None:  # Detach `x` from its current storage, in place.
+    x.set_(torch.empty([], device=x.device, dtype=x.dtype))  # `x` now aliases a fresh 0-dim tensor of the same device and dtype.
+
+
+@dataclass
+class LayerMetadata:
+    """Bookkeeping for one linear layer registered in a shared weight series.
+    """
+    layer: Optional[LinearBase]  # The layer object; None for a slot whose layer has not been registered yet.
+    post_method: Callable[[
+        torch.nn.Module
+    ], None]  # The `process_weights_after_loading` method from the quant method.
+    weight: torch.Tensor  # The weight tensor.
+    window_idx: int  # The index of the window; -1 until a window has been assigned.
+
+
+@dataclass
+class SharedWindowMetadata:
+    """One buffer of a series' shared window pool, plus what it currently holds.
+    """
+    weight: torch.Tensor  # The weight tensor to be shared by layers.
+    data_layer_idx: int  # The index of the layer this window's weight is equal to; -1 while it holds no layer's data.
+    work: Optional[torch.distributed.Work]  # The asynchronous broadcast work; None when no broadcast is pending.
+
+
+@dataclass
+class SeriesMetadata:
+    """State of one weight shared series: its registered layers, the rolling pool of shared windows and the prefetch cursor.
+    """
+    group: GroupCoordinator
+    start_layer: int  # Absolute index of the first layer in the series (inclusive).
+    end_layer: int  # Absolute index one past the last layer in the series (exclusive).
+    num_layers: int
+    prefetch_step: int
+    dummy_weight: torch.Tensor  # Dummy weight to replace the loaded weight matrix. All the layers in the series share the same dummy weight tensor.
+    layers: list[LayerMetadata]
+    shared_windows: list[
+        SharedWindowMetadata]  # Shared windows for prefetching. The window size is (`prefetch_step` + 1), as only the weights for the next (`prefetch_step` + 1) layers need to be stored.
+    window_offset: int  # The index of the window for the next coming layer.
+
+    def is_source(self, layer_idx) -> bool:
+        return layer_idx % self.group.world_size == self.group.rank_in_group  # Layers are assigned to ranks round-robin by absolute layer index.
+
+    def post_process_after_loading(self):
+        # This method only needs to be called once per series.
+        if self.shared_windows:
+            return
+        for layer_idx in range(self.start_layer, self.end_layer):
+            layer = self.layers[layer_idx - self.start_layer]
+            is_source = self.is_source(layer_idx)
+            # If the weight uses the dummy weight, make a temporary copy so that the post method call won't affect other layers which also use the dummy weight.
+            if not is_source:
+                layer.weight.set_(torch.empty_like(self.dummy_weight))
+            # Broadcast to get the true weight.
+            dist.broadcast(layer.weight,
+                           src=self.group.ranks[layer_idx %
+                                                self.group.world_size],
+                           group=self.group.device_group)
+            assert layer.layer is not None
+            # Call `process_weights_after_loading` from the quant method.
+            layer.post_method(layer.layer)
+            step = layer_idx - self.start_layer
+            if step < self.prefetch_step:
+                # Build the windows for the first `prefetch_step` layers. The weights can be used for the first `prefetch_step` layers in `forward()`, so also clone the weights.
+                self.shared_windows.append(
+                    SharedWindowMetadata(
+                        weight=layer.weight.clone().detach(),
+                        data_layer_idx=layer_idx,
+                        work=None,
+                    ))
+                layer.window_idx = step
+                # When the layer not intended to be stored in this device, link to the corresponding window's tensor.
+                if not is_source:
+                    layer.weight.set_(self.shared_windows[-1].weight)
+            else:
+                # Build one more window for prefetch. The weight is useless, so just keep the shape.
+                if step == self.prefetch_step:
+                    self.shared_windows.append(
+                        SharedWindowMetadata(
+                            weight=torch.empty_like(layer.weight),
+                            data_layer_idx=-1,
+                            work=None,
+                        ))
+                # When the layer not intended to be stored in this device, dispose the tensor.
+                if not is_source:
+                    dispose_tensor(layer.weight)
+
+        dispose_tensor(self.dummy_weight)
+
+    def reach_layer(self, layer_idx: int):
+        # The index of the layer to be prefetched: `prefetch_step` layers ahead, wrapping inside [start_layer, end_layer). `layer_idx` is absolute, so it is shifted by `start_layer` before the modulo.
+        next_layer_idx = (layer_idx - self.start_layer + self.prefetch_step
+                          ) % self.num_layers + self.start_layer
+        next_layer = self.layers[next_layer_idx - self.start_layer]
+        # The index of the window to store the weight for the coming layer.
+        next_layer.window_idx = self.window_offset
+        window = self.shared_windows[next_layer.window_idx]
+        # When the layer not intended to be stored in this device, link to the corresponding window's tensor.
+        if not self.is_source(next_layer_idx):
+            next_layer.weight.set_(window.weight)
+        # Update `window_offset` by rolling one step.
+        self.window_offset = (self.window_offset + 1) % (self.prefetch_step +
+                                                         1)
+        assert window.data_layer_idx != next_layer_idx
+        window.data_layer_idx = next_layer_idx
+        # Start asynchronous broadcast work.
+        window.work = dist.broadcast(
+            next_layer.weight,
+            src=self.group.ranks[next_layer_idx % self.group.world_size],
+            group=self.group.device_group,
+            async_op=True)
+
+    def wait_weight(self, layer_idx: int):
+        # Find the asynchronous broadcast work and wait for it.
+        assert self.shared_windows
+        window = self.shared_windows[self.layers[layer_idx -
+                                                 self.start_layer].window_idx]
+        # Make sure the data in the corresponding shared window is for the current layer.
+        assert window.data_layer_idx == layer_idx
+        if window.work is not None:
+            window.work.wait()
+            window.work = None
+
+
+@dataclass
+class LayerExternalMetadata:
+    """Lookup record that links a registered layer to its series and index.
+    """
+    series: SeriesMetadata  # The series the layer was registered to.
+    layer_idx: int  # The layer index given when the layer was registered.
+
+
+_series_dict: dict[str, SeriesMetadata] = {}  # Series metadata keyed by series name.
+
+_layer_external_dict: dict[int, LayerExternalMetadata] = {}  # Keyed by `id(layer)` of each registered layer.
+
+
+def _create_forward_wrapper(forward: Callable, series: SeriesMetadata,
+                            layer_idx: int) -> Callable:
+    """Return a callable that waits for this layer's weight, then forwards."""
+    def forward_after_wait(*args, **kwargs):
+        # Block until the pending broadcast for `layer_idx`, if any, is done.
+        series.wait_weight(layer_idx)
+        return forward(*args, **kwargs)
+
+    return forward_after_wait
+
+
+"""
+Register linear layers into a shared storage series.
+
+In a parallel group, each device stores a distinct, non-overlapping subset of layers from the series. All layers in a series must have the same structure (are isomorphic). The weight matrix for the i-th layer is stored on device (i % n), where n is the number of devices.
+
+After loading the model, you must call `post_process_after_loading_for_shared_weight_series(layer)` on any layer of this series to complete the initialization.
+
+During execution, each time a new layer is reached, you must call `reach_layer_for_shared_weight_series(layer)` for that layer to prefetch the weights. The argument `prefetch_step` is a non-negative integer k that manages asynchronous weight prefetching. Each call to `reach_layer_for_shared_weight_series(current_layer)` triggers an asynchronous prefetch of the weights of the k-th layer after `current_layer` within the series.
+
+Note: The layers are managed as a circular buffer. The index of the layer to prefetch is determined by the formula:
+- total_layers = end_layer - start_layer
+- prefetch_layer_idx = (layer_idx - start_layer + prefetch_step) % total_layers + start_layer
+
+To hold the weights for the current layer and the k prefetched layers, a pool of (k + 1) shared tensor buffers will be created for this series.
+
+Arguments:
+    series_name: This name identifies which series this layer belongs to.
+    group: The group coordinator for handling asynchronous communications. It is recommended to create a new group coordinator for each new series.
+    start_layer: The index of the first layer in the series (inclusive).
+    end_layer: The index of the last layer in the series (exclusive). Thus, the series includes all layers with indices in the range [start_layer, end_layer).
+    layer_idx: The index of the current layer.
+    layer: The linear layer object to register.
+    prefetch_step: A non-negative integer k, at most (end_layer - start_layer - 2), that manages asynchronous weight prefetching. Setting it to 0 or 1 can cover most cases.
+"""
+
+
+def register_layer_to_shared_weight_series(
+    series_name: str,
+    group: GroupCoordinator,
+    start_layer: int,
+    end_layer: int,
+    layer_idx: int,
+    layer: LinearBase,
+    prefetch_step: int = 1,
+):
+    """Register `layer` as layer `layer_idx` of series `series_name`, creating the series on its first registration."""
+    series = _series_dict.get(series_name)
+    if series is None:
+        num_layers = end_layer - start_layer
+        assert num_layers > 0
+        assert 0 <= prefetch_step <= num_layers - 2
+        placeholders = [
+            LayerMetadata(
+                layer=None,
+                post_method=lambda layer: None,
+                weight=torch.empty([]),
+                window_idx=-1,
+            ) for _ in range(num_layers)
+        ]
+        series = SeriesMetadata(
+            group=group,
+            start_layer=start_layer,
+            end_layer=end_layer,
+            num_layers=num_layers,
+            prefetch_step=prefetch_step,
+            dummy_weight=torch.empty_like(layer.weight),
+            layers=placeholders,
+            shared_windows=[],
+            window_offset=prefetch_step,
+        )
+        _series_dict[series_name] = series
+    quant_method = layer.quant_method
+    assert quant_method is not None
+    series.layers[layer_idx - start_layer] = LayerMetadata(
+        layer=layer,
+        post_method=quant_method.process_weights_after_loading,
+        weight=layer.weight,
+        window_idx=-1,
+    )
+    # The original `process_weights_after_loading` was captured above; replace it with a no-op so that nobody else runs it.
+    quant_method.process_weights_after_loading = lambda layer: None
+    # A layer that is not stored on this device gives up its tensor and ignores weight loading.
+    if not series.is_source(layer_idx):
+        dispose_tensor(layer.weight)
+        layer.weight.weight_loader = lambda *args, **kwargs: None
+    layer.forward = _create_forward_wrapper(layer.forward, series, layer_idx)
+    _layer_external_dict[id(layer)] = LayerExternalMetadata(series=series,
+                                                            layer_idx=layer_idx)
+
+
+def post_process_after_loading_for_shared_weight_series(layer: LinearBase):
+    """Run the post-loading step of the series that `layer` is registered in."""
+    _layer_external_dict[id(layer)].series.post_process_after_loading()
+
+
+def reach_layer_for_shared_weight_series(layer: LinearBase):
+    record = _layer_external_dict[id(layer)]  # Raises KeyError when `layer` was never registered.
+    record.series.reach_layer(record.layer_idx)
--- a/vllm_ascend/torchair/ops/torchair_activation.py
+++ b/vllm_ascend/torchair/ops/torchair_activation.py
@@ -0,0 +1,37 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+import torch
+
+
+def torchair_silu_and_mul_forward_oot(self, x: torch.Tensor) -> torch.Tensor:
+    """Torchair-mode forward of AscendSiluAndMul, implemented with `npu_swiglu`.
+
+    Unlike the original implementation, no operator from the torch.ops.vllm
+    class is used here: those operators only function in non-torchair modes,
+    and adding them back would cause the graph compilation to fail.
+    """
+
+    import torch_npu
+
+    from vllm_ascend.utils import is_310p
+
+    if not is_310p():
+        return torch_npu.npu_swiglu(x)
+    # On 310P the op runs on a float32 copy of the input and the result is
+    # cast to float16.
+    return torch_npu.npu_swiglu(x.to(torch.float32)).to(torch.float16)
--- a/vllm_ascend/torchair/ops/torchair_fused_moe.py
+++ b/vllm_ascend/torchair/ops/torchair_fused_moe.py
@@ -40,17 +40,18 @@ from vllm.model_executor.layers.quantization.base_config import \

 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.ascend_forward_context import FusedMoEState
-from vllm_ascend.distributed.communication_op import \
-    data_parallel_reduce_scatter
 from vllm_ascend.distributed.parallel_state import get_mc2_group
+from vllm_ascend.eplb.core.eplb_utils import (determine_default_expert_map,
+                                              determine_default_log2phy_map)
 from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
-from vllm_ascend.ops.sequence_parallel import MetadataForPadding
 from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod
+from vllm_ascend.torchair.ops.sequence_parallel import MetadataForPadding
 from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
 from vllm_ascend.utils import (AscendSocVersion, dispose_tensor,
                               get_all_reduce_merge_state,
                               get_ascend_soc_version,
-                               get_rm_router_logits_state, is_310p)
+                               get_rm_router_logits_state, is_310p,
+                               vllm_version_is)


 def torchair_fused_experts_with_mc2(
@@ -802,6 +803,7 @@ class TorchairAscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):

        ascend_config = get_ascend_config()
        self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
+        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp

        try:
            device_group = get_mc2_group().device_group
@@ -883,6 +885,8 @@ class TorchairAscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
            topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)

        fused_moe_state = get_forward_context().fused_moe_state
+        if self.enable_shared_expert_dp and fused_moe_state == FusedMoEState.MC2:
+            fused_moe_state = FusedMoEState.All2All

        if fused_moe_state == FusedMoEState.MC2:
            return torchair_fused_experts_with_mc2(
@@ -1013,45 +1017,70 @@ class TorchairAscendFusedMoE(FusedMoE):
            self.moe_parallel_config.ep_size, is_deepseek_v3_r1)

        ascend_config = get_ascend_config()
-        expert_map_path = ascend_config.expert_map_path
-        if expert_map_path and os.path.exists(expert_map_path):
-            # moe expert load balance
-            expert_load_balancer = ExpertLoadBalancer(expert_map_path,
-                                                      self.global_num_experts)
-            self.local_num_experts, self.expert_map = \
-                                expert_load_balancer.get_rank_placement_map(
-                                                self.moe_instance_id,
-                                                get_ep_group().rank_in_group)
-            self.log2phy = expert_load_balancer.get_rank_log2phy_map(
-                self.moe_instance_id,
-                get_ep_group().rank_in_group)
-            self.global_redundant_expert_num = \
-                        expert_load_balancer.get_global_redundant_expert_num()
+        self.dynamic_eplb = ascend_config.dynamic_eplb
+        self.expert_map_path = ascend_config.expert_map_path
+        self.global_redundant_expert_num = ascend_config.init_redundancy_expert
+        self.global_num_experts = num_experts + self.global_redundant_expert_num
+        # static eplb initializing with expert_map_path
+        if self.expert_map_path and os.path.exists(
+                self.expert_map_path) and os.access(self.expert_map_path,
+                                                    os.R_OK):
+            self.expert_load_balancer = ExpertLoadBalancer(
+                self.expert_map_path, self.global_num_experts)
+            self.local_num_experts, self.expert_map = (
+                self.expert_load_balancer.get_rank_placement_map(
+                    self.moe_instance_id, self.ep_rank))
+            self.log2phy = self.expert_load_balancer.get_rank_log2phy_map(
+                self.moe_instance_id, self.ep_rank).npu()
+            self.global_redundant_expert_num = (
+                self.expert_load_balancer.get_global_redundant_expert_num())
        else:
-            # Create a tensor of size num_experts filled with -1
+            # init moe.
            self.local_num_experts, self.expert_map = determine_expert_map(
-                self.ep_size,
-                get_ep_group().rank_in_group, self.global_num_experts)
+                self.ep_size, self.ep_rank, self.global_num_experts)
+            # dynamic eplb initializing with not expert_map_path
+            if self.dynamic_eplb:
+                self.global_redundant_expert_num = ascend_config.init_redundancy_expert
+                self.local_num_experts, self.expert_map = determine_default_expert_map(
+                    self.global_num_experts, self.ep_size, self.ep_rank,
+                    self.global_redundant_expert_num)
+                self.log2phy = determine_default_log2phy_map(
+                    self.global_num_experts, self.ep_size, self.ep_rank,
+                    self.global_redundant_expert_num)
+        local_num_experts = (torch.sum(self.expert_map != -1)
+                             if self.expert_map is not None else num_experts)
+        if self.dynamic_eplb:
+            self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64)

        self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
-        self.enable_multistream_moe = \
-            ascend_config.torchair_graph_config.enable_multistream_moe and \
+        self.multistream_overlap_shared_expert = \
+            ascend_config.multistream_overlap_shared_expert and \
            self.torchair_graph_enabled
        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp

        if self.scoring_func != "softmax" and not self.use_grouped_topk:
            raise ValueError("Only softmax scoring function is supported for "
                             "non-grouped topk.")
-        self.moe = FusedMoEConfig.make(
-            num_experts=self.global_num_experts,
-            experts_per_token=top_k,
-            hidden_dim=hidden_size,
-            num_local_experts=self.local_num_experts,
-            moe_parallel_config=self.moe_parallel_config,
-            # TODO (bnell): this needs to be fixed for quantized types.
-            in_dtype=params_dtype,
-            quant_config=quant_config)

+        if vllm_version_is("0.10.2"):
+            self.moe = FusedMoEConfig.make(
+                num_experts=self.global_num_experts,
+                experts_per_token=top_k,
+                hidden_dim=hidden_size,
+                num_local_experts=self.local_num_experts,
+                moe_parallel_config=self.moe_parallel_config,
+                # TODO (bnell): this needs to be fixed for quantized types.
+                in_dtype=params_dtype,
+                quant_config=quant_config)
+        else:
+            self.moe = FusedMoEConfig(
+                num_experts=self.global_num_experts,
+                experts_per_token=top_k,
+                hidden_dim=hidden_size,
+                num_local_experts=self.local_num_experts,
+                moe_parallel_config=self.moe_parallel_config,
+                in_dtype=params_dtype,
+            )
        if quant_config is None:
            self.quant_method = TorchairAscendUnquantizedFusedMoEMethod(
                self.moe)
@@ -1066,8 +1095,11 @@ class TorchairAscendFusedMoE(FusedMoE):

        assert self.quant_method is not None

-        local_num_experts = torch.sum(self.expert_map != -1) \
-            if self.expert_map is not None else num_experts
+        self.moe_load = None
+        local_num_experts = (torch.sum(self.expert_map != -1)
+                             if self.expert_map is not None else num_experts)
+        if self.dynamic_eplb:
+            self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64)

        moe_quant_params = {
            "num_experts": local_num_experts,
@@ -1126,23 +1158,25 @@ class TorchairAscendFusedMoE(FusedMoE):
        forward_context = get_forward_context()
        fused_moe_state = forward_context.fused_moe_state
        mc2_mask = forward_context.mc2_mask
+        if self.enable_shared_expert_dp and fused_moe_state == FusedMoEState.MC2:
+            fused_moe_state = FusedMoEState.All2All
        # For w8a8 dynamic we can do npu_dynamic_quant and gate in parallel.
        quantized_x_for_share, dynamic_scale_for_share = None, None
-        from vllm_ascend.quantization.w8a8_dynamic import \
-            AscendW8A8DynamicFusedMoEMethod
-        if self.enable_multistream_moe:
+        from vllm_ascend.torchair.quantization.torchair_w8a8_dynamic import \
+            TorchairAscendW8A8DynamicFusedMoEMethod
+        if self.multistream_overlap_shared_expert:
            if not self.rm_router_logits:
                router_logits, _ = gate(hidden_states)
            if hasattr(self.quant_method, "quant_method") and \
               isinstance(self.quant_method.quant_method,
-                          AscendW8A8DynamicFusedMoEMethod
+                          TorchairAscendW8A8DynamicFusedMoEMethod
                          ) and fused_moe_state == FusedMoEState.MC2:
                with npu_stream_switch("moe_secondary", 0):
                    quantized_x_for_share, dynamic_scale_for_share = torch_npu.npu_dynamic_quant(
                        hidden_states)

        if shared_experts:
-            if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2:
+            if not self.multistream_overlap_shared_expert or fused_moe_state != FusedMoEState.MC2:
                # When all_reduce_merge is in progress, shared_experts does not do all_reduce in mlp, but waits until shared_experts+router_experts are completed before doing all_reduce
                shared_hidden_states = shared_experts(hidden_states)

@@ -1160,31 +1194,33 @@ class TorchairAscendFusedMoE(FusedMoE):
        if (fused_moe_state not in [
                FusedMoEState.AllGather, FusedMoEState.AllGatherEP,
                FusedMoEState.NaiveMulticast
-        ] and not replace_allreduce):
-            if fused_moe_state in {FusedMoEState.MC2}:
-                padding_size = forward_context.padded_num_tokens
-            else:
-                # TODO: Determine if we can remove the padding
-                padding_size = tp_size
-            if num_tokens < padding_size and not self.enable_shared_expert_dp:
-                hidden_states = nn.functional.pad(
-                    hidden_states, (0, 0, 0, padding_size - num_tokens))
-                router_logits = nn.functional.pad(
-                    router_logits, (0, 0, 0, padding_size - num_tokens))
+        ]):
            if tp_size > 1:
                tp_rank = get_tensor_model_parallel_rank()
-                if not self.enable_shared_expert_dp:
-                    chunk_hidden_states = torch.tensor_split(hidden_states,
-                                                             tp_size,
-                                                             dim=0)
-                    chunk_router_logits = torch.tensor_split(router_logits,
-                                                             tp_size,
-                                                             dim=0)
-                    hidden_states = chunk_hidden_states[tp_rank]
-                    router_logits = chunk_router_logits[tp_rank]
-
                chunk_mc2_mask = torch.tensor_split(mc2_mask, tp_size, dim=0)
                mc2_mask = chunk_mc2_mask[tp_rank]
+            if not replace_allreduce:
+                if fused_moe_state in {FusedMoEState.MC2}:
+                    padding_size = forward_context.padded_num_tokens
+                else:
+                    # TODO: Determine if we can remove the padding
+                    padding_size = tp_size
+                if num_tokens < padding_size and not self.enable_shared_expert_dp:
+                    hidden_states = nn.functional.pad(
+                        hidden_states, (0, 0, 0, padding_size - num_tokens))
+                    router_logits = nn.functional.pad(
+                        router_logits, (0, 0, 0, padding_size - num_tokens))
+                if tp_size > 1:
+                    tp_rank = get_tensor_model_parallel_rank()
+                    if not self.enable_shared_expert_dp:
+                        chunk_hidden_states = torch.tensor_split(hidden_states,
+                                                                 tp_size,
+                                                                 dim=0)
+                        chunk_router_logits = torch.tensor_split(router_logits,
+                                                                 tp_size,
+                                                                 dim=0)
+                        hidden_states = chunk_hidden_states[tp_rank]
+                        router_logits = chunk_router_logits[tp_rank]

        if self.dp_size > 1:
            if fused_moe_state == FusedMoEState.AllGather:
@@ -1206,8 +1242,12 @@ class TorchairAscendFusedMoE(FusedMoE):
                    router_logits = get_dp_group().all_gather(router_logits, 0)

            elif fused_moe_state == FusedMoEState.NaiveMulticast:
-                cu_tokens_across_dp_cpu = get_forward_context(
-                ).dp_metadata.cu_tokens_across_dp_cpu
+                if vllm_version_is("0.10.2"):
+                    cu_tokens_across_dp_cpu = get_forward_context(
+                    ).dp_metadata.cu_tokens_across_dp_cpu
+                else:
+                    cu_tokens_across_dp_cpu = get_forward_context(
+                    ).dp_metadata.cu_tokens_across_sp(1)
                hidden_states = self.naive_multicast(hidden_states,
                                                     cu_tokens_across_dp_cpu)
                if self.rm_router_logits:
@@ -1236,7 +1276,8 @@ class TorchairAscendFusedMoE(FusedMoE):
            log2phy=self.log2phy,
            global_redundant_expert_num=self.global_redundant_expert_num,
            shared_experts=shared_experts if self.torchair_graph_enabled
-            and self.enable_multistream_moe and not is_prefill else None,
+            and self.multistream_overlap_shared_expert and not is_prefill else
+            None,
            mc2_mask=mc2_mask,
            quantized_x_for_share=quantized_x_for_share,
            dynamic_scale_for_share=dynamic_scale_for_share,
@@ -1246,6 +1287,11 @@ class TorchairAscendFusedMoE(FusedMoE):
            if isinstance(e_hidden_states, tuple):
                e_hidden_states, shared_hidden_states = e_hidden_states

+        if self.dynamic_eplb and isinstance(
+                e_hidden_states, tuple) and len(e_hidden_states) == 3:
+            self.moe_load += e_hidden_states[2] if e_hidden_states[1] == 0 else \
+                torch.cat(e_hidden_states[2][:1], e_hidden_states[2][1:] - e_hidden_states[2][:-1])
+
        if (fused_moe_state not in [
                FusedMoEState.AllGather, FusedMoEState.AllGatherEP,
                FusedMoEState.NaiveMulticast
@@ -1269,8 +1315,8 @@ class TorchairAscendFusedMoE(FusedMoE):
                final_hidden_states = final_hidden_states[start:end, :]
                dispose_tensor(e_hidden_states)
            elif fused_moe_state == FusedMoEState.AllGather:
-                final_hidden_states = data_parallel_reduce_scatter(
-                    e_hidden_states, dim=0)
+                final_hidden_states = get_dp_group().reduce_scatter(
+                    e_hidden_states, 0)
                final_hidden_states = final_hidden_states[:num_tokens]
                dispose_tensor(e_hidden_states)
            else:
@@ -1290,6 +1336,19 @@ class TorchairAscendFusedMoE(FusedMoE):
        else:
            return final_hidden_states

+    def update_expert_map(self, new_expert_map) -> None:
+        self.expert_map = new_expert_map  # Rebinds the attribute; the given object is stored as-is, not copied.
+
+    def get_map(self):
+        return self.expert_map  # The stored object itself, not a copy.
+
+    def get_log2phy_map(self):
+        return self.log2phy  # The map this class stores in `__init__` and passes as `log2phy=` in forward.
+
+    def clear_moe_load(self) -> None:
+        if self.moe_load is not None:  # `moe_load` stays None unless `dynamic_eplb` is set.
+            self.moe_load.zero_()  # Reset the accumulated load counters in place.
+
    # ----------------------------------------- TBO-related --------------------------------------------

    def _forward_ms_fused_moe_comp(
--- a/vllm_ascend/torchair/ops/torchair_layernorm.py
+++ b/vllm_ascend/torchair/ops/torchair_layernorm.py
@@ -0,0 +1,51 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+from typing import Optional, Tuple, Union
+
+import torch
+
+
+def torchair_rmsnorm_forward_oot(
+    self,
+    x: torch.Tensor,
+    residual: Optional[torch.Tensor] = None,
+) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+    """Torchair-mode forward of AscendRMSNorm.
+
+    Returns the normalized tensor, or `(normalized, residual)` when a residual
+    is given. No operator from the torch.ops.vllm class is used, as these only
+    function in non-torchair modes and would make graph compilation fail.
+    """
+
+    import torch_npu
+
+    from vllm_ascend.utils import is_310p
+    if residual is None:
+        normed, _ = torch_npu.npu_rms_norm(x, self.weight,
+                                           self.variance_epsilon)
+        return normed
+
+    if not is_310p():
+        x, _, residual = torch_npu.npu_add_rms_norm(
+            x, residual, self.weight, self.variance_epsilon)
+        return x, residual
+    # 310P: add the residual explicitly, then apply the plain RMSNorm op.
+    summed = x + residual.to(x.dtype)
+    new_residual = summed.to(residual.dtype)
+    x, _ = torch_npu.npu_rms_norm(summed, self.weight, self.variance_epsilon)
+    return x, new_residual
--- a/vllm_ascend/torchair/ops/torchair_rotary_embedding.py
+++ b/vllm_ascend/torchair/ops/torchair_rotary_embedding.py
@@ -62,7 +62,7 @@ def rope_forward_oot(
    # adopt custom kernel path for rotary_embedding
    if custom_rotary_embedding_enabled(query, neox_style,
                                       self.head_size) and not is_310p():
-        query, key = torch.ops._C.rotary_embedding(
+        query, key = torch.ops._C_ascend.rotary_embedding(
            positions,
            query,
            key,
@@ -93,10 +93,7 @@ def native_rope_deepseek_forward(self,
                                 positions: torch.Tensor,
                                 query: torch.Tensor,
                                 key: torch.Tensor,
-                                 offsets: Optional[torch.Tensor] = None,
-                                 max_seq_len: Optional[int] = None):
-    if max_seq_len is not None and max_seq_len > self.max_seq_len:
-        _set_cos_sin_cache(self, max_seq_len, query.device, query.dtype)
+                                 offsets: Optional[torch.Tensor] = None):
    if len(key.shape) == 2:
        key = key[:, None, :]
    # Note: we implement the non neox_style method with shuffle the last dim and neox style
@@ -211,8 +208,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    return q_embed, k_embed


-def _set_cos_sin_cache(self, seq_len, device, dtype):
-    self.max_seq_len_cached = seq_len
+def _set_cos_sin_cache(self, max_seq_len, device, dtype):
    dim = self.rotary_dim

    freq_extra = 1.0 / (self.base**(
@@ -232,9 +228,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
    inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask
    self.register_buffer("inv_freq", inv_freq, persistent=False)

-    t = torch.arange(seq_len * self.scaling_factor,
-                     device=device,
-                     dtype=torch.float32)
+    t = torch.arange(max_seq_len, device=device, dtype=torch.float32)

    freqs = torch.outer(t, inv_freq)
    cos_cached = torch.cat([freqs, freqs], dim=-1).cos() * self.mscale
@@ -365,8 +359,7 @@ def deepseek_rope_init_func(
    super(DeepseekScalingRotaryEmbedding,
          self).__init__(head_size, rotary_dim, max_position_embeddings, base,
                         is_neox_style, dtype)
-    self.max_seq_len = max_position_embeddings
-    _set_cos_sin_cache(self,
-                       max_position_embeddings,
-                       dtype=dtype,
-                       device="npu")
+
+    # NOTE: For ascend friendly computing, reorder sin and cos cache
+    self.max_seq_len = math.ceil(max_position_embeddings * scaling_factor)
+    _set_cos_sin_cache(self, self.max_seq_len, dtype=dtype, device="npu")
--- a/vllm_ascend/torchair/quantization/torchair_quantizer.py
+++ b/vllm_ascend/torchair/quantization/torchair_quantizer.py
@@ -1,29 +0,0 @@
-from vllm_ascend.quantization.quantizer import VLLMAscendQuantizer
-from vllm_ascend.torchair.quantization.torchair_w4a8_dynamic import (
-    TorchairAscendW4A8DynamicFusedMoEMethod,
-    TorchairAscendW4A8DynamicLinearMethod)
-from vllm_ascend.torchair.quantization.torchair_w8a8_dynamic import (
-    TorchairAscendW8A8DynamicFusedMoEMethod,
-    TorchairAscendW8A8DynamicLinearMethod)
-
-
-class TorchairW8A8DYNAMICQuantizer(VLLMAscendQuantizer):
-
-    @staticmethod
-    def build_linear_method():
-        return TorchairAscendW8A8DynamicLinearMethod()
-
-    @staticmethod
-    def build_moe_method():
-        return TorchairAscendW8A8DynamicFusedMoEMethod()
-
-
-class TorchairW4A8DYNAMICQuantizer(VLLMAscendQuantizer):
-
-    @staticmethod
-    def build_linear_method():
-        return TorchairAscendW4A8DynamicLinearMethod()
-
-    @staticmethod
-    def build_moe_method():
-        return TorchairAscendW4A8DynamicFusedMoEMethod()
--- a/vllm_ascend/torchair/quantization/torchair_w4a8_dynamic.py
+++ b/vllm_ascend/torchair/quantization/torchair_w4a8_dynamic.py
@@ -139,6 +139,8 @@ class TorchairAscendW4A8DynamicFusedMoEMethod:
        vllm_config = get_current_vllm_config()
        self.group_size = vllm_config.quant_config.quant_description.get(
            "group_size", 256)
+        # NOTE: the weights are quantized from bf16 to int4 through a per-channel quantization process
+        self.is_per_channel_weight = self.group_size == 0
        quant_version = vllm_config.quant_config.quant_description.get(
            "version", "0")
        # NOTE: new quantize weights: 2 int4 pack into int8
@@ -188,44 +190,45 @@ class TorchairAscendW4A8DynamicFusedMoEMethod:
            num_experts,
            2 * intermediate_size_per_partition,
            1,
-            dtype=params_dtype)
+            dtype=torch.float32)

        param_dict["w13_weight_offset"] = torch.empty(
            num_experts,
            2 * intermediate_size_per_partition,
            1,
-            dtype=params_dtype)
-
-        param_dict["w13_weight_scale_second"] = torch.empty(
-            num_experts,
-            2 * intermediate_size_per_partition,
-            hidden_sizes // self.group_size,
-            dtype=params_dtype)
-
-        param_dict["w13_weight_offset_second"] = torch.empty(
-            num_experts,
-            2 * intermediate_size_per_partition,
-            hidden_sizes // self.group_size,
-            dtype=params_dtype)
+            dtype=torch.float32)

        param_dict["w2_weight_scale"] = torch.empty(num_experts,
                                                    hidden_sizes,
                                                    1,
-                                                    dtype=params_dtype)
+                                                    dtype=torch.float32)
        param_dict["w2_weight_offset"] = torch.empty(num_experts,
                                                     hidden_sizes,
                                                     1,
-                                                     dtype=params_dtype)
-        param_dict["w2_weight_scale_second"] = torch.empty(
-            num_experts,
-            hidden_sizes,
-            intermediate_size_per_partition // self.group_size,
-            dtype=params_dtype)
-        param_dict["w2_weight_offset_second"] = torch.empty(
-            num_experts,
-            hidden_sizes,
-            intermediate_size_per_partition // self.group_size,
-            dtype=params_dtype)
+                                                     dtype=torch.float32)
+
+        if not self.is_per_channel_weight:
+            param_dict["w13_weight_scale_second"] = torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_sizes // self.group_size,
+                dtype=torch.float32)
+            param_dict["w13_weight_offset_second"] = torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_sizes // self.group_size,
+                dtype=torch.float32)
+
+            param_dict["w2_weight_scale_second"] = torch.empty(
+                num_experts,
+                hidden_sizes,
+                intermediate_size_per_partition // self.group_size,
+                dtype=torch.float32)
+            param_dict["w2_weight_offset_second"] = torch.empty(
+                num_experts,
+                hidden_sizes,
+                intermediate_size_per_partition // self.group_size,
+                dtype=torch.float32)

        if self.new_quant_version:
            param_dict["w13_scale_bias"] = torch.empty(
@@ -318,8 +321,8 @@ class TorchairAscendW4A8DynamicFusedMoEMethod:
                hidden_states=x,
                w1=layer.w13_weight,
                w2=layer.w2_weight,
-                w1_scale=layer.w13_weight_scale_second,
-                w2_scale=layer.w2_weight_scale_second,
+                w1_scale=layer.w13_weight_scale,
+                w2_scale=layer.w2_weight_scale,
                w1_scale_bias=layer.w13_scale_bias,
                w2_scale_bias=layer.w2_scale_bias,
                topk_weights=topk_weights,
@@ -343,8 +346,8 @@ class TorchairAscendW4A8DynamicFusedMoEMethod:
                hidden_states=x,
                w1=layer.w13_weight,
                w2=layer.w2_weight,
-                w1_scale=layer.w13_weight_scale_second,
-                w2_scale=layer.w2_weight_scale_second,
+                w1_scale=layer.w13_weight_scale,
+                w2_scale=layer.w2_weight_scale,
                w1_scale_bias=layer.w13_scale_bias,
                w2_scale_bias=layer.w2_scale_bias,
                topk_weights=topk_weights,
@@ -357,6 +360,14 @@ class TorchairAscendW4A8DynamicFusedMoEMethod:
            )

    def process_scale(self, weight: torch.Tensor, scale, per_group_scale):
+        scale = scale.transpose(1, 2).contiguous()
+        if self.is_per_channel_weight:
+            scale_np = scale.cpu().numpy()
+            scale_np.dtype = np.uint32
+            scale_uint64_tensor = torch.from_numpy(scale_np.astype(
+                np.int64)).npu()
+            return scale_uint64_tensor, None
+        per_group_scale = per_group_scale.transpose(1, 2).contiguous()
        group_num, k, n = weight.shape
        # the weight of the new version is reduced by half by pack n, so it needs to be restored
        if self.new_quant_version:
@@ -399,13 +410,10 @@ class TorchairAscendW4A8DynamicFusedMoEMethod:

    def pack_to_int32(self, weight: torch.Tensor):
        if self.new_quant_version:
-            group_num, k, n = weight.shape
-            assert n % 4 == 0, "the last dim of weight needs to be divided by 4"
-            packed_n = n // 4
            # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4
-            packed_weight = torch.from_numpy(
-                np.frombuffer(weight.cpu().numpy().tobytes(), dtype=np.int32))
-            return packed_weight.reshape(group_num, k, packed_n).npu()
+            assert weight.shape[
+                -1] % 4 == 0, "the last dim of weight needs to be divided by 4"
+            return weight.view(torch.int32).contiguous()
        else:
            return torch_npu.npu_quantize(weight.to(torch.float32),
                                          torch.tensor([1.]).npu(), None,
@@ -417,21 +425,22 @@ class TorchairAscendW4A8DynamicFusedMoEMethod:
                1, 2).contiguous()
            layer.w2_weight.data = layer.w2_weight.data.transpose(
                1, 2).contiguous()
-        layer.w13_weight_scale.data = layer.w13_weight_scale.data.transpose(
-            1, 2).contiguous()
-        layer.w2_weight_scale.data = layer.w2_weight_scale.data.transpose(
-            1, 2).contiguous()
-        layer.w13_weight_scale_second.data = layer.w13_weight_scale_second.data.transpose(
-            1, 2).contiguous()
-        layer.w2_weight_scale_second.data = layer.w2_weight_scale_second.data.transpose(
-            1, 2).contiguous()
-
-        layer.w13_weight_scale_second.data, w13_bias = self.process_scale(
+        w13_weight_scale_second = layer.w13_weight_scale_second.data if hasattr(
+            layer, "w13_weight_scale_second") else None
+        w2_weight_scale_second = layer.w2_weight_scale_second.data if hasattr(
+            layer, "w2_weight_scale_second") else None
+        layer.w13_weight_scale.data, w13_bias = self.process_scale(
            layer.w13_weight, layer.w13_weight_scale.data,
-            layer.w13_weight_scale_second.data)
-        layer.w2_weight_scale_second.data, w2_bias = self.process_scale(
+            w13_weight_scale_second)
+        layer.w2_weight_scale.data, w2_bias = self.process_scale(
            layer.w2_weight, layer.w2_weight_scale.data,
-            layer.w2_weight_scale_second.data)
+            w2_weight_scale_second)
+        if hasattr(layer, "w13_weight_scale_second"):
+            # scale_second is no longer used, release this part of the memory
+            del layer.w13_weight_scale_second
+            del layer.w2_weight_scale_second
+            del layer.w13_weight_offset_second
+            del layer.w2_weight_offset_second

        self.update_bias(layer, w13_bias, w2_bias)

--- a/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py
+++ b/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py
@@ -23,7 +23,6 @@ import torch_npu
 from vllm.distributed import GroupCoordinator, get_ep_group
 from vllm.forward_context import get_forward_context

-import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.ascend_forward_context import FusedMoEState
 from vllm_ascend.distributed.parallel_state import get_mc2_group
@@ -417,6 +416,7 @@ def torchair_fused_experts_with_all2all(
    num_experts = w1.shape[0]

    if expert_map is not None:
+        assert ep_group is not None, "ep_group must be provided when expert_map is given"
        global_num_experts = len(expert_map) + global_redundant_expert_num
        if hasattr(torch_npu, "npu_moe_init_routing_quant"):
            quantized_tokens, expanded_row_idx, global_expert_tokens, _, token_scales = torch_npu.npu_moe_init_routing_quant(
@@ -436,8 +436,9 @@ def torchair_fused_experts_with_all2all(

        gather_sizes = global_expert_tokens.new_empty(
            global_expert_tokens.shape[0])
-        dist.all_to_all_single(gather_sizes, global_expert_tokens)
-
+        dist.all_to_all_single(gather_sizes,
+                               global_expert_tokens,
+                               group=ep_group.device_group)
        token_counts_combined = torch.stack(
            [gather_sizes, global_expert_tokens], dim=0)
        token_counts_combined = token_counts_combined.view(
@@ -452,10 +453,16 @@ def torchair_fused_experts_with_all2all(
        gather_size_list = token_counts_combined_cpu[1]
        scatter_size_list = token_counts_combined_cpu[0]

-        dist.all_to_all_single(gathered_tokens, quantized_tokens,
-                               scatter_size_list, gather_size_list)
-        dist.all_to_all_single(dynamic_scale, token_scales, scatter_size_list,
-                               gather_size_list)
+        dist.all_to_all_single(gathered_tokens,
+                               quantized_tokens,
+                               scatter_size_list,
+                               gather_size_list,
+                               group=ep_group.device_group)
+        dist.all_to_all_single(dynamic_scale,
+                               token_scales,
+                               scatter_size_list,
+                               gather_size_list,
+                               group=ep_group.device_group)

        hidden_states, dynamic_scale, inverse_indices, expert_tokens = torch_npu.npu_moe_re_routing(
            gathered_tokens,
@@ -503,9 +510,11 @@ def torchair_fused_experts_with_all2all(
            index=inverse_indices.to(torch.float32).argsort().to(torch.int32))

        hidden_states = reordered_outputs.new_empty(*quantized_tokens.shape)
-        dist.all_to_all_single(hidden_states, reordered_outputs,
-                               gather_size_list, scatter_size_list)
-
+        dist.all_to_all_single(hidden_states,
+                               reordered_outputs,
+                               gather_size_list,
+                               scatter_size_list,
+                               group=ep_group.device_group)
        final_hidden_states = torch_npu.npu_moe_finalize_routing(
            hidden_states,
            skip1=None,
@@ -824,6 +833,7 @@ class TorchairAscendW8A8DynamicFusedMoEMethod:

        ascend_config = get_ascend_config()
        self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
+        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp

        try:
            device_group = get_mc2_group().device_group
@@ -937,6 +947,8 @@ class TorchairAscendW8A8DynamicFusedMoEMethod:
            )

        fused_moe_state = get_forward_context().fused_moe_state
+        if self.enable_shared_expert_dp and fused_moe_state == FusedMoEState.MC2:
+            fused_moe_state = FusedMoEState.All2All
        shared_gate_up, shared_dequant_scale = None, None
        if shared_experts is not None and fused_moe_state == FusedMoEState.MC2:
            with npu_stream_switch("moe_secondary", 0):
@@ -1021,8 +1033,7 @@ class TorchairAscendW8A8DynamicFusedMoEMethod:
                1, 2).contiguous()
            layer.w2_weight.data = layer.w2_weight.data.transpose(
                1, 2).contiguous()
-        if envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP:
-            torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ)
+        torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ)
        layer.w13_weight_scale.data = layer.w13_weight_scale.data.view(
            layer.w13_weight_scale.data.shape[0], -1)
        layer.w13_weight_scale_fp32 = layer.w13_weight_scale.data.to(
--- a/vllm_ascend/torchair/torchair_attention.py
+++ b/vllm_ascend/torchair/torchair_attention.py
@@ -98,10 +98,12 @@ class AscendAttentionTorchairMetadataBuilder(AscendAttentionMetadataBuilder):

    def __init__(
        self,
+        kv_cache_spec,
+        layer_names,
        vllm_config: VllmConfig,
        device: torch.device,
    ):
-        super().__init__(vllm_config, device)
+        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
        self.max_num_blocks_per_req = cdiv(
            self.model_config.max_model_len,
            self.vllm_config.cache_config.block_size)
@@ -171,8 +173,9 @@ class AscendAttentionTorchairMetadataBuilder(AscendAttentionMetadataBuilder):

    def build(
        self,
+        common_prefix_len: int,
        common_attn_metadata: AscendCommonAttentionMetadata,
-        model: nn.Module,
+        model: Optional[nn.Module] = None,
    ):
        num_reqs = common_attn_metadata.num_reqs
        num_actual_tokens = common_attn_metadata.num_actual_tokens
@@ -182,11 +185,7 @@ class AscendAttentionTorchairMetadataBuilder(AscendAttentionMetadataBuilder):
            block_table[:num_reqs])

        seq_lens = common_attn_metadata.seq_lens_cpu[:num_reqs]
-        slot_mapping = common_attn_metadata.slot_mapping_cpu[:
-                                                             num_actual_tokens].to(
-                                                                 self.device,
-                                                                 non_blocking=
-                                                                 True)
+        slot_mapping = common_attn_metadata.slot_mapping[:num_actual_tokens]
        attn_mask = common_attn_metadata.attn_mask

        attn_state = common_attn_metadata.attn_state
@@ -374,6 +373,9 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
            indices = torch.cat((block_indices, slots_indices), dim=1)
            torch_npu.npu_scatter_nd_update_(key_cache, indices, key)
            torch_npu.npu_scatter_nd_update_(value_cache, indices, value)
+            if attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit:
+                self.key_cache = key_cache
+                self.value_cache = value_cache

        if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
            assert attn_metadata is not None
@@ -411,11 +413,13 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
            assert attn_metadata is not None
            assert attn_metadata.attn_mask is not None
            compress_mask = attn_metadata.attn_mask
+            batch_size = attn_metadata.query_lens.shape[0]
+            block_table = attn_metadata.block_tables[:batch_size, :]
            torch_npu._npu_flash_attention_qlens(
                query=query,
                key_cache=self.key_cache,
                value_cache=self.value_cache,
-                block_table=attn_metadata.block_tables,
+                block_table=block_table,
                mask=compress_mask,
                seq_len=attn_metadata.query_lens,
                context_lens=attn_metadata.seq_lens,
@@ -431,17 +435,24 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
            block_size = key_cache.shape[1]
            query = query.view(num_tokens, 1,
                               self.num_heads * self.head_size).contiguous()
-            output = torch_npu.npu_incre_flash_attention(
-                query,
-                key_cache,
-                value_cache,
-                num_key_value_heads=self.num_kv_heads,
+            output, _ = torch_npu.npu_fused_infer_attention_score(
+                query=query,
+                key=key_cache,
+                value=value_cache,
+                query_rope=None,
+                key_rope=None,
                num_heads=self.num_heads,
-                actual_seq_lengths=seq_lens,
-                scale_value=self.scale,
-                block_table=block_table,
+                num_key_value_heads=self.num_kv_heads,
                input_layout='BSH',
-                block_size=block_size)
+                atten_mask=decode_meta.attn_mask,
+                sparse_mode=0,
+                scale=self.scale,
+                antiquant_mode=0,
+                antiquant_scale=None,
+                block_table=block_table,
+                block_size=block_size,
+                actual_seq_lengths_kv=seq_lens,
+            )
        else:
            raise NotImplementedError(
                "Torchair graph mode with non-MLA attention backend is still experimental."
--- a/vllm_ascend/torchair/torchair_mla.py
+++ b/vllm_ascend/torchair/torchair_mla.py
@@ -23,7 +23,6 @@ from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
 from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
 from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
-from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
 from vllm_ascend.torchair.utils import (TorchairCommonAttentionMetadata,
                                        npu_stream_switch, npu_wait_tensor)
 from vllm_ascend.utils import npu_prefetch
@@ -176,6 +175,8 @@ class AscendMLATorchairMetadataBuilder:

    # _attn_mask_builder = None
    def __init__(self,
+                 kv_cache_spec,
+                 layer_names,
                 vllm_config: VllmConfig,
                 device: torch.device,
                 metadata_cls: Optional[AscendMLATorchairMetadata] = None):
@@ -372,6 +373,7 @@ class AscendMLATorchairMetadataBuilder:

    def build(
        self,
+        common_prefix_len: int,
        common_attn_metadata: AscendCommonAttentionMetadata,
        model: nn.Module,
    ) -> AscendMLATorchairMetadata:
@@ -398,11 +400,7 @@ class AscendMLATorchairMetadataBuilder:
        device = self.device

        block_table = (common_attn_metadata.block_table_tensor[:num_reqs])
-        slot_mapping = common_attn_metadata.slot_mapping_cpu[:
-                                                             num_actual_tokens].to(
-                                                                 device,
-                                                                 non_blocking=
-                                                                 True)
+        slot_mapping = common_attn_metadata.slot_mapping[:num_actual_tokens]
        input_positions = common_attn_metadata.positions[:
                                                         num_actual_tokens].long(
                                                         )
@@ -492,11 +490,12 @@ class AscendMLATorchairMetadataBuilder:
        graph_pad_size = common_attn_metadata.graph_pad_size
        use_torchair_graph = graph_pad_size != -1
        if num_decodes > 0:
+            # Notice that num_decodes != num_decode_tokens in SpecDecoding Scenario
            actual_seq_lengths_q = query_start_loc[1:num_decodes + 1].tolist()
            max_seq_lens = seq_lens[:num_decodes].max().item()
-            seq_lens = seq_lens[:num_decode_tokens]
+            seq_lens = seq_lens[:num_decodes]
            input_positions = input_positions[:num_decode_tokens]
-            block_table = block_table[:num_decode_tokens, ...]
+            block_table = block_table[:num_decodes, ...]
            num_token_pad_size = 0
            if use_torchair_graph and common_attn_metadata.attn_state in [
                    AscendAttentionState.DecodeOnly,
@@ -535,10 +534,9 @@ class AscendMLATorchairMetadataBuilder:
                                               device=input_positions.device)
                input_positions = torch.cat(
                    [input_positions, position_padding])
-                actual_seq_lengths_q = (
-                    actual_seq_lengths_q + common_attn_metadata.
-                    actual_seq_lengths_q[num_reqs:num_reqs +
-                                         num_reqs_pad_size])
+                actual_seq_lengths_q = self.pad_actual_seq_len_q(
+                    num_reqs_pad_size, num_reqs, actual_seq_lengths_q,
+                    common_attn_metadata)
            else:
                seq_lens_list = seq_lens.tolist()
            # mtp torchair + PD scenario, last element of actual_seq_lengths_q must equal to batch_size(num_tokens)
@@ -581,6 +579,48 @@ class AscendMLATorchairMetadataBuilder:
            enable_dbo_across_dp=common_attn_metadata.enable_dbo_across_dp,
        )

+    def pad_actual_seq_len_q(self, num_reqs_pad_size, num_reqs,
+                             actual_seq_lengths_q, common_attn_metadata):
+        """
+        Append the cumulative query lengths of the padded requests to actual_seq_lengths_q,
+        spreading them evenly when needed to meet the requirement of npu_fused_infer_attention_score.
+
+        In the Torchair scenario, the lengths of the queries must be padded to the same length,
+        and npu_fused_infer_attention_score requires the last element to equal batch_size (num_tokens).
+
+        For example:
+        batch_size=36, num_reqs_pad_size=2, num_reqs=16
+        By default, each request infers 2 tokens, which means actual_seq_lengths_q should be
+        [2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36].
+
+        However, in the mtp torchair + PD scenario, actual_seq_lengths_q may be
+        [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16] before padding, since the first decode request only has 1 token.
+        When the first padded entry is more than 16 tokens past the last real one, the padded entries are spaced evenly up to the padded total.
+        Assuming the padded entries come from the default list above, the result is [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,26,36].
+        """
+        FIA_SEQ_LEN_LIMIT = 16  # largest gap tolerated before the padded entries are re-spaced
+        need_padding = num_reqs_pad_size != 0 and \
+            len(common_attn_metadata.actual_seq_lengths_q) > num_reqs and \
+            common_attn_metadata.actual_seq_lengths_q[num_reqs] - actual_seq_lengths_q[-1] > FIA_SEQ_LEN_LIMIT
+        if need_padding:
+            padding_seq_len_q = common_attn_metadata.actual_seq_lengths_q[
+                num_reqs:num_reqs + num_reqs_pad_size]
+            start_val = actual_seq_lengths_q[-1]
+            end_val = padding_seq_len_q[-1]
+            # Evenly spaced values from start_val (excluded) to end_val (included), rounded to ints.
+            num_step = len(padding_seq_len_q)
+            interpolated = np.round(
+                np.linspace(start_val, end_val,
+                            num_step + 1)[1:]).astype(int).tolist()
+            assert interpolated[-1] == end_val
+            assert len(interpolated) == len(padding_seq_len_q)
+            actual_seq_lengths_q = actual_seq_lengths_q + interpolated
+        else:
+            actual_seq_lengths_q = actual_seq_lengths_q + common_attn_metadata.actual_seq_lengths_q[
+                num_reqs:num_reqs + num_reqs_pad_size]
+
+        return actual_seq_lengths_q
+

 class AscendMLATorchairImpl(MLAAttentionImpl):
    """
@@ -629,12 +669,10 @@ class AscendMLATorchairImpl(MLAAttentionImpl):
        self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz
        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
        self.running_in_graph = False
+        self.prefill_mask = None
+        self.ring_mla_mask_size = 512

-        # Adapt torch air graph mode with spec decoding.
-        speculative_config = get_current_vllm_config().speculative_config
-        if speculative_config is not None:
-            self.spec_token_num = speculative_config.num_speculative_tokens
-            assert self.spec_token_num > 0
+        self.speculative_config = get_current_vllm_config().speculative_config

    def _v_up_proj_and_o_proj(self, x, enable_multistream_mla: bool = False):
        # Convert from (B, N, L) to (N, B, L)
@@ -775,16 +813,13 @@ class AscendMLATorchairImpl(MLAAttentionImpl):
            k_nope, v = kv_nope\
                .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
            k_pe = k_pe.expand((*k_nope.shape[:-1], -1))
-            mask = torch.triu(
-                torch.ones(512, 512, device=query.device, dtype=query.dtype),
-                1)
            torch_npu.atb.npu_ring_mla(
                q_nope=q_nope,
                q_rope=q_pe,
                k_nope=k_nope,
                k_rope=k_pe,
                value=v,
-                mask=mask,
+                mask=self.prefill_mask,
                seqlen=seq_len,
                head_num=self.num_heads,
                kv_head_num=self.num_heads,
@@ -816,104 +851,54 @@ class AscendMLATorchairImpl(MLAAttentionImpl):
                                  self.v_head_dim,
                                  dtype=query.dtype,
                                  device=query.device)
+        attn_lse = torch.empty(self.num_heads,
+                               num_tokens,
+                               dtype=torch.float32,
+                               device=query.device)
        k_nope, value = self.kv_b_proj(kv_c_normed)[0].view(
            -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim).split(
                [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
        k_pe = k_pe.expand((*k_nope.shape[:-1], -1))
        # Here is only 2 possibility of input, ChunkedPrefill or PrefillNoCache
-        ascend_config = get_ascend_config()
+        q_pe = query[..., self.qk_nope_head_dim:]
+        q_nope = query[..., :self.qk_nope_head_dim]
+        if self.prefill_mask is None:
+            if q_nope.dtype == torch.float16:
+                mask_value = torch.finfo(torch.float32).min
+            else:
+                mask_value = 1
+            prefill_mask = torch.triu(
+                torch.ones(self.ring_mla_mask_size,
+                           self.ring_mla_mask_size,
+                           device=q_nope.device,
+                           dtype=q_nope.dtype), 1)
+            self.prefill_mask = torch.where(prefill_mask == 1, mask_value,
+                                            0).to(q_nope.dtype)
+        torch_npu.atb.npu_ring_mla(q_nope=q_nope,
+                                   q_rope=q_pe,
+                                   k_nope=k_nope,
+                                   k_rope=k_pe,
+                                   value=value,
+                                   mask=self.prefill_mask,
+                                   seqlen=torch.tensor(
+                                       attn_metadata.prefill.query_lens,
+                                       dtype=torch.int32),
+                                   head_num=self.num_heads,
+                                   kv_head_num=self.num_heads,
+                                   pre_out=None,
+                                   prev_lse=None,
+                                   qk_scale=self.scale,
+                                   kernel_type="kernel_type_high_precision",
+                                   mask_type="mask_type_triu",
+                                   input_layout="type_bsnd",
+                                   calc_type="calc_type_first_ring",
+                                   output=attn_output,
+                                   softmax_lse=attn_lse)
+        attn_output, attn_lse = self._compute_prefill_context( \
+            query, kv_c_and_k_pe_cache, self.qk_rope_head_dim, attn_metadata, attn_output, attn_lse)

-        if attn_metadata.attn_state in [
-                AscendAttentionState.ChunkedPrefill,
-                AscendAttentionState.SpecDecoding,
-                AscendAttentionState.PrefillCacheHit
-        ] and not ascend_config.chunked_prefill_for_mla:
-            attn_output_torch = torch.empty(num_tokens,
-                                            self.num_heads * self.v_head_dim,
-                                            dtype=query.dtype,
-                                            device=query.device)
-            # current requests is chunked in prefill, disable flash attention with chunked prefill
-            vanilla_chunked_prefill_mla(
-                output=attn_output_torch,
-                query=query,
-                kv_cache=kv_c_and_k_pe_cache,
-                block_tables=attn_metadata.prefill.block_table,
-                query_lens=attn_metadata.prefill.query_lens,
-                context_lens=attn_metadata.prefill.context_lens,
-                kv_b_proj=self.kv_b_proj,
-                max_query_len=attn_metadata.prefill.max_query_len,
-                max_context_len=attn_metadata.prefill.max_seq_lens,
-                nope_dim=self.qk_nope_head_dim,
-                rope_dim=self.qk_rope_head_dim,
-                v_head_dim=self.v_head_dim,
-                scale=self.scale,
-                alibi_slopes=None,
-                causal=True)
-        elif attn_metadata.attn_state in [
-                AscendAttentionState.ChunkedPrefill,
-                AscendAttentionState.SpecDecoding,
-                AscendAttentionState.PrefillCacheHit
-        ]:
-            attn_lse = torch.empty(self.num_heads,
-                                   num_tokens,
-                                   dtype=torch.float32,
-                                   device=query.device)
-            q_pe = query[..., self.qk_nope_head_dim:]
-            q_nope = query[..., :self.qk_nope_head_dim]
-            mask = torch.triu(
-                torch.ones(512, 512, device=query.device, dtype=query.dtype),
-                1)  # 512: mask only support 512
-            if attn_metadata.num_prefills > 1:
-                mask = mask.unsqueeze(0).repeat(attn_metadata.num_prefills, 1,
-                                                1)
-            torch_npu.atb.npu_ring_mla(
-                q_nope=q_nope,
-                q_rope=q_pe,
-                k_nope=k_nope,
-                k_rope=k_pe,
-                value=value,
-                mask=mask,
-                seqlen=torch.tensor(attn_metadata.prefill.query_lens,
-                                    dtype=torch.int32),
-                head_num=self.num_heads,
-                kv_head_num=self.num_heads,
-                pre_out=None,
-                prev_lse=None,
-                qk_scale=self.scale,
-                kernel_type="kernel_type_high_precision",
-                mask_type="mask_type_triu",
-                input_layout="type_bsnd",
-                calc_type="calc_type_first_ring",
-                output=attn_output,
-                softmax_lse=attn_lse)
-            attn_output, attn_lse = self._compute_prefill_context( \
-                query, kv_c_and_k_pe_cache, self.qk_rope_head_dim, attn_metadata, attn_output, attn_lse)
-
-        elif attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
-            key = torch.cat((k_nope, k_pe), dim=-1)
-            torch_npu._npu_flash_attention(
-                query=query,
-                key=key,
-                value=value,
-                mask=attn_metadata.attn_mask,
-                seq_len=attn_metadata.prefill.context_lens,
-                scale_value=self.scale,
-                num_heads=self.num_heads,
-                num_kv_heads=self.num_heads,
-                out=attn_output)
-            attn_output = attn_output.view(-1, self.num_heads, self.v_head_dim)
-        else:
-            raise RuntimeError(
-                "Unexpected path reached, AscendMLATorchairImpl should only have PrefillNoCache, PrefillCacheHit, ChunkedPrefill and SpecDecoding scenario in forward prefill, please file a bug to vllm-ascend !"
-            )
        attn_output = attn_output.reshape(
            [num_tokens, self.num_heads * self.v_head_dim])
-        if attn_metadata.attn_state in [
-                AscendAttentionState.ChunkedPrefill,
-                AscendAttentionState.SpecDecoding,
-                AscendAttentionState.PrefillCacheHit
-        ] and not ascend_config.chunked_prefill_for_mla:
-            attn_output = attn_output_torch

        return attn_output

@@ -961,7 +946,7 @@ class AscendMLATorchairImpl(MLAAttentionImpl):
        kv = self.kv_a_proj_with_mqa(hidden_states)[0]
        # npu_kv_rmsnorm_rope_cache needs [B, N, S, D]
        kv = kv.view(B, N, S, self.kv_lora_rank + self.qk_rope_head_dim)
-        cache_mode = "PA_BLK_NZ" if self.enable_kv_nz else "PA"
+        cache_mode = "PA_NZ" if self.enable_kv_nz else "PA"
        _, _, k_pe, k_nope = torch_npu.npu_kv_rmsnorm_rope_cache(
            kv,
            self.kv_a_layernorm.weight,
@@ -1019,8 +1004,11 @@ class AscendMLATorchairImpl(MLAAttentionImpl):
                                 self.qk_rope_head_dim)
                input_layout = "BNSD"

-            if attn_metadata.attn_state == AscendAttentionState.SpecDecoding:
-                assert num_tokens % self.spec_token_num == 0
+            if attn_metadata.attn_state in [
+                    AscendAttentionState.SpecDecoding,
+                    AscendAttentionState.ChunkedPrefill
+            ] and self.speculative_config is not None:
+                # Use TND layout for pure SpecDecoding and SpecDecoding in ChunkedPrefill
                input_layout = "TND"
                # [bs * q_seq_len, num_heads_per_rank, dim]
                q_nope = q_nope.view(num_tokens, self.num_heads, -1)
@@ -1199,9 +1187,7 @@ class AscendMLATorchairImpl(MLAAttentionImpl):
            else:
                decode_q_pe[...], decode_k_pe[...] = self.rotary_emb(
                    attn_metadata.decode.input_positions,
-                    decode_q_pe.contiguous(),
-                    decode_k_pe,
-                    max_seq_len=attn_metadata.decode.max_seq_lens)
+                    decode_q_pe.contiguous(), decode_k_pe)
        if has_prefill:
            assert attn_metadata.prefill is not None
            prefill_q = self.q_proj(prefill_hs_or_q_c)[0]\
@@ -1226,9 +1212,7 @@ class AscendMLATorchairImpl(MLAAttentionImpl):
            else:
                prefill_q_pe[...], prefill_k_pe[...] = self.rotary_emb(
                    attn_metadata.prefill.input_positions,
-                    prefill_q_pe.contiguous(),
-                    prefill_k_pe,
-                    max_seq_len=attn_metadata.prefill.max_seq_lens)
+                    prefill_q_pe.contiguous(), prefill_k_pe)

        assert len(
            kv_cache
--- a/vllm_ascend/torchair/torchair_model_runner.py
+++ b/vllm_ascend/torchair/torchair_model_runner.py
@@ -17,6 +17,7 @@
 # Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py
 # isort: skip_file

+import math
 import types
 from typing import Optional

@@ -24,7 +25,6 @@ import torch
 import torch.distributed as dist
 import torch.nn as nn
 import torch_npu
-import vllm.envs as envs_vllm
 from vllm.config import VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_dp_group
@@ -40,25 +40,39 @@ from vllm_ascend.torchair.utils import (
    register_torchair_model, torchair_ops_patch,
    torchair_quant_method_register, write_kv_cache_bytes_to_file)
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
-                               is_310p)
+                               is_310p, get_ascend_soc_version,
+                               AscendSocVersion)
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner


 class NPUTorchairModelRunner(NPUModelRunner):

    def __init__(self, vllm_config: VllmConfig, device: torch.device):
+        self.ascend_config = get_ascend_config()
+        self.enable_shared_expert_dp = self.ascend_config.enable_shared_expert_dp
        super().__init__(vllm_config, device)
-        ascend_config = get_ascend_config()
+        if self.speculative_config:
+            self.actual_seq_lengths_q = list(
+                range(self.decode_token_per_req, self.max_num_tokens + 1,
+                      self.decode_token_per_req))
+        self.attn_metadata_builder = self.attn_backend.get_builder_cls()(
+            None, None, vllm_config, device)
+
+        register_torchair_model()
+        torchair_ops_patch()
+        torchair_quant_method_register()
+        if self.enable_shared_expert_dp:
+            return
        self.new_kv_cache_bytes = -1
        self.torchair_compiled_model = None  # type: ignore
        self.torchair_compiled_models = {}  # type: ignore
-        self.use_cached_npu_graph = ascend_config.torchair_graph_config.use_cached_graph
-        self.use_cached_kv_cache_bytes = ascend_config.torchair_graph_config.use_cached_kv_cache_bytes
-        self.torchair_graph_batch_sizes = ascend_config.torchair_graph_config.graph_batch_sizes
-        if ascend_config.torchair_graph_config.graph_batch_sizes_init:
+        self.use_cached_npu_graph = self.ascend_config.torchair_graph_config.use_cached_graph
+        self.use_cached_kv_cache_bytes = self.ascend_config.torchair_graph_config.use_cached_kv_cache_bytes
+        self.torchair_graph_batch_sizes = self.ascend_config.torchair_graph_config.graph_batch_sizes
+        if self.ascend_config.torchair_graph_config.graph_batch_sizes_init:
            self.init_torchair_graph_batch_sizes()

-        self.check_torchair_graph_batch_sizes()
+        self.update_torchair_graph_batch_sizes()

        torch._dynamo.cache_size.config.cache_size_limit += len(
            self.torchair_graph_batch_sizes)
@@ -67,14 +81,14 @@ class NPUTorchairModelRunner(NPUModelRunner):
            recompiles=envs_ascend.VLLM_ASCEND_TRACE_RECOMPILES)

        self._check_batch_sizes_consistency()
-        register_torchair_model()
-        torchair_ops_patch()
-        torchair_quant_method_register()

    def _sync_metadata_across_dp(
            self, num_tokens: int, with_prefill: bool, enable_dbo: bool
    ) -> tuple[int, Optional[torch.Tensor], bool, bool]:
        """Override from NPUModelRunner to pad num_tokens"""
+        if self.enable_shared_expert_dp:
+            # Padding is not required for shared_expert_dp cases in eager mode.
+            return num_tokens, None, with_prefill, enable_dbo
        if self.dp_size == 1:
            if not with_prefill:
                maybe_padded_num_tokens = self.select_torchair_padded_batch_size(
@@ -107,10 +121,15 @@ class NPUTorchairModelRunner(NPUModelRunner):

        return maybe_padded_num_tokens, num_tokens_across_dp, with_prefill, enable_dbo

-    def _build_attention_metadata(self, with_prefill, num_reqs, skip_attn):
+    def _build_attention_metadata(self, with_prefill, num_reqs, num_tokens,
+                                  max_query_len, force_attention):
        # NOTE: If torchair graph mode and not with_prefill,
        # we can't skip_attn, it will cause graph recompile.
-        if not with_prefill:
+        if with_prefill or self.enable_shared_expert_dp:
+            attn_metadata = super()._build_attention_metadata(
+                with_prefill, num_reqs, num_tokens, max_query_len,
+                force_attention)
+        else:
            common_attn_metadata = TorchairCommonAttentionMetadata(
                num_reqs=num_reqs,
                num_actual_tokens=1,
@@ -121,17 +140,19 @@ class NPUTorchairModelRunner(NPUModelRunner):
            )
            attn_metadata = self.attn_metadata_builder.build_torchair_graph_dummy(
                common_attn_metadata)
-        else:
-            attn_metadata = super()._build_attention_metadata(
-                with_prefill, num_reqs, skip_attn)
        return attn_metadata

    def _generate_dummy_run_hidden_states(self, with_prefill,
                                          is_torchair_compile, input_ids,
                                          positions, attn_metadata, num_tokens,
                                          intermediate_tensors, inputs_embeds):
-
-        if not with_prefill:
+        if with_prefill or self.enable_shared_expert_dp:
+            if is_310p():
+                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
+            hidden_states = super()._generate_dummy_run_hidden_states(
+                with_prefill, is_torchair_compile, input_ids, positions,
+                attn_metadata, num_tokens, intermediate_tensors, inputs_embeds)
+        else:
            # Only mark static while compiling
            if is_torchair_compile:
                torch._dynamo.mark_static(input_ids)
@@ -163,15 +184,11 @@ class NPUTorchairModelRunner(NPUModelRunner):
                inputs_embeds=None,
                **model_kwargs,
            )
-        else:
-            if is_310p():
-                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
-            hidden_states = super()._generate_dummy_run_hidden_states(
-                with_prefill, is_torchair_compile, input_ids, positions,
-                attn_metadata, num_tokens, intermediate_tensors, inputs_embeds)
        return hidden_states

    def _convert_torch_format(self, kv_cache):
+        if self.enable_shared_expert_dp:
+            return super()._convert_torch_format(kv_cache)
        kv_cache = torch_npu.npu_format_cast(kv_cache, ACL_FORMAT_FRACTAL_ND)
        return kv_cache

@@ -189,6 +206,8 @@ class NPUTorchairModelRunner(NPUModelRunner):

    def _capture_model(self):
        """Override from NPUModelRunner to use torchair graph capture."""
+        if self.enable_shared_expert_dp:
+            return super()._capture_model()
        # TODO(NeverRaR): Calling graph_capture(device=self.device) in
        # torchair graph capture can cause some issues, so now we just
        # temporarily split the codepath for the two different graph patterns.
@@ -228,6 +247,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
                                         self.new_kv_cache_bytes)

    def _use_aclgraph(self) -> bool:
+        if self.enable_shared_expert_dp:
+            return super()._use_aclgraph()
        return False

    def _check_batch_sizes_consistency(self) -> None:
@@ -253,10 +274,10 @@ class NPUTorchairModelRunner(NPUModelRunner):
            )

    def _update_graph_pad_size(self, with_prefill, graph_pad_size):
-        if not with_prefill:
-            self.graph_pad_size = graph_pad_size
-        else:
+        if with_prefill or self.enable_shared_expert_dp:
            super()._update_graph_pad_size(with_prefill, graph_pad_size)
+        else:
+            self.graph_pad_size = graph_pad_size

    def _update_input_ids_and_positions(self, input_ids, positions,
                                        num_input_tokens, with_prefill,
@@ -266,7 +287,9 @@ class NPUTorchairModelRunner(NPUModelRunner):
            input_ids, positions, num_input_tokens, with_prefill,
            padded_num_tokens_across_dp)

-        if not with_prefill:
+        if with_prefill or self.enable_shared_expert_dp:
+            return input_ids, positions
+        else:
            input_ids = self.input_ids[:padded_num_tokens_across_dp]
            positions = self.positions[:padded_num_tokens_across_dp]
        return input_ids, positions
@@ -276,6 +299,13 @@ class NPUTorchairModelRunner(NPUModelRunner):
                                             input_ids, positions,
                                             intermediate_tensors,
                                             inputs_embeds):
+        if attn_metadata is not None and isinstance(attn_metadata, dict):
+            attn_metadata = attn_metadata['model.layers.0.self_attn.attn']
+
+        if self.enable_shared_expert_dp:
+            return super()._generate_process_reqs_hidden_states(
+                attn_metadata, with_prefill, padded_num_tokens_across_dp,
+                input_ids, positions, intermediate_tensors, inputs_embeds)
        model_kwargs = {
            "kv_caches": self.kv_caches,
            "attn_metadata": attn_metadata
@@ -332,21 +362,22 @@ class NPUTorchairModelRunner(NPUModelRunner):
            communication_adaptation_310p()

        config = torchair.CompilerConfig()
-        if get_ascend_config().torchair_graph_config.mode:
-            config.mode = get_ascend_config().torchair_graph_config.mode
-        config.experimental_config.frozen_parameter = True
+        if self.ascend_config.torchair_graph_config.mode:
+            config.mode = self.ascend_config.torchair_graph_config.mode
+        config.experimental_config.frozen_parameter = \
+        self.ascend_config.torchair_graph_config.enable_frozen_parameter
        # enabling tiling_schedule_optimize on 300I Duo has some bugs, so we have to
        # disable it on 300I Duo platform now.
        config.experimental_config.tiling_schedule_optimize = not is_310p()
        config.experimental_config.enable_view_optimize = \
-        get_ascend_config().torchair_graph_config.enable_view_optimize
+        self.ascend_config.torchair_graph_config.enable_view_optimize
        torch.npu.set_compile_mode(jit_compile=False)
        if not self.use_cached_npu_graph:
            npu_backend = torchair.get_npu_backend(compiler_config=config)
            self.torchair_compiled_model = torch.compile(
                self.model,
-                dynamic=True,
-                fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
+                dynamic=not self.ascend_config.use_sfa,
+                fullgraph=True,
                backend=npu_backend)
            return self.torchair_compiled_model
        else:
@@ -368,8 +399,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
            self.torchair_compiled_models[
                batch_size] = torchair.inference.cache_compile(
                    self.model.__dict__[forward_proxy_name],
-                    dynamic=True,
-                    fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
+                    dynamic=not self.ascend_config.use_sfa,
+                    fullgraph=True,
                    cache_dir=TORCHAIR_CACHE_DIR,
                    config=config,
                    ge_cache=False)
@@ -396,10 +427,16 @@ class NPUTorchairModelRunner(NPUModelRunner):
            f"{self.torchair_graph_batch_sizes}, but cur batch_size is {batch_size}."
        )

-    def check_torchair_graph_batch_sizes(self):
+    def update_torchair_graph_batch_sizes(self):
        # return graph_batch_sizes according to the max number of tokens
        # first pad according to the number of requests
-        if len(self.torchair_graph_batch_sizes) == 0:
+        if self.is_kv_consumer and self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
+            # pd disaggregation scenario may incorrectly calculate the batch in mtp scenario, so we force set it to max_num_reqs
+            self.torchair_graph_batch_sizes = [self.max_num_reqs]
+            logger.warning(
+                "is kv_consumer, torch_graph_batch_sizes sets to [max_num_seqs]"
+            )
+        elif len(self.torchair_graph_batch_sizes) == 0:
            self.torchair_graph_batch_sizes = [1, self.max_num_reqs]
        else:
            self.torchair_graph_batch_sizes = sorted(
@@ -420,27 +457,47 @@ class NPUTorchairModelRunner(NPUModelRunner):
            for graph_batch_size in self.torchair_graph_batch_sizes
        ]

-        # NOTE: when enable_expert_parallel, we need to check if `graph_batch_size` is divisible by `tp_size`
+        # NOTE: when enable_expert_parallel on A3, we need to check if `graph_batch_size` is divisible by `tp_size`
+        # Because we use x_active_mask for dispatch/combine op on A3, which requires that input shape should be same
+        # on all EP ranks
+        if get_ascend_soc_version(
+        ) == AscendSocVersion.A3 and self.parallel_config.enable_expert_parallel:
+            self._align_graph_size_divisible_by_tp_size()
+
+    def _align_graph_size_divisible_by_tp_size(self):
+        """Align every torchair graph batch size to a multiple of tp_size."""
         tp_size = self.parallel_config.tensor_parallel_size
-        if self.parallel_config.enable_expert_parallel:
-            new_graph_batch_sizes = []
-            for graph_batch_size in self.torchair_graph_batch_sizes:
-                cur_graph_batch_size = (graph_batch_size + tp_size -
-                                        1) // tp_size * tp_size
-                if cur_graph_batch_size not in new_graph_batch_sizes and \
-                    cur_graph_batch_size <= self.scheduler_config.max_num_batched_tokens:
-                    new_graph_batch_sizes.append(cur_graph_batch_size)
-                elif cur_graph_batch_size > self.scheduler_config.max_num_batched_tokens \
-                        and self.decode_token_per_req > 1:
-                    logger.warning(
-                        f"torchair_graph_batch_sizes {cur_graph_batch_size} is bigger than max_num_batched_tokens",
-                        f"{self.scheduler_config.max_num_batched_tokens} will skip this batch size."
-                    )
+        new_graph_batch_sizes = []
+        for graph_batch_size in self.torchair_graph_batch_sizes:
+            cur_graph_batch_size = (graph_batch_size + tp_size -
+                                    1) // tp_size * tp_size
+            # MTP > 1: use the least common multiple (LCM) of graph_batch_size
+            # and tp_size instead, to suit both multi-DP and the FIA operator.
+            if self.speculative_config is not None and self.speculative_config.num_speculative_tokens > 1:
+                cur_graph_batch_size = math.lcm(tp_size, graph_batch_size)
+            if cur_graph_batch_size not in new_graph_batch_sizes and \
+                cur_graph_batch_size <= self.scheduler_config.max_num_batched_tokens:
+                new_graph_batch_sizes.append(cur_graph_batch_size)
+            elif cur_graph_batch_size > self.scheduler_config.max_num_batched_tokens \
+                    and self.decode_token_per_req > 1:
+                logger.warning(
+                    f"torchair_graph_batch_sizes {cur_graph_batch_size} is bigger than max_num_batched_tokens "
+                    f"{self.scheduler_config.max_num_batched_tokens} will skip this batch size."
+                )
+        new_max_num_reqs = max(new_graph_batch_sizes)
+        if self.max_num_reqs != new_max_num_reqs:
+            logger.warning(f"max_num_reqs is updated to {new_max_num_reqs}")
+            self.max_num_reqs = new_max_num_reqs
+            self.scheduler_config.max_num_seqs = new_max_num_reqs
+
+        if new_graph_batch_sizes != self.torchair_graph_batch_sizes:
+            logger.warning(
+                f"torchair_graph_batch_sizes are updated to {new_graph_batch_sizes}."
+            )
             self.torchair_graph_batch_sizes = new_graph_batch_sizes

    def _build_drafter_prepare_inputs_torchair_param(self):
-        return True
-
-    def get_dp_padding(self, num_tokens):
-        """Override from NPUModelRunner to get dp padding"""
-        return 0, None
+        if self.enable_shared_expert_dp:
+            return super()._build_drafter_prepare_inputs_torchair_param()
+        else:
+            return True
--- a/vllm_ascend/torchair/torchair_sfa.py
+++ b/vllm_ascend/torchair/torchair_sfa.py
--- a/vllm_ascend/torchair/torchair_worker.py
+++ b/vllm_ascend/torchair/torchair_worker.py
@@ -32,28 +32,28 @@ class NPUTorchairWorker(NPUWorker):
        """Override determine_available_memory to use cached torchair kv_cache_bytes."""

        available_kv_cache_memory = super().determine_available_memory()
-
-        if get_ascend_config(
-        ).torchair_graph_config.use_cached_kv_cache_bytes and check_kv_cache_bytes_cache_exist(
-        ):
-            old_kv_cache_bytes = read_kv_cache_bytes_from_file(
-                torch.distributed.get_rank())
-            if 0 < old_kv_cache_bytes <= available_kv_cache_memory:
-                logger.info(
-                    f"Use cached torchair kv_cache_bytes: {old_kv_cache_bytes}"
-                )
-                self.model_runner.new_kv_cache_bytes = old_kv_cache_bytes
-                return old_kv_cache_bytes
-            else:
-                logger.info(
-                    "Cached torchair kv_cache_bytes is too big, invalidate old torchair_cache"
-                )
-                delete_torchair_cache_file()
-        bytes_floating_tolerance = 1024 * 1024 * envs_ascend.VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE
-        available_kv_cache_memory -= bytes_floating_tolerance
-        logger.info(f"Use new kv_cache_bytes: {available_kv_cache_memory}")
-        self.model_runner.new_kv_cache_bytes = available_kv_cache_memory
-
+        ascend_config = get_ascend_config()
+        if ascend_config.enable_shared_expert_dp:
+            return available_kv_cache_memory
+        if ascend_config.torchair_graph_config.use_cached_kv_cache_bytes:
+            if check_kv_cache_bytes_cache_exist():
+                old_kv_cache_bytes = read_kv_cache_bytes_from_file(
+                    torch.distributed.get_rank())
+                if 0 < old_kv_cache_bytes <= available_kv_cache_memory:
+                    logger.info(
+                        f"Use cached torchair kv_cache_bytes: {old_kv_cache_bytes}"
+                    )
+                    self.model_runner.new_kv_cache_bytes = old_kv_cache_bytes
+                    return old_kv_cache_bytes
+                else:
+                    logger.info(
+                        "Cached torchair kv_cache_bytes is too big, invalidate old torchair_cache"
+                    )
+                    delete_torchair_cache_file()
+            bytes_floating_tolerance = 1024 * 1024 * envs_ascend.VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE
+            available_kv_cache_memory -= bytes_floating_tolerance
+            logger.info(f"Use new kv_cache_bytes: {available_kv_cache_memory}")
+            self.model_runner.new_kv_cache_bytes = available_kv_cache_memory
        return available_kv_cache_memory

    def init_device(self):
--- a/vllm_ascend/torchair/utils.py
+++ b/vllm_ascend/torchair/utils.py
@@ -165,6 +165,11 @@ def register_torchair_model():
        "vllm_ascend.torchair.models.torchair_deepseek_v3:TorchairDeepseekV3ForCausalLM"
    )

+    ModelRegistry.register_model(
+        "DeepseekV32ForCausalLM",
+        "vllm_ascend.torchair.models.torchair_deepseek_v3:TorchairDeepseekV3ForCausalLM"
+    )
+
    ModelRegistry.register_model(
        "Qwen2ForCausalLM",
        "vllm_ascend.torchair.models.qwen2:CustomQwen2ForCausalLM")
@@ -180,20 +185,31 @@ def register_torchair_model():


 def torchair_quant_method_register():
-    from vllm_ascend.quantization.quantizer import \
-        SUPPORT_ASCEND_QUANTIZER_TYPE
-    from vllm_ascend.torchair.quantization.torchair_quantizer import (
-        TorchairW4A8DYNAMICQuantizer, TorchairW8A8DYNAMICQuantizer)
+    from vllm_ascend.quantization.utils import ASCEND_QUANTIZATION_METHOD_MAP
+    from vllm_ascend.torchair.quantization.torchair_w4a8_dynamic import (
+        TorchairAscendW4A8DynamicFusedMoEMethod,
+        TorchairAscendW4A8DynamicLinearMethod)
+    from vllm_ascend.torchair.quantization.torchair_w8a8_dynamic import (
+        TorchairAscendW8A8DynamicFusedMoEMethod,
+        TorchairAscendW8A8DynamicLinearMethod)

-    SUPPORT_ASCEND_QUANTIZER_TYPE[
-        "W8A8_DYNAMIC"] = TorchairW8A8DYNAMICQuantizer
-    SUPPORT_ASCEND_QUANTIZER_TYPE[
-        "W4A8_DYNAMIC"] = TorchairW4A8DYNAMICQuantizer
+    ASCEND_QUANTIZATION_METHOD_MAP["W8A8_DYNAMIC"][
+        "linear"] = TorchairAscendW8A8DynamicLinearMethod
+    ASCEND_QUANTIZATION_METHOD_MAP["W8A8_DYNAMIC"][
+        "moe"] = TorchairAscendW8A8DynamicFusedMoEMethod
+    ASCEND_QUANTIZATION_METHOD_MAP["W4A8_DYNAMIC"][
+        "linear"] = TorchairAscendW4A8DynamicLinearMethod
+    ASCEND_QUANTIZATION_METHOD_MAP["W4A8_DYNAMIC"][
+        "moe"] = TorchairAscendW4A8DynamicFusedMoEMethod


 def torchair_ops_patch():
+    from vllm_ascend.ops.activation import AscendSiluAndMul
+    from vllm_ascend.ops.layernorm import AscendRMSNorm
    from vllm_ascend.ops.rotary_embedding import (
        AscendDeepseekScalingRotaryEmbedding, AscendRotaryEmbedding)
+    from vllm_ascend.torchair.ops import (torchair_activation,
+                                          torchair_layernorm)
    from vllm_ascend.torchair.ops.torchair_rotary_embedding import (
        deepseek_rope_init_func, native_rope_deepseek_forward,
        qwen_rope_init_func, rope_forward)
@@ -203,3 +219,6 @@ def torchair_ops_patch():

    AscendDeepseekScalingRotaryEmbedding.__init__ = deepseek_rope_init_func  # type: ignore[method-assign]
    AscendDeepseekScalingRotaryEmbedding.forward = native_rope_deepseek_forward  # type: ignore[method-assign]
+
+    AscendRMSNorm.forward_oot = torchair_layernorm.torchair_rmsnorm_forward_oot  # type: ignore[method-assign]
+    AscendSiluAndMul.forward_oot = torchair_activation.torchair_silu_and_mul_forward_oot  # type: ignore[method-assign]