init v0.11.0rc0

2025-10-14 10:38:28 +08:00
parent 67afd0ea78
commit 66dc16f966
278 changed files with 28130 additions and 11708 deletions
--- a/vllm_ascend/quantization/func_wrapper.py
+++ b/vllm_ascend/quantization/func_wrapper.py
@@ -1,184 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from typing import Optional, Tuple, Union
-
-import torch
-import torch_npu
-from vllm.logger import logger
-from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import UnquantizedLinearMethod
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE, QuantizationConfig)
-
-
-# func refers to vocabParallelEmbedding.__init__
-def wrapper_vocab_parallel_embedding_init(func):
-
-    def init(
-        self,
-        num_embeddings: int,
-        embedding_dim: int,
-        params_dtype: Optional[torch.dtype] = None,
-        org_num_embeddings: Optional[int] = None,
-        padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ):
-        func(
-            self,
-            num_embeddings,
-            embedding_dim,
-            params_dtype,
-            org_num_embeddings,
-            padding_size,
-            quant_config,
-            prefix,
-        )
-        # TODO: Contact vLLM maintainers to add a `params_dtype` attribute to the `VocabParallelEmbedding` class.
-        if params_dtype is None:
-            params_dtype = torch.get_default_dtype()
-        self.params_dtype = params_dtype
-
-    return init
-
-
-# func refers to RMSNorm.__init__
-def wrapper_rmsnorm_init(func):
-
-    def init(self, hidden_size: int, **extra_args) -> None:
-        func(self, hidden_size, **extra_args)
-        self.ignore_anti = True
-        self.bias = torch.nn.Parameter(torch.zeros(hidden_size),
-                                       requires_grad=False)
-
-    return init
-
-
-# func refers to RMSNorm.forward_oot
-def wrapper_rmsnorm_forward_oot(func):
-
-    def _rmsnorm_forward_oot(
-        self,
-        x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
-        if not self.ignore_anti:
-            if residual is not None:
-                residual += x
-                out = torch_npu._npu_quant_rms_norm(
-                    residual,
-                    self.weight,
-                    self.bias,
-                    self.input_scale,
-                    self.input_offset,
-                    self.variance_epsilon,
-                )
-                return out, residual
-            out = torch_npu._npu_quant_rms_norm(
-                x,
-                self.weight,
-                self.bias,
-                self.input_scale,
-                self.input_offset,
-                self.variance_epsilon,
-            )
-            return out
-
-        if residual is not None:
-            x, residual = func(self, x, residual)
-            return x.add_(self.bias), residual
-
-        return func(self, x).add_(self.bias)
-
-    return _rmsnorm_forward_oot
-
-
-MODEL_LAYER_MAPPING = {
-    "LlamaModel": {
-        "attn": {
-            "layer_attr": "self_attn",
-            "proj_attr": "qkv_proj",
-            "norm_attr": "input_layernorm",
-            "unquantized_type": UnquantizedLinearMethod,
-        },
-        "mlp": {
-            "layer_attr": "mlp",
-            "proj_attr": "gate_up_proj",
-            "norm_attr": "post_attention_layernorm",
-            "unquantized_type": UnquantizedLinearMethod,
-        },
-    },
-}
-
-
-def wrapper_load_model(func):
-
-    def postprocess_loading(self) -> None:
-        func(self)
-
-        def process_layer(layer, idx, mapping):
-
-            def process_module(module_cfg, layer_obj):
-                if module_cfg is None:
-                    return
-
-                module_obj = getattr(layer_obj, module_cfg["layer_attr"], None)
-                if module_obj is None:
-                    return
-
-                proj_attr = module_cfg["proj_attr"]
-                if callable(proj_attr):
-                    proj = proj_attr(module_obj, idx)
-                else:
-                    proj = getattr(module_obj, proj_attr, None)
-
-                norm = getattr(layer_obj, module_cfg["norm_attr"], None)
-
-                if proj is None or norm is None:
-                    return
-
-                norm.ignore_anti = isinstance(proj.quant_method,
-                                              module_cfg["unquantized_type"])
-                if not norm.ignore_anti:
-                    for param_name in ["input_scale", "input_offset"]:
-                        if hasattr(proj, param_name):
-                            param = getattr(proj, param_name)
-                            norm.register_parameter(
-                                param_name,
-                                torch.nn.Parameter(param.clone(),
-                                                   requires_grad=False))
-
-            process_module(mapping.get("attn"), layer)
-            process_module(mapping.get("mlp"), layer)
-
-        model_type = self.model.model.__class__.__name__
-        mapping = MODEL_LAYER_MAPPING.get(model_type)
-
-        if not mapping:
-            logger.info(
-                f"Warning: Model type '{model_type}' not found in MODEL_LAYER_MAPPING. Skipping layer mapping."
-            )
-            return
-
-        for idx, layer in enumerate(self.model.model.layers):
-            process_layer(layer, idx, mapping)
-
-        if isinstance(self.model.model.norm, RMSNorm):
-            self.model.model.norm.ignore_anti = True
-
-    return postprocess_loading
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -19,6 +19,7 @@ from types import MappingProxyType
 from typing import Any, Callable, Dict, List, Mapping, Optional

 import torch
+from vllm.config import get_current_vllm_config
 from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
                                                  FusedMoeWeightScaleSupported)
@@ -32,13 +33,15 @@ from vllm.model_executor.layers.quantization.base_config import (
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    UnquantizedEmbeddingMethod, VocabParallelEmbedding)
-from vllm.model_executor.parameter import PerTensorScaleParameter
 from vllm.model_executor.utils import set_weight_attrs

+from vllm_ascend.distributed.parallel_state import (get_mlp_tp_group,
+                                                    get_otp_group)
 from vllm_ascend.ops.fused_moe import AscendUnquantizedFusedMoEMethod
-from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD
+from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, mlp_tp_enable,
+                               oproj_tp_enable)

-from .quantizer import AscendQuantizer
+from .utils import get_quant_method


@register_quantization_config(ASCEND_QUANTIZATION_METHOD)
@@ -50,6 +53,7 @@ class AscendQuantConfig(QuantizationConfig):
    """

    def __init__(self, quant_config: Dict[str, Any]):
+        super().__init__()
        self.quant_description = quant_config

    def __repr__(self) -> str:
@@ -85,7 +89,14 @@ class AscendQuantConfig(QuantizationConfig):

    def get_quant_method(self, layer: torch.nn.Module,
                         prefix: str) -> Optional["QuantizeMethodBase"]:
+        vllm_config = get_current_vllm_config()
+        model_type = vllm_config.model_config.hf_config.model_type
+        if model_type in packed_modules_model_mapping:
+            self.packed_modules_mapping = packed_modules_model_mapping[
+                model_type]
        from vllm.attention.layer import Attention
+        if prefix.startswith("language_model"):
+            prefix = prefix.split('.', 1)[-1]
        if isinstance(layer, LinearBase):
            if self.is_layer_skipped_ascend(prefix,
                                            self.packed_modules_mapping):
@@ -147,21 +158,86 @@ class AscendQuantConfig(QuantizationConfig):
        return []


+packed_modules_model_mapping = {
+    "qwen3_moe": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+        "experts":
+        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+    },
+    "deepseek_v2": {
+        "gate_up_proj": ["gate_proj", "up_proj"],
+        "experts":
+        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
+    },
+    "deepseek_v3": {
+        "gate_up_proj": ["gate_proj", "up_proj"],
+        "experts":
+        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
+    },
+    # NOTE 1: The MTP layer of a quantized deepseek model is itself not quantized on the NPU.
+    # NOTE 2: The description file generated by the current msmodelslim tool does not contain
+    # MTP layer info. Please add it manually and set the value to FLOAT.
+    "deepseek_mtp": {
+        "gate_up_proj": ["gate_proj", "up_proj"],
+        "experts":
+        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
+    },
+    "qwen3_next": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+        "in_proj": ["in_proj_qkvz", "in_proj_ba"],
+    },
+    "qwen2_5_vl": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    },
+    "glm4_moe": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+        "experts":
+        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
+    },
+}
+
+
 class AscendLinearMethod(LinearMethodBase):
    """Linear method for Ascend quantization.

-    This class calls AscendQuantizer to search a specific quantization
-    implementations supported on ascend hardware for linear methods.
-
    Args:
        quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                 packed_modules_mapping: Dict[str, Any]) -> None:
-        self.quantizer = AscendQuantizer.get_quantizer(
-            quant_config.quant_description, prefix, packed_modules_mapping)
-        self.quant_method = self.quantizer.build_linear_method()
+        self.quant_method = get_quant_method(quant_config.quant_description,
+                                             prefix, "linear",
+                                             packed_modules_mapping)

    def create_weights(
        self,
@@ -174,7 +250,6 @@ class AscendLinearMethod(LinearMethodBase):
        **extra_weight_attrs,
    ) -> None:
        output_size_per_partition = sum(output_partition_sizes)
-        weight_loader = extra_weight_attrs.get("weight_loader")

        weight_dict = self.quant_method.get_weight(input_size_per_partition,
                                                   output_size_per_partition,
@@ -187,8 +262,7 @@ class AscendLinearMethod(LinearMethodBase):

        pertensor_dict = self.quant_method.get_pertensor_param(params_dtype)
        for pertensor_name, pertensor_param in pertensor_dict.items():
-            param = PerTensorScaleParameter(data=pertensor_param,
-                                            weight_loader=weight_loader)
+            param = torch.nn.Parameter(pertensor_param, requires_grad=False)
            # disable warning
            param.ignore_warning = True
            layer.register_parameter(pertensor_name, param)
@@ -223,25 +297,27 @@ class AscendLinearMethod(LinearMethodBase):
        bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if isinstance(layer, RowParallelLinear):
-            tp_rank = get_tensor_model_parallel_rank()
-            return self.quant_method.apply(layer, x, bias, tp_rank)
-        return self.quant_method.apply(layer, x, bias)
+            if layer.prefix.find("o_proj") != -1 and oproj_tp_enable():
+                tp_rank = get_otp_group().rank_in_group
+            elif layer.prefix.find("down_proj") != -1 and mlp_tp_enable():
+                tp_rank = get_mlp_tp_group().rank_in_group
+            else:
+                tp_rank = get_tensor_model_parallel_rank()
+        else:
+            tp_rank = 0
+        return self.quant_method.apply(layer, x, bias, tp_rank)


 class AscendKVCacheMethod(BaseKVCacheMethod):
    """KVCache method for Ascend quantization.

-    This class calls AscendQuantizer to search a specific quantization
-    implementations supported on ascend hardware for kvcache methods.
-
    Args:
        quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str) -> None:
-        self.quantizer = AscendQuantizer.get_quantizer(
-            quant_config.quant_description, prefix)
-        self.quant_method = self.quantizer.build_attention_method()
+        self.quant_method = get_quant_method(quant_config.quant_description,
+                                             prefix, "attention")

    def create_weights(self, layer: torch.nn.Module) -> None:
        # Different from linear method, there are no weight processing/slicing
@@ -263,18 +339,15 @@ class AscendKVCacheMethod(BaseKVCacheMethod):
 class AscendFusedMoEMethod(FusedMoEMethodBase):
    """FusedMoE method for Ascend quantization.

-    This class calls AscendQuantizer to search a specific quantization
-    implementations supported on ascend hardware for kvcache methods.
-
    Args:
        quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                 packed_modules_mapping: Dict[str, Any]):
-        self.quantizer = AscendQuantizer.get_quantizer(
-            quant_config.quant_description, prefix, packed_modules_mapping)
-        self.quant_method = self.quantizer.build_moe_method()
+        self.quant_method = get_quant_method(quant_config.quant_description,
+                                             prefix, "moe",
+                                             packed_modules_mapping)

    def create_weights(
        self,
@@ -341,17 +414,20 @@ class AscendFusedMoEMethod(FusedMoEMethodBase):
        if hasattr(self.quant_method, "process_weights_after_loading"):
            self.quant_method.process_weights_after_loading(layer)

+    def get_fused_moe_quant_config(self, layer: torch.nn.Module):
+        # TODO: implement this function; for now it is a no-op that returns None.
+        pass
+

 class AscendEmbeddingMethod(AscendLinearMethod):
    """Embedding method for Ascend quantization.
-      This class calls AscendQuantizer to search a specific quantization
-      implementations supported on ascend hardware for Embedding methods.
+    
      Args:
          quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                 packed_modules_mapping: Dict[str, Any]) -> None:
-        self.quantizer = AscendQuantizer.get_quantizer(
-            quant_config.quant_description, prefix, packed_modules_mapping)
-        self.quant_method = self.quantizer.build_linear_method()
+        self.quant_method = get_quant_method(quant_config.quant_description,
+                                             prefix, "linear",
+                                             packed_modules_mapping)
--- a/vllm_ascend/quantization/quantizer.py
+++ b/vllm_ascend/quantization/quantizer.py
@@ -1,311 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import importlib
-import sys
-import types
-from typing import Any, Dict, List, Optional
-
-from vllm.logger import logger
-
-from .func_wrapper import (wrapper_rmsnorm_forward_oot, wrapper_rmsnorm_init,
-                           wrapper_vocab_parallel_embedding_init)
-from .w4a8_dynamic import (AscendW4A8DynamicFusedMoEMethod,
-                           AscendW4A8DynamicLinearMethod)
-from .w8a8 import (AscendC8KVCacheMethod, AscendW8A8FusedMoEMethod,
-                   AscendW8A8LinearMethod)
-from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod,
-                           AscendW8A8DynamicLinearMethod)
-
-CUSTOMIZED_QUANTIZER_TYPE: List[str] = []
-
-
-class AscendQuantizer:
-    """An interface to different quantization implementations for ascend hardwares."""
-
-    @classmethod
-    def get_quantizer(cls,
-                      quant_config: Dict[str, Any],
-                      prefix: str,
-                      packed_modules_mapping: Optional[Dict[str,
-                                                            Any]] = dict()):
-        # TODO: Need a param to choose quantization algorithms.
-        quantization_algorithm = ''
-
-        if quantization_algorithm in CUSTOMIZED_QUANTIZER_TYPE:
-            return
-
-        return VLLMAscendQuantizer.get_quantizer(quant_config, prefix,
-                                                 packed_modules_mapping)
-
-    def build_linear_method(self):
-        raise NotImplementedError
-
-    def build_moe_method(self):
-        raise NotImplementedError
-
-    def build_attention_method(self):
-        raise NotImplementedError
-
-
-class VLLMAscendQuantizer:
-    _instance: Optional[object] = None
-    patched = False
-
-    def __init__(self, quant_description):
-        if VLLMAscendQuantizer.patched:
-            return
-        for name in quant_description.keys():
-            if "norm.bias" in name:
-                VLLMAscendQuantizer.apply_patch(
-                    "vllm.model_executor.layers.layernorm.RMSNorm", "__init__",
-                    [wrapper_rmsnorm_init])
-                VLLMAscendQuantizer.apply_patch(
-                    "vllm_ascend.ops.layernorm.AscendRMSNorm", "forward_oot",
-                    [wrapper_rmsnorm_forward_oot])
-                VLLMAscendQuantizer.apply_patch(
-                    "vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding",
-                    "__init__", [wrapper_vocab_parallel_embedding_init])
-                break
-        VLLMAscendQuantizer.patched = True
-        logger.info("Using the vLLM Ascend Quantizer version now!")
-
-    @staticmethod
-    def apply_patch(target_module, target_function, wrappers):
-
-        original_module, original_function = VLLMAscendQuantizer.parse_path(
-            target_module, target_function, False)
-
-        original_function_id = id(original_function)
-
-        candidate = original_function
-        for wrapper in wrappers:
-            candidate = wrapper(candidate)
-        if target_function is not None:
-            setattr(original_module, target_function, candidate)
-
-        for _, value in sys.modules.copy().items():
-            if target_function is None:
-                continue
-            try:
-                attr = getattr(value, target_function, None)
-                if attr is not None and id(attr) == original_function_id:
-                    setattr(value, target_function, candidate)
-            except ImportError:
-                continue
-
-    @staticmethod
-    def parse_path(module_path, function_name, create_dummy):
-        """
-        Parse module path and resolve/create modules as needed.
-
-        Args:
-            module_path: Dot-separated module path
-            function_name: Target function name (None for module only)
-            create_dummy: Create dummy modules/functions when missing
-
-        Returns:
-            Tuple of (resolved module, target function/none)
-
-        Raises:
-            ModuleNotFoundError: If module path is invalid and create_dummy=False
-            AttributeError: If function is missing and create_dummy=False
-        """
-        from importlib.machinery import ModuleSpec
-
-        def create_dummy_module(full_path, parent=None):
-            """Create and register a placeholder module"""
-            dummy = types.ModuleType(full_path)
-            dummy.__file__ = "vllm_ascend.dummy_module.py"
-            dummy.__spec__ = ModuleSpec(full_path, None)
-            sys.modules[full_path] = dummy
-            if parent:
-                setattr(parent, full_path.split(".")[-1], dummy)
-            return dummy
-
-        def create_placeholder_function(func_name):
-            """Create dummy function that raises when called"""
-
-            def placeholder(*args, **kwargs):
-                raise NotImplementedError(
-                    f"Function {func_name} is a placeholder")
-
-            placeholder.__name__ = func_name
-            return placeholder
-
-        modules = module_path.split(".")
-        current_module = None
-        processed_path = []
-
-        for idx, part in enumerate(modules):
-            current_path = ".".join(modules[:idx + 1])
-            parent_path = ".".join(modules[:idx]) if idx > 0 else None
-
-            try:
-                current_module = importlib.import_module(current_path)
-            except ModuleNotFoundError:
-                # Handle missing module
-                parent = importlib.import_module(
-                    parent_path) if parent_path else None
-                if parent and hasattr(parent, part):
-                    # Use existing attribute from parent
-                    current_module = getattr(parent, part)
-                    # Check for early function resolution
-                    if function_name and hasattr(current_module,
-                                                 function_name):
-                        return current_module, getattr(current_module,
-                                                       function_name)
-                    if function_name and create_dummy:
-                        ph_func = create_placeholder_function(function_name)
-                        setattr(current_module, function_name, ph_func)
-                        return current_module, ph_func
-                    if function_name:
-                        raise AttributeError(
-                            f"Function {function_name} missing in {current_path}"
-                        )
-                else:
-                    if not create_dummy:
-                        raise
-                    # Create and register dummy module
-                    current_module = create_dummy_module(
-                        current_path,
-                        parent=importlib.import_module(parent_path)
-                        if parent_path else None)
-
-            processed_path.append(part)
-
-        # Final function handling
-        final_module = sys.modules[module_path]
-        if function_name is not None:
-            if not hasattr(final_module, function_name):
-                if create_dummy:
-                    ph_func = create_placeholder_function(function_name)
-                    setattr(final_module, function_name, ph_func)
-                else:
-                    setattr(final_module, function_name, None)
-            return final_module, getattr(final_module, function_name)
-
-        return final_module, None
-
-    @staticmethod
-    def build_linear_method():
-        raise NotImplementedError(
-            "Linear method is not implemented for the current quant type.")
-
-    @staticmethod
-    def build_moe_method():
-        raise NotImplementedError(
-            "MoE method is not implemented for the current quant type.")
-
-    @staticmethod
-    def build_attention_method():
-        raise NotImplementedError(
-            "Attention method is not implemented for the current quant type.")
-
-    @staticmethod
-    def get_linear_quant_type(quant_description: Dict[str, Any], prefix: str,
-                              packed_modules_mapping: Dict[str, Any]):
-        proj_name = prefix.split(".")[-1]
-        if proj_name in packed_modules_mapping:
-            quant_type = None
-            shard_prefixes = [
-                prefix.replace(proj_name, shard_proj_name)
-                for shard_proj_name in packed_modules_mapping[proj_name]
-            ]
-            for shard_prefix in shard_prefixes:
-                shard_quant_type = quant_description[shard_prefix + '.weight']
-
-                if quant_type is None:
-                    quant_type = shard_quant_type
-                elif shard_quant_type != quant_type:
-                    raise ValueError(
-                        f"Not all shards of {prefix} are quantized with same quant type."
-                        f"Shard {proj_name} uses {shard_quant_type}, but another shard"
-                        f"use {quant_type}. Please check quantization config.")
-        else:
-            quant_type = quant_description[prefix + '.weight']
-        return quant_type
-
-    @classmethod
-    def get_quantizer(cls,
-                      quant_description: Dict[str, Any],
-                      prefix: str,
-                      packed_modules_mapping: Optional[Dict[str, Any]] = None):
-        if packed_modules_mapping is None:
-            packed_modules_mapping = dict()
-        # Attention
-        if '.attn' in prefix and 'fa_quant_type' in quant_description.keys():
-            quant_type = quant_description['fa_quant_type']
-        # Use KVCache int8
-        elif '.attn' in prefix and 'kv_quant_type' in quant_description.keys():
-            quant_type = quant_description['kv_quant_type']
-        # Linear
-        else:
-            quant_type = cls.get_linear_quant_type(quant_description, prefix,
-                                                   packed_modules_mapping)
-        if quant_type in SUPPORT_ASCEND_QUANTIZER_TYPE.keys():
-            cls = SUPPORT_ASCEND_QUANTIZER_TYPE[quant_type]
-            if not cls._instance:
-                cls._instance = cls(quant_description)
-            return cls._instance
-        raise NotImplementedError("Currently, vLLM Ascend only supports following quant types:" \
-                                  f"{list(SUPPORT_ASCEND_QUANTIZER_TYPE.keys())}")
-
-
-class W4A8DYNAMICQuantizer(VLLMAscendQuantizer):
-
-    @staticmethod
-    def build_linear_method():
-        return AscendW4A8DynamicLinearMethod()
-
-    @staticmethod
-    def build_moe_method():
-        return AscendW4A8DynamicFusedMoEMethod()
-
-
-class W8A8Quantizer(VLLMAscendQuantizer):
-
-    @staticmethod
-    def build_linear_method():
-        return AscendW8A8LinearMethod()
-
-    @staticmethod
-    def build_moe_method():
-        return AscendW8A8FusedMoEMethod()
-
-    @staticmethod
-    def build_attention_method():
-        return AscendC8KVCacheMethod()
-
-
-class W8A8DYNAMICQuantizer(VLLMAscendQuantizer):
-
-    @staticmethod
-    def build_linear_method():
-        return AscendW8A8DynamicLinearMethod()
-
-    @staticmethod
-    def build_moe_method():
-        return AscendW8A8DynamicFusedMoEMethod()
-
-
-SUPPORT_ASCEND_QUANTIZER_TYPE = {
-    "W4A8_DYNAMIC": W4A8DYNAMICQuantizer,
-    "W8A8": W8A8Quantizer,
-    "W8A8_DYNAMIC": W8A8DYNAMICQuantizer,
-    "C8": W8A8Quantizer,
-}
--- a/vllm_ascend/quantization/utils.py
+++ b/vllm_ascend/quantization/utils.py
@@ -0,0 +1,83 @@
+from typing import Any, Dict, Optional, Type
+
+from vllm.logger import logger
+
+from .w4a8_dynamic import (AscendW4A8DynamicFusedMoEMethod,
+                           AscendW4A8DynamicLinearMethod)
+from .w8a8 import (AscendC8KVCacheMethod, AscendW8A8FusedMoEMethod,
+                   AscendW8A8LinearMethod)
+from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod,
+                           AscendW8A8DynamicLinearMethod)
+
+ASCEND_QUANTIZATION_METHOD_MAP: Dict[str, Dict[str, Type[Any]]] = {
+    "W4A8_DYNAMIC": {
+        "linear": AscendW4A8DynamicLinearMethod,
+        "moe": AscendW4A8DynamicFusedMoEMethod,
+    },
+    "W8A8": {
+        "linear": AscendW8A8LinearMethod,
+        "moe": AscendW8A8FusedMoEMethod,
+        "attention": AscendC8KVCacheMethod,
+    },
+    "W8A8_DYNAMIC": {
+        "linear": AscendW8A8DynamicLinearMethod,
+        "moe": AscendW8A8DynamicFusedMoEMethod,
+    },
+    "C8": {
+        "attention": AscendC8KVCacheMethod,
+    },
+}
+
+
+def get_linear_quant_type(quant_description: Dict[str, Any], prefix: str,
+                          packed_modules_mapping: Dict[str, Any]):
+    """Look up the quant type for ``prefix``; shards of a packed module must
+    all agree (ValueError otherwise; KeyError if an entry is missing)."""
+    parent, _, proj_name = prefix.rpartition(".")
+    if proj_name not in packed_modules_mapping:
+        return quant_description[prefix + '.weight']
+    # Swap only the last path component (str.replace hits every occurrence).
+    stem = parent + "." if parent else ""
+    quant_type = None
+    for shard_proj_name in packed_modules_mapping[proj_name]:
+        shard_quant_type = quant_description[stem + shard_proj_name + '.weight']
+        if quant_type is None:
+            quant_type = shard_quant_type
+        elif shard_quant_type != quant_type:
+            raise ValueError(
+                f"Not all shards of {prefix} are quantized with same quant type. "
+                f"Shard {shard_proj_name} uses {shard_quant_type}, but another "
+                f"shard uses {quant_type}. Please check quantization config.")
+    return quant_type
+
+
+def get_quant_method(quant_description: Dict[str, Any],
+                     prefix: str,
+                     layer_type: str,
+                     packed_modules_mapping: Optional[Dict[str, Any]] = None):
+    """Instantiate the Ascend quant method for ``prefix`` and ``layer_type``.
+
+    Raises NotImplementedError for an unsupported quant type or layer type.
+    """
+    logger.info_once("Using the vLLM Ascend Quantization now!")
+    if packed_modules_mapping is None:
+        packed_modules_mapping = {}
+    is_attn = '.attn' in prefix
+    if is_attn and 'fa_quant_type' in quant_description:  # Attention
+        quant_type = quant_description['fa_quant_type']
+    elif is_attn and 'kv_quant_type' in quant_description:  # KVCache int8
+        quant_type = quant_description['kv_quant_type']
+    else:  # Linear
+        quant_type = get_linear_quant_type(quant_description, prefix,
+                                           packed_modules_mapping)
+    method_map = ASCEND_QUANTIZATION_METHOD_MAP.get(quant_type)
+    if method_map is None:
+        raise NotImplementedError(
+            "Currently, vLLM Ascend only supports following quant types: "
+            f"{list(ASCEND_QUANTIZATION_METHOD_MAP)}")
+    if layer_type not in method_map:
+        raise NotImplementedError(
+            f"Currently, vLLM Ascend doesn't support {quant_type} for {layer_type}.")
+    return method_map[layer_type]()
--- a/vllm_ascend/quantization/w4a8_dynamic.py
+++ b/vllm_ascend/quantization/w4a8_dynamic.py
@@ -24,10 +24,10 @@ from vllm.config import get_current_vllm_config
 from vllm.distributed import get_ep_group
 from vllm.forward_context import get_forward_context

-from vllm_ascend.ascend_forward_context import FusedMoEState
+from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.distributed.parallel_state import get_mc2_group
-from vllm_ascend.ops.fused_moe import unified_fused_experts_eager
-from vllm_ascend.ops.layers.experts_selector import select_experts
+from vllm_ascend.ops.moe.experts_selector import select_experts
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ


 class AscendW4A8DynamicLinearMethod:
@@ -133,11 +133,14 @@ class AscendW4A8DynamicFusedMoEMethod:
        vllm_config = get_current_vllm_config()
        self.group_size = vllm_config.quant_config.quant_description.get(
            "group_size", 256)
+        # NOTE: the weights are quantized from bf16 to int4 through a per-channel quantization process
+        self.is_per_channel_weight = self.group_size == 0
        quant_version = vllm_config.quant_config.quant_description.get(
            "version", "0")
        # NOTE: new quantize weights: 2 int4 pack into int8
        self.new_quant_version = quant_version == "1.0.0"
        self.tp_size = 1 if vllm_config.parallel_config.enable_expert_parallel else self.ep_group.world_size
+        self.dynamic_eplb = get_ascend_config().dynamic_eplb
        if self.new_quant_version and self.tp_size > 16:
            raise ValueError(
                "The current weight does not support moe part tp>16.")
@@ -182,44 +185,44 @@ class AscendW4A8DynamicFusedMoEMethod:
            num_experts,
            2 * intermediate_size_per_partition,
            1,
-            dtype=params_dtype)
+            dtype=torch.float32)

        param_dict["w13_weight_offset"] = torch.empty(
            num_experts,
            2 * intermediate_size_per_partition,
            1,
-            dtype=params_dtype)
-
-        param_dict["w13_weight_scale_second"] = torch.empty(
-            num_experts,
-            2 * intermediate_size_per_partition,
-            hidden_sizes // self.group_size,
-            dtype=params_dtype)
-
-        param_dict["w13_weight_offset_second"] = torch.empty(
-            num_experts,
-            2 * intermediate_size_per_partition,
-            hidden_sizes // self.group_size,
-            dtype=params_dtype)
+            dtype=torch.float32)

        param_dict["w2_weight_scale"] = torch.empty(num_experts,
                                                    hidden_sizes,
                                                    1,
-                                                    dtype=params_dtype)
+                                                    dtype=torch.float32)
        param_dict["w2_weight_offset"] = torch.empty(num_experts,
                                                     hidden_sizes,
                                                     1,
-                                                     dtype=params_dtype)
-        param_dict["w2_weight_scale_second"] = torch.empty(
-            num_experts,
-            hidden_sizes,
-            intermediate_size_per_partition // self.group_size,
-            dtype=params_dtype)
-        param_dict["w2_weight_offset_second"] = torch.empty(
-            num_experts,
-            hidden_sizes,
-            intermediate_size_per_partition // self.group_size,
-            dtype=params_dtype)
+                                                     dtype=torch.float32)
+        if not self.is_per_channel_weight:
+            param_dict["w13_weight_scale_second"] = torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_sizes // self.group_size,
+                dtype=torch.float32)
+            param_dict["w13_weight_offset_second"] = torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_sizes // self.group_size,
+                dtype=torch.float32)
+
+            param_dict["w2_weight_scale_second"] = torch.empty(
+                num_experts,
+                hidden_sizes,
+                intermediate_size_per_partition // self.group_size,
+                dtype=torch.float32)
+            param_dict["w2_weight_offset_second"] = torch.empty(
+                num_experts,
+                hidden_sizes,
+                intermediate_size_per_partition // self.group_size,
+                dtype=torch.float32)

        if self.new_quant_version:
            param_dict["w13_scale_bias"] = torch.empty(
@@ -275,14 +278,6 @@ class AscendW4A8DynamicFusedMoEMethod:
            e_score_correction_bias=e_score_correction_bias,
            global_num_experts=global_num_experts)

-        fused_moe_state = get_forward_context().fused_moe_state
-        shared_gate_up, shared_dequant_scale = None, None
-        if shared_experts is not None and fused_moe_state == FusedMoEState.MC2:
-            share_up_out, _ = shared_experts.gate_up_proj(
-                (quantized_x_for_share, dynamic_scale_for_share))
-            shared_gate_up, shared_dequant_scale = share_up_out[
-                0], share_up_out[1]
-
        # this is a naive implementation for experts load balance so as
        # to avoid accumulating too much tokens on a single rank.
        # currently it is only activated when doing profile runs.
@@ -291,27 +286,36 @@ class AscendW4A8DynamicFusedMoEMethod:

        topk_weights = topk_weights.to(x.dtype)

-        return unified_fused_experts_eager(
+        moe_comm_method = get_forward_context().moe_comm_method
+        return moe_comm_method.fused_experts(
            hidden_states=x,
            w1=layer.w13_weight,
            w2=layer.w2_weight,
-            w1_scale=layer.w13_weight_scale_second,
-            w2_scale=layer.w2_weight_scale_second,
+            w1_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
            w1_scale_bias=layer.w13_scale_bias,
            w2_scale_bias=layer.w2_scale_bias,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            row_idx=row_idx,
+            use_int4_w4a8=True,
            expert_map=expert_map,
            log2phy=log2phy,
            global_redundant_expert_num=global_redundant_expert_num,
            shared_experts=shared_experts,
-            shared_gate_up=shared_gate_up,
-            shared_dequant_scale=shared_dequant_scale,
-            mc2_mask=kwargs.get("mc2_mask", None),
-            with_quant=True)
+            quantized_x_for_share=quantized_x_for_share,
+            dynamic_scale_for_share=dynamic_scale_for_share,
+            dynamic_eplb=self.dynamic_eplb)

    def process_scale(self, weight: torch.Tensor, scale, per_group_scale):
+        scale = scale.transpose(1, 2).contiguous()
+        if self.is_per_channel_weight:
+            scale_np = scale.cpu().numpy()
+            scale_np.dtype = np.uint32
+            scale_uint64_tensor = torch.from_numpy(scale_np.astype(
+                np.int64)).npu()
+            return scale_uint64_tensor, None
+        per_group_scale = per_group_scale.transpose(1, 2).contiguous()
        group_num, k, n = weight.shape
        # the weight of the new version is reduced by half by pack n, so it needs to be restored
        if self.new_quant_version:
@@ -354,13 +358,10 @@ class AscendW4A8DynamicFusedMoEMethod:

    def pack_to_int32(self, weight: torch.Tensor):
        if self.new_quant_version:
-            group_num, k, n = weight.shape
-            assert n % 4 == 0, "the last dim of weight needs to be divided by 4"
-            packed_n = n // 4
            # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4
-            packed_weight = torch.from_numpy(
-                np.frombuffer(weight.cpu().numpy().tobytes(), dtype=np.int32))
-            return packed_weight.reshape(group_num, k, packed_n).npu()
+            assert weight.shape[
+                -1] % 4 == 0, "the last dim of weight needs to be divided by 4"
+            return weight.view(torch.int32).contiguous()
        else:
            return torch_npu.npu_quantize(weight.to(torch.float32),
                                          torch.tensor([1.]).npu(), None,
@@ -372,23 +373,29 @@ class AscendW4A8DynamicFusedMoEMethod:
                1, 2).contiguous()
            layer.w2_weight.data = layer.w2_weight.data.transpose(
                1, 2).contiguous()
-        layer.w13_weight_scale.data = layer.w13_weight_scale.data.transpose(
-            1, 2).contiguous()
-        layer.w2_weight_scale.data = layer.w2_weight_scale.data.transpose(
-            1, 2).contiguous()
-        layer.w13_weight_scale_second.data = layer.w13_weight_scale_second.data.transpose(
-            1, 2).contiguous()
-        layer.w2_weight_scale_second.data = layer.w2_weight_scale_second.data.transpose(
-            1, 2).contiguous()

-        layer.w13_weight_scale_second.data, w13_bias = self.process_scale(
+        w13_weight_scale_second = layer.w13_weight_scale_second.data if hasattr(
+            layer, "w13_weight_scale_second") else None
+        w2_weight_scale_second = layer.w2_weight_scale_second.data if hasattr(
+            layer, "w2_weight_scale_second") else None
+        layer.w13_weight_scale.data, w13_bias = self.process_scale(
            layer.w13_weight, layer.w13_weight_scale.data,
-            layer.w13_weight_scale_second.data)
-        layer.w2_weight_scale_second.data, w2_bias = self.process_scale(
+            w13_weight_scale_second)
+        layer.w2_weight_scale.data, w2_bias = self.process_scale(
            layer.w2_weight, layer.w2_weight_scale.data,
-            layer.w2_weight_scale_second.data)
+            w2_weight_scale_second)
+        if hasattr(layer, "w13_weight_scale_second"):
+            # scale_second is no longer used, release this part of the memory
+            del layer.w13_weight_scale_second
+            del layer.w2_weight_scale_second
+            del layer.w13_weight_offset_second
+            del layer.w2_weight_offset_second

        self.update_bias(layer, w13_bias, w2_bias)

+        layer.w13_weight.data = torch_npu.npu_format_cast(
+            layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ)
+        layer.w2_weight.data = torch_npu.npu_format_cast(
+            layer.w2_weight.data, ACL_FORMAT_FRACTAL_NZ)
        layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data)
        layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data)
--- a/vllm_ascend/quantization/w8a8.py
+++ b/vllm_ascend/quantization/w8a8.py
@@ -23,7 +23,7 @@ from vllm.attention.backends.abstract import AttentionType
 from vllm.distributed.parallel_state import get_ep_group

 from vllm_ascend.attention.attention_v1 import AscendAttentionState
-from vllm_ascend.ops.layers.experts_selector import select_experts
+from vllm_ascend.ops.moe.experts_selector import select_experts
 from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p


--- a/vllm_ascend/quantization/w8a8_dynamic.py
+++ b/vllm_ascend/quantization/w8a8_dynamic.py
@@ -23,181 +23,10 @@ from vllm.config import CompilationLevel, get_current_vllm_config
 from vllm.distributed import get_ep_group
 from vllm.forward_context import get_forward_context

-import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config
-from vllm_ascend.ascend_forward_context import FusedMoEState
 from vllm_ascend.distributed.parallel_state import get_mc2_group
-from vllm_ascend.ops.common_fused_moe import \
-    fused_experts as unified_fused_experts
-from vllm_ascend.ops.fused_moe import unified_fused_experts_eager
-from vllm_ascend.ops.layers.experts_selector import select_experts
-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, dispose_tensor
-
-
-def apply_mlp_decode(hidden_states: torch.Tensor,
-                     w1: torch.Tensor,
-                     w1_scale: torch.Tensor,
-                     w2: torch.Tensor,
-                     w2_scale: torch.Tensor,
-                     group_list: torch.Tensor,
-                     dynamic_scale: torch.Tensor = None,
-                     group_list_type: int = 1) -> torch.Tensor:
-    """
-    apply MLP: gate_up_proj -> swiglu -> down_proj
-    Args:
-        hidden_states_wrapper: wrapper of input hidden states with shape (num_tokens, hidden_size).
-        w1: expert weights1 with shape
-            (num_experts, hidden_size, intermediate_size * 2)
-        w1_scale: weights1 scale with shape (num_experts, intermediate_size * 2)
-        w2: expert weights2 with shape
-            (num_experts, intermediate_size, hidden_size)
-        w2_scale: weights2 scale with shape (num_experts, hidden_size)
-        group_list: number of tokens for each expert, follow cumsum mode, and
-            with shape (num_experts).
-        transpose_weight:
-            w1: (num_experts, intermediate_size * 2, hidden_size) ->
-                    (num_experts, hidden_size, intermediate_size * 2)
-            w2: (num_experts, hidden_size, intermediate_size) ->
-                    (num_experts, intermediate_size, hidden_size)
-    Returns:
-        hidden_states: output hidden states after MLP.
-    """
-
-    if dynamic_scale is None:
-        unquantized_hidden_states = hidden_states
-        hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant(
-            hidden_states)
-        # Dispose the original unquantized hidden states
-        # to save npu memory because they're no longer used.
-        dispose_tensor(unquantized_hidden_states)
-    else:
-        pertoken_scale = dynamic_scale
-
-    # gmm1: gate_up_proj
-    hidden_states = torch_npu.npu_grouped_matmul(
-        x=[hidden_states],
-        weight=[w1],
-        split_item=3,
-        group_list_type=group_list_type,
-        group_type=0,
-        group_list=group_list,
-        output_dtype=torch.int32)[0]
-
-    # act_fn: swiglu
-    hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant(
-        x=hidden_states,
-        weight_scale=w1_scale,
-        activation_scale=pertoken_scale,
-        bias=None,
-        quant_scale=None,
-        quant_offset=None,
-        group_index=group_list,
-        activate_left=True,
-        quant_mode=1,
-    )
-
-    # gmm2: down_proj
-    hidden_states = torch_npu.npu_grouped_matmul(
-        x=[hidden_states],
-        weight=[w2],
-        scale=[w2_scale],
-        per_token_scale=[swiglu_out_scale],
-        split_item=2,
-        group_list_type=group_list_type,
-        group_type=0,
-        group_list=group_list,
-        output_dtype=w2_scale.dtype)[0]
-    return hidden_states
-
-
-def apply_mlp(hidden_states: torch.Tensor,
-              w1: torch.Tensor,
-              w1_scale: torch.Tensor,
-              w2: torch.Tensor,
-              w2_scale: torch.Tensor,
-              group_list: torch.Tensor,
-              dynamic_scale: torch.Tensor = None,
-              group_list_type: int = 1,
-              w1_scale_bias: torch.Tensor = None,
-              w2_scale_bias: torch.Tensor = None) -> torch.Tensor:
-    """
-    apply MLP: gate_up_proj -> swiglu -> down_proj
-
-    Args:
-        hidden_states: input hidden states with shape (num_tokens, hidden_size).
-        w1: expert weights1 with shape
-            (num_experts, hidden_size, intermediate_size * 2)
-        w1_scale: weights1 scale with shape (num_experts, intermediate_size * 2)
-        w2: expert weights2 with shape
-            (num_experts, intermediate_size, hidden_size)
-        w2_scale: weights2 scale with shape (num_experts, hidden_size)
-        group_list: number of tokens for each expert, follow cumsum mode, and
-            with shape (num_experts).
-        transpose_weight:
-            w1: (num_experts, intermediate_size * 2, hidden_size) ->
-                    (num_experts, hidden_size, intermediate_size * 2)
-            w2: (num_experts, hidden_size, intermediate_size) ->
-                    (num_experts, intermediate_size, hidden_size)
-
-    Returns:
-        hidden_states: output hidden states after MLP.
-    """
-
-    if dynamic_scale is None:
-        unquantized_hidden_states = hidden_states
-        hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant(
-            hidden_states)
-        # Dispose the original unquantized hidden states
-        # to save npu memory because they're no longer used.
-        dispose_tensor(unquantized_hidden_states)
-    else:
-        pertoken_scale = dynamic_scale
-
-    bias1, bias2 = None, None
-    _output_dtype = w2_scale.dtype
-
-    if w1_scale_bias is not None:
-        if group_list_type == 0:
-            group_list = torch.cat(
-                [group_list[:1], torch.diff(group_list, dim=0)])
-            group_list_type = 1
-        bias1 = [w1_scale_bias]
-        bias2 = [w2_scale_bias]
-        # TODO w4a8 scene: dynamic acquisition of dtype in the future
-        _output_dtype = torch.bfloat16
-
-    # gmm1: gate_up_proj
-    hidden_states = torch_npu.npu_grouped_matmul(
-        x=[hidden_states],
-        weight=[w1],
-        scale=[w1_scale],
-        bias=bias1,
-        per_token_scale=[pertoken_scale],
-        split_item=2,
-        group_list_type=group_list_type,
-        group_type=0,
-        group_list=group_list,
-        output_dtype=_output_dtype)[0]
-
-    # act_fn: swiglu
-    hidden_states = torch_npu.npu_swiglu(hidden_states)
-    hidden_states, swiglu_out_scale = torch_npu.npu_dynamic_quant(
-        hidden_states)
-
-    # gmm2: down_proj
-    hidden_states = torch_npu.npu_grouped_matmul(
-        x=[hidden_states],
-        weight=[w2],
-        scale=[w2_scale],
-        bias=bias2,
-        per_token_scale=[swiglu_out_scale],
-        split_item=2,
-        group_list_type=group_list_type,
-        group_type=0,
-        group_list=group_list,
-        output_dtype=_output_dtype)[0]
-
-    return hidden_states
+from vllm_ascend.ops.moe.experts_selector import select_experts
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ


 class AscendW8A8DynamicLinearMethod:
@@ -271,8 +100,9 @@ class AscendW8A8DynamicLinearMethod:
    def process_weights_after_loading(self, layer):
        if self.transpose_weight:
            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
-        # cast quantized weight tensors in NZ format (29) for higher inference speed
-        layer.weight.data = torch_npu.npu_format_cast(layer.weight.data, 29)
+        # cast quantized weight tensors in NZ format for higher inference speed
+        layer.weight.data = torch_npu.npu_format_cast(layer.weight.data,
+                                                      ACL_FORMAT_FRACTAL_NZ)
        layer.weight_scale.data = layer.weight_scale.data.flatten()
        layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32)
        layer.weight_offset.data = layer.weight_offset.data.flatten()
@@ -293,6 +123,7 @@ class AscendW8A8DynamicFusedMoEMethod:
            vllm_config.compilation_config.level == CompilationLevel.PIECEWISE
            and not vllm_config.model_config.enforce_eager
            and not ascend_config.torchair_graph_config.enabled)
+        self.dynamic_eplb = ascend_config.dynamic_eplb

        try:
            device_group = get_mc2_group().device_group
@@ -387,25 +218,19 @@ class AscendW8A8DynamicFusedMoEMethod:
            global_num_experts=global_num_experts)

        if self.use_aclgraph:
-            return unified_fused_experts(
+            moe_comm_method = get_forward_context().moe_comm_method
+            return moe_comm_method.fused_experts(
                hidden_states=x,
                w1=layer.w13_weight,
                w2=layer.w2_weight,
                topk_weights=topk_weights,
                topk_ids=topk_ids,
+                row_idx=row_idx,
                use_int8_w8a8=True,
                w1_scale=layer.w13_weight_scale,
                w2_scale=layer.w2_weight_scale,
                expert_map=expert_map,
-            )
-
-        fused_moe_state = get_forward_context().fused_moe_state
-        shared_gate_up, shared_dequant_scale = None, None
-        if shared_experts is not None and fused_moe_state == FusedMoEState.MC2:
-            share_up_out, _ = shared_experts.gate_up_proj(
-                (quantized_x_for_share, dynamic_scale_for_share))
-            shared_gate_up, shared_dequant_scale = share_up_out[
-                0], share_up_out[1]
+                dynamic_eplb=self.dynamic_eplb)

        # this is a naive implementation for experts load balance so as
        # to avoid accumulating too much tokens on a single rank.
@@ -415,23 +240,24 @@ class AscendW8A8DynamicFusedMoEMethod:

        topk_weights = topk_weights.to(x.dtype)

-        return unified_fused_experts_eager(
+        moe_comm_method = get_forward_context().moe_comm_method
+        return moe_comm_method.fused_experts(
            hidden_states=x,
            w1=layer.w13_weight,
-            w1_scale=layer.w13_weight_scale,
+            w1_scale=layer.w13_weight_scale_fp32,
            w2=layer.w2_weight,
            w2_scale=layer.w2_weight_scale,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            row_idx=row_idx,
+            use_int8_w8a8=True,
            expert_map=expert_map,
            log2phy=log2phy,
            global_redundant_expert_num=global_redundant_expert_num,
            shared_experts=shared_experts,
-            shared_gate_up=shared_gate_up,
-            shared_dequant_scale=shared_dequant_scale,
-            mc2_mask=kwargs.get("mc2_mask", None),
-            with_quant=True)
+            quantized_x_for_share=quantized_x_for_share,
+            dynamic_scale_for_share=dynamic_scale_for_share,
+            dynamic_eplb=self.dynamic_eplb)

    def process_weights_after_loading(self, layer):
        if self.transpose_weight:
@@ -439,8 +265,8 @@ class AscendW8A8DynamicFusedMoEMethod:
                1, 2).contiguous()
            layer.w2_weight.data = layer.w2_weight.data.transpose(
                1, 2).contiguous()
-        if envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP:
-            torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ)
+        torch_npu.npu_format_cast_(layer.w13_weight, ACL_FORMAT_FRACTAL_NZ)
+        torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ)
        layer.w13_weight_scale.data = layer.w13_weight_scale.data.view(
            layer.w13_weight_scale.data.shape[0], -1)
        layer.w13_weight_scale_fp32 = layer.w13_weight_scale.data.to(