forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
This commit is contained in:
@@ -0,0 +1,2 @@
|
||||
# Banner so a user can see at import time that the Expert Parallel demo
# package (rather than stock vllm_mlu) is active.
print("Apply Expert Parallel Demo!")
# Importing the subpackage runs its modules, which register the MLU hijacks
# (via MluHijackObject.apply_hijack calls) as an import side effect.
from . import model_executor
|
||||
@@ -0,0 +1,5 @@
|
||||
from .layers import sparse_moe_mlp
|
||||
from .models import custom
|
||||
from .models import mixtral
|
||||
from .models import qwen2_moe
|
||||
from .models import deepseek_v2
|
||||
@@ -0,0 +1 @@
|
||||
|
||||
@@ -0,0 +1,142 @@
|
||||
"""
|
||||
Inference-only MOE model.
|
||||
|
||||
Tensor Parallel evenly splits each expert's weight and distributes them to different ranks,
|
||||
which means each rank holds partial weight of all experts.
|
||||
While Expert Parallel evenly distributes some of the experts' full weight to different ranks,
|
||||
which means each rank holds part of the experts' full weight.
|
||||
|
||||
As a result, each rank in the Tensor Parallel group receives all tokens' hidden states for all experts,
|
||||
then computes using the partial weights, while for Expert Parallel, each rank only receives
|
||||
part of tokens' hidden states for experts on this rank, then computes using the full weights.
|
||||
|
||||
When both Tensor Parallel and Expert Parallel are enabled, each rank handles
|
||||
a portion of the expert weights matrices (as in EP mode) and these weights are further sliced
|
||||
across ranks (as in TP mode). This hybrid approach aims to balance the workload more evenly across ranks,
|
||||
enhancing efficiency and reducing the likelihood of bottlenecks associated with EP mode alone.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from vllm.distributed import (get_tensor_model_parallel_rank,
|
||||
get_tensor_model_parallel_world_size,
|
||||
get_tensor_model_parallel_group)
|
||||
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
|
||||
get_moe_tensor_parallel_rank, get_moe_tensor_parallel_world_size, get_moe_tensor_parallel_group,
|
||||
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size, get_moe_expert_parallel_group)
|
||||
from vllm.model_executor.layers.linear import ReplicatedLinear
|
||||
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
||||
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
|
||||
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from vllm_mlu._mlu_utils import get_device_major_capability
|
||||
|
||||
|
||||
def vllm__mlu_hijack__model_executor__layers__feed_forward__SparseMoeMlp____init__(
        self,
        num_experts: int,
        top_k: int,
        hidden_size: int,
        intermediate_size: int,
        up_proj_name: str,
        is_gated: bool,
        down_proj_name: str,
        has_bias: bool,
        skip_bias_add: bool = False,
        renormalize: bool = False,
        hidden_act: str = "silu",
        params_dtype: Optional[torch.dtype] = None,
        quant_config: Optional[QuantizationConfig] = None,
        is_use_fused_moe: bool = False,
        expert_group: int = 1,
        topk_group: int = 1,
):
    """Replacement for ``SparseMoeMlp.__init__`` that adds MoE TP/EP support.

    Installed via ``MluHijackObject.apply_hijack`` below. In addition to the
    plain tensor-parallel setup, it reads the MoE-specific tensor-parallel and
    expert-parallel process groups and instantiates only the experts assigned
    to this EP rank (``start_expert_id`` .. ``end_expert_id``), each built as a
    ``FeedForward`` sharded over the MoE TP group.
    """
    # Bypass SparseMoeMlp's own __init__ (we are replacing it) and run the
    # nn.Module base initializer directly.
    super(SparseMoeMlp, self).__init__()
    self.tp_rank = get_tensor_model_parallel_rank()
    self.tp_size = get_tensor_model_parallel_world_size()
    self.tp_group = get_tensor_model_parallel_group()
    self.num_total_experts = num_experts
    self.top_k = top_k
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.up_proj_name = up_proj_name
    self.is_gated = is_gated
    self.down_proj_name = down_proj_name
    self.has_bias = has_bias
    self.renormalize = renormalize
    self.hidden_act = hidden_act
    self.quant_config = quant_config
    self.is_use_fused_moe = is_use_fused_moe
    self.expert_group = expert_group
    self.topk_group = topk_group
    # Fused MoE kernels are disabled on major-capability-3 devices
    # (presumably unsupported there -- TODO confirm against kernel docs).
    if get_device_major_capability() == 3:
        self.is_use_fused_moe = False

    if params_dtype is None:
        params_dtype = torch.get_default_dtype()
    self.params_dtype = params_dtype

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add moe relative distribution
    '''
    self.moe_tp_size = get_moe_tensor_parallel_world_size()
    self.moe_tp_rank = get_moe_tensor_parallel_rank()
    self.moe_tp_group = get_moe_tensor_parallel_group()
    self.moe_ep_size = get_moe_expert_parallel_world_size()
    self.moe_ep_rank = get_moe_expert_parallel_rank()
    self.moe_ep_group = get_moe_expert_parallel_group()

    # NOTE: The bias for fc2 is only applied on tp_rank 0. If we added it on all nodes the allreduce() would
    # contain multiple copies of the bias. The bias on other node will be ignored, and may be set to nullptr
    self.skip_bias_add = self.moe_tp_rank > 0

    assert self.num_total_experts >= self.moe_ep_size, (
        f"need num_total_experts:{self.num_total_experts} >= moe_ep_size:{self.moe_ep_size}")

    assert self.intermediate_size % self.moe_tp_size == 0, (
        f"need intermediate_size:{self.intermediate_size} % moe_tp_size:{self.moe_tp_size} == 0")

    # Each EP rank owns a contiguous chunk of ceil(total / ep_size) experts;
    # the last rank takes whatever is left over.
    self.num_experts_per_rank = (self.num_total_experts + self.moe_ep_size - 1) // self.moe_ep_size
    if self.moe_ep_rank + 1 == self.moe_ep_size and self.num_total_experts % self.moe_ep_size:
        # BUGFIX: the leftover on the last rank is
        # total - (ep_size - 1) * ceil(total / ep_size), NOT total % ep_size.
        # e.g. 10 experts over 4 ranks: ranks 0-2 own 3 each, the last rank
        # must own 1 (10 % 4 == 2 would overlap rank 2's experts and make
        # end_expert_id exceed num_total_experts).
        self.num_experts_per_rank = (self.num_total_experts
                                     - (self.moe_ep_size - 1) * self.num_experts_per_rank)

    self.start_expert_id = self.moe_ep_rank * ((self.num_total_experts + self.moe_ep_size - 1) // self.moe_ep_size)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    self.end_expert_id = self.start_expert_id + self.num_experts_per_rank

    # Gate always runs at half / full precision for now.
    self.gate = ReplicatedLinear(self.hidden_size,
                                 self.num_total_experts,
                                 bias=False,
                                 params_dtype=self.params_dtype,
                                 quant_config=None)
    # Only the experts assigned to this EP rank are materialized; each one is
    # TP-sharded over the MoE TP group with the allreduce deferred
    # (reduce_results=False -- the caller reduces after combining experts).
    self.experts = nn.ModuleList([
        FeedForward(hidden_size=self.hidden_size,
                    intermediate_size=self.intermediate_size,
                    hidden_act=self.hidden_act,
                    up_proj_name=self.up_proj_name,
                    is_gated=self.is_gated,
                    down_proj_name=self.down_proj_name,
                    bias=self.has_bias,
                    quant_config=self.quant_config,
                    skip_bias_add=self.skip_bias_add,
                    reduce_results=False,
                    tp_group=self.moe_tp_group) for idx in range(self.num_experts_per_rank)
    ])

    self.init_pack_param()


MluHijackObject.apply_hijack(SparseMoeMlp,
                             SparseMoeMlp.__init__,
                             vllm__mlu_hijack__model_executor__layers__feed_forward__SparseMoeMlp____init__)
|
||||
@@ -0,0 +1,183 @@
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from typing import Optional
|
||||
from vllm.config import CacheConfig
|
||||
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
||||
from vllm_mlu._mlu_utils import *
|
||||
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from vllm.distributed import tensor_model_parallel_all_reduce
|
||||
from vllm_mlu.transformers_utils.configs import CustomConfig
|
||||
from vllm_mlu.model_executor.custom_model.custom import CustomDecoderLayer, CustomAttention, _NORM_DICT
|
||||
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
|
||||
from vllm.model_executor.layers.linear import ReplicatedLinear
|
||||
from vllm_mlu.model_executor.models.layer_utils import (
|
||||
decoder_layer_forward_base, is_per_tensor_smoothquant,
|
||||
is_per_token_smoothquant, quant_fusion_with_rmsnorm,
|
||||
quant_fusion_with_layernorm)
|
||||
|
||||
|
||||
class CustomMoeBlock(SparseMoeMlp):
    """Sparse-MoE block for the Custom model family, built on SparseMoeMlp.

    Adds an optional Qwen2-MoE-style shared expert whose output is gated by a
    sigmoid over a 1-output ReplicatedLinear and summed with the routed-expert
    output.
    """

    def __init__(
        self,
        config: CustomConfig,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        # Map the model config onto SparseMoeMlp's generic MoE arguments.
        super().__init__(num_experts=config.num_experts,
                         top_k=config.num_experts_per_tok,
                         hidden_size=config.hidden_size,
                         intermediate_size=config.moe_intermediate_size,
                         up_proj_name="gate_up_proj",
                         is_gated=config.is_gated,
                         down_proj_name="down_proj",
                         has_bias=config.mlp_bias,
                         skip_bias_add=False,
                         renormalize=config.norm_topk_prob,
                         hidden_act=config.hidden_act,
                         params_dtype=None,
                         quant_config=quant_config,
                         is_use_fused_moe=True)

        self.config = config
        self.rank = self.tp_rank
        self.shared_expert = None
        self.shared_expert_gate = None
        # A shared expert is only built when the config asks for one.
        if config.shared_expert_intermediate_size > 0:
            # reduce_results=False: the allreduce (if any) happens once at the
            # end of forward(), after routed and shared outputs are combined.
            self.shared_expert = FeedForward(hidden_size=config.hidden_size,
                                             intermediate_size=config.shared_expert_intermediate_size,
                                             hidden_act=config.hidden_act,
                                             up_proj_name='gate_up_proj',
                                             is_gated=config.is_gated,
                                             down_proj_name='down_proj',
                                             bias=config.mlp_bias,
                                             quant_config=quant_config,
                                             reduce_results=False)
            # Scalar (1-output) gate deciding how much shared-expert output
            # to blend in, replicated on every rank.
            self.shared_expert_gate = ReplicatedLinear(config.hidden_size,
                                                       1,
                                                       bias=False,
                                                       params_dtype=self.params_dtype,
                                                       quant_config=None)

    def forward(self, hidden_states: torch.Tensor, residual: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Route tokens through top-k experts (plus optional shared expert).

        hidden_states: 2-D (num_tokens, hidden_dim) activations.
        residual: optional residual passed through to forward_experts.
        Returns a tensor of the same (num_tokens, hidden_dim) shape.
        """
        num_tokens, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)
        shared_output = None
        if self.shared_expert is not None:
            shared_output = self.shared_expert(hidden_states)
            if self.shared_expert_gate is not None:
                gate_output = self.shared_expert_gate(hidden_states)
                # gate_output[0]: ReplicatedLinear returns (output, bias).
                shared_output = F.sigmoid(gate_output[0]) * shared_output

        # router_logits: (num_tokens, n_experts)
        router_logits, _ = self.gate(hidden_states)
        # NOTE(review): residual_ is computed but never used -- forward_experts
        # below receives the original `residual`. Either dead code or a bug
        # (rank>0 masking intended?); confirm intent before removing.
        residual_ = None if self.rank > 0 else residual
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: modify bt_ops.fused_moe to forward_experts
        '''
        final_hidden_states = self.forward_experts(hidden_states, router_logits, residual)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        if shared_output is not None:
            final_hidden_states = final_hidden_states + shared_output
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: add comment to explain use_parallel_residual usage
        '''
        # use_parallel_residual = True: x = x + attn(ln1(x)) + mlp(ln2(x))
        # use_parallel_residual = False:
        #     if apply_residual_connection_post_layernorm:
        #         x_attn = ln1(x) + attn(ln1(x))
        #         x_mlp = ln2(x_attn) + mlp(ln2(x_attn))
        #     else:
        #         x_attn = x + attn(ln1(x))
        #         x_mlp = x_attn + mlp(ln2(x_attn))
        # When use_parallel_residual = True, x is shared between attn and mlp, so we only need to
        # reduce after x + attn(ln1(x)) + mlp(ln2(x)) and don't need reduce here
        # But when use_parallel_residual = False, mlp layer uses attn layer's output, so need reduce
        # when mlp is finished.
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        reduce_results = (self.config.use_parallel_residual == False)
        if reduce_results and self.tp_size > 1:
            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)

        return final_hidden_states.view(num_tokens, hidden_dim)
|
||||
|
||||
|
||||
def vllm__mlu_hijack__model_executor__custom_model__custom__CustomDecoderLayer____init__(
        self,
        config: CustomConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
) -> None:
    """Replacement for ``CustomDecoderLayer.__init__`` (installed below).

    Identical wiring to the stock layer except that the MoE branch uses the
    ``CustomMoeBlock`` defined in this file (which supports expert parallel).
    """
    # Bypass CustomDecoderLayer's own __init__ (we replace it) and run the
    # nn.Module base initializer directly.
    super(CustomDecoderLayer, self).__init__()
    self.config = config
    self.self_attn = CustomAttention(
        config=config,
        cache_config=cache_config,
        quant_config=quant_config,
    )

    # Some configs name the MLP bias flag "mlp_bias", others just "bias".
    mlp_bias = getattr(config, "mlp_bias", False) or getattr(config, "bias", False)
    is_gated = getattr(config, "is_gated", False)

    # num_experts set -> MoE layer; otherwise a dense FeedForward MLP.
    if config.num_experts is not None:
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: nothing changed, only use the CustomMoeBlock class in this file
        '''
        self.mlp = CustomMoeBlock(config=config,
                                  quant_config=quant_config)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
    else:
        # In parallel-residual mode the MLP's reduce is deferred (see
        # CustomMoeBlock.forward comment), hence the coupled
        # skip_bias_add / reduce_results settings below.
        self.mlp = FeedForward(hidden_size=config.hidden_size,
                               intermediate_size=config.intermediate_size,
                               hidden_act=self.config.hidden_act,
                               up_proj_name='up_proj',
                               is_gated=is_gated,
                               down_proj_name='down_proj',
                               bias=mlp_bias,
                               quant_config=quant_config,
                               skip_bias_add=(self.config.use_parallel_residual and mlp_bias),
                               reduce_results = (self.config.use_parallel_residual == False))

    self.input_layernorm = _NORM_DICT[self.config.norm_type](config.hidden_size, eps=config.norm_eps)
    self.post_attention_layernorm = _NORM_DICT[self.config.norm_type](config.hidden_size, eps=config.norm_eps)

    # perf per-tensor sq cases by fusing quantization in layernorm
    self.is_per_tesnor_sq_perf_cases = (is_per_tensor_smoothquant(quant_config) and
                                        not self.config.apply_residual_connection_post_layernorm)
    self.is_per_token_sq_perf_cases = (is_per_token_smoothquant(quant_config) and
                                       not self.config.apply_residual_connection_post_layernorm)
    if self.is_per_tesnor_sq_perf_cases or self.is_per_token_sq_perf_cases:
        # Quantization is fused into the preceding layernorm, so the linear
        # layers must not quantize their input a second time.
        self.self_attn.qkv_proj.quant_method.skip_quant_input = True
        self.quant_fusion_attn_layernorm = None
        self.is_moe = config.num_experts is not None
        self.use_rmsnorm = self.config.norm_type == "rmsnorm"
        if not self.is_moe:
            self.mlp.up_proj.quant_method.skip_quant_input = True
            self.quant_fusion_mlp_layernorm = None


MluHijackObject.apply_hijack(CustomDecoderLayer,
                             CustomDecoderLayer.__init__,
                             vllm__mlu_hijack__model_executor__custom_model__custom__CustomDecoderLayer____init__)
|
||||
@@ -0,0 +1,222 @@
|
||||
|
||||
import re
|
||||
import torch
|
||||
from torch import nn
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||||
from transformers import PretrainedConfig
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
ReplicatedLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.quantization.base_config import (
|
||||
QuantizationConfig)
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
|
||||
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
|
||||
from vllm.model_executor.models.utils import is_pp_missing_parameter
|
||||
|
||||
from vllm.model_executor.models.deepseek_v2 import DeepseekV2ForCausalLM
|
||||
from vllm_mlu.model_executor.models.deepseek_v2 import DeepseekV2MoE
|
||||
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
|
||||
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
|
||||
|
||||
|
||||
def vllm_mlu__model_executor__models__deepseek_v2__DeepseekV2MoE____init__(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
):
    """Replacement for ``DeepseekV2MoE.__init__`` (installed below).

    Builds the routed experts through the hijacked SparseMoeMlp base (which
    handles MoE TP/EP distribution) and the shared experts as a single
    ``FeedForward`` whose allreduce is deferred (reduce_results=False).
    """
    # Delegate routed-expert construction to SparseMoeMlp via the MRO above
    # DeepseekV2MoE.
    super(DeepseekV2MoE, self).__init__(num_experts=config.n_routed_experts,
                                        top_k=config.num_experts_per_tok,
                                        hidden_size=config.hidden_size,
                                        intermediate_size=config.moe_intermediate_size,
                                        up_proj_name="gate_up_proj",
                                        is_gated=True,
                                        down_proj_name="down_proj",
                                        has_bias=False,
                                        skip_bias_add=False,
                                        renormalize=config.norm_topk_prob,
                                        hidden_act=config.hidden_act,
                                        params_dtype=None,
                                        quant_config=quant_config,
                                        is_use_fused_moe=True,
                                        expert_group=config.n_group,
                                        topk_group=config.topk_group)
    self.config = config
    # NOTE: the original code assigned routed_scaling_factor twice; the
    # redundant duplicate assignment was removed.
    self.routed_scaling_factor = config.routed_scaling_factor
    self.n_shared_experts = config.n_shared_experts
    if self.moe_tp_size > config.n_routed_experts:
        raise ValueError(
            f"Moe Tensor parallel size {self.moe_tp_size} is greater than "
            f"the number of experts {config.n_routed_experts}.")

    if config.hidden_act != "silu":
        raise ValueError(f"Unsupported activation: {config.hidden_act}. "
                         "Only silu is supported for now.")

    self.gate = ReplicatedLinear(config.hidden_size,
                                 config.n_routed_experts,
                                 bias=False,
                                 quant_config=None,
                                 prefix=f"{prefix}.gate")
    if config.n_shared_experts is not None:
        # All shared experts are fused into one FeedForward with a
        # proportionally larger intermediate size.
        intermediate_size = (config.moe_intermediate_size *
                             config.n_shared_experts)
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace MLP with FeedForward.
        '''
        self.shared_experts = FeedForward(hidden_size=config.hidden_size,
                                          intermediate_size=intermediate_size,
                                          hidden_act=config.hidden_act,
                                          up_proj_name='gate_up_proj',
                                          is_gated=True,
                                          down_proj_name='down_proj',
                                          bias=False,
                                          quant_config=quant_config,
                                          reduce_results=False)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
|
||||
def vllm__module_executor__models__deepseek_v2__DeepseekV2ForCausalLM__load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
    """EP-aware replacement for DeepseekV2ForCausalLM.load_weights.

    Differences from upstream: packs SparseMoeMlp params up front, renames
    checkpoint expert ids to this rank's local expert indices, and silently
    skips expert weights that belong to other EP ranks.
    """
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: pack params and cal start expert id
    '''
    for name, m in self.model.named_modules():
        if isinstance(m, SparseMoeMlp):
            m.pack_params()

    # expert parallel modification start
    moe_ep_rank = get_moe_expert_parallel_rank()
    moe_ep_size = get_moe_expert_parallel_world_size()
    num_total_experts = self.config.n_routed_experts
    # First global expert id owned by this rank (contiguous ceil-sized chunks).
    start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size)
    # expert parallel modification end
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: delete expert_params_mapping for no useless
    '''
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        if "rotary_emb.inv_freq" in name:
            continue
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace expert_id in weight to named_expert_id in params_dict
        '''
        # Translate the checkpoint's global expert id into this rank's local
        # index (local modules are named experts.0 .. experts.n-1).
        if start_expert_id > 0 and "mlp.experts." in name:
            expert_str = re.search(r'experts\.\d+', name).group(0)
            expert_id=int(expert_str.split(".")[1])
            named_expert_id = expert_id - start_expert_id
            old_expert_name = f"experts.{expert_id}"
            new_expert_name = f"experts.{named_expert_id}"
            name = name.replace(old_expert_name, new_expert_name)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            # Skip non-stacked layers and experts (experts handled below).
            if weight_name not in name:
                continue
            # We have mlp.experts[0].gate_proj in the checkpoint.
            # Since we handle the experts below in expert_params_mapping,
            # we need to skip here BEFORE we update the name, otherwise
            # name will be updated to mlp.experts[0].gate_up_proj, which
            # will then be updated below in expert_params_mapping
            # for mlp.experts[0].gate_gate_up_proj, which breaks load.
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition and delete useless if name not in params_dict: continue condition
            '''
            name = name.replace(weight_name, param_name)
            # Expert weights not present in params_dict belong to another EP
            # rank -- skip them instead of raising KeyError below.
            if (("mlp.experts." in name or "mlp.shared_experts." in name or "mlp.shared_expert_gate." in name)
                and name not in params_dict):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue

            if is_pp_missing_parameter(name, self):
                continue

            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue

            # Skip expert weights owned by other EP ranks (see above).
            if (("mlp.experts." in name or "mlp.shared_experts." in name or "mlp.shared_expert_gate." in name)
                and name not in params_dict):
                continue
            if is_pp_missing_parameter(name, self):
                continue

            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
            '''
            ==================
            End of MLU Hijack
            ==================
            '''


MluHijackObject.apply_hijack(DeepseekV2MoE,
                             DeepseekV2MoE.__init__,
                             vllm_mlu__model_executor__models__deepseek_v2__DeepseekV2MoE____init__)
MluHijackObject.apply_hijack(DeepseekV2ForCausalLM,
                             DeepseekV2ForCausalLM.load_weights,
                             vllm__module_executor__models__deepseek_v2__DeepseekV2ForCausalLM__load_weights)
|
||||
@@ -0,0 +1,143 @@
|
||||
import torch
|
||||
import re
|
||||
import vllm
|
||||
from torch import nn
|
||||
from typing import List, Optional, Tuple, Iterable
|
||||
from vllm_mlu._mlu_utils import *
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from vllm.model_executor.models.mixtral import MixtralForCausalLM
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name
|
||||
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
|
||||
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
|
||||
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
|
||||
from vllm.model_executor.models.utils import is_pp_missing_parameter
|
||||
|
||||
|
||||
def vllm__module_executor__models__mixtral__MixtralForCausalLM__load_weights(
        self,
        weights: Iterable[Tuple[str, torch.Tensor]]):
    """EP-aware replacement for MixtralForCausalLM.load_weights.

    Packs SparseMoeMlp params, remaps checkpoint expert ids to this EP rank's
    local indices, and skips expert weights owned by other EP ranks.
    """
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: pack params and cal start expert id
    '''
    for name, m in self.model.named_modules():
        if isinstance(m, SparseMoeMlp):
            m.pack_params()
    # expert parallel modification start
    moe_ep_rank = get_moe_expert_parallel_rank()
    moe_ep_size = get_moe_expert_parallel_world_size()
    num_total_experts = self.config.num_local_experts
    # First global expert id owned by this rank (contiguous ceil-sized chunks).
    start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size)
    # expert parallel modification end
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("w13", "w1", 0),
        ("w13", "w3", 1),
    ]

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: delete expert_params_mapping for no useless
    '''
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        if "rotary_emb.inv_freq" in name:
            continue
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace expert_id in weight to named_expert_id in params_dict
        '''
        # Translate the checkpoint's global expert id into this rank's local
        # index (local modules are named experts.0 .. experts.n-1).
        if start_expert_id > 0 and "block_sparse_moe.experts." in name:
            expert_str = re.search(r'experts\.\d+', name).group(0)
            expert_id=int(expert_str.split(".")[1])
            named_expert_id = expert_id - start_expert_id
            old_expert_name = f"experts.{expert_id}"
            new_expert_name = f"experts.{named_expert_id}"
            name = name.replace(old_expert_name, new_expert_name)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                and name not in params_dict):
                continue
            if is_pp_missing_parameter(name, self):
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("block_sparse_moe.experts." in name) and (name not in params_dict)):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                and name not in params_dict):
                continue
            if is_pp_missing_parameter(name, self):
                continue
            # Remapping the name of FP8 kv-scale.
            name = maybe_remap_kv_scale_name(name, params_dict)
            if name is None:
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("block_sparse_moe.experts." in name) and (name not in params_dict)):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)


MluHijackObject.apply_hijack(MixtralForCausalLM,
                             MixtralForCausalLM.load_weights,
                             vllm__module_executor__models__mixtral__MixtralForCausalLM__load_weights)
|
||||
@@ -0,0 +1,179 @@
|
||||
import torch
|
||||
import re
|
||||
from typing import Optional, Iterable, Tuple
|
||||
from vllm_mlu._mlu_utils import *
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from vllm.model_executor.models.qwen2_moe import Qwen2MoeForCausalLM
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
|
||||
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
|
||||
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
|
||||
from vllm.utils import print_warning_once
|
||||
from vllm.model_executor.models.utils import is_pp_missing_parameter
|
||||
|
||||
|
||||
def vllm__module_executor__models__qwen2moe__Qwen2MoeForCausalLM__load_weights(
        self,
        weights: Iterable[Tuple[str, torch.Tensor]]):
    """EP-aware replacement for Qwen2MoeForCausalLM.load_weights.

    Packs SparseMoeMlp params, remaps checkpoint expert ids to this EP rank's
    local indices, and skips expert / shared-expert weights owned by other
    EP ranks.
    """
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: pack params and cal start expert id
    '''
    for name, m in self.model.named_modules():
        if isinstance(m, SparseMoeMlp):
            m.pack_params()

    # expert parallel modification start
    moe_ep_rank = get_moe_expert_parallel_rank()
    moe_ep_size = get_moe_expert_parallel_world_size()
    num_total_experts = self.config.num_experts
    # First global expert id owned by this rank (contiguous ceil-sized chunks).
    start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size)
    # expert parallel modification end
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: delete expert_params_mapping for no useless
    '''
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        if "rotary_emb.inv_freq" in name:
            continue
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace expert_id in weight to named_expert_id in params_dict
        '''
        # Translate the checkpoint's global expert id into this rank's local
        # index (local modules are named experts.0 .. experts.n-1).
        if start_expert_id > 0 and "mlp.experts." in name:
            expert_str = re.search(r'experts\.\d+', name).group(0)
            expert_id=int(expert_str.split(".")[1])
            named_expert_id = expert_id - start_expert_id
            old_expert_name = f"experts.{expert_id}"
            new_expert_name = f"experts.{named_expert_id}"
            name = name.replace(old_expert_name, new_expert_name)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: delete if "mlp.experts" in name: continue condition
            '''
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                and name not in params_dict):
                continue
            # Skip layers on other devices.
            if is_pp_missing_parameter(name, self):
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition and delete useless if name not in params_dict: continue condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("mlp.experts." in name or "mlp.shared_expert." in name or "mlp.shared_expert_gate." in name)
                and name not in params_dict):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: delete for mapping in expert_params_mapping condition
            '''
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                and name not in params_dict):
                continue
            # Skip layers on other devices.
            if is_pp_missing_parameter(name, self):
                continue
            # Remapping the name of FP8 kv-scale.
            if name.endswith("kv_scale"):
                remapped_kv_scale_name = name.replace(
                    ".kv_scale", ".attn.kv_scale")
                if remapped_kv_scale_name not in params_dict:
                    print_warning_once(
                        "Found kv scale in the checkpoint "
                        f"(e.g. {name}), but not found the expected "
                        f"name in the model "
                        f"(e.g. {remapped_kv_scale_name}). "
                        "kv-scale is not loaded.")
                    continue
                else:
                    name = remapped_kv_scale_name
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("mlp.experts." in name or "mlp.shared_expert." in name or "mlp.shared_expert_gate." in name)
                and name not in params_dict):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)


MluHijackObject.apply_hijack(Qwen2MoeForCausalLM,
                             Qwen2MoeForCausalLM.load_weights,
                             vllm__module_executor__models__qwen2moe__Qwen2MoeForCausalLM__load_weights)
|
||||
Reference in New Issue
Block a user