add qwen3

This commit is contained in:
Chranos
2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions

View File

@@ -0,0 +1,26 @@
### 简介
该 example 是在 vLLM 中进行 Expert Parallel 的实验。mlu_hijack 是对仓库代码的劫持,用于避免直接修改主仓库代码。
### 支持模型
- qwen2_moe
- mixtral
- custom model
- deepseek_v2
### 支持板卡
300系列设备只能用于功能测试;性能测试需要其他系列设备。
### 运行demo
```bash
python examples/cambricon_custom_func/expert_parallel/offline_inference.py
```
### 使用Expert Parallel特性
- 设置环境变量export EXPERT_PARALLEL_EN=1|True|true|TRUE LLM主接口传入tensor_parallel_size的同时传入moe_tp_size或moe_ep_size或两者都传
- 若只传入 moe_tp_size 和 moe_ep_size 中的一个,另一个取值为 tensor_parallel_size 除以已传入的那个值;因此必须保证传入值能被 tensor_parallel_size 整除
- 若moe_tp_size和moe_ep_size都传入则必须保证moe_tp_size * moe_ep_size == tensor_parallel_size
- 若moe_tp_size和moe_ep_size都不传则它们默认值等于-1即不开启专家并行

View File

@@ -0,0 +1,133 @@
#!/bin/bash
# Benchmark sweep for DeepSeek-V2 with Expert Parallel on Cambricon MLU.
# Runs benchmark_latency.py (or benchmark_throughput.py when use_pp=1) over
# every combination of tp/moe_ep/pp size and input/output/batch size, logging
# each run under ./output. A batch-size sweep stops early once an OOM-style
# error appears in the log, since larger batches would only fail again.
rm -rf output
mkdir -p output

DATA_DIR=/data
MODELS_DEEPSEEK_V2=(
  "${DATA_DIR}/vllm/models/LLM-Research/deepseek-v2"
)
# Quoted expansion keeps model paths with spaces intact.
MODELS=("${MODELS_DEEPSEEK_V2[@]}")

# Feature toggles.
use_ray=0
use_eager=0
use_pp=0

# context parameter
input_sizes=(1024)
output_sizes=(1)
# batch_sizes=(1 2 4 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40)
batch_sizes=(1 4 8 16 32)
# decoder parameter
# input_sizes=(1)
# output_sizes=(128)
# batch_sizes=(1 2 4 8 16 32 64 128 256 512 1024 1280 1536 1600 1616 1632 1648 1652 1656 1660 1661 1662 1663 1664 1728 1792 2048)
# batch_sizes=(1 4 8 16 32 64 128 256 512 1024 2048)
tp_sizes=(8)
moe_ep_sizes=(8 -1)
pp_sizes=(1)

if [ "${use_pp}" -gt 0 ]; then
  # Pipeline-parallel mode: throughput benchmark, no TP/EP.
  tp_sizes=(1)
  moe_ep_sizes=(-1)
  pp_sizes=(8)
  BENCHMARK_CMD=benchmarks/benchmark_throughput.py
  benchmark_option="--backend vllm --num-prompts 1000 --output-json output_throughput.csv --async-engine"
else
  BENCHMARK_CMD=benchmarks/benchmark_latency.py
  benchmark_option="--num-iters-warmup 1 --num-iters 3 --only_average"
fi

# Combinations longer than the model's positional-embedding limit are skipped.
max_position_embeddings=163840

#export MLU_VISIBLE_DEVICES=4,5,6,7
export EXPERT_PARALLEL_EN=true
export VLLM_LATENCY_DEBUG=true
export VLLM_GRAPH_DEBUG=false
# export VLLM_DUMP_MLU_INFO=true
export OUTPUT_CSV_PATH=/data/solution-sdk/kangpengtao/tmp/deepseek/output.csv

ray_option=""
if [ "${use_ray}" -gt 0 ]; then
  ray_option="--distributed-executor-backend ray --ray-workers-use-nsight"
fi
eager_option=""
if [ "${use_eager}" -gt 0 ]; then
  eager_option="--enforce-eager"
fi

# Sweep every combination.
for HF_MODEL in "${MODELS[@]}"; do
  quantization_option=""
  if [[ "${HF_MODEL}" == *"sq_per_token_per_channel"* ]]; then
    quantization_option="--quantization=smoothquant"
  fi
  for tp_size in "${tp_sizes[@]}"; do
    for moe_ep_size in "${moe_ep_sizes[@]}"; do
      for pp_size in "${pp_sizes[@]}"; do
        for input_size in "${input_sizes[@]}"; do
          for output_size in "${output_sizes[@]}"; do
            for batch_size in "${batch_sizes[@]}"; do
              # Shell arithmetic instead of the external `expr`.
              max_seq_len_to_capture=$((input_size + output_size))
              max_num_batched_tokens=$((batch_size * input_size))
              max_model_len=${max_seq_len_to_capture}
              if [ "${max_model_len}" -gt "${max_position_embeddings}" ]; then
                continue
              fi
              max_num_seqs=${batch_size}
              # max-num-batched-tokens must cover both one full sequence and
              # one token per concurrent sequence.
              if [ "${max_model_len}" -gt "${max_num_batched_tokens}" ]; then
                max_num_batched_tokens=${max_model_len}
              fi
              if [ "${max_num_seqs}" -gt "${max_num_batched_tokens}" ]; then
                max_num_batched_tokens=${max_num_seqs}
              fi
              pp_option="--pipeline-parallel-size ${pp_size}"
              tp_option="-tp ${tp_size}"
              ep_option="--moe-ep-size ${moe_ep_size}"
              batch_size_option=""
              if [ "${use_pp}" -le 0 ]; then
                # benchmark_latency.py takes an explicit batch size.
                batch_size_option="--batch-size ${batch_size}"
              fi
              hf_model_name=$(basename "${HF_MODEL}")
              LOG_FILE=output/${hf_model_name}_${input_size}_${output_size}_tp_${tp_size}_moe_ep_${moe_ep_size}_pp_${pp_size}_bs_${batch_size}.log
              echo "Executing ${hf_model_name} with tp_size=${tp_size}, moe_ep_size=${moe_ep_size}, pp_size=${pp_size}, input_size=${input_size}, output_size=${output_size}, batch_size=${batch_size}, max_model_len=${max_model_len}, max_num_batched_tokens=${max_num_batched_tokens}"
              # benchmark_option / *_option variables are intentionally
              # unquoted: each holds multiple space-separated CLI flags.
              python3 ${BENCHMARK_CMD} \
                ${benchmark_option} \
                --trust-remote-code \
                --max-num-batched-tokens "${max_num_batched_tokens}" \
                --max-model-len "${max_model_len}" \
                --block-size 16 \
                --model "${HF_MODEL}" \
                --tokenizer "${HF_MODEL}" \
                --dtype bfloat16 \
                --input-len "${input_size}" \
                --output-len "${output_size}" \
                ${pp_option} ${tp_option} ${ep_option} \
                --max-seq-len-to-capture "${max_seq_len_to_capture}" \
                --max-num-seqs "${max_num_seqs}" \
                ${batch_size_option} \
                ${eager_option} ${ray_option} ${quantization_option} \
                2>&1 | tee "${LOG_FILE}"
              # Abort this batch-size sweep on OOM-style failures — larger
              # batches would fail as well.
              if grep -E -q "torch\.OutOfMemoryError|Ceil of batch|is larger than mlu blocks" "${LOG_FILE}"; then
                echo "Found one or more specified errors in the log file."
                break
              else
                echo "No specified errors found."
              fi
            done
          done
        done
      done
    done
  done
done

View File

@@ -0,0 +1,147 @@
#!/bin/bash
# cnperf-instrumented benchmark sweep for DeepSeek-V2 with Expert Parallel on
# Cambricon MLU. Same sweep as the plain benchmark script, but each run is
# wrapped in `cnperf-cli record`; the resulting dltrace/cnperf data is moved
# into a per-configuration directory after each successful run.
rm -rf output
mkdir -p output

DATA_DIR=/data
MODELS_DEEPSEEK_V2=(
  "${DATA_DIR}/vllm/models/LLM-Research/deepseek-v2"
)
# Quoted expansion keeps model paths with spaces intact.
MODELS=("${MODELS_DEEPSEEK_V2[@]}")

# Feature toggles.
use_ray=0
use_eager=0
use_pp=0
use_kernel_analysis=0

# context parameter
input_sizes=(1024)
output_sizes=(1)
# batch_sizes=(1 2 4 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40)
batch_sizes=(1 4 8 16 32)
# decoder parameter
# input_sizes=(1)
# output_sizes=(128)
# batch_sizes=(1 2 4 8 16 32 64 128 256 512 1024 1280 1536 1600 1616 1632 1648 1652 1656 1660 1661 1662 1663 1664 1728 1792 2048)
# batch_sizes=(1 4 8 16 32 64 128 256 512 1024 2048)
tp_sizes=(8)
moe_ep_sizes=(8 -1)
pp_sizes=(1)

if [ "${use_pp}" -gt 0 ]; then
  # Pipeline-parallel mode: throughput benchmark, no TP/EP.
  tp_sizes=(1)
  moe_ep_sizes=(-1)
  pp_sizes=(8)
  BENCHMARK_CMD=benchmarks/benchmark_throughput.py
  benchmark_option="--backend vllm --num-prompts 1000 --output-json output_throughput.csv --async-engine"
else
  BENCHMARK_CMD=benchmarks/benchmark_latency.py
  benchmark_option="--num-iters-warmup 1 --num-iters 3 --only_average"
fi
# NOTE(review): benchmark_option is computed but never passed to the cnperf
# command below — confirm whether dropping it during recording is intentional.

max_position_embeddings=163840
#export MLU_VISIBLE_DEVICES=4,5,6,7
export EXPERT_PARALLEL_EN=true
export VLLM_LATENCY_DEBUG=true
export VLLM_GRAPH_DEBUG=false
# export VLLM_DUMP_MLU_INFO=true
export OUTPUT_CSV_PATH=/data/solution-sdk/kangpengtao/tmp/deepseek/output.csv

ray_option=""
if [ "${use_ray}" -gt 0 ]; then
  ray_option="--distributed-executor-backend ray --ray-workers-use-nsight"
fi
record_option=""
if [ "${use_kernel_analysis}" -gt 0 ]; then
  # ref: https://wiki.cambricon.com/pages/viewpage.action?pageId=434445235
  export CNPERF_KERNEL_ANALYSIS=1
  record_option="--pmu --capture_range=cnpx --cnpx_include kangpengtao --cnpx_exclude kangpengtao_exec --events tp_core__write_bytes,tp_core__read_bytes,tp_memcore__write_bytes,tp_memcore__read_bytes,tp_core__lt_cycles,tp_core__csimd_pre_cycles,tp_core__csimd_post_cycles"
  # Kernel analysis implies eager mode.
  use_eager=1
fi
eager_option=""
if [ "${use_eager}" -gt 0 ]; then
  eager_option="--enforce-eager"
fi

# Sweep every combination.
for HF_MODEL in "${MODELS[@]}"; do
  quantization_option=""
  if [[ "${HF_MODEL}" == *"sq_per_token_per_channel"* ]]; then
    quantization_option="--quantization=smoothquant"
  fi
  for tp_size in "${tp_sizes[@]}"; do
    for moe_ep_size in "${moe_ep_sizes[@]}"; do
      for pp_size in "${pp_sizes[@]}"; do
        for input_size in "${input_sizes[@]}"; do
          for output_size in "${output_sizes[@]}"; do
            for batch_size in "${batch_sizes[@]}"; do
              # Shell arithmetic instead of the external `expr`.
              max_seq_len_to_capture=$((input_size + output_size))
              max_num_batched_tokens=$((batch_size * input_size))
              max_model_len=${max_seq_len_to_capture}
              if [ "${max_model_len}" -gt "${max_position_embeddings}" ]; then
                continue
              fi
              max_num_seqs=${batch_size}
              # max-num-batched-tokens must cover both one full sequence and
              # one token per concurrent sequence.
              if [ "${max_model_len}" -gt "${max_num_batched_tokens}" ]; then
                max_num_batched_tokens=${max_model_len}
              fi
              if [ "${max_num_seqs}" -gt "${max_num_batched_tokens}" ]; then
                max_num_batched_tokens=${max_num_seqs}
              fi
              pp_option="--pipeline-parallel-size ${pp_size}"
              tp_option="-tp ${tp_size}"
              ep_option="--moe-ep-size ${moe_ep_size}"
              batch_size_option=""
              if [ "${use_pp}" -le 0 ]; then
                # benchmark_latency.py takes an explicit batch size.
                batch_size_option="--batch-size ${batch_size}"
              fi
              hf_model_name=$(basename "${HF_MODEL}")
              LOG_FILE=output/${hf_model_name}_${input_size}_${output_size}_tp_${tp_size}_moe_ep_${moe_ep_size}_pp_${pp_size}_bs_${batch_size}.log
              echo "Executing ${hf_model_name} with tp_size=${tp_size}, moe_ep_size=${moe_ep_size}, pp_size=${pp_size}, input_size=${input_size}, output_size=${output_size}, batch_size=${batch_size}, max_model_len=${max_model_len}, max_num_batched_tokens=${max_num_batched_tokens}"
              dltrace_data_name="dltrace_data_${hf_model_name}_${tp_size}_${moe_ep_size}_${pp_size}_${input_size}_${output_size}_${batch_size}_${max_model_len}_${max_num_batched_tokens}"
              # Clear stale trace data left over from a previous run.
              rm -rf dltrace_data
              rm -rf cnperf_data_*
              # record_option / *_option variables are intentionally unquoted:
              # each holds multiple space-separated CLI flags.
              CNPERF_VLOG_LEVEL=0-40 cnperf-cli record ${record_option} python3 ${BENCHMARK_CMD} \
                --trust-remote-code \
                --max-num-batched-tokens "${max_num_batched_tokens}" \
                --max-model-len "${max_model_len}" \
                --block-size 16 \
                --model "${HF_MODEL}" \
                --tokenizer "${HF_MODEL}" \
                --dtype bfloat16 \
                --input-len "${input_size}" \
                --output-len "${output_size}" \
                ${pp_option} ${tp_option} ${ep_option} \
                --max-seq-len-to-capture "${max_seq_len_to_capture}" \
                --max-num-seqs "${max_num_seqs}" \
                ${batch_size_option} \
                ${eager_option} ${ray_option} ${quantization_option} \
                2>&1 | tee "${LOG_FILE}"
              # Abort this batch-size sweep on OOM-style failures — larger
              # batches would fail as well. Trace data from a failed run is
              # discarded by the rm at the top of the next iteration.
              if grep -E -q "torch\.OutOfMemoryError|Ceil of batch|is larger than mlu blocks" "${LOG_FILE}"; then
                echo "Found one or more specified errors in the log file."
                break
              else
                echo "No specified errors found."
              fi
              # Archive trace data under a per-configuration directory.
              mv dltrace_data "${dltrace_data_name}"
              mv cnperf_data_* "${dltrace_data_name}"/
            done
          done
        done
      done
    done
  done
done

View File

@@ -0,0 +1,34 @@
#!/bin/bash
# Fix: original shebang was "#/bin/bash" (missing '!'), so the file was not
# guaranteed to run under bash.
# Client-side serving benchmark: sweeps input/output lengths and concurrency
# levels against an already-running vLLM server listening on ${PORT}.
# export EXPERT_PARALLEL_EN=True
# export VLLM_LATENCY_DEBUG=True
rm -rf output/client
mkdir -p output/client

PORT=32345
MODEL_PATH="/data/vllm/sq_per_token_per_channel/deepseek_v2_temp"
input_sizes=(1024)
output_sizes=(1)
# batch_sizes=(1 2 4 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40)
batch_sizes=(32)

for input_size in "${input_sizes[@]}"; do
  for output_size in "${output_sizes[@]}"; do
    for batch_size in "${batch_sizes[@]}"; do
      # Fix: original derived this from the undefined HF_MODEL, which made
      # every log file name start with "_". Use MODEL_PATH instead.
      hf_model_name=$(basename "${MODEL_PATH}")
      LOG_FILE=output/client/${hf_model_name}_${input_size}_${output_size}_bs_${batch_size}.log
      # Fix: --random_input_len (underscores) normalized to the dashed flag
      # spelling used by benchmark_serving.py's argparse definitions.
      python benchmarks/benchmark_serving.py \
        --backend vllm \
        --model "${MODEL_PATH}" \
        --trust-remote-code \
        --dataset-name random \
        --num-prompts 1000 \
        --port "${PORT}" \
        --request-rate inf \
        --random-input-len "${input_size}" \
        --random-output-len "${output_size}" \
        --max-concurrency "${batch_size}" \
        2>&1 | tee "${LOG_FILE}"
    done
  done
done

View File

@@ -0,0 +1,2 @@
# Package init for the Expert Parallel hijack demo.
# Importing the model_executor subpackage applies the MLU hijacks as an
# import-time side effect; the print confirms the demo package was loaded.
print("Apply Expert Parallel Demo!")
from . import model_executor

View File

@@ -0,0 +1,5 @@
# model_executor subpackage init.
# Importing each module below applies the corresponding MLU hijack as an
# import-time side effect (each module calls MluHijackObject.apply_hijack
# at module level — confirmed for sparse_moe_mlp, custom and deepseek_v2;
# presumably the same holds for mixtral and qwen2_moe).
from .layers import sparse_moe_mlp
from .models import custom
from .models import mixtral
from .models import qwen2_moe
from .models import deepseek_v2

View File

@@ -0,0 +1,142 @@
"""
Inference-only MOE model.
Tensor Parallel evenly splits each expert's weight and distributes them to different ranks,
which means each rank holds partial weight of all experts.
While Expert Parallel evenly distributes some of the experts' full weight to different ranks,
which means each rank holds part of the experts' full weight.
As a result, each rank in the Tensor Parallel group receives all tokens' hidden states for all experts,
then computes using the partial weights, while for Expert Parallel, each rank only receives
part of tokens' hidden states for experts on this rank, then computes using the full weights.
When both Tensor Parallel and Expert Parallel are enabled, each rank handles
a portion of the expert weights matrices (as in EP mode) and these weights are further sliced
across ranks (as in TP mode). This hybrid approach aims to balance the workload more evenly across ranks,
enhancing efficiency and reducing the likelihood of bottlenecks associated with EP mode alone.
"""
from typing import Optional
import torch
from torch import nn
from vllm.distributed import (get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
get_tensor_model_parallel_group)
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
get_moe_tensor_parallel_rank, get_moe_tensor_parallel_world_size, get_moe_tensor_parallel_group,
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size, get_moe_expert_parallel_group)
from vllm.model_executor.layers.linear import ReplicatedLinear
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm_mlu._mlu_utils import get_device_major_capability
def vllm__mlu_hijack__model_executor__layers__feed_forward__SparseMoeMlp____init__(
    self,
    num_experts: int,
    top_k: int,
    hidden_size: int,
    intermediate_size: int,
    up_proj_name: str,
    is_gated: bool,
    down_proj_name: str,
    has_bias: bool,
    skip_bias_add: bool = False,
    renormalize: bool = False,
    hidden_act: str = "silu",
    params_dtype: Optional[torch.dtype] = None,
    quant_config: Optional[QuantizationConfig] = None,
    is_use_fused_moe: bool = False,
    expert_group: int = 1,
    topk_group: int = 1,
):
    """Replacement for SparseMoeMlp.__init__ that adds MoE TP/EP distribution.

    In addition to the stock initialization, this version reads the MoE
    tensor-parallel and expert-parallel process groups and partitions the
    experts across EP ranks: each rank instantiates only
    ``num_experts_per_rank`` FeedForward experts covering the id range
    [start_expert_id, end_expert_id), each sharded over the MoE TP group.
    """
    super(SparseMoeMlp, self).__init__()
    # Global tensor-parallel coordinates (used by callers such as forward()).
    self.tp_rank = get_tensor_model_parallel_rank()
    self.tp_size = get_tensor_model_parallel_world_size()
    self.tp_group = get_tensor_model_parallel_group()
    self.num_total_experts = num_experts
    self.top_k = top_k
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.up_proj_name = up_proj_name
    self.is_gated = is_gated
    self.down_proj_name = down_proj_name
    self.has_bias = has_bias
    self.renormalize = renormalize
    self.hidden_act = hidden_act
    self.quant_config = quant_config
    self.is_use_fused_moe = is_use_fused_moe
    self.expert_group = expert_group
    self.topk_group = topk_group
    # Fused MoE is force-disabled on devices with major capability 3
    # (per the README these are functional-test-only boards).
    if get_device_major_capability() == 3:
        self.is_use_fused_moe = False
    if params_dtype is None:
        params_dtype = torch.get_default_dtype()
    self.params_dtype = params_dtype
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add moe relative distribution
    '''
    # MoE-specific TP/EP coordinates, distinct from the global TP group.
    self.moe_tp_size = get_moe_tensor_parallel_world_size()
    self.moe_tp_rank = get_moe_tensor_parallel_rank()
    self.moe_tp_group = get_moe_tensor_parallel_group()
    self.moe_ep_size = get_moe_expert_parallel_world_size()
    self.moe_ep_rank = get_moe_expert_parallel_rank()
    self.moe_ep_group = get_moe_expert_parallel_group()
    # NOTE: The bias for fc2 is only applied on tp_rank 0. If we added it on
    # all nodes the allreduce() would contain multiple copies of the bias.
    # The bias on other nodes will be ignored, and may be set to nullptr.
    # NOTE(review): the `skip_bias_add` parameter is effectively ignored —
    # it is unconditionally overwritten here. Confirm this is intended.
    self.skip_bias_add = True if self.moe_tp_rank > 0 else False
    assert self.num_total_experts >= self.moe_ep_size, (
        f"need num_total_experts:{self.num_total_experts} >= moe_ep_size:{self.moe_ep_size}")
    assert self.intermediate_size % self.moe_tp_size == 0, (
        f"need intermediate_size:{self.intermediate_size} % moe_tp_size:{self.moe_tp_size} == 0")
    # Ceil-divide experts over EP ranks; the last rank takes the (smaller)
    # remainder when the expert count does not divide evenly.
    self.num_experts_per_rank = (self.num_total_experts + self.moe_ep_size - 1) // self.moe_ep_size
    if self.moe_ep_rank + 1 == self.moe_ep_size and self.num_total_experts % self.moe_ep_size:
        self.num_experts_per_rank = self.num_total_experts % self.moe_ep_size
    self.start_expert_id = self.moe_ep_rank * ((self.num_total_experts + self.moe_ep_size - 1) // self.moe_ep_size)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    self.end_expert_id = self.start_expert_id + self.num_experts_per_rank
    # Gate always runs at half / full precision for now.
    self.gate = ReplicatedLinear(self.hidden_size,
                                 self.num_total_experts,
                                 bias=False,
                                 params_dtype=self.params_dtype,
                                 quant_config=None)
    # Only the experts owned by this EP rank are instantiated; each expert's
    # weights are sharded over the MoE TP group (reduce deferred to caller).
    self.experts = nn.ModuleList([
        FeedForward(hidden_size=self.hidden_size,
                    intermediate_size=self.intermediate_size,
                    hidden_act=self.hidden_act,
                    up_proj_name=self.up_proj_name,
                    is_gated=self.is_gated,
                    down_proj_name=self.down_proj_name,
                    bias=self.has_bias,
                    quant_config=self.quant_config,
                    skip_bias_add=self.skip_bias_add,
                    reduce_results=False,
                    tp_group=self.moe_tp_group) for idx in range(self.num_experts_per_rank)
    ])
    self.init_pack_param()


# Register the replacement __init__ on SparseMoeMlp at import time.
MluHijackObject.apply_hijack(SparseMoeMlp,
                             SparseMoeMlp.__init__,
                             vllm__mlu_hijack__model_executor__layers__feed_forward__SparseMoeMlp____init__)

View File

@@ -0,0 +1,183 @@
import torch
import torch.nn.functional as F
from typing import Optional
from vllm.config import CacheConfig
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
from vllm_mlu._mlu_utils import *
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm.distributed import tensor_model_parallel_all_reduce
from vllm_mlu.transformers_utils.configs import CustomConfig
from vllm_mlu.model_executor.custom_model.custom import CustomDecoderLayer, CustomAttention, _NORM_DICT
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
from vllm.model_executor.layers.linear import ReplicatedLinear
from vllm_mlu.model_executor.models.layer_utils import (
decoder_layer_forward_base, is_per_tensor_smoothquant,
is_per_token_smoothquant, quant_fusion_with_rmsnorm,
quant_fusion_with_layernorm)
class CustomMoeBlock(SparseMoeMlp):
    """MoE block for the custom model, built on the hijacked SparseMoeMlp.

    Adds optional shared experts (scaled by a sigmoid gate) on top of the
    routed experts, and performs the cross-rank all-reduce itself when the
    model does not use a parallel residual.
    """

    def __init__(
        self,
        config: CustomConfig,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        super().__init__(num_experts=config.num_experts,
                         top_k=config.num_experts_per_tok,
                         hidden_size=config.hidden_size,
                         intermediate_size=config.moe_intermediate_size,
                         up_proj_name="gate_up_proj",
                         is_gated=config.is_gated,
                         down_proj_name="down_proj",
                         has_bias=config.mlp_bias,
                         skip_bias_add=False,
                         renormalize=config.norm_topk_prob,
                         hidden_act=config.hidden_act,
                         params_dtype=None,
                         quant_config=quant_config,
                         is_use_fused_moe=True)
        self.config = config
        self.rank = self.tp_rank
        self.shared_expert = None
        self.shared_expert_gate = None
        if config.shared_expert_intermediate_size > 0:
            # Shared expert runs on every token; the all-reduce is deferred
            # to forward() (reduce_results=False).
            self.shared_expert = FeedForward(hidden_size=config.hidden_size,
                                             intermediate_size=config.shared_expert_intermediate_size,
                                             hidden_act=config.hidden_act,
                                             up_proj_name='gate_up_proj',
                                             is_gated=config.is_gated,
                                             down_proj_name='down_proj',
                                             bias=config.mlp_bias,
                                             quant_config=quant_config,
                                             reduce_results=False)
            # Scalar gate that scales the shared-expert output per token.
            self.shared_expert_gate = ReplicatedLinear(config.hidden_size,
                                                       1,
                                                       bias=False,
                                                       params_dtype=self.params_dtype,
                                                       quant_config=None)

    def forward(self, hidden_states: torch.Tensor, residual: Optional[torch.Tensor] = None) -> torch.Tensor:
        num_tokens, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)
        shared_output = None
        if self.shared_expert is not None:
            shared_output = self.shared_expert(hidden_states)
            if self.shared_expert_gate is not None:
                gate_output = self.shared_expert_gate(hidden_states)
                shared_output = F.sigmoid(gate_output[0]) * shared_output
        # router_logits: (num_tokens, n_experts)
        router_logits, _ = self.gate(hidden_states)
        residual_ = None if self.rank > 0 else residual
        # NOTE(review): residual_ is computed but never used — forward_experts
        # below receives the original `residual`. Confirm which was intended.
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: modify bt_ops.fused_moe to forward_experts
        '''
        final_hidden_states = self.forward_experts(hidden_states, router_logits, residual)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        if shared_output is not None:
            final_hidden_states = final_hidden_states + shared_output
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: add comment to explain use_parallel_residual usage
        '''
        # use_parallel_residual = True: x = x + attn(ln1(x)) + mlp(ln2(x))
        # use_parallel_residual = False:
        #   if apply_residual_connection_post_layernorm:
        #     x_attn = ln1(x) + attn(ln1(x))
        #     x_mlp = ln2(x_attn) + mlp(ln2(x_attn))
        #   else:
        #     x_attn = x + attn(ln1(x))
        #     x_mlp = x_attn + mlp(ln2(x_attn))
        # When use_parallel_residual = True, x is shared between attn and mlp,
        # so we only need to reduce after x + attn(ln1(x)) + mlp(ln2(x)) and
        # don't need to reduce here. But when use_parallel_residual = False,
        # the mlp layer consumes the attn layer's output, so we must reduce
        # once the mlp is finished.
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        reduce_results = (self.config.use_parallel_residual == False)
        if reduce_results and self.tp_size > 1:
            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
        return final_hidden_states.view(num_tokens, hidden_dim)
def vllm__mlu_hijack__model_executor__custom_model__custom__CustomDecoderLayer____init__(
    self,
    config: CustomConfig,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
) -> None:
    """Replacement for CustomDecoderLayer.__init__.

    Identical to the stock initializer except that the MoE branch builds the
    CustomMoeBlock defined in this file (which carries the Expert Parallel
    support) instead of the original MoE block.
    """
    super(CustomDecoderLayer, self).__init__()
    self.config = config
    self.self_attn = CustomAttention(
        config=config,
        cache_config=cache_config,
        quant_config=quant_config,
    )
    mlp_bias = getattr(config, "mlp_bias", False) or getattr(config, "bias", False)
    is_gated = getattr(config, "is_gated", False)
    if config.num_experts is not None:
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: nothing changed, only use the CustomMoeBlock class in this file
        '''
        self.mlp = CustomMoeBlock(config=config,
                                  quant_config=quant_config)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
    else:
        # Dense (non-MoE) layer: a single FeedForward MLP.
        self.mlp = FeedForward(hidden_size=config.hidden_size,
                               intermediate_size=config.intermediate_size,
                               hidden_act=self.config.hidden_act,
                               up_proj_name='up_proj',
                               is_gated=is_gated,
                               down_proj_name='down_proj',
                               bias=mlp_bias,
                               quant_config=quant_config,
                               skip_bias_add=(self.config.use_parallel_residual and mlp_bias),
                               reduce_results=(self.config.use_parallel_residual == False))
    self.input_layernorm = _NORM_DICT[self.config.norm_type](config.hidden_size, eps=config.norm_eps)
    self.post_attention_layernorm = _NORM_DICT[self.config.norm_type](config.hidden_size, eps=config.norm_eps)
    # perf per-tensor sq cases by fusing quantization in layernorm
    # NOTE(review): "tesnor" typo is kept — other project files may reference
    # this attribute name; renaming it here would break them.
    self.is_per_tesnor_sq_perf_cases = (is_per_tensor_smoothquant(quant_config) and
                                        not self.config.apply_residual_connection_post_layernorm)
    self.is_per_token_sq_perf_cases = (is_per_token_smoothquant(quant_config) and
                                       not self.config.apply_residual_connection_post_layernorm)
    # NOTE(review): indentation below is reconstructed from a whitespace-
    # stripped view — confirm the nesting against the vllm_mlu source.
    if self.is_per_tesnor_sq_perf_cases or self.is_per_token_sq_perf_cases:
        # Quantization of the attention input is fused into the layernorm.
        self.self_attn.qkv_proj.quant_method.skip_quant_input = True
        self.quant_fusion_attn_layernorm = None
        self.is_moe = config.num_experts is not None
        self.use_rmsnorm = self.config.norm_type == "rmsnorm"
        if not self.is_moe:
            self.mlp.up_proj.quant_method.skip_quant_input = True
            self.quant_fusion_mlp_layernorm = None


# Register the replacement __init__ on CustomDecoderLayer at import time.
MluHijackObject.apply_hijack(CustomDecoderLayer,
                             CustomDecoderLayer.__init__,
                             vllm__mlu_hijack__model_executor__custom_model__custom__CustomDecoderLayer____init__)

View File

@@ -0,0 +1,222 @@
import re
import torch
from torch import nn
from typing import Any, Dict, Iterable, List, Optional, Tuple
from transformers import PretrainedConfig
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear,
ReplicatedLinear,
RowParallelLinear)
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
from vllm.model_executor.models.utils import is_pp_missing_parameter
from vllm.model_executor.models.deepseek_v2 import DeepseekV2ForCausalLM
from vllm_mlu.model_executor.models.deepseek_v2 import DeepseekV2MoE
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
def vllm_mlu__model_executor__models__deepseek_v2__DeepseekV2MoE____init__(
    self,
    config: PretrainedConfig,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
):
    """Replacement for DeepseekV2MoE.__init__ built on SparseMoeMlp.

    Initializes the MoE block through SparseMoeMlp (which sets up the MoE
    TP/EP process-group state and the per-rank expert partition), then adds
    the DeepSeek-V2 specific router gate and optional shared experts.

    Fix vs. original: `self.routed_scaling_factor` was assigned twice with
    the identical value; the duplicate assignment is removed.
    """
    super(DeepseekV2MoE, self).__init__(num_experts=config.n_routed_experts,
                                        top_k=config.num_experts_per_tok,
                                        hidden_size=config.hidden_size,
                                        intermediate_size=config.moe_intermediate_size,
                                        up_proj_name="gate_up_proj",
                                        is_gated=True,
                                        down_proj_name="down_proj",
                                        has_bias=False,
                                        skip_bias_add=False,
                                        renormalize=config.norm_topk_prob,
                                        hidden_act=config.hidden_act,
                                        params_dtype=None,
                                        quant_config=quant_config,
                                        is_use_fused_moe=True,
                                        expert_group=config.n_group,
                                        topk_group=config.topk_group)
    self.config = config
    self.routed_scaling_factor = config.routed_scaling_factor
    self.n_shared_experts = config.n_shared_experts
    # Validate the MoE tensor-parallel degree against the expert count.
    if self.moe_tp_size > config.n_routed_experts:
        raise ValueError(
            f"Moe Tensor parallel size {self.moe_tp_size} is greater than "
            f"the number of experts {config.n_routed_experts}.")
    if config.hidden_act != "silu":
        raise ValueError(f"Unsupported activation: {config.hidden_act}. "
                         "Only silu is supported for now.")
    # Router gate is replicated (no TP sharding) and unquantized.
    self.gate = ReplicatedLinear(config.hidden_size,
                                 config.n_routed_experts,
                                 bias=False,
                                 quant_config=None,
                                 prefix=f"{prefix}.gate")
    if config.n_shared_experts is not None:
        intermediate_size = (config.moe_intermediate_size *
                             config.n_shared_experts)
        # Modify by vllm_mlu: replace the stock MLP with FeedForward; the
        # all-reduce is deferred to the caller (reduce_results=False).
        self.shared_experts = FeedForward(hidden_size=config.hidden_size,
                                          intermediate_size=intermediate_size,
                                          hidden_act=config.hidden_act,
                                          up_proj_name='gate_up_proj',
                                          is_gated=True,
                                          down_proj_name='down_proj',
                                          bias=False,
                                          quant_config=quant_config,
                                          reduce_results=False)
def vllm__module_executor__models__deepseek_v2__DeepseekV2ForCausalLM__load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
    """Replacement for DeepseekV2ForCausalLM.load_weights with EP support.

    Differences from stock vLLM: packs every SparseMoeMlp's parameters first,
    remaps global expert ids in checkpoint names to this rank's local ids,
    and silently skips expert / shared-expert weights that are not owned by
    this expert-parallel rank.
    """
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: pack params and cal start expert id
    '''
    for name, m in self.model.named_modules():
        if isinstance(m, SparseMoeMlp):
            m.pack_params()
    # expert parallel modification start
    moe_ep_rank = get_moe_expert_parallel_rank()
    moe_ep_size = get_moe_expert_parallel_world_size()
    num_total_experts = self.config.n_routed_experts
    # Ceil-divide partition — mirrors the expert split in the hijacked
    # SparseMoeMlp.__init__.
    start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size)
    # expert parallel modification end
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]
    # Modify by vllm_mlu: the upstream expert_params_mapping was removed —
    # expert weights are addressed directly via the local-id renaming below.
    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        if "rotary_emb.inv_freq" in name:
            continue
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace expert_id in weight to named_expert_id in params_dict
        '''
        # Rename experts.<global_id> -> experts.<global_id - start_expert_id>.
        # Experts not owned by this rank produce names (possibly with negative
        # local ids) that are absent from params_dict and get skipped below.
        if start_expert_id > 0 and "mlp.experts." in name:
            expert_str = re.search(r'experts\.\d+', name).group(0)
            expert_id = int(expert_str.split(".")[1])
            named_expert_id = expert_id - start_expert_id
            old_expert_name = f"experts.{expert_id}"
            new_expert_name = f"experts.{named_expert_id}"
            name = name.replace(old_expert_name, new_expert_name)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            # Skip non-stacked layers; names not containing the shard name
            # fall through to the else branch below.
            if weight_name not in name:
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skipped condition and delete the useless
            "if name not in params_dict: continue" condition
            '''
            name = name.replace(weight_name, param_name)
            # Skip expert / shared-expert weights not present on this rank.
            if (("mlp.experts." in name or "mlp.shared_experts." in name or "mlp.shared_expert_gate." in name)
                    and name not in params_dict):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            if is_pp_missing_parameter(name, self):
                continue
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skipped condition
            '''
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            # Skip expert / shared-expert weights not present on this rank.
            if (("mlp.experts." in name or "mlp.shared_experts." in name or "mlp.shared_expert_gate." in name)
                    and name not in params_dict):
                continue
            if is_pp_missing_parameter(name, self):
                continue
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
            '''
            ==================
            End of MLU Hijack
            ==================
            '''


# Register both hijacks on the upstream classes at import time.
MluHijackObject.apply_hijack(DeepseekV2MoE,
                             DeepseekV2MoE.__init__,
                             vllm_mlu__model_executor__models__deepseek_v2__DeepseekV2MoE____init__)
MluHijackObject.apply_hijack(DeepseekV2ForCausalLM,
                             DeepseekV2ForCausalLM.load_weights,
                             vllm__module_executor__models__deepseek_v2__DeepseekV2ForCausalLM__load_weights)

View File

@@ -0,0 +1,143 @@
import torch
import re
import vllm
from torch import nn
from typing import List, Optional, Tuple, Iterable
from vllm_mlu._mlu_utils import *
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm.model_executor.models.mixtral import MixtralForCausalLM
from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
from vllm.model_executor.models.utils import is_pp_missing_parameter
def vllm__module_executor__models__mixtral__MixtralForCausalLM__load_weights(
        self,
        weights: Iterable[Tuple[str, torch.Tensor]]):
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: pack params and cal start expert id
    '''
    # Replacement for MixtralForCausalLM.load_weights that is aware of MoE
    # expert parallelism: each EP rank loads only the experts it owns.
    # Pack per-expert MLP weights into the fused layout used by the MLU
    # SparseMoeMlp module before any checkpoint tensors are loaded.
    for name, m in self.model.named_modules():
        if isinstance(m, SparseMoeMlp):
            m.pack_params()
    # expert parallel modification start
    # Experts are partitioned contiguously across EP ranks; start_expert_id
    # is the global id of this rank's first expert (ceil division, so the
    # last rank may own fewer experts).
    moe_ep_rank = get_moe_expert_parallel_rank()
    moe_ep_size = get_moe_expert_parallel_world_size()
    num_total_experts = self.config.num_local_experts
    start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size)
    # expert parallel modification end
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    # Maps checkpoint shard names to the fused parameter that holds them.
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("w13", "w1", 0),
        ("w13", "w3", 1),
    ]
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: delete expert_params_mapping for no useless
    '''
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        # Rotary inv_freq buffers are recomputed at runtime, never loaded.
        if "rotary_emb.inv_freq" in name:
            continue
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace expert_id in weight to named_expert_id in params_dict
        '''
        # Translate the checkpoint's global expert id into this rank's local
        # id space (experts.<global> -> experts.<global - start_expert_id>).
        # Experts owned by earlier ranks become a negative id, which cannot
        # exist in params_dict and is filtered out by the skip checks below.
        if start_expert_id > 0 and "block_sparse_moe.experts." in name:
            expert_str = re.search(r'experts\.\d+', name).group(0)
            expert_id=int(expert_str.split(".")[1])
            named_expert_id = expert_id - start_expert_id
            old_expert_name = f"experts.{expert_id}"
            new_expert_name = f"experts.{named_expert_id}"
            name = name.replace(old_expert_name, new_expert_name)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                    and name not in params_dict):
                continue
            # Skip parameters owned by other pipeline-parallel stages.
            if is_pp_missing_parameter(name, self):
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("block_sparse_moe.experts." in name) and (name not in params_dict)):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            # Non-stacked parameter: load it verbatim.
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                    and name not in params_dict):
                continue
            if is_pp_missing_parameter(name, self):
                continue
            # Remapping the name of FP8 kv-scale.
            name = maybe_remap_kv_scale_name(name, params_dict)
            if name is None:
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("block_sparse_moe.experts." in name) and (name not in params_dict)):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
# Register the hijack at import time: every MixtralForCausalLM instance
# created by vLLM will use the EP-aware loader defined above instead of the
# stock load_weights implementation.
MluHijackObject.apply_hijack(MixtralForCausalLM,
                             MixtralForCausalLM.load_weights,
                             vllm__module_executor__models__mixtral__MixtralForCausalLM__load_weights)

View File

@@ -0,0 +1,179 @@
import torch
import re
from typing import Optional, Iterable, Tuple
from vllm_mlu._mlu_utils import *
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm.model_executor.models.qwen2_moe import Qwen2MoeForCausalLM
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
from vllm.utils import print_warning_once
from vllm.model_executor.models.utils import is_pp_missing_parameter
def vllm__module_executor__models__qwen2moe__Qwen2MoeForCausalLM__load_weights(
        self,
        weights: Iterable[Tuple[str, torch.Tensor]]):
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: pack params and cal start expert id
    '''
    # Replacement for Qwen2MoeForCausalLM.load_weights that is aware of MoE
    # expert parallelism: each EP rank loads only the experts it owns.
    # Pack per-expert MLP weights into the fused SparseMoeMlp layout before
    # any checkpoint tensors are loaded.
    for name, m in self.model.named_modules():
        if isinstance(m, SparseMoeMlp):
            m.pack_params()
    # expert parallel modification start
    # Experts are partitioned contiguously across EP ranks; start_expert_id
    # is the global id of this rank's first expert (ceil division, so the
    # last rank may own fewer experts).
    moe_ep_rank = get_moe_expert_parallel_rank()
    moe_ep_size = get_moe_expert_parallel_world_size()
    num_total_experts = self.config.num_experts
    start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size)
    # expert parallel modification end
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    # Maps checkpoint shard names to the fused parameter that holds them.
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: delete expert_params_mapping for no useless
    '''
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        # Rotary inv_freq buffers are recomputed at runtime, never loaded.
        if "rotary_emb.inv_freq" in name:
            continue
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace expert_id in weight to named_expert_id in params_dict
        '''
        # Translate the checkpoint's global expert id into this rank's local
        # id space (experts.<global> -> experts.<global - start_expert_id>).
        # Experts owned by earlier ranks become a negative id, which cannot
        # exist in params_dict and is filtered out by the skip checks below.
        if start_expert_id > 0 and "mlp.experts." in name:
            expert_str = re.search(r'experts\.\d+', name).group(0)
            expert_id=int(expert_str.split(".")[1])
            named_expert_id = expert_id - start_expert_id
            old_expert_name = f"experts.{expert_id}"
            new_expert_name = f"experts.{named_expert_id}"
            name = name.replace(old_expert_name, new_expert_name)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: delete if "mlp.experts" in name: continue condition
            '''
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                    and name not in params_dict):
                continue
            # Skip layers on other devices.
            if is_pp_missing_parameter(name, self):
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition and delete useless if name not in params_dict: continue condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("mlp.experts." in name or "mlp.shared_expert." in name or "mlp.shared_expert_gate." in name)
                    and name not in params_dict):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: delete for mapping in expert_params_mapping condition
            '''
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            # Non-stacked parameter: load it verbatim.
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                    and name not in params_dict):
                continue
            # Skip layers on other devices.
            if is_pp_missing_parameter(name, self):
                continue
            # Remapping the name of FP8 kv-scale.
            if name.endswith("kv_scale"):
                remapped_kv_scale_name = name.replace(
                    ".kv_scale", ".attn.kv_scale")
                if remapped_kv_scale_name not in params_dict:
                    print_warning_once(
                        "Found kv scale in the checkpoint "
                        f"(e.g. {name}), but not found the expected "
                        f"name in the model "
                        f"(e.g. {remapped_kv_scale_name}). "
                        "kv-scale is not loaded.")
                    continue
                else:
                    name = remapped_kv_scale_name
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("mlp.experts." in name or "mlp.shared_expert." in name or "mlp.shared_expert_gate." in name)
                    and name not in params_dict):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
# Register the hijack at import time: every Qwen2MoeForCausalLM instance
# created by vLLM will use the EP-aware loader defined above instead of the
# stock load_weights implementation.
MluHijackObject.apply_hijack(Qwen2MoeForCausalLM,
                             Qwen2MoeForCausalLM.load_weights,
                             vllm__module_executor__models__qwen2moe__Qwen2MoeForCausalLM__load_weights)

View File

@@ -0,0 +1,61 @@
# Offline Expert-Parallel inference demo (Qwen1.5-MoE on MLU).
# EXPERT_PARALLEL_EN must be set BEFORE vLLM is imported so the MLU hijack
# modules see it enabled at import time.
import os
os.environ['EXPERT_PARALLEL_EN'] = "True"
from vllm import LLM, SamplingParams
# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
model_dir="/data/AE/llm/models/Qwen1.5-MoE-A2.7B"
# Parallelism: moe_ep_size must divide tensor_parallel_size (see README).
tp_size = 2
moe_ep_size=2
# When True, run the smooth-quant activation-range hooks and print how many
# range entries were collected before generation.
is_check_act_range = True
# Sequence/batch geometry for this demo run.
input_seq_len=64
output_seq_len=1
batch=1
# max_position_embedding=1024
max_model_len=input_seq_len + output_seq_len
# if max_model_len < max_position_embedding:
#     max_model_len = max_position_embedding
# The scheduler must be able to batch at least one full-length sequence.
max_num_batched_tokens=input_seq_len * batch
if max_model_len > max_num_batched_tokens:
    max_num_batched_tokens=max_model_len
max_num_seqs = batch
if __name__ == '__main__':
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8)
    # Create an LLM.
    llm = LLM(model=model_dir,
              trust_remote_code=True,
              enforce_eager=True,
              dtype='bfloat16',
              max_model_len=max_model_len,
              max_num_batched_tokens=max_num_batched_tokens,
              max_num_seqs=max_num_seqs,
              tensor_parallel_size=tp_size,
              moe_ep_size=moe_ep_size,
              )
    if is_check_act_range:
        # NOTE(review): hooks are installed and removed before generate() is
        # called; presumably get_act_range reports ranges gathered during
        # engine warm-up -- confirm the intended hook lifetime.
        llm.llm_engine.model_executor._run_workers("setup_smooth_hook", is_save_moe_info=True)
        llm.llm_engine.model_executor._run_workers("remove_hooks")
        act_range = llm.llm_engine.model_executor._run_workers("get_act_range")
        print(f"len(act_range)={len(act_range)}")
    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

View File

@@ -0,0 +1,48 @@
#!/bin/bash
# Launch a vLLM OpenAI-compatible API server for a smooth-quantized
# DeepSeek-V2 checkpoint, with either pipeline parallelism (PP8) or tensor
# parallelism (TP8). Server output is mirrored to output/server/server.log.
#
# Fixes vs. original: the shebang was '#/bin/bash' (missing '!', i.e. just a
# comment), options were placed after the rm operand, and expansions were
# unquoted.
set -euo pipefail

rm -rf -- output/server
mkdir -p output/server

PORT=32345

# Toggles: set to 1 to enable.
use_ray=0   # distribute workers with Ray
use_pp=1    # pipeline parallel (PP8); otherwise tensor parallel (TP8)
use_eager=0 # disable graph capture (--enforce-eager)

# Collect optional flags in an array so empty options never become empty
# positional arguments and values are never word-split.
extra_args=()
if (( use_eager > 0 )); then
  extra_args+=(--enforce-eager)
fi
if (( use_ray > 0 )); then
  extra_args+=(--worker-use-ray)
  # Best effort: a stale Ray cluster would break worker startup.
  ray stop --force || true
fi

export VLLM_ENGINE_ITERATION_TIMEOUT_S=180
MODEL_PATH="/data/vllm/sq_per_token_per_channel/deepseek_v2_temp"

if (( use_pp > 0 )); then
  extra_args+=(--pipeline-parallel-size=8)
else
  extra_args+=(--tensor-parallel-size=8)
fi

# 8-way parallel server (PP8 or TP8 per use_pp above).
python -m vllm.entrypoints.openai.api_server \
  --disable-log-requests \
  --port "${PORT}" \
  --model "${MODEL_PATH}" \
  --trust-remote-code \
  --swap-space 16 \
  --max-num-batched-tokens=40960 \
  --max-model-len=1034 \
  --block-size=16 \
  --dtype=bfloat16 \
  --max-seq-len-to-capture=1034 \
  --max-num-seqs=40 \
  --quantization=smoothquant \
  "${extra_args[@]}" \
  2>&1 | tee output/server/server.log