add qwen3

Chranos
2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions

View File

@@ -0,0 +1,17 @@
import logging
from logging import Logger
def init_logger(name: str) -> Logger:
"""Initialize loggers for benchmarks module,
and keep the configuration consistent with the vllm module"""
logger = logging.getLogger(name)
vllm_logger = logging.Logger.manager.loggerDict.get('vllm', None)
if vllm_logger:
logger.setLevel(vllm_logger.level)
logger.propagate = vllm_logger.propagate
logger.handlers = vllm_logger.handlers
return logger
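A minimal usage sketch, assuming a hypothetical benchmark script and module path (the file name is not shown in this diff); init_logger simply mirrors whatever level, propagation flag and handlers the 'vllm' logger already has:

# Illustration only; the import path `benchmarks.logger` is an assumption.
from benchmarks.logger import init_logger

logger = init_logger(__name__)
logger.info("benchmark started")  # emitted through the handlers inherited from the 'vllm' logger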

View File

@@ -0,0 +1,110 @@
import torch
from vllm.config import ParallelConfig, TokenizerPoolConfig
from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Type, Union
from vllm.logger import init_logger
from vllm.utils import cuda_device_count_stateless
from vllm.platforms import current_platform
from vllm_mlu.mlu_hijack_utils import MluHijackObject
if TYPE_CHECKING:
from ray.util.placement_group import PlacementGroup
from vllm.executor.executor_base import ExecutorBase
logger = init_logger(__name__)
def vllm__config__ParallelConfig___init__(
self,
pipeline_parallel_size: int,
tensor_parallel_size: int,
worker_use_ray: Optional[bool] = None,
max_parallel_loading_workers: Optional[int] = None,
disable_custom_all_reduce: bool = False,
tokenizer_pool_config: Optional[TokenizerPoolConfig] = None,
ray_workers_use_nsight: bool = False,
placement_group: Optional["PlacementGroup"] = None,
distributed_executor_backend: Optional[Union[
str, Type["ExecutorBase"]]] = None,
) -> None:
self.pipeline_parallel_size = pipeline_parallel_size
self.tensor_parallel_size = tensor_parallel_size
self.distributed_executor_backend = distributed_executor_backend
self.max_parallel_loading_workers = max_parallel_loading_workers
self.disable_custom_all_reduce = disable_custom_all_reduce
self.tokenizer_pool_config = tokenizer_pool_config
self.ray_workers_use_nsight = ray_workers_use_nsight
self.placement_group = placement_group
'''
==========================
Modify by vllm_mlu
==========================
@brief: modify world_size
'''
# Promote the class-level attributes set by the arg_utils hijack onto this instance.
self.context_parallel_size = self.context_parallel_size
self.moe_tp_size = self.moe_tp_size
self.moe_ep_size = self.moe_ep_size
self.world_size = pipeline_parallel_size * tensor_parallel_size * self.context_parallel_size
'''
=======================
End of MLU Hijack
=======================
'''
if worker_use_ray:
if self.distributed_executor_backend is None:
self.distributed_executor_backend = "ray"
elif not self.use_ray:
raise ValueError(f"worker-use-ray can't be used with "
f"distributed executor backend "
f"'{self.distributed_executor_backend}'.")
if current_platform.is_tpu() and self.world_size > 1:
if self.distributed_executor_backend is None:
self.distributed_executor_backend = "ray"
if self.distributed_executor_backend != "ray":
raise ValueError(
"TPU backend only supports Ray for distributed inference.")
if current_platform.is_hpu() and self.world_size > 1:
if self.distributed_executor_backend is None:
self.distributed_executor_backend = "ray"
if self.distributed_executor_backend != "ray":
raise ValueError(
"HPU backend only supports Ray for distributed inference.")
if self.distributed_executor_backend is None and self.world_size > 1:
# We use multiprocessing by default if world_size fits on the
# current node and we aren't in a ray placement group.
from vllm.executor import ray_utils
backend = "mp"
ray_found = ray_utils.ray_is_available()
if (current_platform.is_cuda()
and cuda_device_count_stateless() < self.world_size):
if not ray_found:
raise ValueError("Unable to load Ray which is "
"required for multi-node inference, "
"please install Ray with `pip install "
"ray`.") from ray_utils.ray_import_err
backend = "ray"
elif ray_found:
if self.placement_group:
backend = "ray"
else:
from ray import is_initialized as ray_is_initialized
if ray_is_initialized():
from ray.util import get_current_placement_group
if get_current_placement_group():
backend = "ray"
self.distributed_executor_backend = backend
logger.info("Defaulting to use %s for distributed inference",
backend)
self._verify_args()
self.rank: int = 0
MluHijackObject.apply_hijack(ParallelConfig,
ParallelConfig.__init__,
vllm__config__ParallelConfig___init__)
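For reference, every hijack in this commit follows the pattern apply_hijack(cls, original_member, replacement). A minimal sketch of what such a helper could look like is below; the real MluHijackObject in vllm_mlu may differ (for example it may support restoring the original), so treat this purely as an illustration of the monkey-patching idea. The string form of the second argument mirrors the calls later in this commit that pass an attribute name instead of a method:

# Illustrative sketch only -- not the actual vllm_mlu implementation.
class _HijackSketch:
    _originals = []  # remember (cls, name, original) so patches could be undone

    @staticmethod
    def apply_hijack(cls, original, replacement):
        # `original` may be the class member itself or just its attribute name.
        name = original if isinstance(original, str) else original.__name__
        _HijackSketch._originals.append((cls, name, getattr(cls, name, None)))
        setattr(cls, name, replacement)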

View File

@@ -0,0 +1,2 @@
from . import communication_op
from . import parallel_state

View File

@@ -0,0 +1,21 @@
import torch
from typing import Any, Dict, Optional, Union
from .parallel_state import get_tp_group
def tensor_model_parallel_all_reduce(input_: torch.Tensor, tp_group: Any = None) -> torch.Tensor:
"""All-reduce the input tensor across model parallel group."""
return get_tp_group(tp_group).all_reduce(input_)
def tensor_model_parallel_all_gather(input_: torch.Tensor,
dim: int = -1, tp_group: Any = None) -> torch.Tensor:
"""All-gather the input tensor across model parallel group."""
return get_tp_group(tp_group).all_gather(input_, dim)
def tensor_model_parallel_gather(input_: torch.Tensor,
dst: int = 0,
dim: int = -1, tp_group: Any = None) -> Optional[torch.Tensor]:
"""Gather the input tensor across model parallel group."""
return get_tp_group(tp_group).gather(input_, dst, dim)
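These wrappers differ from the stock vLLM helpers only in the optional tp_group argument; when it is None they fall back to the global tensor-parallel group. A hedged usage sketch, assuming the sketch lives next to these modules inside the package, that distributed and model-parallel groups are already initialized, and that "mlu" is the device string on this platform:

import torch
from .parallel_state import get_moe_tp_group
from .communication_op import tensor_model_parallel_all_reduce

x = torch.ones(4, device="mlu")
y = tensor_model_parallel_all_reduce(x)                               # global TP group
z = tensor_model_parallel_all_reduce(x, tp_group=get_moe_tp_group())  # explicit MoE TP group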

View File

@@ -0,0 +1,339 @@
import torch
from typing import Any, Dict, List, Optional, Tuple, Union
from vllm.config import ParallelConfig
from vllm.distributed.parallel_state import (init_model_parallel_group, get_tensor_model_parallel_world_size,
get_tensor_model_parallel_rank, get_world_group, get_pp_group,
GroupCoordinator)
import vllm.distributed.parallel_state as parallel_state_org
from vllm.distributed.parallel_state import model_parallel_is_initialized as model_parallel_is_initialized_org
from vllm.distributed.parallel_state import destroy_model_parallel as destroy_model_parallel_org
def get_tp_group(tp_group: Any = None) -> GroupCoordinator:
if tp_group is not None:
return tp_group
assert parallel_state_org._TP is not None, ("tensor model parallel group is not initialized")
return parallel_state_org._TP
_CP: Optional[GroupCoordinator] = None
def get_cp_group() -> GroupCoordinator:
assert _CP is not None, ("context parallel group is not initialized")
return _CP
# kept for backward compatibility
get_context_model_parallel_group = get_cp_group
_MOE_TP: Optional[GroupCoordinator] = None
def get_moe_tp_group() -> GroupCoordinator:
assert _MOE_TP is not None, ("moe tensor parallel group is not initialized")
return _MOE_TP
# kept for backward compatibility
get_moe_tensor_parallel_group = get_moe_tp_group
_MOE_EP: Optional[GroupCoordinator] = None
def get_moe_ep_group() -> GroupCoordinator:
assert _MOE_EP is not None, ("moe expert parallel group is not initialized")
return _MOE_EP
# kept for backward compatibility
get_moe_expert_parallel_group = get_moe_ep_group
def initialize_model_parallel(
parallel_config: ParallelConfig,
backend: Optional[str] = None,
) -> None:
"""
Initialize model parallel groups.
Arguments:
tensor_model_parallel_size: number of GPUs used for tensor model
parallelism.
pipeline_model_parallel_size: number of GPUs used for pipeline model
parallelism.
Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
the model pipeline. The present function will
create 4 tensor model-parallel groups and 2 pipeline model-parallel groups:
4 tensor model-parallel groups:
[g0, g1], [g2, g3], [g4, g5], [g6, g7]
2 pipeline model-parallel groups:
[g0, g2, g4, g6], [g1, g3, g5, g7]
Note that for efficiency, the caller should make sure adjacent ranks
are on the same DGX box. For example if we are using 2 DGX-1 boxes
with a total of 16 GPUs, rank 0 to 7 belong to the first box and
ranks 8 to 15 belong to the second box.
"""
# Get world size and rank. Ensure some consistencies.
assert torch.distributed.is_initialized()
world_size: int = torch.distributed.get_world_size()
backend = backend or torch.distributed.get_backend(
get_world_group().device_group)
'''
=============================
Modify by vllm_mlu
=============================
@brief: get parallel_size from parallel_config and valid world_size
'''
tensor_model_parallel_size = parallel_config.tensor_parallel_size
pipeline_model_parallel_size = parallel_config.pipeline_parallel_size
context_model_parallel_size = parallel_config.context_parallel_size
moe_tensor_parallel_size = parallel_config.moe_tp_size
moe_expert_parallel_size = parallel_config.moe_ep_size
if (world_size !=
tensor_model_parallel_size * pipeline_model_parallel_size * context_model_parallel_size):
raise RuntimeError(
f"world_size ({world_size}) is not equal to "
f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
f"pipeline_model_parallel_size ({pipeline_model_parallel_size}) x "
f"context_model_parallel_size ({context_model_parallel_size})")
if (moe_tensor_parallel_size < 1 or moe_expert_parallel_size < 1 or tensor_model_parallel_size !=
moe_tensor_parallel_size * moe_expert_parallel_size):
raise RuntimeError(
f"tensor_model_parallel_size ({tensor_model_parallel_size}) is not equal to "
f"moe_tensor_parallel_size ({moe_tensor_parallel_size}) x "
f"moe_expert_parallel_size ({moe_expert_parallel_size})")
'''
==================
End of MLU Hijack
==================
'''
# Build the tensor model-parallel groups.
num_tensor_model_parallel_groups: int = (world_size //
tensor_model_parallel_size)
assert parallel_state_org._TP is None, ("tensor model parallel group is already initialized")
group_ranks = []
for i in range(num_tensor_model_parallel_groups):
ranks = list(
range(i * tensor_model_parallel_size,
(i + 1) * tensor_model_parallel_size))
group_ranks.append(ranks)
# message queue broadcaster is only used in tensor model parallel group
parallel_state_org._TP = init_model_parallel_group(group_ranks,
get_world_group().local_rank,
backend,
use_message_queue_broadcaster=True,
group_name="tp")
# Build the pipeline model-parallel groups.
num_pipeline_model_parallel_groups: int = (world_size //
pipeline_model_parallel_size)
assert parallel_state_org._PP is None, (
"pipeline model parallel group is already initialized")
group_ranks = []
for i in range(num_pipeline_model_parallel_groups):
ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
group_ranks.append(ranks)
# pipeline parallel does not need custom allreduce
parallel_state_org._PP = init_model_parallel_group(group_ranks,
get_world_group().local_rank,
backend,
use_custom_allreduce=False,
group_name="pp")
'''
=============================
Modify by vllm_mlu
=============================
@brief: add _CP, _MOE_TP, MOE_EP
'''
# Build the context parallel groups.
num_context_model_parallel_groups: int = (world_size //
context_model_parallel_size)
global _CP
assert _CP is None, (
"context parallel group is already initialized")
group_ranks = []
for i in range(num_context_model_parallel_groups):
ranks = list(range(i, context_model_parallel_size * tensor_model_parallel_size + i, tensor_model_parallel_size))
group_ranks.append(ranks)
# message queue broadcaster is set to be used in context parallel group
_CP = init_model_parallel_group(group_ranks,
get_world_group().local_rank,
backend,
use_message_queue_broadcaster=True,
group_name="cp")
# Build the moe tensor parallel groups.
global _MOE_TP
assert _MOE_TP is None, ("moe tensor parallel group is already initialized")
group_ranks = []
for i in range(num_tensor_model_parallel_groups):
for j in range(moe_expert_parallel_size):
ranks = list(range(i * tensor_model_parallel_size + j, (i + 1) * tensor_model_parallel_size,
moe_expert_parallel_size))
group_ranks.append(ranks)
# message queue broadcaster is set to be used in moe tensor parallel group
_MOE_TP = init_model_parallel_group(group_ranks,
get_world_group().local_rank,
backend,
use_message_queue_broadcaster=True,
group_name="moe_tp")
# Build the moe expert parallel groups.
global _MOE_EP
assert _MOE_EP is None, ("moe expert parallel group is already initialized")
group_ranks = []
for i in range(num_tensor_model_parallel_groups):
for j in range(moe_tensor_parallel_size):
ranks = list(range(i * tensor_model_parallel_size + j * moe_expert_parallel_size,
i * tensor_model_parallel_size + (j + 1) * moe_expert_parallel_size))
group_ranks.append(ranks)
# message queue broadcaster is set to be used in moe expert parallel group
_MOE_EP = init_model_parallel_group(group_ranks,
get_world_group().local_rank,
backend,
use_message_queue_broadcaster=True,
group_name="moe_ep")
'''
==================
End of MLU Hijack
==================
'''
def ensure_model_parallel_initialized(
parallel_config: ParallelConfig,
backend: Optional[str] = None,
) -> None:
"""Helper to initialize model parallel groups if they are not initialized,
or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
values if the model parallel groups are initialized.
"""
backend = backend or torch.distributed.get_backend(
get_world_group().device_group)
if not model_parallel_is_initialized():
'''
=============================
Modify by vllm_mlu
=============================
@brief: replace all parallel_size to parallel_config
'''
initialize_model_parallel(parallel_config, backend)
'''
==================
End of MLU Hijack
==================
'''
return
'''
=============================
Modify by vllm_mlu
=============================
@brief: check parallel_size with prefix parallel_config
'''
assert (
get_tensor_model_parallel_world_size() == parallel_config.tensor_parallel_size
), ("tensor parallel group already initialized, but of unexpected size: "
f"{get_tensor_model_parallel_world_size()=} vs. "
f"{parallel_config.tensor_parallel_size=}")
pp_world_size = get_pp_group().world_size
assert (pp_world_size == parallel_config.pipeline_parallel_size), (
"pipeline parallel group already initialized, but of unexpected size: "
f"{pp_world_size=} vs. "
f"{parallel_config.pipeline_parallel_size=}")
cp_world_size = get_cp_group().world_size
assert (cp_world_size == parallel_config.context_parallel_size), (
"context parallel group already initialized, but of unexpected size: "
f"{cp_world_size=} vs. "
f"{parallel_config.context_parallel_size=}")
moe_tp_world_size = get_moe_tp_group().world_size
assert (moe_tp_world_size == parallel_config.moe_tp_size), (
"moe tensor parallel group already initialized, but of unexpected size: "
f"{moe_tp_world_size=} vs. "
f"{parallel_config.moe_tp_size=}")
moe_ep_world_size = get_moe_ep_group().world_size
assert (moe_ep_world_size == parallel_config.moe_ep_size), (
"moe expert parallel group already initialized, but of unexpected size: "
f"{moe_ep_world_size=} vs. "
f"{parallel_config.moe_ep_size=}")
'''
==================
End of MLU Hijack
==================
'''
def model_parallel_is_initialized():
"""Check if tensor, pipeline, context and moe parallel groups are initialized."""
return (model_parallel_is_initialized_org() and _CP is not None
and _MOE_TP is not None and _MOE_EP is not None)
def destroy_model_parallel():
"""Set the groups to none and destroy them."""
destroy_model_parallel_org()
global _CP
if _CP:
_CP.destroy()
_CP = None
global _MOE_TP
if _MOE_TP:
_MOE_TP.destroy()
_MOE_TP = None
global _MOE_EP
if _MOE_EP:
_MOE_EP.destroy()
_MOE_EP = None
def get_context_model_parallel_world_size():
"""Return world size for the context parallel group."""
return get_cp_group().world_size
def get_context_model_parallel_rank():
"""Return my rank for the context parallel group."""
return get_cp_group().rank_in_group
def get_moe_tensor_parallel_world_size():
"""Return world size for the moe tensor parallel group."""
return get_moe_tp_group().world_size
def get_moe_tensor_parallel_rank():
"""Return my rank for the moe tensor parallel group."""
return get_moe_tp_group().rank_in_group
def get_moe_expert_parallel_world_size():
"""Return world size for the moe expert parallel group."""
return get_moe_ep_group().world_size
def get_moe_expert_parallel_rank():
"""Return my rank for the moe expert parallel group."""
return get_moe_ep_group().rank_in_group
def get_parallel_world_size_with_group(group):
"""Return world size for the special group."""
if group is not None:
return group.world_size
else:
return get_tensor_model_parallel_world_size()
def get_parallel_rank_with_group(group):
"""Return my rank for the special group."""
if group is not None:
return group.rank_in_group
else:
return get_tensor_model_parallel_rank()
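To make the group layout above concrete, here is a small standalone sketch that replays the rank-grouping loops from initialize_model_parallel for an assumed configuration of world_size=8 with tp=4, pp=1, cp=2, moe_tp=2, moe_ep=2 (so tp = moe_tp x moe_ep). It is a restatement for reference, not part of the module:

# Standalone illustration of the grouping math above (no torch.distributed needed).
tp, pp, cp, moe_tp, moe_ep = 4, 1, 2, 2, 2
world_size = tp * pp * cp  # 8

tp_groups = [list(range(i * tp, (i + 1) * tp)) for i in range(world_size // tp)]
cp_groups = [list(range(i, cp * tp + i, tp)) for i in range(world_size // cp)]
moe_tp_groups = [list(range(i * tp + j, (i + 1) * tp, moe_ep))
                 for i in range(world_size // tp) for j in range(moe_ep)]
moe_ep_groups = [list(range(i * tp + j * moe_ep, i * tp + (j + 1) * moe_ep))
                 for i in range(world_size // tp) for j in range(moe_tp)]

print(tp_groups)      # [[0, 1, 2, 3], [4, 5, 6, 7]]
print(cp_groups)      # [[0, 4], [1, 5], [2, 6], [3, 7]]
print(moe_tp_groups)  # [[0, 2], [1, 3], [4, 6], [5, 7]]
print(moe_ep_groups)  # [[0, 1], [2, 3], [4, 5], [6, 7]]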

View File

@@ -0,0 +1 @@
from . import arg_utils

View File

@@ -0,0 +1,141 @@
import argparse
import torch
from vllm.config import VllmConfig, ParallelConfig
from vllm.engine.arg_utils import EngineArgs, AsyncEngineArgs
from vllm_mlu._mlu_utils import *
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm.logger import init_logger
from vllm.utils import FlexibleArgumentParser
logger = init_logger(__name__)
vllm__engine__arg_utils__EngineArgs__create_engine_config_org = EngineArgs.create_engine_config
vllm__engine__arg_utils__EngineArgs__add_cli_args_org = EngineArgs.add_cli_args
vllm__engine__arg_utils__EngineArgs__from_cli_args_org = EngineArgs.from_cli_args
vllm__engine__arg_utils__AsyncEngineArgs__from_cli_args_org = AsyncEngineArgs.from_cli_args
def vllm__engine__arg_utils__EngineArgs__create_engine_config(self) -> VllmConfig:
'''
=============================
Modify by vllm_mlu
=============================
@brief: chunked pipeline parallel only supports batch size 1 for now.
'''
if CHUNKED_PIPELINE_PARALLEL_EN:
self.max_num_seqs = 1
logger.info("Reset max_num_seqs to 1 because chunked pipeline parallel mode "
"only supports a batch size of 1.")
'''
@brief: disable custom_all_reduce, and re-set block_size to support paged and unpaged modes.
'''
# MLU does not support custom all-reduce
self.disable_custom_all_reduce = True
BlockSizeInfo.set_block_size(self.block_size)
if not USE_PAGED and self.enable_chunked_prefill:
raise ValueError("chunked_prefill is not supported in unpaged mode.")
# set parallel_config context_parallel_size, moe_tp_size, moe_ep_size
self.context_parallel_size = getattr(self, "context_parallel_size", 1)
self.moe_tp_size = getattr(self, "moe_tp_size", -1)
self.moe_ep_size = getattr(self, "moe_ep_size", -1)
# check whether context parallel is supported
if CONTEXT_PARALLEL_EN:
if self.context_parallel_size > 1 and get_device_major_capability() == 3:
raise ValueError('Context parallel is not supported on MLU370.')
else:
if self.context_parallel_size > 1:
raise ValueError('Context parallel is not supported when CONTEXT_PARALLEL_EN=False.')
# check whether expert parallel is supported
if not EXPERT_PARALLEL_EN and (self.moe_tp_size > 1 or self.moe_ep_size > 1):
raise ValueError('Expert parallel is not supported when EXPERT_PARALLEL_EN=False.')
ParallelConfig.context_parallel_size = self.context_parallel_size
# set parallel_config moe_tp_size and moe_ep_size
if self.moe_tp_size < 1 and self.moe_ep_size < 1:
moe_tp_size = self.tensor_parallel_size
moe_ep_size = 1
elif self.moe_tp_size >= 1 and self.moe_ep_size < 1:
moe_tp_size = self.moe_tp_size
moe_ep_size = self.tensor_parallel_size // self.moe_tp_size
elif self.moe_tp_size < 1 and self.moe_ep_size >= 1:
moe_tp_size = self.tensor_parallel_size // self.moe_ep_size
moe_ep_size = self.moe_ep_size
else:
moe_tp_size = self.moe_tp_size
moe_ep_size = self.moe_ep_size
assert moe_tp_size * moe_ep_size == self.tensor_parallel_size, (
f"tensor_parallel_size ({self.tensor_parallel_size}) is not equal to "
f"moe_tp_size ({self.moe_tp_size}) x moe_ep_size ({self.moe_ep_size}); "
"alternatively, moe_tp_size and moe_ep_size may both be -1, or exactly one of them may be -1")
ParallelConfig.moe_tp_size = moe_tp_size
ParallelConfig.moe_ep_size = moe_ep_size
engine_config = vllm__engine__arg_utils__EngineArgs__create_engine_config_org(self)
engine_config.cache_config.block_size = BlockSizeInfo.BLOCK_SIZE
'''
==================
End of MLU Hijack
==================
'''
return engine_config
@staticmethod
def vllm__engine__arg_utils__EngineArgs__add_cli_args(
parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
parser = vllm__engine__arg_utils__EngineArgs__add_cli_args_org(parser)
'''
=============================
Modify by vllm_mlu
=============================
@brief: add --context-parallel-size, --moe-tp-size and --moe-ep-size
'''
parser.add_argument('--context-parallel-size',
'-cp',
type=int,
default=1,
help='number of context parallel replicas')
parser.add_argument('--moe-tp-size',
type=int,
default=-1,
help='Number of moe tensor parallel replicas')
parser.add_argument('--moe-ep-size',
type=int,
default=-1,
help='Number of moe expert parallel replicas')
'''
==================
End of MLU Hijack
==================
'''
return parser
@classmethod
def vllm__engine__arg_utils__EngineArgs__from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
if cls == AsyncEngineArgs:
engine_args = vllm__engine__arg_utils__AsyncEngineArgs__from_cli_args_org(args)
else:
engine_args = vllm__engine__arg_utils__EngineArgs__from_cli_args_org(args)
setattr(engine_args, 'context_parallel_size', getattr(args, "context_parallel_size"))
setattr(engine_args, 'moe_tp_size', getattr(args, "moe_tp_size"))
setattr(engine_args, 'moe_ep_size', getattr(args, "moe_ep_size"))
return engine_args
MluHijackObject.apply_hijack(EngineArgs,
EngineArgs.create_engine_config,
vllm__engine__arg_utils__EngineArgs__create_engine_config)
MluHijackObject.apply_hijack(EngineArgs,
EngineArgs.add_cli_args,
vllm__engine__arg_utils__EngineArgs__add_cli_args)
MluHijackObject.apply_hijack(EngineArgs,
EngineArgs.from_cli_args,
vllm__engine__arg_utils__EngineArgs__from_cli_args)
MluHijackObject.apply_hijack(AsyncEngineArgs,
AsyncEngineArgs.from_cli_args,
vllm__engine__arg_utils__EngineArgs__from_cli_args)
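The moe_tp_size / moe_ep_size defaulting logic inside create_engine_config above can be summarised by the small helper below. It is only a restatement for reference; resolve_moe_sizes is a hypothetical name, not part of the package:

def resolve_moe_sizes(tensor_parallel_size: int, moe_tp_size: int, moe_ep_size: int):
    """Mirror of the fallback rule above: -1 means 'derive from the other value'."""
    if moe_tp_size < 1 and moe_ep_size < 1:
        moe_tp_size, moe_ep_size = tensor_parallel_size, 1
    elif moe_tp_size >= 1 and moe_ep_size < 1:
        moe_ep_size = tensor_parallel_size // moe_tp_size
    elif moe_tp_size < 1 and moe_ep_size >= 1:
        moe_tp_size = tensor_parallel_size // moe_ep_size
    assert moe_tp_size * moe_ep_size == tensor_parallel_size
    return moe_tp_size, moe_ep_size

# e.g. resolve_moe_sizes(8, -1, -1) -> (8, 1); resolve_moe_sizes(8, 2, -1) -> (2, 4)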

View File

@@ -0,0 +1 @@
from . import llm

View File

@@ -0,0 +1,98 @@
from typing import Optional, Dict, Any
from vllm.entrypoints.llm import LLM
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm.logger import init_logger
from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
TaskOption)
logger = init_logger(__name__)
vllm__entrypoints__llm__LLM____init__org = LLM.__init__
def vllm__entrypoints__llm__LLM____init__(
self,
model: str,
tokenizer: Optional[str] = None,
tokenizer_mode: str = "auto",
skip_tokenizer_init: bool = False,
trust_remote_code: bool = False,
allowed_local_media_path: str = "",
tensor_parallel_size: int = 1,
dtype: str = "auto",
quantization: Optional[str] = None,
revision: Optional[str] = None,
tokenizer_revision: Optional[str] = None,
seed: int = 0,
gpu_memory_utilization: float = 0.9,
swap_space: float = 4,
cpu_offload_gb: float = 0,
enforce_eager: Optional[bool] = None,
max_seq_len_to_capture: int = 8192,
disable_custom_all_reduce: bool = False,
disable_async_output_proc: bool = False,
hf_overrides: Optional[HfOverrides] = None,
mm_processor_kwargs: Optional[Dict[str, Any]] = None,
# After positional args are removed, move this right below `model`
task: TaskOption = "auto",
override_pooler_config: Optional[PoolerConfig] = None,
**kwargs,
) -> None:
'''
LLM constructor.
Note: if enforce_eager is unset (enforce_eager is None)
it defaults to False.
'''
'''
=============================
Modify by vllm_mlu
=============================
@brief: add cp and ep parameter
'''
# pop context_parallel_size
EngineArgs.context_parallel_size = kwargs.pop("context_parallel_size", 1)
# pop moe_tp_size
EngineArgs.moe_tp_size = kwargs.pop("moe_tp_size", -1)
# pop moe_ep_size
EngineArgs.moe_ep_size = kwargs.pop("moe_ep_size", -1)
'''
==================
End of MLU Hijack
==================
'''
vllm__entrypoints__llm__LLM____init__org(
self=self,
model=model,
tokenizer=tokenizer,
tokenizer_mode=tokenizer_mode,
skip_tokenizer_init=skip_tokenizer_init,
trust_remote_code=trust_remote_code,
allowed_local_media_path=allowed_local_media_path,
tensor_parallel_size=tensor_parallel_size,
dtype=dtype,
quantization=quantization,
revision=revision,
tokenizer_revision=tokenizer_revision,
seed=seed,
gpu_memory_utilization=gpu_memory_utilization,
swap_space=swap_space,
cpu_offload_gb=cpu_offload_gb,
enforce_eager=enforce_eager,
max_seq_len_to_capture=max_seq_len_to_capture,
disable_custom_all_reduce=disable_custom_all_reduce,
disable_async_output_proc=disable_async_output_proc,
hf_overrides=hf_overrides,
mm_processor_kwargs=mm_processor_kwargs,
# After positional args are removed, move this right below `model`
task=task,
override_pooler_config=override_pooler_config,
**kwargs
)
MluHijackObject.apply_hijack(LLM,
LLM.__init__,
vllm__entrypoints__llm__LLM____init__)
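Because the hijacked constructor pops these keys from **kwargs before delegating to the stock LLM.__init__, they can be passed directly at construction time. A hedged example; the model path and the parallel sizes below are placeholders:

from vllm import LLM

# Placeholder model path and sizes, for illustration only.
llm = LLM(model="/path/to/qwen3-model",
          tensor_parallel_size=4,
          context_parallel_size=1,   # popped by the hijack above
          moe_tp_size=2,             # popped by the hijack above
          moe_ep_size=2)             # popped by the hijack above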

View File

@@ -0,0 +1,7 @@
print("Apply Custom VLLM Demo!")
from . import distributed
from . import engine
from . import entrypoints
from . import worker
from . import config
from . import model_executor

View File

@@ -0,0 +1,2 @@
from . import layers
from . import parameter

View File

@@ -0,0 +1,2 @@
from . import linear
from . import feed_forward

View File

@@ -0,0 +1,93 @@
from typing import Optional, Any
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.layers.linear import (
MergedColumnParallelLinear,
ColumnParallelLinear,
RowParallelLinear
)
from vllm_mlu.mlu_hijack_utils import set_is_gated, MluHijackObject
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
from ....mlu_hijack.distributed.parallel_state import (get_parallel_rank_with_group, get_parallel_world_size_with_group)
logger = init_logger(__name__)
def vllm__mlu_hijack__model_executor__layers__feed_forward__FeedForward____init__(
self,
hidden_size: int,
intermediate_size: int,
hidden_act: str,
up_proj_name: str,
is_gated: bool,
down_proj_name: str,
bias: bool,
quant_config: Optional[QuantizationConfig] = None,
skip_bias_add: bool = False,
reduce_results: bool = True,
prefix: str = "",
tp_group: Any = None,
):
super(FeedForward, self).__init__()
self.hidden_size = hidden_size
self.hidden_act = hidden_act
self.is_gated = is_gated
self.bias = bias
self.up_proj_name = up_proj_name
self.down_proj_name = down_proj_name
self.quant_config = quant_config
self.is_initialized = False
self.skip_bias_add = skip_bias_add
self.reduce_results = reduce_results
self.use_bt_ffn = quant_config is None
set_is_gated(self.is_gated)
self.tp_size = get_parallel_world_size_with_group(tp_group)
self.tp_rank = get_parallel_rank_with_group(tp_group)
'''
=============================
Modify by vllm_mlu
=============================
@brief: add tp_group parameter at the end of each linear class
'''
self.tp_group = tp_group
# up_proj with gate or not
if self.is_gated:
up_proj = MergedColumnParallelLinear(hidden_size,
[intermediate_size] * 2,
bias=bias,
quant_config=quant_config,
prefix=f"{prefix}.{up_proj_name}",
tp_group=tp_group)
else:
up_proj = ColumnParallelLinear(hidden_size,
intermediate_size,
bias=bias,
skip_bias_add=skip_bias_add,
quant_config=quant_config,
prefix=f"{prefix}.{up_proj_name}",
tp_group=tp_group)
self.register_module(up_proj_name, up_proj)
# down_proj
down_proj = RowParallelLinear(intermediate_size,
hidden_size,
bias=bias,
skip_bias_add=skip_bias_add,
reduce_results=reduce_results,
quant_config=quant_config,
prefix=f"{prefix}.{down_proj_name}",
tp_group=tp_group)
'''
==================
End of MLU Hijack
==================
'''
self.register_module(down_proj_name, down_proj)
MluHijackObject.apply_hijack(FeedForward,
FeedForward.__init__,
vllm__mlu_hijack__model_executor__layers__feed_forward__FeedForward____init__)
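The extra tp_group argument threaded through the linear layers above is what allows an expert MLP to be sharded over the MoE tensor-parallel group instead of the global TP group. A hedged construction sketch follows; the sizes are placeholders, the absolute import path for get_moe_tp_group is an assumption (the module above uses a relative import), and it presumes the parallel groups are already initialized:

from vllm_mlu.model_executor.layers.feed_forward import FeedForward
# Assumed absolute path to the hijacked parallel_state module shown earlier.
from vllm_mlu.mlu_hijack.distributed.parallel_state import get_moe_tp_group

# Placeholder sizes; shards this MLP over the MoE TP group rather than the global TP group.
expert_mlp = FeedForward(hidden_size=4096,
                         intermediate_size=11008,
                         hidden_act="silu",
                         up_proj_name="gate_up_proj",
                         is_gated=True,
                         down_proj_name="down_proj",
                         bias=False,
                         tp_group=get_moe_tp_group())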

View File

@@ -0,0 +1,696 @@
from typing import Optional, List, Any, Tuple
import torch
from torch.nn.parameter import Parameter, UninitializedParameter
from vllm.distributed import (divide, split_tensor_along_last_dim)
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
from vllm.model_executor.utils import set_weight_attrs
from vllm.model_executor.parameter import (BasevLLMParameter,
PerTensorScaleParameter,
RowvLLMParameter)
from vllm.logger import init_logger
from vllm_mlu._mlu_utils import *
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm.model_executor.layers.linear import (WEIGHT_LOADER_V2_SUPPORTED, LinearBase, ColumnParallelLinear,
MergedColumnParallelLinear, RowParallelLinear, adjust_marlin_shard,
adjust_scalar_to_fused_array)
from vllm import _mlu_ops as mlu_ops
from ....mlu_hijack.distributed.parallel_state import (get_parallel_rank_with_group, get_parallel_world_size_with_group,
get_tp_group)
from ....mlu_hijack.distributed.communication_op import (tensor_model_parallel_all_reduce,
tensor_model_parallel_all_gather)
vllm__model_executor__layers__linear__LinearBase____init__org = LinearBase.__init__
logger = init_logger(__name__)
def vllm__model_executor__layers__linear__LinearBase____init__(
self,
input_size: int,
output_size: int,
skip_bias_add: bool = False,
params_dtype: Optional[torch.dtype] = None,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
tp_group: Any = None,
):
vllm__model_executor__layers__linear__LinearBase____init__org(self=self,
input_size=input_size,
output_size=output_size,
skip_bias_add=skip_bias_add,
params_dtype=params_dtype,
quant_config=quant_config,
prefix=prefix)
'''
=============================
Modify by vllm_mlu
=============================
@brief: add self.tp_group, world_size and tp_rank to support moe expert parallel
'''
self.tp_group = tp_group
self.tp_world_size = get_parallel_world_size_with_group(self.tp_group)
self.tp_rank = get_parallel_rank_with_group(self.tp_group)
'''
=================
End of MLU Hijack
=================
'''
def vllm__model_executor__layers__linear__ColumnParallelLinear____init__(
self,
input_size: int,
output_size: int,
bias: bool = True,
gather_output: bool = False,
skip_bias_add: bool = False,
params_dtype: Optional[torch.dtype] = None,
quant_config: Optional[QuantizationConfig] = None,
output_sizes: Optional[List[int]] = None,
prefix: str = "",
tp_group: Any = None,
):
super(ColumnParallelLinear, self).__init__(input_size, output_size, skip_bias_add, params_dtype,
quant_config, prefix, tp_group)
self.gather_output = gather_output
# Divide the weight matrix along the last dimension.
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
@brief: move checking output_sizes logic from MergedColumnParallelLinear to here
'''
tp_size = self.tp_world_size
if output_sizes is not None:
assert all(output_size_var % tp_size == 0 for output_size_var in output_sizes)
'''
=================
End of MLU Hijack
=================
'''
assert self.quant_method is not None
self.output_size_per_partition = divide(self.output_size, tp_size)
self.output_partition_sizes = [self.output_size_per_partition]
# If QKV or MergedColumn, use output size of each partition.
if hasattr(self, "output_sizes"):
self.output_partition_sizes = [
divide(output_size, tp_size)
for output_size in self.output_sizes
]
if output_sizes is None:
output_sizes = [output_size]
self.quant_method.create_weights(
layer=self,
input_size_per_partition=self.input_size,
output_partition_sizes=self.output_partition_sizes,
input_size=self.input_size,
output_size=self.output_size,
params_dtype=self.params_dtype,
weight_loader=(
self.weight_loader_v2 if self.quant_method.__class__.__name__
in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader))
if bias:
self.bias = Parameter(
torch.empty(self.output_size_per_partition,
dtype=params_dtype))
set_weight_attrs(self.bias, {
"output_dim": 0,
"weight_loader": self.weight_loader,
})
else:
self.register_parameter("bias", None)
def vllm__model_executor__layers__linear__ColumnParallelLinear__weight_loader(
self, param: Parameter, loaded_weight: torch.Tensor):
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
'''
tp_rank = self.tp_rank
'''
=================
End of MLU Hijack
=================
'''
output_dim = getattr(param, "output_dim", None)
# Special case for GGUF
is_gguf_weight = getattr(param, "is_gguf_weight", False)
is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
if is_gguf_weight_type:
param.weight_type = loaded_weight.item()
# Materialize GGUF UninitializedParameter
if is_gguf_weight and isinstance(param, UninitializedParameter):
param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype)
use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
param_data = param.data
# bitsandbytes loads the weights of the specific portion
# no need to narrow here
if output_dim is not None and not use_bitsandbytes_4bit:
shard_size = param_data.shape[output_dim]
start_idx = tp_rank * shard_size
loaded_weight = loaded_weight.narrow(output_dim, start_idx,
shard_size)
# Special case for loading scales off disk, which often do not
# have a shape (such as in the case of AutoFP8).
if len(loaded_weight.shape) == 0:
loaded_weight = loaded_weight.reshape(1)
assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)
def vllm__model_executor__layers__linear__ColumnParallelLinear__forward(
self, input_, smooth_quant_scale: Optional[torch.Tensor] = None):
bias = self.bias if not self.skip_bias_add else None
# Matrix multiply.
assert self.quant_method is not None
'''
=============================
Modify by vllm_mlu
=============================
@brief: Add input_scale parameter.
'''
if smooth_quant_scale is not None:
output_parallel = self.quant_method.apply(self, input_, bias,
input_scale=smooth_quant_scale)
else:
output_parallel = self.quant_method.apply(self, input_, bias)
'''
==================
End of MLU Hijack
==================
'''
if self.gather_output:
# All-gather across the partitions.
'''
=============================
Modify by vllm_mlu
=============================
@brief: add tp_group param to tensor_model_parallel_all_gather
'''
output = tensor_model_parallel_all_gather(output_parallel, self.tp_group)
'''
=================
End of MLU Hijack
=================
'''
else:
output = output_parallel
output_bias = self.bias if self.skip_bias_add else None
return output, output_bias
def vllm__model_executor__layers__linear__ColumnParallelLinear__extra_repr(self) -> str:
s = f"in_features={self.input_size}"
s += f", output_features={self.output_size_per_partition}"
s += f", bias={self.bias is not None}"
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
'''
s += f", tp_size={self.tp_world_size}"
'''
=================
End of MLU Hijack
=================
'''
s += f", gather_output={self.gather_output}"
return s
def vllm__model_executor__layers__linear__MergedColumnParallelLinear____init__(
self,
input_size: int,
output_sizes: List[int],
bias: bool = True,
gather_output: bool = False,
skip_bias_add: bool = False,
params_dtype: Optional[torch.dtype] = None,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
tp_group: Any = None,
):
self.output_sizes = output_sizes
'''
=============================
Modify by vllm_mlu
=============================
@brief: move checking output_sizes logic from MergedColumnParallelLinear to ColumnParallelLinear.__init__
'''
# tp_size = get_tensor_model_parallel_world_size()
# assert all(output_size % tp_size == 0 for output_size in output_sizes)
'''
=================
End of MLU Hijack
=================
'''
super(MergedColumnParallelLinear, self).__init__(input_size=input_size,
output_size=sum(output_sizes),
bias=bias,
gather_output=gather_output,
skip_bias_add=skip_bias_add,
params_dtype=params_dtype,
quant_config=quant_config,
output_sizes=self.output_sizes,
prefix=prefix,
tp_group=tp_group)
def vllm__model_executor__layers__linear__MergedColumnParallelLinear__weight_loader(self,
param: Parameter,
loaded_weight: torch.Tensor,
loaded_shard_id: Optional[int] = None):
# Special case for GGUF
# initialize GGUF param after we know the quantize type
is_gguf_weight = getattr(param, "is_gguf_weight", False)
is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
if is_gguf_weight_type:
param.data[loaded_shard_id].copy_(loaded_weight)
param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
return
if is_gguf_weight:
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
@brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
'''
tp_rank = self.tp_rank
tp_size = self.tp_world_size
'''
=================
End of MLU Hijack
=================
'''
output_dim = getattr(param, "output_dim", None)
shard_size = loaded_weight.size(output_dim) // tp_size
start_idx = tp_rank * shard_size
loaded_weight = loaded_weight.narrow(output_dim, start_idx,
shard_size)
param.shard_id.append(loaded_shard_id)
param.shard_id_map[loaded_shard_id] = len(param.data_container)
param.data_container.append(loaded_weight)
if len(param.data_container) == 2:
self.qweight = param.materialize_nested()
return
param_data = param.data
output_dim = getattr(param, "output_dim", None)
# Special case for AQLM codebooks.
is_metadata = getattr(param, "is_metadata", False)
# Special case for per-tensor scale to load scalar into fused array.
needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False)
if loaded_shard_id is None:
# Loaded weight is already fused on disk (qkv/mlp).
if output_dim is None:
if needs_scalar_to_array:
param_data, loaded_weight = adjust_scalar_to_fused_array(
param_data, loaded_weight, 0)
assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)
return
current_shard_offset = 0
shard_offsets: List[Tuple[int, int, int]] = []
for i, output_size in enumerate(self.output_sizes):
shard_offsets.append((i, current_shard_offset, output_size))
current_shard_offset += output_size
packed_dim = getattr(param, "packed_dim", None)
for shard_id, shard_offset, shard_size in shard_offsets:
# Special case for Quantization.
# If quantized, we need to adjust the offset and size to account
# for the packing.
if packed_dim == output_dim:
shard_size = shard_size // param.pack_factor
shard_offset = shard_offset // param.pack_factor
# Special case for Marlin.
shard_size, shard_offset = adjust_marlin_shard(
param, shard_size, shard_offset)
loaded_weight_shard = loaded_weight.narrow(
output_dim, shard_offset, shard_size)
self.weight_loader(param, loaded_weight_shard, shard_id)
return
assert loaded_shard_id < len(self.output_sizes)
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
@brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
'''
tp_rank = self.tp_rank
tp_size = self.tp_world_size
'''
=================
End of MLU Hijack
=================
'''
if output_dim is not None:
shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size
shard_size = self.output_sizes[loaded_shard_id] // tp_size
# Special case for quantization.
# If quantized, we need to adjust the offset and size to account
# for the packing.
packed_dim = getattr(param, "packed_dim", None)
if packed_dim == output_dim:
shard_size = shard_size // param.pack_factor
shard_offset = shard_offset // param.pack_factor
# Special case for Marlin.
shard_size, shard_offset = adjust_marlin_shard(
param, shard_size, shard_offset)
use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit",
False)
if use_bitsandbytes_4bit:
shard_size = loaded_weight.shape[output_dim]
shard_offset = loaded_weight.shape[output_dim] * \
loaded_shard_id
param_data = param_data.narrow(output_dim, shard_offset,
shard_size)
start_idx = tp_rank * shard_size
# bitsandbytes loads the weights of the specific portion
# no need to narrow here
if not use_bitsandbytes_4bit:
loaded_weight = loaded_weight.narrow(output_dim, start_idx,
shard_size)
# Special case for AQLM codebooks.
elif is_metadata:
# metadata indicates fixed size concatenated along dim 0
shard_size = loaded_weight.shape[0]
shard_offset = loaded_shard_id * shard_size
param_data = param_data.narrow(0, shard_offset, shard_size)
# Special case for per-tensor scales in fused case.
elif needs_scalar_to_array:
param_data, loaded_weight = adjust_scalar_to_fused_array(
param_data, loaded_weight, loaded_shard_id)
else:
ignore_warning = getattr(param, "ignore_warning", False)
if not ignore_warning:
logger.warning(
"Loading a weight without `output_dim` attribute in "
"MergedColumnParallelLinear, assume the weight is "
"the same for all partitions.")
assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)
def vllm__model_executor__layers__linear__MergedColumnParallelLinear__weight_loader_v2(self,
param: BasevLLMParameter,
loaded_weight: torch.Tensor,
loaded_shard_id: Optional[int] = None):
if loaded_shard_id is None:
if isinstance(param, PerTensorScaleParameter):
param.load_merged_column_weight(loaded_weight=loaded_weight,
shard_id=0)
return
elif type(param) in (RowvLLMParameter, BasevLLMParameter):
param.load_merged_column_weight(loaded_weight=loaded_weight)
return
# TODO: @dsikka - move to parameter.py
self._load_fused_module_from_checkpoint(param, loaded_weight)
return
assert loaded_shard_id < len(self.output_sizes)
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
'''
tp_size = self.tp_world_size
'''
=================
End of MLU Hijack
=================
'''
shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size
shard_size = self.output_sizes[loaded_shard_id] // tp_size
param.load_merged_column_weight(loaded_weight=loaded_weight,
shard_id=loaded_shard_id,
shard_offset=shard_offset,
shard_size=shard_size)
def vllm__model_executor__layers__linear__RowParallelLinear____init__(
self,
input_size: int,
output_size: int,
bias: bool = True,
input_is_parallel: bool = True,
skip_bias_add: bool = False,
params_dtype: Optional[torch.dtype] = None,
reduce_results: bool = True,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
tp_group: Any = None,
):
super(RowParallelLinear, self).__init__(input_size, output_size, skip_bias_add, params_dtype,
quant_config, prefix, tp_group)
self.input_is_parallel = input_is_parallel
self.reduce_results = reduce_results
# Divide the weight matrix along the last dimension.
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
@brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
'''
self.tp_size = self.tp_world_size
'''
=================
End of MLU Hijack
=================
'''
self.input_size_per_partition = divide(input_size, self.tp_size)
assert self.quant_method is not None
self.quant_method.create_weights(
layer=self,
input_size_per_partition=self.input_size_per_partition,
output_partition_sizes=[self.output_size],
input_size=self.input_size,
output_size=self.output_size,
params_dtype=self.params_dtype,
weight_loader=(
self.weight_loader_v2 if self.quant_method.__class__.__name__
in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader))
if not reduce_results and (bias and not skip_bias_add):
raise ValueError("When not reduce the results, adding bias to the "
"results can lead to incorrect results")
if bias:
self.bias = Parameter(
torch.empty(self.output_size, dtype=params_dtype))
set_weight_attrs(self.bias, {
"output_dim": 0,
"weight_loader": self.weight_loader,
})
else:
self.register_parameter("bias", None)
def vllm__model_executor__layers__linear__RowParallelLinear__weight_loader(
self, param: Parameter, loaded_weight: torch.Tensor):
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
@brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size
'''
tp_rank = self.tp_rank
tp_size = self.tp_world_size
'''
=================
End of MLU Hijack
=================
'''
input_dim = getattr(param, "input_dim", None)
use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
# Special case for GGUF
is_gguf_weight = getattr(param, "is_gguf_weight", False)
is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
if is_gguf_weight_type:
param.weight_type = loaded_weight.item()
# Materialize GGUF UninitializedParameter
if is_gguf_weight and isinstance(param, UninitializedParameter):
weight_shape = list(loaded_weight.shape)
if input_dim:
weight_shape[input_dim] = weight_shape[input_dim] // tp_size
param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype)
param_data = param.data
# bitsandbytes loads the weights of the specific portion
# no need to narrow here
if input_dim is not None and not use_bitsandbytes_4bit:
shard_size = param_data.shape[input_dim]
start_idx = tp_rank * shard_size
loaded_weight = loaded_weight.narrow(input_dim, start_idx,
shard_size)
# Special case for loading scales off disk, which often do not
# have a shape (such as in the case of AutoFP8).
if len(loaded_weight.shape) == 0:
loaded_weight = loaded_weight.reshape(1)
assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)
def vllm__model_executor__layers__linear__RowParallelLinear__forward(
self,
input_,
residual: Optional[torch.Tensor] = None
):
if self.input_is_parallel:
input_parallel = input_
else:
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
'''
tp_rank = self.tp_rank
'''
=================
End of MLU Hijack
=================
'''
splitted_input = split_tensor_along_last_dim(
input_, num_partitions=self.tp_size)
input_parallel = splitted_input[tp_rank].contiguous()
# Matrix multiply.
assert self.quant_method is not None
# Only fuse bias add into GEMM for rank 0 (this ensures that
# bias will not get added more than once in TP>1 case)
bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
residual_ = None if self.tp_rank > 0 else residual
'''
=====================================================
Modify by custom vllm_mlu
=====================================================
@brief: abandon original reduce if parallel_num is set
'''
is_parallel_enable = hasattr(self.quant_method, 'parallel_num') and get_is_prompt()
'''
=====================================================
End of custom MLU Hijack
=====================================================
'''
output_parallel = self.quant_method.apply(self,
input_parallel,
bias=bias_,
residual=residual_)
'''
=============================
Modify by custom vllm_mlu
=============================
@brief: when preload_size is set, call GroupCoordinator.all_reduce() directly and
use async_op to set all_reduce paralleled with preload
'''
if self.reduce_results and self.tp_size > 1 and not is_parallel_enable:
if hasattr(self, 'preload_size') and self.preload_size > 0 and not self.is_prompt:
handle = get_tp_group(self.tp_group).all_reduce(output_parallel, async_op=True)
_MB = 1 << 20
mlu_ops.preload(self.preloaded_weights[0].data, self.preload_size * _MB)
preloaded_weights_size = self.preloaded_weights[0].numel() * self.preloaded_weights[0].element_size()
if preloaded_weights_size < (self.preload_size * _MB) and len(self.preloaded_weights) > 1:
mlu_ops.preload(self.preloaded_weights[1].data, (self.preload_size * _MB) - preloaded_weights_size)
handle.wait()
output = output_parallel
else:
'''
=============================
Modify by vllm_mlu
=============================
@brief: add tensor_model_parallel_all_reduce() with self.tp_group
'''
output = tensor_model_parallel_all_reduce(output_parallel, tp_group=self.tp_group)
'''
=================
End of MLU Hijack
=================
'''
else:
output = output_parallel
'''
=========================
End of custom MLU Hijack
=========================
'''
output_bias = self.bias if self.skip_bias_add else None
return output, output_bias
MluHijackObject.apply_hijack(LinearBase,
LinearBase.__init__,
vllm__model_executor__layers__linear__LinearBase____init__)
MluHijackObject.apply_hijack(ColumnParallelLinear,
ColumnParallelLinear.__init__,
vllm__model_executor__layers__linear__ColumnParallelLinear____init__)
MluHijackObject.apply_hijack(ColumnParallelLinear,
ColumnParallelLinear.weight_loader,
vllm__model_executor__layers__linear__ColumnParallelLinear__weight_loader)
MluHijackObject.apply_hijack(ColumnParallelLinear,
ColumnParallelLinear.forward,
vllm__model_executor__layers__linear__ColumnParallelLinear__forward)
MluHijackObject.apply_hijack(ColumnParallelLinear,
ColumnParallelLinear.extra_repr,
vllm__model_executor__layers__linear__ColumnParallelLinear__extra_repr)
MluHijackObject.apply_hijack(MergedColumnParallelLinear,
MergedColumnParallelLinear.__init__,
vllm__model_executor__layers__linear__MergedColumnParallelLinear____init__)
MluHijackObject.apply_hijack(MergedColumnParallelLinear,
MergedColumnParallelLinear.weight_loader,
vllm__model_executor__layers__linear__MergedColumnParallelLinear__weight_loader)
MluHijackObject.apply_hijack(MergedColumnParallelLinear,
MergedColumnParallelLinear.weight_loader_v2,
vllm__model_executor__layers__linear__MergedColumnParallelLinear__weight_loader_v2)
MluHijackObject.apply_hijack(RowParallelLinear,
RowParallelLinear.__init__,
vllm__model_executor__layers__linear__RowParallelLinear____init__)
MluHijackObject.apply_hijack(RowParallelLinear,
RowParallelLinear.weight_loader,
vllm__model_executor__layers__linear__RowParallelLinear__weight_loader)
MluHijackObject.apply_hijack(RowParallelLinear,
RowParallelLinear.forward,
vllm__model_executor__layers__linear__RowParallelLinear__forward)

View File

@@ -0,0 +1,173 @@
from fractions import Fraction
from typing import Callable, Optional, Union, Any
import torch
from torch.nn import Parameter
from vllm.model_executor.parameter import (BasevLLMParameter,
PackedColumnParameter,
PackedvLLMParameter,
PerTensorScaleParameter,
RowvLLMParameter,
_ColumnvLLMParameter)
from vllm.distributed import get_tensor_model_parallel_rank
from vllm.logger import init_logger
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from ..distributed.parallel_state import (get_parallel_rank_with_group, get_parallel_world_size_with_group)
logger = init_logger(__name__)
def vllm__model_executor__parameter__BasevLLMParameter____init__(self, data: torch.Tensor, weight_loader: Callable, tp_group: Any = None):
"""
Initialize the BasevLLMParameter
:param data: torch tensor with the parameter data
:param weight_loader: weight loader callable
:returns: a torch.nn.parameter
"""
self._weight_loader = weight_loader
'''
=============================
Modify by vllm_mlu
=============================
@brief: add self.tp_group, world_size and tp_rank to support moe expert parallel
'''
self.tp_group = tp_group
self.tp_world_size = get_parallel_world_size_with_group(self.tp_group)
self.tp_rank = get_parallel_rank_with_group(self.tp_group)
'''
=================
End of MLU Hijack
=================
'''
def vllm__model_executor__parameter___ColumnvLLMParameter__load_column_parallel_weight(self, loaded_weight: torch.Tensor):
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
'''
tp_rank = self.tp_rank
'''
=================
End of MLU Hijack
=================
'''
shard_size = self.data.shape[self.output_dim]
loaded_weight = loaded_weight.narrow(self.output_dim,
tp_rank * shard_size, shard_size)
assert self.data.shape == loaded_weight.shape
self.data.copy_(loaded_weight)
def vllm__model_executor__parameter___ColumnvLLMParameter__load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs):
shard_offset = kwargs.get("shard_offset")
shard_size = kwargs.get("shard_size")
if isinstance(
self,
(PackedColumnParameter,
PackedvLLMParameter)) and self.packed_dim == self.output_dim:
shard_size, shard_offset = self.adjust_shard_indexes_for_packing(
shard_offset=shard_offset, shard_size=shard_size)
param_data = self.data
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
'''
tp_rank = self.tp_rank
'''
=================
End of MLU Hijack
=================
'''
param_data = param_data.narrow(self.output_dim, shard_offset,
shard_size)
loaded_weight = loaded_weight.narrow(self.output_dim,
tp_rank * shard_size, shard_size)
assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)
def vllm__model_executor__parameter___ColumnvLLMParameter__load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs):
shard_offset = kwargs.get("shard_offset")
shard_size = kwargs.get("shard_size")
shard_id = kwargs.get("shard_id")
num_heads = kwargs.get("num_heads")
if isinstance(
self,
(PackedColumnParameter,
PackedvLLMParameter)) and self.output_dim == self.packed_dim:
shard_size, shard_offset = self.adjust_shard_indexes_for_packing(
shard_offset=shard_offset, shard_size=shard_size)
param_data = self.data
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
'''
tp_rank = self.tp_rank
'''
=================
End of MLU Hijack
=================
'''
shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads
param_data = param_data.narrow(self.output_dim, shard_offset,
shard_size)
loaded_weight = loaded_weight.narrow(self.output_dim,
shard_id * shard_size, shard_size)
assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)
def vllm__model_executor__parameter__RowvLLMParameter__load_row_parallel_weight(self, loaded_weight: torch.Tensor):
'''
=============================
Modify by vllm_mlu
=============================
@brief: modify get_tensor_model_parallel_rank() to self.tp_rank
'''
tp_rank = self.tp_rank
'''
=================
End of MLU Hijack
=================
'''
shard_size = self.data.shape[self.input_dim]
loaded_weight = loaded_weight.narrow(self.input_dim,
tp_rank * shard_size, shard_size)
if len(loaded_weight.shape) == 0:
loaded_weight = loaded_weight.reshape(1)
assert self.data.shape == loaded_weight.shape
self.data.copy_(loaded_weight)
MluHijackObject.apply_hijack(BasevLLMParameter,
BasevLLMParameter.__init__,
vllm__model_executor__parameter__BasevLLMParameter____init__)
MluHijackObject.apply_hijack(_ColumnvLLMParameter,
_ColumnvLLMParameter.load_column_parallel_weight,
vllm__model_executor__parameter___ColumnvLLMParameter__load_column_parallel_weight)
MluHijackObject.apply_hijack(_ColumnvLLMParameter,
_ColumnvLLMParameter.load_merged_column_weight,
vllm__model_executor__parameter___ColumnvLLMParameter__load_merged_column_weight)
MluHijackObject.apply_hijack(_ColumnvLLMParameter,
_ColumnvLLMParameter.load_qkv_weight,
vllm__model_executor__parameter___ColumnvLLMParameter__load_qkv_weight)
MluHijackObject.apply_hijack(RowvLLMParameter,
RowvLLMParameter.load_row_parallel_weight,
vllm__model_executor__parameter__RowvLLMParameter__load_row_parallel_weight)

View File

@@ -0,0 +1 @@
from . import mlu_worker

View File

@@ -0,0 +1,192 @@
import gc
import os
import torch
from typing import List, Optional, Set, Tuple, Type
from vllm.config import ParallelConfig
from vllm.distributed import init_distributed_environment, set_custom_all_reduce
from vllm.model_executor import set_random_seed
from vllm.worker.mlu_worker import MLUWorker, _check_if_gpu_supports_dtype
from vllm_mlu.worker.mlu_worker import MLUWorker_V2
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from ..distributed.parallel_state import ensure_model_parallel_initialized
import functools
from collections import defaultdict
from vllm.logger import init_logger
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead
from vllm.model_executor.layers.linear import (
ColumnParallelLinear,
MergedColumnParallelLinear,
QKVParallelLinear,
RowParallelLinear)
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
from vllm.distributed import get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size
from ..distributed.parallel_state import (get_moe_tensor_parallel_rank, get_moe_tensor_parallel_world_size,
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
logger = init_logger(__name__)
def vllm__worker__mlu_worker__init_worker_distributed_environment(
parallel_config: ParallelConfig,
rank: int,
distributed_init_method: Optional[str] = None,
local_rank: int = -1,
) -> None:
set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)
init_distributed_environment(parallel_config.world_size, rank,
distributed_init_method, local_rank,
backend='cncl')
'''
=============================
Modify by vllm_mlu
=============================
@brief: add context_parallel_size, moe_tp_size, moe_ep_size
'''
ensure_model_parallel_initialized(parallel_config=parallel_config)
'''
==================
End of MLU Hijack
==================
'''
def vllm__worker__mlu_worker__MLUWorker__init_device(self) -> None:
if self.device_config.device.type == "mlu":
# torch.distributed.all_reduce does not free the input tensor until
# the synchronization point. This causes the memory usage to grow
# as the number of all_reduce calls increases. This env var disables
# this behavior.
# Related issue:
# https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
os.environ["TORCH_CNCL_AVOID_RECORD_STREAMS"] = "1"
# This env var set by Ray causes exceptions with graph building.
os.environ.pop("CNCL_ASYNC_ERROR_HANDLING", None)
self.device = torch.device(f"mlu:{self.local_rank}")
torch.mlu.set_device(self.device)
_check_if_gpu_supports_dtype(self.model_config.dtype)
gc.collect()
torch.mlu.empty_cache()
self.init_gpu_memory = torch.mlu.mem_get_info()[0]
else:
raise RuntimeError(
f"Not support device type: {self.device_config.device}")
# Initialize the distributed environment.
'''
=============================
Modify by vllm_mlu
=============================
@brief: call vllm__worker__mlu_worker__init_worker_distributed_environment instead of the stock helper
'''
vllm__worker__mlu_worker__init_worker_distributed_environment(self.parallel_config, self.rank,
self.distributed_init_method, self.local_rank)
'''
==================
End of MLU Hijack
==================
'''
# Set random seed.
set_random_seed(self.model_config.seed)
def default_act_range_value():
return {
"x": None,
"split": None,
"is_linear": False,
"is_qkv": False,
"q_proj_size": 0,
"num_kv_head_replicas": 1,
"is_merge": False,
"input_id": [],
"self_rank": 0,
"rank": None,
"tensor_rank": None,
"tp_world_size": None,
"moe_tp_rank": None,
"moe_tp_world_size": None,
"moe_ep_rank": None,
"moe_ep_world_size": None,
"weight": None,
}
def vllm_mlu__worker__mlu_worker__MLUWorker_V2__setup_smooth_hook(self,
is_save_input_id: bool = False,
is_save_moe_info: bool = False):
model = self.model_runner.model
self.act_range = defaultdict(default_act_range_value)
self.hooks = []
linear_class_list = (ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear)
other_class_list = (VocabParallelEmbedding, ParallelLMHead)
class_list = linear_class_list + other_class_list
row_class_list = (RowParallelLinear,)
for name, m in model.named_modules():
if isinstance(m, FeedForward):
m.use_bt_ffn = False
if isinstance(m, SparseMoeMlp):
m.is_use_fused_moe = False
if isinstance(m, class_list):
is_linear = isinstance(m, linear_class_list)
split_type = "row" if isinstance(m, row_class_list) else "col"
self.act_range[name]["split"] = split_type
self.act_range[name]["is_linear"] = is_linear
if isinstance(m, QKVParallelLinear):
self.act_range[name]["is_qkv"] = True
self.act_range[name]["q_proj_size"] = m.num_heads * m.head_size
self.act_range[name]["num_kv_head_replicas"] = m.num_kv_head_replicas
self.act_range[name]["is_merge"] = isinstance(m, MergedColumnParallelLinear)
if is_save_moe_info:
self.act_range[name]["rank"] = torch.distributed.get_rank()
self.act_range[name]["tensor_rank"] = get_tensor_model_parallel_rank()
self.act_range[name]["tp_world_size"] = get_tensor_model_parallel_world_size()
self.act_range[name]["moe_tp_rank"] = get_moe_tensor_parallel_rank()
self.act_range[name]["moe_tp_world_size"] = get_moe_tensor_parallel_world_size()
self.act_range[name]["moe_ep_rank"] = get_moe_expert_parallel_rank()
self.act_range[name]["moe_ep_world_size"] = get_moe_expert_parallel_world_size()
if ".expert." in name:
self.act_range[name]["weight"] = m.weight
logger.info(f"rank:{self.rank}, add hook to {name}, is_linear:{is_linear}, split_type:{split_type}")
self.hooks.append(
m.register_forward_hook(
functools.partial(self.stat_input_hook,
name=name,
act_range=self.act_range,
is_linear=is_linear,
is_save_input_id=is_save_input_id)))
def vllm_mlu__worker__mlu_worker__MLUWorker_V2__get_act_range(self):
act_range = defaultdict(default_act_range_value)
for layer_name, layer_range in self.act_range.items():
for tensor_key, tensor_value in layer_range.items():
if isinstance(tensor_value, torch.Tensor):
act_range[layer_name][tensor_key] = tensor_value.to("cpu")
elif tensor_key == "input_id" and isinstance(tensor_value, list):
input_id_len = len(tensor_value)
for i in range(input_id_len):
if isinstance(tensor_value[i], torch.Tensor):
act_range[layer_name][tensor_key].append(tensor_value[i].to("cpu"))
else:
act_range[layer_name][tensor_key].append(tensor_value[i])
else:
act_range[layer_name][tensor_key] = tensor_value
return act_range
MluHijackObject.apply_hijack(MLUWorker,
MLUWorker.init_device,
vllm__worker__mlu_worker__MLUWorker__init_device)
MluHijackObject.apply_hijack(MLUWorker,
"setup_smooth_hook",
vllm_mlu__worker__mlu_worker__MLUWorker_V2__setup_smooth_hook)
MluHijackObject.apply_hijack(MLUWorker,
"get_act_range",
vllm_mlu__worker__mlu_worker__MLUWorker_V2__get_act_range)