[Model] Support DeepSeek-V4
vllm_mlu/v1/engine/__init__.py (new file)
@@ -0,0 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

vllm_mlu/v1/engine/async_llm.py (new file)
@@ -0,0 +1,23 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

from vllm.v1.engine.async_llm import AsyncLLM

from vllm_mlu.mlu_hijack_utils import MluHijackObject


class AsyncLLM_MluHijack(AsyncLLM):

    async def start_scheduler_profile(self) -> None:
        await self.engine_core.start_scheduler_profile()

    async def stop_scheduler_profile(self) -> None:
        await self.engine_core.stop_scheduler_profile()


MluHijackObject.apply_hijack(AsyncLLM,
                             "start_scheduler_profile",
                             AsyncLLM_MluHijack.start_scheduler_profile)
MluHijackObject.apply_hijack(AsyncLLM,
                             "stop_scheduler_profile",
                             AsyncLLM_MluHijack.stop_scheduler_profile)
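
All four new files follow the same pattern: a *_MluHijack subclass defines methods that either add MLU-specific entry points or replace upstream vLLM behavior, and each override is then registered with MluHijackObject.apply_hijack. The helper itself lives in vllm_mlu/mlu_hijack_utils.py and is not part of this diff; the following is only a hedged sketch of what such a helper could look like, assuming it is a setattr-based monkey patch that records the original attribute so it can be restored later.

# Hypothetical sketch of the hijack helper; the real implementation is not shown in this diff.
class MluHijackObject:
    _originals: list = []  # (target_cls, attr_name, original_attr_or_None)

    @classmethod
    def apply_hijack(cls, target_cls, attr, replacement):
        # The call sites pass either an attribute name ("start_scheduler_profile")
        # or the original callable itself (e.g. EngineCore.step).
        attr_name = attr if isinstance(attr, str) else attr.__name__
        cls._originals.append((target_cls, attr_name, getattr(target_cls, attr_name, None)))
        setattr(target_cls, attr_name, replacement)
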
vllm_mlu/v1/engine/core.py (new file)
@@ -0,0 +1,566 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

from collections import deque
import signal
from typing import Any, Callable, cast
from concurrent.futures import Future

from vllm.config import ParallelConfig, VllmConfig
from vllm.logger import logger
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.cache import engine_receiver_cache_from_config
from vllm.transformers_utils.config import maybe_register_config_serialize_by_value
from vllm.utils.gc_utils import freeze_gc_heap
from vllm.utils.hashing import get_hash_fn_by_name
from vllm.utils.system_utils import decorate_logs, set_process_title
from vllm.v1.core.kv_cache_utils import BlockHash, get_request_block_hasher, init_none_hash
from vllm.v1.engine import EngineCoreOutputs
from vllm.v1.engine.core import (
    EngineCore,
    EngineCoreProc,
    DPEngineCoreProc,
)
from vllm.v1.executor.abstract import Executor
from vllm.v1.core.sched.interface import SchedulerInterface
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request
from vllm.v1.structured_output import StructuredOutputManager
from vllm.version import __version__ as VLLM_VERSION
from logging import DEBUG

import vllm_mlu._mlu_utils as mlu_envs
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm_mlu.mlu_metric import LLMMetric


class EngineCore_MluHijack(EngineCore):

    def __init__(
        self,
        vllm_config: VllmConfig,
        executor_class: type[Executor],
        log_stats: bool,
        executor_fail_callback: Callable | None = None,
    ):
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: load_general_plugins in run_engine_core
        '''
        # # plugins need to be loaded at the engine/scheduler level too
        # from vllm.plugins import load_general_plugins
        # load_general_plugins()
        '''
        ==================
        End of MLU Hijack
        ==================
        '''

        self.vllm_config = vllm_config
        if vllm_config.parallel_config.data_parallel_rank == 0:
            logger.info(
                "Initializing a V1 LLM engine (v%s) with config: %s",
                VLLM_VERSION,
                vllm_config,
            )

        self.log_stats = log_stats

        # Setup Model.
        self.model_executor = executor_class(vllm_config)
        if executor_fail_callback is not None:
            self.model_executor.register_failure_callback(executor_fail_callback)

        self.available_gpu_memory_for_kv_cache = -1

        # Setup KV Caches and update CacheConfig after profiling.
        num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
            vllm_config
        )

        vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks
        vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks
        self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks))

        self.structured_output_manager = StructuredOutputManager(vllm_config)

        # Setup scheduler.
        Scheduler = vllm_config.scheduler_config.get_scheduler_cls()

        if len(kv_cache_config.kv_cache_groups) == 0:
            # Encoder models without KV cache don't support
            # chunked prefill. But do SSM models?
            logger.info("Disabling chunked prefill for model without KVCache")
            vllm_config.scheduler_config.enable_chunked_prefill = False

        scheduler_block_size = (
            vllm_config.cache_config.block_size
            * vllm_config.parallel_config.decode_context_parallel_size
        )

        self.scheduler: SchedulerInterface = Scheduler(
            vllm_config=vllm_config,
            kv_cache_config=kv_cache_config,
            structured_output_manager=self.structured_output_manager,
            include_finished_set=vllm_config.parallel_config.data_parallel_size > 1,
            log_stats=self.log_stats,
            block_size=scheduler_block_size,
        )
        self.use_spec_decode = vllm_config.speculative_config is not None
        if self.scheduler.connector is not None:  # type: ignore
            self.model_executor.init_kv_output_aggregator(self.scheduler.connector)  # type: ignore

        self.mm_registry = mm_registry = MULTIMODAL_REGISTRY
        self.mm_receiver_cache = engine_receiver_cache_from_config(
            vllm_config, mm_registry
        )

        # If a KV connector is initialized for scheduler, we want to collect
        # handshake metadata from all workers so the connector in the scheduler
        # will have the full context
        kv_connector = self.scheduler.get_kv_connector()
        if kv_connector is not None:
            # Collect and store KV connector xfer metadata from workers
            # (after KV cache registration)
            xfer_handshake_metadata = (
                self.model_executor.get_kv_connector_handshake_metadata()
            )

            if xfer_handshake_metadata:
                # xfer_handshake_metadata is list of dicts from workers
                # Each dict already has structure {tp_rank: metadata}
                # Merge all worker dicts into a single dict
                content: dict[int, Any] = {}
                for worker_dict in xfer_handshake_metadata:
                    if worker_dict is not None:
                        content.update(worker_dict)
                kv_connector.set_xfer_handshake_metadata(content)

        # Setup batch queue for pipeline parallelism.
        # Batch queue for scheduled batches. This enables us to asynchronously
        # schedule and execute batches, and is required by pipeline parallelism
        # to eliminate pipeline bubbles.
        self.batch_queue_size = self.model_executor.max_concurrent_batches
        self.batch_queue: (
            deque[tuple[Future[ModelRunnerOutput], SchedulerOutput]] | None
        ) = None
        if self.batch_queue_size > 1:
            logger.info("Batch queue is enabled with size %d", self.batch_queue_size)
            self.batch_queue = deque(maxlen=self.batch_queue_size)

        self.ec_producer = (
            vllm_config.ec_transfer_config is not None
            and vllm_config.ec_transfer_config.is_ec_producer
        )
        self.is_pooling_model = vllm_config.model_config.runner_type == "pooling"

        self.request_block_hasher: Callable[[Request], list[BlockHash]] | None = None
        if vllm_config.cache_config.enable_prefix_caching or kv_connector is not None:
            caching_hash_fn = get_hash_fn_by_name(
                vllm_config.cache_config.prefix_caching_hash_algo
            )
            init_none_hash(caching_hash_fn)

            self.request_block_hasher = get_request_block_hasher(
                scheduler_block_size, caching_hash_fn
            )

        self.step_fn = (
            self.step if self.batch_queue is None else self.step_with_batch_queue
        )
        self.async_scheduling = vllm_config.scheduler_config.async_scheduling

        # Mark the startup heap as static so that it's ignored by GC.
        # Reduces pause times of oldest generation collections.
        freeze_gc_heap()

        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: v1 support offline benchmark
        '''
        self.step_latency = []
        self.model_exec_latency = []
        self.mm_encoder_latency = []
        self.num_gpu_blocks = num_gpu_blocks
        self.num_cpu_blocks = num_cpu_blocks
        '''
        ==================
        End of MLU Hijack
        ==================
        '''

    def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]:
        """Schedule, execute, and make output.

        Returns tuple of outputs and a flag indicating whether the model
        was executed.
        """

        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: v1 support offline benchmark
        '''
        if mlu_envs.VLLM_LATENCY_DEBUG_EN:
            step_start = LLMMetric.get_mlu_cost_time()
        '''
        ==================
        End of MLU Hijack
        ==================
        '''

        # Check for any requests remaining in the scheduler - unfinished,
        # or finished and not yet removed from the batch.
        if not self.scheduler.has_requests():
            return {}, False
        scheduler_output = self.scheduler.schedule()
        future = self.model_executor.execute_model(scheduler_output, non_block=True)
        grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output)
        with self.log_error_detail(scheduler_output):
            model_output = future.result()
            if model_output is None:
                model_output = self.model_executor.sample_tokens(grammar_output)

        if self.use_spec_decode and \
                self.vllm_config.kv_transfer_config is not None and \
                self.vllm_config.kv_transfer_config.kv_role == "kv_producer":
            draft_token_ids = self.model_executor.take_draft_token_ids()
            self.scheduler.draft_token_ids = draft_token_ids

        engine_core_outputs = self.scheduler.update_from_output(
            scheduler_output, model_output
        )
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: v1 support offline benchmark
        '''
        has_sched_reqs = (scheduler_output.total_num_scheduled_tokens > 0)
        if mlu_envs.VLLM_LATENCY_DEBUG_EN and has_sched_reqs:
            self.step_latency.append(LLMMetric.get_mlu_cost_time() - step_start)
        if mlu_envs.VLLM_LATENCY_DEBUG_WITH_DEVICE_EN and has_sched_reqs:
            self.model_exec_latency.append(self.get_model_exec_latency())
            mm_encoder_latency = self.get_mm_encoder_latency()
            if mm_encoder_latency:
                self.mm_encoder_latency.append(mm_encoder_latency)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        return engine_core_outputs, scheduler_output.total_num_scheduled_tokens > 0

    def step_with_batch_queue(
        self,
    ) -> tuple[dict[int, EngineCoreOutputs] | None, bool]:
        """Schedule and execute batches with the batch queue.
        Note that if nothing to output in this step, None is returned.

        The execution flow is as follows:
        1. Try to schedule a new batch if the batch queue is not full.
        If a new batch is scheduled, directly return an empty engine core
        output. In other words, fulfilling the batch queue has a higher priority
        than getting model outputs.
        2. If there is no new scheduled batch, meaning that the batch queue
        is full or no other requests can be scheduled, we block until the first
        batch in the job queue is finished.
        3. Update the scheduler from the output.
        """
        batch_queue = self.batch_queue
        assert batch_queue is not None

        # Try to schedule a new batch if the batch queue is not full, but
        # the scheduler may return an empty batch if all requests are scheduled.
        # Note that this is not blocking.
        assert len(batch_queue) < self.batch_queue_size

        model_executed = False
        deferred_scheduler_output = None
        if self.scheduler.has_requests():
            scheduler_output = self.scheduler.schedule()
            exec_future = self.model_executor.execute_model(
                scheduler_output, non_block=True
            )
            if not self.ec_producer:
                model_executed = scheduler_output.total_num_scheduled_tokens > 0

            if self.is_pooling_model or not model_executed:
                # No sampling required (no requests scheduled).
                future = cast(Future[ModelRunnerOutput], exec_future)
            else:
                exec_future.add_done_callback(self._log_err_callback(scheduler_output))

                if not scheduler_output.pending_structured_output_tokens:
                    # We aren't waiting for any tokens, get any grammar output
                    # and sample immediately.
                    grammar_output = self.scheduler.get_grammar_bitmask(
                        scheduler_output
                    )
                    future = self.model_executor.sample_tokens(
                        grammar_output, non_block=True
                    )
                else:
                    # We need to defer sampling until we have processed the model output
                    # from the prior step.
                    deferred_scheduler_output = scheduler_output

            if not deferred_scheduler_output:
                # Add this step's future to the queue.
                batch_queue.appendleft((future, scheduler_output))
                if (
                    model_executed
                    and len(batch_queue) < self.batch_queue_size
                    and not batch_queue[-1][0].done()
                ):
                    # Don't block on next worker response unless the queue is full
                    # or there are no more requests to schedule.
                    return None, True

        elif not batch_queue:
            # Queue is empty. We should not reach here since this method should
            # only be called when the scheduler contains requests or the queue
            # is non-empty.
            return None, False

        # Block until the next result is available.
        future, scheduler_output = batch_queue.pop()
        with self.log_error_detail(scheduler_output):
            model_output = future.result()

        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: support disagg for MLU.
        '''
        if self.use_spec_decode and \
                self.vllm_config.kv_transfer_config is not None and \
                self.vllm_config.kv_transfer_config.kv_role == "kv_producer":
            draft_token_ids = self.model_executor.take_draft_token_ids()
            self.scheduler.draft_token_ids = draft_token_ids
        '''
        ==================
        End of MLU Hijack
        ==================
        '''

        engine_core_outputs = self.scheduler.update_from_output(
            scheduler_output, model_output
        )

        # NOTE(nick): We can either handle the deferred tasks here or save
        # in a field and do it immediately once step_with_batch_queue is
        # re-called. The latter slightly favors TTFT over TPOT/throughput.
        if deferred_scheduler_output:
            # We now have the tokens needed to compute the bitmask for the
            # deferred request. Get the bitmask and call sample tokens.
            grammar_output = self.scheduler.get_grammar_bitmask(
                deferred_scheduler_output
            )
            future = self.model_executor.sample_tokens(grammar_output, non_block=True)
            batch_queue.appendleft((future, deferred_scheduler_output))

        return engine_core_outputs, model_executed

    def get_model_exec_latency(self):
        latency = self.model_executor.get_latency()
        return latency

    def get_mm_encoder_latency(self):
        return self.model_executor.get_mm_encoder_latency()

    def get_hfu_info(self, batch, input_len, output_len):
        return self.model_executor.get_hfu_info(batch, input_len, output_len)

    def get_latency(self):
        return (self.step_latency, self.model_exec_latency, self.mm_encoder_latency)

    def get_memory_usage(self):
        peak_memory, block_memory = self.model_executor.get_memory_usage()
        return (peak_memory, block_memory,
                self.num_gpu_blocks, self.num_cpu_blocks)

    def recapture_model(self,
                        prefill_enable_mlugraph: bool,
                        batch_size: int,
                        input_len: int):
        self.model_executor.recapture_model(
            prefill_enable_mlugraph, batch_size, input_len)

    def init_metric(self, use_unchunk_sched: bool, min_prefill_batch: int):
        self.step_latency = []
        self.model_exec_latency = []
        self.mm_encoder_latency = []
        mlu_envs.VLLM_V1_USE_UNCHUNK_SCHED = use_unchunk_sched
        mlu_envs.VLLM_V1_MIN_PREFILL_BATCH = min_prefill_batch

    def start_scheduler_profile(self):
        self.scheduler.start_scheduler_profile()

    def stop_scheduler_profile(self):
        self.scheduler.stop_scheduler_profile()

    def response_remote_alloc_once(self):
        self.model_executor.response_remote_alloc_once()


class EngineCoreProc_MluHijack(EngineCoreProc):

    @staticmethod
    def run_engine_core(*args, dp_rank: int = 0, local_dp_rank: int = 0, **kwargs):
        """Launch EngineCore busy loop in background process."""

        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: load_general_plugins for mp backend engine
        '''
        # plugins need to be loaded at the engine/scheduler level too
        from vllm.plugins import load_general_plugins
        load_general_plugins()
        '''
        ==================
        End of MLU Hijack
        ==================
        '''

        # Signal handler used for graceful termination.
        # SystemExit exception is only raised once to allow this and worker
        # processes to terminate without error
        shutdown_requested = False

        # Ensure we can serialize transformer config after spawning
        maybe_register_config_serialize_by_value()

        def signal_handler(signum, frame):
            nonlocal shutdown_requested
            if not shutdown_requested:
                shutdown_requested = True
                raise SystemExit()

        # Either SIGTERM or SIGINT will terminate the engine_core
        signal.signal(signal.SIGTERM, signal_handler)
        signal.signal(signal.SIGINT, signal_handler)

        engine_core: EngineCoreProc | None = None
        try:
            parallel_config: ParallelConfig = kwargs["vllm_config"].parallel_config
            if parallel_config.data_parallel_size > 1 or dp_rank > 0:
                set_process_title("EngineCore", f"DP{dp_rank}")
                decorate_logs()
                # Set data parallel rank for this engine process.
                parallel_config.data_parallel_rank = dp_rank
                parallel_config.data_parallel_rank_local = local_dp_rank
                engine_core = DPEngineCoreProc(*args, **kwargs)
            else:
                set_process_title("EngineCore")
                decorate_logs()
                engine_core = EngineCoreProc(*args, **kwargs)

            engine_core.run_busy_loop()

        except SystemExit:
            logger.debug("EngineCore exiting.")
            raise
        except Exception as e:
            if engine_core is None:
                logger.exception("EngineCore failed to start.")
            else:
                logger.exception("EngineCore encountered a fatal error.")
                engine_core._send_engine_dead()
            raise e
        finally:
            if engine_core is not None:
                engine_core.shutdown()

    def _process_input_queue(self):
        """Exits when an engine step needs to be performed."""

        waited = False
        while (
            not self.engines_running
            and not self.scheduler.has_requests()
            and not self.batch_queue
        ):
            if logger.isEnabledFor(DEBUG) and self.input_queue.empty():
                logger.debug("EngineCore waiting for work.")
                waited = True

            if self.vllm_config.kv_transfer_config is not None and \
                    self.vllm_config.kv_transfer_config.kv_role == "kv_consumer":
                self.response_remote_alloc_once()
                if self.input_queue.empty():
                    continue
                req = self.input_queue.get_nowait()
                self._handle_client_request(*req)
            else:
                req = self.input_queue.get()
                self._handle_client_request(*req)

        if waited:
            logger.debug("EngineCore loop active.")

        if self.vllm_config.kv_transfer_config is not None and \
                self.vllm_config.kv_transfer_config.kv_role == "kv_consumer":
            self.response_remote_alloc_once()

        # Handle any more client requests.
        while not self.input_queue.empty():
            req = self.input_queue.get_nowait()
            self._handle_client_request(*req)


MluHijackObject.apply_hijack(EngineCore,
                             "get_mm_encoder_latency",
                             EngineCore_MluHijack.get_mm_encoder_latency)
MluHijackObject.apply_hijack(EngineCore,
                             "get_model_exec_latency",
                             EngineCore_MluHijack.get_model_exec_latency)
MluHijackObject.apply_hijack(EngineCore,
                             "get_hfu_info",
                             EngineCore_MluHijack.get_hfu_info)
MluHijackObject.apply_hijack(EngineCore,
                             "get_latency",
                             EngineCore_MluHijack.get_latency)
MluHijackObject.apply_hijack(EngineCore,
                             "get_memory_usage",
                             EngineCore_MluHijack.get_memory_usage)
MluHijackObject.apply_hijack(EngineCore,
                             "recapture_model",
                             EngineCore_MluHijack.recapture_model)
MluHijackObject.apply_hijack(EngineCore,
                             "init_metric",
                             EngineCore_MluHijack.init_metric)
MluHijackObject.apply_hijack(EngineCore,
                             "start_scheduler_profile",
                             EngineCore_MluHijack.start_scheduler_profile)
MluHijackObject.apply_hijack(EngineCore,
                             "stop_scheduler_profile",
                             EngineCore_MluHijack.stop_scheduler_profile)
MluHijackObject.apply_hijack(EngineCore,
                             EngineCore.__init__,
                             EngineCore_MluHijack.__init__)
MluHijackObject.apply_hijack(EngineCore,
                             EngineCore.step,
                             EngineCore_MluHijack.step)
MluHijackObject.apply_hijack(EngineCore,
                             "response_remote_alloc_once",
                             EngineCore_MluHijack.response_remote_alloc_once)
MluHijackObject.apply_hijack(EngineCore,
                             EngineCore.step_with_batch_queue,
                             EngineCore_MluHijack.step_with_batch_queue)
MluHijackObject.apply_hijack(EngineCoreProc,
                             EngineCoreProc.run_engine_core,
                             EngineCoreProc_MluHijack.run_engine_core)
MluHijackObject.apply_hijack(EngineCoreProc,
                             EngineCoreProc._process_input_queue,
                             EngineCoreProc_MluHijack._process_input_queue)
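
The hijacked EngineCore keeps the upstream step contract: step_fn (either step or step_with_batch_queue) returns the per-client EngineCoreOutputs plus a flag saying whether the model actually ran, and when the mlu_envs toggles VLLM_LATENCY_DEBUG_EN / VLLM_LATENCY_DEBUG_WITH_DEVICE_EN are set, per-step timings accumulate in step_latency, model_exec_latency and mm_encoder_latency. A rough sketch of how an offline benchmark driver might consume this, assuming direct in-process access to an already-constructed EngineCore with requests queued in its scheduler:

# Hypothetical offline-benchmark loop; not part of this diff.
def drain_and_report(engine_core):
    while engine_core.scheduler.has_requests():
        # outputs maps client index -> EngineCoreOutputs; executed is False when
        # nothing was scheduled in this step.
        outputs, executed = engine_core.step_fn()

    # Populated only when VLLM_LATENCY_DEBUG_EN was enabled.
    step_lat, exec_lat, mm_lat = engine_core.get_latency()
    if step_lat:
        print(f"avg step latency: {sum(step_lat) / len(step_lat):.4f}s over {len(step_lat)} steps")
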
vllm_mlu/v1/engine/core_client.py (new file)
@@ -0,0 +1,227 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

from vllm.v1.engine.core_client import (
    EngineCoreClient,
    InprocClient,
    MPClient,
    SyncMPClient,
    AsyncMPClient,
    DPAsyncMPClient,
    DPLBAsyncMPClient,
)
from vllm.v1.engine import EngineCoreRequest
from vllm.config import VllmConfig
from vllm.v1.executor import Executor

from vllm_mlu.mlu_hijack_utils import MluHijackObject

class EngineCoreClient_MluHijack(EngineCoreClient):

    @staticmethod
    def make_async_mp_client(
        vllm_config: VllmConfig,
        executor_class: type[Executor],
        log_stats: bool,
        client_addresses: dict[str, str] | None = None,
        client_count: int = 1,
        client_index: int = 0,
    ) -> "MPClient":
        parallel_config = vllm_config.parallel_config
        client_args = (
            vllm_config,
            executor_class,
            log_stats,
            client_addresses,
            client_count,
            client_index,
        )
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: disagg uses DPAsyncMPClient instead of DPLBAsyncMPClient.
        '''
        if parallel_config.data_parallel_size > 1:
            if parallel_config.data_parallel_external_lb or vllm_config.kv_transfer_config is not None:
                # External load balancer - client per DP rank.
                return DPAsyncMPClient(*client_args)
            # Internal load balancer - client balances to all DP ranks.
            return DPLBAsyncMPClient(*client_args)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        return AsyncMPClient(*client_args)


class InprocClient_MluHijack(InprocClient):

    def get_hfu_info(self, batch, input_len, output_len):
        return self.engine_core.get_hfu_info(batch, input_len, output_len)

    def get_latency(self):
        return self.engine_core.get_latency()

    def get_memory_usage(self):
        return self.engine_core.get_memory_usage()

    def recapture_model(
        self,
        prefill_enable_mlugraph: bool,
        batch_size: int,
        input_len: int,
    ):
        return self.engine_core.recapture_model(
            prefill_enable_mlugraph, batch_size, input_len
        )

    def init_metric(self, use_unchunk_sched: bool, min_prefill_batch: int):
        return self.engine_core.init_metric(
            use_unchunk_sched, min_prefill_batch,
        )

    def start_scheduler_profile(self):
        self.engine_core.start_scheduler_profile()

    def stop_scheduler_profile(self):
        self.engine_core.stop_scheduler_profile()

    def response_remote_alloc_once(self) -> None:
        self.engine_core.response_remote_alloc_once()


class SyncMPClient_MluHijack(SyncMPClient):

    def get_hfu_info(self, batch, input_len, output_len):
        try:
            return self.call_utility("get_hfu_info", batch, input_len, output_len)
        except Exception as e:
            raise RuntimeError(f"Failed to get HFU info: {str(e)}") from e

    def get_latency(self):
        return self.call_utility("get_latency")

    def get_memory_usage(self):
        return self.call_utility("get_memory_usage")

    def recapture_model(self,
                        prefill_enable_mlugraph: bool,
                        batch_size: int,
                        input_len: int):
        return self.call_utility("recapture_model",
                                 prefill_enable_mlugraph, batch_size, input_len)

    def init_metric(self, use_unchunk_sched: bool, min_prefill_batch: int):
        return self.call_utility("init_metric",
                                 use_unchunk_sched,
                                 min_prefill_batch)

    def start_scheduler_profile(self):
        self.call_utility("start_scheduler_profile")

    def stop_scheduler_profile(self):
        self.call_utility("stop_scheduler_profile")

    def response_remote_alloc_once(self) -> None:
        self.call_utility("response_remote_alloc_once")


class AsyncMPClient_MluHijack(AsyncMPClient):

    async def start_scheduler_profile(self) -> None:
        await self.call_utility_async("start_scheduler_profile")

    async def stop_scheduler_profile(self) -> None:
        await self.call_utility_async("stop_scheduler_profile")

    async def response_remote_alloc_once(self) -> None:
        await self.call_utility_async("response_remote_alloc_once")


class DPAsyncMPClient_MluHijack(DPAsyncMPClient):

    def get_core_engine_for_request(self, request: EngineCoreRequest):

        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: disagg needs the proxy to assign dp_rank
        '''
        if request.data_parallel_rank is not None:
            # engines are already in rank order
            return self.core_engines[request.data_parallel_rank]
        '''
        ==================
        End of MLU Hijack
        ==================
        '''

        return self.core_engine


MluHijackObject.apply_hijack(EngineCoreClient,
                             EngineCoreClient.make_async_mp_client,
                             EngineCoreClient_MluHijack.make_async_mp_client)
MluHijackObject.apply_hijack(InprocClient,
                             "get_hfu_info",
                             InprocClient_MluHijack.get_hfu_info)
MluHijackObject.apply_hijack(InprocClient,
                             "get_latency",
                             InprocClient_MluHijack.get_latency)
MluHijackObject.apply_hijack(InprocClient,
                             "get_memory_usage",
                             InprocClient_MluHijack.get_memory_usage)
MluHijackObject.apply_hijack(InprocClient,
                             "recapture_model",
                             InprocClient_MluHijack.recapture_model)
MluHijackObject.apply_hijack(InprocClient,
                             "init_metric",
                             InprocClient_MluHijack.init_metric)
MluHijackObject.apply_hijack(InprocClient,
                             "start_scheduler_profile",
                             InprocClient_MluHijack.start_scheduler_profile)
MluHijackObject.apply_hijack(InprocClient,
                             "stop_scheduler_profile",
                             InprocClient_MluHijack.stop_scheduler_profile)
MluHijackObject.apply_hijack(InprocClient,
                             "response_remote_alloc_once",
                             InprocClient_MluHijack.response_remote_alloc_once)
MluHijackObject.apply_hijack(SyncMPClient,
                             "get_hfu_info",
                             SyncMPClient_MluHijack.get_hfu_info)
MluHijackObject.apply_hijack(SyncMPClient,
                             "get_latency",
                             SyncMPClient_MluHijack.get_latency)
MluHijackObject.apply_hijack(SyncMPClient,
                             "get_memory_usage",
                             SyncMPClient_MluHijack.get_memory_usage)
MluHijackObject.apply_hijack(SyncMPClient,
                             "recapture_model",
                             SyncMPClient_MluHijack.recapture_model)
MluHijackObject.apply_hijack(SyncMPClient,
                             "init_metric",
                             SyncMPClient_MluHijack.init_metric)
MluHijackObject.apply_hijack(SyncMPClient,
                             "start_scheduler_profile",
                             SyncMPClient_MluHijack.start_scheduler_profile)
MluHijackObject.apply_hijack(SyncMPClient,
                             "stop_scheduler_profile",
                             SyncMPClient_MluHijack.stop_scheduler_profile)
MluHijackObject.apply_hijack(SyncMPClient,
                             "response_remote_alloc_once",
                             SyncMPClient_MluHijack.response_remote_alloc_once)
MluHijackObject.apply_hijack(AsyncMPClient,
                             "start_scheduler_profile",
                             AsyncMPClient_MluHijack.start_scheduler_profile)
MluHijackObject.apply_hijack(AsyncMPClient,
                             "stop_scheduler_profile",
                             AsyncMPClient_MluHijack.stop_scheduler_profile)
MluHijackObject.apply_hijack(AsyncMPClient,
                             "response_remote_alloc_once",
                             AsyncMPClient_MluHijack.response_remote_alloc_once)
MluHijackObject.apply_hijack(DPAsyncMPClient,
                             DPAsyncMPClient.get_core_engine_for_request,
                             DPAsyncMPClient_MluHijack.get_core_engine_for_request)
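
The client-selection change is the heart of the disaggregation support here: when a kv_transfer_config is present, every DP rank gets its own DPAsyncMPClient so an external proxy can pick the rank itself (see get_core_engine_for_request above), instead of letting DPLBAsyncMPClient balance requests internally. The decision reduces to the following standalone restatement of the hijacked make_async_mp_client logic (function and argument names invented for illustration):

def pick_client_kind(dp_size: int, external_lb: bool, has_kv_transfer: bool) -> str:
    # Mirrors the branch order of the hijacked make_async_mp_client above.
    if dp_size > 1:
        if external_lb or has_kv_transfer:
            # One client per DP rank; the proxy decides which rank serves a request.
            return "DPAsyncMPClient"
        # Internal load balancer spreads requests across all DP ranks.
        return "DPLBAsyncMPClient"
    return "AsyncMPClient"

assert pick_client_kind(dp_size=2, external_lb=False, has_kv_transfer=True) == "DPAsyncMPClient"
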
43
vllm_mlu/v1/engine/llm_engine.py
Normal file
43
vllm_mlu/v1/engine/llm_engine.py
Normal file
@@ -0,0 +1,43 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

from vllm.v1.engine.llm_engine import LLMEngine
from vllm_mlu.mlu_hijack_utils import MluHijackObject


def vllm__engine__llm_engine__LLMEngine__get_hfu_info(self, batch, input_len, output_len):
    return self.engine_core.get_hfu_info(batch, input_len, output_len)


def vllm__engine__llm_engine__LLMEngine__get_latency(self):
    return self.engine_core.get_latency()


def vllm__engine__llm_engine__LLMEngine__get_memory_usage(self):
    return self.engine_core.get_memory_usage()


def vllm__engine__llm_engine__LLMEngine__start_scheduler_profile(self):
    self.engine_core.start_scheduler_profile()


def vllm__engine__llm_engine__LLMEngine__stop_scheduler_profile(self):
    self.engine_core.stop_scheduler_profile()


MluHijackObject.apply_hijack(LLMEngine,
                             "get_hfu_info",
                             vllm__engine__llm_engine__LLMEngine__get_hfu_info)
MluHijackObject.apply_hijack(LLMEngine,
                             "get_latency",
                             vllm__engine__llm_engine__LLMEngine__get_latency)
MluHijackObject.apply_hijack(LLMEngine,
                             "get_memory_usage",
                             vllm__engine__llm_engine__LLMEngine__get_memory_usage)
MluHijackObject.apply_hijack(LLMEngine,
                             "start_scheduler_profile",
                             vllm__engine__llm_engine__LLMEngine__start_scheduler_profile)
MluHijackObject.apply_hijack(LLMEngine,
                             "stop_scheduler_profile",
                             vllm__engine__llm_engine__LLMEngine__stop_scheduler_profile)
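
Taken together, the LLMEngine hooks expose the MLU benchmark and profiling surface to offline users. A hedged usage sketch, assuming the vllm_mlu plugin is installed so the hijacks are applied at import time, and assuming the synchronous vllm.LLM wrapper exposes its engine as llm.llm_engine ("facebook/opt-125m" is only a placeholder model name):

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")

llm.llm_engine.start_scheduler_profile()
llm.generate(["hello"], SamplingParams(max_tokens=8))
llm.llm_engine.stop_scheduler_profile()

# Offline-benchmark helpers added by this commit (forwarded to EngineCore):
step_lat, exec_lat, mm_lat = llm.llm_engine.get_latency()
peak_mem, block_mem, num_gpu_blocks, num_cpu_blocks = llm.llm_engine.get_memory_usage()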