[Model] Support DeepSeek-V4

2026-04-24 09:50:34 +08:00
commit b9925203b8
172 changed files with 44780 additions and 0 deletions
--- a/vllm_mlu/v1/engine/core_client.py
+++ b/vllm_mlu/v1/engine/core_client.py
@@ -0,0 +1,227 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
+
+# SPDX-License-Identifier: Apache-2.0
+from vllm.v1.engine.core_client import (
+    EngineCoreClient,
+    InprocClient,
+    SyncMPClient,
+    AsyncMPClient,
+    DPAsyncMPClient,
+    DPLBAsyncMPClient,
+)
+from vllm.v1.engine import EngineCoreRequest
+from vllm.config import VllmConfig
+from vllm.v1.executor import Executor
+
+from vllm_mlu.mlu_hijack_utils import MluHijackObject
+
+class EngineCoreClient_MluHiack(EngineCoreClient):
+    
+    @staticmethod
+    def make_async_mp_client(
+        vllm_config: VllmConfig,
+        executor_class: type[Executor],
+        log_stats: bool,
+        client_addresses: dict[str, str] | None = None,
+        client_count: int = 1,
+        client_index: int = 0,
+    ) -> "MPClient":
+        parallel_config = vllm_config.parallel_config
+        client_args = (
+            vllm_config,
+            executor_class,
+            log_stats,
+            client_addresses,
+            client_count,
+            client_index,
+        )
+        '''
+        =============================
+        Modify by vllm_mlu
+        =============================
+        @brief: disagg use DPAsyncMPClient instead of DPLBAsyncMPClient.
+        '''
+        if parallel_config.data_parallel_size > 1:
+            if parallel_config.data_parallel_external_lb or vllm_config.kv_transfer_config is not None:
+                # External load balancer - client per DP rank.
+                return DPAsyncMPClient(*client_args)
+            # Internal load balancer - client balances to all DP ranks.
+            return DPLBAsyncMPClient(*client_args)
+        '''
+        ==================
+        End of MLU Hijack
+        ==================
+        '''
+        return AsyncMPClient(*client_args)
+
+
+class InprocClient_MluHiack(InprocClient):
+
+    def get_hfu_info(self, batch, input_len, output_len):
+        return self.engine_core.get_hfu_info(batch, input_len, output_len)
+
+    def get_latency(self):
+        return self.engine_core.get_latency()
+
+    def get_memory_usage(self):
+        return self.engine_core.get_memory_usage()
+
+    def recapture_model(
+        self,
+        prefill_enable_mlugraph: bool,
+        batch_size: int,
+        input_len: int,
+    ):
+        return self.engine_core.recapture_model(
+            prefill_enable_mlugraph, batch_size, input_len
+        )
+
+    def init_metric(self, use_unchunk_sched: bool, min_prefill_batch: int):
+        return self.engine_core.init_metric(
+            use_unchunk_sched, min_prefill_batch,
+        )
+
+    def start_scheduler_profile(self):
+        self.engine_core.start_scheduler_profile()
+
+    def stop_scheduler_profile(self):
+        self.engine_core.stop_scheduler_profile()
+        
+    def response_remote_alloc_once(self) -> None:
+        self.engine_core.response_remote_alloc_once()
+
+
+class SyncMPClient_MluHiack(SyncMPClient):
+
+    def get_hfu_info(self, batch, input_len, output_len):
+        try:
+            return self.call_utility("get_hfu_info", batch, input_len, output_len)
+        except Exception as e:
+            raise RuntimeError(f"Failed to get HFU info: {str(e)}")
+
+    def get_latency(self):
+        return self.call_utility("get_latency")
+
+    def get_memory_usage(self):
+        return self.call_utility("get_memory_usage")
+
+    def recapture_model(self,
+                        prefill_enable_mlugraph: bool,
+                        batch_size: int,
+                        input_len: int):
+        return self.call_utility("recapture_model",
+                                 prefill_enable_mlugraph, batch_size, input_len)
+
+    def init_metric(self, use_unchunk_sched: bool, min_prefill_batch: int):
+        return self.call_utility("init_metric",
+                                 use_unchunk_sched,
+                                 min_prefill_batch)
+
+    def start_scheduler_profile(self):
+        self.call_utility("start_scheduler_profile")
+
+    def stop_scheduler_profile(self):
+        self.call_utility("stop_scheduler_profile")
+        
+    def response_remote_alloc_once(self) -> None:
+        self.call_utility("response_remote_alloc_once")
+
+
+class AsyncMPClient_MluHijack(AsyncMPClient):
+
+    async def start_scheduler_profile(self) -> None:
+        await self.call_utility_async("start_scheduler_profile")
+
+    async def stop_scheduler_profile(self) -> None:
+        await self.call_utility_async("stop_scheduler_profile")
+        
+    async def response_remote_alloc_once(self) -> None:
+        await self.call_utility_async("response_remote_alloc_once")
+        
+        
+class DPAsyncMPClient_MluHijack(DPAsyncMPClient):
+
+    def get_core_engine_for_request(self, request: EngineCoreRequest):
+        
+        '''
+        =============================
+        Modify by vllm_mlu
+        =============================
+        @brief: disagg need proxy to assign dp_rank
+        '''
+        if request.data_parallel_rank is not None:
+            # engines are already in rank order
+            return self.core_engines[request.data_parallel_rank]
+        '''
+        ==================
+        End of MLU Hijack
+        ==================
+        '''
+        
+        return self.core_engine
+
+
+MluHijackObject.apply_hijack(EngineCoreClient,
+                             EngineCoreClient.make_async_mp_client,
+                             EngineCoreClient_MluHiack.make_async_mp_client)
+MluHijackObject.apply_hijack(InprocClient,
+                             "get_hfu_info",
+                             InprocClient_MluHiack.get_hfu_info)
+MluHijackObject.apply_hijack(InprocClient,
+                             "get_latency",
+                             InprocClient_MluHiack.get_latency)
+MluHijackObject.apply_hijack(InprocClient,
+                             "get_memory_usage",
+                             InprocClient_MluHiack.get_memory_usage)
+MluHijackObject.apply_hijack(InprocClient,
+                             "recapture_model",
+                             InprocClient_MluHiack.recapture_model)
+MluHijackObject.apply_hijack(InprocClient,
+                             "init_metric",
+                             InprocClient_MluHiack.init_metric)
+MluHijackObject.apply_hijack(InprocClient,
+                             "start_scheduler_profile",
+                             InprocClient_MluHiack.start_scheduler_profile)
+MluHijackObject.apply_hijack(InprocClient,
+                             "stop_scheduler_profile",
+                             InprocClient_MluHiack.stop_scheduler_profile)
+MluHijackObject.apply_hijack(InprocClient,
+                             "response_remote_alloc_once",
+                             InprocClient_MluHiack.response_remote_alloc_once)
+MluHijackObject.apply_hijack(SyncMPClient,
+                             "get_hfu_info",
+                             SyncMPClient_MluHiack.get_hfu_info)
+MluHijackObject.apply_hijack(SyncMPClient,
+                             "get_latency",
+                             SyncMPClient_MluHiack.get_latency)
+MluHijackObject.apply_hijack(SyncMPClient,
+                             "get_memory_usage",
+                             SyncMPClient_MluHiack.get_memory_usage)
+MluHijackObject.apply_hijack(SyncMPClient,
+                             "recapture_model",
+                             SyncMPClient_MluHiack.recapture_model)
+MluHijackObject.apply_hijack(SyncMPClient,
+                             "init_metric",
+                             SyncMPClient_MluHiack.init_metric)
+MluHijackObject.apply_hijack(SyncMPClient,
+                             "start_scheduler_profile",
+                             SyncMPClient_MluHiack.start_scheduler_profile)
+MluHijackObject.apply_hijack(SyncMPClient,
+                             "stop_scheduler_profile",
+                             SyncMPClient_MluHiack.stop_scheduler_profile)
+MluHijackObject.apply_hijack(SyncMPClient,
+                             "response_remote_alloc_once",
+                             SyncMPClient_MluHiack.response_remote_alloc_once)
+MluHijackObject.apply_hijack(AsyncMPClient,
+                             "start_scheduler_profile",
+                             AsyncMPClient_MluHijack.start_scheduler_profile)
+MluHijackObject.apply_hijack(AsyncMPClient,
+                             "stop_scheduler_profile",
+                             AsyncMPClient_MluHijack.stop_scheduler_profile)
+MluHijackObject.apply_hijack(AsyncMPClient,
+                             "response_remote_alloc_once",
+                             AsyncMPClient_MluHijack.response_remote_alloc_once)
+MluHijackObject.apply_hijack(DPAsyncMPClient,
+                             DPAsyncMPClient.get_core_engine_for_request,
+                             DPAsyncMPClient_MluHijack.get_core_engine_for_request)