[Model] Support DeepSeek-V4

This commit adds one new file (79 lines):

vllm_mlu/mlu_hijack.py  (new file)

@@ -0,0 +1,79 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
|
||||
|
||||
import importlib.util
|
||||
from vllm_mlu._mlu_utils import *
|
||||
from vllm_mlu.logger import logger
|
||||
|
||||
|
||||
def is_module_available(module_name):
    """Return True if *module_name* is importable in the current environment.

    Uses ``importlib.util.find_spec`` so the module is only located, not
    imported — this avoids triggering the module's import-time side effects
    just to probe for its presence.

    Args:
        module_name: Dotted module name, e.g. ``'apex'`` or ``'pkg.sub'``.

    Returns:
        bool: True if an import spec was found, False otherwise.
    """
    try:
        return importlib.util.find_spec(module_name) is not None
    except (ModuleNotFoundError, ValueError):
        # find_spec raises ModuleNotFoundError when a parent package of a
        # dotted name is itself missing, and ValueError when an already-
        # imported module has __spec__ set to None. For an availability
        # probe both simply mean "not usable", so report False instead of
        # propagating.
        return False
|
||||
|
||||
def check_environ_compatibility():
    """Log an error for installed packages known to conflict with vLLM-MLU.

    Currently only probes for ``apex``: if it is importable, an error is
    logged advising the user to uninstall it because it may cause model
    accuracy issues. This function only logs — it never raises and returns
    nothing, so startup proceeds either way.
    """
    if is_module_available('apex'):
        # Plain string literals: the originals carried f-prefixes despite
        # containing no placeholders (needless f-strings); the message text
        # itself is unchanged.
        logger.error("The `apex` package is currently present in your environment, "
                     "which may cause model accuracy issues or other problems. It is "
                     "strongly recommended that you uninstall it before using vLLM.")
|
||||
|
||||
# Check environment compatibility first before applying mlu hijack.
|
||||
check_environ_compatibility()
|
||||
|
||||
logger.info(f"[MLU] Apply Monkey Patch.")
|
||||
|
||||
# Apply v1 hijack
|
||||
import vllm_mlu.v1.engine.core
|
||||
import vllm_mlu.v1.engine.core_client
|
||||
import vllm_mlu.v1.engine.llm_engine
|
||||
import vllm_mlu.v1.engine.async_llm
|
||||
import vllm_mlu.v1.core.sched.scheduler
|
||||
import vllm_mlu.v1.core.single_type_kv_cache_manager
|
||||
import vllm_mlu.v1.core.kv_cache_utils
|
||||
import vllm_mlu.v1.core.kv_cache_manager
|
||||
import vllm_mlu.v1.executor.abstract
|
||||
import vllm_mlu.v1.executor.ray_executor
|
||||
import vllm_mlu.v1.executor.multiproc_executor
|
||||
import vllm_mlu.v1.sample.rejection_sampler
|
||||
import vllm_mlu.v1.worker.lora_model_runner_mixin
|
||||
import vllm_mlu.v1.worker.block_table
|
||||
import vllm_mlu.v1.worker.gpu_input_batch
|
||||
import vllm_mlu.v1.worker.kv_connector_model_runner_mixin
|
||||
import vllm_mlu.v1.attention.backends.gdn_attn
|
||||
import vllm_mlu.v1.attention.backends.mla.flashmla
|
||||
import vllm_mlu.compilation.fix_functionalization
|
||||
|
||||
# Apply common hijack
|
||||
import vllm_mlu.attention.layer
|
||||
import vllm_mlu.benchmarks.datasets
|
||||
import vllm_mlu.config.model
|
||||
import vllm_mlu.config.scheduler
|
||||
import vllm_mlu.config.speculative
|
||||
import vllm_mlu.config.vllm
|
||||
import vllm_mlu.utils
|
||||
import vllm_mlu.distributed.parallel_state
|
||||
import vllm_mlu.distributed.kv_transfer.kv_connector.factory
|
||||
import vllm_mlu.engine.arg_utils
|
||||
import vllm_mlu.entrypoints.llm
|
||||
import vllm_mlu.lora.layers.base_linear
|
||||
import vllm_mlu.lora.layers.row_parallel_linear
|
||||
import vllm_mlu.lora.layers.column_parallel_linear
|
||||
import vllm_mlu.model_executor.parameter
|
||||
import vllm_mlu.model_executor.layers.linear
|
||||
import vllm_mlu.model_executor.layers.rotary_embedding
|
||||
import vllm_mlu.model_executor.layers.quantization.utils.w8a8_utils
|
||||
import vllm_mlu.model_executor.layers.quantization.fp8
|
||||
import vllm_mlu.model_executor.layers.activation
|
||||
import vllm_mlu.model_executor.layers.layernorm
|
||||
import vllm_mlu.model_executor.layers.fused_moe.layer
|
||||
import vllm_mlu.model_executor.model_loader.tensorizer_loader
|
||||
import vllm_mlu.model_executor.models.registry
|
||||
import vllm_mlu.model_executor.models.config
|
||||
import vllm_mlu.multimodal.utils
|
||||
if is_module_available('lmcache'):
|
||||
import vllm_mlu.distributed.kv_transfer.kv_connector.v1.lmcache_connector
|
||||
|
||||
if VLLM_CI_ACCURACY_TEST:
|
||||
import vllm_mlu.model_executor.model_loader.dummy_loader
|
||||
|
||||
if VLLM_SCHEDULER_PROFILE:
|
||||
import vllm_mlu.entrypoints.openai.api_server
|
||||
Reference in New Issue
Block a user