[Model] Support DeepSeek-V4
vllm_mlu/config/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
vllm_mlu/config/model.py (new file, 71 lines)
@@ -0,0 +1,71 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

from vllm.config.model import ModelConfig
from vllm.logger import init_logger

from vllm_mlu.mlu_hijack_utils import MluHijackObject

logger = init_logger(__name__)


def vllm__config__model__ModelConfig__is_embedding_task(self) -> bool:
    return self.runner_type == "pooling"


def vllm__config__model__ModelConfig__get_head_size(self) -> int:
    # TODO: remove the hard-coded cases below
    if self.is_deepseek_mla:
        qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim", 0)
        if self.use_mla:
            return self.hf_text_config.kv_lora_rank + qk_rope_head_dim
        else:
            qk_nope_head_dim = getattr(self.hf_text_config, "qk_nope_head_dim", 0)
            if qk_rope_head_dim and qk_nope_head_dim:
                return qk_rope_head_dim + qk_nope_head_dim

    if hasattr(self.hf_text_config, "model_type") and (
        self.hf_text_config.model_type == "zamba2"
    ):
        return self.hf_text_config.attention_head_dim

    if self.is_attention_free:
        return 0

    # NOTE: Some configs may set head_dim=None in the config
    if getattr(self.hf_text_config, "head_dim", None) is not None:
        return self.hf_text_config.head_dim

    # NOTE: Some models (such as PLaMo2.1) use `hidden_size_per_head`
    if getattr(self.hf_text_config, "hidden_size_per_head", None) is not None:
        return self.hf_text_config.hidden_size_per_head

    # FIXME(woosuk): This may not be true for all models.
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: adjust num_heads and num_attention_heads.
    '''
    if hasattr(self.hf_text_config, "num_heads"):
        num_attention_heads = self.hf_text_config.num_heads
    else:
        num_attention_heads = self.hf_text_config.num_attention_heads

    return self.hf_text_config.hidden_size // num_attention_heads
    '''
    ==================
    End of MLU Hijack
    ==================
    '''


MluHijackObject.apply_hijack(
    ModelConfig,
    "is_embedding_task",
    vllm__config__model__ModelConfig__is_embedding_task,
)
MluHijackObject.apply_hijack(
    ModelConfig,
    ModelConfig.get_head_size,
    vllm__config__model__ModelConfig__get_head_size,
)
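Note (editor): a minimal worked example of the two MLA branches in get_head_size above, using DeepSeek-V3-style dimensions for illustration (kv_lora_rank=512, qk_rope_head_dim=64, qk_nope_head_dim=128 come from the public DeepSeek-V3 config, not from this diff):

# Illustrative sketch only: a stand-in for hf_text_config with assumed
# DeepSeek-V3-style MLA dimensions; real values come from the HF model config.
from types import SimpleNamespace

cfg = SimpleNamespace(kv_lora_rank=512, qk_rope_head_dim=64, qk_nope_head_dim=128)

# MLA path: head size is the compressed KV latent plus the RoPE slice.
head_size_mla = cfg.kv_lora_rank + cfg.qk_rope_head_dim      # 512 + 64 = 576
# MHA fallback path: RoPE slice plus the non-RoPE slice.
head_size_mha = cfg.qk_rope_head_dim + cfg.qk_nope_head_dim  # 64 + 128 = 192
print(head_size_mla, head_size_mha)  # 576 192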
vllm_mlu/config/scheduler.py (new file, 86 lines)
@@ -0,0 +1,86 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project


from typing_extensions import Self

from vllm.config.scheduler import SchedulerConfig
from vllm.logger import init_logger

from vllm_mlu._mlu_utils import VLLM_V1_BENCHMARK
from vllm_mlu.mlu_hijack_utils import MluHijackObject

logger = init_logger(__name__)


def vllm__config__scheduler__SchedulerConfig__verify_max_model_len(
    self, max_model_len: int,
) -> Self:
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: this restriction is removed when VLLM_V1_BENCHMARK is set to True
    '''
    if not VLLM_V1_BENCHMARK:
        if (
            self.max_num_batched_tokens < max_model_len
            and not self.enable_chunked_prefill
        ):
            raise ValueError(
                f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
                f"smaller than max_model_len ({max_model_len}). "
                "This effectively limits the maximum sequence length to "
                "max_num_batched_tokens and makes vLLM reject longer "
                "sequences. Please increase max_num_batched_tokens or "
                "decrease max_model_len."
            )
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    if self.max_num_batched_tokens < self.max_num_seqs:
        raise ValueError(
            f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
            "be greater than or equal to max_num_seqs "
            f"({self.max_num_seqs})."
        )

    if self.max_num_batched_tokens > self.max_num_seqs * max_model_len:
        logger.warning(
            "max_num_batched_tokens (%d) exceeds max_num_seqs "
            "* max_model_len (%d). This may lead to unexpected behavior.",
            self.max_num_batched_tokens,
            self.max_num_seqs * max_model_len,
        )

    if self.max_num_partial_prefills > 1:
        if not self.enable_chunked_prefill:
            raise ValueError(
                "Chunked prefill must be enabled to set "
                "max_num_partial_prefills > 1."
            )

        if self.long_prefill_token_threshold > max_model_len:
            raise ValueError(
                "long_prefill_token_threshold "
                f"({self.long_prefill_token_threshold}) cannot be greater "
                f"than the max_model_len ({max_model_len})."
            )

        if self.max_long_partial_prefills > self.max_num_partial_prefills:
            raise ValueError(
                f"{self.max_long_partial_prefills=} must be less than or equal to "
                f"{self.max_num_partial_prefills=}."
            )

    return self


MluHijackObject.apply_hijack(
    SchedulerConfig,
    SchedulerConfig.verify_max_model_len,
    vllm__config__scheduler__SchedulerConfig__verify_max_model_len,
)
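Note (editor): a self-contained sketch of the relaxed check, with illustrative numbers; the names mirror the fields used above, and VLLM_V1_BENCHMARK stands in for the env-derived flag imported from vllm_mlu._mlu_utils:

# Sketch only: mirrors the condition above with assumed example values.
VLLM_V1_BENCHMARK = True  # in vllm_mlu this comes from an environment variable

max_num_batched_tokens, max_model_len, enable_chunked_prefill = 2048, 8192, False
would_raise = (
    not VLLM_V1_BENCHMARK
    and max_num_batched_tokens < max_model_len
    and not enable_chunked_prefill
)
print(would_raise)  # False: benchmark mode skips upstream's ValueError here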
vllm_mlu/config/speculative.py (new file, 66 lines)
@@ -0,0 +1,66 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

from vllm.config.parallel import ParallelConfig
from vllm.config.speculative import SpeculativeConfig
from vllm.logger import init_logger

from vllm_mlu.mlu_hijack_utils import MluHijackObject

logger = init_logger(__name__)


@staticmethod
def vllm__config__speculative__SpeculativeConfig__create_draft_parallel_config(
    target_parallel_config: ParallelConfig,
    speculative_draft_tensor_parallel_size: int,
) -> ParallelConfig:
    """Create a parallel config for use by the draft worker.

    This is mostly a copy of the target parallel config, except the tp_size.
    """
    '''
    =============================
    Modify by vllm_mlu
    @brief: add draft data parallel parameters
    =============================
    '''
    draft_parallel_config = ParallelConfig(
        pipeline_parallel_size=target_parallel_config.pipeline_parallel_size,
        tensor_parallel_size=speculative_draft_tensor_parallel_size,
        distributed_executor_backend=target_parallel_config.distributed_executor_backend,
        max_parallel_loading_workers=target_parallel_config.max_parallel_loading_workers,
        disable_custom_all_reduce=target_parallel_config.disable_custom_all_reduce,
        ray_workers_use_nsight=target_parallel_config.ray_workers_use_nsight,
        placement_group=target_parallel_config.placement_group,
        # add draft data parallel parameters
        data_parallel_size=target_parallel_config.data_parallel_size,
        data_parallel_size_local=target_parallel_config.data_parallel_size_local,
        data_parallel_master_ip=target_parallel_config.data_parallel_master_ip,
        data_parallel_rpc_port=target_parallel_config.data_parallel_rpc_port,
    )
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    return draft_parallel_config


vllm__config__speculative__SpeculativeConfig____post_init___org = (
    SpeculativeConfig.__post_init__
)


def vllm__config__speculative__SpeculativeConfig____post_init__(self):
    # Default to MTP when only num_speculative_tokens is given (no draft model).
    if (
        self.model is None
        and self.num_speculative_tokens is not None
        and self.method is None
    ):
        self.method = "mtp"
    vllm__config__speculative__SpeculativeConfig____post_init___org(self)


MluHijackObject.apply_hijack(
    SpeculativeConfig,
    SpeculativeConfig.create_draft_parallel_config,
    vllm__config__speculative__SpeculativeConfig__create_draft_parallel_config,
)
MluHijackObject.apply_hijack(
    SpeculativeConfig,
    SpeculativeConfig.__post_init__,
    vllm__config__speculative__SpeculativeConfig____post_init__,
)
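Note (editor): the net effect of the __post_init__ hijack, as a standalone sketch; the three values are illustrative stand-ins for the SpeculativeConfig fields read above:

# Sketch only: mirrors the defaulting logic above with assumed values.
model, num_speculative_tokens, method = None, 1, None
if model is None and num_speculative_tokens is not None and method is None:
    method = "mtp"  # draft-model-free speculation via multi-token prediction
print(method)  # "mtp"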
vllm_mlu/config/vllm.py (new file, 213 lines)
@@ -0,0 +1,213 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

import os

from vllm.config.vllm import VllmConfig
from vllm.config.compilation import CUDAGraphMode
from vllm.logger import init_logger

from vllm_mlu.mlu_hijack_utils import MluHijackObject

logger = init_logger(__name__)


def vllm__config__vllm__VllmConfig___set_cudagraph_sizes(self):
    """
    vLLM defines the default candidate list of batch sizes for CUDA graph
    capture as:

    ```python
    max_graph_size = min(max_num_seqs * 2, 512)
    # 1, 2, 4, then multiples of 8 up to 256 and then multiples of 16
    # up to max_graph_size
    cuda_graph_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list(
        range(256, max_graph_size + 1, 16))
    ```

    In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
    will be the final sizes to capture cudagraph (in ascending order).

    These sizes are used to capture and reuse CUDA graphs for
    performance-critical paths (e.g., decoding). Capturing enables
    significantly faster kernel dispatch by avoiding Python overhead. The
    list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on
    most GPUs), which controls the total allowed number of tokens in a
    batch. Since each sequence may have a variable number of tokens, the
    maximum usable batch size will depend on actual sequence lengths.

    Example:
        With `max_num_batched_tokens = 8192`, and typical sequences
        averaging ~32 tokens, most practical batch sizes fall below 256.
        However, the system will still allow capture sizes up to 512 if
        shape and memory permit.

    Note:
        If users explicitly specify cudagraph capture sizes in the
        compilation config, those will override this default logic.

    At runtime:

    - If batch size <= one of the `cudagraph_capture_sizes`, the closest
      padded CUDA graph will be used.
    - If batch size > largest `cudagraph_capture_sizes`, cudagraph will
      not be used.
    """
    if hasattr(self.compilation_config, "_has_set_capture_list"):
        # avoid setting the capture list twice during init
        return

    if (
        self.model_config is not None
        and not self.model_config.enforce_eager
        and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
    ):
        # determine the initial max_cudagraph_capture_size
        max_cudagraph_capture_size = (
            self.compilation_config.max_cudagraph_capture_size
        )
        if max_cudagraph_capture_size is None:
            max_cudagraph_capture_size = min(
                self.scheduler_config.max_num_seqs * 2, 512
            )
        max_num_tokens = self.scheduler_config.max_num_batched_tokens
        max_cudagraph_capture_size = min(max_num_tokens, max_cudagraph_capture_size)

        assert max_cudagraph_capture_size >= 1, (
            "Maximum cudagraph size should be greater than or equal to 1 "
            "when using cuda graph."
        )

        # determine the cudagraph_capture_sizes
        if self.compilation_config.cudagraph_capture_sizes is not None:
            assert len(self.compilation_config.cudagraph_capture_sizes) > 0, (
                "cudagraph_capture_sizes should contain at least one element "
                "when using cuda graph."
            )
            # de-duplicate the sizes provided by the config
            dedup_sizes = list(set(self.compilation_config.cudagraph_capture_sizes))
            cudagraph_capture_sizes = [
                i for i in dedup_sizes if i <= max_num_tokens
            ]
            # sort to make sure the sizes are in ascending order
            cudagraph_capture_sizes.sort()
        else:
            cudagraph_capture_sizes = [
                i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
            ]
            if max_cudagraph_capture_size >= 8:
                # step size 8 for small batch sizes, up to 256 (exclusive)
                cudagraph_capture_sizes += list(
                    range(8, min(max_cudagraph_capture_size + 1, 256), 8)
                )
            if max_cudagraph_capture_size >= 256:
                # step size 16 for larger batch sizes
                cudagraph_capture_sizes += list(
                    range(256, max_cudagraph_capture_size + 1, 16)
                )

        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief:
        1) check batch_size_capture_list when MTP is enabled, because
           bs * (K + 1) may be greater than max_num_batched_tokens
        2) capture MLUGraph for a user-given batch list
        '''
        mlu_graph_capture_list = os.getenv("MLU_GRAPH_CAPTURE_LIST", None)
        if mlu_graph_capture_list:
            if "-" in mlu_graph_capture_list:
                batch_info = mlu_graph_capture_list.split("-")
                assert len(batch_info) == 3, (
                    f"Got invalid graph_capture_list={mlu_graph_capture_list}, "
                    "but expected format 'min_bs-max_bs(exclusive)-step'."
                )
                start, end, step = batch_info
                cudagraph_capture_sizes = [1, 2, 4] + [
                    i for i in range(int(start), int(end), int(step))
                ]
                cudagraph_capture_sizes = sorted(set(cudagraph_capture_sizes))
            else:
                cudagraph_capture_sizes = [
                    int(x) for x in mlu_graph_capture_list.split(",")
                ]

        if (
            self.speculative_config is not None
            and self.speculative_config.num_speculative_tokens > 0
        ):
            K = self.speculative_config.num_speculative_tokens
            cudagraph_capture_sizes = [x * (1 + K) for x in cudagraph_capture_sizes]

        cudagraph_capture_sizes = [
            size for size in cudagraph_capture_sizes
            if size <= self.scheduler_config.max_num_batched_tokens
        ]
        '''
        ==================
        End of MLU Hijack
        ==================
        '''

        if (
            self.parallel_config.tensor_parallel_size > 1
            and self.compilation_config.pass_config.enable_sequence_parallelism
        ):
            cudagraph_capture_sizes = self.update_sizes_for_sequence_parallelism(
                cudagraph_capture_sizes
            )

        # a user-specified compilation_config.max_cudagraph_capture_size gets
        # truncated to valid_max_size when they are inconsistent.
        valid_max_size = (
            cudagraph_capture_sizes[-1] if cudagraph_capture_sizes else 0
        )
        if (
            self.compilation_config.max_cudagraph_capture_size is not None
            and self.compilation_config.max_cudagraph_capture_size != valid_max_size
        ):
            # raise an error only when both flags are user-specified
            # and inconsistent with each other
            if self.compilation_config.cudagraph_capture_sizes is not None:
                raise ValueError(
                    "customized max_cudagraph_capture_size "
                    f"(={self.compilation_config.max_cudagraph_capture_size}) "
                    "should be consistent with the max value of "
                    f"cudagraph_capture_sizes (={valid_max_size})"
                )

            logger.warning(
                "Truncating max_cudagraph_capture_size to %d",
                valid_max_size,
            )
        # always set the final max_cudagraph_capture_size
        self.compilation_config.max_cudagraph_capture_size = valid_max_size

        if self.compilation_config.cudagraph_capture_sizes is not None and len(
            cudagraph_capture_sizes
        ) < len(self.compilation_config.cudagraph_capture_sizes):
            # If users have specified capture sizes, we only need to
            # compare the lengths before and after modification, since the
            # modified list is a subset of the original list.
            logger.warning(
                "cudagraph_capture_sizes specified in compilation_config"
                " %s is overridden by config %s",
                self.compilation_config.cudagraph_capture_sizes,
                cudagraph_capture_sizes,
            )
        # always write back the final sizes
        self.compilation_config.cudagraph_capture_sizes = cudagraph_capture_sizes

    else:
        # no cudagraph in use
        self.compilation_config.max_cudagraph_capture_size = 0
        self.compilation_config.cudagraph_capture_sizes = []

    # complete the remaining process.
    self.compilation_config.post_init_cudagraph_sizes()

    setattr(self.compilation_config, "_has_set_capture_list", True)


MluHijackObject.apply_hijack(
    VllmConfig,
    VllmConfig._set_cudagraph_sizes,
    vllm__config__vllm__VllmConfig___set_cudagraph_sizes,
)
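Note (editor): a standalone sketch of how the MLU_GRAPH_CAPTURE_LIST value is interpreted, and how speculative decoding then scales the list; the helper name parse_capture_list is hypothetical, but the logic mirrors the hijack above:

# Sketch only: hypothetical helper mirroring the parsing logic above.
def parse_capture_list(spec: str) -> list[int]:
    if "-" in spec:
        # "min_bs-max_bs(exclusive)-step", merged with the [1, 2, 4] prefix
        start, end, step = (int(x) for x in spec.split("-"))
        return sorted({1, 2, 4, *range(start, end, step)})
    # otherwise an explicit comma-separated list of batch sizes
    return [int(x) for x in spec.split(",")]

print(parse_capture_list("8-65-8"))  # [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64]
print(parse_capture_list("1,8,32"))  # [1, 8, 32]

# With MTP enabled (num_speculative_tokens = K), each size is scaled to
# bs * (1 + K) so decode batches that carry draft tokens still hit a graph,
# then re-filtered against max_num_batched_tokens.
K = 1
print([x * (1 + K) for x in [1, 2, 4, 8]])  # [2, 4, 8, 16]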