[Model] Support DeepSeek-V4

chenxb002
2026-04-24 09:50:34 +08:00
commit b9925203b8
172 changed files with 44780 additions and 0 deletions


@@ -0,0 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

vllm_mlu/config/model.py (new file, 71 lines)

@@ -0,0 +1,71 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
from vllm.config.model import ModelConfig
from vllm.logger import init_logger
from vllm_mlu.mlu_hijack_utils import MluHijackObject
logger = init_logger(__name__)
def vllm__config__model__ModelConfig__is_embedding_task(self) -> bool:
return self.runner_type == "pooling"
def vllm__config__model__ModelConfig__get_head_size(self) -> int:
# TODO: remove hard-coded per-model special cases
if self.is_deepseek_mla:
qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim", 0)
if self.use_mla:
return self.hf_text_config.kv_lora_rank + qk_rope_head_dim
else:
qk_nope_head_dim = getattr(self.hf_text_config, "qk_nope_head_dim", 0)
if qk_rope_head_dim and qk_nope_head_dim:
return qk_rope_head_dim + qk_nope_head_dim
if hasattr(self.hf_text_config, "model_type") and (
self.hf_text_config.model_type == "zamba2"
):
return self.hf_text_config.attention_head_dim
if self.is_attention_free:
return 0
# NOTE: Some configs may set head_dim=None in the config
if getattr(self.hf_text_config, "head_dim", None) is not None:
return self.hf_text_config.head_dim
# NOTE: Some models (such as PLaMo2.1) use `hidden_size_per_head`
if getattr(self.hf_text_config, "hidden_size_per_head", None) is not None:
return self.hf_text_config.hidden_size_per_head
# FIXME(woosuk): This may not be true for all models.
'''
=============================
Modify by vllm_mlu
=============================
@brief: prefer num_heads over num_attention_heads when computing the head size.
'''
if hasattr(self.hf_text_config, "num_heads"):
num_attention_heads = self.hf_text_config.num_heads
else:
num_attention_heads = self.hf_text_config.num_attention_heads
return (self.hf_text_config.hidden_size // num_attention_heads)
'''
==================
End of MLU Hijack
==================
'''
MluHijackObject.apply_hijack(
ModelConfig,
"is_embedding_task",
vllm__config__model__ModelConfig__is_embedding_task,
)
MluHijackObject.apply_hijack(
ModelConfig,
ModelConfig.get_head_size,
vllm__config__model__ModelConfig__get_head_size,
)
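
For reference, a minimal sketch of the head-size resolution implemented above. The config values (kv_lora_rank=512, qk_rope_head_dim=64, qk_nope_head_dim=128, hidden_size=4096, num_attention_heads=32) are illustrative numbers, not taken from this commit:

```python
# Minimal sketch of the head-size logic above; all values are illustrative.
from types import SimpleNamespace

def head_size(cfg, is_deepseek_mla, use_mla):
    if is_deepseek_mla:
        qk_rope = getattr(cfg, "qk_rope_head_dim", 0)
        if use_mla:
            # MLA path: compressed KV rank plus the rotary component.
            return cfg.kv_lora_rank + qk_rope
        qk_nope = getattr(cfg, "qk_nope_head_dim", 0)
        if qk_rope and qk_nope:
            return qk_rope + qk_nope
    # Fallback: hidden_size / heads, preferring num_heads when present.
    heads = getattr(cfg, "num_heads", None) or cfg.num_attention_heads
    return cfg.hidden_size // heads

cfg = SimpleNamespace(kv_lora_rank=512, qk_rope_head_dim=64, qk_nope_head_dim=128,
                      hidden_size=4096, num_attention_heads=32)
print(head_size(cfg, is_deepseek_mla=True, use_mla=True))    # 576
print(head_size(cfg, is_deepseek_mla=True, use_mla=False))   # 192
print(head_size(cfg, is_deepseek_mla=False, use_mla=False))  # 128
```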


@@ -0,0 +1,86 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
from typing_extensions import Self
from vllm.config.scheduler import SchedulerConfig
from vllm.logger import init_logger
from vllm_mlu._mlu_utils import VLLM_V1_BENCHMARK
from vllm_mlu.mlu_hijack_utils import MluHijackObject
logger = init_logger(__name__)
def vllm__config__scheduler__SchedulerConfig__verify_max_model_len(
self, max_model_len: int,
) -> Self:
'''
=============================
Modify by vllm_mlu
=============================
@brief: This restriction is removed when VLLM_V1_BENCHMARK is set to True
'''
if not VLLM_V1_BENCHMARK:
if (
self.max_num_batched_tokens < max_model_len
and not self.enable_chunked_prefill
):
raise ValueError(
f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
f"smaller than max_model_len ({max_model_len}). "
"This effectively limits the maximum sequence length to "
"max_num_batched_tokens and makes vLLM reject longer "
"sequences. Please increase max_num_batched_tokens or "
"decrease max_model_len."
)
'''
==================
End of MLU Hijack
==================
'''
if self.max_num_batched_tokens < self.max_num_seqs:
raise ValueError(
f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
"be greater than or equal to max_num_seqs "
f"({self.max_num_seqs})."
)
if self.max_num_batched_tokens > self.max_num_seqs * max_model_len:
logger.warning(
"max_num_batched_tokens (%d) exceeds max_num_seqs "
"* max_model_len (%d). This may lead to unexpected behavior.",
self.max_num_batched_tokens,
self.max_num_seqs * max_model_len,
)
if self.max_num_partial_prefills > 1:
if not self.enable_chunked_prefill:
raise ValueError(
"Chunked prefill must be enabled to set "
"max_num_partial_prefills > 1."
)
if self.long_prefill_token_threshold > max_model_len:
raise ValueError(
"long_prefill_token_threshold "
f"({self.long_prefill_token_threshold}) cannot be greater "
f"than the max_model_len ({max_model_len})."
)
if self.max_long_partial_prefills > self.max_num_partial_prefills:
raise ValueError(
f"{self.max_long_partial_prefills=} must be less than or equal to "
f"{self.max_num_partial_prefills=}."
)
return self
MluHijackObject.apply_hijack(
SchedulerConfig,
SchedulerConfig.verify_max_model_len,
vllm__config__scheduler__SchedulerConfig__verify_max_model_len,
)
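
A standalone sketch of the gated check above (the function name and numbers are illustrative, not the vLLM-MLU API): in normal mode, a max_num_batched_tokens smaller than max_model_len without chunked prefill is rejected, while the benchmark flag skips that first check and keeps the remaining validations.

```python
# Illustrative reproduction of the benchmark-gated length check; not the real API.
def verify_lengths(max_num_batched_tokens, max_num_seqs, max_model_len,
                   enable_chunked_prefill, benchmark_mode=False):
    if not benchmark_mode:
        if max_num_batched_tokens < max_model_len and not enable_chunked_prefill:
            raise ValueError("max_num_batched_tokens is smaller than max_model_len")
    if max_num_batched_tokens < max_num_seqs:
        raise ValueError("max_num_batched_tokens must be >= max_num_seqs")

try:
    verify_lengths(4096, 128, 8192, enable_chunked_prefill=False)
except ValueError as err:
    print("rejected:", err)

# The same arguments pass once the benchmark flag lifts the restriction.
verify_lengths(4096, 128, 8192, enable_chunked_prefill=False, benchmark_mode=True)
print("accepted under benchmark mode")
```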


@@ -0,0 +1,66 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
from vllm.config.parallel import ParallelConfig
from vllm.config.speculative import SpeculativeConfig
from vllm.logger import init_logger
from vllm_mlu.mlu_hijack_utils import MluHijackObject
logger = init_logger(__name__)
@staticmethod
def vllm__config__speculative__SpeculativeConfig__create_draft_parallel_config(
target_parallel_config: ParallelConfig,
speculative_draft_tensor_parallel_size: int,
) -> ParallelConfig:
"""Create a parallel config for use by the draft worker.
This is mostly a copy of the target parallel config, except the tp_size.
"""
'''
=============================
Modify by vllm_mlu
@brief: add draft data parallel parameters
=============================
'''
draft_parallel_config = ParallelConfig(
pipeline_parallel_size=target_parallel_config.pipeline_parallel_size,
tensor_parallel_size=speculative_draft_tensor_parallel_size,
distributed_executor_backend=target_parallel_config.distributed_executor_backend,
max_parallel_loading_workers=target_parallel_config.max_parallel_loading_workers,
disable_custom_all_reduce=target_parallel_config.disable_custom_all_reduce,
ray_workers_use_nsight=target_parallel_config.ray_workers_use_nsight,
placement_group=target_parallel_config.placement_group,
# add draft data parallel parameters
data_parallel_size=target_parallel_config.data_parallel_size,
data_parallel_size_local=target_parallel_config.data_parallel_size_local,
data_parallel_master_ip=target_parallel_config.data_parallel_master_ip,
data_parallel_rpc_port=target_parallel_config.data_parallel_rpc_port,
)
'''
==================
End of MLU Hijack
==================
'''
return draft_parallel_config
vllm__config__speculative__SpeculativeConfig____post_init___org = SpeculativeConfig.__post_init__
def vllm__config__speculative__SpeculativeConfig____post_init__(self):
if self.model is None and self.num_speculative_tokens is not None and self.method is None:
self.method = "mtp"
vllm__config__speculative__SpeculativeConfig____post_init___org(self)
MluHijackObject.apply_hijack(
SpeculativeConfig,
SpeculativeConfig.create_draft_parallel_config,
vllm__config__speculative__SpeculativeConfig__create_draft_parallel_config,
)
MluHijackObject.apply_hijack(
SpeculativeConfig,
SpeculativeConfig.__post_init__,
vllm__config__speculative__SpeculativeConfig____post_init__,
)
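
The `__post_init__` hijack above uses a wrap-and-delegate pattern: the original initializer is saved and called after the MLU-specific default (`method = "mtp"`) is filled in. A minimal sketch of that pattern, with a hypothetical `Config` class standing in for `SpeculativeConfig`:

```python
# Illustrative wrap-and-delegate patching; Config is a hypothetical stand-in.
class Config:
    def __init__(self, model=None, num_speculative_tokens=None, method=None):
        self.model = model
        self.num_speculative_tokens = num_speculative_tokens
        self.method = method
        self.post_init()

    def post_init(self):
        print(f"validated with method={self.method}")

_original_post_init = Config.post_init

def patched_post_init(self):
    # Default to MTP when only the number of speculative tokens is given.
    if self.model is None and self.num_speculative_tokens is not None \
            and self.method is None:
        self.method = "mtp"
    _original_post_init(self)

Config.post_init = patched_post_init
Config(num_speculative_tokens=1)  # prints: validated with method=mtp
```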

vllm_mlu/config/vllm.py (new file, 213 lines)

@@ -0,0 +1,213 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
import os
from vllm.config.vllm import VllmConfig
from vllm.config.compilation import CUDAGraphMode
from vllm.logger import init_logger
from vllm_mlu.mlu_hijack_utils import MluHijackObject
logger = init_logger(__name__)
def vllm__config__vllm__VllmConfig___set_cudagraph_sizes(self):
"""
vLLM defines the default candidate list of batch sizes for CUDA graph
capture as:
```python
max_graph_size = min(max_num_seqs * 2, 512)
# 1, 2, 4, then multiples of 8 up to 256 and then multiples of 16
# up to max_graph_size
cuda_graph_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list(
    range(256, max_graph_size + 1, 16))
```
In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
will be the final sizes to capture cudagraph (in ascending order).
These sizes are used to capture and reuse CUDA graphs for
performance-critical paths (e.g., decoding). Capturing enables
significantly faster kernel dispatch by avoiding Python overhead. The
list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on
most GPUs), which controls the total allowed number of tokens in a
batch. Since each sequence may have a variable number of tokens, the
maximum usable batch size will depend on actual sequence lengths.
Example:
With `max_num_batched_tokens = 8192`, and typical sequences
averaging ~32 tokens, most practical batch sizes fall below 256.
However, the system will still allow capture sizes up to 512 if
shape and memory permit.
Note:
If users explicitly specify cudagraph capture sizes in the
compilation config, those will override this default logic.
At runtime:
- If batch size <= one of the `cudagraph_capture_sizes`, the closest
padded CUDA graph will be used.
- If batch size > largest `cudagraph_capture_sizes`, cudagraph will
not be used.
"""
if hasattr(self.compilation_config, "_has_set_capture_list"):
# avoid setting the capture list twice during init
return
if (
self.model_config is not None
and not self.model_config.enforce_eager
and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
):
# determine the initial max_cudagraph_capture_size
max_cudagraph_capture_size = (
self.compilation_config.max_cudagraph_capture_size
)
if max_cudagraph_capture_size is None:
max_cudagraph_capture_size = min(
self.scheduler_config.max_num_seqs * 2, 512
)
max_num_tokens = self.scheduler_config.max_num_batched_tokens
max_cudagraph_capture_size = min(max_num_tokens, max_cudagraph_capture_size)
assert max_cudagraph_capture_size >= 1, (
"Maximum cudagraph size should be greater than or equal to 1 "
"when using cuda graph."
)
# determine the cudagraph_capture_sizes
if self.compilation_config.cudagraph_capture_sizes is not None:
assert len(self.compilation_config.cudagraph_capture_sizes) > 0, (
"cudagraph_capture_sizes should contain at least one element "
"when using cuda graph."
)
# de-duplicate the sizes provided by the config
dedup_sizes = list(set(self.compilation_config.cudagraph_capture_sizes))
cudagraph_capture_sizes = [
i for i in dedup_sizes if i <= max_num_tokens
]
# sort to make sure the sizes are in ascending order
cudagraph_capture_sizes.sort()
else:
cudagraph_capture_sizes = [
i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
]
if max_cudagraph_capture_size >= 8:
# Step size 8 for small batch sizes, up to 256 (exclusive)
cudagraph_capture_sizes += list(
range(8, min(max_cudagraph_capture_size + 1, 256), 8)
)
if max_cudagraph_capture_size >= 256:
# Step size 16 for larger batch sizes
cudagraph_capture_sizes += list(
range(256, max_cudagraph_capture_size + 1, 16)
)
'''
=============================
Modify by vllm_mlu
=============================
@brief:
1) check the capture list when MTP is enabled, because bs * (K + 1)
may exceed max_num_batched_tokens
2) capture MLUGraph with a user-provided batch-size list
'''
mlu_graph_capture_list = os.getenv("MLU_GRAPH_CAPTURE_LIST", None)
if mlu_graph_capture_list:
if "-" in mlu_graph_capture_list:
batch_info = mlu_graph_capture_list.split("-")
assert len(batch_info) == 3, (
    f"Got invalid graph_capture_list={mlu_graph_capture_list}, "
    "expected format 'min_bs-max_bs(exclusive)-step'."
)
start, end, step = batch_info
cudagraph_capture_sizes = [1, 2, 4] + [
i for i in range(int(start), int(end), int(step))
]
cudagraph_capture_sizes = sorted(list(set(cudagraph_capture_sizes)))
else:
cudagraph_capture_sizes = [int(x) for x in mlu_graph_capture_list.split(",")]
if (self.speculative_config is not None
and self.speculative_config.num_speculative_tokens > 0
):
K = self.speculative_config.num_speculative_tokens
cudagraph_capture_sizes = [x * (1 + K) for x in cudagraph_capture_sizes]
cudagraph_capture_sizes = [
size for size in cudagraph_capture_sizes
if size <= self.scheduler_config.max_num_batched_tokens
]
'''
==================
End of MLU Hijack
==================
'''
if (
self.parallel_config.tensor_parallel_size > 1
and self.compilation_config.pass_config.enable_sequence_parallelism
):
cudagraph_capture_sizes = self.update_sizes_for_sequence_parallelism(
cudagraph_capture_sizes
)
# A user-specified compilation_config.max_cudagraph_capture_size gets
# truncated to valid_max_size when they are inconsistent.
valid_max_size = (
cudagraph_capture_sizes[-1] if cudagraph_capture_sizes else 0
)
if (
self.compilation_config.max_cudagraph_capture_size is not None
and self.compilation_config.max_cudagraph_capture_size != valid_max_size
):
# raise error only when both two flags are user-specified
# and they are inconsistent with each other
if self.compilation_config.cudagraph_capture_sizes is not None:
raise ValueError(
"customized max_cudagraph_capture_size"
f"(={self.compilation_config.max_cudagraph_capture_size}) "
"should be consistent with the max value of "
f"cudagraph_capture_sizes(={valid_max_size})"
)
logger.warning(
"Truncating max_cudagraph_capture_size to %d",
valid_max_size,
)
# always set the final max_cudagraph_capture_size
self.compilation_config.max_cudagraph_capture_size = valid_max_size
if self.compilation_config.cudagraph_capture_sizes is not None and len(
cudagraph_capture_sizes
) < len(self.compilation_config.cudagraph_capture_sizes):
# If users have specified capture sizes, we only need to compare
# the lengths before and after modification, since the modified
# list is a subset of the original list.
logger.warning(
(
"cudagraph_capture_sizes specified in compilation_config"
" %s is overridden by config %s"
),
self.compilation_config.cudagraph_capture_sizes,
cudagraph_capture_sizes,
)
# always write back the final sizes
self.compilation_config.cudagraph_capture_sizes = cudagraph_capture_sizes
else:
# no cudagraph in use
self.compilation_config.max_cudagraph_capture_size = 0
self.compilation_config.cudagraph_capture_sizes = []
# complete the remaining process.
self.compilation_config.post_init_cudagraph_sizes()
setattr(self.compilation_config, "_has_set_capture_list", True)
MluHijackObject.apply_hijack(
VllmConfig,
VllmConfig._set_cudagraph_sizes,
vllm__config__vllm__VllmConfig___set_cudagraph_sizes,
)
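
A standalone sketch of how the MLU_GRAPH_CAPTURE_LIST handling above behaves (the helper name and values are illustrative, not part of the vLLM-MLU API): a 'min-max-step' spec is expanded and merged with [1, 2, 4], a comma-separated spec is taken verbatim, and with MTP enabled each size is scaled by (K + 1) before being filtered against max_num_batched_tokens.

```python
# Illustrative parsing of MLU_GRAPH_CAPTURE_LIST; names and values are examples only.
def parse_capture_list(spec, num_speculative_tokens=0, max_num_batched_tokens=8192):
    if "-" in spec:
        start, end, step = (int(x) for x in spec.split("-"))
        sizes = sorted(set([1, 2, 4] + list(range(start, end, step))))
    else:
        sizes = [int(x) for x in spec.split(",")]
    if num_speculative_tokens > 0:
        # With MTP, each request contributes (K + 1) tokens per decode step.
        sizes = [s * (1 + num_speculative_tokens) for s in sizes]
    return [s for s in sizes if s <= max_num_batched_tokens]

print(parse_capture_list("8-65-8"))                             # [1, 2, 4, 8, 16, ..., 64]
print(parse_capture_list("1,2,4,8", num_speculative_tokens=1))  # [2, 4, 8, 16]
```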