[Model] Support DeepSeek-V4
vllm_mlu/config/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
vllm_mlu/config/model.py (new file, 71 lines)
@@ -0,0 +1,71 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

from vllm.config.model import ModelConfig
from vllm.logger import init_logger

from vllm_mlu.mlu_hijack_utils import MluHijackObject

logger = init_logger(__name__)


def vllm__config__model__ModelConfig__is_embedding_task(self) -> bool:
    return self.runner_type == "pooling"


def vllm__config__model__ModelConfig__get_head_size(self) -> int:
    # TODO: remove the hard-coded cases below
    if self.is_deepseek_mla:
        qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim", 0)
        if self.use_mla:
            return self.hf_text_config.kv_lora_rank + qk_rope_head_dim
        else:
            qk_nope_head_dim = getattr(self.hf_text_config, "qk_nope_head_dim", 0)
            if qk_rope_head_dim and qk_nope_head_dim:
                return qk_rope_head_dim + qk_nope_head_dim

    if hasattr(self.hf_text_config, "model_type") and (
        self.hf_text_config.model_type == "zamba2"
    ):
        return self.hf_text_config.attention_head_dim

    if self.is_attention_free:
        return 0

    # NOTE: Some configs may set head_dim=None in the config
    if getattr(self.hf_text_config, "head_dim", None) is not None:
        return self.hf_text_config.head_dim

    # NOTE: Some models (such as PLaMo2.1) use `hidden_size_per_head`
    if getattr(self.hf_text_config, "hidden_size_per_head", None) is not None:
        return self.hf_text_config.hidden_size_per_head

    # FIXME(woosuk): This may not be true for all models.
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: adjust num_heads and num_attention_heads.
    '''
    if hasattr(self.hf_text_config, "num_heads"):
        num_attention_heads = self.hf_text_config.num_heads
    else:
        num_attention_heads = self.hf_text_config.num_attention_heads

    return self.hf_text_config.hidden_size // num_attention_heads
    '''
    ==================
    End of MLU Hijack
    ==================
    '''


MluHijackObject.apply_hijack(
    ModelConfig,
    "is_embedding_task",
    vllm__config__model__ModelConfig__is_embedding_task,
)
MluHijackObject.apply_hijack(
    ModelConfig,
    ModelConfig.get_head_size,
    vllm__config__model__ModelConfig__get_head_size,
)
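Note (editor): a minimal worked example of the two MLA branches in get_head_size above, using DeepSeek-V3-style dimensions for illustration (kv_lora_rank=512, qk_rope_head_dim=64, qk_nope_head_dim=128 come from the public DeepSeek-V3 config, not from this diff):

# Illustrative sketch only: a stand-in for hf_text_config with assumed
# DeepSeek-V3-style MLA dimensions; real values come from the HF model config.
from types import SimpleNamespace

cfg = SimpleNamespace(kv_lora_rank=512, qk_rope_head_dim=64, qk_nope_head_dim=128)

# MLA path: head size is the compressed KV latent plus the RoPE slice.
head_size_mla = cfg.kv_lora_rank + cfg.qk_rope_head_dim      # 512 + 64 = 576
# MHA fallback path: RoPE slice plus the non-RoPE slice.
head_size_mha = cfg.qk_rope_head_dim + cfg.qk_nope_head_dim  # 64 + 128 = 192
print(head_size_mla, head_size_mha)  # 576 192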
vllm_mlu/config/scheduler.py (new file, 86 lines)
@@ -0,0 +1,86 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project


from typing_extensions import Self

from vllm.config.scheduler import SchedulerConfig
from vllm.logger import init_logger

from vllm_mlu._mlu_utils import VLLM_V1_BENCHMARK
from vllm_mlu.mlu_hijack_utils import MluHijackObject

logger = init_logger(__name__)


def vllm__config__scheduler__SchedulerConfig__verify_max_model_len(
    self, max_model_len: int,
) -> Self:
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: this restriction is removed when VLLM_V1_BENCHMARK is set to True
    '''
    if not VLLM_V1_BENCHMARK:
        if (
            self.max_num_batched_tokens < max_model_len
            and not self.enable_chunked_prefill
        ):
            raise ValueError(
                f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
                f"smaller than max_model_len ({max_model_len}). "
                "This effectively limits the maximum sequence length to "
                "max_num_batched_tokens and makes vLLM reject longer "
                "sequences. Please increase max_num_batched_tokens or "
                "decrease max_model_len."
            )
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    if self.max_num_batched_tokens < self.max_num_seqs:
        raise ValueError(
            f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
            "be greater than or equal to max_num_seqs "
            f"({self.max_num_seqs})."
        )

    if self.max_num_batched_tokens > self.max_num_seqs * max_model_len:
        logger.warning(
            "max_num_batched_tokens (%d) exceeds max_num_seqs "
            "* max_model_len (%d). This may lead to unexpected behavior.",
            self.max_num_batched_tokens,
            self.max_num_seqs * max_model_len,
        )

    if self.max_num_partial_prefills > 1:
        if not self.enable_chunked_prefill:
            raise ValueError(
                "Chunked prefill must be enabled to set "
                "max_num_partial_prefills > 1."
            )

        if self.long_prefill_token_threshold > max_model_len:
            raise ValueError(
                "long_prefill_token_threshold "
                f"({self.long_prefill_token_threshold}) cannot be greater "
                f"than the max_model_len ({max_model_len})."
            )

        if self.max_long_partial_prefills > self.max_num_partial_prefills:
            raise ValueError(
                f"{self.max_long_partial_prefills=} must be less than or equal to "
                f"{self.max_num_partial_prefills=}."
            )

    return self


MluHijackObject.apply_hijack(
    SchedulerConfig,
    SchedulerConfig.verify_max_model_len,
    vllm__config__scheduler__SchedulerConfig__verify_max_model_len,
)
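Note (editor): a self-contained sketch of the relaxed check, with illustrative numbers; the names mirror the fields used above, and VLLM_V1_BENCHMARK stands in for the env-derived flag imported from vllm_mlu._mlu_utils:

# Sketch only: mirrors the condition above with assumed example values.
VLLM_V1_BENCHMARK = True  # in vllm_mlu this comes from an environment variable

max_num_batched_tokens, max_model_len, enable_chunked_prefill = 2048, 8192, False
would_raise = (
    not VLLM_V1_BENCHMARK
    and max_num_batched_tokens < max_model_len
    and not enable_chunked_prefill
)
print(would_raise)  # False: benchmark mode skips upstream's ValueError here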
vllm_mlu/config/speculative.py (new file, 66 lines)
@@ -0,0 +1,66 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

from vllm.config.parallel import ParallelConfig
from vllm.config.speculative import SpeculativeConfig
from vllm.logger import init_logger

from vllm_mlu.mlu_hijack_utils import MluHijackObject

logger = init_logger(__name__)


@staticmethod
def vllm__config__speculative__SpeculativeConfig__create_draft_parallel_config(
    target_parallel_config: ParallelConfig,
    speculative_draft_tensor_parallel_size: int,
) -> ParallelConfig:
    """Create a parallel config for use by the draft worker.

    This is mostly a copy of the target parallel config, except the tp_size.
    """
    '''
    =============================
    Modify by vllm_mlu
    @brief: add draft data parallel parameters
    =============================
    '''
    draft_parallel_config = ParallelConfig(
        pipeline_parallel_size=target_parallel_config.pipeline_parallel_size,
        tensor_parallel_size=speculative_draft_tensor_parallel_size,
        distributed_executor_backend=target_parallel_config.distributed_executor_backend,
        max_parallel_loading_workers=target_parallel_config.max_parallel_loading_workers,
        disable_custom_all_reduce=target_parallel_config.disable_custom_all_reduce,
        ray_workers_use_nsight=target_parallel_config.ray_workers_use_nsight,
        placement_group=target_parallel_config.placement_group,
        # add draft data parallel parameters
        data_parallel_size=target_parallel_config.data_parallel_size,
        data_parallel_size_local=target_parallel_config.data_parallel_size_local,
        data_parallel_master_ip=target_parallel_config.data_parallel_master_ip,
        data_parallel_rpc_port=target_parallel_config.data_parallel_rpc_port,
    )
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    return draft_parallel_config


vllm__config__speculative__SpeculativeConfig____post_init___org = (
    SpeculativeConfig.__post_init__
)


def vllm__config__speculative__SpeculativeConfig____post_init__(self):
    # Default to MTP when only num_speculative_tokens is given (no draft model).
    if (
        self.model is None
        and self.num_speculative_tokens is not None
        and self.method is None
    ):
        self.method = "mtp"
    vllm__config__speculative__SpeculativeConfig____post_init___org(self)


MluHijackObject.apply_hijack(
    SpeculativeConfig,
    SpeculativeConfig.create_draft_parallel_config,
    vllm__config__speculative__SpeculativeConfig__create_draft_parallel_config,
)
MluHijackObject.apply_hijack(
    SpeculativeConfig,
    SpeculativeConfig.__post_init__,
    vllm__config__speculative__SpeculativeConfig____post_init__,
)
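Note (editor): the net effect of the __post_init__ hijack, as a standalone sketch; the three values are illustrative stand-ins for the SpeculativeConfig fields read above:

# Sketch only: mirrors the defaulting logic above with assumed values.
model, num_speculative_tokens, method = None, 1, None
if model is None and num_speculative_tokens is not None and method is None:
    method = "mtp"  # draft-model-free speculation via multi-token prediction
print(method)  # "mtp"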
vllm_mlu/config/vllm.py (new file, 213 lines)
@@ -0,0 +1,213 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

import os

from vllm.config.vllm import VllmConfig
from vllm.config.compilation import CUDAGraphMode
from vllm.logger import init_logger

from vllm_mlu.mlu_hijack_utils import MluHijackObject

logger = init_logger(__name__)


def vllm__config__vllm__VllmConfig___set_cudagraph_sizes(self):
    """
    vLLM defines the default candidate list of batch sizes for CUDA graph
    capture as:

    ```python
    max_graph_size = min(max_num_seqs * 2, 512)
    # 1, 2, 4, then multiples of 8 up to 256 and then multiples of 16
    # up to max_graph_size
    cuda_graph_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list(
        range(256, max_graph_size + 1, 16))
    ```

    In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
    will be the final sizes to capture cudagraph (in ascending order).

    These sizes are used to capture and reuse CUDA graphs for
    performance-critical paths (e.g., decoding). Capturing enables
    significantly faster kernel dispatch by avoiding Python overhead. The
    list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on
    most GPUs), which controls the total allowed number of tokens in a
    batch. Since each sequence may have a variable number of tokens, the
    maximum usable batch size will depend on actual sequence lengths.

    Example:
        With `max_num_batched_tokens = 8192`, and typical sequences
        averaging ~32 tokens, most practical batch sizes fall below 256.
        However, the system will still allow capture sizes up to 512 if
        shape and memory permit.

    Note:
        If users explicitly specify cudagraph capture sizes in the
        compilation config, those will override this default logic.

    At runtime:

    - If batch size <= one of the `cudagraph_capture_sizes`, the closest
      padded CUDA graph will be used.
    - If batch size > largest `cudagraph_capture_sizes`, cudagraph will
      not be used.
    """
    if hasattr(self.compilation_config, "_has_set_capture_list"):
        # avoid setting the capture list twice during init
        return

    if (
        self.model_config is not None
        and not self.model_config.enforce_eager
        and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
    ):
        # determine the initial max_cudagraph_capture_size
        max_cudagraph_capture_size = (
            self.compilation_config.max_cudagraph_capture_size
        )
        if max_cudagraph_capture_size is None:
            max_cudagraph_capture_size = min(
                self.scheduler_config.max_num_seqs * 2, 512
            )
        max_num_tokens = self.scheduler_config.max_num_batched_tokens
        max_cudagraph_capture_size = min(max_num_tokens, max_cudagraph_capture_size)

        assert max_cudagraph_capture_size >= 1, (
            "Maximum cudagraph size should be greater than or equal to 1 "
            "when using cuda graph."
        )

        # determine the cudagraph_capture_sizes
        if self.compilation_config.cudagraph_capture_sizes is not None:
            assert len(self.compilation_config.cudagraph_capture_sizes) > 0, (
                "cudagraph_capture_sizes should contain at least one element "
                "when using cuda graph."
            )
            # de-duplicate the sizes provided by the config
            dedup_sizes = list(set(self.compilation_config.cudagraph_capture_sizes))
            cudagraph_capture_sizes = [
                i for i in dedup_sizes if i <= max_num_tokens
            ]
            # sort to make sure the sizes are in ascending order
            cudagraph_capture_sizes.sort()
        else:
            cudagraph_capture_sizes = [
                i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
            ]
            if max_cudagraph_capture_size >= 8:
                # step size 8 for small batch sizes, up to 256 (exclusive)
                cudagraph_capture_sizes += list(
                    range(8, min(max_cudagraph_capture_size + 1, 256), 8)
                )
            if max_cudagraph_capture_size >= 256:
                # step size 16 for larger batch sizes
                cudagraph_capture_sizes += list(
                    range(256, max_cudagraph_capture_size + 1, 16)
                )

        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief:
        1) check batch_size_capture_list when MTP is enabled, because
           bs * (K + 1) may be greater than max_num_batched_tokens
        2) capture MLUGraph for a user-given batch list
        '''
        mlu_graph_capture_list = os.getenv("MLU_GRAPH_CAPTURE_LIST", None)
        if mlu_graph_capture_list:
            if "-" in mlu_graph_capture_list:
                batch_info = mlu_graph_capture_list.split("-")
                assert len(batch_info) == 3, (
                    f"Got invalid graph_capture_list={mlu_graph_capture_list}, "
                    "but expected format 'min_bs-max_bs(exclusive)-step'."
                )
                start, end, step = batch_info
                cudagraph_capture_sizes = [1, 2, 4] + [
                    i for i in range(int(start), int(end), int(step))
                ]
                cudagraph_capture_sizes = sorted(set(cudagraph_capture_sizes))
            else:
                cudagraph_capture_sizes = [
                    int(x) for x in mlu_graph_capture_list.split(",")
                ]

        if (
            self.speculative_config is not None
            and self.speculative_config.num_speculative_tokens > 0
        ):
            K = self.speculative_config.num_speculative_tokens
            cudagraph_capture_sizes = [x * (1 + K) for x in cudagraph_capture_sizes]

        cudagraph_capture_sizes = [
            size for size in cudagraph_capture_sizes
            if size <= self.scheduler_config.max_num_batched_tokens
        ]
        '''
        ==================
        End of MLU Hijack
        ==================
        '''

        if (
            self.parallel_config.tensor_parallel_size > 1
            and self.compilation_config.pass_config.enable_sequence_parallelism
        ):
            cudagraph_capture_sizes = self.update_sizes_for_sequence_parallelism(
                cudagraph_capture_sizes
            )

        # a user-specified compilation_config.max_cudagraph_capture_size gets
        # truncated to valid_max_size when they are inconsistent.
        valid_max_size = (
            cudagraph_capture_sizes[-1] if cudagraph_capture_sizes else 0
        )
        if (
            self.compilation_config.max_cudagraph_capture_size is not None
            and self.compilation_config.max_cudagraph_capture_size != valid_max_size
        ):
            # raise an error only when both flags are user-specified
            # and inconsistent with each other
            if self.compilation_config.cudagraph_capture_sizes is not None:
                raise ValueError(
                    "customized max_cudagraph_capture_size "
                    f"(={self.compilation_config.max_cudagraph_capture_size}) "
                    "should be consistent with the max value of "
                    f"cudagraph_capture_sizes (={valid_max_size})"
                )

            logger.warning(
                "Truncating max_cudagraph_capture_size to %d",
                valid_max_size,
            )
        # always set the final max_cudagraph_capture_size
        self.compilation_config.max_cudagraph_capture_size = valid_max_size

        if self.compilation_config.cudagraph_capture_sizes is not None and len(
            cudagraph_capture_sizes
        ) < len(self.compilation_config.cudagraph_capture_sizes):
            # If users have specified capture sizes, we only need to
            # compare the lengths before and after modification, since the
            # modified list is a subset of the original list.
            logger.warning(
                "cudagraph_capture_sizes specified in compilation_config"
                " %s is overridden by config %s",
                self.compilation_config.cudagraph_capture_sizes,
                cudagraph_capture_sizes,
            )
        # always write back the final sizes
        self.compilation_config.cudagraph_capture_sizes = cudagraph_capture_sizes

    else:
        # no cudagraph in use
        self.compilation_config.max_cudagraph_capture_size = 0
        self.compilation_config.cudagraph_capture_sizes = []

    # complete the remaining process.
    self.compilation_config.post_init_cudagraph_sizes()

    setattr(self.compilation_config, "_has_set_capture_list", True)


MluHijackObject.apply_hijack(
    VllmConfig,
    VllmConfig._set_cudagraph_sizes,
    vllm__config__vllm__VllmConfig___set_cudagraph_sizes,
)
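Note (editor): a standalone sketch of how the MLU_GRAPH_CAPTURE_LIST value is interpreted, and how speculative decoding then scales the list; the helper name parse_capture_list is hypothetical, but the logic mirrors the hijack above:

# Sketch only: hypothetical helper mirroring the parsing logic above.
def parse_capture_list(spec: str) -> list[int]:
    if "-" in spec:
        # "min_bs-max_bs(exclusive)-step", merged with the [1, 2, 4] prefix
        start, end, step = (int(x) for x in spec.split("-"))
        return sorted({1, 2, 4, *range(start, end, step)})
    # otherwise an explicit comma-separated list of batch sizes
    return [int(x) for x in spec.split(",")]

print(parse_capture_list("8-65-8"))  # [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64]
print(parse_capture_list("1,8,32"))  # [1, 8, 32]

# With MTP enabled (num_speculative_tokens = K), each size is scaled to
# bs * (1 + K) so decode batches that carry draft tokens still hit a graph,
# then re-filtered against max_num_batched_tokens.
K = 1
print([x * (1 + K) for x in [1, 2, 4, 8]])  # [2, 4, 8, 16]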