[Model] Support DeepSeek-V4

This commit is contained in:
chenxb002
2026-04-24 09:50:34 +08:00
commit b9925203b8
172 changed files with 44780 additions and 0 deletions

158 vllm_mlu/entrypoints/llm.py Normal file

@@ -0,0 +1,158 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
from tqdm import tqdm
from typing import Callable
from vllm.entrypoints.llm import LLM
from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.logger import init_logger
import vllm_mlu._mlu_utils as mlu_envs
from vllm_mlu.mlu_metric import LLMMetric
from vllm_mlu.mlu_hijack_utils import MluHijackObject
logger = init_logger(__name__)
def vllm__entrypoints__llm__LLM__get_mlu_metrics(
    self,
    metrics_idx_start,
    only_average,
    input_len,
    output_len,
    tp_nums,
    quantization,
    show_per_iter=False,
    is_embedding_task=False,
    mm_kwargs=None,
    total_prefill_steps=1,
    num_speculative_tokens=0,
    dp_size=1,
) -> None:
    '''
    @brief: Print the performance metrics that vLLM collects while the
        generate interface runs.
    @params:
        metrics_idx_start: A generate call may be part of a warmup pass, so
            this parameter can be used to ignore the data collected in
            [0, metrics_idx_start). Defaults to 0, i.e. all performance data
            is counted.
        only_average: True prints only the average performance over the N
            generate calls; False prints the performance of each call as well
            as the mean. If the N measurements fluctuate heavily, check
            whether the test environment is stable.
        Remaining parameters: model configuration parameters.
    '''
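    # Metrics are only gathered when the latency-debug switch is enabled; the
    # recorded batch size is scaled to cover all data-parallel ranks.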
    if mlu_envs.VLLM_LATENCY_DEBUG_EN:
        batch_size = self.metric.batch_size_list[-1] * dp_size
        if mm_kwargs or is_embedding_task:
            # Multimodal and pooling models don't support the HFU feature yet.
            hfu_info, io_efficiency = None, None
        else:
            hfu_info, io_efficiency = self.llm_engine.get_hfu_info(
                batch_size, input_len, output_len)
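        # Aggregate the collected per-step data into the final metric report.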
        self.metric.calc_metric(
            self.llm_engine.model_config.model,
            self.llm_engine.model_config.dtype,
            metrics_idx_start, only_average,
            input_len, output_len, tp_nums,
            quantization, show_per_iter,
            is_embedding_task, mm_kwargs, total_prefill_steps,
            num_speculative_tokens, dp_size=dp_size,
            hfu_info=hfu_info, io_efficiency=io_efficiency)
    else:
        print("Warning: please set VLLM_LATENCY_DEBUG=true!")
def vllm__entrypoints__llm__LLM___run_engine(
    self, *, use_tqdm: bool | Callable[..., tqdm] = True
) -> list[RequestOutput | PoolingRequestOutput]:
    # Initialize tqdm.
    if use_tqdm:
        num_requests = self.llm_engine.get_num_unfinished_requests()
        tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
        pbar = tqdm_func(
            total=num_requests,
            desc="Processed prompts",
            dynamic_ncols=True,
            postfix=(f"est. speed input: {0:.2f} toks/s, "
                     f"output: {0:.2f} toks/s"),
        )
    '''
    =============================
    Added by vllm_mlu
    =============================
    '''
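    # Before the engine starts, record the number of pending requests, the
    # end-to-end start timestamp and, for non-pooling models, the KV-cache
    # memory usage.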
    if mlu_envs.VLLM_LATENCY_DEBUG_EN:
        total_request_num = self.llm_engine.get_num_unfinished_requests()
        e2e_start_time = self.metric.get_mlu_cost_time()
        if not self.llm_engine.model_config.is_embedding_task():
            peak_memory, block_memory, num_gpu_blocks, num_cpu_blocks = \
                self.llm_engine.get_memory_usage()
            self.metric.update_memory_usage(peak_memory, block_memory,
                                            num_gpu_blocks, num_cpu_blocks)
    '''
    ==================
    End of addition
    ==================
    '''
    # Run the engine.
    outputs: list[RequestOutput | PoolingRequestOutput] = []
    total_in_toks = 0
    total_out_toks = 0
    while self.llm_engine.has_unfinished_requests():
        step_outputs = self.llm_engine.step()
        for output in step_outputs:
            if output.finished:
                outputs.append(output)
                if use_tqdm:
                    if isinstance(output, RequestOutput):
                        # Calculate tokens only for RequestOutput
                        n = len(output.outputs)
                        assert output.prompt_token_ids is not None
                        total_in_toks += len(output.prompt_token_ids) * n
                        in_spd = total_in_toks / pbar.format_dict["elapsed"]
                        total_out_toks += sum(
                            len(stp.token_ids) for stp in output.outputs
                        )
                        out_spd = total_out_toks / pbar.format_dict["elapsed"]
                        pbar.postfix = (
                            f"est. speed input: {in_spd:.2f} toks/s, "
                            f"output: {out_spd:.2f} toks/s"
                        )
                        pbar.update(n)
                    else:
                        pbar.update(1)
                    if pbar.n == num_requests:
                        pbar.refresh()

    if use_tqdm:
        pbar.close()
    '''
    =============================
    Added by vllm_mlu
    =============================
    '''
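    # After the run, compute the end-to-end latency and collect the per-step
    # engine, model-forward and multimodal-encoder latencies for the report.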
    if mlu_envs.VLLM_LATENCY_DEBUG_EN:
        e2e_end_time = self.metric.get_mlu_cost_time()
        e2e_latency = e2e_end_time - e2e_start_time
        engine_step_latency, model_forward_latency, mm_encoder_latency = \
            self.llm_engine.get_latency()
        self.metric.update_step_latency(engine_step_latency)
        if mlu_envs.VLLM_LATENCY_DEBUG_WITH_DEVICE_EN:
            self.metric.update_step_latency_device(model_forward_latency)
            self.metric.update_mm_encoder_latency_device(mm_encoder_latency)
        self.metric.add_metrics(total_request_num, e2e_latency)
    '''
    ==================
    End of addition
    ==================
    '''
    # Sort the outputs by request ID.
    # This is necessary because some requests may be finished earlier than
    # their previous requests.
    return sorted(outputs, key=lambda x: int(x.request_id))
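
# Attach a shared metric collector to the LLM class and register the hijacks:
# get_mlu_metrics is installed on LLM under that name, and the stock
# LLM._run_engine is swapped for the instrumented version above.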
LLM.metric = LLMMetric()
MluHijackObject.apply_hijack(LLM,
                             "get_mlu_metrics",
                             vllm__entrypoints__llm__LLM__get_mlu_metrics)
MluHijackObject.apply_hijack(LLM,
                             LLM._run_engine,
                             vllm__entrypoints__llm__LLM___run_engine)
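
# A minimal usage sketch (illustrative only; the model path, prompt and
# sampling parameters are placeholders, and VLLM_LATENCY_DEBUG must be
# exported in the environment before the process starts):
#
#   from vllm import SamplingParams
#   llm = LLM(model="/path/to/model", tensor_parallel_size=1)
#   outputs = llm.generate(["Hello"], SamplingParams(max_tokens=128))
#   llm.get_mlu_metrics(metrics_idx_start=0, only_average=True,
#                       input_len=128, output_len=128, tp_nums=1,
#                       quantization=None)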