[Model] Support DeepSeek-V4

This commit is contained in:
chenxb002
2026-04-24 09:50:34 +08:00
commit b9925203b8
172 changed files with 44780 additions and 0 deletions

158 vllm_mlu/entrypoints/llm.py Normal file

@@ -0,0 +1,158 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
from tqdm import tqdm
from typing import Callable
from vllm.entrypoints.llm import LLM
from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.logger import init_logger
import vllm_mlu._mlu_utils as mlu_envs
from vllm_mlu.mlu_metric import LLMMetric
from vllm_mlu.mlu_hijack_utils import MluHijackObject
logger = init_logger(__name__)
def vllm__entrypoints__llm__LLM__get_mlu_metrics(
    self,
    metrics_idx_start,
    only_average,
    input_len,
    output_len,
    tp_nums,
    quantization,
    show_per_iter=False,
    is_embedding_task=False,
    mm_kwargs=None,
    total_prefill_steps=1,
    num_speculative_tokens=0,
    dp_size=1,
) -> None:
    '''
    @brief: Print the performance metrics that vLLM collects while the
        generate interface runs.
    @params:
        metrics_idx_start: A generate call may be part of a warmup pass, so
            this parameter can be used to ignore the data collected in
            [0, metrics_idx_start). Defaults to 0, i.e. all performance data
            is counted.
        only_average: True prints only the average performance over the N
            generate calls; False prints the performance of each call as well
            as the mean. If the N measurements fluctuate heavily, check
            whether the test environment is stable.
        Remaining parameters: model configuration parameters.
    '''
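    # Metrics are only gathered when the latency-debug switch is enabled; the
    # recorded batch size is scaled to cover all data-parallel ranks.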
    if mlu_envs.VLLM_LATENCY_DEBUG_EN:
        batch_size = self.metric.batch_size_list[-1] * dp_size
        if mm_kwargs or is_embedding_task:
            # Multimodal and pooling models don't support the HFU feature yet.
            hfu_info, io_efficiency = None, None
        else:
            hfu_info, io_efficiency = self.llm_engine.get_hfu_info(
                batch_size, input_len, output_len)
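        # Aggregate the collected per-step data into the final metric report.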
        self.metric.calc_metric(
            self.llm_engine.model_config.model,
            self.llm_engine.model_config.dtype,
            metrics_idx_start, only_average,
            input_len, output_len, tp_nums,
            quantization, show_per_iter,
            is_embedding_task, mm_kwargs, total_prefill_steps,
            num_speculative_tokens, dp_size=dp_size,
            hfu_info=hfu_info, io_efficiency=io_efficiency)
    else:
        print("Warning: please set VLLM_LATENCY_DEBUG=true!")
def vllm__entrypoints__llm__LLM___run_engine(
    self, *, use_tqdm: bool | Callable[..., tqdm] = True
) -> list[RequestOutput | PoolingRequestOutput]:
    # Initialize tqdm.
    if use_tqdm:
        num_requests = self.llm_engine.get_num_unfinished_requests()
        tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
        pbar = tqdm_func(
            total=num_requests,
            desc="Processed prompts",
            dynamic_ncols=True,
            postfix=(f"est. speed input: {0:.2f} toks/s, "
                     f"output: {0:.2f} toks/s"),
        )
    '''
    =============================
    Added by vllm_mlu
    =============================
    '''
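    # Before the engine starts, record the number of pending requests, the
    # end-to-end start timestamp and, for non-pooling models, the KV-cache
    # memory usage.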
    if mlu_envs.VLLM_LATENCY_DEBUG_EN:
        total_request_num = self.llm_engine.get_num_unfinished_requests()
        e2e_start_time = self.metric.get_mlu_cost_time()
        if not self.llm_engine.model_config.is_embedding_task():
            peak_memory, block_memory, num_gpu_blocks, num_cpu_blocks = \
                self.llm_engine.get_memory_usage()
            self.metric.update_memory_usage(peak_memory, block_memory,
                                            num_gpu_blocks, num_cpu_blocks)
    '''
    ==================
    End of addition
    ==================
    '''
    # Run the engine.
    outputs: list[RequestOutput | PoolingRequestOutput] = []
    total_in_toks = 0
    total_out_toks = 0
    while self.llm_engine.has_unfinished_requests():
        step_outputs = self.llm_engine.step()
        for output in step_outputs:
            if output.finished:
                outputs.append(output)
                if use_tqdm:
                    if isinstance(output, RequestOutput):
                        # Calculate tokens only for RequestOutput
                        n = len(output.outputs)
                        assert output.prompt_token_ids is not None
                        total_in_toks += len(output.prompt_token_ids) * n
                        in_spd = total_in_toks / pbar.format_dict["elapsed"]
                        total_out_toks += sum(
                            len(stp.token_ids) for stp in output.outputs
                        )
                        out_spd = total_out_toks / pbar.format_dict["elapsed"]
                        pbar.postfix = (
                            f"est. speed input: {in_spd:.2f} toks/s, "
                            f"output: {out_spd:.2f} toks/s"
                        )
                        pbar.update(n)
                    else:
                        pbar.update(1)
                    if pbar.n == num_requests:
                        pbar.refresh()

    if use_tqdm:
        pbar.close()
    '''
    =============================
    Added by vllm_mlu
    =============================
    '''
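    # After the run, compute the end-to-end latency and collect the per-step
    # engine, model-forward and multimodal-encoder latencies for the report.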
    if mlu_envs.VLLM_LATENCY_DEBUG_EN:
        e2e_end_time = self.metric.get_mlu_cost_time()
        e2e_latency = e2e_end_time - e2e_start_time
        engine_step_latency, model_forward_latency, mm_encoder_latency = \
            self.llm_engine.get_latency()
        self.metric.update_step_latency(engine_step_latency)
        if mlu_envs.VLLM_LATENCY_DEBUG_WITH_DEVICE_EN:
            self.metric.update_step_latency_device(model_forward_latency)
            self.metric.update_mm_encoder_latency_device(mm_encoder_latency)
        self.metric.add_metrics(total_request_num, e2e_latency)
    '''
    ==================
    End of addition
    ==================
    '''
    # Sort the outputs by request ID.
    # This is necessary because some requests may be finished earlier than
    # their previous requests.
    return sorted(outputs, key=lambda x: int(x.request_id))
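
# Attach a shared metric collector to the LLM class and register the hijacks:
# get_mlu_metrics is installed on LLM under that name, and the stock
# LLM._run_engine is swapped for the instrumented version above.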
LLM.metric = LLMMetric()
MluHijackObject.apply_hijack(LLM,
                             "get_mlu_metrics",
                             vllm__entrypoints__llm__LLM__get_mlu_metrics)
MluHijackObject.apply_hijack(LLM,
                             LLM._run_engine,
                             vllm__entrypoints__llm__LLM___run_engine)
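
# A minimal usage sketch (illustrative only; the model path, prompt and
# sampling parameters are placeholders, and VLLM_LATENCY_DEBUG must be
# exported in the environment before the process starts):
#
#   from vllm import SamplingParams
#   llm = LLM(model="/path/to/model", tensor_parallel_size=1)
#   outputs = llm.generate(["Hello"], SamplingParams(max_tokens=128))
#   llm.get_mlu_metrics(metrics_idx_start=0, only_average=True,
#                       input_len=128, output_len=128, tp_nums=1,
#                       quantization=None)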