# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
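"""vllm_mlu hijack of ``vllm.entrypoints.llm.LLM``.

``_run_engine`` is wrapped to record end-to-end latency and memory usage on
the MLU side, and a new ``get_mlu_metrics`` method prints the collected
performance statistics. Both paths are only active when the
``VLLM_LATENCY_DEBUG`` environment flag is enabled.
"""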
from tqdm import tqdm
from typing import Callable
from vllm.entrypoints.llm import LLM
from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.logger import init_logger
import vllm_mlu._mlu_utils as mlu_envs
from vllm_mlu.mlu_metric import LLMMetric
from vllm_mlu.mlu_hijack_utils import MluHijackObject
logger = init_logger(__name__)


def vllm__entrypoints__llm__LLM__get_mlu_metrics(
    self,
    metrics_idx_start,
    only_average,
    input_len,
    output_len,
    tp_nums,
    quantization,
    show_per_iter=False,
    is_embedding_task=False,
    mm_kwargs=None,
    total_prefill_steps=1,
    num_speculative_tokens=0,
    dp_size=1,
) -> None:
    '''
    @brief: Print the performance metrics collected while vLLM runs the
        generate interface.
    @params:
        metrics_idx_start: some generate calls may serve as warmup runs, so
            this parameter skips the data collected in [0, metrics_idx_start).
            Defaults to 0, i.e. all performance data is counted.
        only_average: True prints only the average performance over the N
            generate calls; False prints each call's performance as well as
            the average. If the N measurements fluctuate heavily, check
            whether the test environment is stable.
        Remaining parameters: model configuration parameters.
    '''
    if mlu_envs.VLLM_LATENCY_DEBUG_EN:
        batch_size = self.metric.batch_size_list[-1] * dp_size
        if mm_kwargs or is_embedding_task:
            # Multimodal and pooling models don't support the HFU feature yet.
            hfu_info, io_efficiency = None, None
        else:
            hfu_info, io_efficiency = self.llm_engine.get_hfu_info(
                batch_size, input_len, output_len)
        self.metric.calc_metric(
            self.llm_engine.model_config.model,
            self.llm_engine.model_config.dtype,
            metrics_idx_start, only_average,
            input_len, output_len, tp_nums,
            quantization, show_per_iter,
            is_embedding_task, mm_kwargs, total_prefill_steps,
            num_speculative_tokens, dp_size=dp_size,
            hfu_info=hfu_info, io_efficiency=io_efficiency)
    else:
        print("Warning: please set VLLM_LATENCY_DEBUG=true!")


def vllm__entrypoints__llm__LLM___run_engine(
    self, *, use_tqdm: bool | Callable[..., tqdm] = True
) -> list[RequestOutput | PoolingRequestOutput]:
    # Initialize tqdm.
    if use_tqdm:
        num_requests = self.llm_engine.get_num_unfinished_requests()
        tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
        pbar = tqdm_func(
            total=num_requests,
            desc="Processed prompts",
            dynamic_ncols=True,
            postfix=(f"est. speed input: {0:.2f} toks/s, output: {0:.2f} toks/s"),
        )
    '''
    =============================
    Added by vllm_mlu
    =============================
    '''
    if mlu_envs.VLLM_LATENCY_DEBUG_EN:
        total_request_num = self.llm_engine.get_num_unfinished_requests()
        e2e_start_time = self.metric.get_mlu_cost_time()
        if not self.llm_engine.model_config.is_embedding_task():
            peak_memory, block_memory, num_gpu_blocks, num_cpu_blocks = \
                self.llm_engine.get_memory_usage()
            self.metric.update_memory_usage(peak_memory, block_memory,
                                            num_gpu_blocks, num_cpu_blocks)
    '''
    ==================
    End of addition
    ==================
    '''
    # Run the engine.
    outputs: list[RequestOutput | PoolingRequestOutput] = []
    total_in_toks = 0
    total_out_toks = 0
    while self.llm_engine.has_unfinished_requests():
        step_outputs = self.llm_engine.step()
        for output in step_outputs:
            if output.finished:
                outputs.append(output)
                if use_tqdm:
                    if isinstance(output, RequestOutput):
                        # Calculate tokens only for RequestOutput
                        n = len(output.outputs)
                        assert output.prompt_token_ids is not None
                        total_in_toks += len(output.prompt_token_ids) * n
                        in_spd = total_in_toks / pbar.format_dict["elapsed"]
                        total_out_toks += sum(
                            len(stp.token_ids) for stp in output.outputs
                        )
                        out_spd = total_out_toks / pbar.format_dict["elapsed"]
                        pbar.postfix = (
                            f"est. speed input: {in_spd:.2f} toks/s, "
                            f"output: {out_spd:.2f} toks/s"
                        )
                        pbar.update(n)
                    else:
                        pbar.update(1)
                    if pbar.n == num_requests:
                        pbar.refresh()

    if use_tqdm:
        pbar.close()
    '''
    =============================
    Added by vllm_mlu
    =============================
    '''
    if mlu_envs.VLLM_LATENCY_DEBUG_EN:
        e2e_end_time = self.metric.get_mlu_cost_time()
        e2e_latency = e2e_end_time - e2e_start_time
        engine_step_latency, model_forward_latency, mm_encoder_latency = \
            self.llm_engine.get_latency()
        self.metric.update_step_latency(engine_step_latency)
        if mlu_envs.VLLM_LATENCY_DEBUG_WITH_DEVICE_EN:
            self.metric.update_step_latency_device(model_forward_latency)
            self.metric.update_mm_encoder_latency_device(mm_encoder_latency)
        self.metric.add_metrics(total_request_num, e2e_latency)
    '''
    ==================
    End of addition
    ==================
    '''

    # Sort the outputs by request ID.
    # This is necessary because some requests may be finished earlier than
    # their preceding requests.
    return sorted(outputs, key=lambda x: int(x.request_id))
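

# Attach a shared metric collector and register the hijacks: get_mlu_metrics
# is added to LLM as a new method, while the stock _run_engine is replaced by
# the instrumented version defined above.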
LLM.metric = LLMMetric()
MluHijackObject.apply_hijack(LLM,
                             "get_mlu_metrics",
                             vllm__entrypoints__llm__LLM__get_mlu_metrics)
MluHijackObject.apply_hijack(LLM,
                             LLM._run_engine,
                             vllm__entrypoints__llm__LLM___run_engine)