# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

from tqdm import tqdm
from typing import Callable

from vllm.entrypoints.llm import LLM
from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.logger import init_logger

import vllm_mlu._mlu_utils as mlu_envs
from vllm_mlu.mlu_metric import LLMMetric
from vllm_mlu.mlu_hijack_utils import MluHijackObject

logger = init_logger(__name__)


def vllm__entrypoints__llm__LLM__get_mlu_metrics(
    self,
    metrics_idx_start,
    only_average,
    input_len,
    output_len,
    tp_nums,
    quantization,
    show_per_iter=False,
    is_embedding_task=False,
    mm_kwargs=None,
    total_prefill_steps=1,
    num_speculative_tokens=0,
    dp_size=1,
) -> None:
    '''
    @brief: Print the performance metrics collected while vLLM runs generate() calls.
    @params:
        metrics_idx_start: Some generate() calls may be warmup runs, so this parameter
            skips the metrics collected in [0, metrics_idx_start). Defaults to 0,
            i.e. all performance data is treated as valid.
        only_average:
            True  -- print only the average performance over the N generate() calls.
            False -- print the performance of every generate() call as well as the average.
            If the N measurements fluctuate heavily, check whether the test
            environment is stable.
        Remaining parameters: model configuration parameters.
    '''
    if mlu_envs.VLLM_LATENCY_DEBUG_EN:
        batch_size = self.metric.batch_size_list[-1] * dp_size
        if mm_kwargs or is_embedding_task:
            # Multimodal and pooling models don't support the HFU feature yet.
            hfu_info, io_efficiency = None, None
        else:
            hfu_info, io_efficiency = self.llm_engine.get_hfu_info(
                batch_size, input_len, output_len)
        self.metric.calc_metric(
            self.llm_engine.model_config.model,
            self.llm_engine.model_config.dtype,
            metrics_idx_start,
            only_average,
            input_len,
            output_len,
            tp_nums,
            quantization,
            show_per_iter,
            is_embedding_task,
            mm_kwargs,
            total_prefill_steps,
            num_speculative_tokens,
            dp_size=dp_size,
            hfu_info=hfu_info,
            io_efficiency=io_efficiency)
    else:
        print("Warning: please set VLLM_LATENCY_DEBUG=true!")


def vllm__entrypoints__llm__LLM___run_engine(
    self, *, use_tqdm: bool | Callable[..., tqdm] = True
) -> list[RequestOutput | PoolingRequestOutput]:
    # Initialize tqdm.
    if use_tqdm:
        num_requests = self.llm_engine.get_num_unfinished_requests()
        tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
        pbar = tqdm_func(
            total=num_requests,
            desc="Processed prompts",
            dynamic_ncols=True,
            postfix=(
                f"est. speed input: {0:.2f} toks/s, output: {0:.2f} toks/s"
            ),
        )

    '''
    ============================= Added by vllm_mlu =============================
    '''
    if mlu_envs.VLLM_LATENCY_DEBUG_EN:
        total_request_num = self.llm_engine.get_num_unfinished_requests()
        e2e_start_time = self.metric.get_mlu_cost_time()
        if not self.llm_engine.model_config.is_embedding_task():
            peak_memory, block_memory, num_gpu_blocks, num_cpu_blocks = \
                self.llm_engine.get_memory_usage()
            self.metric.update_memory_usage(peak_memory, block_memory,
                                            num_gpu_blocks, num_cpu_blocks)
    '''
    ================== End of addition ==================
    '''

    # Run the engine.
    outputs: list[RequestOutput | PoolingRequestOutput] = []
    total_in_toks = 0
    total_out_toks = 0
    while self.llm_engine.has_unfinished_requests():
        step_outputs = self.llm_engine.step()
        for output in step_outputs:
            if output.finished:
                outputs.append(output)
                if use_tqdm:
                    if isinstance(output, RequestOutput):
                        # Calculate tokens only for RequestOutput
                        n = len(output.outputs)
                        assert output.prompt_token_ids is not None
                        total_in_toks += len(output.prompt_token_ids) * n
                        in_spd = total_in_toks / pbar.format_dict["elapsed"]
                        total_out_toks += sum(
                            len(stp.token_ids) for stp in output.outputs)
                        out_spd = total_out_toks / pbar.format_dict["elapsed"]
                        pbar.postfix = (
                            f"est. speed input: {in_spd:.2f} toks/s, "
                            f"output: {out_spd:.2f} toks/s")
                        pbar.update(n)
                    else:
                        pbar.update(1)
                    if pbar.n == num_requests:
                        pbar.refresh()

    if use_tqdm:
        pbar.close()

    '''
    ============================= Added by vllm_mlu =============================
    '''
    if mlu_envs.VLLM_LATENCY_DEBUG_EN:
        e2e_end_time = self.metric.get_mlu_cost_time()
        e2e_latency = e2e_end_time - e2e_start_time
        engine_step_latency, model_forward_latency, mm_encoder_latency = \
            self.llm_engine.get_latency()
        self.metric.update_step_latency(engine_step_latency)
        if mlu_envs.VLLM_LATENCY_DEBUG_WITH_DEVICE_EN:
            self.metric.update_step_latency_device(model_forward_latency)
            self.metric.update_mm_encoder_latency_device(mm_encoder_latency)
        self.metric.add_metrics(total_request_num, e2e_latency)
    '''
    ================== End of addition ==================
    '''

    # Sort the outputs by request ID.
    # This is necessary because some requests may be finished earlier than
    # their previous requests.
    return sorted(outputs, key=lambda x: int(x.request_id))


LLM.metric = LLMMetric()
MluHijackObject.apply_hijack(LLM, "get_mlu_metrics",
                             vllm__entrypoints__llm__LLM__get_mlu_metrics)
MluHijackObject.apply_hijack(LLM, LLM._run_engine,
                             vllm__entrypoints__llm__LLM___run_engine)
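
# -----------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the hijack itself): it
# assumes this module has been imported so the hijacks above are in effect,
# that VLLM_LATENCY_DEBUG=true is exported before vllm_mlu reads its
# environment variables, and that the placeholder model path and parameter
# values below are replaced with real ones for your setup.
if __name__ == "__main__":
    from vllm import SamplingParams

    llm = LLM(model="/path/to/your/model", tensor_parallel_size=1)
    sampling_params = SamplingParams(temperature=0.0, max_tokens=128,
                                     ignore_eos=True)

    prompts = ["Hello, my name is"] * 8
    # Run generate() several times; treat the first call as warmup.
    for _ in range(3):
        llm.generate(prompts, sampling_params)

    # Print the metrics collected by the hijacked _run_engine. The values of
    # input_len/output_len/tp_nums are illustrative and should match the
    # actual test configuration.
    llm.get_mlu_metrics(
        metrics_idx_start=1,   # skip the warmup generate() call
        only_average=True,     # print only the averaged metrics
        input_len=32,
        output_len=128,
        tp_nums=1,
        quantization=None,
    )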