# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
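"""vllm_mlu hijack of ``vllm.entrypoints.llm.LLM``.

``_run_engine`` is wrapped to record end-to-end latency and memory usage on
the MLU side, and a new ``get_mlu_metrics`` method prints the collected
performance statistics. Both paths are only active when the
``VLLM_LATENCY_DEBUG`` environment flag is enabled.
"""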
from tqdm import tqdm
from typing import Callable
from vllm.entrypoints.llm import LLM
from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.logger import init_logger
import vllm_mlu._mlu_utils as mlu_envs
from vllm_mlu.mlu_metric import LLMMetric
from vllm_mlu.mlu_hijack_utils import MluHijackObject
logger = init_logger(__name__)


def vllm__entrypoints__llm__LLM__get_mlu_metrics(
    self,
    metrics_idx_start,
    only_average,
    input_len,
    output_len,
    tp_nums,
    quantization,
    show_per_iter=False,
    is_embedding_task=False,
    mm_kwargs=None,
    total_prefill_steps=1,
    num_speculative_tokens=0,
    dp_size=1,
) -> None:
    '''
    @brief: Print the performance metrics collected while vLLM runs the
        generate interface.
    @params:
        metrics_idx_start: some generate calls may serve as warmup runs, so
            this parameter skips the data collected in [0, metrics_idx_start).
            Defaults to 0, i.e. all performance data is counted.
        only_average: True prints only the average performance over the N
            generate calls; False prints each call's performance as well as
            the average. If the N measurements fluctuate heavily, check
            whether the test environment is stable.
        Remaining parameters: model configuration parameters.
    '''
    if mlu_envs.VLLM_LATENCY_DEBUG_EN:
        batch_size = self.metric.batch_size_list[-1] * dp_size
        if mm_kwargs or is_embedding_task:
            # Multimodal and pooling models don't support the HFU feature yet.
            hfu_info, io_efficiency = None, None
        else:
            hfu_info, io_efficiency = self.llm_engine.get_hfu_info(
                batch_size, input_len, output_len)
        self.metric.calc_metric(
            self.llm_engine.model_config.model,
            self.llm_engine.model_config.dtype,
            metrics_idx_start, only_average,
            input_len, output_len, tp_nums,
            quantization, show_per_iter,
            is_embedding_task, mm_kwargs, total_prefill_steps,
            num_speculative_tokens, dp_size=dp_size,
            hfu_info=hfu_info, io_efficiency=io_efficiency)
    else:
        print("Warning: please set VLLM_LATENCY_DEBUG=true!")


def vllm__entrypoints__llm__LLM___run_engine(
    self, *, use_tqdm: bool | Callable[..., tqdm] = True
) -> list[RequestOutput | PoolingRequestOutput]:
    # Initialize tqdm.
    if use_tqdm:
        num_requests = self.llm_engine.get_num_unfinished_requests()
        tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
        pbar = tqdm_func(
            total=num_requests,
            desc="Processed prompts",
            dynamic_ncols=True,
            postfix=(f"est. speed input: {0:.2f} toks/s, output: {0:.2f} toks/s"),
        )
    '''
    =============================
    Added by vllm_mlu
    =============================
    '''
    if mlu_envs.VLLM_LATENCY_DEBUG_EN:
        total_request_num = self.llm_engine.get_num_unfinished_requests()
        e2e_start_time = self.metric.get_mlu_cost_time()
        if not self.llm_engine.model_config.is_embedding_task():
            peak_memory, block_memory, num_gpu_blocks, num_cpu_blocks = \
                self.llm_engine.get_memory_usage()
            self.metric.update_memory_usage(peak_memory, block_memory,
                                            num_gpu_blocks, num_cpu_blocks)
    '''
    ==================
    End of addition
    ==================
    '''
    # Run the engine.
    outputs: list[RequestOutput | PoolingRequestOutput] = []
    total_in_toks = 0
    total_out_toks = 0
    while self.llm_engine.has_unfinished_requests():
        step_outputs = self.llm_engine.step()
        for output in step_outputs:
            if output.finished:
                outputs.append(output)
                if use_tqdm:
                    if isinstance(output, RequestOutput):
                        # Calculate tokens only for RequestOutput
                        n = len(output.outputs)
                        assert output.prompt_token_ids is not None
                        total_in_toks += len(output.prompt_token_ids) * n
                        in_spd = total_in_toks / pbar.format_dict["elapsed"]
                        total_out_toks += sum(
                            len(stp.token_ids) for stp in output.outputs
                        )
                        out_spd = total_out_toks / pbar.format_dict["elapsed"]
                        pbar.postfix = (
                            f"est. speed input: {in_spd:.2f} toks/s, "
                            f"output: {out_spd:.2f} toks/s"
                        )
                        pbar.update(n)
                    else:
                        pbar.update(1)
                    if pbar.n == num_requests:
                        pbar.refresh()

    if use_tqdm:
        pbar.close()
    '''
    =============================
    Added by vllm_mlu
    =============================
    '''
    if mlu_envs.VLLM_LATENCY_DEBUG_EN:
        e2e_end_time = self.metric.get_mlu_cost_time()
        e2e_latency = e2e_end_time - e2e_start_time
        engine_step_latency, model_forward_latency, mm_encoder_latency = \
            self.llm_engine.get_latency()
        self.metric.update_step_latency(engine_step_latency)
        if mlu_envs.VLLM_LATENCY_DEBUG_WITH_DEVICE_EN:
            self.metric.update_step_latency_device(model_forward_latency)
            self.metric.update_mm_encoder_latency_device(mm_encoder_latency)
        self.metric.add_metrics(total_request_num, e2e_latency)
    '''
    ==================
    End of addition
    ==================
    '''

    # Sort the outputs by request ID.
    # This is necessary because some requests may be finished earlier than
    # their preceding requests.
    return sorted(outputs, key=lambda x: int(x.request_id))
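

# Attach a shared metric collector and register the hijacks: get_mlu_metrics
# is added to LLM as a new method, while the stock _run_engine is replaced by
# the instrumented version defined above.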
LLM.metric = LLMMetric()
MluHijackObject.apply_hijack(LLM,
                             "get_mlu_metrics",
                             vllm__entrypoints__llm__LLM__get_mlu_metrics)
MluHijackObject.apply_hijack(LLM,
                             LLM._run_engine,
                             vllm__entrypoints__llm__LLM___run_engine)