# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

from tqdm import tqdm
from typing import Callable

from vllm.entrypoints.llm import LLM
from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.logger import init_logger

import vllm_mlu._mlu_utils as mlu_envs
from vllm_mlu.mlu_metric import LLMMetric
from vllm_mlu.mlu_hijack_utils import MluHijackObject


logger = init_logger(__name__)


def vllm__entrypoints__llm__LLM__get_mlu_metrics(
    self,
    metrics_idx_start,
    only_average,
    input_len,
    output_len,
    tp_nums,
    quantization,
    show_per_iter=False,
    is_embedding_task=False,
    mm_kwargs=None,
    total_prefill_steps=1,
    num_speculative_tokens=0,
    dp_size=1,
) -> None:
    '''
    @brief: Prints the performance metrics collected while vLLM runs the
        generate interface. See the usage sketch after this function.
    @params:
        metrics_idx_start: A generate call may belong to a warmup phase, so
            this parameter skips the metrics recorded in
            [0, metrics_idx_start). Defaults to 0, i.e. all metrics are kept.
        only_average: True prints only the average performance over the N
            generate calls; False prints each call's performance as well as
            the average. If the N measurements fluctuate heavily, check
            whether the test environment is stable.
        Remaining parameters: model configuration values.
    '''
    if mlu_envs.VLLM_LATENCY_DEBUG_EN:
        batch_size = self.metric.batch_size_list[-1] * dp_size
        if mm_kwargs or is_embedding_task:
            # Multimodal and pooling models don't support the HFU feature yet.
            hfu_info, io_efficiency = None, None
        else:
            hfu_info, io_efficiency = self.llm_engine.get_hfu_info(
                batch_size, input_len, output_len)
        self.metric.calc_metric(
            self.llm_engine.model_config.model,
            self.llm_engine.model_config.dtype,
            metrics_idx_start, only_average,
            input_len, output_len, tp_nums,
            quantization, show_per_iter,
            is_embedding_task, mm_kwargs, total_prefill_steps,
            num_speculative_tokens, dp_size=dp_size, hfu_info=hfu_info,
            io_efficiency=io_efficiency)
    else:
        print("Warning: please set VLLM_LATENCY_DEBUG=true!")


def vllm__entrypoints__llm__LLM___run_engine(
    self, *, use_tqdm: bool | Callable[..., tqdm] = True
) -> list[RequestOutput | PoolingRequestOutput]:
    # Initialize tqdm.
    if use_tqdm:
        num_requests = self.llm_engine.get_num_unfinished_requests()
        tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
        pbar = tqdm_func(
            total=num_requests,
            desc="Processed prompts",
            dynamic_ncols=True,
            postfix=(f"est. speed input: {0:.2f} toks/s, output: {0:.2f} toks/s"),
        )

    '''
    =============================
    Added by vllm_mlu
    =============================
    '''
    if mlu_envs.VLLM_LATENCY_DEBUG_EN:
        total_request_num = self.llm_engine.get_num_unfinished_requests()
        e2e_start_time = self.metric.get_mlu_cost_time()
        if not self.llm_engine.model_config.is_embedding_task():
            peak_memory, block_memory, num_gpu_blocks, num_cpu_blocks = \
                self.llm_engine.get_memory_usage()
            self.metric.update_memory_usage(peak_memory, block_memory,
                                            num_gpu_blocks, num_cpu_blocks)
    '''
    ==================
    End of addition
    ==================
    '''

    # Run the engine.
    outputs: list[RequestOutput | PoolingRequestOutput] = []
    total_in_toks = 0
    total_out_toks = 0
    while self.llm_engine.has_unfinished_requests():
        step_outputs = self.llm_engine.step()
        for output in step_outputs:
            if output.finished:
                outputs.append(output)
                if use_tqdm:
                    if isinstance(output, RequestOutput):
                        # Calculate tokens only for RequestOutput
                        n = len(output.outputs)
                        assert output.prompt_token_ids is not None
                        total_in_toks += len(output.prompt_token_ids) * n
                        in_spd = total_in_toks / pbar.format_dict["elapsed"]
                        total_out_toks += sum(
                            len(stp.token_ids) for stp in output.outputs
                        )
                        out_spd = total_out_toks / pbar.format_dict["elapsed"]
                        pbar.postfix = (
                            f"est. speed input: {in_spd:.2f} toks/s, "
                            f"output: {out_spd:.2f} toks/s"
                        )
                        pbar.update(n)
                    else:
                        pbar.update(1)
                    if pbar.n == num_requests:
                        pbar.refresh()

    if use_tqdm:
        pbar.close()
    '''
    =============================
    Added by vllm_mlu
    =============================
    '''
    if mlu_envs.VLLM_LATENCY_DEBUG_EN:
        e2e_end_time = self.metric.get_mlu_cost_time()
        e2e_latency = e2e_end_time - e2e_start_time

        engine_step_latency, model_forward_latency, mm_encoder_latency = \
            self.llm_engine.get_latency()
        self.metric.update_step_latency(engine_step_latency)
        if mlu_envs.VLLM_LATENCY_DEBUG_WITH_DEVICE_EN:
            self.metric.update_step_latency_device(model_forward_latency)
            self.metric.update_mm_encoder_latency_device(mm_encoder_latency)

        self.metric.add_metrics(total_request_num, e2e_latency)
    '''
    ==================
    End of addition
    ==================
    '''
    # Sort the outputs by request ID.
    # This is necessary because some requests may finish earlier than
    # their predecessors.
    return sorted(outputs, key=lambda x: int(x.request_id))


LLM.metric = LLMMetric()
MluHijackObject.apply_hijack(LLM,
                             "get_mlu_metrics",
                             vllm__entrypoints__llm__LLM__get_mlu_metrics)
MluHijackObject.apply_hijack(LLM,
                             LLM._run_engine,
                             vllm__entrypoints__llm__LLM___run_engine)
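
# A rough mental model of the two calls above (an assumption about
# MluHijackObject, not its actual implementation): apply_hijack rebinds the
# target attribute on LLM, roughly equivalent to
#
#     LLM.get_mlu_metrics = vllm__entrypoints__llm__LLM__get_mlu_metrics
#     LLM._run_engine = vllm__entrypoints__llm__LLM___run_engine
#
# so user code calling llm.generate(...) transparently runs the
# MLU-instrumented _run_engine defined in this module.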