import time
from tqdm import tqdm
from typing import Optional, List, Union, Dict, Any
from vllm.entrypoints.llm import LLM
from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
                                   TaskOption)
from vllm.engine.llm_engine import LLMEngine
from vllm.outputs import EmbeddingRequestOutput, RequestOutput
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Counter, deprecate_args
from vllm_mlu._mlu_utils import VLLM_LATENCY_DEBUG_EN, VLLM_LATENCY_DEBUG_WITH_DEVICE_EN
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm_mlu.mlu_metric import LLMMetric
from vllm_mlu.dump_info import LLMDumpInfo
from vllm.logger import init_logger


logger = init_logger(__name__)
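
# This module hijacks vLLM's ``LLM`` entrypoint class for Cambricon MLU:
# ``LLM.__init__`` and ``LLM._run_engine`` are replaced and a ``get_metrics``
# method is attached (see the MluHijackObject.apply_hijack calls at the bottom
# of this file) so that latency and memory statistics can be collected.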


@deprecate_args(
    start_index=2,  # Ignore self and model
    is_deprecated=lambda: LLM.DEPRECATE_INIT_POSARGS,
    additional_message=(
        "All positional arguments other than `model` will be "
        "replaced with keyword arguments in an upcoming version."),
)
def vllm__entrypoints__llm__LLM____init__(
    self,
    model: str,
    tokenizer: Optional[str] = None,
    tokenizer_mode: str = "auto",
    skip_tokenizer_init: bool = False,
    trust_remote_code: bool = False,
    allowed_local_media_path: str = "",
    tensor_parallel_size: int = 1,
    dtype: str = "auto",
    quantization: Optional[str] = None,
    revision: Optional[str] = None,
    tokenizer_revision: Optional[str] = None,
    seed: int = 0,
    gpu_memory_utilization: float = 0.9,
    swap_space: float = 4,
    cpu_offload_gb: float = 0,
    enforce_eager: Optional[bool] = None,
    max_seq_len_to_capture: int = 8192,
    disable_custom_all_reduce: bool = False,
    disable_async_output_proc: bool = False,
    hf_overrides: Optional[HfOverrides] = None,
    mm_processor_kwargs: Optional[Dict[str, Any]] = None,
    # After positional args are removed, move this right below `model`
    task: TaskOption = "auto",
    override_pooler_config: Optional[PoolerConfig] = None,
    **kwargs,
) -> None:
    '''
    LLM constructor.

    Note: if enforce_eager is unset (enforce_eager is None)
    it defaults to False.
    '''

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: 1) Initialize LLMDumpInfo
            2) Initialize context mlugraph params
    '''
    LLM.dump_info.init_param(
        tensor_parallel_size=tensor_parallel_size, dtype=dtype,
        kv_cache_dtype=kwargs.get('kv_cache_dtype', 'default_value'),
        quantization=quantization,
        model=model, trust_remote_code=trust_remote_code)

    enable_context_mlugraph = kwargs.pop("enable_context_mlugraph", False)
    context_batch_size_to_capture = kwargs.pop("context_batch_size_to_capture", None)
    context_seq_len_to_capture = kwargs.pop("context_seq_len_to_capture", None)
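    # These MLU-graph capture options are popped from kwargs before EngineArgs
    # is constructed (presumably because the upstream EngineArgs does not
    # accept them); they are re-attached to engine_args via setattr() below.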
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

if "disable_log_stats" not in kwargs:
|
||
kwargs["disable_log_stats"] = True
|
||
|
||
engine_args = EngineArgs(
|
||
model=model,
|
||
task=task,
|
||
tokenizer=tokenizer,
|
||
tokenizer_mode=tokenizer_mode,
|
||
skip_tokenizer_init=skip_tokenizer_init,
|
||
trust_remote_code=trust_remote_code,
|
||
allowed_local_media_path=allowed_local_media_path,
|
||
tensor_parallel_size=tensor_parallel_size,
|
||
dtype=dtype,
|
||
quantization=quantization,
|
||
revision=revision,
|
||
tokenizer_revision=tokenizer_revision,
|
||
seed=seed,
|
||
gpu_memory_utilization=gpu_memory_utilization,
|
||
swap_space=swap_space,
|
||
cpu_offload_gb=cpu_offload_gb,
|
||
enforce_eager=enforce_eager,
|
||
max_seq_len_to_capture=max_seq_len_to_capture,
|
||
disable_custom_all_reduce=disable_custom_all_reduce,
|
||
disable_async_output_proc=disable_async_output_proc,
|
||
hf_overrides=hf_overrides,
|
||
mm_processor_kwargs=mm_processor_kwargs,
|
||
override_pooler_config=override_pooler_config,
|
||
**kwargs,
|
||
)
|
||
|
||
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: set context mlugraph params for EngineArgs
    '''
    setattr(engine_args, "enable_context_mlugraph", enable_context_mlugraph)
    setattr(engine_args, "context_batch_size_to_capture", context_batch_size_to_capture)
    setattr(engine_args, "context_seq_len_to_capture", context_seq_len_to_capture)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    # Logic to switch between engines is done at runtime instead of import
    # to avoid import order issues
    self.engine_class = self.get_engine_class()

    # TODO(rob): enable mp by default (issue with fork vs spawn)
    self.llm_engine = self.engine_class.from_engine_args(
        engine_args, usage_context=UsageContext.LLM_CLASS)

    self.request_counter = Counter()

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: Record host memory / CPU info via LLMDumpInfo
    '''
    LLM.dump_info.memory_usage()
    '''
    ==================
    End of MLU Hijack
    ==================
    '''


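# ``get_metrics`` is not part of the upstream vLLM ``LLM`` class; it is
# attached as a new method by the MluHijackObject.apply_hijack(LLM,
# "get_metrics", ...) call at the bottom of this file.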
def vllm__entrypoints__llm__LLM__get_metrics(
    self,
    metrics_idx_start,
    only_average,
    input_len,
    output_len,
    tp_nums,
    quantization,
    dump_info=None,
    show_per_iter=False,
) -> None:
    '''
    @brief: Print the performance metrics collected while vLLM runs the
        generate() interface.
    @params:
        metrics_idx_start: Some generate() calls may be warmup runs, so this
            parameter ignores the data collected in [0, metrics_idx_start).
            Defaults to 0, i.e. all performance data is kept.
        only_average: True prints only the average performance over the N
            generate() calls; False prints the performance of every call as
            well as the average. If the N measurements fluctuate heavily,
            check whether the test environment is stable.
        Remaining parameters: model configuration parameters.
    '''
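    # Illustrative call (argument values are placeholders): after one warmup
    # generate() plus N measured generate() calls with VLLM_LATENCY_DEBUG=true,
    # skip the warmup iteration and report only the averages:
    #   llm.get_metrics(metrics_idx_start=1, only_average=True,
    #                   input_len=128, output_len=128, tp_nums=1,
    #                   quantization=None)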
    if VLLM_LATENCY_DEBUG_EN:
        self.metric.calc_metric(self.llm_engine.model_config.model,
                                self.llm_engine.model_config.dtype,
                                metrics_idx_start, only_average,
                                input_len, output_len, tp_nums,
                                quantization, dump_info, show_per_iter)
    else:
        print("Warning: please set VLLM_LATENCY_DEBUG=true!")


def vllm__entrypoints__llm__LLM___run_engine(
    self, *, use_tqdm: bool
) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
    # Initialize tqdm.
    if use_tqdm:
        num_requests = self.llm_engine.get_num_unfinished_requests()
        pbar = tqdm(
            total=num_requests,
            desc="Processed prompts",
            dynamic_ncols=True,
            postfix=(f"est. speed input: {0:.2f} toks/s, "
                     f"output: {0:.2f} toks/s"),
        )

    '''
    =============================
    Added by vllm_mlu
    =============================
    '''
    is_latency_debug = VLLM_LATENCY_DEBUG_EN
    # Record start
    if is_latency_debug:
        total_request_num = self.llm_engine.get_num_unfinished_requests()
        self.dump_info.capture_cpu_info()
        peak_memory, block_memory, num_total_gpu_blocks, num_total_cpu_blocks = \
            self.llm_engine.get_memory_usage()
        self.metric.update_memory_usage(peak_memory, block_memory, num_total_gpu_blocks, num_total_cpu_blocks)
        e2e_start_time = self.metric.get_mlu_cost_time()
    '''
    ==================
    End of addition
    ==================
    '''

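    # When VLLM_LATENCY_DEBUG is enabled, every engine step below is timed and
    # the free KV-cache block counts are sampled after each step, so that
    # per-iteration latency and memory usage can later be reported through
    # get_metrics().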
    # Run the engine.
    outputs: List[Union[RequestOutput, EmbeddingRequestOutput]] = []
    total_in_toks = 0
    total_out_toks = 0
    while self.llm_engine.has_unfinished_requests():
        '''
        =============================
        Added by vllm_mlu
        =============================
        '''
        if is_latency_debug:
            self.dump_info.memory_usage()
            start_time = self.metric.get_mlu_cost_time()
        '''
        ==================
        End of addition
        ==================
        '''
        step_outputs = self.llm_engine.step()
        '''
        =============================
        Added by vllm_mlu
        =============================
        '''
        if is_latency_debug:
            end_time = self.metric.get_mlu_cost_time()
            step_latency = end_time - start_time
            if len(step_outputs) > 0:
                batch_size = len(step_outputs)
                assert batch_size == total_request_num, \
                    f"LLM has received {total_request_num} requests, but only processed {batch_size} requests in the current step.\n" + \
                    f"If you are running benchmark_latency test, please check if the input is correct.\n" + \
                    f"Otherwise, please set env VLLM_LATENCY_DEBUG=false, then run test again.\n"
                num_free_gpu_blocks, num_free_cpu_blocks = self.llm_engine.get_block_usage()
                self.metric.update_step_block_usage(num_free_gpu_blocks, num_free_cpu_blocks)
            self.metric.update_step_latency(step_latency)
            if VLLM_LATENCY_DEBUG_WITH_DEVICE_EN:
                self.metric.update_step_latency_device(self.llm_engine.get_latency())
            self.dump_info.memory_usage()
        '''
        ==================
        End of addition
        ==================
        '''
        for output in step_outputs:
            if output.finished:
                outputs.append(output)
                if use_tqdm:
                    if isinstance(output, RequestOutput):
                        # Calculate tokens only for RequestOutput
                        assert output.prompt_token_ids is not None
                        total_in_toks += len(output.prompt_token_ids)
                        in_spd = total_in_toks / pbar.format_dict["elapsed"]
                        total_out_toks += sum(
                            len(stp.token_ids) for stp in output.outputs)
                        out_spd = (total_out_toks /
                                   pbar.format_dict["elapsed"])
                        pbar.postfix = (
                            f"est. speed input: {in_spd:.2f} toks/s, "
                            f"output: {out_spd:.2f} toks/s")
                    pbar.update(1)
    '''
    =============================
    Added by vllm_mlu
    =============================
    '''
    if is_latency_debug:
        e2e_end_time = self.metric.get_mlu_cost_time()
        e2e_latency = e2e_end_time - e2e_start_time
        self.metric.add_metrics(batch_size, e2e_latency)
    '''
    ==================
    End of addition
    ==================
    '''

    if use_tqdm:
        pbar.close()
    # Sort the outputs by request ID.
    # This is necessary because some requests may be finished earlier than
    # its previous requests.
    return sorted(outputs, key=lambda x: int(x.request_id))


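# Attach the metric and dump-info collectors as class-level attributes so the
# hijacked methods above can reach them via ``LLM.metric`` / ``self.metric``
# and ``LLM.dump_info`` / ``self.dump_info``.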
LLM.metric = LLMMetric()

LLM.dump_info = LLMDumpInfo()

MluHijackObject.apply_hijack(LLM,
                             LLM.__init__,
                             vllm__entrypoints__llm__LLM____init__)
MluHijackObject.apply_hijack(LLM,
                             "get_metrics",
                             vllm__entrypoints__llm__LLM__get_metrics)
MluHijackObject.apply_hijack(LLM,
                             LLM._run_engine,
                             vllm__entrypoints__llm__LLM___run_engine)
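
# Minimal usage sketch (illustrative only; assumes this module is imported by
# the vllm_mlu plugin so the hijacks above are already applied, and that the
# model path and shapes are placeholders). VLLM_LATENCY_DEBUG must be set
# before the modules are imported:
#
#   import os
#   os.environ["VLLM_LATENCY_DEBUG"] = "true"
#   from vllm import LLM, SamplingParams
#   llm = LLM(model="/path/to/model", tensor_parallel_size=1)
#   outputs = llm.generate(["Hello"], SamplingParams(max_tokens=128))
#   llm.get_metrics(metrics_idx_start=0, only_average=True,
#                   input_len=8, output_len=128, tp_nums=1, quantization=None)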