# enginex-mlu370-vllm/vllm-v0.6.2/vllm_mlu/vllm_mlu/entrypoints/llm.py

import time
from tqdm import tqdm
from typing import Optional, List, Union, Dict, Any

from vllm.entrypoints.llm import LLM
from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
                                   TaskOption)
from vllm.engine.llm_engine import LLMEngine
from vllm.outputs import EmbeddingRequestOutput, RequestOutput
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Counter, deprecate_args
from vllm.logger import init_logger

from vllm_mlu._mlu_utils import (VLLM_LATENCY_DEBUG_EN,
                                 VLLM_LATENCY_DEBUG_WITH_DEVICE_EN)
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm_mlu.mlu_metric import LLMMetric
from vllm_mlu.dump_info import LLMDumpInfo

logger = init_logger(__name__)


@deprecate_args(
    start_index=2,  # Ignore self and model
    is_deprecated=lambda: LLM.DEPRECATE_INIT_POSARGS,
    additional_message=(
        "All positional arguments other than `model` will be "
        "replaced with keyword arguments in an upcoming version."),
)
def vllm__entrypoints__llm__LLM____init__(
    self,
    model: str,
    tokenizer: Optional[str] = None,
    tokenizer_mode: str = "auto",
    skip_tokenizer_init: bool = False,
    trust_remote_code: bool = False,
    allowed_local_media_path: str = "",
    tensor_parallel_size: int = 1,
    dtype: str = "auto",
    quantization: Optional[str] = None,
    revision: Optional[str] = None,
    tokenizer_revision: Optional[str] = None,
    seed: int = 0,
    gpu_memory_utilization: float = 0.9,
    swap_space: float = 4,
    cpu_offload_gb: float = 0,
    enforce_eager: Optional[bool] = None,
    max_seq_len_to_capture: int = 8192,
    disable_custom_all_reduce: bool = False,
    disable_async_output_proc: bool = False,
    hf_overrides: Optional[HfOverrides] = None,
    mm_processor_kwargs: Optional[Dict[str, Any]] = None,
    # After positional args are removed, move this right below `model`
    task: TaskOption = "auto",
    override_pooler_config: Optional[PoolerConfig] = None,
    **kwargs,
) -> None:
    '''
    LLM constructor.

    Note: if enforce_eager is unset (enforce_eager is None),
    it defaults to False.
    '''
    '''
    =============================
    Modified by vllm_mlu
    =============================
    @brief: 1) Initialize LLMDumpInfo
            2) Initialize context mlugraph params
    '''
    LLM.dump_info.init_param(
        tensor_parallel_size=tensor_parallel_size,
        dtype=dtype,
        kv_cache_dtype=kwargs.get('kv_cache_dtype', 'default_value'),
        quantization=quantization,
        model=model,
        # `trust_remote_code` is an explicit parameter of this function, so
        # it can never appear in **kwargs; read it directly.
        trust_remote_code=trust_remote_code)
    enable_context_mlugraph = kwargs.pop("enable_context_mlugraph", False)
    context_batch_size_to_capture = kwargs.pop(
        "context_batch_size_to_capture", None)
    context_seq_len_to_capture = kwargs.pop(
        "context_seq_len_to_capture", None)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
if "disable_log_stats" not in kwargs:
kwargs["disable_log_stats"] = True
engine_args = EngineArgs(
model=model,
task=task,
tokenizer=tokenizer,
tokenizer_mode=tokenizer_mode,
skip_tokenizer_init=skip_tokenizer_init,
trust_remote_code=trust_remote_code,
allowed_local_media_path=allowed_local_media_path,
tensor_parallel_size=tensor_parallel_size,
dtype=dtype,
quantization=quantization,
revision=revision,
tokenizer_revision=tokenizer_revision,
seed=seed,
gpu_memory_utilization=gpu_memory_utilization,
swap_space=swap_space,
cpu_offload_gb=cpu_offload_gb,
enforce_eager=enforce_eager,
max_seq_len_to_capture=max_seq_len_to_capture,
disable_custom_all_reduce=disable_custom_all_reduce,
disable_async_output_proc=disable_async_output_proc,
hf_overrides=hf_overrides,
mm_processor_kwargs=mm_processor_kwargs,
override_pooler_config=override_pooler_config,
**kwargs,
)
    '''
    =============================
    Modified by vllm_mlu
    =============================
    @brief: set the context mlugraph params on EngineArgs
    '''
    setattr(engine_args, "enable_context_mlugraph", enable_context_mlugraph)
    setattr(engine_args, "context_batch_size_to_capture",
            context_batch_size_to_capture)
    setattr(engine_args, "context_seq_len_to_capture",
            context_seq_len_to_capture)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    # Logic to switch between engines is done at runtime instead of import
    # to avoid import order issues.
    self.engine_class = self.get_engine_class()

    # TODO(rob): enable mp by default (issue with fork vs spawn)
    self.llm_engine = self.engine_class.from_engine_args(
        engine_args, usage_context=UsageContext.LLM_CLASS)
    self.request_counter = Counter()
    '''
    =============================
    Modified by vllm_mlu
    =============================
    @brief: record CPU memory usage in the dump info for vllm
    '''
    LLM.dump_info.memory_usage()
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
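

# ---------------------------------------------------------------------------
# Usage sketch for the MLU-specific constructor kwargs popped above. This is
# illustrative only: the model path and capture sizes are hypothetical, and
# the expected types of the capture parameters are assumptions, not taken
# from this file.
#
#     from vllm.entrypoints.llm import LLM
#
#     llm = LLM(
#         model="/path/to/model",                # hypothetical path
#         tensor_parallel_size=2,
#         enable_context_mlugraph=True,          # consumed by this hijack
#         context_batch_size_to_capture=[1, 4],  # assumed to be a list
#         context_seq_len_to_capture=4096,       # assumed to be an int
#     )
# ---------------------------------------------------------------------------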


def vllm__entrypoints__llm__LLM__get_metrics(
    self,
    metrics_idx_start,
    only_average,
    input_len,
    output_len,
    tp_nums,
    quantization,
    dump_info=None,
    show_per_iter=False,
) -> None:
    '''
    @brief: Print the performance metrics collected while vLLM executes the
            generate interface.
    @params:
        metrics_idx_start: some generate calls may be warmup runs, so this
            parameter discards the data collected in [0, metrics_idx_start).
            Defaults to 0, i.e. all performance data is kept.
        only_average: True prints only the average performance across the N
            generate calls; False prints the performance of each call plus
            the average. If the N measurements fluctuate heavily, check
            whether the test environment is stable.
        remaining parameters: model configuration parameters.
    '''
    if VLLM_LATENCY_DEBUG_EN:
        self.metric.calc_metric(self.llm_engine.model_config.model,
                                self.llm_engine.model_config.dtype,
                                metrics_idx_start, only_average,
                                input_len, output_len, tp_nums,
                                quantization, dump_info, show_per_iter)
    else:
        print("Warning: please set VLLM_LATENCY_DEBUG=true!")


def vllm__entrypoints__llm__LLM___run_engine(
    self, *, use_tqdm: bool
) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
    # Initialize tqdm.
    if use_tqdm:
        num_requests = self.llm_engine.get_num_unfinished_requests()
        pbar = tqdm(
            total=num_requests,
            desc="Processed prompts",
            dynamic_ncols=True,
            postfix=(f"est. speed input: {0:.2f} toks/s, "
                     f"output: {0:.2f} toks/s"),
        )
    '''
    =============================
    Added by vllm_mlu
    =============================
    '''
    is_latency_debug = VLLM_LATENCY_DEBUG_EN
    # Record the starting state for the latency-debug metrics.
    if is_latency_debug:
        total_request_num = self.llm_engine.get_num_unfinished_requests()
        self.dump_info.capture_cpu_info()
        peak_memory, block_memory, num_total_gpu_blocks, num_total_cpu_blocks = \
            self.llm_engine.get_memory_usage()
        self.metric.update_memory_usage(peak_memory, block_memory,
                                        num_total_gpu_blocks,
                                        num_total_cpu_blocks)
        e2e_start_time = self.metric.get_mlu_cost_time()
    '''
    ==================
    End of addition
    ==================
    '''
    # Run the engine.
    outputs: List[Union[RequestOutput, EmbeddingRequestOutput]] = []
    total_in_toks = 0
    total_out_toks = 0
    while self.llm_engine.has_unfinished_requests():
        '''
        =============================
        Added by vllm_mlu
        =============================
        '''
        if is_latency_debug:
            self.dump_info.memory_usage()
            start_time = self.metric.get_mlu_cost_time()
        '''
        ==================
        End of addition
        ==================
        '''
        step_outputs = self.llm_engine.step()
        '''
        =============================
        Added by vllm_mlu
        =============================
        '''
        if is_latency_debug:
            end_time = self.metric.get_mlu_cost_time()
            step_latency = end_time - start_time
            if len(step_outputs) > 0:
                batch_size = len(step_outputs)
                assert batch_size == total_request_num, (
                    f"LLM has received {total_request_num} requests, but "
                    f"only processed {batch_size} requests in the current "
                    "step.\n"
                    "If you are running the benchmark_latency test, please "
                    "check whether the input is correct.\n"
                    "Otherwise, please set env VLLM_LATENCY_DEBUG=false, "
                    "then run the test again.\n")
            num_free_gpu_blocks, num_free_cpu_blocks = \
                self.llm_engine.get_block_usage()
            self.metric.update_step_block_usage(num_free_gpu_blocks,
                                                num_free_cpu_blocks)
            self.metric.update_step_latency(step_latency)
            if VLLM_LATENCY_DEBUG_WITH_DEVICE_EN:
                self.metric.update_step_latency_device(
                    self.llm_engine.get_latency())
            self.dump_info.memory_usage()
        '''
        ==================
        End of addition
        ==================
        '''
        for output in step_outputs:
            if output.finished:
                outputs.append(output)
                if use_tqdm:
                    if isinstance(output, RequestOutput):
                        # Calculate tokens only for RequestOutput
                        assert output.prompt_token_ids is not None
                        total_in_toks += len(output.prompt_token_ids)
                        in_spd = total_in_toks / pbar.format_dict["elapsed"]
                        total_out_toks += sum(
                            len(stp.token_ids) for stp in output.outputs)
                        out_spd = (total_out_toks /
                                   pbar.format_dict["elapsed"])
                        pbar.postfix = (
                            f"est. speed input: {in_spd:.2f} toks/s, "
                            f"output: {out_spd:.2f} toks/s")
                    pbar.update(1)
    '''
    =============================
    Added by vllm_mlu
    =============================
    '''
    if is_latency_debug:
        e2e_end_time = self.metric.get_mlu_cost_time()
        e2e_latency = e2e_end_time - e2e_start_time
        self.metric.add_metrics(batch_size, e2e_latency)
    '''
    ==================
    End of addition
    ==================
    '''
    if use_tqdm:
        pbar.close()
    # Sort the outputs by request ID. This is necessary because some
    # requests may finish earlier than requests submitted before them.
    return sorted(outputs, key=lambda x: int(x.request_id))


LLM.metric = LLMMetric()
LLM.dump_info = LLMDumpInfo()

MluHijackObject.apply_hijack(LLM,
                             LLM.__init__,
                             vllm__entrypoints__llm__LLM____init__)
MluHijackObject.apply_hijack(LLM,
                             "get_metrics",
                             vllm__entrypoints__llm__LLM__get_metrics)
MluHijackObject.apply_hijack(LLM,
                             LLM._run_engine,
                             vllm__entrypoints__llm__LLM___run_engine)
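

# How the hijack takes effect: importing this module rebinds LLM.__init__,
# LLM.get_metrics, and LLM._run_engine via MluHijackObject.apply_hijack, so
# downstream code keeps using the stock vLLM API. A minimal sketch, assuming
# the vllm_mlu package imports this module on load:
#
#     import vllm_mlu                     # applies the hijacks above
#     from vllm.entrypoints.llm import LLM
#
#     llm = LLM(model="/path/to/model")   # hypothetical path; runs the
#                                         # MLU-patched __init__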