# enginex-mlu370-vllm/vllm-v0.6.2/vllm_mlu/vllm_mlu/entrypoints/llm.py

import time
from tqdm import tqdm
from typing import Optional, List, Union, Dict, Any

from vllm.entrypoints.llm import LLM
from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
                                   TaskOption)
from vllm.engine.llm_engine import LLMEngine
from vllm.outputs import EmbeddingRequestOutput, RequestOutput
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Counter, deprecate_args
from vllm.logger import init_logger

from vllm_mlu._mlu_utils import (VLLM_LATENCY_DEBUG_EN,
                                 VLLM_LATENCY_DEBUG_WITH_DEVICE_EN)
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm_mlu.mlu_metric import LLMMetric
from vllm_mlu.dump_info import LLMDumpInfo

logger = init_logger(__name__)


@deprecate_args(
    start_index=2,  # Ignore self and model
    is_deprecated=lambda: LLM.DEPRECATE_INIT_POSARGS,
    additional_message=(
        "All positional arguments other than `model` will be "
        "replaced with keyword arguments in an upcoming version."),
)
def vllm__entrypoints__llm__LLM____init__(
    self,
    model: str,
    tokenizer: Optional[str] = None,
    tokenizer_mode: str = "auto",
    skip_tokenizer_init: bool = False,
    trust_remote_code: bool = False,
    allowed_local_media_path: str = "",
    tensor_parallel_size: int = 1,
    dtype: str = "auto",
    quantization: Optional[str] = None,
    revision: Optional[str] = None,
    tokenizer_revision: Optional[str] = None,
    seed: int = 0,
    gpu_memory_utilization: float = 0.9,
    swap_space: float = 4,
    cpu_offload_gb: float = 0,
    enforce_eager: Optional[bool] = None,
    max_seq_len_to_capture: int = 8192,
    disable_custom_all_reduce: bool = False,
    disable_async_output_proc: bool = False,
    hf_overrides: Optional[HfOverrides] = None,
    mm_processor_kwargs: Optional[Dict[str, Any]] = None,
    # After positional args are removed, move this right below `model`
    task: TaskOption = "auto",
    override_pooler_config: Optional[PoolerConfig] = None,
    **kwargs,
) -> None:
    '''
    LLM constructor.

    Note: if enforce_eager is unset (enforce_eager is None),
    it defaults to False.
    '''
    '''
    =============================
    Modified by vllm_mlu
    =============================
    @brief: 1) Initialize LLMDumpInfo
            2) Initialize context mlugraph params
    '''
    LLM.dump_info.init_param(
        tensor_parallel_size=tensor_parallel_size,
        dtype=dtype,
        kv_cache_dtype=kwargs.get('kv_cache_dtype', 'default_value'),
        quantization=quantization,
        model=model,
        # `trust_remote_code` is an explicit parameter of this function, so
        # it can never appear in **kwargs; read it directly.
        trust_remote_code=trust_remote_code)
    enable_context_mlugraph = kwargs.pop("enable_context_mlugraph", False)
    context_batch_size_to_capture = kwargs.pop(
        "context_batch_size_to_capture", None)
    context_seq_len_to_capture = kwargs.pop(
        "context_seq_len_to_capture", None)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
if "disable_log_stats" not in kwargs:
kwargs["disable_log_stats"] = True
engine_args = EngineArgs(
model=model,
task=task,
tokenizer=tokenizer,
tokenizer_mode=tokenizer_mode,
skip_tokenizer_init=skip_tokenizer_init,
trust_remote_code=trust_remote_code,
allowed_local_media_path=allowed_local_media_path,
tensor_parallel_size=tensor_parallel_size,
dtype=dtype,
quantization=quantization,
revision=revision,
tokenizer_revision=tokenizer_revision,
seed=seed,
gpu_memory_utilization=gpu_memory_utilization,
swap_space=swap_space,
cpu_offload_gb=cpu_offload_gb,
enforce_eager=enforce_eager,
max_seq_len_to_capture=max_seq_len_to_capture,
disable_custom_all_reduce=disable_custom_all_reduce,
disable_async_output_proc=disable_async_output_proc,
hf_overrides=hf_overrides,
mm_processor_kwargs=mm_processor_kwargs,
override_pooler_config=override_pooler_config,
**kwargs,
)
    '''
    =============================
    Modified by vllm_mlu
    =============================
    @brief: set the context mlugraph params on EngineArgs
    '''
    setattr(engine_args, "enable_context_mlugraph", enable_context_mlugraph)
    setattr(engine_args, "context_batch_size_to_capture",
            context_batch_size_to_capture)
    setattr(engine_args, "context_seq_len_to_capture",
            context_seq_len_to_capture)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    # Logic to switch between engines is done at runtime instead of import
    # to avoid import order issues.
    self.engine_class = self.get_engine_class()

    # TODO(rob): enable mp by default (issue with fork vs spawn)
    self.llm_engine = self.engine_class.from_engine_args(
        engine_args, usage_context=UsageContext.LLM_CLASS)
    self.request_counter = Counter()
    '''
    =============================
    Modified by vllm_mlu
    =============================
    @brief: record CPU memory usage in the dump info for vllm
    '''
    LLM.dump_info.memory_usage()
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
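

# ---------------------------------------------------------------------------
# Usage sketch for the MLU-specific constructor kwargs popped above. This is
# illustrative only: the model path and capture sizes are hypothetical, and
# the expected types of the capture parameters are assumptions, not taken
# from this file.
#
#     from vllm.entrypoints.llm import LLM
#
#     llm = LLM(
#         model="/path/to/model",                # hypothetical path
#         tensor_parallel_size=2,
#         enable_context_mlugraph=True,          # consumed by this hijack
#         context_batch_size_to_capture=[1, 4],  # assumed to be a list
#         context_seq_len_to_capture=4096,       # assumed to be an int
#     )
# ---------------------------------------------------------------------------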


def vllm__entrypoints__llm__LLM__get_metrics(
    self,
    metrics_idx_start,
    only_average,
    input_len,
    output_len,
    tp_nums,
    quantization,
    dump_info=None,
    show_per_iter=False,
) -> None:
    '''
    @brief: Print the performance metrics collected while vLLM executes the
            generate interface.
    @params:
        metrics_idx_start: some generate calls may be warmup runs, so this
            parameter discards the data collected in [0, metrics_idx_start).
            Defaults to 0, i.e. all performance data is kept.
        only_average: True prints only the average performance across the N
            generate calls; False prints the performance of each call plus
            the average. If the N measurements fluctuate heavily, check
            whether the test environment is stable.
        remaining parameters: model configuration parameters.
    '''
    if VLLM_LATENCY_DEBUG_EN:
        self.metric.calc_metric(self.llm_engine.model_config.model,
                                self.llm_engine.model_config.dtype,
                                metrics_idx_start, only_average,
                                input_len, output_len, tp_nums,
                                quantization, dump_info, show_per_iter)
    else:
        print("Warning: please set VLLM_LATENCY_DEBUG=true!")


def vllm__entrypoints__llm__LLM___run_engine(
    self, *, use_tqdm: bool
) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
    # Initialize tqdm.
    if use_tqdm:
        num_requests = self.llm_engine.get_num_unfinished_requests()
        pbar = tqdm(
            total=num_requests,
            desc="Processed prompts",
            dynamic_ncols=True,
            postfix=(f"est. speed input: {0:.2f} toks/s, "
                     f"output: {0:.2f} toks/s"),
        )
    '''
    =============================
    Added by vllm_mlu
    =============================
    '''
    is_latency_debug = VLLM_LATENCY_DEBUG_EN
    # Record the starting state for the latency-debug metrics.
    if is_latency_debug:
        total_request_num = self.llm_engine.get_num_unfinished_requests()
        self.dump_info.capture_cpu_info()
        peak_memory, block_memory, num_total_gpu_blocks, num_total_cpu_blocks = \
            self.llm_engine.get_memory_usage()
        self.metric.update_memory_usage(peak_memory, block_memory,
                                        num_total_gpu_blocks,
                                        num_total_cpu_blocks)
        e2e_start_time = self.metric.get_mlu_cost_time()
    '''
    ==================
    End of addition
    ==================
    '''
    # Run the engine.
    outputs: List[Union[RequestOutput, EmbeddingRequestOutput]] = []
    total_in_toks = 0
    total_out_toks = 0
    while self.llm_engine.has_unfinished_requests():
        '''
        =============================
        Added by vllm_mlu
        =============================
        '''
        if is_latency_debug:
            self.dump_info.memory_usage()
            start_time = self.metric.get_mlu_cost_time()
        '''
        ==================
        End of addition
        ==================
        '''
        step_outputs = self.llm_engine.step()
        '''
        =============================
        Added by vllm_mlu
        =============================
        '''
        if is_latency_debug:
            end_time = self.metric.get_mlu_cost_time()
            step_latency = end_time - start_time
            if len(step_outputs) > 0:
                batch_size = len(step_outputs)
                assert batch_size == total_request_num, (
                    f"LLM has received {total_request_num} requests, but "
                    f"only processed {batch_size} requests in the current "
                    "step.\n"
                    "If you are running the benchmark_latency test, please "
                    "check whether the input is correct.\n"
                    "Otherwise, please set env VLLM_LATENCY_DEBUG=false, "
                    "then run the test again.\n")
            num_free_gpu_blocks, num_free_cpu_blocks = \
                self.llm_engine.get_block_usage()
            self.metric.update_step_block_usage(num_free_gpu_blocks,
                                                num_free_cpu_blocks)
            self.metric.update_step_latency(step_latency)
            if VLLM_LATENCY_DEBUG_WITH_DEVICE_EN:
                self.metric.update_step_latency_device(
                    self.llm_engine.get_latency())
            self.dump_info.memory_usage()
        '''
        ==================
        End of addition
        ==================
        '''
        for output in step_outputs:
            if output.finished:
                outputs.append(output)
                if use_tqdm:
                    if isinstance(output, RequestOutput):
                        # Calculate tokens only for RequestOutput
                        assert output.prompt_token_ids is not None
                        total_in_toks += len(output.prompt_token_ids)
                        in_spd = total_in_toks / pbar.format_dict["elapsed"]
                        total_out_toks += sum(
                            len(stp.token_ids) for stp in output.outputs)
                        out_spd = (total_out_toks /
                                   pbar.format_dict["elapsed"])
                        pbar.postfix = (
                            f"est. speed input: {in_spd:.2f} toks/s, "
                            f"output: {out_spd:.2f} toks/s")
                    pbar.update(1)
    '''
    =============================
    Added by vllm_mlu
    =============================
    '''
    if is_latency_debug:
        e2e_end_time = self.metric.get_mlu_cost_time()
        e2e_latency = e2e_end_time - e2e_start_time
        self.metric.add_metrics(batch_size, e2e_latency)
    '''
    ==================
    End of addition
    ==================
    '''
    if use_tqdm:
        pbar.close()
    # Sort the outputs by request ID. This is necessary because some
    # requests may finish earlier than requests submitted before them.
    return sorted(outputs, key=lambda x: int(x.request_id))


LLM.metric = LLMMetric()
LLM.dump_info = LLMDumpInfo()

MluHijackObject.apply_hijack(LLM,
                             LLM.__init__,
                             vllm__entrypoints__llm__LLM____init__)
MluHijackObject.apply_hijack(LLM,
                             "get_metrics",
                             vllm__entrypoints__llm__LLM__get_metrics)
MluHijackObject.apply_hijack(LLM,
                             LLM._run_engine,
                             vllm__entrypoints__llm__LLM___run_engine)
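

# How the hijack takes effect: importing this module rebinds LLM.__init__,
# LLM.get_metrics, and LLM._run_engine via MluHijackObject.apply_hijack, so
# downstream code keeps using the stock vLLM API. A minimal sketch, assuming
# the vllm_mlu package imports this module on load:
#
#     import vllm_mlu                     # applies the hijacks above
#     from vllm.entrypoints.llm import LLM
#
#     llm = LLM(model="/path/to/model")   # hypothetical path; runs the
#                                         # MLU-patched __init__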