import ctypes
import json

from vllm.entrypoints.llm import LLM
from vllm.logger import init_logger
from vllm.transformers_utils.config import get_config

from vllm_mlu._mlu_utils import VLLM_DUMP_CPU_INFO_EN, VLLM_DUMP_MLU_INFO_EN
from vllm_mlu.mlu_hijack_utils import ModelConfig, get_is_gated

logger = init_logger(__name__)


def get_deepseek_v2_flops(bcfg, batch, seq_len, hidden_size):
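    """Return the attention GEMM FLOPs for DeepSeek-V2's MLA attention.

    The four returned terms are the pre-attention projections (the q/kv
    low-rank down- and up-projections), Q*K^T, softmax(QK^T)*V, and the
    output projection. ATTN_PAD_SIZE = 192 is read here as the padded
    per-head dimension (qk_nope_head_dim + qk_rope_head_dim = 128 + 64 in
    the default DeepSeek-V2 config); that reading is an assumption, not
    something stated in the code.
    """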
    ATTN_PAD_SIZE = 192
    qk_nope_head_dim = bcfg.qk_nope_head_dim
    qk_rope_head_dim = bcfg.qk_rope_head_dim
    v_head_dim = bcfg.v_head_dim
    q_lora_rank = bcfg.q_lora_rank
    kv_lora_rank = bcfg.kv_lora_rank
    context_atn_pre = 2 * batch * seq_len * (
        hidden_size * q_lora_rank +
        hidden_size * (kv_lora_rank + qk_rope_head_dim) +
        q_lora_rank * bcfg.head_num * (qk_nope_head_dim + qk_rope_head_dim) +
        kv_lora_rank * bcfg.head_num * (qk_nope_head_dim + v_head_dim))
    context_atn_qk = 2 * batch * seq_len * seq_len * bcfg.head_num * ATTN_PAD_SIZE
    context_atn_qkv = 2 * batch * seq_len * seq_len * bcfg.head_num * ATTN_PAD_SIZE
    context_atn_post = 2 * batch * seq_len * bcfg.head_num * v_head_dim * hidden_size
    return context_atn_pre, context_atn_qk, context_atn_qkv, context_atn_post


class FlopsInfo(ctypes.Structure):
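    """Prefill (context) and decode FLOPs totals, kept as a ctypes struct,
    presumably for interoperability with the native device-info library."""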
    _fields_ = [("context_flops", ctypes.c_double),
                ("decoder_flops", ctypes.c_double)]


class LLMDumpInfo:
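    """Gathers model and runtime parameters, estimates FLOPs, and collects
    CPU/MLU device statistics for performance dumping."""
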
    def __init__(self,
                 tensor_parallel_size=None,
                 dtype=None,
                 kv_cache_dtype=None,
                 quantization=None,
                 model=None,
                 batch_size=None,
                 input_len=None,
                 output_len=None,
                 trust_remote_code=None) -> None:
        self.so_file = None
        self.dev_info = None
        self.cpu_info = None
        self.lib = None
        self.hfu_info = None
        self.flops_info = None
        self.ctypes_model_config = ModelConfig()
        self.io_efficiency = 0
        self.context_latency_device = 0
        self.generate_latency_device = 0

        self.tensor_parallel_size = tensor_parallel_size
        self.dtype = dtype
        self.kv_cache_dtype = kv_cache_dtype
        self.quantization = quantization
        self.batch_size = batch_size
        self.input_len = input_len
        self.output_len = output_len
        self.model = model
        # Stored here so init_param() can read it even when it is not
        # passed in again.
        self.trust_remote_code = trust_remote_code
        self.model_config = None

        try:
            from vllm_mlu.device_info import get_info_inner
            self.so_file, self.dev_info, self.cpu_info, self.lib = get_info_inner(
                self.so_file, self.dev_info, self.cpu_info, self.lib)
        except Exception:
            logger.info("Cannot get device info")

    def init_param(self,
                   tensor_parallel_size=None,
                   dtype=None,
                   kv_cache_dtype=None,
                   quantization=None,
                   model=None,
                   batch_size=None,
                   input_len=None,
                   output_len=None,
                   trust_remote_code=None,
                   context_latency_device=None,
                   generate_latency_device=None):
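        """Update whichever parameters are provided; once the model path and
        trust_remote_code are both known, load the HuggingFace model config."""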
        if tensor_parallel_size is not None:
            self.tensor_parallel_size = tensor_parallel_size
        if dtype is not None:
            self.dtype = dtype
        if kv_cache_dtype is not None:
            self.kv_cache_dtype = kv_cache_dtype
        if quantization is not None:
            self.quantization = quantization
        if model is not None:
            self.model = model
        if batch_size is not None:
            self.batch_size = batch_size
        if input_len is not None:
            self.input_len = input_len
        if output_len is not None:
            self.output_len = output_len
        if trust_remote_code is not None:
            self.trust_remote_code = trust_remote_code
        if context_latency_device is not None:
            self.context_latency_device = context_latency_device
        if generate_latency_device is not None:
            self.generate_latency_device = generate_latency_device

        # parse the model config
        if self.model_config is None and self.model is not None and self.trust_remote_code is not None:
            self.model_config = get_config(self.model, self.trust_remote_code)

    def initialize_ctypes_model_config(self, model_cfg, tp_num, weight_dtype, kv_cache_dtype, quantization):
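        """Populate the ctypes ModelConfig from the HuggingFace config,
        normalizing attribute names that differ across model families
        (FFN size, KV heads, attention heads, MoE fields)."""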
        # prepare input
        self.ctypes_model_config.hidden_size = model_cfg.hidden_size
        self.ctypes_model_config.vocab_size = model_cfg.vocab_size
        self.ctypes_model_config.cla_coeffient = 1.0

        possible_keys_ffn_size = [
            # chatglm3-6b-32k
            "ffn_hidden_size",
            # llama3-8b-hf
            "intermediate_size",
        ]
        possible_kv_heads = [
            # chatglm3-6b-32k
            "multi_query_group_num",
            # llama3-8b-hf
            "num_key_value_heads",
            # falcon-180B-chat
            "num_kv_heads",
        ]
        possible_num_attention_heads = [
            "num_attention_heads",
            "n_heads",
        ]
        moe_size = None
        ffn_size = None
        if getattr(model_cfg, "moe_intermediate_size", None):
            moe_size = getattr(model_cfg, "moe_intermediate_size", None)
        for key in possible_keys_ffn_size:
            ffn_size = getattr(model_cfg, key, None)
            if ffn_size is not None:
                break
        if model_cfg.model_type in ['bloom'] and ffn_size is None:
            ffn_size = model_cfg.hidden_size * 4
        if model_cfg.model_type in ['qwen']:
            ffn_size = model_cfg.intermediate_size // 2
        if ffn_size is None and moe_size is None:
            logger.warning("The model's config.json does not contain any of "
                           "the following keys to determine the ffn_size or "
                           f"moe_size: {possible_keys_ffn_size}.")
        for key in possible_num_attention_heads:
            num_attention_heads = getattr(model_cfg, key, None)
            if num_attention_heads is not None:
                break
        if num_attention_heads is None:
            logger.error("The model's config.json does not contain any of "
                         "the following keys to determine the "
                         f"num_attention_heads: {possible_num_attention_heads}.")
        for key in possible_kv_heads:
            kv_heads = getattr(model_cfg, key, None)
            if kv_heads is not None:
                break

        if kv_heads is None:
            logger.warning("The model's config.json does not contain any of "
                           "the following keys to determine the kv_heads: "
                           f"{possible_kv_heads}; falling back to "
                           "num_attention_heads")
            kv_heads = num_attention_heads
        self.ctypes_model_config.ffn_inner_size = 0 if ffn_size is None else ffn_size
        self.ctypes_model_config.moe_inner_size = 0 if moe_size is None else moe_size
        self.ctypes_model_config.moe_layer_num = 0 if moe_size is None else model_cfg.num_hidden_layers
        self.ctypes_model_config.layer_num = model_cfg.num_hidden_layers
        self.ctypes_model_config.head_num = num_attention_heads
        self.ctypes_model_config.head_size = self.ctypes_model_config.hidden_size // self.ctypes_model_config.head_num
        self.ctypes_model_config.head_num_kv = kv_heads
        self.ctypes_model_config.tp_num = tp_num
        if hasattr(model_cfg, "shared_expert_intermediate_size") and model_cfg.shared_expert_intermediate_size is not None:
            self.ctypes_model_config.shared_expert_intermediate_size = model_cfg.shared_expert_intermediate_size
        else:
            self.ctypes_model_config.shared_expert_intermediate_size = 0
        self.ctypes_model_config.use_gated_ffn = get_is_gated()
        if hasattr(model_cfg, "n_shared_experts") and model_cfg.n_shared_experts is not None:
            self.ctypes_model_config.shared_expert_intermediate_size = model_cfg.n_shared_experts * moe_size
        else:
            self.ctypes_model_config.shared_experts = 0
        if hasattr(model_cfg, "num_experts") and model_cfg.num_experts is not None:
            experts_num = model_cfg.num_experts
        elif hasattr(model_cfg, "num_local_experts"):
            experts_num = model_cfg.num_local_experts
        elif hasattr(model_cfg, "n_routed_experts"):
            experts_num = model_cfg.n_routed_experts
        else:
            experts_num = None
        if experts_num is not None:
            self.ctypes_model_config.experts_num = experts_num
            # hunyuan names its router top-k "moe_topk"; other MoE models
            # use "num_experts_per_tok".
            if model_cfg.model_type == 'hunyuan':
                self.ctypes_model_config.topk_num = model_cfg.moe_topk
            else:
                self.ctypes_model_config.topk_num = model_cfg.num_experts_per_tok
        else:
            self.ctypes_model_config.experts_num = 0
        if hasattr(model_cfg, "model_type") and model_cfg.model_type is not None:
            self.ctypes_model_config.model_type = model_cfg.model_type.encode('utf-8')
        # When adding a MoE model, fix up the MoE/FFN fields as needed:
        # moe_inner_size, ffn_inner_size, moe_layer_num, shared_expert_intermediate_size.
        # add for mixtral
        if model_cfg.model_type == "mixtral":
            self.ctypes_model_config.moe_inner_size = ffn_size
            self.ctypes_model_config.ffn_inner_size = 0
            self.ctypes_model_config.moe_layer_num = model_cfg.num_hidden_layers
        # add for deepseek-v2
        if model_cfg.model_type == "deepseek_v2":
            if hasattr(model_cfg, "first_k_dense_replace") and model_cfg.first_k_dense_replace is not None:
                self.ctypes_model_config.moe_layer_num = model_cfg.num_hidden_layers - model_cfg.first_k_dense_replace
            if hasattr(model_cfg, "qk_nope_head_dim") and model_cfg.qk_nope_head_dim is not None:
                self.ctypes_model_config.qk_nope_head_dim = model_cfg.qk_nope_head_dim
            if hasattr(model_cfg, "qk_rope_head_dim") and model_cfg.qk_rope_head_dim is not None:
                self.ctypes_model_config.qk_rope_head_dim = model_cfg.qk_rope_head_dim
            if hasattr(model_cfg, "v_head_dim") and model_cfg.v_head_dim is not None:
                self.ctypes_model_config.v_head_dim = model_cfg.v_head_dim
            if hasattr(model_cfg, "q_lora_rank") and model_cfg.q_lora_rank is not None:
                self.ctypes_model_config.q_lora_rank = model_cfg.q_lora_rank
            else:
                self.ctypes_model_config.q_lora_rank = 0
            if hasattr(model_cfg, "kv_lora_rank") and model_cfg.kv_lora_rank is not None:
                self.ctypes_model_config.kv_lora_rank = model_cfg.kv_lora_rank
        # add for hunyuan
        if model_cfg.model_type == "hunyuan":
            self.ctypes_model_config.cla_coeffient = 0.5  # hunyuan models use CLA2
            if hasattr(model_cfg, "num_shared_expert") and model_cfg.num_shared_expert is not None:
                self.ctypes_model_config.shared_expert_intermediate_size = model_cfg.num_shared_expert * model_cfg.intermediate_size
            if not self.ctypes_model_config.moe_inner_size and model_cfg.intermediate_size is not None:
                self.ctypes_model_config.moe_inner_size = model_cfg.intermediate_size
            if not self.ctypes_model_config.moe_layer_num and hasattr(model_cfg, "num_experts"):
                self.ctypes_model_config.moe_layer_num = model_cfg.num_hidden_layers

        # flash attention in vLLM only uses a causal mask
        self.ctypes_model_config.use_causal_mask = True

        if weight_dtype == "auto":
            self.ctypes_model_config.data_type = b'float16'
        else:
            self.ctypes_model_config.data_type = weight_dtype.encode('utf-8')
        if quantization is not None:
            with open(self.model + "/quantize_config.json", 'r') as file:
                config = json.load(file)
            if config["quant_mode"] == "SmoothQuant":
                self.ctypes_model_config.smooth_quant_type = b"SmoothQuant"
            else:
                self.ctypes_model_config.smooth_quant_type = b'invalid'
            self.ctypes_model_config.filter_data_type = ("int" + str(config['bits'])).encode('utf-8')
        else:
            self.ctypes_model_config.smooth_quant_type = b'invalid'
            self.ctypes_model_config.filter_data_type = self.ctypes_model_config.data_type

        if kv_cache_dtype == "auto":
            self.ctypes_model_config.kv_cache_dtype = self.ctypes_model_config.data_type
        else:
            self.ctypes_model_config.kv_cache_dtype = kv_cache_dtype.encode('utf-8')

    def get_flops(self, bcfg, once_batch, input_seq_len, output_length, flops_info):
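        """Estimate prefill (context) and per-step decode FLOPs for one
        request shape and store them in flops_info.

        Decode attention FLOPs are computed at the average decode sequence
        length (input_len + output_len / 2). With a causal mask only about
        half of the score/value matmul work is done, hence the 0.5 factor
        applied below.
        """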
        self.batch_size = once_batch
        seq_len = input_seq_len
        hidden_size = bcfg.hidden_size
        voc_size = bcfg.vocab_size
        ffn_size = bcfg.ffn_inner_size
        moe_size = bcfg.moe_inner_size
        shared_expert_intermediate_size = bcfg.shared_expert_intermediate_size
        layer_num = bcfg.layer_num
        out_seq = output_length
        seq_len_decode = seq_len + out_seq / 2
        r = bcfg.head_num / bcfg.head_num_kv
        bsh2 = self.batch_size * seq_len * hidden_size * hidden_size
        cla_coeffient = bcfg.cla_coeffient

        if bcfg.model_type == b'deepseek_v2':
            context_atn_pre, context_atn_qk, context_atn_qkv, context_atn_post = (
                get_deepseek_v2_flops(bcfg, self.batch_size, seq_len, hidden_size)
            )
        else:
            context_atn_pre = 2 * bsh2 + 4 * bsh2 / r * cla_coeffient
            context_atn_qk = 2 * self.batch_size * seq_len * seq_len * hidden_size
            context_atn_qkv = 2 * self.batch_size * seq_len * seq_len * hidden_size
            context_atn_post = 2 * self.batch_size * seq_len * hidden_size * hidden_size
        context_lm_head = 2 * self.batch_size * seq_len * hidden_size * voc_size
        context_ffn = 0
        bh2 = self.batch_size * hidden_size * hidden_size
        decode_atn_pre = 2 * bh2 + 4 * bh2 / r * cla_coeffient
        decode_atn_qk = 2 * self.batch_size * seq_len_decode * hidden_size
        decode_atn_qkv = 2 * self.batch_size * seq_len_decode * hidden_size
        decode_atn_post = 2 * self.batch_size * hidden_size * hidden_size
        decode_lm_head = 2 * self.batch_size * hidden_size * voc_size
        decode_ffn = 0
        coeffient = 6 if bcfg.use_gated_ffn else 4
        if bcfg.experts_num == 0:
            context_ffn = coeffient * self.batch_size * seq_len * hidden_size * ffn_size
            decode_ffn = coeffient * self.batch_size * hidden_size * ffn_size
        else:
            context_ffn = self.batch_size * seq_len * hidden_size * (coeffient * (moe_size * bcfg.topk_num + shared_expert_intermediate_size) + 2 * bcfg.experts_num)
            decode_ffn = self.batch_size * hidden_size * (coeffient * (moe_size * bcfg.topk_num + shared_expert_intermediate_size) + 2 * bcfg.experts_num)
        if bcfg.use_causal_mask:
            c = 0.5
            context_atn_qk *= c
            context_atn_qkv *= c

        flops_info.context_flops = context_lm_head
        flops_info.decoder_flops = decode_lm_head
        # kv_cache_dtype and smooth_quant_type do not change these totals;
        # the same terms are accumulated in every case.
        flops_info.context_flops += layer_num * (context_atn_qk + context_atn_qkv)
        flops_info.decoder_flops += layer_num * (decode_atn_qk + decode_atn_qkv)
        flops_info.context_flops += layer_num * (context_atn_pre + context_atn_post + context_ffn)
        flops_info.decoder_flops += layer_num * (decode_atn_pre + decode_atn_post + decode_ffn)

    def capture_cpu_info(self):
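        """Snapshot CPU-side statistics via the optional device_info helper;
        a no-op when CPU-info dumping is disabled or unavailable."""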
        if VLLM_DUMP_CPU_INFO_EN and self.cpu_info:
            try:
                from vllm_mlu.device_info import capture_cpu_info
                self.cpu_info = capture_cpu_info(self.cpu_info, my_rank=0)
            except Exception:
                logger.info("capture_cpu_info function is not supported")

    def memory_usage(self):
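        """Record host memory usage via the optional device_info helper."""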
        if VLLM_DUMP_CPU_INFO_EN and self.cpu_info:
            try:
                from vllm_mlu.device_info import memory_usage
                self.cpu_info = memory_usage(self.cpu_info)
            except Exception:
                logger.info("memory_usage function is not supported")

    def analyze_perf_data(self, rank=0):
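        """Run the device_info perf-data analysis, if available."""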
        try:
            from vllm_mlu.device_info import analyze_perf_data
            analyze_perf_data(self.cpu_info, self.lib)
        except Exception:
            logger.info("Cannot analyze perf data, no analyze_perf_data function")

    def get_decoder_io_efficiency(self, ctypes_model_config, lib, batch_size, input_len, output_len, generate_latency_device):
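        """Compute decode-phase IO efficiency from the measured decode
        latency, if the device_info helper provides it."""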
        try:
            from vllm_mlu.device_info import get_decoder_io_efficiency
            self.io_efficiency = get_decoder_io_efficiency(
                ctypes_model_config, lib, batch_size, input_len, output_len,
                generate_latency_device)
        except Exception:
            logger.info("get_decoder_io_efficiency function is not supported")

    def get_device_output_info(self,
                               model_cfg,
                               batch_size,
                               input_seq_len,
                               output_length,
                               tp_num,
                               weight_dtype,
                               kv_cache_dtype,
                               quantization):
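        """Fill the ctypes model config, then either query hardware FLOPs
        utilization (HFU) through the device library when MLU-info dumping is
        enabled, or fall back to the analytic FLOPs model in get_flops()."""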
        self.initialize_ctypes_model_config(model_cfg, tp_num, weight_dtype, kv_cache_dtype, quantization)
        if VLLM_DUMP_CPU_INFO_EN and self.so_file:
            self.analyze_perf_data()
        if VLLM_DUMP_MLU_INFO_EN and self.lib:
            from vllm_mlu.device_info import get_flops_inner, HFUInfo
            self.hfu_info = HFUInfo()
            get_flops_inner(self.ctypes_model_config, batch_size, input_seq_len, output_length, tp_num, self.hfu_info, self.lib)
            self.get_decoder_io_efficiency(self.ctypes_model_config,
                                           self.lib,
                                           self.batch_size,
                                           self.input_len,
                                           self.output_len,
                                           self.generate_latency_device)
        else:
            self.flops_info = FlopsInfo()
            self.get_flops(self.ctypes_model_config, batch_size, input_seq_len, output_length, self.flops_info)

    def has_information_dump(self):
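        """Return True when a device-info shared library was loaded."""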
        if self.dev_info and self.dev_info.so_file:
            return True
        return False

    def dump(self):
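        """Compute the output info and dump device/CPU information through
        the device_info helper, if available."""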
        self.get_device_output_info(self.model_config,
                                    self.batch_size,
                                    self.input_len,
                                    self.output_len,
                                    self.tensor_parallel_size,
                                    self.dtype,
                                    self.kv_cache_dtype,
                                    self.quantization)
        try:
            from vllm_mlu.device_info import dump
            dump(LLM.dump_info)
        except Exception:
            logger.info("Dumping device/cpu information is not supported")

    def dump_performance_info(self):
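        """Dump the collected performance information, if supported."""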
        try:
            from vllm_mlu.device_info import dump_information
            dump_information(LLM.dump_info)
        except Exception:
            logger.info("Dumping performance information is not supported")