"""Collect model/device information and estimate FLOPs for performance dumps."""
import ctypes
import json
import os

from vllm.logger import init_logger
from vllm_mlu.mlu_hijack_utils import get_is_gated, ModelConfig
from vllm.transformers_utils.config import get_config
from vllm.entrypoints.llm import LLM
from vllm_mlu._mlu_utils import VLLM_DUMP_CPU_INFO_EN, VLLM_DUMP_MLU_INFO_EN

logger = init_logger(__name__)


def _first_config_value(model_cfg, keys):
    """Return the first non-None attribute of ``model_cfg`` named in ``keys``, else None."""
    for key in keys:
        value = getattr(model_cfg, key, None)
        if value is not None:
            return value
    return None


def get_deepseek_v2_flops(bcfg, batch, seq_len, hidden_size):
    """Compute the context-phase attention FLOPs terms for a deepseek_v2 model.

    Args:
        bcfg: Config object exposing ``qk_nope_head_dim``, ``qk_rope_head_dim``,
            ``v_head_dim``, ``q_lora_rank``, ``kv_lora_rank`` and ``head_num``.
        batch: Batch size.
        seq_len: Input sequence length.
        hidden_size: Model hidden size.

    Returns:
        Tuple ``(context_atn_pre, context_atn_qk, context_atn_qkv,
        context_atn_post)``.
    """
    # NOTE(review): presumably the padded per-head attention dimension used by
    # the kernel -- confirm against the attention implementation.
    ATTN_PAD_SIZE = 192
    qk_nope_head_dim = bcfg.qk_nope_head_dim
    qk_rope_head_dim = bcfg.qk_rope_head_dim
    v_head_dim = bcfg.v_head_dim
    q_lora_rank = bcfg.q_lora_rank
    kv_lora_rank = bcfg.kv_lora_rank

    context_atn_pre = 2 * batch * seq_len * \
        (hidden_size * q_lora_rank + \
         hidden_size * (kv_lora_rank + qk_rope_head_dim) + \
         q_lora_rank * bcfg.head_num * (qk_nope_head_dim + qk_rope_head_dim) + \
         kv_lora_rank * bcfg.head_num * (qk_nope_head_dim + v_head_dim))
    context_atn_qk = 2 * batch * seq_len * seq_len * bcfg.head_num * ATTN_PAD_SIZE
    context_atn_qkv = 2 * batch * seq_len * seq_len * bcfg.head_num * ATTN_PAD_SIZE
    context_atn_post = 2 * batch * seq_len * bcfg.head_num * v_head_dim * hidden_size
    return context_atn_pre, context_atn_qk, context_atn_qkv, context_atn_post


class FlopsInfo(ctypes.Structure):
    """ctypes structure holding the context and decoder FLOPs totals."""
    _fields_ = [("context_flops", ctypes.c_double),
                ("decoder_flops", ctypes.c_double)]


class LLMDumpInfo:
    """Holds run parameters and derived model/device info used by the dump helpers.

    Most device-related methods import from ``vllm_mlu.device_info`` lazily and
    only log an info message when that import or call fails.
    """

    def __init__(self,
                 tensor_parallel_size=None,
                 dtype=None,
                 kv_cache_dtype=None,
                 quantization=None,
                 model=None,
                 batch_size=None,
                 input_len=None,
                 output_len=None,
                 trust_remote_code=None) -> None:
        """Store the run parameters and try to fetch device info handles."""
        self.so_file = None
        self.dev_info = None
        self.cpu_info = None
        self.lib = None
        self.hfu_info = None
        self.flops_info = None
        self.ctypes_model_config = ModelConfig()
        self.io_efficiency = 0
        self.context_latency_device = 0
        self.generate_latency_device = 0

        self.tensor_parallel_size = tensor_parallel_size
        self.dtype = dtype
        self.kv_cache_dtype = kv_cache_dtype
        self.quantization = quantization
        self.batch_size = batch_size
        self.input_len = input_len
        self.output_len = output_len
        self.model = model
        # Stored so that init_param() can read it even when the caller never
        # passes trust_remote_code there (previously an AttributeError).
        self.trust_remote_code = trust_remote_code
        self.model_config = None
        try:
            from vllm_mlu.device_info import get_info_inner
            self.so_file, self.dev_info, self.cpu_info, self.lib = get_info_inner(
                self.so_file, self.dev_info, self.cpu_info, self.lib)
        except Exception:
            logger.info("Cannot get device info")

    def init_param(self,
                   tensor_parallel_size=None,
                   dtype=None,
                   kv_cache_dtype=None,
                   quantization=None,
                   model=None,
                   batch_size=None,
                   input_len=None,
                   output_len=None,
                   trust_remote_code=None,
                   context_latency_device=None,
                   generate_latency_device=None):
        """Overwrite every stored parameter whose argument is not None.

        After the update, the model config is loaded through ``get_config``
        when it has not been loaded yet and both ``model`` and
        ``trust_remote_code`` are known.
        """
        updates = {
            "tensor_parallel_size": tensor_parallel_size,
            "dtype": dtype,
            "kv_cache_dtype": kv_cache_dtype,
            "quantization": quantization,
            "model": model,
            "batch_size": batch_size,
            "input_len": input_len,
            "output_len": output_len,
            "trust_remote_code": trust_remote_code,
            "context_latency_device": context_latency_device,
            "generate_latency_device": generate_latency_device,
        }
        for attr_name, value in updates.items():
            if value is not None:
                setattr(self, attr_name, value)

        # Parse the model config once enough information is available.
        if (self.model_config is None and self.model is not None
                and self.trust_remote_code is not None):
            self.model_config = get_config(self.model, self.trust_remote_code)

    def initialize_ctypes_model_config(self, model_cfg, tp_num, weight_dtype, kv_cache_dtype, quantization):
        """Fill ``self.ctypes_model_config`` from a model config object.

        Args:
            model_cfg: Model config object (attributes are probed by name).
            tp_num: Tensor parallel size.
            weight_dtype: Weight dtype string; ``"auto"`` maps to ``float16``.
            kv_cache_dtype: KV cache dtype string; ``"auto"`` follows the
                weight dtype.
            quantization: When not None, ``quantize_config.json`` is read from
                the ``self.model`` directory.
        """
        cfg = self.ctypes_model_config

        # prepare input
        cfg.hidden_size = model_cfg.hidden_size
        cfg.vocab_size = model_cfg.vocab_size
        cfg.cla_coeffient = 1.0
        possible_keys_ffn_size = [
            # chatglm3-6b-32k
            "ffn_hidden_size",
            # llama3-8b-hf
            "intermediate_size",
        ]
        possible_kv_heads = [
            # chatglm3-6b-32k
            "multi_query_group_num",
            # llama3-8b-hf
            "num_key_value_heads",
            # falcon-180B-chat
            "num_kv_heads",
        ]
        possible_num_attention_heads = [
            "num_attention_heads",
            "n_heads",
        ]

        # A missing, None or zero moe_intermediate_size all mean "no moe size".
        moe_size = getattr(model_cfg, "moe_intermediate_size", None) or None

        ffn_size = _first_config_value(model_cfg, possible_keys_ffn_size)
        if model_cfg.model_type in ['bloom'] and ffn_size is None:
            ffn_size = model_cfg.hidden_size * 4
        if model_cfg.model_type in ['qwen']:
            ffn_size = model_cfg.intermediate_size // 2
        if ffn_size is None and moe_size is None:
            logger.warning("The model's config.json does not contain any of the following "
                           "keys to determine the ffn_size or moe_size: "
                           f"{possible_keys_ffn_size}. ")

        num_attention_heads = _first_config_value(model_cfg, possible_num_attention_heads)
        if num_attention_heads is None:
            logger.error("The model's config.json does not contain any of the following "
                         "keys to determine the num_attention_heads: "
                         f"{possible_num_attention_heads}. ")

        kv_heads = _first_config_value(model_cfg, possible_kv_heads)
        if kv_heads is None:
            logger.warning("The model's config.json does not contain any of the following "
                           "keys to determine the kv_heads: "
                           f"{possible_kv_heads}, use num_attention_heads to replace")
            # Use the value resolved above so configs that only define
            # "n_heads" also work.
            kv_heads = num_attention_heads

        cfg.ffn_inner_size = 0 if ffn_size is None else ffn_size
        cfg.moe_inner_size = 0 if moe_size is None else moe_size
        cfg.moe_layer_num = 0 if moe_size is None else model_cfg.num_hidden_layers
        cfg.layer_num = model_cfg.num_hidden_layers
        cfg.head_num = num_attention_heads
        cfg.head_size = cfg.hidden_size / cfg.head_num
        cfg.head_num_kv = kv_heads
        cfg.tp_num = tp_num

        shared_size = getattr(model_cfg, "shared_expert_intermediate_size", None)
        cfg.shared_expert_intermediate_size = 0 if shared_size is None else shared_size
        cfg.use_gated_ffn = get_is_gated()

        n_shared_experts = getattr(model_cfg, "n_shared_experts", None)
        if n_shared_experts is not None and moe_size is not None:
            cfg.shared_expert_intermediate_size = n_shared_experts * moe_size
        else:
            cfg.shared_experts = 0

        # Probe the expert-count keys in priority order; a key that is present
        # but None falls through to the next one.
        experts_num = _first_config_value(
            model_cfg, ("num_experts", "num_local_experts", "n_routed_experts"))
        if experts_num is not None:
            cfg.experts_num = experts_num
            if model_cfg.model_type == 'hunyuan':
                cfg.topk_num = model_cfg.moe_topk
            else:
                cfg.topk_num = model_cfg.num_experts_per_tok
        else:
            cfg.experts_num = 0

        if getattr(model_cfg, "model_type", None) is not None:
            cfg.model_type = model_cfg.model_type.encode('utf-8')

        # when adding a moe model, need fix moe/ffn info, like
        # moe_inner_size, ffn_inner_size, moe_layer_num, shared_expert_intermediate_size.

        # add for mixtral
        if model_cfg.model_type == "mixtral":
            cfg.moe_inner_size = ffn_size
            cfg.ffn_inner_size = 0
            cfg.moe_layer_num = model_cfg.num_hidden_layers

        # add for deepseek-v2
        # NOTE(review): the source file had lost its indentation; the attribute
        # copies below are assumed to belong to the deepseek_v2 branch -- confirm.
        if model_cfg.model_type == "deepseek_v2":
            first_k_dense_replace = getattr(model_cfg, "first_k_dense_replace", None)
            if first_k_dense_replace is not None:
                cfg.moe_layer_num = model_cfg.num_hidden_layers - first_k_dense_replace
            if getattr(model_cfg, "qk_nope_head_dim", None) is not None:
                cfg.qk_nope_head_dim = model_cfg.qk_nope_head_dim
            if getattr(model_cfg, "qk_rope_head_dim", None) is not None:
                cfg.qk_rope_head_dim = model_cfg.qk_rope_head_dim
            if getattr(model_cfg, "v_head_dim", None) is not None:
                cfg.v_head_dim = model_cfg.v_head_dim
            if getattr(model_cfg, "q_lora_rank", None) is not None:
                cfg.q_lora_rank = model_cfg.q_lora_rank
            else:
                cfg.q_lora_rank = 0
            if getattr(model_cfg, "kv_lora_rank", None) is not None:
                cfg.kv_lora_rank = model_cfg.kv_lora_rank

        # add for Hunyuan
        # NOTE(review): same indentation caveat as above -- the three
        # conditionals are assumed to be hunyuan-only; confirm.
        if model_cfg.model_type == "hunyuan":
            cfg.cla_coeffient = 0.5  # huanyuan model use CLA2
            if getattr(model_cfg, "num_shared_expert", None) is not None:
                cfg.shared_expert_intermediate_size = (
                    model_cfg.num_shared_expert * model_cfg.intermediate_size)
            if not cfg.moe_inner_size and model_cfg.intermediate_size is not None:
                cfg.moe_inner_size = model_cfg.intermediate_size
            if not cfg.moe_layer_num and hasattr(model_cfg, "num_experts"):
                cfg.moe_layer_num = model_cfg.num_hidden_layers

        cfg.use_causal_mask = True  # the flash attention is only use causal_mask in vllm

        if weight_dtype == "auto":
            cfg.data_type = b'float16'
        else:
            cfg.data_type = weight_dtype.encode('utf-8')

        if quantization is not None:
            quant_cfg_path = os.path.join(self.model, "quantize_config.json")
            with open(quant_cfg_path, 'r') as file:
                config = json.load(file)
            if config["quant_mode"] == "SmoothQuant":
                cfg.smooth_quant_type = b"SmoothQuant"
            else:
                cfg.smooth_quant_type = b'invalid'
            cfg.filter_data_type = ("int" + str(config['bits'])).encode('utf-8')
        else:
            cfg.smooth_quant_type = b'invalid'
            cfg.filter_data_type = cfg.data_type

        if kv_cache_dtype == "auto":
            cfg.kv_cache_dtype = cfg.data_type
        else:
            cfg.kv_cache_dtype = kv_cache_dtype.encode('utf-8')

    def get_flops(self, bcfg, once_batch, input_seq_len, output_length, flops_info):
        """Compute context/decoder FLOPs and store them on ``flops_info``.

        Also overwrites ``self.batch_size`` with ``once_batch``.

        Args:
            bcfg: Filled model config (see ``initialize_ctypes_model_config``).
            once_batch: Batch size.
            input_seq_len: Input sequence length.
            output_length: Number of generated tokens.
            flops_info: Object whose ``context_flops`` and ``decoder_flops``
                attributes receive the results.
        """
        self.batch_size = once_batch
        batch = once_batch
        seq_len = input_seq_len
        hidden_size = bcfg.hidden_size
        voc_size = bcfg.vocab_size
        ffn_size = bcfg.ffn_inner_size
        moe_size = bcfg.moe_inner_size
        shared_expert_intermediate_size = bcfg.shared_expert_intermediate_size
        layer_num = bcfg.layer_num
        out_seq = output_length
        # NOTE(review): presumably the average sequence length seen while
        # decoding -- confirm.
        seq_len_decode = seq_len + out_seq / 2
        r = bcfg.head_num / bcfg.head_num_kv
        bsh2 = batch * seq_len * hidden_size * hidden_size
        cla_coeffient = bcfg.cla_coeffient

        if bcfg.model_type == b'deepseek_v2':
            context_atn_pre, context_atn_qk, context_atn_qkv, context_atn_post = (
                get_deepseek_v2_flops(bcfg, batch, seq_len, hidden_size)
            )
        else:
            context_atn_pre = 2 * bsh2 + 4 * bsh2 / r * cla_coeffient
            context_atn_qk = 2 * batch * seq_len * seq_len * hidden_size
            context_atn_qkv = 2 * batch * seq_len * seq_len * hidden_size
            context_atn_post = 2 * batch * seq_len * hidden_size * hidden_size
        context_lm_head = 2 * batch * seq_len * hidden_size * voc_size

        bh2 = batch * hidden_size * hidden_size
        decode_atn_pre = 2 * bh2 + 4 * bh2 / r * cla_coeffient
        decode_atn_qk = 2 * batch * seq_len_decode * hidden_size
        decode_atn_qkv = 2 * batch * seq_len_decode * hidden_size
        decode_atn_post = 2 * batch * hidden_size * hidden_size
        decode_lm_head = 2 * batch * hidden_size * voc_size

        coeffient = 6 if bcfg.use_gated_ffn else 4
        if bcfg.experts_num == 0:
            context_ffn = coeffient * batch * seq_len * hidden_size * ffn_size
            decode_ffn = coeffient * batch * hidden_size * ffn_size
        else:
            moe_term = (coeffient * (moe_size * bcfg.topk_num + shared_expert_intermediate_size)
                        + 2 * bcfg.experts_num)
            context_ffn = batch * seq_len * hidden_size * moe_term
            decode_ffn = batch * hidden_size * moe_term

        if bcfg.use_causal_mask:
            c = 0.5
            context_atn_qk *= c
            context_atn_qkv *= c

        flops_info.context_flops = context_lm_head
        flops_info.decoder_flops = decode_lm_head

        # The original code branched on kv_cache_dtype (int8 or not) and on
        # smooth_quant_type, but both arms of each branch were identical, so
        # the totals do not depend on either setting.
        flops_info.context_flops += (layer_num * (context_atn_qk + context_atn_qkv))
        flops_info.decoder_flops += (layer_num * (decode_atn_qk + decode_atn_qkv))
        flops_info.context_flops += (layer_num * (context_atn_pre + context_atn_post + context_ffn))
        flops_info.decoder_flops += (layer_num * (decode_atn_pre + decode_atn_post + decode_ffn))

    def capture_cpu_info(self):
        """Refresh ``self.cpu_info`` when CPU dumping is enabled and info exists."""
        if VLLM_DUMP_CPU_INFO_EN and self.cpu_info:
            try:
                from vllm_mlu.device_info import capture_cpu_info
                self.cpu_info = capture_cpu_info(self.cpu_info, my_rank=0)
            except Exception:
                logger.info("Unsupport capture_cpu_info function")

    def memory_usage(self):
        """Update ``self.cpu_info`` via ``memory_usage`` when CPU dumping is enabled."""
        if VLLM_DUMP_CPU_INFO_EN and self.cpu_info:
            try:
                from vllm_mlu.device_info import memory_usage
                self.cpu_info = memory_usage(self.cpu_info)
            except Exception:
                logger.info("Unsupport memory_usage function")

    def analyze_perf_data(self, rank=0):
        """Run ``analyze_perf_data`` on the stored CPU info; ``rank`` is unused."""
        try:
            from vllm_mlu.device_info import analyze_perf_data
            analyze_perf_data(self.cpu_info, self.lib)
        except Exception:
            logger.info("Cannot analyze perf data, no analyze_perf_data function")

    def get_decoder_io_efficiency(self, ctypes_model_config, lib, batch_size, input_len, output_len, generate_latency_device):
        """Store the decoder IO efficiency in ``self.io_efficiency`` when available."""
        try:
            from vllm_mlu.device_info import get_decoder_io_efficiency
            self.io_efficiency = get_decoder_io_efficiency(
                ctypes_model_config, lib, batch_size, input_len, output_len,
                generate_latency_device)
        except Exception:
            logger.info("Unsupport io_efficiency get_decoder_io_efficiency function")

    def get_device_output_info(self, model_cfg, batch_size, input_seq_len, output_length, tp_num, weight_dtype, kv_cache_dtype, quantization):
        """Fill the model config, then gather device info or fall back to ``get_flops``.

        With MLU dumping enabled and a loaded ``self.lib``, the numbers come
        from ``vllm_mlu.device_info``; otherwise ``self.flops_info`` is filled
        by ``get_flops``.
        """
        self.initialize_ctypes_model_config(model_cfg, tp_num, weight_dtype, kv_cache_dtype, quantization)
        if VLLM_DUMP_CPU_INFO_EN and self.so_file:
            self.analyze_perf_data()
        if VLLM_DUMP_MLU_INFO_EN and self.lib:
            from vllm_mlu.device_info import get_flops_inner, HFUInfo
            self.hfu_info = HFUInfo()
            get_flops_inner(self.ctypes_model_config, batch_size, input_seq_len,
                            output_length, tp_num, self.hfu_info, self.lib)
            self.get_decoder_io_efficiency(self.ctypes_model_config, self.lib,
                                           self.batch_size, self.input_len,
                                           self.output_len, self.generate_latency_device)
        else:
            self.flops_info = FlopsInfo()
            self.get_flops(self.ctypes_model_config, batch_size, input_seq_len,
                           output_length, self.flops_info)

    def has_information_dump(self):
        """Return True when device info exists and carries a truthy ``so_file``."""
        return bool(self.dev_info and self.dev_info.so_file)

    def dump(self):
        """Gather the device output info, then dump ``LLM.dump_info`` if supported."""
        self.get_device_output_info(self.model_config, self.batch_size, self.input_len,
                                    self.output_len, self.tensor_parallel_size, self.dtype,
                                    self.kv_cache_dtype, self.quantization)
        try:
            from vllm_mlu.device_info import dump
            dump(LLM.dump_info)
        except Exception:
            logger.info("Unsupport dump device/cpu information")

    def dump_performance_info(self):
        """Dump performance information for ``LLM.dump_info`` if supported."""
        try:
            from vllm_mlu.device_info import dump_information
            dump_information(LLM.dump_info)
        except Exception:
            logger.info("Unsupport dump performance information")