import ctypes
import json

from vllm.entrypoints.llm import LLM
from vllm.logger import init_logger
from vllm.transformers_utils.config import get_config

from vllm_mlu._mlu_utils import VLLM_DUMP_CPU_INFO_EN, VLLM_DUMP_MLU_INFO_EN
from vllm_mlu.mlu_hijack_utils import ModelConfig, get_is_gated

logger = init_logger(__name__)


def get_deepseek_v2_flops(bcfg, batch, seq_len, hidden_size):
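    """Return the attention GEMM FLOPs for DeepSeek-V2's MLA attention.

    The four returned terms are the pre-attention projections (the q/kv
    low-rank down- and up-projections), Q*K^T, softmax(QK^T)*V, and the
    output projection. ATTN_PAD_SIZE = 192 is read here as the padded
    per-head dimension (qk_nope_head_dim + qk_rope_head_dim = 128 + 64 in
    the default DeepSeek-V2 config); that reading is an assumption, not
    something stated in the code.
    """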
    ATTN_PAD_SIZE = 192
    qk_nope_head_dim = bcfg.qk_nope_head_dim
    qk_rope_head_dim = bcfg.qk_rope_head_dim
    v_head_dim = bcfg.v_head_dim
    q_lora_rank = bcfg.q_lora_rank
    kv_lora_rank = bcfg.kv_lora_rank
    context_atn_pre = 2 * batch * seq_len * (
        hidden_size * q_lora_rank +
        hidden_size * (kv_lora_rank + qk_rope_head_dim) +
        q_lora_rank * bcfg.head_num * (qk_nope_head_dim + qk_rope_head_dim) +
        kv_lora_rank * bcfg.head_num * (qk_nope_head_dim + v_head_dim))
    context_atn_qk = 2 * batch * seq_len * seq_len * bcfg.head_num * ATTN_PAD_SIZE
    context_atn_qkv = 2 * batch * seq_len * seq_len * bcfg.head_num * ATTN_PAD_SIZE
    context_atn_post = 2 * batch * seq_len * bcfg.head_num * v_head_dim * hidden_size
    return context_atn_pre, context_atn_qk, context_atn_qkv, context_atn_post


class FlopsInfo(ctypes.Structure):
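    """Prefill (context) and decode FLOPs totals, kept as a ctypes struct,
    presumably for interoperability with the native device-info library."""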
    _fields_ = [("context_flops", ctypes.c_double),
                ("decoder_flops", ctypes.c_double)]


class LLMDumpInfo:
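    """Gathers model and runtime parameters, estimates FLOPs, and collects
    CPU/MLU device statistics for performance dumping."""
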
    def __init__(self,
                 tensor_parallel_size=None,
                 dtype=None,
                 kv_cache_dtype=None,
                 quantization=None,
                 model=None,
                 batch_size=None,
                 input_len=None,
                 output_len=None,
                 trust_remote_code=None) -> None:
        self.so_file = None
        self.dev_info = None
        self.cpu_info = None
        self.lib = None
        self.hfu_info = None
        self.flops_info = None
        self.ctypes_model_config = ModelConfig()
        self.io_efficiency = 0
        self.context_latency_device = 0
        self.generate_latency_device = 0

        self.tensor_parallel_size = tensor_parallel_size
        self.dtype = dtype
        self.kv_cache_dtype = kv_cache_dtype
        self.quantization = quantization
        self.batch_size = batch_size
        self.input_len = input_len
        self.output_len = output_len
        self.model = model
        # Stored here so init_param() can read it even when it is not
        # passed in again.
        self.trust_remote_code = trust_remote_code
        self.model_config = None

        try:
            from vllm_mlu.device_info import get_info_inner
            self.so_file, self.dev_info, self.cpu_info, self.lib = get_info_inner(
                self.so_file, self.dev_info, self.cpu_info, self.lib)
        except Exception:
            logger.info("Cannot get device info")

    def init_param(self,
                   tensor_parallel_size=None,
                   dtype=None,
                   kv_cache_dtype=None,
                   quantization=None,
                   model=None,
                   batch_size=None,
                   input_len=None,
                   output_len=None,
                   trust_remote_code=None,
                   context_latency_device=None,
                   generate_latency_device=None):
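        """Update whichever parameters are provided; once the model path and
        trust_remote_code are both known, load the HuggingFace model config."""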
        if tensor_parallel_size is not None:
            self.tensor_parallel_size = tensor_parallel_size
        if dtype is not None:
            self.dtype = dtype
        if kv_cache_dtype is not None:
            self.kv_cache_dtype = kv_cache_dtype
        if quantization is not None:
            self.quantization = quantization
        if model is not None:
            self.model = model
        if batch_size is not None:
            self.batch_size = batch_size
        if input_len is not None:
            self.input_len = input_len
        if output_len is not None:
            self.output_len = output_len
        if trust_remote_code is not None:
            self.trust_remote_code = trust_remote_code
        if context_latency_device is not None:
            self.context_latency_device = context_latency_device
        if generate_latency_device is not None:
            self.generate_latency_device = generate_latency_device

        # parse the model config
        if self.model_config is None and self.model is not None and self.trust_remote_code is not None:
            self.model_config = get_config(self.model, self.trust_remote_code)

    def initialize_ctypes_model_config(self, model_cfg, tp_num, weight_dtype, kv_cache_dtype, quantization):
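        """Populate the ctypes ModelConfig from the HuggingFace config,
        normalizing attribute names that differ across model families
        (FFN size, KV heads, attention heads, MoE fields)."""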
        # prepare input
        self.ctypes_model_config.hidden_size = model_cfg.hidden_size
        self.ctypes_model_config.vocab_size = model_cfg.vocab_size
        self.ctypes_model_config.cla_coeffient = 1.0

        possible_keys_ffn_size = [
            # chatglm3-6b-32k
            "ffn_hidden_size",
            # llama3-8b-hf
            "intermediate_size",
        ]
        possible_kv_heads = [
            # chatglm3-6b-32k
            "multi_query_group_num",
            # llama3-8b-hf
            "num_key_value_heads",
            # falcon-180B-chat
            "num_kv_heads",
        ]
        possible_num_attention_heads = [
            "num_attention_heads",
            "n_heads",
        ]
        moe_size = None
        ffn_size = None
        if getattr(model_cfg, "moe_intermediate_size", None):
            moe_size = getattr(model_cfg, "moe_intermediate_size", None)
        for key in possible_keys_ffn_size:
            ffn_size = getattr(model_cfg, key, None)
            if ffn_size is not None:
                break
        if model_cfg.model_type in ['bloom'] and ffn_size is None:
            ffn_size = model_cfg.hidden_size * 4
        if model_cfg.model_type in ['qwen']:
            ffn_size = model_cfg.intermediate_size // 2
        if ffn_size is None and moe_size is None:
            logger.warning("The model's config.json does not contain any of "
                           "the following keys to determine the ffn_size or "
                           f"moe_size: {possible_keys_ffn_size}.")
        for key in possible_num_attention_heads:
            num_attention_heads = getattr(model_cfg, key, None)
            if num_attention_heads is not None:
                break
        if num_attention_heads is None:
            logger.error("The model's config.json does not contain any of "
                         "the following keys to determine the "
                         f"num_attention_heads: {possible_num_attention_heads}.")
        for key in possible_kv_heads:
            kv_heads = getattr(model_cfg, key, None)
            if kv_heads is not None:
                break

        if kv_heads is None:
            logger.warning("The model's config.json does not contain any of "
                           "the following keys to determine the kv_heads: "
                           f"{possible_kv_heads}; falling back to "
                           "num_attention_heads")
            kv_heads = num_attention_heads
        self.ctypes_model_config.ffn_inner_size = 0 if ffn_size is None else ffn_size
        self.ctypes_model_config.moe_inner_size = 0 if moe_size is None else moe_size
        self.ctypes_model_config.moe_layer_num = 0 if moe_size is None else model_cfg.num_hidden_layers
        self.ctypes_model_config.layer_num = model_cfg.num_hidden_layers
        self.ctypes_model_config.head_num = num_attention_heads
        self.ctypes_model_config.head_size = self.ctypes_model_config.hidden_size // self.ctypes_model_config.head_num
        self.ctypes_model_config.head_num_kv = kv_heads
        self.ctypes_model_config.tp_num = tp_num
        if hasattr(model_cfg, "shared_expert_intermediate_size") and model_cfg.shared_expert_intermediate_size is not None:
            self.ctypes_model_config.shared_expert_intermediate_size = model_cfg.shared_expert_intermediate_size
        else:
            self.ctypes_model_config.shared_expert_intermediate_size = 0
        self.ctypes_model_config.use_gated_ffn = get_is_gated()
        if hasattr(model_cfg, "n_shared_experts") and model_cfg.n_shared_experts is not None:
            self.ctypes_model_config.shared_expert_intermediate_size = model_cfg.n_shared_experts * moe_size
        else:
            self.ctypes_model_config.shared_experts = 0
        if hasattr(model_cfg, "num_experts") and model_cfg.num_experts is not None:
            experts_num = model_cfg.num_experts
        elif hasattr(model_cfg, "num_local_experts"):
            experts_num = model_cfg.num_local_experts
        elif hasattr(model_cfg, "n_routed_experts"):
            experts_num = model_cfg.n_routed_experts
        else:
            experts_num = None
        if experts_num is not None:
            self.ctypes_model_config.experts_num = experts_num
            # hunyuan names its router top-k "moe_topk"; other MoE models
            # use "num_experts_per_tok".
            if model_cfg.model_type == 'hunyuan':
                self.ctypes_model_config.topk_num = model_cfg.moe_topk
            else:
                self.ctypes_model_config.topk_num = model_cfg.num_experts_per_tok
        else:
            self.ctypes_model_config.experts_num = 0
        if hasattr(model_cfg, "model_type") and model_cfg.model_type is not None:
            self.ctypes_model_config.model_type = model_cfg.model_type.encode('utf-8')
        # When adding a MoE model, fix up the MoE/FFN fields as needed:
        # moe_inner_size, ffn_inner_size, moe_layer_num, shared_expert_intermediate_size.
        # add for mixtral
        if model_cfg.model_type == "mixtral":
            self.ctypes_model_config.moe_inner_size = ffn_size
            self.ctypes_model_config.ffn_inner_size = 0
            self.ctypes_model_config.moe_layer_num = model_cfg.num_hidden_layers
        # add for deepseek-v2
        if model_cfg.model_type == "deepseek_v2":
            if hasattr(model_cfg, "first_k_dense_replace") and model_cfg.first_k_dense_replace is not None:
                self.ctypes_model_config.moe_layer_num = model_cfg.num_hidden_layers - model_cfg.first_k_dense_replace
            if hasattr(model_cfg, "qk_nope_head_dim") and model_cfg.qk_nope_head_dim is not None:
                self.ctypes_model_config.qk_nope_head_dim = model_cfg.qk_nope_head_dim
            if hasattr(model_cfg, "qk_rope_head_dim") and model_cfg.qk_rope_head_dim is not None:
                self.ctypes_model_config.qk_rope_head_dim = model_cfg.qk_rope_head_dim
            if hasattr(model_cfg, "v_head_dim") and model_cfg.v_head_dim is not None:
                self.ctypes_model_config.v_head_dim = model_cfg.v_head_dim
            if hasattr(model_cfg, "q_lora_rank") and model_cfg.q_lora_rank is not None:
                self.ctypes_model_config.q_lora_rank = model_cfg.q_lora_rank
            else:
                self.ctypes_model_config.q_lora_rank = 0
            if hasattr(model_cfg, "kv_lora_rank") and model_cfg.kv_lora_rank is not None:
                self.ctypes_model_config.kv_lora_rank = model_cfg.kv_lora_rank
        # add for hunyuan
        if model_cfg.model_type == "hunyuan":
            self.ctypes_model_config.cla_coeffient = 0.5  # hunyuan models use CLA2
            if hasattr(model_cfg, "num_shared_expert") and model_cfg.num_shared_expert is not None:
                self.ctypes_model_config.shared_expert_intermediate_size = model_cfg.num_shared_expert * model_cfg.intermediate_size
            if not self.ctypes_model_config.moe_inner_size and model_cfg.intermediate_size is not None:
                self.ctypes_model_config.moe_inner_size = model_cfg.intermediate_size
            if not self.ctypes_model_config.moe_layer_num and hasattr(model_cfg, "num_experts"):
                self.ctypes_model_config.moe_layer_num = model_cfg.num_hidden_layers

        # flash attention in vLLM only uses a causal mask
        self.ctypes_model_config.use_causal_mask = True

        if weight_dtype == "auto":
            self.ctypes_model_config.data_type = b'float16'
        else:
            self.ctypes_model_config.data_type = weight_dtype.encode('utf-8')
        if quantization is not None:
            with open(self.model + "/quantize_config.json", 'r') as file:
                config = json.load(file)
            if config["quant_mode"] == "SmoothQuant":
                self.ctypes_model_config.smooth_quant_type = b"SmoothQuant"
            else:
                self.ctypes_model_config.smooth_quant_type = b'invalid'
            self.ctypes_model_config.filter_data_type = ("int" + str(config['bits'])).encode('utf-8')
        else:
            self.ctypes_model_config.smooth_quant_type = b'invalid'
            self.ctypes_model_config.filter_data_type = self.ctypes_model_config.data_type

        if kv_cache_dtype == "auto":
            self.ctypes_model_config.kv_cache_dtype = self.ctypes_model_config.data_type
        else:
            self.ctypes_model_config.kv_cache_dtype = kv_cache_dtype.encode('utf-8')

    def get_flops(self, bcfg, once_batch, input_seq_len, output_length, flops_info):
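        """Estimate prefill (context) and per-step decode FLOPs for one
        request shape and store them in flops_info.

        Decode attention FLOPs are computed at the average decode sequence
        length (input_len + output_len / 2). With a causal mask only about
        half of the score/value matmul work is done, hence the 0.5 factor
        applied below.
        """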
        self.batch_size = once_batch
        seq_len = input_seq_len
        hidden_size = bcfg.hidden_size
        voc_size = bcfg.vocab_size
        ffn_size = bcfg.ffn_inner_size
        moe_size = bcfg.moe_inner_size
        shared_expert_intermediate_size = bcfg.shared_expert_intermediate_size
        layer_num = bcfg.layer_num
        out_seq = output_length
        seq_len_decode = seq_len + out_seq / 2
        r = bcfg.head_num / bcfg.head_num_kv
        bsh2 = self.batch_size * seq_len * hidden_size * hidden_size
        cla_coeffient = bcfg.cla_coeffient

        if bcfg.model_type == b'deepseek_v2':
            context_atn_pre, context_atn_qk, context_atn_qkv, context_atn_post = (
                get_deepseek_v2_flops(bcfg, self.batch_size, seq_len, hidden_size)
            )
        else:
            context_atn_pre = 2 * bsh2 + 4 * bsh2 / r * cla_coeffient
            context_atn_qk = 2 * self.batch_size * seq_len * seq_len * hidden_size
            context_atn_qkv = 2 * self.batch_size * seq_len * seq_len * hidden_size
            context_atn_post = 2 * self.batch_size * seq_len * hidden_size * hidden_size
        context_lm_head = 2 * self.batch_size * seq_len * hidden_size * voc_size
        context_ffn = 0
        bh2 = self.batch_size * hidden_size * hidden_size
        decode_atn_pre = 2 * bh2 + 4 * bh2 / r * cla_coeffient
        decode_atn_qk = 2 * self.batch_size * seq_len_decode * hidden_size
        decode_atn_qkv = 2 * self.batch_size * seq_len_decode * hidden_size
        decode_atn_post = 2 * self.batch_size * hidden_size * hidden_size
        decode_lm_head = 2 * self.batch_size * hidden_size * voc_size
        decode_ffn = 0
        coeffient = 6 if bcfg.use_gated_ffn else 4
        if bcfg.experts_num == 0:
            context_ffn = coeffient * self.batch_size * seq_len * hidden_size * ffn_size
            decode_ffn = coeffient * self.batch_size * hidden_size * ffn_size
        else:
            context_ffn = self.batch_size * seq_len * hidden_size * (coeffient * (moe_size * bcfg.topk_num + shared_expert_intermediate_size) + 2 * bcfg.experts_num)
            decode_ffn = self.batch_size * hidden_size * (coeffient * (moe_size * bcfg.topk_num + shared_expert_intermediate_size) + 2 * bcfg.experts_num)
        if bcfg.use_causal_mask:
            c = 0.5
            context_atn_qk *= c
            context_atn_qkv *= c

        flops_info.context_flops = context_lm_head
        flops_info.decoder_flops = decode_lm_head
        # kv_cache_dtype and smooth_quant_type do not change these totals;
        # the same terms are accumulated in every case.
        flops_info.context_flops += layer_num * (context_atn_qk + context_atn_qkv)
        flops_info.decoder_flops += layer_num * (decode_atn_qk + decode_atn_qkv)
        flops_info.context_flops += layer_num * (context_atn_pre + context_atn_post + context_ffn)
        flops_info.decoder_flops += layer_num * (decode_atn_pre + decode_atn_post + decode_ffn)

    def capture_cpu_info(self):
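        """Snapshot CPU-side statistics via the optional device_info helper;
        a no-op when CPU-info dumping is disabled or unavailable."""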
        if VLLM_DUMP_CPU_INFO_EN and self.cpu_info:
            try:
                from vllm_mlu.device_info import capture_cpu_info
                self.cpu_info = capture_cpu_info(self.cpu_info, my_rank=0)
            except Exception:
                logger.info("capture_cpu_info function is not supported")

    def memory_usage(self):
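        """Record host memory usage via the optional device_info helper."""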
        if VLLM_DUMP_CPU_INFO_EN and self.cpu_info:
            try:
                from vllm_mlu.device_info import memory_usage
                self.cpu_info = memory_usage(self.cpu_info)
            except Exception:
                logger.info("memory_usage function is not supported")

    def analyze_perf_data(self, rank=0):
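        """Run the device_info perf-data analysis, if available."""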
        try:
            from vllm_mlu.device_info import analyze_perf_data
            analyze_perf_data(self.cpu_info, self.lib)
        except Exception:
            logger.info("Cannot analyze perf data, no analyze_perf_data function")

    def get_decoder_io_efficiency(self, ctypes_model_config, lib, batch_size, input_len, output_len, generate_latency_device):
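        """Compute decode-phase IO efficiency from the measured decode
        latency, if the device_info helper provides it."""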
        try:
            from vllm_mlu.device_info import get_decoder_io_efficiency
            self.io_efficiency = get_decoder_io_efficiency(
                ctypes_model_config, lib, batch_size, input_len, output_len,
                generate_latency_device)
        except Exception:
            logger.info("get_decoder_io_efficiency function is not supported")

    def get_device_output_info(self,
                               model_cfg,
                               batch_size,
                               input_seq_len,
                               output_length,
                               tp_num,
                               weight_dtype,
                               kv_cache_dtype,
                               quantization):
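        """Fill the ctypes model config, then either query hardware FLOPs
        utilization (HFU) through the device library when MLU-info dumping is
        enabled, or fall back to the analytic FLOPs model in get_flops()."""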
        self.initialize_ctypes_model_config(model_cfg, tp_num, weight_dtype, kv_cache_dtype, quantization)
        if VLLM_DUMP_CPU_INFO_EN and self.so_file:
            self.analyze_perf_data()
        if VLLM_DUMP_MLU_INFO_EN and self.lib:
            from vllm_mlu.device_info import get_flops_inner, HFUInfo
            self.hfu_info = HFUInfo()
            get_flops_inner(self.ctypes_model_config, batch_size, input_seq_len, output_length, tp_num, self.hfu_info, self.lib)
            self.get_decoder_io_efficiency(self.ctypes_model_config,
                                           self.lib,
                                           self.batch_size,
                                           self.input_len,
                                           self.output_len,
                                           self.generate_latency_device)
        else:
            self.flops_info = FlopsInfo()
            self.get_flops(self.ctypes_model_config, batch_size, input_seq_len, output_length, self.flops_info)

    def has_information_dump(self):
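        """Return True when a device-info shared library was loaded."""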
        if self.dev_info and self.dev_info.so_file:
            return True
        return False

    def dump(self):
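        """Compute the output info and dump device/CPU information through
        the device_info helper, if available."""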
        self.get_device_output_info(self.model_config,
                                    self.batch_size,
                                    self.input_len,
                                    self.output_len,
                                    self.tensor_parallel_size,
                                    self.dtype,
                                    self.kv_cache_dtype,
                                    self.quantization)
        try:
            from vllm_mlu.device_info import dump
            dump(LLM.dump_info)
        except Exception:
            logger.info("Dumping device/cpu information is not supported")

    def dump_performance_info(self):
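        """Dump the collected performance information, if supported."""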
        try:
            from vllm_mlu.device_info import dump_information
            dump_information(LLM.dump_info)
        except Exception:
            logger.info("Dumping performance information is not supported")