[Model] Support DeepSeek-V4

This commit is contained in:
chenxb002
2026-04-24 09:50:34 +08:00
commit b9925203b8
172 changed files with 44780 additions and 0 deletions

View File

@@ -0,0 +1,281 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
"""A MLU quant class."""
import functools
from collections import defaultdict
from typing import Dict, Any, List, Optional, Union
import numpy as np
import torch
import torch.distributed
from vllm.distributed import (
get_moe_tensor_parallel_rank, get_moe_tensor_parallel_world_size,
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size,
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
import vllm.envs as envs
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.model_executor.layers.vocab_parallel_embedding import (VocabParallelEmbedding,
ParallelLMHead)
from vllm.model_executor.layers.linear import (
ColumnParallelLinear,
MergedColumnParallelLinear,
QKVParallelLinear,
RowParallelLinear)
from vllm.model_executor.models.deepseek_v2 import DeepseekV2MLAAttention
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
from vllm.logger import init_logger
logger = init_logger(__name__)
def default_act_range_value():
    """Factory for one layer's empty act-range record.

    Used as the default_factory of the act-range defaultdict; every call
    returns a fresh dict (with a fresh input_id list) so layers never share
    mutable state.
    """
    return dict(
        x=None,
        split=None,
        is_linear=False,
        is_qkv=False,
        q_proj_size=0,
        num_kv_head_replicas=1,
        is_merge=False,
        input_id=[],
        self_rank=0,
        rank=None,
        tensor_rank=None,
        tp_world_size=None,
        moe_tp_rank=None,
        moe_tp_world_size=None,
        moe_ep_rank=None,
        moe_ep_world_size=None,
        weight=None,
    )
def _str_to_torch_dtype(dtype: str) -> torch.dtype:
dtype = dtype.split(".")[-1]
# STR_DTYPE_TO_TORCH_DTYPE dict does not have float16 type
return STR_DTYPE_TO_TORCH_DTYPE[dtype] if dtype != "float16" else torch.float16
class ActRangeValue:
    """
    Serializable act-range record for the v1 MsgpackEncoder and MsgpackDecoder.

    This is a *WorkAround*: decoding can produce wrong tensors if the raw
    act-range dict is passed directly, and passing torch.Tensor objects may
    cause a core dump. Every tensor field is therefore converted to a numpy
    ndarray for transport (serial) and converted back on receipt (deserial).
    """

    def __init__(self):
        self.layer_name: str = ""
        # Per-channel activation abs-max; ndarray after serial().
        self.x: Optional[np.ndarray] = None
        # Parallel split of the layer: "row" or "col".
        self.split: Optional[str] = None
        self.is_linear: bool = False
        self.is_qkv: bool = False
        self.q_proj_size: int = 0
        self.num_kv_head_replicas: int = 1
        self.is_merge: bool = False
        # str(torch.dtype) of the original input_id tensor(s), e.g. "torch.bfloat16".
        self.input_id_dtype: Optional[str] = None
        # After serial(): one ndarray (single tensor) or a list of ndarrays.
        self.input_id: Optional[Union[np.ndarray, List[np.ndarray]]] = []
        self.self_rank: int = 0
        self.rank: Optional[int] = None
        self.tensor_rank: Optional[int] = None
        self.tp_world_size: Optional[int] = None
        self.moe_tp_rank: Optional[int] = None
        self.moe_tp_world_size: Optional[int] = None
        self.moe_ep_rank: Optional[int] = None
        self.moe_ep_world_size: Optional[int] = None
        self.weight: Optional[np.ndarray] = None
        # str(torch.dtype) of the original weight tensor; None if no weight.
        self.weight_dtype: Optional[str] = None

    @classmethod
    def serial(cls, layer_name: str, act_range: Dict[str, Any]) -> 'ActRangeValue':
        """Build an ActRangeValue from one layer's act-range dict.

        Tensor fields are converted to numpy ndarrays; the original torch
        dtypes of input_id/weight are recorded so deserial() can restore
        them. Tensors are assumed to already be on CPU (get_act_range moves
        them there before calling this).
        """
        instance = cls()
        instance.layer_name = layer_name
        instance.x = act_range.get("x")
        instance.split = act_range.get("split")
        instance.is_linear = act_range.get("is_linear", False)
        instance.is_qkv = act_range.get("is_qkv", False)
        instance.q_proj_size = act_range.get("q_proj_size", 0)
        instance.num_kv_head_replicas = act_range.get("num_kv_head_replicas", 1)
        instance.is_merge = act_range.get("is_merge", False)
        instance.input_id = act_range.get("input_id", [])
        instance.self_rank = act_range.get("self_rank", 0)
        instance.rank = act_range.get("rank")
        instance.tensor_rank = act_range.get("tensor_rank")
        instance.tp_world_size = act_range.get("tp_world_size")
        instance.moe_tp_rank = act_range.get("moe_tp_rank")
        instance.moe_tp_world_size = act_range.get("moe_tp_world_size")
        instance.moe_ep_rank = act_range.get("moe_ep_rank")
        instance.moe_ep_world_size = act_range.get("moe_ep_world_size")
        instance.weight = act_range.get("weight")
        if instance.x is not None:
            instance.x = instance.x.numpy()
        # input_id and weight are used for debug. They are cast to float32
        # before .numpy() because numpy has no bfloat16/half equivalent; the
        # original dtype string is kept for the round trip.
        if isinstance(instance.input_id, torch.Tensor):
            instance.input_id_dtype = str(instance.input_id.dtype)
            instance.input_id = instance.input_id.float().numpy()
        else:
            input_id_np = []
            for input_id in instance.input_id:
                instance.input_id_dtype = str(input_id.dtype)
                input_id_np.append(input_id.float().numpy())
            instance.input_id = input_id_np
        if instance.weight is not None:
            instance.weight_dtype = str(instance.weight.dtype)
            instance.weight = instance.weight.float().numpy()
        return instance

    def deserial(self) -> Dict[str, Any]:
        """Rebuild the act-range dict, converting ndarrays back to tensors."""
        act_range = self.to_dict()
        if self.x is not None:
            act_range["x"] = torch.from_numpy(self.x)
        if self.input_id is not None:
            # serial() stores a single Tensor as one ndarray and a list of
            # Tensors as a list of ndarrays; mirror both shapes here.
            # BUGFIX: this previously tested isinstance(..., torch.Tensor),
            # which is always False after serial() (tensors were converted
            # to ndarrays), so a single saved tensor was silently
            # re-expanded into a list of its row tensors.
            if isinstance(self.input_id, np.ndarray):
                act_range["input_id"] = torch.from_numpy(self.input_id).to(
                    _str_to_torch_dtype(self.input_id_dtype))
            else:
                input_id_tensor = []
                for input_id in self.input_id:
                    input_id_tensor.append(torch.from_numpy(input_id).to(
                        _str_to_torch_dtype(self.input_id_dtype)))
                act_range["input_id"] = input_id_tensor
        if self.weight_dtype is not None:
            act_range["weight"] = torch.from_numpy(self.weight).to(
                _str_to_torch_dtype(self.weight_dtype))
        return act_range

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain dict (fields in their stored form)."""
        return {
            "x": self.x,
            "split": self.split,
            "is_linear": self.is_linear,
            "is_qkv": self.is_qkv,
            "q_proj_size": self.q_proj_size,
            "num_kv_head_replicas": self.num_kv_head_replicas,
            "is_merge": self.is_merge,
            "input_id": self.input_id,
            "self_rank": self.self_rank,
            "rank": self.rank,
            "tensor_rank": self.tensor_rank,
            "tp_world_size": self.tp_world_size,
            "moe_tp_rank": self.moe_tp_rank,
            "moe_tp_world_size": self.moe_tp_world_size,
            "moe_ep_rank": self.moe_ep_rank,
            "moe_ep_world_size": self.moe_ep_world_size,
            "weight": self.weight,
        }

    def __repr__(self) -> str:
        return f"layer: {self.layer_name}, ActRangeValue({self.to_dict()})"
class MLUWorkerQuant(object):
'''
mlu quant
'''
def stat_tensor(self, name, tensor, act_range, key, dim):
logger.debug(f"name:{name}, key:{key}, dim:{dim}, tensor.shape:{tensor.shape}")
hidden_dim = tensor.shape[-1]
tensor = tensor.view(-1, hidden_dim).abs()
comming_max = torch.max(tensor, dim=dim)[0].float()
if act_range[name][key] is None:
act_range[name][key] = comming_max
else:
act_range[name][key] = torch.max(act_range[name][key], comming_max)
def stat_input_hook(self, m, x, y, name, act_range, is_linear, is_save_input_id):
if isinstance(x, tuple):
x = x[0]
if isinstance(y, tuple):
y = y[0]
logger.debug(f"name:{name}, x.shape:{x.shape}, y.shape:{y.shape}, m.weight.shape:{m.weight.shape}")
if is_linear:
self.stat_tensor(name, x, act_range, "x", 0)
if act_range[name]["is_qkv"] and is_save_input_id and ".0." in name:
x_cpu = x.clone().to("cpu")
act_range[name]["input_id"].append(x_cpu)
def setup_smooth_hook(self, is_save_input_id: bool = False, is_save_moe_info: bool = False):
    """Register forward hooks that collect per-layer activation ranges.

    Walks the main model (and the speculative-decoding drafter model, if
    present), disables fused kernels so the unfused parallel modules
    actually execute, and attaches stat_input_hook to every parallel
    linear / embedding module.

    Args:
        is_save_input_id: if True, stat_input_hook also keeps CPU clones of
            the first decoder layer's QKV inputs for debugging.
        is_save_moe_info: if True, record distributed-rank metadata (and the
            weights of ".expert." layers) alongside the act range.

    Side effects: (re)initializes self.act_range and self.hooks.
    """
    models = [self.model_runner.model]
    if hasattr(self.model_runner, "drafter") and self.model_runner.drafter is not None:
        models += [self.model_runner.drafter.model]
    self.act_range = defaultdict(default_act_range_value)
    self.hooks = []
    linear_class_list = (ColumnParallelLinear, MergedColumnParallelLinear,
                         QKVParallelLinear, RowParallelLinear)
    other_class_list = (VocabParallelEmbedding, ParallelLMHead)
    class_list = linear_class_list + other_class_list
    # FIX: was `(RowParallelLinear)` -- not a tuple (missing trailing comma).
    # isinstance() accepts a bare class too, so behavior is unchanged, but a
    # real tuple matches the other *_class_list names and the name's intent.
    row_class_list = (RowParallelLinear,)
    for model in models:
        for name, m in model.named_modules():
            # Force unfused execution paths so the hooks see these modules.
            if isinstance(m, FeedForward):
                m.use_bt_ffn = False
            if isinstance(m, SparseMoeMlp):
                m.is_use_fused_moe = False
            if isinstance(m, DeepseekV2MLAAttention):
                m.use_fused_mla_qkv = False
            if not isinstance(m, class_list):
                continue
            is_linear = isinstance(m, linear_class_list)
            split_type = "row" if isinstance(m, row_class_list) else "col"
            self.act_range[name]["split"] = split_type
            self.act_range[name]["is_linear"] = is_linear
            if isinstance(m, QKVParallelLinear):
                self.act_range[name]["is_qkv"] = True
                self.act_range[name]["q_proj_size"] = m.num_heads * m.head_size
                self.act_range[name]["num_kv_head_replicas"] = m.num_kv_head_replicas
            self.act_range[name]["is_merge"] = isinstance(m, MergedColumnParallelLinear)
            if is_save_moe_info:
                self.act_range[name]["rank"] = torch.distributed.get_rank()
                self.act_range[name]["tensor_rank"] = get_tensor_model_parallel_rank()
                self.act_range[name]["tp_world_size"] = get_tensor_model_parallel_world_size()
                self.act_range[name]["moe_tp_rank"] = get_moe_tensor_parallel_rank()
                self.act_range[name]["moe_tp_world_size"] = get_moe_tensor_parallel_world_size()
                self.act_range[name]["moe_ep_rank"] = get_moe_expert_parallel_rank()
                self.act_range[name]["moe_ep_world_size"] = get_moe_expert_parallel_world_size()
                if ".expert." in name:
                    self.act_range[name]["weight"] = m.weight
            # NOTE(review): assumes self.rank is set elsewhere on this worker -- confirm.
            logger.info(f"rank:{self.rank}, add hook to {name}, is_linear:{is_linear}, split_type:{split_type}")
            self.hooks.append(m.register_forward_hook(functools.partial(
                self.stat_input_hook, name=name, act_range=self.act_range,
                is_linear=is_linear, is_save_input_id=is_save_input_id)))
def remove_hooks(self):
for h in self.hooks:
h.remove()
def get_act_range(self):
    """Return the collected act ranges as a list of ActRangeValue objects.

    Every torch.Tensor (including each entry of the input_id lists) is first
    moved to CPU, then each layer's dict is wrapped via ActRangeValue.serial
    so the result is safe to send through the msgpack transport.
    """
    cpu_ranges = defaultdict(default_act_range_value)
    for layer_name, layer_range in self.act_range.items():
        for key, value in layer_range.items():
            if isinstance(value, torch.Tensor):
                cpu_ranges[layer_name][key] = value.to("cpu")
            elif key == "input_id" and isinstance(value, list):
                dest = cpu_ranges[layer_name][key]
                for item in value:
                    dest.append(item.to("cpu") if isinstance(item, torch.Tensor) else item)
            else:
                cpu_ranges[layer_name][key] = value
    return [ActRangeValue.serial(layer_name, layer_range)
            for layer_name, layer_range in cpu_ranges.items()]
@torch.no_grad()
def get_named_parameters(self):
name_parameters = {}
for name, param in self.model_runner.model.named_parameters():
name_parameters[name] = param.to("cpu")
return name_parameters