forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
This commit is contained in:
@@ -0,0 +1,2 @@
|
||||
# Banner so a user can see at import time that the Expert Parallel demo
# package (rather than stock vllm_mlu) is active.
print("Apply Expert Parallel Demo!")
# Importing the subpackage runs its modules, which register the MLU hijacks
# (via MluHijackObject.apply_hijack calls) as an import side effect.
from . import model_executor
|
||||
@@ -0,0 +1,5 @@
|
||||
from .layers import sparse_moe_mlp
|
||||
from .models import custom
|
||||
from .models import mixtral
|
||||
from .models import qwen2_moe
|
||||
from .models import deepseek_v2
|
||||
@@ -0,0 +1 @@
|
||||
|
||||
@@ -0,0 +1,142 @@
|
||||
"""
|
||||
Inference-only MOE model.
|
||||
|
||||
Tensor Parallel evenly splits each expert's weight and distributes them to different ranks,
|
||||
which means each rank holds partial weight of all experts.
|
||||
While Expert Parallel evenly distributes some of the experts' full weight to different ranks,
|
||||
which means each rank holds part of the experts' full weight.
|
||||
|
||||
As a result, each rank in the Tensor Parallel group receives all tokens' hidden states for all experts,
|
||||
then computes using the partial weights, while for Expert Parallel, each rank only receives
|
||||
part of tokens' hidden states for experts on this rank, then computes using the full weights.
|
||||
|
||||
When both Tensor Parallel and Expert Parallel are enabled, each rank handles
|
||||
a portion of the expert weights matrices (as in EP mode) and these weights are further sliced
|
||||
across ranks (as in TP mode). This hybrid approach aims to balance the workload more evenly across ranks,
|
||||
enhancing efficiency and reducing the likelihood of bottlenecks associated with EP mode alone.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from vllm.distributed import (get_tensor_model_parallel_rank,
|
||||
get_tensor_model_parallel_world_size,
|
||||
get_tensor_model_parallel_group)
|
||||
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
|
||||
get_moe_tensor_parallel_rank, get_moe_tensor_parallel_world_size, get_moe_tensor_parallel_group,
|
||||
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size, get_moe_expert_parallel_group)
|
||||
from vllm.model_executor.layers.linear import ReplicatedLinear
|
||||
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
||||
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
|
||||
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from vllm_mlu._mlu_utils import get_device_major_capability
|
||||
|
||||
|
||||
def vllm__mlu_hijack__model_executor__layers__feed_forward__SparseMoeMlp____init__(
        self,
        num_experts: int,
        top_k: int,
        hidden_size: int,
        intermediate_size: int,
        up_proj_name: str,
        is_gated: bool,
        down_proj_name: str,
        has_bias: bool,
        skip_bias_add: bool = False,
        renormalize: bool = False,
        hidden_act: str = "silu",
        params_dtype: Optional[torch.dtype] = None,
        quant_config: Optional[QuantizationConfig] = None,
        is_use_fused_moe: bool = False,
        expert_group: int = 1,
        topk_group: int = 1,
):
    """Replacement for ``SparseMoeMlp.__init__`` that adds MoE TP/EP support.

    Installed via ``MluHijackObject.apply_hijack`` below. In addition to the
    plain tensor-parallel setup, it reads the MoE-specific tensor-parallel and
    expert-parallel process groups and instantiates only the experts assigned
    to this EP rank (``start_expert_id`` .. ``end_expert_id``), each built as a
    ``FeedForward`` sharded over the MoE TP group.
    """
    # Bypass SparseMoeMlp's own __init__ (we are replacing it) and run the
    # nn.Module base initializer directly.
    super(SparseMoeMlp, self).__init__()
    self.tp_rank = get_tensor_model_parallel_rank()
    self.tp_size = get_tensor_model_parallel_world_size()
    self.tp_group = get_tensor_model_parallel_group()
    self.num_total_experts = num_experts
    self.top_k = top_k
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.up_proj_name = up_proj_name
    self.is_gated = is_gated
    self.down_proj_name = down_proj_name
    self.has_bias = has_bias
    self.renormalize = renormalize
    self.hidden_act = hidden_act
    self.quant_config = quant_config
    self.is_use_fused_moe = is_use_fused_moe
    self.expert_group = expert_group
    self.topk_group = topk_group
    # Fused MoE kernels are disabled on major-capability-3 devices
    # (presumably unsupported there -- TODO confirm against kernel docs).
    if get_device_major_capability() == 3:
        self.is_use_fused_moe = False

    if params_dtype is None:
        params_dtype = torch.get_default_dtype()
    self.params_dtype = params_dtype

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add moe relative distribution
    '''
    self.moe_tp_size = get_moe_tensor_parallel_world_size()
    self.moe_tp_rank = get_moe_tensor_parallel_rank()
    self.moe_tp_group = get_moe_tensor_parallel_group()
    self.moe_ep_size = get_moe_expert_parallel_world_size()
    self.moe_ep_rank = get_moe_expert_parallel_rank()
    self.moe_ep_group = get_moe_expert_parallel_group()

    # NOTE: The bias for fc2 is only applied on tp_rank 0. If we added it on all nodes the allreduce() would
    # contain multiple copies of the bias. The bias on other node will be ignored, and may be set to nullptr
    self.skip_bias_add = self.moe_tp_rank > 0

    assert self.num_total_experts >= self.moe_ep_size, (
        f"need num_total_experts:{self.num_total_experts} >= moe_ep_size:{self.moe_ep_size}")

    assert self.intermediate_size % self.moe_tp_size == 0, (
        f"need intermediate_size:{self.intermediate_size} % moe_tp_size:{self.moe_tp_size} == 0")

    # Each EP rank owns a contiguous chunk of ceil(total / ep_size) experts;
    # the last rank takes whatever is left over.
    self.num_experts_per_rank = (self.num_total_experts + self.moe_ep_size - 1) // self.moe_ep_size
    if self.moe_ep_rank + 1 == self.moe_ep_size and self.num_total_experts % self.moe_ep_size:
        # BUGFIX: the leftover on the last rank is
        # total - (ep_size - 1) * ceil(total / ep_size), NOT total % ep_size.
        # e.g. 10 experts over 4 ranks: ranks 0-2 own 3 each, the last rank
        # must own 1 (10 % 4 == 2 would overlap rank 2's experts and make
        # end_expert_id exceed num_total_experts).
        self.num_experts_per_rank = (self.num_total_experts
                                     - (self.moe_ep_size - 1) * self.num_experts_per_rank)

    self.start_expert_id = self.moe_ep_rank * ((self.num_total_experts + self.moe_ep_size - 1) // self.moe_ep_size)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    self.end_expert_id = self.start_expert_id + self.num_experts_per_rank

    # Gate always runs at half / full precision for now.
    self.gate = ReplicatedLinear(self.hidden_size,
                                 self.num_total_experts,
                                 bias=False,
                                 params_dtype=self.params_dtype,
                                 quant_config=None)
    # Only the experts assigned to this EP rank are materialized; each one is
    # TP-sharded over the MoE TP group with the allreduce deferred
    # (reduce_results=False -- the caller reduces after combining experts).
    self.experts = nn.ModuleList([
        FeedForward(hidden_size=self.hidden_size,
                    intermediate_size=self.intermediate_size,
                    hidden_act=self.hidden_act,
                    up_proj_name=self.up_proj_name,
                    is_gated=self.is_gated,
                    down_proj_name=self.down_proj_name,
                    bias=self.has_bias,
                    quant_config=self.quant_config,
                    skip_bias_add=self.skip_bias_add,
                    reduce_results=False,
                    tp_group=self.moe_tp_group) for idx in range(self.num_experts_per_rank)
    ])

    self.init_pack_param()


MluHijackObject.apply_hijack(SparseMoeMlp,
                             SparseMoeMlp.__init__,
                             vllm__mlu_hijack__model_executor__layers__feed_forward__SparseMoeMlp____init__)
|
||||
@@ -0,0 +1,183 @@
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from typing import Optional
|
||||
from vllm.config import CacheConfig
|
||||
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
||||
from vllm_mlu._mlu_utils import *
|
||||
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from vllm.distributed import tensor_model_parallel_all_reduce
|
||||
from vllm_mlu.transformers_utils.configs import CustomConfig
|
||||
from vllm_mlu.model_executor.custom_model.custom import CustomDecoderLayer, CustomAttention, _NORM_DICT
|
||||
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
|
||||
from vllm.model_executor.layers.linear import ReplicatedLinear
|
||||
from vllm_mlu.model_executor.models.layer_utils import (
|
||||
decoder_layer_forward_base, is_per_tensor_smoothquant,
|
||||
is_per_token_smoothquant, quant_fusion_with_rmsnorm,
|
||||
quant_fusion_with_layernorm)
|
||||
|
||||
|
||||
class CustomMoeBlock(SparseMoeMlp):
    """Sparse-MoE block for the Custom model family, built on SparseMoeMlp.

    Adds an optional Qwen2-MoE-style shared expert whose output is gated by a
    sigmoid over a 1-output ReplicatedLinear and summed with the routed-expert
    output.
    """

    def __init__(
        self,
        config: CustomConfig,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        # Map the model config onto SparseMoeMlp's generic MoE arguments.
        super().__init__(num_experts=config.num_experts,
                         top_k=config.num_experts_per_tok,
                         hidden_size=config.hidden_size,
                         intermediate_size=config.moe_intermediate_size,
                         up_proj_name="gate_up_proj",
                         is_gated=config.is_gated,
                         down_proj_name="down_proj",
                         has_bias=config.mlp_bias,
                         skip_bias_add=False,
                         renormalize=config.norm_topk_prob,
                         hidden_act=config.hidden_act,
                         params_dtype=None,
                         quant_config=quant_config,
                         is_use_fused_moe=True)

        self.config = config
        self.rank = self.tp_rank
        self.shared_expert = None
        self.shared_expert_gate = None
        # A shared expert is only built when the config asks for one.
        if config.shared_expert_intermediate_size > 0:
            # reduce_results=False: the allreduce (if any) happens once at the
            # end of forward(), after routed and shared outputs are combined.
            self.shared_expert = FeedForward(hidden_size=config.hidden_size,
                                             intermediate_size=config.shared_expert_intermediate_size,
                                             hidden_act=config.hidden_act,
                                             up_proj_name='gate_up_proj',
                                             is_gated=config.is_gated,
                                             down_proj_name='down_proj',
                                             bias=config.mlp_bias,
                                             quant_config=quant_config,
                                             reduce_results=False)
            # Scalar (1-output) gate deciding how much shared-expert output
            # to blend in, replicated on every rank.
            self.shared_expert_gate = ReplicatedLinear(config.hidden_size,
                                                       1,
                                                       bias=False,
                                                       params_dtype=self.params_dtype,
                                                       quant_config=None)

    def forward(self, hidden_states: torch.Tensor, residual: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Route tokens through top-k experts (plus optional shared expert).

        hidden_states: 2-D (num_tokens, hidden_dim) activations.
        residual: optional residual passed through to forward_experts.
        Returns a tensor of the same (num_tokens, hidden_dim) shape.
        """
        num_tokens, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)
        shared_output = None
        if self.shared_expert is not None:
            shared_output = self.shared_expert(hidden_states)
            if self.shared_expert_gate is not None:
                gate_output = self.shared_expert_gate(hidden_states)
                # gate_output[0]: ReplicatedLinear returns (output, bias).
                shared_output = F.sigmoid(gate_output[0]) * shared_output

        # router_logits: (num_tokens, n_experts)
        router_logits, _ = self.gate(hidden_states)
        # NOTE(review): residual_ is computed but never used -- forward_experts
        # below receives the original `residual`. Either dead code or a bug
        # (rank>0 masking intended?); confirm intent before removing.
        residual_ = None if self.rank > 0 else residual
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: modify bt_ops.fused_moe to forward_experts
        '''
        final_hidden_states = self.forward_experts(hidden_states, router_logits, residual)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        if shared_output is not None:
            final_hidden_states = final_hidden_states + shared_output
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: add comment to explain use_parallel_residual usage
        '''
        # use_parallel_residual = True: x = x + attn(ln1(x)) + mlp(ln2(x))
        # use_parallel_residual = False:
        #     if apply_residual_connection_post_layernorm:
        #         x_attn = ln1(x) + attn(ln1(x))
        #         x_mlp = ln2(x_attn) + mlp(ln2(x_attn))
        #     else:
        #         x_attn = x + attn(ln1(x))
        #         x_mlp = x_attn + mlp(ln2(x_attn))
        # When use_parallel_residual = True, x is shared between attn and mlp, so we only need to
        # reduce after x + attn(ln1(x)) + mlp(ln2(x)) and don't need reduce here
        # But when use_parallel_residual = False, mlp layer uses attn layer's output, so need reduce
        # when mlp is finished.
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        reduce_results = (self.config.use_parallel_residual == False)
        if reduce_results and self.tp_size > 1:
            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)

        return final_hidden_states.view(num_tokens, hidden_dim)
|
||||
|
||||
|
||||
def vllm__mlu_hijack__model_executor__custom_model__custom__CustomDecoderLayer____init__(
        self,
        config: CustomConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
) -> None:
    """Replacement for ``CustomDecoderLayer.__init__`` (installed below).

    Identical wiring to the stock layer except that the MoE branch uses the
    ``CustomMoeBlock`` defined in this file (which supports expert parallel).
    """
    # Bypass CustomDecoderLayer's own __init__ (we replace it) and run the
    # nn.Module base initializer directly.
    super(CustomDecoderLayer, self).__init__()
    self.config = config
    self.self_attn = CustomAttention(
        config=config,
        cache_config=cache_config,
        quant_config=quant_config,
    )

    # Some configs name the MLP bias flag "mlp_bias", others just "bias".
    mlp_bias = getattr(config, "mlp_bias", False) or getattr(config, "bias", False)
    is_gated = getattr(config, "is_gated", False)

    # num_experts set -> MoE layer; otherwise a dense FeedForward MLP.
    if config.num_experts is not None:
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: nothing changed, only use the CustomMoeBlock class in this file
        '''
        self.mlp = CustomMoeBlock(config=config,
                                  quant_config=quant_config)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
    else:
        # In parallel-residual mode the MLP's reduce is deferred (see
        # CustomMoeBlock.forward comment), hence the coupled
        # skip_bias_add / reduce_results settings below.
        self.mlp = FeedForward(hidden_size=config.hidden_size,
                               intermediate_size=config.intermediate_size,
                               hidden_act=self.config.hidden_act,
                               up_proj_name='up_proj',
                               is_gated=is_gated,
                               down_proj_name='down_proj',
                               bias=mlp_bias,
                               quant_config=quant_config,
                               skip_bias_add=(self.config.use_parallel_residual and mlp_bias),
                               reduce_results = (self.config.use_parallel_residual == False))

    self.input_layernorm = _NORM_DICT[self.config.norm_type](config.hidden_size, eps=config.norm_eps)
    self.post_attention_layernorm = _NORM_DICT[self.config.norm_type](config.hidden_size, eps=config.norm_eps)

    # perf per-tensor sq cases by fusing quantization in layernorm
    self.is_per_tesnor_sq_perf_cases = (is_per_tensor_smoothquant(quant_config) and
                                        not self.config.apply_residual_connection_post_layernorm)
    self.is_per_token_sq_perf_cases = (is_per_token_smoothquant(quant_config) and
                                       not self.config.apply_residual_connection_post_layernorm)
    if self.is_per_tesnor_sq_perf_cases or self.is_per_token_sq_perf_cases:
        # Quantization is fused into the preceding layernorm, so the linear
        # layers must not quantize their input a second time.
        self.self_attn.qkv_proj.quant_method.skip_quant_input = True
        self.quant_fusion_attn_layernorm = None
        self.is_moe = config.num_experts is not None
        self.use_rmsnorm = self.config.norm_type == "rmsnorm"
        if not self.is_moe:
            self.mlp.up_proj.quant_method.skip_quant_input = True
            self.quant_fusion_mlp_layernorm = None


MluHijackObject.apply_hijack(CustomDecoderLayer,
                             CustomDecoderLayer.__init__,
                             vllm__mlu_hijack__model_executor__custom_model__custom__CustomDecoderLayer____init__)
|
||||
@@ -0,0 +1,222 @@
|
||||
|
||||
import re
|
||||
import torch
|
||||
from torch import nn
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||||
from transformers import PretrainedConfig
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
ReplicatedLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.quantization.base_config import (
|
||||
QuantizationConfig)
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
|
||||
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
|
||||
from vllm.model_executor.models.utils import is_pp_missing_parameter
|
||||
|
||||
from vllm.model_executor.models.deepseek_v2 import DeepseekV2ForCausalLM
|
||||
from vllm_mlu.model_executor.models.deepseek_v2 import DeepseekV2MoE
|
||||
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
|
||||
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
|
||||
|
||||
|
||||
def vllm_mlu__model_executor__models__deepseek_v2__DeepseekV2MoE____init__(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
):
    """Replacement for ``DeepseekV2MoE.__init__`` (installed below).

    Builds the routed experts through the hijacked SparseMoeMlp base (which
    handles MoE TP/EP distribution) and the shared experts as a single
    ``FeedForward`` whose allreduce is deferred (reduce_results=False).
    """
    # Delegate routed-expert construction to SparseMoeMlp via the MRO above
    # DeepseekV2MoE.
    super(DeepseekV2MoE, self).__init__(num_experts=config.n_routed_experts,
                                        top_k=config.num_experts_per_tok,
                                        hidden_size=config.hidden_size,
                                        intermediate_size=config.moe_intermediate_size,
                                        up_proj_name="gate_up_proj",
                                        is_gated=True,
                                        down_proj_name="down_proj",
                                        has_bias=False,
                                        skip_bias_add=False,
                                        renormalize=config.norm_topk_prob,
                                        hidden_act=config.hidden_act,
                                        params_dtype=None,
                                        quant_config=quant_config,
                                        is_use_fused_moe=True,
                                        expert_group=config.n_group,
                                        topk_group=config.topk_group)
    self.config = config
    # NOTE: the original code assigned routed_scaling_factor twice; the
    # redundant duplicate assignment was removed.
    self.routed_scaling_factor = config.routed_scaling_factor
    self.n_shared_experts = config.n_shared_experts
    if self.moe_tp_size > config.n_routed_experts:
        raise ValueError(
            f"Moe Tensor parallel size {self.moe_tp_size} is greater than "
            f"the number of experts {config.n_routed_experts}.")

    if config.hidden_act != "silu":
        raise ValueError(f"Unsupported activation: {config.hidden_act}. "
                         "Only silu is supported for now.")

    self.gate = ReplicatedLinear(config.hidden_size,
                                 config.n_routed_experts,
                                 bias=False,
                                 quant_config=None,
                                 prefix=f"{prefix}.gate")
    if config.n_shared_experts is not None:
        # All shared experts are fused into one FeedForward with a
        # proportionally larger intermediate size.
        intermediate_size = (config.moe_intermediate_size *
                             config.n_shared_experts)
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace MLP with FeedForward.
        '''
        self.shared_experts = FeedForward(hidden_size=config.hidden_size,
                                          intermediate_size=intermediate_size,
                                          hidden_act=config.hidden_act,
                                          up_proj_name='gate_up_proj',
                                          is_gated=True,
                                          down_proj_name='down_proj',
                                          bias=False,
                                          quant_config=quant_config,
                                          reduce_results=False)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
|
||||
def vllm__module_executor__models__deepseek_v2__DeepseekV2ForCausalLM__load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
    """EP-aware replacement for DeepseekV2ForCausalLM.load_weights.

    Differences from upstream: packs SparseMoeMlp params up front, renames
    checkpoint expert ids to this rank's local expert indices, and silently
    skips expert weights that belong to other EP ranks.
    """
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: pack params and cal start expert id
    '''
    for name, m in self.model.named_modules():
        if isinstance(m, SparseMoeMlp):
            m.pack_params()

    # expert parallel modification start
    moe_ep_rank = get_moe_expert_parallel_rank()
    moe_ep_size = get_moe_expert_parallel_world_size()
    num_total_experts = self.config.n_routed_experts
    # First global expert id owned by this rank (contiguous ceil-sized chunks).
    start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size)
    # expert parallel modification end
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: delete expert_params_mapping for no useless
    '''
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        if "rotary_emb.inv_freq" in name:
            continue
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace expert_id in weight to named_expert_id in params_dict
        '''
        # Translate the checkpoint's global expert id into this rank's local
        # index (local modules are named experts.0 .. experts.n-1).
        if start_expert_id > 0 and "mlp.experts." in name:
            expert_str = re.search(r'experts\.\d+', name).group(0)
            expert_id=int(expert_str.split(".")[1])
            named_expert_id = expert_id - start_expert_id
            old_expert_name = f"experts.{expert_id}"
            new_expert_name = f"experts.{named_expert_id}"
            name = name.replace(old_expert_name, new_expert_name)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            # Skip non-stacked layers and experts (experts handled below).
            if weight_name not in name:
                continue
            # We have mlp.experts[0].gate_proj in the checkpoint.
            # Since we handle the experts below in expert_params_mapping,
            # we need to skip here BEFORE we update the name, otherwise
            # name will be updated to mlp.experts[0].gate_up_proj, which
            # will then be updated below in expert_params_mapping
            # for mlp.experts[0].gate_gate_up_proj, which breaks load.
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition and delete useless if name not in params_dict: continue condition
            '''
            name = name.replace(weight_name, param_name)
            # Expert weights not present in params_dict belong to another EP
            # rank -- skip them instead of raising KeyError below.
            if (("mlp.experts." in name or "mlp.shared_experts." in name or "mlp.shared_expert_gate." in name)
                and name not in params_dict):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue

            if is_pp_missing_parameter(name, self):
                continue

            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue

            # Skip expert weights owned by other EP ranks (see above).
            if (("mlp.experts." in name or "mlp.shared_experts." in name or "mlp.shared_expert_gate." in name)
                and name not in params_dict):
                continue
            if is_pp_missing_parameter(name, self):
                continue

            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
            '''
            ==================
            End of MLU Hijack
            ==================
            '''


MluHijackObject.apply_hijack(DeepseekV2MoE,
                             DeepseekV2MoE.__init__,
                             vllm_mlu__model_executor__models__deepseek_v2__DeepseekV2MoE____init__)
MluHijackObject.apply_hijack(DeepseekV2ForCausalLM,
                             DeepseekV2ForCausalLM.load_weights,
                             vllm__module_executor__models__deepseek_v2__DeepseekV2ForCausalLM__load_weights)
|
||||
@@ -0,0 +1,143 @@
|
||||
import torch
|
||||
import re
|
||||
import vllm
|
||||
from torch import nn
|
||||
from typing import List, Optional, Tuple, Iterable
|
||||
from vllm_mlu._mlu_utils import *
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from vllm.model_executor.models.mixtral import MixtralForCausalLM
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name
|
||||
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
|
||||
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
|
||||
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
|
||||
from vllm.model_executor.models.utils import is_pp_missing_parameter
|
||||
|
||||
|
||||
def vllm__module_executor__models__mixtral__MixtralForCausalLM__load_weights(
        self,
        weights: Iterable[Tuple[str, torch.Tensor]]):
    """EP-aware replacement for MixtralForCausalLM.load_weights.

    Packs SparseMoeMlp params, remaps checkpoint expert ids to this EP rank's
    local indices, and skips expert weights owned by other EP ranks.
    """
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: pack params and cal start expert id
    '''
    for name, m in self.model.named_modules():
        if isinstance(m, SparseMoeMlp):
            m.pack_params()
    # expert parallel modification start
    moe_ep_rank = get_moe_expert_parallel_rank()
    moe_ep_size = get_moe_expert_parallel_world_size()
    num_total_experts = self.config.num_local_experts
    # First global expert id owned by this rank (contiguous ceil-sized chunks).
    start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size)
    # expert parallel modification end
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("w13", "w1", 0),
        ("w13", "w3", 1),
    ]

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: delete expert_params_mapping for no useless
    '''
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        if "rotary_emb.inv_freq" in name:
            continue
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace expert_id in weight to named_expert_id in params_dict
        '''
        # Translate the checkpoint's global expert id into this rank's local
        # index (local modules are named experts.0 .. experts.n-1).
        if start_expert_id > 0 and "block_sparse_moe.experts." in name:
            expert_str = re.search(r'experts\.\d+', name).group(0)
            expert_id=int(expert_str.split(".")[1])
            named_expert_id = expert_id - start_expert_id
            old_expert_name = f"experts.{expert_id}"
            new_expert_name = f"experts.{named_expert_id}"
            name = name.replace(old_expert_name, new_expert_name)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                and name not in params_dict):
                continue
            if is_pp_missing_parameter(name, self):
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("block_sparse_moe.experts." in name) and (name not in params_dict)):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                and name not in params_dict):
                continue
            if is_pp_missing_parameter(name, self):
                continue
            # Remapping the name of FP8 kv-scale.
            name = maybe_remap_kv_scale_name(name, params_dict)
            if name is None:
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("block_sparse_moe.experts." in name) and (name not in params_dict)):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)


MluHijackObject.apply_hijack(MixtralForCausalLM,
                             MixtralForCausalLM.load_weights,
                             vllm__module_executor__models__mixtral__MixtralForCausalLM__load_weights)
|
||||
@@ -0,0 +1,179 @@
|
||||
import torch
|
||||
import re
|
||||
from typing import Optional, Iterable, Tuple
|
||||
from vllm_mlu._mlu_utils import *
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from vllm.model_executor.models.qwen2_moe import Qwen2MoeForCausalLM
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
|
||||
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
|
||||
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
|
||||
from vllm.utils import print_warning_once
|
||||
from vllm.model_executor.models.utils import is_pp_missing_parameter
|
||||
|
||||
|
||||
def vllm__module_executor__models__qwen2moe__Qwen2MoeForCausalLM__load_weights(
        self,
        weights: Iterable[Tuple[str, torch.Tensor]]):
    """EP-aware replacement for Qwen2MoeForCausalLM.load_weights.

    Packs SparseMoeMlp params, remaps checkpoint expert ids to this EP rank's
    local indices, and skips expert / shared-expert weights owned by other
    EP ranks.
    """
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: pack params and cal start expert id
    '''
    for name, m in self.model.named_modules():
        if isinstance(m, SparseMoeMlp):
            m.pack_params()

    # expert parallel modification start
    moe_ep_rank = get_moe_expert_parallel_rank()
    moe_ep_size = get_moe_expert_parallel_world_size()
    num_total_experts = self.config.num_experts
    # First global expert id owned by this rank (contiguous ceil-sized chunks).
    start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size)
    # expert parallel modification end
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: delete expert_params_mapping for no useless
    '''
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        if "rotary_emb.inv_freq" in name:
            continue
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace expert_id in weight to named_expert_id in params_dict
        '''
        # Translate the checkpoint's global expert id into this rank's local
        # index (local modules are named experts.0 .. experts.n-1).
        if start_expert_id > 0 and "mlp.experts." in name:
            expert_str = re.search(r'experts\.\d+', name).group(0)
            expert_id=int(expert_str.split(".")[1])
            named_expert_id = expert_id - start_expert_id
            old_expert_name = f"experts.{expert_id}"
            new_expert_name = f"experts.{named_expert_id}"
            name = name.replace(old_expert_name, new_expert_name)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: delete if "mlp.experts" in name: continue condition
            '''
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                and name not in params_dict):
                continue
            # Skip layers on other devices.
            if is_pp_missing_parameter(name, self):
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition and delete useless if name not in params_dict: continue condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("mlp.experts." in name or "mlp.shared_expert." in name or "mlp.shared_expert_gate." in name)
                and name not in params_dict):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: delete for mapping in expert_params_mapping condition
            '''
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                and name not in params_dict):
                continue
            # Skip layers on other devices.
            if is_pp_missing_parameter(name, self):
                continue
            # Remapping the name of FP8 kv-scale.
            if name.endswith("kv_scale"):
                remapped_kv_scale_name = name.replace(
                    ".kv_scale", ".attn.kv_scale")
                if remapped_kv_scale_name not in params_dict:
                    print_warning_once(
                        "Found kv scale in the checkpoint "
                        f"(e.g. {name}), but not found the expected "
                        f"name in the model "
                        f"(e.g. {remapped_kv_scale_name}). "
                        "kv-scale is not loaded.")
                    continue
                else:
                    name = remapped_kv_scale_name
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("mlp.experts." in name or "mlp.shared_expert." in name or "mlp.shared_expert_gate." in name)
                and name not in params_dict):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)


MluHijackObject.apply_hijack(Qwen2MoeForCausalLM,
                             Qwen2MoeForCausalLM.load_weights,
                             vllm__module_executor__models__qwen2moe__Qwen2MoeForCausalLM__load_weights)
|
||||
Reference in New Issue
Block a user