add qwen3

This commit is contained in:
Chranos
2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions

View File

@@ -0,0 +1,2 @@
# Package entry point for the expert-parallel demo: prints a banner and pulls
# in the model_executor subpackage (importing it presumably registers the MLU
# hijacks as an import side effect -- TODO confirm against model_executor).
print("Apply Expert Parallel Demo!")
from . import model_executor

View File

@@ -0,0 +1,5 @@
from .layers import sparse_moe_mlp
from .models import custom
from .models import mixtral
from .models import qwen2_moe
from .models import deepseek_v2

View File

@@ -0,0 +1,142 @@
"""
Inference-only MOE model.
Tensor Parallel evenly splits each expert's weight and distributes them to different ranks,
which means each rank holds partial weight of all experts.
While Expert Parallel evenly distributes some of the experts' full weight to different ranks,
which means each rank holds part of the experts' full weight.
As a result, each rank in the Tensor Parallel group receives all tokens' hidden states for all experts,
then computes using the partial weights, while for Expert Parallel, each rank only receives
part of tokens' hidden states for experts on this rank, then computes using the full weights.
When both Tensor Parallel and Expert Parallel are enabled, each rank handles
a portion of the expert weights matrices (as in EP mode) and these weights are further sliced
across ranks (as in TP mode). This hybrid approach aims to balance the workload more evenly across ranks,
enhancing efficiency and reducing the likelihood of bottlenecks associated with EP mode alone.
"""
from typing import Optional
import torch
from torch import nn
from vllm.distributed import (get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
get_tensor_model_parallel_group)
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
get_moe_tensor_parallel_rank, get_moe_tensor_parallel_world_size, get_moe_tensor_parallel_group,
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size, get_moe_expert_parallel_group)
from vllm.model_executor.layers.linear import ReplicatedLinear
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm_mlu._mlu_utils import get_device_major_capability
def vllm__mlu_hijack__model_executor__layers__feed_forward__SparseMoeMlp____init__(
    self,
    num_experts: int,
    top_k: int,
    hidden_size: int,
    intermediate_size: int,
    up_proj_name: str,
    is_gated: bool,
    down_proj_name: str,
    has_bias: bool,
    skip_bias_add: bool = False,
    renormalize: bool = False,
    hidden_act: str = "silu",
    params_dtype: Optional[torch.dtype] = None,
    quant_config: Optional[QuantizationConfig] = None,
    is_use_fused_moe: bool = False,
    expert_group: int = 1,
    topk_group: int = 1,
):
    """Hijacked ``SparseMoeMlp.__init__`` adding hybrid MoE TP/EP distribution.

    On top of the stock initialization, this version reads the MoE-specific
    tensor-parallel (TP) and expert-parallel (EP) process groups and partitions
    the experts across EP ranks: each EP rank owns the contiguous expert-id
    slice ``[start_expert_id, end_expert_id)`` and only materializes those
    experts, each of which is further weight-sharded over the MoE TP group.

    Args:
        num_experts: Total number of routed experts in the model.
        top_k: Number of experts each token is routed to.
        hidden_size: Model hidden dimension (also the gate's input size).
        intermediate_size: Per-expert FFN intermediate dimension; must be
            divisible by the MoE TP world size.
        up_proj_name: Checkpoint name of the up projection.
        is_gated: Whether the FFN uses a gated activation.
        down_proj_name: Checkpoint name of the down projection.
        has_bias: Whether expert linear layers carry a bias.
        skip_bias_add: Ignored here; recomputed from the MoE TP rank below.
        renormalize: Renormalize top-k routing weights.
        hidden_act: Activation function name.
        params_dtype: Parameter dtype; defaults to torch's default dtype.
        quant_config: Optional quantization configuration.
        is_use_fused_moe: Request the fused-MoE path (forced off on
            capability-3 devices below).
        expert_group: Grouped-routing group count (DeepSeek-style routing).
        topk_group: Number of groups selected in grouped routing.
    """
    super(SparseMoeMlp, self).__init__()
    self.tp_rank = get_tensor_model_parallel_rank()
    self.tp_size = get_tensor_model_parallel_world_size()
    self.tp_group = get_tensor_model_parallel_group()
    self.num_total_experts = num_experts
    self.top_k = top_k
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.up_proj_name = up_proj_name
    self.is_gated = is_gated
    self.down_proj_name = down_proj_name
    self.has_bias = has_bias
    self.renormalize = renormalize
    self.hidden_act = hidden_act
    self.quant_config = quant_config
    self.is_use_fused_moe = is_use_fused_moe
    self.expert_group = expert_group
    self.topk_group = topk_group
    # The fused MoE path is unavailable on major-capability-3 devices.
    if get_device_major_capability() == 3:
        self.is_use_fused_moe = False
    if params_dtype is None:
        params_dtype = torch.get_default_dtype()
    self.params_dtype = params_dtype
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add moe relative distribution
    '''
    self.moe_tp_size = get_moe_tensor_parallel_world_size()
    self.moe_tp_rank = get_moe_tensor_parallel_rank()
    self.moe_tp_group = get_moe_tensor_parallel_group()
    self.moe_ep_size = get_moe_expert_parallel_world_size()
    self.moe_ep_rank = get_moe_expert_parallel_rank()
    self.moe_ep_group = get_moe_expert_parallel_group()
    # NOTE: The bias for fc2 is only applied on tp_rank 0. If we added it on all nodes the allreduce() would
    # contain multiple copies of the bias. The bias on other node will be ignored, and may be set to nullptr
    self.skip_bias_add = self.moe_tp_rank > 0
    assert self.num_total_experts >= self.moe_ep_size, (
        f"need num_total_experts:{self.num_total_experts} >= moe_ep_size:{self.moe_ep_size}")
    assert self.intermediate_size % self.moe_tp_size == 0, (
        f"need intermediate_size:{self.intermediate_size} % moe_tp_size:{self.moe_tp_size} == 0")
    # Partition experts over EP ranks with ceil division: every rank except
    # the last owns ceil(num_total_experts / moe_ep_size) experts and the last
    # rank owns whatever remains.
    # BUGFIX: the previous formula gave the last rank
    # ``num_total_experts % moe_ep_size`` experts, which is wrong whenever
    # num_total_experts // moe_ep_size != moe_ep_size - 1 (e.g. 10 experts on
    # 4 ranks yielded 3+3+3+2 = 11 experts).  The correct remainder is
    # ``num_total_experts - start_expert_id``.
    experts_per_rank_ceil = (
        (self.num_total_experts + self.moe_ep_size - 1) // self.moe_ep_size)
    self.start_expert_id = self.moe_ep_rank * experts_per_rank_ceil
    if self.moe_ep_rank + 1 == self.moe_ep_size:
        self.num_experts_per_rank = self.num_total_experts - self.start_expert_id
    else:
        self.num_experts_per_rank = experts_per_rank_ceil
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    self.end_expert_id = self.start_expert_id + self.num_experts_per_rank
    # Gate always runs at half / full precision for now.
    self.gate = ReplicatedLinear(self.hidden_size,
                                 self.num_total_experts,
                                 bias=False,
                                 params_dtype=self.params_dtype,
                                 quant_config=None)
    # Only the experts owned by this EP rank are materialized.  Each expert is
    # TP-sharded over the MoE TP group and skips its own all-reduce
    # (reduce_results=False) so the caller can reduce once after combining.
    self.experts = nn.ModuleList([
        FeedForward(hidden_size=self.hidden_size,
                    intermediate_size=self.intermediate_size,
                    hidden_act=self.hidden_act,
                    up_proj_name=self.up_proj_name,
                    is_gated=self.is_gated,
                    down_proj_name=self.down_proj_name,
                    bias=self.has_bias,
                    quant_config=self.quant_config,
                    skip_bias_add=self.skip_bias_add,
                    reduce_results=False,
                    tp_group=self.moe_tp_group) for _ in range(self.num_experts_per_rank)
    ])
    self.init_pack_param()


MluHijackObject.apply_hijack(SparseMoeMlp,
                             SparseMoeMlp.__init__,
                             vllm__mlu_hijack__model_executor__layers__feed_forward__SparseMoeMlp____init__)

View File

@@ -0,0 +1,183 @@
import torch
import torch.nn.functional as F
from typing import Optional
from vllm.config import CacheConfig
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
from vllm_mlu._mlu_utils import *
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm.distributed import tensor_model_parallel_all_reduce
from vllm_mlu.transformers_utils.configs import CustomConfig
from vllm_mlu.model_executor.custom_model.custom import CustomDecoderLayer, CustomAttention, _NORM_DICT
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
from vllm.model_executor.layers.linear import ReplicatedLinear
from vllm_mlu.model_executor.models.layer_utils import (
decoder_layer_forward_base, is_per_tensor_smoothquant,
is_per_token_smoothquant, quant_fusion_with_rmsnorm,
quant_fusion_with_layernorm)
class CustomMoeBlock(SparseMoeMlp):
    """Sparse-MoE block for the custom model family.

    Builds on :class:`SparseMoeMlp` (which owns the router gate and the
    TP/EP-distributed routed experts) and adds an optional shared expert whose
    output is mixed in through a learned sigmoid gate, as configured by
    ``shared_expert_intermediate_size``.
    """

    def __init__(
        self,
        config: CustomConfig,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        # All MoE bookkeeping (gate, routed experts, TP/EP partitioning) is
        # delegated to the SparseMoeMlp base; the fused MoE path is requested.
        super().__init__(num_experts=config.num_experts,
                         top_k=config.num_experts_per_tok,
                         hidden_size=config.hidden_size,
                         intermediate_size=config.moe_intermediate_size,
                         up_proj_name="gate_up_proj",
                         is_gated=config.is_gated,
                         down_proj_name="down_proj",
                         has_bias=config.mlp_bias,
                         skip_bias_add=False,
                         renormalize=config.norm_topk_prob,
                         hidden_act=config.hidden_act,
                         params_dtype=None,
                         quant_config=quant_config,
                         is_use_fused_moe=True)
        self.config = config
        self.rank = self.tp_rank
        self.shared_expert = None
        self.shared_expert_gate = None
        # Shared expert (runs on every token, alongside the routed experts).
        # Its all-reduce is deferred (reduce_results=False); forward() reduces
        # once for the whole block when needed.
        if config.shared_expert_intermediate_size > 0:
            self.shared_expert = FeedForward(hidden_size=config.hidden_size,
                                             intermediate_size=config.shared_expert_intermediate_size,
                                             hidden_act=config.hidden_act,
                                             up_proj_name='gate_up_proj',
                                             is_gated=config.is_gated,
                                             down_proj_name='down_proj',
                                             bias=config.mlp_bias,
                                             quant_config=quant_config,
                                             reduce_results=False)
            # Scalar gate producing the sigmoid mixing weight per token.
            self.shared_expert_gate = ReplicatedLinear(config.hidden_size,
                                                       1,
                                                       bias=False,
                                                       params_dtype=self.params_dtype,
                                                       quant_config=None)

    def forward(self, hidden_states: torch.Tensor, residual: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run shared expert (if any) plus routed experts on *hidden_states*.

        Returns a tensor of the same (num_tokens, hidden_dim) shape; the
        all-reduce across TP ranks is performed here only when
        ``use_parallel_residual`` is disabled (see comment below).
        """
        num_tokens, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)
        shared_output = None
        if self.shared_expert is not None:
            shared_output = self.shared_expert(hidden_states)
            if self.shared_expert_gate is not None:
                # Mix the shared expert output by a per-token sigmoid weight.
                gate_output = self.shared_expert_gate(hidden_states)
                shared_output = F.sigmoid(gate_output[0]) * shared_output
        # router_logits: (num_tokens, n_experts)
        router_logits, _ = self.gate(hidden_states)
        # NOTE(review): residual_ is computed but never used below --
        # forward_experts() receives the unmodified ``residual``.  It looks
        # like residual_ was meant to suppress the residual on non-zero TP
        # ranks; confirm the intended behavior.
        residual_ = None if self.rank > 0 else residual
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: modify bt_ops.fused_moe to forward_experts
        '''
        final_hidden_states = self.forward_experts(hidden_states, router_logits, residual)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        if shared_output is not None:
            final_hidden_states = final_hidden_states + shared_output
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: add comment to explain use_parallel_residual usage
        '''
        # use_parallel_residual = True: x = x + attn(ln1(x)) + mlp(ln2(x))
        # use_parallel_residual = False:
        #     if apply_residual_connection_post_layernorm:
        #         x_attn = ln1(x) + attn(ln1(x))
        #         x_mlp = ln2(x_attn) + mlp(ln2(x_attn))
        #     else:
        #         x_attn = x + attn(ln1(x))
        #         x_mlp = x_attn + mlp(ln2(x_attn))
        # When use_parallel_residual = True, x is shared between attn and mlp, so we only need to
        # reduce after x + attn(ln1(x)) + mlp(ln2(x)) and don't need reduce here
        # But when use_parallel_residual = False, mlp layer uses attn layer's output, so need reduce
        # when mlp is finished.
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        reduce_results = (self.config.use_parallel_residual == False)
        if reduce_results and self.tp_size > 1:
            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
        return final_hidden_states.view(num_tokens, hidden_dim)
def vllm__mlu_hijack__model_executor__custom_model__custom__CustomDecoderLayer____init__(
    self,
    config: CustomConfig,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
) -> None:
    """Hijacked ``CustomDecoderLayer.__init__``.

    Identical to the stock layer construction except that MoE layers use the
    :class:`CustomMoeBlock` defined in this file (which carries the MoE TP/EP
    distribution).  Also sets up the optional smoothquant-in-layernorm fusion
    flags used by the forward pass.
    """
    super(CustomDecoderLayer, self).__init__()
    self.config = config
    self.self_attn = CustomAttention(
        config=config,
        cache_config=cache_config,
        quant_config=quant_config,
    )
    # Some configs name the MLP bias flag "mlp_bias", others just "bias".
    mlp_bias = getattr(config, "mlp_bias", False) or getattr(config, "bias", False)
    is_gated = getattr(config, "is_gated", False)
    if config.num_experts is not None:
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: nothing changed, only use the CustomMoeBlock class in this file
        '''
        self.mlp = CustomMoeBlock(config=config,
                                  quant_config=quant_config)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
    else:
        # Dense (non-MoE) layer: plain feed-forward.  With a parallel
        # residual, the bias add is skipped here (handled once after the
        # reduce) and the in-MLP all-reduce is disabled.
        self.mlp = FeedForward(hidden_size=config.hidden_size,
                               intermediate_size=config.intermediate_size,
                               hidden_act=self.config.hidden_act,
                               up_proj_name='up_proj',
                               is_gated=is_gated,
                               down_proj_name='down_proj',
                               bias=mlp_bias,
                               quant_config=quant_config,
                               skip_bias_add=(self.config.use_parallel_residual and mlp_bias),
                               reduce_results = (self.config.use_parallel_residual == False))
    self.input_layernorm = _NORM_DICT[self.config.norm_type](config.hidden_size, eps=config.norm_eps)
    self.post_attention_layernorm = _NORM_DICT[self.config.norm_type](config.hidden_size, eps=config.norm_eps)
    # perf per-tensor sq cases by fusing quantization in layernorm
    # NOTE(review): "tesnor" is a typo, but the attribute name is kept as-is
    # since code outside this block may reference it.
    self.is_per_tesnor_sq_perf_cases = (is_per_tensor_smoothquant(quant_config) and
                                        not self.config.apply_residual_connection_post_layernorm)
    self.is_per_token_sq_perf_cases = (is_per_token_smoothquant(quant_config) and
                                       not self.config.apply_residual_connection_post_layernorm)
    if self.is_per_tesnor_sq_perf_cases or self.is_per_token_sq_perf_cases:
        # Quantization is fused into the preceding layernorm, so the linear
        # layers must not quantize their input a second time.
        self.self_attn.qkv_proj.quant_method.skip_quant_input = True
        self.quant_fusion_attn_layernorm = None
        self.is_moe = config.num_experts is not None
        self.use_rmsnorm = self.config.norm_type == "rmsnorm"
        if not self.is_moe:
            self.mlp.up_proj.quant_method.skip_quant_input = True
            self.quant_fusion_mlp_layernorm = None


MluHijackObject.apply_hijack(CustomDecoderLayer,
                             CustomDecoderLayer.__init__,
                             vllm__mlu_hijack__model_executor__custom_model__custom__CustomDecoderLayer____init__)

View File

@@ -0,0 +1,222 @@
import re
import torch
from torch import nn
from typing import Any, Dict, Iterable, List, Optional, Tuple
from transformers import PretrainedConfig
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear,
ReplicatedLinear,
RowParallelLinear)
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
from vllm.model_executor.models.utils import is_pp_missing_parameter
from vllm.model_executor.models.deepseek_v2 import DeepseekV2ForCausalLM
from vllm_mlu.model_executor.models.deepseek_v2 import DeepseekV2MoE
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
def vllm_mlu__model_executor__models__deepseek_v2__DeepseekV2MoE____init__(
    self,
    config: PretrainedConfig,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
):
    """Hijacked ``DeepseekV2MoE.__init__`` built on top of ``SparseMoeMlp``.

    Delegates routed-expert construction (including MoE TP/EP distribution)
    to :class:`SparseMoeMlp`, then adds the DeepSeek-V2 specifics: a
    replicated router gate and the optional shared experts.

    Args:
        config: DeepSeek-V2 HuggingFace config (``n_routed_experts``,
            ``n_shared_experts``, grouped-routing fields, ...).
        quant_config: Optional quantization configuration.
        prefix: Parameter-name prefix used for the gate.

    Raises:
        ValueError: If the MoE TP size exceeds the number of routed experts,
            or the activation is not ``silu``.
    """
    super(DeepseekV2MoE, self).__init__(num_experts=config.n_routed_experts,
                                        top_k=config.num_experts_per_tok,
                                        hidden_size=config.hidden_size,
                                        intermediate_size=config.moe_intermediate_size,
                                        up_proj_name="gate_up_proj",
                                        is_gated=True,
                                        down_proj_name="down_proj",
                                        has_bias=False,
                                        skip_bias_add=False,
                                        renormalize=config.norm_topk_prob,
                                        hidden_act=config.hidden_act,
                                        params_dtype=None,
                                        quant_config=quant_config,
                                        is_use_fused_moe=True,
                                        expert_group=config.n_group,
                                        topk_group=config.topk_group)
    self.config = config
    # NOTE: the original hijack assigned routed_scaling_factor twice; the
    # duplicate assignment has been removed.
    self.routed_scaling_factor = config.routed_scaling_factor
    self.n_shared_experts = config.n_shared_experts
    if self.moe_tp_size > config.n_routed_experts:
        raise ValueError(
            f"Moe Tensor parallel size {self.moe_tp_size} is greater than "
            f"the number of experts {config.n_routed_experts}.")
    if config.hidden_act != "silu":
        raise ValueError(f"Unsupported activation: {config.hidden_act}. "
                         "Only silu is supported for now.")
    self.gate = ReplicatedLinear(config.hidden_size,
                                 config.n_routed_experts,
                                 bias=False,
                                 quant_config=None,
                                 prefix=f"{prefix}.gate")
    if config.n_shared_experts is not None:
        # Shared experts are concatenated into a single FFN of width
        # moe_intermediate_size * n_shared_experts.
        intermediate_size = (config.moe_intermediate_size *
                             config.n_shared_experts)
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace MLP with FeedForward.
        '''
        self.shared_experts = FeedForward(hidden_size=config.hidden_size,
                                          intermediate_size=intermediate_size,
                                          hidden_act=config.hidden_act,
                                          up_proj_name='gate_up_proj',
                                          is_gated=True,
                                          down_proj_name='down_proj',
                                          bias=False,
                                          quant_config=quant_config,
                                          reduce_results=False)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
def vllm__module_executor__models__deepseek_v2__DeepseekV2ForCausalLM__load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
    """Hijacked DeepseekV2ForCausalLM.load_weights with expert-parallel support.

    Packs each SparseMoeMlp's expert parameters, then streams checkpoint
    weights into the model.  When expert parallelism is active, global expert
    ids in checkpoint names are remapped to this rank's local expert indices;
    expert parameters not owned by this rank are absent from ``params_dict``
    and are skipped.
    """
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: pack params and cal start expert id
    '''
    for name, m in self.model.named_modules():
        if isinstance(m, SparseMoeMlp):
            m.pack_params()
    # expert parallel modification start
    moe_ep_rank = get_moe_expert_parallel_rank()
    moe_ep_size = get_moe_expert_parallel_world_size()
    num_total_experts = self.config.n_routed_experts
    # First global expert id owned by this EP rank (ceil-divided partition).
    start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size)
    # expert parallel modification end
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: delete expert_params_mapping for no useless
    '''
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        if "rotary_emb.inv_freq" in name:
            continue
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace expert_id in weight to named_expert_id in params_dict
        '''
        # Translate the checkpoint's global expert id into this rank's local
        # expert index: experts.<global> -> experts.<global - start>.
        if start_expert_id > 0 and "mlp.experts." in name:
            expert_str = re.search(r'experts\.\d+', name).group(0)
            expert_id=int(expert_str.split(".")[1])
            named_expert_id = expert_id - start_expert_id
            old_expert_name = f"experts.{expert_id}"
            new_expert_name = f"experts.{named_expert_id}"
            name = name.replace(old_expert_name, new_expert_name)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            # Skip non-stacked layers and experts (experts handled below).
            if weight_name not in name:
                continue
            # We have mlp.experts[0].gate_proj in the checkpoint.
            # Since we handle the experts below in expert_params_mapping,
            # we need to skip here BEFORE we update the name, otherwise
            # name will be updated to mlp.experts[0].gate_up_proj, which
            # will then be updated below in expert_params_mapping
            # for mlp.experts[0].gate_gate_up_proj, which breaks load.
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition and delete useless if name not in params_dict: continue condition
            '''
            name = name.replace(weight_name, param_name)
            # Expert / shared-expert weights missing from params_dict belong
            # to another EP rank (or were packed away): skip them.
            if (("mlp.experts." in name or "mlp.shared_experts." in name or "mlp.shared_expert_gate." in name)
                    and name not in params_dict):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            if is_pp_missing_parameter(name, self):
                continue
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            # Non-stacked parameter path (for/else: no mapping matched).
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            if (("mlp.experts." in name or "mlp.shared_experts." in name or "mlp.shared_expert_gate." in name)
                    and name not in params_dict):
                continue
            if is_pp_missing_parameter(name, self):
                continue
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
            '''
            ==================
            End of MLU Hijack
            ==================
            '''


MluHijackObject.apply_hijack(DeepseekV2MoE,
                             DeepseekV2MoE.__init__,
                             vllm_mlu__model_executor__models__deepseek_v2__DeepseekV2MoE____init__)
MluHijackObject.apply_hijack(DeepseekV2ForCausalLM,
                             DeepseekV2ForCausalLM.load_weights,
                             vllm__module_executor__models__deepseek_v2__DeepseekV2ForCausalLM__load_weights)

View File

@@ -0,0 +1,143 @@
import torch
import re
import vllm
from torch import nn
from typing import List, Optional, Tuple, Iterable
from vllm_mlu._mlu_utils import *
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm.model_executor.models.mixtral import MixtralForCausalLM
from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
from vllm.model_executor.models.utils import is_pp_missing_parameter
def vllm__module_executor__models__mixtral__MixtralForCausalLM__load_weights(
        self,
        weights: Iterable[Tuple[str, torch.Tensor]]):
    """Hijacked MixtralForCausalLM.load_weights with expert-parallel support.

    Packs each SparseMoeMlp's expert parameters, then streams checkpoint
    weights into the model, remapping global expert ids to this rank's local
    indices and skipping experts owned by other EP ranks.
    """
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: pack params and cal start expert id
    '''
    for name, m in self.model.named_modules():
        if isinstance(m, SparseMoeMlp):
            m.pack_params()
    # expert parallel modification start
    moe_ep_rank = get_moe_expert_parallel_rank()
    moe_ep_size = get_moe_expert_parallel_world_size()
    num_total_experts = self.config.num_local_experts
    # First global expert id owned by this EP rank (ceil-divided partition).
    start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size)
    # expert parallel modification end
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("w13", "w1", 0),
        ("w13", "w3", 1),
    ]
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: delete expert_params_mapping for no useless
    '''
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        if "rotary_emb.inv_freq" in name:
            continue
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace expert_id in weight to named_expert_id in params_dict
        '''
        # Translate the checkpoint's global expert id into this rank's local
        # expert index: experts.<global> -> experts.<global - start>.
        if start_expert_id > 0 and "block_sparse_moe.experts." in name:
            expert_str = re.search(r'experts\.\d+', name).group(0)
            expert_id=int(expert_str.split(".")[1])
            named_expert_id = expert_id - start_expert_id
            old_expert_name = f"experts.{expert_id}"
            new_expert_name = f"experts.{named_expert_id}"
            name = name.replace(old_expert_name, new_expert_name)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                    and name not in params_dict):
                continue
            if is_pp_missing_parameter(name, self):
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("block_sparse_moe.experts." in name) and (name not in params_dict)):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            # Non-stacked parameter path (for/else: no mapping matched).
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                    and name not in params_dict):
                continue
            if is_pp_missing_parameter(name, self):
                continue
            # Remapping the name of FP8 kv-scale.
            name = maybe_remap_kv_scale_name(name, params_dict)
            if name is None:
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("block_sparse_moe.experts." in name) and (name not in params_dict)):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)


MluHijackObject.apply_hijack(MixtralForCausalLM,
                             MixtralForCausalLM.load_weights,
                             vllm__module_executor__models__mixtral__MixtralForCausalLM__load_weights)

View File

@@ -0,0 +1,179 @@
import torch
import re
from typing import Optional, Iterable, Tuple
from vllm_mlu._mlu_utils import *
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm.model_executor.models.qwen2_moe import Qwen2MoeForCausalLM
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
from vllm.utils import print_warning_once
from vllm.model_executor.models.utils import is_pp_missing_parameter
def vllm__module_executor__models__qwen2moe__Qwen2MoeForCausalLM__load_weights(
        self,
        weights: Iterable[Tuple[str, torch.Tensor]]):
    """Hijacked Qwen2MoeForCausalLM.load_weights with expert-parallel support.

    Packs each SparseMoeMlp's expert parameters, then streams checkpoint
    weights into the model, remapping global expert ids to this rank's local
    indices and skipping expert / shared-expert parameters that this EP rank
    does not own.
    """
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: pack params and cal start expert id
    '''
    for name, m in self.model.named_modules():
        if isinstance(m, SparseMoeMlp):
            m.pack_params()
    # expert parallel modification start
    moe_ep_rank = get_moe_expert_parallel_rank()
    moe_ep_size = get_moe_expert_parallel_world_size()
    num_total_experts = self.config.num_experts
    # First global expert id owned by this EP rank (ceil-divided partition).
    start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size)
    # expert parallel modification end
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: delete expert_params_mapping for no useless
    '''
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        if "rotary_emb.inv_freq" in name:
            continue
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace expert_id in weight to named_expert_id in params_dict
        '''
        # Translate the checkpoint's global expert id into this rank's local
        # expert index: experts.<global> -> experts.<global - start>.
        if start_expert_id > 0 and "mlp.experts." in name:
            expert_str = re.search(r'experts\.\d+', name).group(0)
            expert_id=int(expert_str.split(".")[1])
            named_expert_id = expert_id - start_expert_id
            old_expert_name = f"experts.{expert_id}"
            new_expert_name = f"experts.{named_expert_id}"
            name = name.replace(old_expert_name, new_expert_name)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: delete if "mlp.experts" in name: continue condition
            '''
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                    and name not in params_dict):
                continue
            # Skip layers on other devices.
            if is_pp_missing_parameter(name, self):
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition and delete useless if name not in params_dict: continue condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("mlp.experts." in name or "mlp.shared_expert." in name or "mlp.shared_expert_gate." in name)
                    and name not in params_dict):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            # Non-stacked parameter path (for/else: no mapping matched).
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: delete for mapping in expert_params_mapping condition
            '''
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                    and name not in params_dict):
                continue
            # Skip layers on other devices.
            if is_pp_missing_parameter(name, self):
                continue
            # Remapping the name of FP8 kv-scale.
            if name.endswith("kv_scale"):
                remapped_kv_scale_name = name.replace(
                    ".kv_scale", ".attn.kv_scale")
                if remapped_kv_scale_name not in params_dict:
                    print_warning_once(
                        "Found kv scale in the checkpoint "
                        f"(e.g. {name}), but not found the expected "
                        f"name in the model "
                        f"(e.g. {remapped_kv_scale_name}). "
                        "kv-scale is not loaded.")
                    continue
                else:
                    name = remapped_kv_scale_name
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("mlp.experts." in name or "mlp.shared_expert." in name or "mlp.shared_expert_gate." in name)
                    and name not in params_dict):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)


MluHijackObject.apply_hijack(Qwen2MoeForCausalLM,
                             Qwen2MoeForCausalLM.load_weights,
                             vllm__module_executor__models__qwen2moe__Qwen2MoeForCausalLM__load_weights)