Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions

@@ -127,12 +127,10 @@ class Glm4MoeLiteDecoderLayer(nn.Module):
v_head_dim = getattr(config, "v_head_dim", 0)
kv_lora_rank = getattr(config, "kv_lora_rank", 0)
# if model_config.use_mla:
# attn_cls = Glm4MoeLiteMLAAttention
# else:
# attn_cls = Glm4MoeLiteAttention
attn_cls = Glm4MoeLiteAttention
if model_config.use_mla:
attn_cls = Glm4MoeLiteMLAAttention
else:
attn_cls = Glm4MoeLiteAttention
self.self_attn = attn_cls(
vllm_config=vllm_config,
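The hunk above restores the MLA/MHA switch that had been left commented out: the decoder layer now picks its attention class from model_config.use_mla instead of always falling back to Glm4MoeLiteAttention. A minimal sketch of the same selection written as a standalone helper (use_mla is assumed to already be resolved by vLLM's model config; this helper is illustrative only, not part of the diff):

    def select_attn_cls(model_config):
        # multi-head latent attention when the config asks for it,
        # plain multi-head attention otherwise
        if model_config.use_mla:
            return Glm4MoeLiteMLAAttention
        return Glm4MoeLiteAttention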
@@ -306,7 +304,7 @@ class Glm4MoeLiteModel(nn.Module):
),
}
)
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
@@ -318,6 +316,120 @@ class Glm4MoeLiteModel(nn.Module):
num_experts=self.config.n_routed_experts,
)
class Glm4MoeLiteForCausalLM(
nn.Module, SupportsPP, SupportsLoRA, Glm4LiteMixtureOfExperts
):
packed_modules_mapping = {
"gate_up_proj": ["gate_proj", "up_proj"],
}
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
self.config = config
self.quant_config = quant_config
qk_nope_head_dim = getattr(config, "qk_nope_head_dim", 0)
qk_rope_head_dim = getattr(config, "qk_rope_head_dim", 0)
self.use_mha = config.model_type == "deepseek" or all(
dim == 0 for dim in (qk_nope_head_dim, qk_rope_head_dim)
)
if self.use_mha:
self.packed_modules_mapping["qkv_proj"] = ["q_proj", "k_proj", "v_proj"]
# `packed_modules_mapping` needs to be modified before
# initializing Glm4MoeLiteModel, as it is passed in place to the
# quantization config init and may be used to select the
# quant_method for relevant layers during initialization.
self.fuse_qkv_a_proj = (
hasattr(config, "q_lora_rank") and config.q_lora_rank is not None
)
if self.fuse_qkv_a_proj:
self.packed_modules_mapping["fused_qkv_a_proj"] = [
"q_a_proj",
"kv_a_proj_with_mqa",
]
self.model = Glm4MoeLiteModel(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
)
if get_pp_group().is_last_rank:
self.lm_head = ParallelLMHead(
config.vocab_size,
config.hidden_size,
quant_config=quant_config,
prefix=maybe_prefix(prefix, "lm_head"),
)
else:
self.lm_head = PPMissingLayer()
self.logits_processor = LogitsProcessor(config.vocab_size)
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors
)
# Set MoE hyperparameters
self.num_moe_layers = (
self.config.num_hidden_layers - self.config.first_k_dense_replace
)
self.set_moe_parameters()
def set_moe_parameters(self):
self.expert_weights = []
self.num_expert_groups = getattr(self.config, "n_group", 1)
self.moe_layers = []
self.moe_mlp_layers = []
example_moe = None
for layer in self.model.layers:
if isinstance(layer, PPMissingLayer):
continue
assert isinstance(layer, Glm4MoeLiteDecoderLayer)
if isinstance(layer.mlp, Glm4MoeLite):
# Pick the last layer, since the first ones may be dense layers.
example_moe = layer.mlp
self.moe_mlp_layers.append(layer.mlp)
self.moe_layers.append(layer.mlp.experts)
self.extract_moe_parameters(example_moe)
def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
return self.model.embed_input_ids(input_ids)
def forward(
self,
input_ids: torch.Tensor | None,
positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None,
) -> torch.Tensor | IntermediateTensors:
hidden_states = self.model(
input_ids, positions, intermediate_tensors, inputs_embeds
)
return hidden_states
def compute_logits(
self,
hidden_states: torch.Tensor,
) -> torch.Tensor | None:
logits = self.logits_processor(self.lm_head, hidden_states)
return logits
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
return SharedFusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj",
num_experts=self.config.n_routed_experts,
num_redundant_experts=0,
)
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
rocm_aiter_moe_shared_expert_enabled = (
rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
@@ -327,12 +439,13 @@ class Glm4MoeLiteModel(nn.Module):
("gate_up_proj", "gate_proj", 0),
("gate_up_proj", "up_proj", 1),
]
mla_params_mapping = [
("fused_qkv_a_proj", "q_a_proj", 0),
("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1),
mha_params_mapping = [
("qkv_proj", "q_proj", "q"),
("qkv_proj", "k_proj", "k"),
("qkv_proj", "v_proj", "v"),
]
stacked_params_mapping.extend(mla_params_mapping)
if self.use_mha:
stacked_params_mapping.extend(mha_params_mapping)
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
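The stacked_params_mapping entries above drive the usual vLLM weight-loading loop: each tuple says which fused parameter a checkpoint weight belongs to and which shard of it to fill. A condensed sketch of that loop, assuming a params_dict of named parameters (expert remapping, fp8 scale handling, and PP-missing-layer checks are elided here):

    def load_stacked(weights, params_dict, stacked_params_mapping):
        loaded = set()
        for name, loaded_weight in weights:
            for param_name, weight_name, shard_id in stacked_params_mapping:
                if weight_name not in name:
                    continue
                # e.g. "...self_attn.q_proj.weight" -> "...self_attn.qkv_proj.weight"
                name = name.replace(weight_name, param_name)
                param = params_dict[name]
                # fused params expose a weight_loader that writes the named shard
                param.weight_loader(param, loaded_weight, shard_id)
                break
            else:
                param = params_dict[name]
                # plain params either expose a loader or are copied directly
                loader = getattr(param, "weight_loader", lambda p, w: p.data.copy_(w))
                loader(param, loaded_weight)
            loaded.add(name)
        return loaded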
@@ -510,128 +623,71 @@ class Glm4MoeLiteModel(nn.Module):
weight_loader(param, loaded_weight)
if not is_fusion_moe_shared_experts_layer:
loaded_params.add(name)
opt_support_quant_method = [
    "GGUFLinearMethod",
    "UnquantizedLinearMethod",
    "CompressedTensorsW8A8Int8",
    "AWQMarlinLinearMethod",
]
# Add further optimized quant methods here.

def inject_layer(layer, quant_method, is_mla):
    # Fuse kv_a_proj_with_mqa into q_a_proj (MLA with q_lora_rank) or q_proj,
    # then switch the layer over to its optimized forward path.
    q_lora_rank = getattr(layer, "q_lora_rank", None)
    if quant_method in ["UnquantizedLinearMethod", "CompressedTensorsW8A8Int8"]:
        if q_lora_rank is not None:
            layer.q_a_proj.weight.data = torch.cat(
                [layer.q_a_proj.weight, layer.kv_a_proj_with_mqa.weight], dim=0)
            if hasattr(layer.q_a_proj, "weight_scale"):
                layer.q_a_proj.weight_scale.data = torch.cat(
                    [layer.q_a_proj.weight_scale,
                     layer.kv_a_proj_with_mqa.weight_scale], dim=0)
                del layer.kv_a_proj_with_mqa.weight_scale
        elif not is_mla:
            layer.q_proj.weight.data = torch.cat(
                [layer.q_proj.weight, layer.kv_a_proj_with_mqa.weight], dim=0)
            if hasattr(layer.q_proj, "weight_scale"):
                layer.q_proj.weight_scale.data = torch.cat(
                    [layer.q_proj.weight_scale,
                     layer.kv_a_proj_with_mqa.weight_scale], dim=0)
                del layer.kv_a_proj_with_mqa.weight_scale
        else:
            return
        del layer.kv_a_proj_with_mqa.weight
        del layer.kv_a_proj_with_mqa
        if is_mla:
            layer.mla_attn.forward = layer.mla_attn.forward_opt
        else:
            layer.forward = layer.forward_opt
    elif quant_method == "GGUFLinearMethod":
        pass
    elif quant_method == "AWQMarlinLinearMethod":
        dtype = layer.kv_a_proj_with_mqa.qweight.dtype
        assert dtype == torch.int32
        if layer.q_lora_rank is not None:
            layer.q_a_proj.qweight.data = torch.cat(
                [layer.q_a_proj.qweight, layer.kv_a_proj_with_mqa.qweight], dim=1)
            layer.q_a_proj.scales.data = torch.cat(
                [layer.q_a_proj.scales, layer.kv_a_proj_with_mqa.scales], dim=1)
            del layer.kv_a_proj_with_mqa.scales
            layer.q_a_proj.qzeros.data = torch.cat(
                [layer.q_a_proj.qzeros, layer.kv_a_proj_with_mqa.qzeros], dim=1)
            del layer.kv_a_proj_with_mqa.qzeros
        elif not is_mla:
            layer.q_proj.weight.data = torch.cat(
                [layer.q_proj.weight, layer.kv_a_proj_with_mqa.weight], dim=1)
            layer.q_proj.scales.data = torch.cat(
                [layer.q_proj.scales, layer.kv_a_proj_with_mqa.scales], dim=1)
            del layer.kv_a_proj_with_mqa.scales
            layer.q_proj.qzeros.data = torch.cat(
                [layer.q_proj.qzeros, layer.kv_a_proj_with_mqa.qzeros], dim=1)
            del layer.kv_a_proj_with_mqa.qzeros
        else:
            return
        del layer.kv_a_proj_with_mqa.qweight
        del layer.kv_a_proj_with_mqa
        if is_mla:
            layer.mla_attn.forward = layer.mla_attn.forward_opt
        else:
            layer.forward = layer.forward_opt
    else:
        pass

for _, layer in self.model.named_modules():
    if layer.__class__.__name__ in ["Glm4MoeLiteAttention", "Glm4MoeLiteMLAAttention"]:
        if hasattr(layer.kv_a_proj_with_mqa, "scheme"):
            quant_method = layer.kv_a_proj_with_mqa.scheme.__class__.__name__
        else:
            quant_method = layer.kv_a_proj_with_mqa.quant_method.__class__.__name__
        if quant_method not in opt_support_quant_method:
            break
        inject_layer(
            layer, quant_method,
            is_mla=layer.__class__.__name__ == "Glm4MoeLiteMLAAttention")
return loaded_params
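The removed inject_layer block above fused kv_a_proj_with_mqa into q_a_proj (or q_proj) by concatenating the two weight matrices along the output dimension, so a single GEMM produces both projections. A small torch illustration of why that concatenation is equivalent (the shapes are made up for the example):

    import torch

    hidden, q_rank, kv_rank = 64, 32, 16
    x = torch.randn(4, hidden)
    w_q_a = torch.randn(q_rank, hidden)    # stands in for q_a_proj.weight
    w_kv_a = torch.randn(kv_rank, hidden)  # stands in for kv_a_proj_with_mqa.weight

    # two separate projections ...
    q_a, kv_a = x @ w_q_a.T, x @ w_kv_a.T
    # ... equal one fused projection that is split afterwards
    fused = x @ torch.cat([w_q_a, w_kv_a], dim=0).T
    assert torch.allclose(torch.cat([q_a, kv_a], dim=-1), fused, atol=1e-5)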
class Glm4MoeLiteForCausalLM(
nn.Module, SupportsPP, SupportsLoRA, Glm4LiteMixtureOfExperts
):
packed_modules_mapping = {
"gate_up_proj": ["gate_proj", "up_proj"],
}
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
self.config = config
self.quant_config = quant_config
qk_nope_head_dim = getattr(config, "qk_nope_head_dim", 0)
qk_rope_head_dim = getattr(config, "qk_rope_head_dim", 0)
self.use_mha = config.model_type == "deepseek" or all(
dim == 0 for dim in (qk_nope_head_dim, qk_rope_head_dim)
)
if self.use_mha:
self.packed_modules_mapping["qkv_proj"] = ["q_proj", "k_proj", "v_proj"]
# `packed_modules_mapping` needs to be modified before
# initializing Glm4MoeLiteModel, as it is passed in place to the
# quantization config init and may be used to select the
# quant_method for relevant layers during initialization.
self.fuse_qkv_a_proj = (
hasattr(config, "q_lora_rank") and config.q_lora_rank is not None
)
if self.fuse_qkv_a_proj:
self.packed_modules_mapping["fused_qkv_a_proj"] = [
"q_a_proj",
"kv_a_proj_with_mqa",
]
self.model = Glm4MoeLiteModel(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
)
if get_pp_group().is_last_rank:
self.lm_head = ParallelLMHead(
config.vocab_size,
config.hidden_size,
quant_config=quant_config,
prefix=maybe_prefix(prefix, "lm_head"),
)
else:
self.lm_head = PPMissingLayer()
self.logits_processor = LogitsProcessor(config.vocab_size)
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors
)
# Set MoE hyperparameters
self.num_moe_layers = (
self.config.num_hidden_layers - self.config.first_k_dense_replace
)
self.set_moe_parameters()
def set_moe_parameters(self):
self.expert_weights = []
self.num_expert_groups = getattr(self.config, "n_group", 1)
self.moe_layers = []
self.moe_mlp_layers = []
example_moe = None
for layer in self.model.layers:
if isinstance(layer, PPMissingLayer):
continue
assert isinstance(layer, Glm4MoeLiteDecoderLayer)
if isinstance(layer.mlp, Glm4MoeLite):
# Pick the last layer, since the first ones may be dense layers.
example_moe = layer.mlp
self.moe_mlp_layers.append(layer.mlp)
self.moe_layers.append(layer.mlp.experts)
self.extract_moe_parameters(example_moe)
def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
return self.model.embed_input_ids(input_ids)
def forward(
self,
input_ids: torch.Tensor | None,
positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None,
) -> torch.Tensor | IntermediateTensors:
hidden_states = self.model(
input_ids, positions, intermediate_tensors, inputs_embeds
)
return hidden_states
def compute_logits(
self,
hidden_states: torch.Tensor,
) -> torch.Tensor | None:
logits = self.logits_processor(self.lm_head, hidden_states)
return logits
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
return SharedFusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj",
num_experts=self.config.n_routed_experts,
num_redundant_experts=0,
)
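get_expert_mapping above returns (param_name, weight_name, expert_id, shard_id) tuples that let the loader route per-expert checkpoint weights into the fused MoE parameters. A hypothetical illustration of that 4-tuple contract (the actual strings come from SharedFusedMoE.make_expert_params_mapping and may differ):

    def toy_expert_mapping(num_experts):
        mapping = []
        for expert_id in range(num_experts):
            for shard_id, ckpt_name in (("w1", "gate_proj"),
                                        ("w3", "up_proj"),
                                        ("w2", "down_proj")):
                # gate/up shards land in the fused w13 parameter, down_proj in w2
                param_name = ("experts.w13_" if shard_id in ("w1", "w3")
                              else "experts.w2_")
                mapping.append((param_name,
                                f"experts.{expert_id}.{ckpt_name}.",
                                expert_id, shard_id))
        return mapping

A checkpoint name such as "model.layers.3.mlp.experts.7.gate_proj.weight" then matches the expert_id=7 / shard_id="w1" entry and is rewritten to the fused parameter name before the expert-aware weight_loader runs.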
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
loader = AutoWeightsLoader(self)
return loader.load_weights(weights)
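The top-level load_weights now simply delegates to AutoWeightsLoader, which walks the sub-modules and dispatches each (name, tensor) pair to the matching module's own load_weights. A sketch of driving it from a safetensors checkpoint (the file name and the pre-built model instance are assumptions for illustration, not part of the diff):

    from safetensors.torch import load_file

    def iter_weights(path):
        # yield (parameter_name, tensor) pairs in the format load_weights expects
        for name, tensor in load_file(path).items():
            yield name, tensor

    # `model` is an already constructed Glm4MoeLiteForCausalLM
    loaded_params = model.load_weights(iter_weights("model.safetensors"))
    print(f"{len(loaded_params)} parameters loaded")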
def get_spec_layer_idx_from_weight_name(
config: "Glm4MoeLiteConfig", weight_name: str
) -> int | None: