Upgrade to vLLM 0.17.0 (Corex v4.1 overlay)
This commit is contained in:
@@ -127,12 +127,10 @@ class Glm4MoeLiteDecoderLayer(nn.Module):
|
||||
v_head_dim = getattr(config, "v_head_dim", 0)
|
||||
kv_lora_rank = getattr(config, "kv_lora_rank", 0)
|
||||
|
||||
# if model_config.use_mla:
|
||||
# attn_cls = Glm4MoeLiteMLAAttention
|
||||
# else:
|
||||
# attn_cls = Glm4MoeLiteAttention
|
||||
|
||||
attn_cls = Glm4MoeLiteAttention
|
||||
if model_config.use_mla:
|
||||
attn_cls = Glm4MoeLiteMLAAttention
|
||||
else:
|
||||
attn_cls = Glm4MoeLiteAttention
|
||||
|
||||
self.self_attn = attn_cls(
|
||||
vllm_config=vllm_config,
|
||||
@@ -306,7 +304,7 @@ class Glm4MoeLiteModel(nn.Module):
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
|
||||
# Params for weights, fp8 weight scales, fp8 activation scales
|
||||
# (param_name, weight_name, expert_id, shard_id)
|
||||
@@ -318,6 +316,120 @@ class Glm4MoeLiteModel(nn.Module):
|
||||
num_experts=self.config.n_routed_experts,
|
||||
)
|
||||
|
||||
|
||||
class Glm4MoeLiteForCausalLM(
|
||||
nn.Module, SupportsPP, SupportsLoRA, Glm4LiteMixtureOfExperts
|
||||
):
|
||||
packed_modules_mapping = {
|
||||
"gate_up_proj": ["gate_proj", "up_proj"],
|
||||
}
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    """Build the GLM-4 MoE Lite causal-LM: decoder stack, LM head,
    logits processor, and MoE bookkeeping.

    Args:
        vllm_config: Engine-level config; supplies the HF model config
            and the quantization config.
        prefix: Parameter-name prefix for this module in the checkpoint.
    """
    super().__init__()
    config = vllm_config.model_config.hf_config
    quant_config = vllm_config.quant_config
    self.config = config
    self.quant_config = quant_config

    # MHA vs. MLA selection: fall back to plain multi-head attention when
    # the checkpoint is "deepseek" model_type or exposes no MLA head dims.
    qk_nope_head_dim = getattr(config, "qk_nope_head_dim", 0)
    qk_rope_head_dim = getattr(config, "qk_rope_head_dim", 0)
    self.use_mha = config.model_type == "deepseek" or all(
        dim == 0 for dim in (qk_nope_head_dim, qk_rope_head_dim)
    )

    # NOTE(review): item assignment through `self.packed_modules_mapping`
    # mutates the *class-level* dict shared by all instances — looks
    # intentional given the in-place-consumption comment below, but
    # confirm no two differently-configured instances coexist.
    if self.use_mha:
        self.packed_modules_mapping["qkv_proj"] = ["q_proj", "k_proj", "v_proj"]

    # `packed_modules_mapping` needs to be modified before
    # initializing DeepseekV2Model, as it is passed inplace to
    # quantization config init and may be used to select the
    # quant_method for relevant layers during initialization.
    self.fuse_qkv_a_proj = (
        hasattr(config, "q_lora_rank") and config.q_lora_rank is not None
    )
    if self.fuse_qkv_a_proj:
        self.packed_modules_mapping["fused_qkv_a_proj"] = [
            "q_a_proj",
            "kv_a_proj_with_mqa",
        ]

    self.model = Glm4MoeLiteModel(
        vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
    )
    # Only the last pipeline-parallel rank owns the LM head; earlier
    # ranks get a placeholder layer.
    if get_pp_group().is_last_rank:
        self.lm_head = ParallelLMHead(
            config.vocab_size,
            config.hidden_size,
            quant_config=quant_config,
            prefix=maybe_prefix(prefix, "lm_head"),
        )
    else:
        self.lm_head = PPMissingLayer()
    self.logits_processor = LogitsProcessor(config.vocab_size)
    self.make_empty_intermediate_tensors = (
        self.model.make_empty_intermediate_tensors
    )
    # Set MoE hyperparameters
    self.num_moe_layers = (
        self.config.num_hidden_layers - self.config.first_k_dense_replace
    )
    self.set_moe_parameters()
|
||||
|
||||
def set_moe_parameters(self):
    """Scan the decoder stack for MoE layers and record them.

    Populates ``expert_weights``, ``num_expert_groups``,
    ``moe_mlp_layers`` and ``moe_layers``, then hands one sample MoE
    module (the last one found) to ``extract_moe_parameters``.
    """
    self.expert_weights = []
    self.num_expert_groups = getattr(self.config, "n_group", 1)

    moe_mlps = []
    moe_experts = []
    sample_moe = None
    for decoder_layer in self.model.layers:
        # Layers owned by other pipeline-parallel ranks are placeholders.
        if isinstance(decoder_layer, PPMissingLayer):
            continue

        assert isinstance(decoder_layer, Glm4MoeLiteDecoderLayer)
        mlp = decoder_layer.mlp
        if not isinstance(mlp, Glm4MoeLite):
            continue
        # Keep overwriting so the *last* MoE layer is the sample —
        # the leading layers may be dense.
        sample_moe = mlp
        moe_mlps.append(mlp)
        moe_experts.append(mlp.experts)

    self.moe_mlp_layers = moe_mlps
    self.moe_layers = moe_experts
    self.extract_moe_parameters(sample_moe)
|
||||
|
||||
def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
    """Delegate token-embedding lookup to the inner decoder model."""
    embeddings = self.model.embed_input_ids(input_ids)
    return embeddings
|
||||
|
||||
def forward(
    self,
    input_ids: torch.Tensor | None,
    positions: torch.Tensor,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: torch.Tensor | None = None,
) -> torch.Tensor | IntermediateTensors:
    """Run the decoder stack.

    Returns hidden states on the last pipeline-parallel rank, or
    intermediate tensors to forward to the next rank otherwise.
    """
    return self.model(
        input_ids,
        positions,
        intermediate_tensors,
        inputs_embeds,
    )
|
||||
|
||||
def compute_logits(
    self,
    hidden_states: torch.Tensor,
) -> torch.Tensor | None:
    """Project hidden states to vocabulary logits via the LM head."""
    return self.logits_processor(self.lm_head, hidden_states)
|
||||
|
||||
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
    """Return the expert parameter mapping for weight loading.

    Each entry is ``(param_name, weight_name, expert_id, shard_id)``,
    covering weights plus fp8 weight/activation scales.
    """
    mapping = SharedFusedMoE.make_expert_params_mapping(
        self,
        ckpt_gate_proj_name="gate_proj",
        ckpt_up_proj_name="up_proj",
        ckpt_down_proj_name="down_proj",
        num_experts=self.config.n_routed_experts,
        num_redundant_experts=0,
    )
    return mapping
|
||||
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
|
||||
rocm_aiter_moe_shared_expert_enabled = (
|
||||
rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
|
||||
@@ -327,12 +439,13 @@ class Glm4MoeLiteModel(nn.Module):
|
||||
("gate_up_proj", "gate_proj", 0),
|
||||
("gate_up_proj", "up_proj", 1),
|
||||
]
|
||||
mla_params_mapping = [
|
||||
("fused_qkv_a_proj", "q_a_proj", 0),
|
||||
("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1),
|
||||
mha_params_mapping = [
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
("qkv_proj", "k_proj", "k"),
|
||||
("qkv_proj", "v_proj", "v"),
|
||||
]
|
||||
|
||||
stacked_params_mapping.extend(mla_params_mapping)
|
||||
if self.use_mha:
|
||||
stacked_params_mapping.extend(mha_params_mapping)
|
||||
|
||||
# Params for weights, fp8 weight scales, fp8 activation scales
|
||||
# (param_name, weight_name, expert_id, shard_id)
|
||||
@@ -510,128 +623,71 @@ class Glm4MoeLiteModel(nn.Module):
|
||||
weight_loader(param, loaded_weight)
|
||||
if not is_fusion_moe_shared_experts_layer:
|
||||
loaded_params.add(name)
|
||||
opt_support_quant_method = ["GGUFLinearMethod", "UnquantizedLinearMethod", "CompressedTensorsW8A8Int8", "AWQMarlinLinearMethod"]
|
||||
# add your opt here..
|
||||
def inject_layer(layer, quant_method, is_mla):
|
||||
q_lora_rank = getattr(layer, "q_lora_rank", None)
|
||||
if quant_method in ["UnquantizedLinearMethod", "CompressedTensorsW8A8Int8"]:
|
||||
if q_lora_rank is not None:
|
||||
layer.q_a_proj.weight.data = torch.cat([layer.q_a_proj.weight, layer.kv_a_proj_with_mqa.weight], dim=0)
|
||||
if hasattr(layer.q_a_proj, "weight_scale"):
|
||||
layer.q_a_proj.weight_scale.data = torch.cat([layer.q_a_proj.weight_scale, layer.kv_a_proj_with_mqa.weight_scale], dim=0)
|
||||
del layer.kv_a_proj_with_mqa.weight_scale
|
||||
elif not is_mla:
|
||||
layer.q_proj.weight.data = torch.cat([layer.q_proj.weight, layer.kv_a_proj_with_mqa.weight], dim=0)
|
||||
if hasattr(layer.q_proj, "weight_scale"):
|
||||
layer.q_proj.weight_scale.data = torch.cat([layer.q_proj.weight_scale, layer.kv_a_proj_with_mqa.weight_scale], dim=0)
|
||||
del layer.kv_a_proj_with_mqa.weight_scale
|
||||
else:
|
||||
return
|
||||
del layer.kv_a_proj_with_mqa.weight
|
||||
del layer.kv_a_proj_with_mqa
|
||||
if is_mla:
|
||||
layer.mla_attn.forward = layer.mla_attn.forward_opt
|
||||
else:
|
||||
layer.forward = layer.forward_opt
|
||||
elif quant_method == "GGUFLinearMethod":
|
||||
pass
|
||||
elif quant_method == "AWQMarlinLinearMethod":
|
||||
dtype = layer.kv_a_proj_with_mqa.qweight.dtype
|
||||
assert dtype == torch.int32
|
||||
if layer.q_lora_rank is not None:
|
||||
layer.q_a_proj.qweight.data = torch.cat([layer.q_a_proj.qweight, layer.kv_a_proj_with_mqa.qweight], dim=1)
|
||||
layer.q_a_proj.scales.data = torch.cat([layer.q_a_proj.scales, layer.kv_a_proj_with_mqa.scales], dim=1)
|
||||
del layer.kv_a_proj_with_mqa.scales
|
||||
layer.q_a_proj.qzeros.data = torch.cat([layer.q_a_proj.qzeros, layer.kv_a_proj_with_mqa.qzeros], dim=1)
|
||||
del layer.kv_a_proj_with_mqa.qzeros
|
||||
elif not is_mla:
|
||||
layer.q_proj.weight.data = torch.cat([layer.q_proj.weight, layer.kv_a_proj_with_mqa.weight], dim=1)
|
||||
layer.q_proj.scales.data = torch.cat([layer.q_proj.scales, layer.kv_a_proj_with_mqa.scales], dim=1)
|
||||
del layer.kv_a_proj_with_mqa.scales
|
||||
layer.q_proj.qzeros.data = torch.cat([layer.q_proj.qzeros, layer.kv_a_proj_with_mqa.qzeros], dim=1)
|
||||
del layer.kv_a_proj_with_mqa.qzeros
|
||||
else:
|
||||
return
|
||||
|
||||
del layer.kv_a_proj_with_mqa.qweight
|
||||
del layer.kv_a_proj_with_mqa
|
||||
if is_mla:
|
||||
layer.mla_attn.forward = layer.mla_attn.forward_opt
|
||||
else:
|
||||
layer.forward = layer.forward_opt
|
||||
else:
|
||||
pass
|
||||
|
||||
for _, layer in self.model.named_modules():
|
||||
if layer.__class__.__name__ in ["Glm4MoeLiteAttention","Glm4MoeLiteMLAAttention"]:
|
||||
if hasattr(layer.kv_a_proj_with_mqa, "scheme"):
|
||||
quant_method = layer.kv_a_proj_with_mqa.scheme.__class__.__name__
|
||||
else:
|
||||
quant_method = layer.kv_a_proj_with_mqa.quant_method.__class__.__name__
|
||||
if quant_method not in opt_support_quant_method:
|
||||
break
|
||||
|
||||
inject_layer(layer, quant_method, is_mla = layer.__class__.__name__ == "Glm4MoeLiteMLAAttention")
|
||||
return loaded_params
|
||||
|
||||
|
||||
class Glm4MoeLiteForCausalLM(
    nn.Module, SupportsPP, SupportsLoRA, Glm4LiteMixtureOfExperts
):
    """GLM-4 MoE Lite decoder with a causal language-modeling head.

    Supports pipeline parallelism (SupportsPP), LoRA, and the
    mixture-of-experts bookkeeping mixin.
    """

    # Checkpoint weight names fused into single packed parameters.
    # NOTE(review): class-level dict — __init__ below mutates it in place,
    # so entries added there are shared by every instance.
    packed_modules_mapping = {
        "gate_up_proj": ["gate_proj", "up_proj"],
    }

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build decoder stack, LM head, logits processor and MoE state.

        Args:
            vllm_config: Engine-level config; supplies the HF model config
                and the quantization config.
            prefix: Parameter-name prefix for this module in the checkpoint.
        """
        super().__init__()
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        self.config = config
        self.quant_config = quant_config

        # MHA vs. MLA selection: plain multi-head attention is used when
        # the checkpoint is "deepseek" model_type or has no MLA head dims.
        qk_nope_head_dim = getattr(config, "qk_nope_head_dim", 0)
        qk_rope_head_dim = getattr(config, "qk_rope_head_dim", 0)
        self.use_mha = config.model_type == "deepseek" or all(
            dim == 0 for dim in (qk_nope_head_dim, qk_rope_head_dim)
        )

        if self.use_mha:
            self.packed_modules_mapping["qkv_proj"] = ["q_proj", "k_proj", "v_proj"]

        # `packed_modules_mapping` needs to be modified before
        # initializing DeepseekV2Model, as it is passed inplace to
        # quantization config init and may be used to select the
        # quant_method for relevant layers during initialization.
        self.fuse_qkv_a_proj = (
            hasattr(config, "q_lora_rank") and config.q_lora_rank is not None
        )
        if self.fuse_qkv_a_proj:
            self.packed_modules_mapping["fused_qkv_a_proj"] = [
                "q_a_proj",
                "kv_a_proj_with_mqa",
            ]

        self.model = Glm4MoeLiteModel(
            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
        )
        # Only the last pipeline-parallel rank owns the LM head.
        if get_pp_group().is_last_rank:
            self.lm_head = ParallelLMHead(
                config.vocab_size,
                config.hidden_size,
                quant_config=quant_config,
                prefix=maybe_prefix(prefix, "lm_head"),
            )
        else:
            self.lm_head = PPMissingLayer()
        self.logits_processor = LogitsProcessor(config.vocab_size)
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors
        )
        # Set MoE hyperparameters
        self.num_moe_layers = (
            self.config.num_hidden_layers - self.config.first_k_dense_replace
        )
        self.set_moe_parameters()

    def set_moe_parameters(self):
        """Scan decoder layers for MoE modules and record them.

        Populates ``expert_weights``, ``num_expert_groups``,
        ``moe_mlp_layers`` and ``moe_layers``, then passes one sample MoE
        module (the last found) to ``extract_moe_parameters``.
        """
        self.expert_weights = []

        self.num_expert_groups = getattr(self.config, "n_group", 1)

        self.moe_layers = []
        self.moe_mlp_layers = []
        example_moe = None
        for layer in self.model.layers:
            # Layers owned by other pipeline-parallel ranks are skipped.
            if isinstance(layer, PPMissingLayer):
                continue

            assert isinstance(layer, Glm4MoeLiteDecoderLayer)
            if isinstance(layer.mlp, Glm4MoeLite):
                # Pick last one layer since the first ones may be dense layers.
                example_moe = layer.mlp
                self.moe_mlp_layers.append(layer.mlp)
                self.moe_layers.append(layer.mlp.experts)

        self.extract_moe_parameters(example_moe)

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Delegate token-embedding lookup to the inner decoder model."""
        return self.model.embed_input_ids(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor | IntermediateTensors:
        """Run the decoder stack; returns hidden states (last PP rank)
        or intermediate tensors to forward to the next rank."""
        hidden_states = self.model(
            input_ids, positions, intermediate_tensors, inputs_embeds
        )
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor | None:
        """Project hidden states to vocabulary logits via the LM head."""
        logits = self.logits_processor(self.lm_head, hidden_states)
        return logits

    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
        # Params for weights, fp8 weight scales, fp8 activation scales
        # (param_name, weight_name, expert_id, shard_id)
        return SharedFusedMoE.make_expert_params_mapping(
            self,
            ckpt_gate_proj_name="gate_proj",
            ckpt_down_proj_name="down_proj",
            ckpt_up_proj_name="up_proj",
            num_experts=self.config.n_routed_experts,
            num_redundant_experts=0,
        )

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        """Load checkpoint weights via AutoWeightsLoader.

        Returns the set of parameter names that were loaded.
        """
        loader = AutoWeightsLoader(self)
        return loader.load_weights(weights)
|
||||
|
||||
|
||||
def get_spec_layer_idx_from_weight_name(
|
||||
config: "Glm4MoeLiteConfig", weight_name: str
|
||||
) -> int | None:
|
||||
|
||||
Reference in New Issue
Block a user