adapt to minimax_m2 (#5624)
### What this PR does / why we need it?
This PR fixes MiniMax model loading in the vLLM Ascend backend by:
- adding a model-type check for "minimax" and "minimax_m2" that replaces the "mlp" prefix with "block_sparse_moe" (sketched below)
- implementing special handling for MiniMax expert-layer naming conventions
- adding a MiniMax entry to `packed_modules_model_mapping` so the `qkv_proj` and experts modules are packed correctly

Without these changes, MiniMax models fail to load on Ascend devices due to incompatible layer naming and module packing.
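For reference, here is a minimal standalone sketch of the prefix normalization this PR adds. It mirrors the committed diff further down; `normalize_minimax_prefix` is an illustrative helper name, not a vLLM API:

```python
# Illustrative sketch of the prefix rewrite added to AscendQuantConfig;
# the helper name is hypothetical, the logic mirrors the committed diff.
def normalize_minimax_prefix(prefix: str) -> str:
    # MiniMax names its MoE block "block_sparse_moe" rather than "mlp".
    prefix = prefix.replace("mlp", "block_sparse_moe")
    # Drop the trailing per-expert index and weight name, e.g.
    # "...block_sparse_moe.experts.3.w1" -> "...block_sparse_moe.experts".
    parts = prefix.split(".")
    if "experts" in parts and len(parts) > 2:
        exp_idx = parts.index("experts")
        if exp_idx + 1 < len(parts) and parts[exp_idx + 1].isdigit():
            prefix = ".".join(parts[:exp_idx + 1])
    return prefix

print(normalize_minimax_prefix("model.layers.0.mlp.experts.3.w1"))
# -> model.layers.0.block_sparse_moe.experts
```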
### Does this PR introduce _any_ user-facing change?
Yes. Users can now load and run MiniMax models on Ascend hardware with vLLM, enabling inference for this model family on Ascend devices.
### How was this patch tested?
Local testing:
- Verified model loading for minimax-xxx and minimax_m2-xxx model variants on Atlas 800I A2 hardware
- Tested inference with sample prompts against vLLM's OpenAI-compatible API server (see the example request below)

Benchmark validation:
- Compared throughput and latency metrics against the GPU baseline
- Verified that memory usage stays within expected limits across batch sizes
- Tested multi-card inference scenarios with tensor parallelism
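A request of the kind used in the smoke test might look like the following. The server address, port, and model path are placeholders, not taken from this PR:

```python
# Hypothetical smoke test against a locally running vLLM OpenAI-compatible
# server, e.g. started with:
#   vllm serve <path-to-minimax-model> --tensor-parallel-size 2
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",  # default `vllm serve` address
    json={
        "model": "<path-to-minimax-model>",  # placeholder model name
        "prompt": "The capital of France is",
        "max_tokens": 32,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```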
- vLLM version: v0.13.0
- vLLM main: 8be6432bda
---------
Signed-off-by: Feng-xiaosuo <tengchang1@huawei.com>
Relevant hunks from the commit:

```diff
@@ -117,6 +117,18 @@ class AscendQuantConfig(QuantizationConfig):
                          prefix: str) -> Optional["QuantizeMethodBase"]:
         vllm_config = get_current_vllm_config()
         model_type = vllm_config.model_config.hf_text_config.model_type
+
+        if model_type in ["minimax", "minimax_m2"]:
+            prefix = prefix.replace("mlp", "block_sparse_moe")
+
+            # To adapt to MiniMax, modify the prefix of the model layer name
+            parts = prefix.split('.')
+            if "experts" in parts and len(parts) > 2:
+                exp_idx = parts.index("experts")
+                if exp_idx + 1 < len(parts) and parts[exp_idx + 1].isdigit():
+                    parts = parts[:exp_idx + 1]
+                    prefix = ".".join(parts)
+
         if model_type in packed_modules_model_mapping:
             self.packed_modules_mapping = packed_modules_model_mapping[
                 model_type]
@@ -312,6 +324,14 @@ packed_modules_model_mapping = {
         ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
         "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"]
     },
+    "minimax_m2": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "experts": ["experts.0.w1", "experts.0.w2", "experts.0.w3"]
+    }
 }
```
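For illustration, this is roughly how a packed-modules mapping is consumed: a fused module name such as `qkv_proj` is expanded back to the per-projection names so per-layer settings can be looked up under their original checkpoint names. The helper below is a hypothetical sketch under that assumption, not vLLM's actual lookup code:

```python
# Hypothetical sketch of how a packed-modules entry is used; vLLM's real
# lookup lives in its quantization layer, this only shows the idea.
packed_modules_model_mapping = {
    "minimax_m2": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "experts": ["experts.0.w1", "experts.0.w2", "experts.0.w3"],
    },
}

def expand_packed_prefix(prefix: str, model_type: str) -> list[str]:
    """Expand a fused-module prefix into the unfused names it packs."""
    mapping = packed_modules_model_mapping.get(model_type, {})
    head, _, leaf = prefix.rpartition(".")
    if leaf in mapping:
        return [f"{head}.{name}" for name in mapping[leaf]]
    return [prefix]  # not a packed module: pass through unchanged

print(expand_packed_prefix("model.layers.0.self_attn.qkv_proj", "minimax_m2"))
# -> ['model.layers.0.self_attn.q_proj',
#     'model.layers.0.self_attn.k_proj',
#     'model.layers.0.self_attn.v_proj']
```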