From c316679e658205313c0a547b8ae4365382e3a685 Mon Sep 17 00:00:00 2001
From: Feng-xiaosuo
Date: Sat, 10 Jan 2026 23:01:35 +0800
Subject: [PATCH] adapt to minimax_m2 (#5624)

### What this PR does / why we need it?

This PR fixes MiniMax model loading in the vLLM Ascend backend by:

- Adding a model-type check for "minimax" and "minimax_m2" that replaces the "mlp" prefix with "block_sparse_moe"
- Implementing special handling for the MiniMax expert-layer naming convention
- Adding a MiniMax entry to `packed_modules_model_mapping` so the `qkv_proj` and `experts` modules are packed correctly

Without these changes, MiniMax models fail to load on Ascend devices because of incompatible layer naming and module packing.

### Does this PR introduce _any_ user-facing change?

Yes. Users can now load and run MiniMax models on Ascend hardware with vLLM, enabling inference for this model family on Ascend devices.

### How was this patch tested?

Local testing:

- Verified model loading for minimax-xxx and minimax_m2-xxx model variants on Atlas 800I A2 hardware
- Tested inference with sample prompts using vLLM's OpenAI-compatible API server

Benchmark validation:

- Compared throughput and latency metrics against a GPU baseline
- Verified that memory usage stays within expected limits across different batch sizes
- Tested multi-card inference scenarios with tensor parallelism

- vLLM version: v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/8be6432bdaf6275664d857b1e5e9bf8ed1ce299e

---------

Signed-off-by: Feng-xiaosuo
---
 vllm_ascend/quantization/quant_config.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index 339e6f00..1490f394 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -117,6 +117,18 @@ class AscendQuantConfig(QuantizationConfig):
                          prefix: str) -> Optional["QuantizeMethodBase"]:
         vllm_config = get_current_vllm_config()
         model_type = vllm_config.model_config.hf_text_config.model_type
+
+        if model_type in ["minimax", "minimax_m2"]:
+            prefix = prefix.replace("mlp", "block_sparse_moe")
+
+            # To adapt to MiniMax, rewrite the prefix of the model layer name
+            parts = prefix.split('.')
+            if "experts" in parts and len(parts) > 2:
+                exp_idx = parts.index("experts")
+                if exp_idx + 1 < len(parts) and parts[exp_idx + 1].isdigit():
+                    parts = parts[:exp_idx + 1]
+            prefix = ".".join(parts)
+
         if model_type in packed_modules_model_mapping:
             self.packed_modules_mapping = packed_modules_model_mapping[
                 model_type]
@@ -312,6 +324,14 @@
         ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
         "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"]
     },
+    "minimax_m2": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "experts": ["experts.0.w1", "experts.0.w2", "experts.0.w3"]
+    }
 }
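
For illustration, here is a minimal standalone sketch of the prefix rewrite added in `get_quant_method` above (not part of the patch). The layer names used in the asserts are hypothetical examples, not names taken from an actual checkpoint:

```python
# Minimal sketch of the MiniMax prefix rewrite, extracted as a standalone
# function for illustration. Example layer names below are hypothetical.

def rewrite_minimax_prefix(prefix: str) -> str:
    # MiniMax checkpoints name the MoE block "block_sparse_moe" rather than
    # "mlp", so the prefix is rewritten before the quant-method lookup.
    prefix = prefix.replace("mlp", "block_sparse_moe")

    # Collapse a per-expert suffix such as "experts.3.w1" down to "experts",
    # so every expert resolves to the same fused entry.
    parts = prefix.split('.')
    if "experts" in parts and len(parts) > 2:
        exp_idx = parts.index("experts")
        if exp_idx + 1 < len(parts) and parts[exp_idx + 1].isdigit():
            parts = parts[:exp_idx + 1]
    return ".".join(parts)

# Hypothetical MiniMax layer names:
assert rewrite_minimax_prefix("model.layers.0.mlp.experts.3.w1") \
    == "model.layers.0.block_sparse_moe.experts"
assert rewrite_minimax_prefix("model.layers.0.mlp.gate") \
    == "model.layers.0.block_sparse_moe.gate"
```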