adapt to minimax_m2 (#5624)
### What this PR does / why we need it?
This PR fixes MiniMax model loading in the vLLM Ascend backend by:
- adding a model-type check for "minimax" and "minimax_m2" that replaces the "mlp" prefix with "block_sparse_moe" (sketched below)
- implementing special handling for MiniMax expert-layer naming conventions
- adding a MiniMax entry to `packed_modules_model_mapping` so the `qkv_proj` and experts modules are packed correctly

Without these changes, MiniMax models fail to load on Ascend devices due to incompatible layer naming and module packing.
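For reference, here is a minimal standalone sketch of the prefix normalization this PR adds. It mirrors the committed diff further down; `normalize_minimax_prefix` is an illustrative helper name, not a vLLM API:

```python
# Illustrative sketch of the prefix rewrite added to AscendQuantConfig;
# the helper name is hypothetical, the logic mirrors the committed diff.
def normalize_minimax_prefix(prefix: str) -> str:
    # MiniMax names its MoE block "block_sparse_moe" rather than "mlp".
    prefix = prefix.replace("mlp", "block_sparse_moe")
    # Drop the trailing per-expert index and weight name, e.g.
    # "...block_sparse_moe.experts.3.w1" -> "...block_sparse_moe.experts".
    parts = prefix.split(".")
    if "experts" in parts and len(parts) > 2:
        exp_idx = parts.index("experts")
        if exp_idx + 1 < len(parts) and parts[exp_idx + 1].isdigit():
            prefix = ".".join(parts[:exp_idx + 1])
    return prefix

print(normalize_minimax_prefix("model.layers.0.mlp.experts.3.w1"))
# -> model.layers.0.block_sparse_moe.experts
```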
### Does this PR introduce _any_ user-facing change?
Yes. Users can now load and run MiniMax models on Ascend hardware with vLLM, enabling inference for this model family on Ascend devices.
### How was this patch tested?
Local testing:
- Verified model loading for minimax-xxx and minimax_m2-xxx model variants on Atlas 800I A2 hardware
- Tested inference with sample prompts against vLLM's OpenAI-compatible API server (see the example request below)

Benchmark validation:
- Compared throughput and latency metrics against the GPU baseline
- Verified that memory usage stays within expected limits across batch sizes
- Tested multi-card inference scenarios with tensor parallelism
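A request of the kind used in the smoke test might look like the following. The server address, port, and model path are placeholders, not taken from this PR:

```python
# Hypothetical smoke test against a locally running vLLM OpenAI-compatible
# server, e.g. started with:
#   vllm serve <path-to-minimax-model> --tensor-parallel-size 2
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",  # default `vllm serve` address
    json={
        "model": "<path-to-minimax-model>",  # placeholder model name
        "prompt": "The capital of France is",
        "max_tokens": 32,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```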
- vLLM version: v0.13.0
- vLLM main: 8be6432bda
---------
Signed-off-by: Feng-xiaosuo <tengchang1@huawei.com>
Relevant hunks from the commit:

```diff
@@ -117,6 +117,18 @@ class AscendQuantConfig(QuantizationConfig):
                          prefix: str) -> Optional["QuantizeMethodBase"]:
         vllm_config = get_current_vllm_config()
         model_type = vllm_config.model_config.hf_text_config.model_type
+
+        if model_type in ["minimax", "minimax_m2"]:
+            prefix = prefix.replace("mlp", "block_sparse_moe")
+
+            # To adapt to MiniMax, modify the prefix of the model layer name
+            parts = prefix.split('.')
+            if "experts" in parts and len(parts) > 2:
+                exp_idx = parts.index("experts")
+                if exp_idx + 1 < len(parts) and parts[exp_idx + 1].isdigit():
+                    parts = parts[:exp_idx + 1]
+                    prefix = ".".join(parts)
+
         if model_type in packed_modules_model_mapping:
             self.packed_modules_mapping = packed_modules_model_mapping[
                 model_type]
@@ -312,6 +324,14 @@ packed_modules_model_mapping = {
         ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
         "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"]
     },
+    "minimax_m2": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "experts": ["experts.0.w1", "experts.0.w2", "experts.0.w3"]
+    }
 }
```
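For illustration, this is roughly how a packed-modules mapping is consumed: a fused module name such as `qkv_proj` is expanded back to the per-projection names so per-layer settings can be looked up under their original checkpoint names. The helper below is a hypothetical sketch under that assumption, not vLLM's actual lookup code:

```python
# Hypothetical sketch of how a packed-modules entry is used; vLLM's real
# lookup lives in its quantization layer, this only shows the idea.
packed_modules_model_mapping = {
    "minimax_m2": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "experts": ["experts.0.w1", "experts.0.w2", "experts.0.w3"],
    },
}

def expand_packed_prefix(prefix: str, model_type: str) -> list[str]:
    """Expand a fused-module prefix into the unfused names it packs."""
    mapping = packed_modules_model_mapping.get(model_type, {})
    head, _, leaf = prefix.rpartition(".")
    if leaf in mapping:
        return [f"{head}.{name}" for name in mapping[leaf]]
    return [prefix]  # not a packed module: pass through unchanged

print(expand_packed_prefix("model.layers.0.self_attn.qkv_proj", "minimax_m2"))
# -> ['model.layers.0.self_attn.q_proj',
#     'model.layers.0.self_attn.k_proj',
#     'model.layers.0.self_attn.v_proj']
```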