From c316679e658205313c0a547b8ae4365382e3a685 Mon Sep 17 00:00:00 2001
From: Feng-xiaosuo
Date: Sat, 10 Jan 2026 23:01:35 +0800
Subject: [PATCH] adapt to minimax_m2 (#5624)

### What this PR does / why we need it?

This PR fixes MiniMax model loading in the vLLM Ascend backend by:

- Adding a model-type check for "minimax" and "minimax_m2" that replaces the "mlp" prefix with "block_sparse_moe"
- Implementing special handling for the MiniMax expert-layer naming convention
- Adding a MiniMax entry to `packed_modules_model_mapping` so the `qkv_proj` and `experts` modules are packed correctly

Without these changes, MiniMax models fail to load on Ascend devices because of incompatible layer naming and module packing.

### Does this PR introduce _any_ user-facing change?

Yes. Users can now load and run MiniMax models on Ascend hardware with vLLM, enabling inference for this model family on Ascend devices.

### How was this patch tested?

Local testing:

- Verified model loading for minimax-xxx and minimax_m2-xxx model variants on Atlas 800I A2 hardware
- Tested inference with sample prompts using vLLM's OpenAI-compatible API server

Benchmark validation:

- Compared throughput and latency metrics against a GPU baseline
- Verified that memory usage stays within expected limits across different batch sizes
- Tested multi-card inference scenarios with tensor parallelism

- vLLM version: v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/8be6432bdaf6275664d857b1e5e9bf8ed1ce299e

---------

Signed-off-by: Feng-xiaosuo
---
 vllm_ascend/quantization/quant_config.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index 339e6f00..1490f394 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -117,6 +117,18 @@ class AscendQuantConfig(QuantizationConfig):
                          prefix: str) -> Optional["QuantizeMethodBase"]:
         vllm_config = get_current_vllm_config()
         model_type = vllm_config.model_config.hf_text_config.model_type
+
+        if model_type in ["minimax", "minimax_m2"]:
+            prefix = prefix.replace("mlp", "block_sparse_moe")
+
+            # To adapt to MiniMax, rewrite the prefix of the model layer name
+            parts = prefix.split('.')
+            if "experts" in parts and len(parts) > 2:
+                exp_idx = parts.index("experts")
+                if exp_idx + 1 < len(parts) and parts[exp_idx + 1].isdigit():
+                    parts = parts[:exp_idx + 1]
+            prefix = ".".join(parts)
+
         if model_type in packed_modules_model_mapping:
             self.packed_modules_mapping = packed_modules_model_mapping[
                 model_type]
@@ -312,6 +324,14 @@
         ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
         "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"]
     },
+    "minimax_m2": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "experts": ["experts.0.w1", "experts.0.w2", "experts.0.w3"]
+    }
 }
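
For illustration, here is a minimal standalone sketch of the prefix rewrite added in `get_quant_method` above (not part of the patch). The layer names used in the asserts are hypothetical examples, not names taken from an actual checkpoint:

```python
# Minimal sketch of the MiniMax prefix rewrite, extracted as a standalone
# function for illustration. Example layer names below are hypothetical.

def rewrite_minimax_prefix(prefix: str) -> str:
    # MiniMax checkpoints name the MoE block "block_sparse_moe" rather than
    # "mlp", so the prefix is rewritten before the quant-method lookup.
    prefix = prefix.replace("mlp", "block_sparse_moe")

    # Collapse a per-expert suffix such as "experts.3.w1" down to "experts",
    # so every expert resolves to the same fused entry.
    parts = prefix.split('.')
    if "experts" in parts and len(parts) > 2:
        exp_idx = parts.index("experts")
        if exp_idx + 1 < len(parts) and parts[exp_idx + 1].isdigit():
            parts = parts[:exp_idx + 1]
    return ".".join(parts)

# Hypothetical MiniMax layer names:
assert rewrite_minimax_prefix("model.layers.0.mlp.experts.3.w1") \
    == "model.layers.0.block_sparse_moe.experts"
assert rewrite_minimax_prefix("model.layers.0.mlp.gate") \
    == "model.layers.0.block_sparse_moe.gate"
```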