From 724d04391e89408f3fc3fb0cef13e5f4cf155dde Mon Sep 17 00:00:00 2001
From: JeffLee1874 <52711096+JeffLee1874@users.noreply.github.com>
Date: Wed, 17 Dec 2025 16:15:29 +0800
Subject: [PATCH] [model] Support PanguUltraMoE (#4615)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What this PR does / why we need it?

Add support for the PanguUltraMoE model.

### Test result

#### Start serving with a W8A8 quantized model and ACL graph:

Master node:

```
vllm serve $LOCAL_CKPT_DIR \
    --host 0.0.0.0 \
    --port 8000 \
    --data-parallel-size 2 \
    --data-parallel-size-local 1 \
    --data-parallel-address $MASTER_NODE_IP \
    --data-parallel-rpc-port 13389 \
    --tensor-parallel-size 16 \
    --seed 1024 \
    --enable-expert-parallel \
    --served-model-name $NAME \
    --max-model-len 4096 \
    --max-num-batched-tokens 256 \
    --max-num-seqs 18 \
    --trust-remote-code \
    --gpu-memory-utilization 0.90 \
    --quantization ascend \
    --additional-config '{"ascend_scheduler_config":{"enabled":false, "enable_chunked_prefill":true, "chunked_prefill_enabled":true},"torchair_graph_config":{"enabled":false}}' \
    --speculative_config '{"method": "pangu_ultra_moe_mtp", "num_speculative_tokens": 1}'
```

Other nodes:

```
vllm serve $LOCAL_CKPT_DIR \
    --host 0.0.0.0 \
    --port 8000 \
    --headless \
    --data-parallel-size 2 \
    --data-parallel-size-local 1 \
    --data-parallel-start-rank 1 \
    --data-parallel-address $MASTER_NODE_IP \
    --data-parallel-rpc-port 13389 \
    --tensor-parallel-size 16 \
    --seed 1024 \
    --enable-expert-parallel \
    --served-model-name $NAME \
    --max-model-len 4096 \
    --max-num-batched-tokens 256 \
    --max-num-seqs 18 \
    --trust-remote-code \
    --gpu-memory-utilization 0.90 \
    --quantization ascend \
    --additional-config '{"ascend_scheduler_config":{"enabled":false, "enable_chunked_prefill":true, "chunked_prefill_enabled":true},"torchair_graph_config":{"enabled":false}}' \
    --speculative_config '{"method": "pangu_ultra_moe_mtp", "num_speculative_tokens": 1}'
```

Request & Response:

- Request

```
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "messages": [
            {"role": "system", "content": ""},
            {"role": "user", "content": "你是谁?"}
        ],
        "max_tokens": "64",
        "top_p": "0.95",
        "top_k": "50",
        "temperature": "0.6",
        "add_special_tokens": true
    }'
```

- Response

```
[unused16] 好的，用户问我是谁，我需要按照之前的设定来回答。首先，我的角色是盘古，由华为开发，属于推理模型。要强调我的主要功能是解答问题和提供信息支持，特别是通过逻辑推理和数据分析处理复杂任务。需要保持回答简洁，用中文，并且符合用户的
```

(The prompt "你是谁?" asks "Who are you?"; the Chinese response shows the model reasoning that it should answer as Pangu, a reasoning model developed by Huawei that answers questions and provides information support through logical reasoning and data analysis. The output is cut off mid-sentence by the 64-token limit.)
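For reference, the same request can be issued through vLLM's OpenAI-compatible endpoint with the `openai` Python client. This is a minimal sketch, not part of the tested commands above: it assumes the server is reachable at `localhost:8000`, the model name is a placeholder for whatever was passed to `--served-model-name`, and passing `top_k`/`add_special_tokens` via `extra_body` relies on vLLM's sampling-parameter extensions.

```
# Sketch of the curl request above using the openai client (>= 1.0).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="pangu-ultra-moe",  # placeholder for $NAME
    messages=[
        {"role": "system", "content": ""},
        {"role": "user", "content": "你是谁?"},
    ],
    max_tokens=64,
    temperature=0.6,
    top_p=0.95,
    # vLLM-specific sampling parameters go through extra_body.
    extra_body={"top_k": 50, "add_special_tokens": True},
)
print(response.choices[0].message.content)
```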
"gate_up_proj": ["gate_proj", "up_proj"], + "experts": + ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], + "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"] + }, "qwen3_next": { "qkv_proj": [ "q_proj", diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py index a152aa47..cd4b9a04 100644 --- a/vllm_ascend/spec_decode/mtp_proposer.py +++ b/vllm_ascend/spec_decode/mtp_proposer.py @@ -44,6 +44,8 @@ PADDING_SLOT_ID = -1 _MTP_MODELS = { "DeepseekV3ForCausalLM": ("vllm.model_executor.models.deepseek_mtp", "DeepSeekMTP"), + "PanguUltraMoEForCausalLM": + ("vllm.model_executor.models.openpangu_mtp", "OpenPanguMTP"), "DeepseekV32ForCausalLM": ("vllm.model_executor.models.deepseek_mtp", "DeepSeekMTP"), "Qwen3NextForCausalLM":