[main][bugfix] bugfix for qwen3 moe quantization (#4599)

### What this PR does / why we need it?
Fix an issue where the Qwen3 MoE service fails to start after upgrading
the vLLM version.

Error info:
AttributeError: 'AscendFusedMoE' object has no attribute
'use_dp_chunking'

### Does this PR introduce _any_ user-facing change?
no


- vLLM version: v0.11.2

---------

Signed-off-by: Wang Kunpeng <1289706727@qq.com>
This commit is contained in:
Wang Kunpeng
2025-12-01 23:48:57 +08:00
committed by GitHub
parent 12ca99c94e
commit a9c4b8604a
4 changed files with 36 additions and 10 deletions

View File

@@ -111,6 +111,10 @@ def parse_args():
parser.add_argument("--enable-expert-parallel", parser.add_argument("--enable-expert-parallel",
action="store_true", action="store_true",
help="Enable expert parallel, used in MOE models.") help="Enable expert parallel, used in MOE models.")
parser.add_argument("--quantization",
type=str,
default="",
help="Use quantization models")
return parser.parse_args() return parser.parse_args()
@@ -134,6 +138,7 @@ def main(
enable_expert_parallel, enable_expert_parallel,
enforce_eager, enforce_eager,
trust_remote_code, trust_remote_code,
quantization,
): ):
# DP only support on V1 engine # DP only support on V1 engine
os.environ["VLLM_DP_RANK"] = str(global_dp_rank) os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
@@ -185,6 +190,7 @@ def main(
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
enable_expert_parallel=enable_expert_parallel, enable_expert_parallel=enable_expert_parallel,
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
quantization=quantization,
) )
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
# Print the outputs. # Print the outputs.
@@ -220,6 +226,8 @@ if __name__ == "__main__":
assert dp_size % node_size == 0, "dp_size should be divisible by node_size" assert dp_size % node_size == 0, "dp_size should be divisible by node_size"
dp_per_node = dp_size // node_size dp_per_node = dp_size // node_size
quantization = args.quantization if args.quantization else None
from multiprocessing import Process from multiprocessing import Process
procs = [] procs = []
@@ -238,6 +246,7 @@ if __name__ == "__main__":
args.enable_expert_parallel, args.enable_expert_parallel,
args.enforce_eager, args.enforce_eager,
args.trust_remote_code, args.trust_remote_code,
quantization,
), ),
) )
proc.start() proc.start()

View File

@@ -27,13 +27,17 @@ from unittest.mock import patch
import pytest import pytest
MODELS = ["Qwen/Qwen3-0.6B", "Qwen/Qwen3-30B-A3B"] MODELS = [
"Qwen/Qwen3-0.6B", "Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"
]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"}) @patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
def test_data_parallel_inference(model, max_tokens): def test_data_parallel_inference(model, max_tokens):
moe_models = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
quantization_models = ["vllm-ascend/Qwen3-30B-A3B-W8A8"]
script = "examples/offline_data_parallel.py" script = "examples/offline_data_parallel.py"
env = os.environ.copy() env = os.environ.copy()
@@ -54,8 +58,11 @@ def test_data_parallel_inference(model, max_tokens):
"--trust-remote-code", "--trust-remote-code",
] ]
if model == "Qwen/Qwen3-30B-A3B": if model in moe_models:
cmd.append("--enable-expert-parallel") cmd.append("--enable-expert-parallel")
if model in quantization_models:
cmd.append("--quantization")
cmd.append("ascend")
print(f"Running subprocess: {' '.join(cmd)}") print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(cmd, proc = subprocess.run(cmd,

View File

@@ -408,11 +408,10 @@ class AscendFusedMoEMethod(FusedMoEMethodBase):
quant_config: The Ascend quantization config. quant_config: The Ascend quantization config.
""" """
def __init__(self, def __init__(self, quant_config: AscendQuantConfig, prefix: str,
quant_config: AscendQuantConfig, packed_modules_mapping: Dict[str,
prefix: str, Any], layer: torch.nn.Module):
packed_modules_mapping: Dict[str, Any], super().__init__(layer.moe_config)
layer: torch.nn.Module = None):
self.quant_method = get_quant_method(quant_config.quant_description, self.quant_method = get_quant_method(quant_config.quant_description,
prefix, prefix,
"moe", "moe",

View File

@@ -16,7 +16,7 @@
# Adapted from vllm/tests/kernels/test_moe.py # Adapted from vllm/tests/kernels/test_moe.py
import os import os
from typing import Any, Callable, Optional, Tuple, Union from typing import Any, Callable, Dict, Optional, Tuple, Union
import torch import torch
import torch.distributed as dist import torch.distributed as dist
@@ -45,7 +45,9 @@ from vllm_ascend.ascend_forward_context import FusedMoEState
from vllm_ascend.distributed.parallel_state import get_mc2_group from vllm_ascend.distributed.parallel_state import get_mc2_group
from vllm_ascend.eplb.core.eplb_utils import determine_default_log2phy_map from vllm_ascend.eplb.core.eplb_utils import determine_default_log2phy_map
from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod from vllm_ascend.quantization.quant_config import (AscendFusedMoEMethod,
AscendQuantConfig)
from vllm_ascend.quantization.utils import get_quant_method
from vllm_ascend.torchair.ops.sequence_parallel import MetadataForPadding from vllm_ascend.torchair.ops.sequence_parallel import MetadataForPadding
from vllm_ascend.torchair.utils import (get_all_reduce_merge_state, from vllm_ascend.torchair.utils import (get_all_reduce_merge_state,
get_rm_router_logits_state, get_rm_router_logits_state,
@@ -936,6 +938,15 @@ class TorchairAscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
ep_group=get_ep_group()) ep_group=get_ep_group())
class TorchairAscendFusedMoEMethod(AscendFusedMoEMethod):
# Torchair variant of AscendFusedMoEMethod with its own constructor.
# Unlike the parent __init__ in this same change (which now requires a
# `layer` argument and calls super().__init__(layer.moe_config)), this
# constructor takes no `layer` and does not invoke the parent __init__.
# NOTE(review): presumably this keeps the torchair call site, which passes
# only (quant_config, prefix, packed_modules_mapping), working without a
# layer/moe_config -- confirm that no base-class state is needed here.
def __init__(self, quant_config: AscendQuantConfig, prefix: str,
packed_modules_mapping: Dict[str, Any]):
# Look up the "moe" quantization method for this prefix from the quant
# description and store it as the only attribute set by this constructor.
self.quant_method = get_quant_method(quant_config.quant_description,
prefix, "moe",
packed_modules_mapping)
class TorchairAscendFusedMoE(FusedMoE): class TorchairAscendFusedMoE(FusedMoE):
# The moe_counter parameter is required during the initialization of EPLB # The moe_counter parameter is required during the initialization of EPLB
@@ -1115,7 +1126,7 @@ class TorchairAscendFusedMoE(FusedMoE):
self.quant_method = TorchairAscendUnquantizedFusedMoEMethod( self.quant_method = TorchairAscendUnquantizedFusedMoEMethod(
self.moe) self.moe)
else: else:
self.quant_method = AscendFusedMoEMethod( self.quant_method = TorchairAscendFusedMoEMethod(
quant_config, prefix, quant_config.packed_modules_mapping) quant_config, prefix, quant_config.packed_modules_mapping)
assert self.quant_method is not None assert self.quant_method is not None