[main][bugfix] bugfix for qwen3 moe quantization (#4599)
### What this PR does / why we need it? Fix the issue where the qwen3 moe service cannot be started due to upgrading the vllm version Error info: AttributeError: 'AscendFusedMoE' object has no attribute 'use dp chunking' ### Does this PR introduce _any_ user-facing change? no - vLLM version: v0.11.2 --------- Signed-off-by: Wang Kunpeng <1289706727@qq.com>
This commit is contained in:
@@ -111,6 +111,10 @@ def parse_args():
|
||||
parser.add_argument("--enable-expert-parallel",
|
||||
action="store_true",
|
||||
help="Enable expert parallel, used in MOE models.")
|
||||
parser.add_argument("--quantization",
|
||||
type=str,
|
||||
default="",
|
||||
help="Use quantization models")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@@ -134,6 +138,7 @@ def main(
|
||||
enable_expert_parallel,
|
||||
enforce_eager,
|
||||
trust_remote_code,
|
||||
quantization,
|
||||
):
|
||||
# DP only support on V1 engine
|
||||
os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
|
||||
@@ -185,6 +190,7 @@ def main(
|
||||
enforce_eager=enforce_eager,
|
||||
enable_expert_parallel=enable_expert_parallel,
|
||||
trust_remote_code=trust_remote_code,
|
||||
quantization=quantization,
|
||||
)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
# Print the outputs.
|
||||
@@ -220,6 +226,8 @@ if __name__ == "__main__":
|
||||
assert dp_size % node_size == 0, "dp_size should be divisible by node_size"
|
||||
dp_per_node = dp_size // node_size
|
||||
|
||||
quantization = args.quantization if args.quantization else None
|
||||
|
||||
from multiprocessing import Process
|
||||
|
||||
procs = []
|
||||
@@ -238,6 +246,7 @@ if __name__ == "__main__":
|
||||
args.enable_expert_parallel,
|
||||
args.enforce_eager,
|
||||
args.trust_remote_code,
|
||||
quantization,
|
||||
),
|
||||
)
|
||||
proc.start()
|
||||
|
||||
@@ -27,13 +27,17 @@ from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
MODELS = ["Qwen/Qwen3-0.6B", "Qwen/Qwen3-30B-A3B"]
|
||||
MODELS = [
|
||||
"Qwen/Qwen3-0.6B", "Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
|
||||
def test_data_parallel_inference(model, max_tokens):
|
||||
moe_models = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
|
||||
quantization_models = ["vllm-ascend/Qwen3-30B-A3B-W8A8"]
|
||||
script = "examples/offline_data_parallel.py"
|
||||
|
||||
env = os.environ.copy()
|
||||
@@ -54,8 +58,11 @@ def test_data_parallel_inference(model, max_tokens):
|
||||
"--trust-remote-code",
|
||||
]
|
||||
|
||||
if model == "Qwen/Qwen3-30B-A3B":
|
||||
if model in moe_models:
|
||||
cmd.append("--enable-expert-parallel")
|
||||
if model in quantization_models:
|
||||
cmd.append("--quantization")
|
||||
cmd.append("ascend")
|
||||
|
||||
print(f"Running subprocess: {' '.join(cmd)}")
|
||||
proc = subprocess.run(cmd,
|
||||
|
||||
@@ -408,11 +408,10 @@ class AscendFusedMoEMethod(FusedMoEMethodBase):
|
||||
quant_config: The Ascend quantization config.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
quant_config: AscendQuantConfig,
|
||||
prefix: str,
|
||||
packed_modules_mapping: Dict[str, Any],
|
||||
layer: torch.nn.Module = None):
|
||||
def __init__(self, quant_config: AscendQuantConfig, prefix: str,
|
||||
packed_modules_mapping: Dict[str,
|
||||
Any], layer: torch.nn.Module):
|
||||
super().__init__(layer.moe_config)
|
||||
self.quant_method = get_quant_method(quant_config.quant_description,
|
||||
prefix,
|
||||
"moe",
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
# Adapted from vllm/tests/kernels/test_moe.py
|
||||
|
||||
import os
|
||||
from typing import Any, Callable, Optional, Tuple, Union
|
||||
from typing import Any, Callable, Dict, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
@@ -45,7 +45,9 @@ from vllm_ascend.ascend_forward_context import FusedMoEState
|
||||
from vllm_ascend.distributed.parallel_state import get_mc2_group
|
||||
from vllm_ascend.eplb.core.eplb_utils import determine_default_log2phy_map
|
||||
from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
|
||||
from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod
|
||||
from vllm_ascend.quantization.quant_config import (AscendFusedMoEMethod,
|
||||
AscendQuantConfig)
|
||||
from vllm_ascend.quantization.utils import get_quant_method
|
||||
from vllm_ascend.torchair.ops.sequence_parallel import MetadataForPadding
|
||||
from vllm_ascend.torchair.utils import (get_all_reduce_merge_state,
|
||||
get_rm_router_logits_state,
|
||||
@@ -936,6 +938,15 @@ class TorchairAscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
|
||||
ep_group=get_ep_group())
|
||||
|
||||
|
||||
class TorchairAscendFusedMoEMethod(AscendFusedMoEMethod):
|
||||
|
||||
def __init__(self, quant_config: AscendQuantConfig, prefix: str,
|
||||
packed_modules_mapping: Dict[str, Any]):
|
||||
self.quant_method = get_quant_method(quant_config.quant_description,
|
||||
prefix, "moe",
|
||||
packed_modules_mapping)
|
||||
|
||||
|
||||
class TorchairAscendFusedMoE(FusedMoE):
|
||||
|
||||
# The moe_counter parameter is required during the initialization of EPLB
|
||||
@@ -1115,7 +1126,7 @@ class TorchairAscendFusedMoE(FusedMoE):
|
||||
self.quant_method = TorchairAscendUnquantizedFusedMoEMethod(
|
||||
self.moe)
|
||||
else:
|
||||
self.quant_method = AscendFusedMoEMethod(
|
||||
self.quant_method = TorchairAscendFusedMoEMethod(
|
||||
quant_config, prefix, quant_config.packed_modules_mapping)
|
||||
|
||||
assert self.quant_method is not None
|
||||
|
||||
Reference in New Issue
Block a user