[main][bugfix] bugfix for qwen3 moe quantization (#4599)
### What this PR does / why we need it? Fix the issue where the Qwen3 MoE service cannot be started after upgrading the vLLM version. Error info: AttributeError: 'AscendFusedMoE' object has no attribute 'use_dp_chunking' ### Does this PR introduce _any_ user-facing change? no - vLLM version: v0.11.2 --------- Signed-off-by: Wang Kunpeng <1289706727@qq.com>
This commit is contained in:
@@ -27,13 +27,17 @@ from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
MODELS = ["Qwen/Qwen3-0.6B", "Qwen/Qwen3-30B-A3B"]
|
||||
MODELS = [
|
||||
"Qwen/Qwen3-0.6B", "Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
|
||||
def test_data_parallel_inference(model, max_tokens):
|
||||
moe_models = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
|
||||
quantization_models = ["vllm-ascend/Qwen3-30B-A3B-W8A8"]
|
||||
script = "examples/offline_data_parallel.py"
|
||||
|
||||
env = os.environ.copy()
|
||||
@@ -54,8 +58,11 @@ def test_data_parallel_inference(model, max_tokens):
|
||||
"--trust-remote-code",
|
||||
]
|
||||
|
||||
if model == "Qwen/Qwen3-30B-A3B":
|
||||
if model in moe_models:
|
||||
cmd.append("--enable-expert-parallel")
|
||||
if model in quantization_models:
|
||||
cmd.append("--quantization")
|
||||
cmd.append("ascend")
|
||||
|
||||
print(f"Running subprocess: {' '.join(cmd)}")
|
||||
proc = subprocess.run(cmd,
|
||||
|
||||
Reference in New Issue
Block a user