[main][bugfix] bugfix for qwen3 moe quantization (#4599)
### What this PR does / why we need it? Fix the issue where the Qwen3 MoE service cannot be started after upgrading the vLLM version. Error info: AttributeError: 'AscendFusedMoE' object has no attribute 'use_dp_chunking' ### Does this PR introduce _any_ user-facing change? No. - vLLM version: v0.11.2 --------- Signed-off-by: Wang Kunpeng <1289706727@qq.com>
This commit is contained in:
@@ -111,6 +111,10 @@ def parse_args():
|
||||
parser.add_argument("--enable-expert-parallel",
|
||||
action="store_true",
|
||||
help="Enable expert parallel, used in MOE models.")
|
||||
parser.add_argument("--quantization",
|
||||
type=str,
|
||||
default="",
|
||||
help="Use quantization models")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@@ -134,6 +138,7 @@ def main(
|
||||
enable_expert_parallel,
|
||||
enforce_eager,
|
||||
trust_remote_code,
|
||||
quantization,
|
||||
):
|
||||
# DP only support on V1 engine
|
||||
os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
|
||||
@@ -185,6 +190,7 @@ def main(
|
||||
enforce_eager=enforce_eager,
|
||||
enable_expert_parallel=enable_expert_parallel,
|
||||
trust_remote_code=trust_remote_code,
|
||||
quantization=quantization,
|
||||
)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
# Print the outputs.
|
||||
@@ -220,6 +226,8 @@ if __name__ == "__main__":
|
||||
assert dp_size % node_size == 0, "dp_size should be divisible by node_size"
|
||||
dp_per_node = dp_size // node_size
|
||||
|
||||
quantization = args.quantization if args.quantization else None
|
||||
|
||||
from multiprocessing import Process
|
||||
|
||||
procs = []
|
||||
@@ -238,6 +246,7 @@ if __name__ == "__main__":
|
||||
args.enable_expert_parallel,
|
||||
args.enforce_eager,
|
||||
args.trust_remote_code,
|
||||
quantization,
|
||||
),
|
||||
)
|
||||
proc.start()
|
||||
|
||||
Reference in New Issue
Block a user