[main][bugfix] bugfix for qwen3 moe quantization (#4599)
### What this PR does / why we need it? Fix the issue where the Qwen3 MoE service cannot be started after upgrading the vLLM version. Error info: AttributeError: 'AscendFusedMoE' object has no attribute 'use_dp_chunking' ### Does this PR introduce _any_ user-facing change? No. - vLLM version: v0.11.2 --------- Signed-off-by: Wang Kunpeng <1289706727@qq.com>
This commit is contained in:
@@ -111,6 +111,10 @@ def parse_args():
|
||||
parser.add_argument("--enable-expert-parallel",
|
||||
action="store_true",
|
||||
help="Enable expert parallel, used in MOE models.")
|
||||
parser.add_argument("--quantization",
|
||||
type=str,
|
||||
default="",
|
||||
help="Use quantization models")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@@ -134,6 +138,7 @@ def main(
|
||||
enable_expert_parallel,
|
||||
enforce_eager,
|
||||
trust_remote_code,
|
||||
quantization,
|
||||
):
|
||||
# DP only support on V1 engine
|
||||
os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
|
||||
@@ -185,6 +190,7 @@ def main(
|
||||
enforce_eager=enforce_eager,
|
||||
enable_expert_parallel=enable_expert_parallel,
|
||||
trust_remote_code=trust_remote_code,
|
||||
quantization=quantization,
|
||||
)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
# Print the outputs.
|
||||
@@ -220,6 +226,8 @@ if __name__ == "__main__":
|
||||
assert dp_size % node_size == 0, "dp_size should be divisible by node_size"
|
||||
dp_per_node = dp_size // node_size
|
||||
|
||||
quantization = args.quantization if args.quantization else None
|
||||
|
||||
from multiprocessing import Process
|
||||
|
||||
procs = []
|
||||
@@ -238,6 +246,7 @@ if __name__ == "__main__":
|
||||
args.enable_expert_parallel,
|
||||
args.enforce_eager,
|
||||
args.trust_remote_code,
|
||||
quantization,
|
||||
),
|
||||
)
|
||||
proc.start()
|
||||
|
||||
Reference in New Issue
Block a user