[main][Bugfix] Fix failure to load qwen3_moe quantized weights (#2219)

### What this PR does / why we need it?

Fixes an issue, introduced by #1994, where `qwen3_moe` quantized weights could not be loaded.
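
For context, a minimal sketch of the loading path this PR fixes, using vLLM's offline `LLM` API. The model ID, `quantization="ascend"`, and the greedy sampling settings are taken from the new test below; the exact reproduction steps are an assumption, not part of this PR:

```python
from vllm import LLM, SamplingParams

# Hedged sketch: load a W8A8-quantized Qwen3 MoE checkpoint on Ascend.
# Before this fix, weight loading for quantized qwen3_moe models failed.
llm = LLM(
    model="vllm-ascend/Qwen3-30B-A3B-W8A8",  # checkpoint used by the new e2e test
    quantization="ascend",
    tensor_parallel_size=2,
    max_model_len=8192,
)

# Greedy decoding with a short output, mirroring the test.
params = SamplingParams(temperature=0, max_tokens=5)
print(llm.generate(["Hello, my name is"], params)[0].outputs[0].text)
```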

### Does this PR introduce _any_ user-facing change?

None

### How was this patch tested?

Added a test case covering a `qwen3_moe` W8A8 quantized model in
`tests/e2e/multicard/test_qwen3_moe.py`.

- vLLM version: v0.10.0
- vLLM main: c494f96fbc

---------

Signed-off-by: zhoux77899 <zhouxiang100@huawei.com>
Author: Ruri
Date: 2025-08-06 09:08:36 +08:00
Committed by: GitHub
Parent: 54ace9e12b
Commit: e31b31f9c3
2 changed files with 53 additions and 5 deletions

tests/e2e/multicard/test_qwen3_moe.py

@@ -18,9 +18,11 @@
#
"""Compare the short outputs of HF and vLLM when using greedy sampling.
Run `pytest tests/test_offline_inference.py`.
Run `pytest tests/e2e/multicard/test_qwen3_moe.py`.
"""
from modelscope import snapshot_download # type: ignore
from tests.e2e.conftest import VllmRunner
@@ -53,3 +55,20 @@ def test_models_distributed_Qwen3_MOE_TP2_WITH_EP():
            distributed_executor_backend="mp",
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)


def test_models_distributed_Qwen3_MOE_W8A8():
    example_prompts = [
        "Hello, my name is",
    ]
    dtype = "auto"
    max_tokens = 5
    with VllmRunner(
            snapshot_download("vllm-ascend/Qwen3-30B-A3B-W8A8"),
            max_model_len=8192,
            dtype=dtype,
            tensor_parallel_size=2,
            quantization="ascend",
            enforce_eager=False,
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
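
The new case can be run on its own with standard pytest name filtering, e.g. `pytest tests/e2e/multicard/test_qwen3_moe.py -k W8A8` (the `-k` expression is an assumption, matching the new test's name).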