[Feat][BugFix] Support the Qwen3-Next-80B-A3B-Instruct quantized model & fix the NZ issue (#4245)

### What this PR does / why we need it?

Support the quantized Qwen3-Next-80B-A3B-Instruct model and fix the NZ issue. The Triton kernel doesn't support the NZ data format, so we skip converting the weight to NZ for the `conv1d` layer.

- vLLM version: v0.11.0
- vLLM main: 2918c1b49c

---------

Signed-off-by: IncSec <1790766300@qq.com>
2025-11-21 10:42:56 +08:00
parent cbb27feaf2
commit 5a4e8cdeba
10 changed files with 39 additions and 30 deletions
--- a/tests/e2e/multicard/test_qwen3_next.py
+++ b/tests/e2e/multicard/test_qwen3_next.py
@@ -20,6 +20,12 @@

 Run `pytest tests/e2e/multicard/test_qwen3_next.py`.
 """
+
+import os
+from unittest.mock import patch
+
+from modelscope import snapshot_download  # type: ignore
+
 from tests.e2e.conftest import VllmRunner


@@ -106,3 +112,23 @@ def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
            print(f"spec_output: {spec_output[1]}")

    assert matches > int(0.66 * len(ref_outputs))
+
+
+# TODO: will conduct accuracy verification after the subsequent version becomes stable
+@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
+def test_models_distributed_Qwen3_NEXT_W8A8DYNAMIC_WITH_EP():
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    max_tokens = 5
+    with VllmRunner(
+            snapshot_download(
+                "vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8-Pruning"),
+            max_model_len=4096,
+            tensor_parallel_size=2,
+            gpu_memory_utilization=0.4,
+            max_num_seqs=1,
+            enable_expert_parallel=True,
+            quantization="ascend",
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)