[Bugfix] Qwen3Next support FlashComm1 (#6830)

### What this PR does / why we need it?

Support FlashComm1 for Qwen3-Next. Fix some padding problems in Sequence Parallel (SP) and resolve precision problems in shared_out when FlashComm1 is enabled.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

CI

- vLLM version: v0.15.0
- vLLM main: 83b47f67b1

---------

Signed-off-by: zhaojiangjiang <zhaojiangjiang1@h-partners.com>
Co-authored-by: zhaojiangjiang <zhaojiangjiang1@h-partners.com>
2026-03-06 17:14:08 +08:00
parent a2696006d1
commit a51d6366b9
4 changed files with 63 additions and 8 deletions
--- a/tests/e2e/multicard/4-cards/test_qwen3_next.py
+++ b/tests/e2e/multicard/4-cards/test_qwen3_next.py
@@ -73,3 +73,38 @@ def test_qwen3_next_w8a8dynamic_distributed_tp4_ep():
            quantization="ascend",
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
+@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1", "HCCL_BUFFSIZE": "1024"})
+def test_qwen3_next_distributed_mp_flash_comm_tp4():
+    """Smoke-test greedy decoding on Qwen3-Next (TP=4, eager) with the FlashComm1 env flag set."""
+    prompts = ["Hello, my name is"] * 4
+    runner_kwargs = dict(
+        tensor_parallel_size=4,
+        max_model_len=4096,
+        gpu_memory_utilization=0.7,
+        distributed_executor_backend="mp",
+        enable_expert_parallel=True,
+        enforce_eager=True,
+    )
+    # No output is asserted: the test passes when 5 tokens per prompt generate without error.
+    with VllmRunner("Qwen/Qwen3-Next-80B-A3B-Instruct", **runner_kwargs) as runner:
+        runner.generate_greedy(prompts, 5)
+        del runner
+
+
+@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
+def test_qwen3_next_distributed_mp_graph_mode_tp4():
+    """Smoke-test greedy decoding on Qwen3-Next (TP=4) with enforce_eager=False; nothing asserted."""
+    prompts = ["Hello, my name is"] * 4
+    runner_kwargs = dict(
+        tensor_parallel_size=4,
+        max_model_len=4096,
+        gpu_memory_utilization=0.7,
+        distributed_executor_backend="mp",
+        enable_expert_parallel=True,
+        enforce_eager=False,
+    )
+    with VllmRunner("Qwen/Qwen3-Next-80B-A3B-Instruct", **runner_kwargs) as runner:
+        runner.generate_greedy(prompts, 5)
+        del runner