From 22005c64c1f8ab5260eeb164bd8cc385a24c64bb Mon Sep 17 00:00:00 2001 From: realliujiaxu Date: Thu, 6 Nov 2025 20:02:03 +0800 Subject: [PATCH] [Bugfix] Add constraints for sequence parallelism (#4014) ### What this PR does / why we need it? Add constraints for sequence parallelism for unsupported scenarios: 1. tp_size > 1 2. enable_expert_parallel must be True for MoE model ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0 - vLLM main: https://github.com/vllm-project/vllm/commit/83f478bb19489b41e9d208b47b4bb5a95ac171ac --------- Signed-off-by: realliujiaxu --- vllm_ascend/ascend_forward_context.py | 4 +--- vllm_ascend/utils.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py index 80e6541e..a0f1edd5 100644 --- a/vllm_ascend/ascend_forward_context.py +++ b/vllm_ascend/ascend_forward_context.py @@ -115,12 +115,10 @@ def set_ascend_forward_context( # the performance may degrade due to the switching of communication methods. mmrs_fusion = True if is_moe_model(vllm_config): - sp_enabled = enable_sp(vllm_config) and \ - tp_world_size > 1 and num_tokens is not None + sp_enabled = enable_sp(vllm_config) and num_tokens is not None mmrs_fusion = False else: sp_enabled = enable_sp(vllm_config) and \ - tp_world_size > 1 and \ num_tokens is not None and num_tokens > 1000 forward_context.mmrs_fusion = mmrs_fusion diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index e1afd24a..46e80606 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -659,6 +659,17 @@ def enable_sp(vllm_config=None) -> bool: # We retain the env VLLM_ASCEND_ENABLE_FLASHCOMM here for backward compatibility. 
or bool(int(os.getenv("VLLM_ASCEND_ENABLE_FLASHCOMM", '0')))) + if not _ENABLE_SP: + return _ENABLE_SP + + assert vllm_config.parallel_config.tensor_parallel_size > 1, \ + "Flash Comm v1 (Sequence Parallelism) is only supported when tp_size > 1." + + assert ( + not is_moe_model(vllm_config) + or vllm_config.parallel_config.enable_expert_parallel + ), "Flash Comm v1 (Sequence Parallelism) requires enable_expert_parallel=True for MoE models." + return _ENABLE_SP