Qwen3.5 MoE supports flashcomm v1 (#7644)
Cherry-picked from https://github.com/vllm-project/vllm-ascend/pull/7486.
### What this PR does / why we need it?
Multimodal models like Qwen3.5 MoE perform the embedding lookup in the model_runner, so when flashcomm v1 is enabled the first AllGather operation should be skipped.
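In effect, the change gates only the attention input projection of layer 0 in a VL model. Below is a minimal, hypothetical sketch of that condition as a standalone function; in the actual diff the same decision is computed inline from `extract_layer_index(self.layer.prefix)`, `is_vl_model()`, and the module prefix.

```python
# Hedged sketch of the gating this PR adds; not the actual vllm-ascend code.
# layer_index, vl_model, and prefix stand in for the values the real call
# site gets from extract_layer_index(...), is_vl_model(), and self.prefix.
def need_all_gather(layer_index: int, vl_model: bool, prefix: str) -> bool:
    # VL models already embedded in model_runner, so layer 0's attention
    # projection sees full-length hidden states: no first AllGather needed.
    return not (layer_index == 0 and vl_model and "attn" in prefix)

assert need_all_gather(0, True, "model.layers.0.self_attn.qkv_proj") is False
assert need_all_gather(1, True, "model.layers.1.self_attn.qkv_proj") is True
assert need_all_gather(0, False, "model.layers.0.self_attn.qkv_proj") is True
```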
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
- vLLM version: v0.18.0
- vLLM main: 8b6325758c
---------
Signed-off-by: Wangbingjie <wangbj1207@126.com>
Signed-off-by: wangbj127 <256472688+wangbj127@users.noreply.github.com>
```diff
@@ -97,6 +97,7 @@ class AscendGemmaRMSNorm(GemmaRMSNorm):
         import torch_npu
 
         if residual is not None:
+            residual = torch.ops.vllm.maybe_chunk_residual(x, residual)
             if enable_custom_op():
                 x, _, residual = torch.ops._C_ascend.npu_add_rms_norm_bias(
                     x, residual, 1.0 + self.weight, None, self.variance_epsilon
```
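The new `maybe_chunk_residual` call keeps the residual in step with `x`: under flashcomm v1 the activations entering the norm are a per-rank sequence chunk, while on the first layer the residual can still be full length. A rough sketch of that shape fix-up, assuming plain slicing over a residual already padded to a multiple of the TP size (the real op is registered under `torch.ops.vllm` and handles padding itself):

```python
import torch

# Hedged sketch of what a maybe_chunk_residual-style op must guarantee;
# tp_rank and tp_size are assumed to come from the TP process group.
def maybe_chunk_residual(x: torch.Tensor, residual: torch.Tensor,
                         tp_rank: int, tp_size: int) -> torch.Tensor:
    if residual.shape[0] == x.shape[0]:
        return residual  # shapes already agree; nothing to do
    # Full-length residual: keep this rank's chunk so the fused
    # add-RMSNorm below sees matching shapes.
    chunk = residual.shape[0] // tp_size
    return residual[tp_rank * chunk:(tp_rank + 1) * chunk]
```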
```diff
@@ -57,6 +57,7 @@ from vllm.distributed import (
     tensor_model_parallel_reduce_scatter,
 )
 from vllm.distributed.parallel_state import get_tp_group
+from vllm.model_executor.models.utils import extract_layer_index
 
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.ascend_forward_context import _EXTRA_CTX
@@ -74,6 +75,7 @@ from vllm_ascend.utils import (
     flashcomm2_enable,
     get_flashcomm2_reorgnized_batch_ids,
     get_weight_prefetch_method,
+    is_vl_model,
     matmul_allreduce_enable,
     mlp_tp_enable,
     oproj_tp_enable,
@@ -430,8 +432,8 @@ class SequenceColumnParallelOp(CustomColumnParallelOp):
 
         # Matrix multiply.
         assert self.quant_method is not None
 
-        input_ = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(input_, True)
+        need_all_gather = not (extract_layer_index(self.layer.prefix) == 0 and is_vl_model() and "attn" in self.prefix)
+        input_ = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(input_, label=need_all_gather)
         output_parallel = self.quant_method.apply(self.layer, input_, bias)
 
         if self.gather_output:
```
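The gate keys off the layer index parsed from the module prefix. vLLM's `extract_layer_index` does that parsing; here is an illustrative stand-in (the real helper additionally enforces that the path contains exactly one integer component):

```python
def extract_layer_index_sketch(prefix: str) -> int:
    # Illustrative stand-in for vllm.model_executor.models.utils.extract_layer_index:
    # pull the single integer component out of a dotted module path.
    digits = [int(p) for p in prefix.split(".") if p.isdigit()]
    assert len(digits) == 1, f"ambiguous layer index in {prefix!r}"
    return digits[0]

assert extract_layer_index_sketch("model.layers.0.self_attn.qkv_proj") == 0
assert extract_layer_index_sketch("model.layers.17.mlp.gate_up_proj") == 17
```

Routing the decision through `label=need_all_gather` rather than a hard-coded `True` keeps a single call site for both the gathered and the skipped path.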
```diff
@@ -17,7 +17,7 @@ from vllm_ascend.ascend_forward_context import _EXTRA_CTX, MoECommType
 from vllm_ascend.ops.rotary_embedding import rope_forward_oot
 from vllm_ascend.ops.triton.muls_add import muls_add_triton
 from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
-from vllm_ascend.utils import enable_sp_by_pass, npu_stream_switch, prefetch_stream
+from vllm_ascend.utils import enable_sp_by_pass, is_vl_model, npu_stream_switch, prefetch_stream
 
 
 def _maybe_chunk_residual_impl(x: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
@@ -80,7 +80,7 @@ def _maybe_pad_and_reduce_impl(x: torch.Tensor, is_ep_comm: bool = False) -> torch.Tensor:
         enable_sp_by_pass() and is_ep_comm
     )
 
-    if not flash_comm_v1_enabled:
+    if not flash_comm_v1_enabled or (forward_context.is_draft_model and is_vl_model()):
         return tensor_model_parallel_all_reduce(x)
 
     dp_metadata = forward_context.dp_metadata
```
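`_maybe_pad_and_reduce_impl` is the write-side counterpart of the gather: with flashcomm v1 it pads the sequence dimension to a multiple of the TP world size and reduce-scatters, so each rank keeps only its chunk of the summed activations; the new clause falls back to a plain all-reduce for the draft model of a VL pipeline. A simplified sketch of the two branches using raw `torch.distributed` collectives (the real implementation goes through vLLM's TP-group wrappers and dp_metadata-driven padding):

```python
import torch
import torch.distributed as dist
import torch.nn.functional as F

# Hedged sketch of the branch structure only; flash_comm_v1 and
# draft_of_vl_model stand in for the flags the forward context provides.
def maybe_pad_and_reduce(x: torch.Tensor, flash_comm_v1: bool,
                         draft_of_vl_model: bool, tp_size: int) -> torch.Tensor:
    if not flash_comm_v1 or draft_of_vl_model:
        dist.all_reduce(x)  # fallback: classic TP all-reduce
        return x
    # flashcomm v1: pad the sequence dim, then reduce-scatter so each
    # rank keeps 1/tp_size of the summed activations.
    pad = (-x.shape[0]) % tp_size
    if pad:
        x = F.pad(x, (0, 0, 0, pad))  # pad rows of a (seq, hidden) tensor
    out = x.new_empty((x.shape[0] // tp_size, *x.shape[1:]))
    dist.reduce_scatter_tensor(out, x)
    return out
```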