[Bugfix] Disable the dispatch_ffn_combine kernel in MTP path (#4751)

### What this PR does / why we need it? This PR is to fix a smoking test failure. Adjust mtp_proposer and model_runner_v1 to route MTP decoding through the non‑fused MoE implementation while keeping the overall inference flow unchanged. - vLLM version: v0.12.0 - vLLM main: ad32e3e19c Signed-off-by: mojave2 <chenchen145@huawei.com> Co-authored-by: Mengqing Cao <cmq0113@163.com>
2025-12-09 22:14:05 +08:00
parent cd1c69ee0b
commit 848419d1ba
2 changed files with 12 additions and 7 deletions
--- a/vllm_ascend/spec_decode/mtp_proposer.py
+++ b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -27,7 +27,8 @@ from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 from vllm.v1.utils import CpuGpuBuffer
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch

-from vllm_ascend.ascend_forward_context import set_ascend_forward_context
+from vllm_ascend.ascend_forward_context import (MoECommType,
+                                                set_ascend_forward_context)
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.compilation.acl_graph import (ACLGraphWrapper,
@@ -237,6 +238,9 @@ class MtpProposer(Proposer):
        ) = self.runner._sync_metadata_across_dp(num_tokens, with_prefill)

        moe_comm_type = self.runner._select_moe_comm_method(num_tokens)
+        # TODO: remove this after moe_comm_type selection logic is finalized
+        moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
+                         == MoECommType.FUSED_ALLTOALL else moe_comm_type)

        if skip_attn:
            attn_metadata = None