[BugFix][v0.18.0] Gate recompute/balance/fused_mc2 by PD mode (#8374)

### What this PR does / why we need it? - Enforce recompute scheduler only in PD-disaggregated mode. - Enforce balance scheduling only in PD-mixed mode. - Enforce fused MC2 only on PD-disaggregated D-side (kv_consumer).  ### Does this PR introduce _any_ user-facing change? No  ### How was this patch tested? By CI.  --------- Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
2026-04-18 18:06:42 +08:00
parent c995a959e6
commit 363febb6cb
5 changed files with 288 additions and 3 deletions
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -86,6 +86,7 @@ class AscendConfig:
                )
        self.multistream_overlap_shared_expert = additional_config.get("multistream_overlap_shared_expert", False)
        self.multistream_overlap_gate = additional_config.get("multistream_overlap_gate", False)
+        # PD-disaggregated only (kv_producer/kv_consumer); invalid in PD-mixed (kv_both / no kv_transfer_config).
        self.recompute_scheduler_enable = additional_config.get("recompute_scheduler_enable", False)
        self.enable_cpu_binding = additional_config.get("enable_cpu_binding", True)

--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -93,7 +93,9 @@ env_variables: dict[str, Callable[[], Any]] = {
    "VLLM_ASCEND_ENABLE_CONTEXT_PARALLEL": lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_CONTEXT_PARALLEL", "0"))),
    # Whether to anbale dynamic EPLB
    "DYNAMIC_EPLB": lambda: os.getenv("DYNAMIC_EPLB", "false").lower(),
-    # Whether to enable fused mc2(`dispatch_gmm_combine_decode`/`dispatch_ffn_combine` operator)
+    # Whether to enable fused MC2 (`dispatch_gmm_combine_decode` / `dispatch_ffn_combine`).
+    # Platform validation: only PD-disaggregated **decode** instances (`kv_role='kv_consumer'`).
+    # Not supported in PD-mixed mode (`kv_both` or no kv_transfer_config) or on prefill nodes (`kv_producer`).
    # 0, or not set: default ALLTOALL and MC2 will be used.
    # 1: ALLTOALL and MC2 might be replaced by `dispatch_ffn_combine` operator.
    # `dispatch_ffn_combine` can be used only for moe layer with W8A8, EP<=32, non-mtp, non-dynamic-eplb.
@@ -101,7 +103,9 @@ env_variables: dict[str, Callable[[], Any]] = {
    # `dispatch_gmm_combine_decode` can be used only for **decode node** moe layer
    # with W8A8. And MTP layer must be W8A8.
    "VLLM_ASCEND_ENABLE_FUSED_MC2": lambda: int(os.getenv("VLLM_ASCEND_ENABLE_FUSED_MC2", "0")),
-    # Whether to anbale balance scheduling
+    # Whether to enable balance scheduling in the v1 scheduler.
+    # Platform validation: only PD-mixed mode (`kv_role='kv_both'` or no kv_transfer_config).
+    # Not supported in PD-disaggregated mode (`kv_role='kv_producer'` or `kv_role='kv_consumer'`).
    "VLLM_ASCEND_BALANCE_SCHEDULING": lambda: bool(int(os.getenv("VLLM_ASCEND_BALANCE_SCHEDULING", "0"))),
    # use fused op transpose_kv_cache_by_block, default is True
    "VLLM_ASCEND_FUSION_OP_TRANSPOSE_KV_CACHE_BY_BLOCK": lambda: bool(
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -448,7 +448,36 @@ class NPUPlatform(Platform):
        if get_ascend_device_type() != AscendDeviceType._310P:
            compilation_config.custom_ops = ["all"]

+        if envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2:
+            kv_transfer_config = vllm_config.kv_transfer_config
+            kv_role = getattr(kv_transfer_config, "kv_role", None)
+            if kv_transfer_config is None or kv_role != "kv_consumer":
+                raise ValueError(
+                    "VLLM_ASCEND_ENABLE_FUSED_MC2 (fused mc2) only supports PD-disaggregated "
+                    "decode nodes (D-side) with kv_role='kv_consumer'. It is not supported "
+                    "in PD-mixed mode (no kv_transfer_config / kv_role='kv_both') nor on "
+                    "prefill nodes (P-side) with kv_role='kv_producer'."
+                )
+
+        if envs_ascend.VLLM_ASCEND_BALANCE_SCHEDULING:
+            kv_transfer_config = vllm_config.kv_transfer_config
+            kv_role = getattr(kv_transfer_config, "kv_role", None)
+            if kv_transfer_config is not None and kv_role != "kv_both":
+                raise ValueError(
+                    "VLLM_ASCEND_BALANCE_SCHEDULING (balance scheduling) only supports PD-mixed mode "
+                    "(kv_role='kv_both' or no kv_transfer_config), and is not supported in "
+                    "PD-disaggregated mode (kv_role='kv_producer'/'kv_consumer')."
+                )
+
        if ascend_config.recompute_scheduler_enable:
+            kv_transfer_config = vllm_config.kv_transfer_config
+            kv_role = getattr(kv_transfer_config, "kv_role", None)
+            if kv_transfer_config is None or kv_role == "kv_both":
+                raise ValueError(
+                    "recompute_scheduler_enable can only be enabled in PD-disaggregated mode "
+                    "(kv_role='kv_producer' or 'kv_consumer'), and is not supported in PD-mixed mode."
+                )
+
            from vllm_ascend.core.recompute_scheduler import RecomputeSchedulerConfig

            recompute_scheduler_config = RecomputeSchedulerConfig.initialize_from_config(vllm_config)