From c47371c1af88368c8deebd629c78b6229e0cdcd5 Mon Sep 17 00:00:00 2001 From: wangxiaoteng888 <56506195+wangxiaoteng888@users.noreply.github.com> Date: Thu, 23 Apr 2026 19:44:07 +0800 Subject: [PATCH] [BugFix]Backport validate pd mode feature gates no fused mc2 v0.18.0 clean (#8583) ### What this PR does / why we need it? Backport of #8582 ("validate pd mode feature gates") for v0.18.0, without the fused MC2 gate. This patch removes the `VLLM_ASCEND_ENABLE_FUSED_MC2` PD-mode check (which raised `ValueError` unless `kv_role='kv_consumer'`) from `NPUPlatform.check_and_update_config` in `vllm_ascend/platform.py`, the four unit tests that covered it in `tests/ut/test_platform.py`, and the two related comment lines in `vllm_ascend/envs.py`. backport #8582 --------- Signed-off-by: wangxiaoteng --- tests/ut/test_platform.py | 128 -------------------------------------- vllm_ascend/envs.py | 2 - vllm_ascend/platform.py | 11 ---- 3 files changed, 141 deletions(-) diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index a5f73ea9..4820d868 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -541,134 +541,6 @@ class TestNPUPlatform(TestBase): ): self.platform.check_and_update_config(vllm_config) - @patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization") - @patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3) - @patch("vllm_ascend.ascend_config.init_ascend_config") - @patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config") - def test_check_and_update_config_fused_mc2_rejects_pd_mixed_no_kv_transfer( - self, mock_init_recompute, mock_init_ascend, mock_soc_version, mock_auto_detect - ): - mock_ascend_config = TestNPUPlatform.mock_vllm_ascend_config() - mock_ascend_config.recompute_scheduler_enable = False - mock_ascend_config.enable_mc2_hierarchy_comm = False - mock_init_ascend.return_value = mock_ascend_config - - vllm_config = TestNPUPlatform.mock_vllm_config() - vllm_config.kv_transfer_config = None - vllm_config.parallel_config.decode_context_parallel_size = 1 - vllm_config.parallel_config.prefill_context_parallel_size = 1 - vllm_config.parallel_config.tensor_parallel_size = 1 - vllm_config.scheduler_config = MagicMock() - mock_init_recompute.return_value = MagicMock() - - from vllm_ascend import platform 
- - importlib.reload(platform) - self.platform = platform.NPUPlatform() - - with patch("vllm_ascend.platform.envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2", 1, create=True): - with pytest.raises(ValueError, match=r"VLLM_ASCEND_ENABLE_FUSED_MC2.*kv_role='kv_consumer'.*PD-mixed"): - with patch.object(platform.NPUPlatform, "_fix_incompatible_config"): - self.platform.check_and_update_config(vllm_config) - - @patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization") - @patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3) - @patch("vllm_ascend.ascend_config.init_ascend_config") - @patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config") - def test_check_and_update_config_fused_mc2_rejects_pd_mixed_kv_both( - self, mock_init_recompute, mock_init_ascend, mock_soc_version, mock_auto_detect - ): - mock_ascend_config = TestNPUPlatform.mock_vllm_ascend_config() - mock_ascend_config.recompute_scheduler_enable = False - mock_ascend_config.enable_mc2_hierarchy_comm = False - mock_init_ascend.return_value = mock_ascend_config - - vllm_config = TestNPUPlatform.mock_vllm_config() - vllm_config.kv_transfer_config = MagicMock(kv_role="kv_both", engine_id="engine0") - vllm_config.parallel_config.decode_context_parallel_size = 1 - vllm_config.parallel_config.prefill_context_parallel_size = 1 - vllm_config.parallel_config.tensor_parallel_size = 1 - vllm_config.scheduler_config = MagicMock() - mock_init_recompute.return_value = MagicMock() - - from vllm_ascend import platform - - importlib.reload(platform) - self.platform = platform.NPUPlatform() - - with patch("vllm_ascend.platform.envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2", 1, create=True): - with pytest.raises(ValueError, match=r"VLLM_ASCEND_ENABLE_FUSED_MC2.*kv_role='kv_consumer'.*kv_role='kv_both'"): - with ( - patch.object(platform.NPUPlatform, "_fix_incompatible_config"), - patch.object(platform, "check_kv_extra_config"), - ): - 
self.platform.check_and_update_config(vllm_config) - - @patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization") - @patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3) - @patch("vllm_ascend.ascend_config.init_ascend_config") - @patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config") - def test_check_and_update_config_fused_mc2_rejects_pd_disaggregated_kv_producer( - self, mock_init_recompute, mock_init_ascend, mock_soc_version, mock_auto_detect - ): - mock_ascend_config = TestNPUPlatform.mock_vllm_ascend_config() - mock_ascend_config.recompute_scheduler_enable = False - mock_ascend_config.enable_mc2_hierarchy_comm = False - mock_init_ascend.return_value = mock_ascend_config - - vllm_config = TestNPUPlatform.mock_vllm_config() - vllm_config.kv_transfer_config = MagicMock(kv_role="kv_producer", engine_id="engine0") - vllm_config.parallel_config.decode_context_parallel_size = 1 - vllm_config.parallel_config.prefill_context_parallel_size = 1 - vllm_config.parallel_config.tensor_parallel_size = 1 - vllm_config.scheduler_config = MagicMock() - mock_init_recompute.return_value = MagicMock() - - from vllm_ascend import platform - - importlib.reload(platform) - self.platform = platform.NPUPlatform() - - with patch("vllm_ascend.platform.envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2", 1, create=True): - with pytest.raises(ValueError, match=r"VLLM_ASCEND_ENABLE_FUSED_MC2.*kv_role='kv_consumer'.*kv_role='kv_producer'"): - with ( - patch.object(platform.NPUPlatform, "_fix_incompatible_config"), - patch.object(platform, "check_kv_extra_config"), - ): - self.platform.check_and_update_config(vllm_config) - - @patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization") - @patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3) - @patch("vllm_ascend.ascend_config.init_ascend_config") - 
@patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config") - def test_check_and_update_config_fused_mc2_allows_pd_disaggregated_kv_consumer( - self, mock_init_recompute, mock_init_ascend, mock_soc_version, mock_auto_detect - ): - mock_ascend_config = TestNPUPlatform.mock_vllm_ascend_config() - mock_ascend_config.recompute_scheduler_enable = False - mock_ascend_config.enable_mc2_hierarchy_comm = False - mock_init_ascend.return_value = mock_ascend_config - - vllm_config = TestNPUPlatform.mock_vllm_config() - vllm_config.kv_transfer_config = MagicMock(kv_role="kv_consumer", engine_id="engine0") - vllm_config.parallel_config.decode_context_parallel_size = 1 - vllm_config.parallel_config.prefill_context_parallel_size = 1 - vllm_config.parallel_config.tensor_parallel_size = 1 - vllm_config.scheduler_config = MagicMock() - mock_init_recompute.return_value = MagicMock() - - from vllm_ascend import platform - - importlib.reload(platform) - self.platform = platform.NPUPlatform() - - with patch("vllm_ascend.platform.envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2", 1, create=True): - with ( - patch.object(platform.NPUPlatform, "_fix_incompatible_config"), - patch.object(platform, "check_kv_extra_config"), - ): - self.platform.check_and_update_config(vllm_config) - def test_update_block_size_for_backend_preserves_hybrid_block_size(self): vllm_config = TestNPUPlatform.mock_vllm_config() vllm_config.model_config.is_hybrid = True diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py index de408d36..30151c7e 100644 --- a/vllm_ascend/envs.py +++ b/vllm_ascend/envs.py @@ -94,8 +94,6 @@ env_variables: dict[str, Callable[[], Any]] = { # Whether to anbale dynamic EPLB "DYNAMIC_EPLB": lambda: os.getenv("DYNAMIC_EPLB", "false").lower(), # Whether to enable fused MC2 (`dispatch_gmm_combine_decode` / `dispatch_ffn_combine`). - # Platform validation: only PD-disaggregated **decode** instances (`kv_role='kv_consumer'`). 
- # Not supported in PD-mixed mode (`kv_both` or no kv_transfer_config) or on prefill nodes (`kv_producer`). # 0, or not set: default ALLTOALL and MC2 will be used. # 1: ALLTOALL and MC2 might be replaced by `dispatch_ffn_combine` operator. # `dispatch_ffn_combine` can be used only for moe layer with W8A8, EP<=32, non-mtp, non-dynamic-eplb. diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index f15f1a0a..831df9f2 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -448,17 +448,6 @@ class NPUPlatform(Platform): if get_ascend_device_type() != AscendDeviceType._310P: compilation_config.custom_ops = ["all"] - if envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2: - kv_transfer_config = vllm_config.kv_transfer_config - kv_role = getattr(kv_transfer_config, "kv_role", None) - if kv_transfer_config is None or kv_role != "kv_consumer": - raise ValueError( - "VLLM_ASCEND_ENABLE_FUSED_MC2 (fused mc2) only supports PD-disaggregated " - "decode nodes (D-side) with kv_role='kv_consumer'. It is not supported " - "in PD-mixed mode (no kv_transfer_config / kv_role='kv_both') nor on " - "prefill nodes (P-side) with kv_role='kv_producer'." - ) - if envs_ascend.VLLM_ASCEND_BALANCE_SCHEDULING: kv_transfer_config = vllm_config.kv_transfer_config kv_role = getattr(kv_transfer_config, "kv_role", None)