From a9e78a329988c4fbe5382ef03b08d6413623d248 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Wed, 27 Aug 2025 09:30:25 +0800 Subject: [PATCH] [Aclgraph] Update compilation config in `check_and_update_config` (#2540) ### What this PR does / why we need it? This PR updates the compilation config in `check_and_update_config`: we use `compilation_config.level` to update `compilation_config.cudagraph_mode` to ensure the config is correct. It also sets `compilation_config.cudagraph_num_of_warmups = 1` when V1 is enabled, because this is also used in torchair graph mode. This also fixes https://github.com/vllm-project/vllm-ascend/issues/2523. Fix the bug that `aclgraphmode` was always `NONE` while running forward in aclgraph mode. ### How was this patch tested? CI passed with newly added/existing tests. - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/f58675bfb36b67cdbca4d2356a2f580e7a706ec3 --------- Signed-off-by: MengqingCao --- tests/ut/test_platform.py | 71 +++++++++++++++++++++++-- vllm_ascend/platform.py | 75 ++++++++++++++++----------- vllm_ascend/worker/model_runner_v1.py | 6 +++ 3 files changed, 118 insertions(+), 34 deletions(-) diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index 61efd00..551f1d0 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -3,11 +3,11 @@ import unittest from datetime import timedelta from unittest.mock import MagicMock, patch -import pytest import torch from torch.distributed import ProcessGroup from torch.distributed.distributed_c10d import PrefixStore from vllm.config import CompilationLevel +from vllm.config.compilation import CUDAGraphMode from vllm.platforms import PlatformEnum from tests.ut.base import TestBase @@ -28,6 +28,7 @@ class TestNPUPlatform(TestBase): self.mock_vllm_config.scheduler_config = MagicMock() self.mock_vllm_config.speculative_config = None self.mock_vllm_config.compilation_config.pass_config.enable_sequence_parallelism = False + 
self.mock_vllm_config.compilation_config.cudagraph_mode = None self.mock_ascend_config = MagicMock() self.mock_ascend_config.torchair_graph_config.enabled = False @@ -269,8 +270,6 @@ class TestNPUPlatform(TestBase): self.platform.check_and_update_config(self.mock_vllm_config) self.assertTrue("Model config is missing" in cm.output[0]) - @pytest.mark.skip( - reason="TODO: revert me when the occasional failed is fixed") @patch("vllm_ascend.utils.is_310p", return_value=False) @patch("vllm_ascend.ascend_config.check_ascend_config") @patch("vllm_ascend.ascend_config.init_ascend_config") @@ -290,6 +289,10 @@ class TestNPUPlatform(TestBase): self.mock_vllm_config.compilation_config.level, CompilationLevel.NO_COMPILATION, ) + self.assertEqual( + self.mock_vllm_config.compilation_config.cudagraph_mode, + CUDAGraphMode.NONE, + ) @patch("vllm_ascend.utils.is_310p", return_value=False) @patch("vllm_ascend.ascend_config.check_ascend_config") @@ -310,6 +313,64 @@ class TestNPUPlatform(TestBase): self.mock_vllm_config.compilation_config.level, CompilationLevel.NO_COMPILATION, ) + self.assertEqual( + self.mock_vllm_config.compilation_config.cudagraph_mode, + CUDAGraphMode.NONE, + ) + + @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch("vllm_ascend.ascend_config.check_ascend_config") + @patch("vllm_ascend.ascend_config.init_ascend_config") + def test_check_and_update_config_unsupported_cudagraph_mode( + self, mock_init_ascend, mock_check_ascend, mock_is_310p): + mock_init_ascend.return_value = self.mock_ascend_config + self.mock_vllm_config.model_config.enforce_eager = False + self.mock_vllm_config.compilation_config.cudagraph_mode = CUDAGraphMode.FULL + + with self.assertLogs(logger="vllm", level="INFO") as cm: + from vllm_ascend import platform + + importlib.reload(platform) + self.platform.check_and_update_config(self.mock_vllm_config) + self.assertTrue( + "cudagraph_mode is not support on NPU. 
falling back to NONE" in + cm.output[0]) + self.assertEqual( + self.mock_vllm_config.compilation_config.level, + CompilationLevel.NO_COMPILATION, + ) + self.assertEqual( + self.mock_vllm_config.compilation_config.cudagraph_mode, + CUDAGraphMode.NONE, + ) + + @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch("vllm_ascend.ascend_config.check_ascend_config") + @patch("vllm_ascend.ascend_config.init_ascend_config") + def test_check_and_update_config_disable_aclgraph_when_ray_enabled( + self, mock_init_ascend, mock_check_ascend, mock_is_310p): + mock_init_ascend.return_value = self.mock_ascend_config + self.mock_vllm_config.model_config.enforce_eager = False + self.mock_vllm_config.compilation_config.level = CompilationLevel.PIECEWISE + self.mock_vllm_config.parallel_config.distributed_executor_backend = "ray" + + with self.assertLogs(logger="vllm", level="WARNING") as cm: + from vllm_ascend import platform + + importlib.reload(platform) + self.platform.check_and_update_config(self.mock_vllm_config) + print(30 * "=", f"cm.output: {cm.output}") + self.assertTrue( + "Ray distributed executor backend is not compatible with ACL Graph mode" + in cm.output[0]) + self.assertEqual( + self.mock_vllm_config.compilation_config.level, + CompilationLevel.NO_COMPILATION, + ) + self.assertEqual( + self.mock_vllm_config.compilation_config.cudagraph_mode, + CUDAGraphMode.NONE, + ) @patch("vllm_ascend.utils.is_310p", return_value=False) @patch("vllm_ascend.ascend_config.check_ascend_config") @@ -331,6 +392,10 @@ class TestNPUPlatform(TestBase): self.mock_vllm_config.compilation_config.level, CompilationLevel.NO_COMPILATION, ) + self.assertEqual( + self.mock_vllm_config.compilation_config.cudagraph_mode, + CUDAGraphMode.NONE, + ) @patch("vllm_ascend.utils.is_310p", return_value=False) @patch("vllm_ascend.ascend_config.check_ascend_config") diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 2808768..b8e1039 100644 --- a/vllm_ascend/platform.py +++ 
b/vllm_ascend/platform.py @@ -140,52 +140,65 @@ class NPUPlatform(Platform): check_ascend_config(vllm_config, enforce_eager) from vllm.config.compilation import CUDAGraphMode - - # TODO(cmq): update the post init in vllmconfig - # if cudagraph_mode is not explicitly set by users, set default value - if envs_vllm.VLLM_USE_V1 and compilation_config.level \ - == CompilationLevel.PIECEWISE: - compilation_config.cudagraph_mode = \ - CUDAGraphMode.PIECEWISE - else: - compilation_config.cudagraph_mode = CUDAGraphMode.NONE - vllm_config._set_cudagraph_sizes() - - # TODO(cmq): update the compilation level config to be determined by CUDAGraphMode - if enforce_eager or compilation_config.level == CompilationLevel.NO_COMPILATION: + if enforce_eager: logger.info("Compilation disabled, using eager mode by default") compilation_config.level = CompilationLevel.NO_COMPILATION - compilation_config.cudagraph_mode = CUDAGraphMode.NONE - elif compilation_config.level != CompilationLevel.PIECEWISE: - logger.warning( - "NPU does not support %s compilation level. Setting level to NO_COMPILATION", - compilation_config.level) - compilation_config.level = CompilationLevel.NO_COMPILATION - compilation_config.cudagraph_mode = CUDAGraphMode.NONE - elif ascend_config.torchair_graph_config.enabled: + + compilation_config.cudagraph_num_of_warmups = 1 + + if compilation_config.cudagraph_mode is None: + # if cudagraph_mode is not explicitly set by users, set default value + if compilation_config.level == CompilationLevel.PIECEWISE: + compilation_config.cudagraph_mode = \ + CUDAGraphMode.PIECEWISE + elif compilation_config.level not in [ + CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE + ]: + logger.warning( + "NPU does not support %s compilation level. 
Setting CUDAGraphMode to NONE", + compilation_config.level) + compilation_config.cudagraph_mode = CUDAGraphMode.NONE + else: + logger.warning( + "compilation_config.level = CompilationLevel.NO_COMPILATION is set, Setting CUDAGraphMode to NONE" + ) + compilation_config.cudagraph_mode = CUDAGraphMode.NONE + + # set CUDAGraphMode to None when torchair is enabled, no matter what compilation_config.level is. + if ascend_config.torchair_graph_config.enabled: logger.info( - "Torchair compilation enabled on NPU. Setting level to NO_COMPILATION" + "Torchair compilation enabled on NPU. Setting CUDAGraphMode to NONE" ) - compilation_config.level = CompilationLevel.NO_COMPILATION compilation_config.cudagraph_mode = CUDAGraphMode.NONE - elif parallel_config.distributed_executor_backend == "ray": + + if parallel_config.distributed_executor_backend == "ray": logger.warning( "Ray distributed executor backend is not compatible with ACL Graph mode " - "right now. Setting level to NO_COMPILATION") - compilation_config.level = CompilationLevel.NO_COMPILATION + "right now. Setting CUDAGraphMode to NONE") compilation_config.cudagraph_mode = CUDAGraphMode.NONE - else: + + # set cudagraph sizes before extending `compilation_config.splitting_ops` + vllm_config._set_cudagraph_sizes() + + if compilation_config.cudagraph_mode == CUDAGraphMode.NONE: + compilation_config.level = CompilationLevel.NO_COMPILATION + elif compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE: logger.info( "PIECEWISE compilation enabled on NPU. 
use_inductor not supported - " "using only ACL Graph mode") - if envs_vllm.VLLM_USE_V1 and \ - compilation_config.level == CompilationLevel.PIECEWISE: - compilation_config.set_splitting_ops_for_v1() + assert compilation_config.level == CompilationLevel.PIECEWISE, \ + "When enabling piecewise aclgraph, please make sure compilation_config.level == CompilationLevel.PIECEWISE and compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE" + compilation_config.set_splitting_ops_for_v1() compilation_config.use_inductor = False compilation_config.splitting_ops.extend( ["vllm.unified_ascend_attention_with_output"]) update_aclgraph_sizes(vllm_config) - compilation_config.cudagraph_num_of_warmups = 1 + else: + logger.info( + "%s cudagraph_mode is not support on NPU. falling back to NONE", + compilation_config.cudagraph_mode) + compilation_config.cudagraph_mode = CUDAGraphMode.NONE + compilation_config.level = CompilationLevel.NO_COMPILATION if parallel_config and parallel_config.worker_cls == "auto": if ascend_config.torchair_graph_config.enabled: diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 8fe840a..75c91d5 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1660,6 +1660,10 @@ class NPUModelRunner(LoRAModelRunnerMixin): moe_comm_method = (self.moe_comm_method if num_input_tokens <= self.mc2_tokens_capacity else self.fallback_moe_comm_method) + batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens, + uniform_decode=False) + aclgraph_runtime_mode, batch_descriptor = \ + self.aclgraph_dispatcher.dispatch(batch_descriptor) # Run forward pass with ProfileExecuteDuration().capture_async("forward"): @@ -1671,6 +1675,8 @@ class NPUModelRunner(LoRAModelRunnerMixin): with_prefill=self.with_prefill, reserved_mc2_mask=self.reserved_mc2_mask, moe_comm_method=moe_comm_method, + aclgraph_runtime_mode=aclgraph_runtime_mode, + batch_descriptor=batch_descriptor, 
num_actual_tokens=scheduler_output. total_num_scheduled_tokens): self.maybe_setup_kv_connector(scheduler_output)