[Aclgraph][DP] Fix dp dummy run not in aclgraph error (#3208)
### What this PR does / why we need it?
When running DP in a non-equilibrium scenario, i.e. when some DP groups are only executing `dummy_run`, we need to make sure they run in the same mode as the other DP groups, thus improving performance in the DP scenario.

### How was this patch tested?
Tested by adding a log in `_dummy_run`; see the logging sketch after the diff below.

- vLLM version: v0.10.2
- vLLM main: https://github.com/vllm-project/vllm/commit/releases/v0.11.0

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
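For context, a minimal, self-contained sketch of the idea (the `Mode` and `ModelRunner` names below are illustrative stand-ins, not vllm-ascend APIs): a DP rank with no real requests still has to step in lockstep with the busy ranks, so its dummy batch must take the same execution path (captured aclgraph vs. eager) that the busy ranks use.

```python
from enum import Enum

class Mode(Enum):
    EAGER = 0
    FULL_DECODE_ONLY = 1  # full graph capture for uniform decode steps

class ModelRunner:
    def __init__(self, cudagraph_mode: Mode):
        self.cudagraph_mode = cudagraph_mode

    def dummy_run(self, num_tokens: int, uniform_decode: bool,
                  force_attention: bool) -> None:
        # Before the fix, the dummy run always took the eager path, so an
        # idle DP rank diverged from ranks replaying captured graphs.
        # After the fix, the captured attention path is forced whenever
        # the engine is configured for full decode-only graphs.
        path = "aclgraph" if force_attention else "eager"
        print(f"dummy_run(num_tokens={num_tokens}, "
              f"uniform_decode={uniform_decode}) -> {path}")

runner = ModelRunner(Mode.FULL_DECODE_ONLY)
force_attention = runner.cudagraph_mode is Mode.FULL_DECODE_ONLY
runner.dummy_run(num_tokens=1, uniform_decode=True,
                 force_attention=force_attention)
```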
```diff
@@ -26,7 +26,7 @@ import torch_npu
 import vllm.envs as envs_vllm
 from torch_npu.op_plugin.atb._atb_ops import _register_atb_extensions
 from torch_npu.profiler import dynamic_profile as dp
-from vllm.config import VllmConfig
+from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized
@@ -356,7 +356,10 @@ class NPUWorker(WorkerBase):
         return self.model_runner.pin_lora(lora_id)
 
     def execute_dummy_batch(self) -> None:
-        self.model_runner._dummy_run(1)
+        force_attention = self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY
+        self.model_runner._dummy_run(num_tokens=1,
+                                     uniform_decode=True,
+                                     force_attention=force_attention)
 
     def _init_worker_distributed_environment(self) -> None:
         """Initialize the distributed environment."""
```
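As a rough illustration of the verification described above (hypothetical, not part of the actual patch), a log line at the top of `_dummy_run` lets one compare the mode chosen on each DP rank:

```python
import logging

logger = logging.getLogger(__name__)

# Hypothetical helper; the real _dummy_run in vllm-ascend takes more
# parameters than the three shown in the diff above.
def log_dummy_run(num_tokens: int, uniform_decode: bool,
                  force_attention: bool) -> None:
    # Comparing this line across DP ranks confirms that idle ranks now
    # take the same (aclgraph) path as the busy ranks.
    logger.info(
        "dummy_run: num_tokens=%d uniform_decode=%s force_attention=%s",
        num_tokens, uniform_decode, force_attention,
    )
```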