[BugFix][Fusion] Fix graph fusion failure (#5253)

The vLLM pull request (https://github.com/vllm-project/vllm/pull/24252)
currently causes operator fusion to fail. This was previously worked
around by patching the backend. The root cause has now been identified,
and this pull request resolves the problem.

- vLLM version: release/v0.13.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: wxsIcey <1790571317@qq.com>
Author: Icey
Date: 2026-01-05 17:49:09 +08:00
Committed by: GitHub
Parent: 4a3663327b
Commit: e7b623b363
9 changed files with 36 additions and 267 deletions


@@ -27,7 +27,7 @@ import torch_npu
 import vllm.envs as envs_vllm
 from torch_npu.op_plugin.atb._atb_ops import _register_atb_extensions
 from torch_npu.profiler import dynamic_profile as dp
-from vllm.config import VllmConfig, set_current_vllm_config
+from vllm.config import CUDAGraphMode, VllmConfig, set_current_vllm_config
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.distributed.ec_transfer import ensure_ec_transfer_initialized
@@ -366,11 +366,25 @@ class NPUWorker(WorkerBase):
         self.model_runner.eplb_warmup()
         warmup_sizes = (self.vllm_config.compilation_config.compile_sizes
                         or []).copy()
-        if not self.model_config.enforce_eager:
+        cg_capture_sizes: list[int] = []
+        if self.vllm_config.compilation_config.cudagraph_mode != CUDAGraphMode.NONE:
+            cg_sizes = self.vllm_config.compilation_config.cudagraph_capture_sizes
+            cg_capture_sizes = [] if cg_sizes is None else cg_sizes
             warmup_sizes = [
-                x for x in warmup_sizes if x not in
-                self.vllm_config.compilation_config.cudagraph_capture_sizes
+                x for x in warmup_sizes if x not in cg_capture_sizes
             ]
+        compile_ranges = self.vllm_config.compilation_config.get_compile_ranges(
+        )
+        # For each compile_range, if none of the batch sizes
+        # in warmup_sizes or cudagraph_capture_sizes are in the range,
+        # add the end of the range to ensure compilation/warmup.
+        all_sizes = set(cg_capture_sizes)
+        all_sizes.update([x for x in warmup_sizes if isinstance(x, int)])
+        for compile_range in compile_ranges:
+            if not any(x in compile_range for x in all_sizes):
+                warmup_sizes.append(compile_range.end)
         for size in sorted(warmup_sizes, reverse=True):
             logger.info("Compile and warming up model for size %d", size)
             self.model_runner._dummy_run(size)
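
To make the new warmup logic easier to follow outside the diff, here is a
minimal standalone sketch of it. This is not the actual vLLM code:
CompileRange and select_warmup_sizes are hypothetical names introduced for
illustration, and the only properties carried over from the diff are that a
compile range supports `in` membership tests and exposes an `end` attribute.

# Minimal, self-contained sketch of the warmup-size selection above.
from __future__ import annotations

from dataclasses import dataclass


@dataclass
class CompileRange:
    # Hypothetical stand-in for vLLM's compile-range object; the diff only
    # relies on membership tests (`x in compile_range`) and `end`.
    start: int
    end: int

    def __contains__(self, size: int) -> bool:
        return self.start <= size <= self.end


def select_warmup_sizes(compile_sizes: list[int],
                        cudagraph_capture_sizes: list[int] | None,
                        compile_ranges: list[CompileRange],
                        cudagraph_enabled: bool) -> list[int]:
    warmup_sizes = list(compile_sizes)
    cg_capture_sizes: list[int] = []
    if cudagraph_enabled:
        # Sizes already captured as graphs are warmed up during capture,
        # so drop them from the explicit warmup list (mirrors the diff).
        cg_capture_sizes = cudagraph_capture_sizes or []
        warmup_sizes = [x for x in warmup_sizes if x not in cg_capture_sizes]
    # Backfill: if no batch size falls inside a compile range, append the
    # end of that range so the range still gets compiled and warmed up.
    covered = set(cg_capture_sizes)
    covered.update(x for x in warmup_sizes if isinstance(x, int))
    for compile_range in compile_ranges:
        if not any(x in compile_range for x in covered):
            warmup_sizes.append(compile_range.end)
    return sorted(warmup_sizes, reverse=True)


# Example: capture size 8 covers [1, 32], but nothing covers [33, 256],
# so 256 is appended and receives a compile/warmup pass.
ranges = [CompileRange(1, 32), CompileRange(33, 256)]
print(select_warmup_sizes([4], [8], ranges, cudagraph_enabled=True))
# -> [256, 4]

The backfill loop is the substance of the fix as stated in the diff comment:
it guarantees that every compile range is exercised by at least one dummy
run during warmup, rather than being skipped when all of its batch sizes
were filtered out as cudagraph capture sizes.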