upgrade to 0.18.0 (#7502)

### What this PR does / why we need it?
1. upgrade to 0.18.0
2. ensure `kernel_block_sizes` is an int for the Eagle drafter
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.17.0
- vLLM main:
8b6325758c

---------

Signed-off-by: Meihan-chen <jcccx.cmh@gmail.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
meihanc
2026-03-21 16:05:38 +08:00
committed by GitHub
parent 80a4265717
commit bff4fbfca5
16 changed files with 139 additions and 258 deletions

View File

@@ -32,7 +32,7 @@ from vllm.config import VllmConfig
from vllm.config.utils import Range
from vllm_ascend.ascend_config import AscendCompilationConfig, get_ascend_config
from vllm_ascend.utils import COMPILATION_PASS_KEY, vllm_version_is
from vllm_ascend.utils import COMPILATION_PASS_KEY
logger = logging.getLogger(__name__)
@@ -86,11 +86,10 @@ def npugraph_ex_compile(
config.mode = "reduce-overhead"
# execute FX graph in eager mode before graph mode to optimize FX graph.
config.debug.run_eagerly = True
if not vllm_version_is("0.17.0"):
# This is a temporary fix to resolve issues with in-place operations in some test cases such as test_whisper.
# Avoid changing torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default, which would fall back to CPU
# and cause a copy_between_host_and_device error.
config.debug.aclgraph.disable_reinplace_inplaceable_ops_pass = True
# This is a temporary fix to resolve issues with in-place operations in some test cases such as test_whisper.
# Avoid changing torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default, which would fall back to CPU
# and cause a copy_between_host_and_device error.
config.debug.aclgraph.disable_reinplace_inplaceable_ops_pass = True
if ascend_compilation_config.enable_static_kernel:
config.experimental_config.aclgraph._aclnn_static_shape_kernel = True
# According to the cudagraph_capture_size configuration, set the shapes
@@ -142,21 +141,20 @@ class AscendCompiler(CompilerInterface):
# see https://github.com/pytorch/pytorch/issues/138980
graph = copy.deepcopy(graph)
if not vllm_version_is("0.17.0"):
from torch._guards import detect_fake_mode
from torch._guards import detect_fake_mode
current_fake_mode = detect_fake_mode()
if current_fake_mode is not None:
example_inputs = [
current_fake_mode.from_tensor(inp)
if (
isinstance(inp, torch.Tensor)
and hasattr(inp, "fake_mode")
and inp.fake_mode is not current_fake_mode
)
else inp
for inp in example_inputs
]
current_fake_mode = detect_fake_mode()
if current_fake_mode is not None:
example_inputs = [
current_fake_mode.from_tensor(inp)
if (
isinstance(inp, torch.Tensor)
and hasattr(inp, "fake_mode")
and inp.fake_mode is not current_fake_mode
)
else inp
for inp in example_inputs
]
ascend_compilation_config = get_ascend_config().ascend_compilation_config
if ascend_compilation_config.enable_npugraph_ex: