[main2main] upgrade vllm to 0308 (#7213)

### What this PR does / why we need it?
Update main2main to vLLM 0308.
Breaking upstream changes handled:

* https://github.com/vllm-project/vllm/pull/30681
* https://github.com/vllm-project/vllm/pull/35552 remove
self.cudagraph_batch_sizes
* https://github.com/vllm-project/vllm/pull/35158 clear_metadata ->
defer_finalize
* https://github.com/vllm-project/vllm/pull/36006 remove
CacheConfig.cpu_offload_gb
* https://github.com/vllm-project/vllm/pull/35472
* https://github.com/vllm-project/vllm/pull/34552 attn_metadata_builder
* https://github.com/vllm-project/vllm/pull/30515 profile_seq_lens
* https://github.com/vllm-project/vllm/pull/28053 

- vLLM version: v0.16.0
- vLLM main:
4034c3d32e

---------

Signed-off-by: MrZ20 <2609716663@qq.com>
Signed-off-by: menogrey <1299267905@qq.com>
Co-authored-by: MrZ20 <2609716663@qq.com>
This commit is contained in:
zhangyiming
2026-03-18 09:24:43 +08:00
committed by GitHub
parent 79ef41a53d
commit 1c954ff264
16 changed files with 223 additions and 168 deletions

View File

@@ -17,6 +17,7 @@
#
import copy
import functools
import logging
from collections.abc import Callable
from typing import Any
@@ -31,7 +32,9 @@ from vllm.config import VllmConfig
from vllm.config.utils import Range
from vllm_ascend.ascend_config import AscendCompilationConfig, get_ascend_config
from vllm_ascend.utils import COMPILATION_PASS_KEY
from vllm_ascend.utils import COMPILATION_PASS_KEY, vllm_version_is
logger = logging.getLogger(__name__)
def compile_fx(graph: GraphModule, example_inputs: list, inner_compile: Callable, decompositions: dict) -> Callable:
@@ -83,6 +86,11 @@ def npugraph_ex_compile(
config.mode = "reduce-overhead"
# execute FX graph in eager mode before graph mode to optimize FX graph.
config.debug.run_eagerly = True
if not vllm_version_is("0.17.0"):
# This is a temporary fix to resolve issues with in-place operations in some test cases such as test_whisper.
# Avoid changing torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default, which would fall back to the CPU
# and cause a copy_between_host_and_device error.
config.debug.aclgraph.disable_reinplace_inplaceable_ops_pass = True
if ascend_compilation_config.enable_static_kernel:
config.experimental_config.aclgraph._aclnn_static_shape_kernel = True
# According to the cudagraph_capture_size configuration, set the shapes
@@ -134,6 +142,22 @@ class AscendCompiler(CompilerInterface):
# see https://github.com/pytorch/pytorch/issues/138980
graph = copy.deepcopy(graph)
if not vllm_version_is("0.17.0"):
from torch._guards import detect_fake_mode
current_fake_mode = detect_fake_mode()
if current_fake_mode is not None:
example_inputs = [
current_fake_mode.from_tensor(inp)
if (
isinstance(inp, torch.Tensor)
and hasattr(inp, "fake_mode")
and inp.fake_mode is not current_fake_mode
)
else inp
for inp in example_inputs
]
ascend_compilation_config = get_ascend_config().ascend_compilation_config
if ascend_compilation_config.enable_npugraph_ex:
assert hasattr(self, "vllm_config")