[main2main] upgrade vllm to 0308 (#7213)

### What this PR does / why we need it?
Update main2main to vLLM 0308.
Breaking upstream changes handled:

* https://github.com/vllm-project/vllm/pull/30681
* https://github.com/vllm-project/vllm/pull/35552 remove
self.cudagraph_batch_sizes
* https://github.com/vllm-project/vllm/pull/35158 clear_metadata ->
defer_finalize
* https://github.com/vllm-project/vllm/pull/36006 remove
CacheConfig.cpu_offload_gb
* https://github.com/vllm-project/vllm/pull/35472
* https://github.com/vllm-project/vllm/pull/34552 attn_metadata_builder
* https://github.com/vllm-project/vllm/pull/30515 profile_seq_lens
* https://github.com/vllm-project/vllm/pull/28053 

- vLLM version: v0.16.0
- vLLM main:
4034c3d32e

---------

Signed-off-by: MrZ20 <2609716663@qq.com>
Signed-off-by: menogrey <1299267905@qq.com>
Co-authored-by: MrZ20 <2609716663@qq.com>
This commit is contained in:
zhangyiming
2026-03-18 09:24:43 +08:00
committed by GitHub
parent 79ef41a53d
commit 1c954ff264
16 changed files with 223 additions and 168 deletions

View File

@@ -17,6 +17,7 @@
#
import copy
import functools
import logging
from collections.abc import Callable
from typing import Any
@@ -31,7 +32,9 @@ from vllm.config import VllmConfig
from vllm.config.utils import Range
from vllm_ascend.ascend_config import AscendCompilationConfig, get_ascend_config
from vllm_ascend.utils import COMPILATION_PASS_KEY
from vllm_ascend.utils import COMPILATION_PASS_KEY, vllm_version_is
logger = logging.getLogger(__name__)
def compile_fx(graph: GraphModule, example_inputs: list, inner_compile: Callable, decompositions: dict) -> Callable:
@@ -83,6 +86,11 @@ def npugraph_ex_compile(
config.mode = "reduce-overhead"
# execute FX graph in eager mode before graph mode to optimize FX graph.
config.debug.run_eagerly = True
if not vllm_version_is("0.17.0"):
# This is a temporary fix to resolve issues with in-place operations in some test cases such as test_whisper.
# Avoid changing torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default, which would fall back to the CPU
# and cause a copy_between_host_and_device error.
config.debug.aclgraph.disable_reinplace_inplaceable_ops_pass = True
if ascend_compilation_config.enable_static_kernel:
config.experimental_config.aclgraph._aclnn_static_shape_kernel = True
# According to the cudagraph_capture_size configuration, set the shapes
@@ -134,6 +142,22 @@ class AscendCompiler(CompilerInterface):
# see https://github.com/pytorch/pytorch/issues/138980
graph = copy.deepcopy(graph)
if not vllm_version_is("0.17.0"):
from torch._guards import detect_fake_mode
current_fake_mode = detect_fake_mode()
if current_fake_mode is not None:
example_inputs = [
current_fake_mode.from_tensor(inp)
if (
isinstance(inp, torch.Tensor)
and hasattr(inp, "fake_mode")
and inp.fake_mode is not current_fake_mode
)
else inp
for inp in example_inputs
]
ascend_compilation_config = get_ascend_config().ascend_compilation_config
if ascend_compilation_config.enable_npugraph_ex:
assert hasattr(self, "vllm_config")