[main2main] upgrade vllm to 0308 (#7213)
### What this PR does / why we need it?
Update main2main to vLLM 0308.

Breaking changes absorbed (see the version-gating sketch below):
* https://github.com/vllm-project/vllm/pull/30681
* https://github.com/vllm-project/vllm/pull/35552: removes self.cudagraph_batch_sizes
* https://github.com/vllm-project/vllm/pull/35158: renames clear_metadata -> defer_finalize
* https://github.com/vllm-project/vllm/pull/36006: removes CacheConfig.cpu_offload_gb
* https://github.com/vllm-project/vllm/pull/35472
* https://github.com/vllm-project/vllm/pull/34552: attn_metadata_builder
* https://github.com/vllm-project/vllm/pull/30515: profile_seq_lens
* https://github.com/vllm-project/vllm/pull/28053
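
Most of these are absorbed with the `vllm_version_is` gate from `vllm_ascend.utils`, as in the diff below. A minimal sketch of the pattern, using the #35158 rename as the example (the receiver and the zero-argument calls are illustrative, not the exact call sites):

```python
from vllm_ascend.utils import vllm_version_is

def finalize_attn_metadata(builder) -> None:
    # vLLM #35158 renamed clear_metadata -> defer_finalize; gate on the
    # installed vLLM release instead of forking the whole module.
    if vllm_version_is("0.17.0"):
        builder.clear_metadata()    # old API on the 0.17.0 release
    else:
        builder.defer_finalize()    # new API on vLLM main (post-0308)
```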
- vLLM version: v0.16.0
- vLLM main: 4034c3d32e
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
Signed-off-by: menogrey <1299267905@qq.com>
Co-authored-by: MrZ20 <2609716663@qq.com>
@@ -17,6 +17,7 @@
 #
 import copy
 import functools
+import logging
 from collections.abc import Callable
 from typing import Any
 
@@ -31,7 +32,9 @@ from vllm.config import VllmConfig
 from vllm.config.utils import Range
 
 from vllm_ascend.ascend_config import AscendCompilationConfig, get_ascend_config
-from vllm_ascend.utils import COMPILATION_PASS_KEY
+from vllm_ascend.utils import COMPILATION_PASS_KEY, vllm_version_is
+
+logger = logging.getLogger(__name__)
 
 
 def compile_fx(graph: GraphModule, example_inputs: list, inner_compile: Callable, decompositions: dict) -> Callable:
@@ -83,6 +86,11 @@ def npugraph_ex_compile(
     config.mode = "reduce-overhead"
     # Execute the FX graph in eager mode before graph mode to optimize the FX graph.
     config.debug.run_eagerly = True
+    if not vllm_version_is("0.17.0"):
+        # Temporary fix for in-place-op issues in some test cases such as test_whisper:
+        # avoid rewriting torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default,
+        # which falls back to CPU and causes a copy_between_host_and_device error.
+        config.debug.aclgraph.disable_reinplace_inplaceable_ops_pass = True
     if ascend_compilation_config.enable_static_kernel:
         config.experimental_config.aclgraph._aclnn_static_shape_kernel = True
     # According to the cudagraph_capture_size configuration, set the shapes
@@ -134,6 +142,22 @@ class AscendCompiler(CompilerInterface):
         # see https://github.com/pytorch/pytorch/issues/138980
         graph = copy.deepcopy(graph)
 
+        if not vllm_version_is("0.17.0"):
+            from torch._guards import detect_fake_mode
+
+            current_fake_mode = detect_fake_mode()
+            if current_fake_mode is not None:
+                example_inputs = [
+                    current_fake_mode.from_tensor(inp)
+                    if (
+                        isinstance(inp, torch.Tensor)
+                        and hasattr(inp, "fake_mode")
+                        and inp.fake_mode is not current_fake_mode
+                    )
+                    else inp
+                    for inp in example_inputs
+                ]
+
         ascend_compilation_config = get_ascend_config().ascend_compilation_config
         if ascend_compilation_config.enable_npugraph_ex:
             assert hasattr(self, "vllm_config")
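
For context on the last hunk: when `torch.compile` invokes a backend, the example inputs may be FakeTensors created under a different `FakeTensorMode` than the one currently active, and mixing modes raises errors during lowering. A standalone sketch of the normalization the diff adds to `AscendCompiler` (the helper name is hypothetical; the logic mirrors the new block):

```python
import torch
from torch._guards import detect_fake_mode

def normalize_example_inputs(example_inputs: list) -> list:
    # Re-fakeify tensors that belong to a stale FakeTensorMode so every
    # fake input is owned by the ambient mode detected at compile time.
    current_fake_mode = detect_fake_mode()
    if current_fake_mode is None:
        return example_inputs
    return [
        current_fake_mode.from_tensor(inp)
        if (
            isinstance(inp, torch.Tensor)
            and hasattr(inp, "fake_mode")
            and inp.fake_mode is not current_fake_mode
        )
        else inp
        for inp in example_inputs
    ]
```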