upgrade to 0.18.0 (#7502)

### What this PR does / why we need it?
1. upgrade to 0.18.0
2. ensure `kernel_block_sizes` is an int for the Eagle drafter
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.17.0
- vLLM main:
8b6325758c

---------

Signed-off-by: Meihan-chen <jcccx.cmh@gmail.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
meihanc
2026-03-21 16:05:38 +08:00
committed by GitHub
parent 80a4265717
commit bff4fbfca5
16 changed files with 139 additions and 258 deletions

View File

@@ -32,7 +32,7 @@ from vllm.config import VllmConfig
from vllm.config.utils import Range
from vllm_ascend.ascend_config import AscendCompilationConfig, get_ascend_config
from vllm_ascend.utils import COMPILATION_PASS_KEY, vllm_version_is
from vllm_ascend.utils import COMPILATION_PASS_KEY
logger = logging.getLogger(__name__)
@@ -86,11 +86,10 @@ def npugraph_ex_compile(
config.mode = "reduce-overhead"
# execute FX graph in eager mode before graph mode to optimize FX graph.
config.debug.run_eagerly = True
if not vllm_version_is("0.17.0"):
# This is a temporary fix to resolve issues with in-place operations in some test cases such as test_whisper.
# Avoid changing torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default, which would fall back to CPU
# and cause a copy_between_host_and_device error.
config.debug.aclgraph.disable_reinplace_inplaceable_ops_pass = True
# This is a temporary fix to resolve issues with in-place operations in some test cases such as test_whisper.
# Avoid changing torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default, which would fall back to CPU
# and cause a copy_between_host_and_device error.
config.debug.aclgraph.disable_reinplace_inplaceable_ops_pass = True
if ascend_compilation_config.enable_static_kernel:
config.experimental_config.aclgraph._aclnn_static_shape_kernel = True
# According to the cudagraph_capture_size configuration, set the shapes
@@ -142,21 +141,20 @@ class AscendCompiler(CompilerInterface):
# see https://github.com/pytorch/pytorch/issues/138980
graph = copy.deepcopy(graph)
if not vllm_version_is("0.17.0"):
from torch._guards import detect_fake_mode
from torch._guards import detect_fake_mode
current_fake_mode = detect_fake_mode()
if current_fake_mode is not None:
example_inputs = [
current_fake_mode.from_tensor(inp)
if (
isinstance(inp, torch.Tensor)
and hasattr(inp, "fake_mode")
and inp.fake_mode is not current_fake_mode
)
else inp
for inp in example_inputs
]
current_fake_mode = detect_fake_mode()
if current_fake_mode is not None:
example_inputs = [
current_fake_mode.from_tensor(inp)
if (
isinstance(inp, torch.Tensor)
and hasattr(inp, "fake_mode")
and inp.fake_mode is not current_fake_mode
)
else inp
for inp in example_inputs
]
ascend_compilation_config = get_ascend_config().ascend_compilation_config
if ascend_compilation_config.enable_npugraph_ex: