upgrade to 0.18.0 (#7502)
### What this PR does / why we need it?
1. Upgrade to vLLM 0.18.0.
2. Ensure `kernel_block_sizes` is an int for the Eagle drafter (see the sketch below).
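
The `kernel_block_sizes` change itself is not shown in the diff excerpt below; the following is only a rough sketch of the kind of coercion item 2 describes. `normalize_kernel_block_size` is a hypothetical helper for illustration, not the actual vLLM/vllm-ascend API:

```python
# Hypothetical illustration only: `kernel_block_sizes` may arrive as a
# one-element list/tuple from the config layer, while the Eagle drafter
# expects a plain int block size.
def normalize_kernel_block_size(kernel_block_sizes):
    if isinstance(kernel_block_sizes, (list, tuple)):
        # Assume a single, uniform block size for the draft model's KV cache.
        assert len(kernel_block_sizes) == 1, "expected a single block size"
        return int(kernel_block_sizes[0])
    return int(kernel_block_sizes)

assert normalize_kernel_block_size([128]) == 128
assert normalize_kernel_block_size(128) == 128
```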
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.17.0
- vLLM main: 8b6325758c
---------
Signed-off-by: Meihan-chen <jcccx.cmh@gmail.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
@@ -32,7 +32,7 @@ from vllm.config import VllmConfig
 from vllm.config.utils import Range
 
 from vllm_ascend.ascend_config import AscendCompilationConfig, get_ascend_config
-from vllm_ascend.utils import COMPILATION_PASS_KEY, vllm_version_is
+from vllm_ascend.utils import COMPILATION_PASS_KEY
 
 logger = logging.getLogger(__name__)
 
@@ -86,11 +86,10 @@ def npugraph_ex_compile(
     config.mode = "reduce-overhead"
     # execute FX graph in eager mode before graph mode to optimize FX graph.
     config.debug.run_eagerly = True
-    if not vllm_version_is("0.17.0"):
-        # This is a temporary fix to resolve issues with inplace operations in some testcases like test_whisper.
-        # Avoid to change torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default which will fallback to CPU
-        # and cause copy_between_host_and_device error.
-        config.debug.aclgraph.disable_reinplace_inplaceable_ops_pass = True
+    # This is a temporary fix to resolve issues with inplace operations in some testcases like test_whisper.
+    # Avoid to change torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default which will fallback to CPU
+    # and cause copy_between_host_and_device error.
+    config.debug.aclgraph.disable_reinplace_inplaceable_ops_pass = True
     if ascend_compilation_config.enable_static_kernel:
         config.experimental_config.aclgraph._aclnn_static_shape_kernel = True
     # According to the cudagraph_capture_size configuration, set the shapes
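
For context on the comment above: a reinplacing pass rewrites out-of-place ATen ops into their in-place variants. A minimal, standalone illustration of the two ops the comment mentions (plain PyTorch, not part of this patch):

```python
import torch

x = torch.randn(4)

# Out-of-place: returns a new tensor and leaves `x` untouched.
y = torch.ops.aten.gelu.default(x)

# In-place variant: mutates `x` directly. Rewriting the former into the
# latter is the transformation the disabled pass would perform, which on
# this backend falls back to CPU and triggers the copy error noted above.
torch.ops.aten.gelu_.default(x)
```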
@@ -142,21 +141,20 @@ class AscendCompiler(CompilerInterface):
         # see https://github.com/pytorch/pytorch/issues/138980
         graph = copy.deepcopy(graph)
 
-        if not vllm_version_is("0.17.0"):
-            from torch._guards import detect_fake_mode
-
-            current_fake_mode = detect_fake_mode()
-            if current_fake_mode is not None:
-                example_inputs = [
-                    current_fake_mode.from_tensor(inp)
-                    if (
-                        isinstance(inp, torch.Tensor)
-                        and hasattr(inp, "fake_mode")
-                        and inp.fake_mode is not current_fake_mode
-                    )
-                    else inp
-                    for inp in example_inputs
-                ]
+        from torch._guards import detect_fake_mode
+
+        current_fake_mode = detect_fake_mode()
+        if current_fake_mode is not None:
+            example_inputs = [
+                current_fake_mode.from_tensor(inp)
+                if (
+                    isinstance(inp, torch.Tensor)
+                    and hasattr(inp, "fake_mode")
+                    and inp.fake_mode is not current_fake_mode
+                )
+                else inp
+                for inp in example_inputs
+            ]
 
         ascend_compilation_config = get_ascend_config().ascend_compilation_config
         if ascend_compilation_config.enable_npugraph_ex:
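
The fake-tensor normalization added above can be exercised on its own. A small sketch, assuming only the PyTorch internals `detect_fake_mode` and `FakeTensorMode.from_tensor` (no vllm-ascend code involved):

```python
import torch
from torch._guards import detect_fake_mode
from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode

real = torch.randn(2, 3)

with FakeTensorMode() as fake_mode:
    # detect_fake_mode() reports the FakeTensorMode currently active on the
    # dispatch stack (it returns None when no fake mode is in effect).
    current_fake_mode = detect_fake_mode()
    assert current_fake_mode is fake_mode

    # from_tensor() wraps a tensor under that mode; the list comprehension in
    # the hunk above applies the same call to example inputs whose fake mode
    # does not match the one detected during compilation.
    fake = current_fake_mode.from_tensor(real)
    assert isinstance(fake, FakeTensor) and fake.shape == real.shape
```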