[Misc] Removes unnecessary graph size re-initialization (#6280)

### What this PR does / why we need it?

This PR removes `update_default_aclgraph_sizes`. In earlier versions, we
added this function to override the default `cudagraph_capture_sizes`,
because `_npu_paged_attention` degrades significantly on certain shapes
that are included in vLLM's default `cudagraph_capture_sizes`. Now that
FIA (`npu_fused_infer_attention_score`) is the default attention op and
does not suffer from this degradation, the override is no longer needed.
Worse, keeping it could conflict with configurations that set a small
`cudagraph_capture_sizes`, since the hard-coded replacement list always
includes sizes up to 20 even when the configured maximum is below 20.
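
To make the conflict concrete, here is a minimal standalone sketch (the
`max_capture_size` value is illustrative, not from this PR): the removed
helper always seeded its replacement list with sizes up to 20, so a
configuration whose maximum capture size is below 20 would end up with
capture sizes larger than its own maximum.

```python
# Minimal sketch of the conflict; mirrors the list built by the removed
# update_default_aclgraph_sizes (see the utils.py hunk below).
max_capture_size = 16  # illustrative: a user-configured maximum below 20

# The removed helper unconditionally started from [1, 2, 5, 10, 15, 20].
new_sizes = [1, 2, 5, 10, 15, 20] + list(range(24, max_capture_size + 1, 8))

print(new_sizes)                                       # [1, 2, 5, 10, 15, 20]
print([s for s in new_sizes if s > max_capture_size])  # [20] -> exceeds the max
```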

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.14.1
- vLLM main: d68209402d

---------

Signed-off-by: Angazenn <supperccell@163.com>
Author: Angazenn
Date: 2026-01-27 14:38:07 +08:00
Committed by: GitHub
Parent: fea197ad50
Commit: 5e34c70ffc
3 changed files with 1 addition and 55 deletions


@@ -207,13 +207,11 @@ class TestNPUPlatform(TestBase):
     )
     @patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3)
-    @patch("vllm_ascend.utils.update_default_aclgraph_sizes")
     @patch("vllm_ascend.ascend_config.init_ascend_config")
     @patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config")
     def test_check_and_update_config_unsupported_compilation_level(
-        self, mock_init_recompute, mock_init_ascend, mock_update_default, mock_soc_version
+        self, mock_init_recompute, mock_init_ascend, mock_soc_version
     ):
-        mock_update_default.return_value = MagicMock()
         mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config()
         vllm_config = TestNPUPlatform.mock_vllm_config()
         vllm_config.model_config.enforce_eager = False


@@ -47,7 +47,6 @@ from vllm_ascend.utils import (
     refresh_block_size,
     update_aclgraph_sizes,
     update_cudagraph_capture_sizes,
-    update_default_aclgraph_sizes,
     is_310p,
 )
@@ -247,10 +246,6 @@ class NPUPlatform(Platform):
         # set cudagraph sizes before extending `compilation_config.splitting_ops`
         vllm_config._set_cudagraph_sizes()
-        # There are cases where default cudagraph_capture_sizes are not friendly
-        # to Ascend ops && hardware. We update these sizes here to improve
-        # default performance.
-        update_default_aclgraph_sizes(vllm_config)
         # TODO: delete the graph size update here when compilation_config.pass_config.enable_sp
         # is supported by vllm-ascend.
         if (


@@ -427,53 +427,6 @@ def update_cudagraph_capture_sizes(vllm_config: VllmConfig, cudagraph_capture_si
     vllm_config.compilation_config.post_init_cudagraph_sizes()
-
-
-def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool:
-    """
-    Check whether the configured sizes are the vLLM default capture sizes.
-    """
-    max_cudagraph_capture_size = vllm_config.compilation_config.max_cudagraph_capture_size
-    cudagraph_capture_sizes = [i for i in [1, 2, 4] if i <= max_cudagraph_capture_size]
-    if max_cudagraph_capture_size >= 8:
-        # Step size 8 for small batch sizes, up to 256 (not included)
-        cudagraph_capture_sizes += list(range(8, min(max_cudagraph_capture_size + 1, 256), 8))
-    if max_cudagraph_capture_size >= 256:
-        # Step size 16 for larger batch sizes
-        cudagraph_capture_sizes += list(range(256, max_cudagraph_capture_size + 1, 16))
-    # In newer versions, vLLM uses ascending order for cudagraph_capture_sizes.
-    target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes)
-    return target_cudagraph_capture_sizes == vllm_config.compilation_config.cudagraph_capture_sizes
-
-
-def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None:
-    """
-    Update the default ACL graph capture sizes, so that the new sizes
-    are more friendly to Ascend ops && hardware.
-    """
-    if (
-        vllm_config.model_config is None
-        or vllm_config.model_config.enforce_eager
-        or not _is_default_capture_sizes(vllm_config)
-    ):
-        return
-    # Modify the default capture sizes for Qwen3-MoE models in DP settings.
-    # This is mainly because the performance of _npu_paged_attention might degrade
-    # on special shapes.
-    # TODO(Angazenn): we will remove this once _npu_paged_attention is fully
-    # replaced by npu_fused_infer_attention_score, which does not contain such bugs.
-    if (
-        vllm_config.model_config
-        and vllm_config.model_config.hf_text_config.model_type == "qwen3_moe"
-        and vllm_config.parallel_config.tensor_parallel_size == 1
-        and vllm_config.parallel_config.data_parallel_size > 1
-    ):
-        max_capture_size = vllm_config.compilation_config.max_cudagraph_capture_size
-        new_cudagraph_capture_sizes = [1, 2, 5, 10, 15, 20] + [i for i in range(24, max_capture_size + 1, 8)]
-        update_cudagraph_capture_sizes(vllm_config, new_cudagraph_capture_sizes)


 def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
     """Update ACL graph capture sizes based on hardware limitations"""
     # NOTE: Currently, we can only capture 1800 graphs at most,
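
For reference, the vLLM default size ladder that the deleted
`_is_default_capture_sizes` reconstructed can be expressed as a standalone
sketch (`default_capture_sizes` below is a hypothetical helper that mirrors
the deleted logic, not an API in this repo):

```python
# Standalone sketch of vLLM's default capture-size ladder, mirroring the
# deleted _is_default_capture_sizes above.
def default_capture_sizes(max_size: int) -> list[int]:
    sizes = [i for i in [1, 2, 4] if i <= max_size]
    if max_size >= 8:
        # step size 8 for small batch sizes, up to 256 (not included)
        sizes += list(range(8, min(max_size + 1, 256), 8))
    if max_size >= 256:
        # step size 16 for larger batch sizes
        sizes += list(range(256, max_size + 1, 16))
    return sorted(sizes)  # newer vLLM keeps capture sizes in ascending order


print(default_capture_sizes(64))
# [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64]
```

The deleted helper only rewrote the sizes when the configured list matched
this ladder exactly, which is why explicitly user-specified lists were left
untouched.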