[Misc] Removes unnecessary graph size re-initialization (#6280)
### What this PR does / why we need it?
This PR removes `update_default_aclgraph_sizes`. In earlier versions, we
add this function to change default `cudagraph_capture_sizes` because
`_npu_paged_attention` degrades significantly on certain shapes (which
is included in default `cudagraph_capture_sizes` of VLLM). Now since we
use FIA as default attention op (which does not contain such performance
degradation), there is no need to keep this default override. Moreover,
keeping it could cause conflicts when a user explicitly sets a custom
`cudagraph_capture_sizes` containing values smaller than 20.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.14.1
- vLLM main:
d68209402d
---------
Signed-off-by: Angazenn <supperccell@163.com>
This commit is contained in:
@@ -207,13 +207,11 @@ class TestNPUPlatform(TestBase):
|
||||
)
|
||||
|
||||
@patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3)
|
||||
@patch("vllm_ascend.utils.update_default_aclgraph_sizes")
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config")
|
||||
def test_check_and_update_config_unsupported_compilation_level(
|
||||
self, mock_init_recompute, mock_init_ascend, mock_update_default, mock_soc_version
|
||||
self, mock_init_recompute, mock_init_ascend, mock_soc_version
|
||||
):
|
||||
mock_update_default.return_value = MagicMock()
|
||||
mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config()
|
||||
vllm_config = TestNPUPlatform.mock_vllm_config()
|
||||
vllm_config.model_config.enforce_eager = False
|
||||
|
||||
@@ -47,7 +47,6 @@ from vllm_ascend.utils import (
|
||||
refresh_block_size,
|
||||
update_aclgraph_sizes,
|
||||
update_cudagraph_capture_sizes,
|
||||
update_default_aclgraph_sizes,
|
||||
is_310p,
|
||||
)
|
||||
|
||||
@@ -247,10 +246,6 @@ class NPUPlatform(Platform):
|
||||
|
||||
# set cudagraph sizes before extending `compilation_config.splitting_ops`
|
||||
vllm_config._set_cudagraph_sizes()
|
||||
# There are cases where default cudagraph_capture_sizes are not friendly
|
||||
# to ascend ops && hardwares. We update these sizes here to improve
|
||||
# default performance.
|
||||
update_default_aclgraph_sizes(vllm_config)
|
||||
# TODO delete graph size update here when compilation_config.pass_config.enable_sp
|
||||
# is supported by vllm-ascend.
|
||||
if (
|
||||
|
||||
@@ -427,53 +427,6 @@ def update_cudagraph_capture_sizes(vllm_config: VllmConfig, cudagraph_capture_si
|
||||
vllm_config.compilation_config.post_init_cudagraph_sizes()
|
||||
|
||||
|
||||
def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool:
    """Return True if the config still carries vLLM's default capture sizes.

    Rebuilds the default ``cudagraph_capture_sizes`` ladder from the configured
    maximum and compares it — in ascending order, as newer vLLM stores it —
    against the sizes currently set on the compilation config.
    """
    limit = vllm_config.compilation_config.max_cudagraph_capture_size

    # Small sizes 1/2/4, capped by the configured maximum.
    expected = [size for size in (1, 2, 4) if size <= limit]
    # Step of 8 up to 256 (exclusive); empty range when limit < 8.
    expected.extend(range(8, min(limit + 1, 256), 8))
    # Step of 16 from 256 onwards; empty range when limit < 256.
    expected.extend(range(256, limit + 1, 16))

    expected.sort()
    return expected == vllm_config.compilation_config.cudagraph_capture_sizes
|
||||
|
||||
|
||||
def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None:
    """Replace vLLM's default capture sizes with Ascend-friendlier ones.

    Only acts when eager mode is off and the user has not customized
    ``cudagraph_capture_sizes`` (i.e. the config still carries the vLLM
    defaults); otherwise the config is left untouched.
    """
    model_config = vllm_config.model_config

    # Nothing to tune: no model config, eager-mode execution, or the user
    # already supplied custom capture sizes.
    if model_config is None or model_config.enforce_eager:
        return
    if not _is_default_capture_sizes(vllm_config):
        return

    # Qwen3-MoE under pure data parallelism (TP == 1, DP > 1) hits shapes on
    # which _npu_paged_attention performance degrades; swap in a friendlier
    # size ladder for that case.
    # TODO(Angazenn): remove this once _npu_paged_attention is fully replaced
    # by npu_fused_infer_attention_score, which does not contain such bugs.
    parallel_config = vllm_config.parallel_config
    is_qwen3_moe_pure_dp = (
        model_config
        and model_config.hf_text_config.model_type == "qwen3_moe"
        and parallel_config.tensor_parallel_size == 1
        and parallel_config.data_parallel_size > 1
    )
    if not is_qwen3_moe_pure_dp:
        return

    upper = vllm_config.compilation_config.max_cudagraph_capture_size
    friendly_sizes = [1, 2, 5, 10, 15, 20] + list(range(24, upper + 1, 8))
    update_cudagraph_capture_sizes(vllm_config, friendly_sizes)
|
||||
|
||||
|
||||
def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
|
||||
"""Update ACL graph capture sizes based on hardware limitations"""
|
||||
# NOTE: Currently, we can only capture 1800 graphs at most,
|
||||
|
||||
Reference in New Issue
Block a user