[main][misc]change default capture size for Qwen3-MoE when using full dp (#4199)

### What this PR does / why we need it?
Currently, the default `cudagraph_capture_size` in vLLM is `[1, 2, 4, 8,
16, 24, ..., max_capture_size]`. However, this is not always the best
choice in different situations. This PR changes the default
setting when running Qwen3-MoE in a full-DP (`dp_size > 1` && `tp_size ==
1`) setup, which is commonly used in Large-Scale EP.
old:
`[1, 2, 4, 8, 16, 24, ..., max_capture_size]`
new:
`[1, 2, 5, 10, 15, 20, 24, ..., max_capture_size]`
This is mainly because the performance of the `_npu_paged_attention` op
degrades dramatically with the old settings. We hope to provide better
performance when users do not set a specific `cudagraph_capture_size`.
### Does this PR introduce _any_ user-facing change?
The default `cudagraph_capture_size` is modified in the above cases.
However, if `cudagraph_capture_size` has already been set by users, this PR
has no effect on it.

### How was this patch tested?

- vLLM version: v0.11.0
- vLLM main:
2918c1b49c

---------

Signed-off-by: Angazenn <supperccell@163.com>
This commit is contained in:
Angazenn
2025-11-18 08:41:45 +08:00
committed by GitHub
parent da1cd9c7ca
commit 10a046ddce
3 changed files with 81 additions and 3 deletions

View File

@@ -330,6 +330,7 @@ class TestNPUPlatform(TestBase):
)
@patch("vllm_ascend.utils.is_310p", return_value=False)
@patch("vllm_ascend.utils.update_default_aclgraph_sizes")
@patch("vllm_ascend.ascend_config.check_ascend_config")
@patch("vllm_ascend.ascend_config.init_ascend_config")
@patch(
@@ -337,7 +338,8 @@ class TestNPUPlatform(TestBase):
)
def test_check_and_update_config_unsupported_compilation_level(
self, mock_init_recompute, mock_init_ascend, mock_check_ascend,
mock_is_310p):
mock_update_default, mock_is_310p):
mock_update_default.return_value = MagicMock()
mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config(
)
vllm_config = TestNPUPlatform.mock_vllm_config()
@@ -410,6 +412,7 @@ class TestNPUPlatform(TestBase):
)
@patch("vllm_ascend.utils.is_310p", return_value=False)
@patch("vllm_ascend.utils.update_default_aclgraph_sizes")
@patch("vllm_ascend.ascend_config.check_ascend_config")
@patch("vllm_ascend.ascend_config.init_ascend_config")
@patch(
@@ -417,7 +420,8 @@ class TestNPUPlatform(TestBase):
)
def test_check_and_update_config_torchair_enabled_compilation(
self, mock_init_recompute, mock_init_ascend, mock_check_ascend,
mock_is_310p):
mock_update_default, mock_is_310p):
mock_update_default.return_value = MagicMock()
mock_ascend_config = TestNPUPlatform.mock_vllm_ascend_config()
mock_ascend_config.torchair_graph_config.enabled = True
mock_init_ascend.return_value = mock_ascend_config

View File

@@ -33,7 +33,8 @@ from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p,
prefill_context_parallel_enable,
update_aclgraph_sizes,
update_cudagraph_capture_sizes, vllm_version_is)
update_cudagraph_capture_sizes,
update_default_aclgraph_sizes, vllm_version_is)
if TYPE_CHECKING:
from vllm.config import ModelConfig, VllmConfig
@@ -193,6 +194,10 @@ class NPUPlatform(Platform):
# set cudaprah sizes before extending `compilation_config.splitting_ops`
vllm_config._set_cudagraph_sizes()
# There are cases where default cudagraph_capture_sizes are not friendly
# to ascend ops && hardwares. We update these sizes here to improve
# default performance.
update_default_aclgraph_sizes(vllm_config)
# TODO delete graph size update here when compilation_config.pass_config.enable_sequence_parallelism
# is supported by vllm-ascend.
if vllm_config.parallel_config.tensor_parallel_size > 1 and not vllm_config.model_config.enforce_eager and \

View File

@@ -413,6 +413,75 @@ def update_cudagraph_capture_sizes(vllm_config: VllmConfig,
vllm_config.compilation_config.post_init_cudagraph_sizes()
def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool:
    """Return True if the current cudagraph capture sizes are vLLM's defaults.

    Reconstructs the capture-size list vLLM itself would generate by default
    (the construction differs between vLLM 0.11.0 and later versions) and
    compares it against the sizes recorded on ``compilation_config``. Callers
    use this to decide whether it is safe to override the sizes with
    Ascend-friendly ones without clobbering a user-provided setting.
    """
    if vllm_version_is("0.11.0"):
        cuda_graph_sizes = vllm_config.scheduler_config.cuda_graph_sizes
        if len(cuda_graph_sizes) != 1:
            # Multiple entries can only come from an explicit user setting,
            # so these are not defaults. Returning early also avoids the
            # unbound-local error the fall-through comparison would hit.
            return False
        cudagraph_capture_sizes = [1, 2, 4] + [
            i for i in range(8, cuda_graph_sizes[0] + 1, 8)
        ]
    else:
        max_cudagraph_capture_size = \
            vllm_config.compilation_config.max_cudagraph_capture_size
        cudagraph_capture_sizes = [
            i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
        ]
        if max_cudagraph_capture_size >= 8:
            # Step size 8 for small batch sizes, up to 256 (not included)
            cudagraph_capture_sizes += list(
                range(8, min(max_cudagraph_capture_size + 1, 256), 8))
        if max_cudagraph_capture_size >= 256:
            # Step size 16 for larger batch sizes
            cudagraph_capture_sizes += list(
                range(256, max_cudagraph_capture_size + 1, 16))

    # vLLM stores capture sizes in descending order, hence the reverse sort
    # before comparing.
    return sorted(cudagraph_capture_sizes, reverse=True) == \
        vllm_config.compilation_config.cudagraph_capture_sizes
def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None:
    """Update the default ACL graph capture sizes so that the new sizes are
    more friendly to Ascend ops && hardware.

    This only fires when graph capture is enabled and the current capture
    sizes are exactly vLLM's defaults — a user-specified
    ``cudagraph_capture_size`` is never overridden.
    """
    if vllm_config.model_config is None or \
            vllm_config.model_config.enforce_eager or \
            not _is_default_capture_sizes(vllm_config):
        return

    # modify the default capture_sizes for Qwen3-MoE models on dp settings.
    # this is mainly because performance of _npu_paged_attention might degrade
    # on special shapes.
    # TODO(Angazenn): we will remove this once _npu_paged_attention is fully
    # replaced by npu_fused_infer_attention_score which does not contain such bugs.
    if vllm_config.model_config.hf_config.model_type == "qwen3_moe" \
            and vllm_config.parallel_config.tensor_parallel_size == 1 \
            and vllm_config.parallel_config.data_parallel_size > 1:
        if vllm_version_is("0.11.0"):
            max_capture_size = vllm_config.scheduler_config.cuda_graph_sizes[0]
        else:
            max_capture_size = \
                vllm_config.compilation_config.max_cudagraph_capture_size
        # Clamp the hand-picked small sizes to max_capture_size so we never
        # request a capture larger than the configured maximum.
        new_cudagraph_capture_sizes = [
            i for i in [1, 2, 5, 10, 15, 20] if i <= max_capture_size
        ] + list(range(24, max_capture_size + 1, 8))
        if vllm_version_is("0.11.0"):
            vllm_config.compilation_config.cudagraph_capture_sizes = \
                new_cudagraph_capture_sizes
            vllm_config.compilation_config.init_with_cudagraph_sizes(
                new_cudagraph_capture_sizes)
        else:
            update_cudagraph_capture_sizes(vllm_config,
                                           new_cudagraph_capture_sizes)
def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
"""Update ACL graph capture sizes based on hardware limitations"""
# NOTE: Currently, we can only capture 1800 graphs at most,