[v0.11.0-dev][misc]change default capture size for Qwen3-MoE when using full dp (#4205)

### What this PR does / why we need it? This is the dev version of #4199. Currently, the default `cudagraph_capture_size` in vLLM is `[1, 2, 4, 8, 16, 24, ..., max_capture_size]`. However, this is not always the best choice in every situation. This PR changes the default when running Qwen3-MoE in a full-DP setting (`dp_size > 1` && `tp_size == 1`), which is typically used for Large-Scale EP. old: `[1, 2, 4, 8, 16, 24, ..., max_capture_size]` new: `[1, 2, 5, 10, 15, 20, 24, ..., max_capture_size]` This is mainly because the performance of the `_npu_paged_attention` op degrades dramatically with the old settings. We hope to provide better performance when users do not set a specific `cudagraph_capture_size`. ### Does this PR introduce _any_ user-facing change? The default `cudagraph_capture_size` is modified in the above cases. However, if `cudagraph_capture_size` has already been set by the user, this PR has no effect on it. ### How was this patch tested? - vLLM version: v0.11.0 - vLLM main: 2918c1b49c --------- Signed-off-by: Angazenn <supperccell@163.com>
2025-11-21 11:19:11 +08:00
parent b6d59bdea2
commit 9c6d0b422c
2 changed files with 53 additions and 1 deletions
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -319,6 +319,53 @@ def get_max_hidden_layers(hf_config) -> int:
    return max(layer_counts)


+def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool:
+    """Tell whether the capture sizes match vLLM's default list.
+
+    The default list is only considered when
+    ``scheduler_config.cuda_graph_sizes`` holds exactly one value, which
+    acts as the upper bound: ``[1, 2, 4]`` followed by every multiple of 8
+    from 8 up to and including that bound. The list is compared, in
+    descending order, with ``compilation_config.cudagraph_capture_sizes``.
+
+    Args:
+        vllm_config: Config providing ``scheduler_config`` and
+            ``compilation_config``.
+
+    Returns:
+        True if the configured capture sizes equal the default list,
+        False otherwise (including when ``cuda_graph_sizes`` does not
+        contain exactly one entry).
+    """
+    graph_sizes = vllm_config.scheduler_config.cuda_graph_sizes
+    if len(graph_sizes) != 1:
+        return False
+
+    upper_bound = graph_sizes[0]
+    expected_sizes = sorted(
+        [1, 2, 4, *range(8, upper_bound + 1, 8)],
+        reverse=True,
+    )
+    configured_sizes = vllm_config.compilation_config.cudagraph_capture_sizes
+    return bool(expected_sizes == configured_sizes)
+
+
+def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None:
+    """Swap vLLM's default ACL graph capture sizes for Ascend-friendly ones.
+
+    Nothing is changed when there is no model config, when eager mode is
+    enforced, or when the configured capture sizes differ from vLLM's
+    default list (see ``_is_default_capture_sizes``).
+
+    At present only Qwen3-MoE models running with
+    ``tensor_parallel_size == 1`` and ``data_parallel_size > 1`` receive a
+    new list.
+
+    Args:
+        vllm_config: Config whose ``compilation_config`` is updated in place.
+    """
+    model_config = vllm_config.model_config
+    if model_config is None or model_config.enforce_eager \
+        or not _is_default_capture_sizes(vllm_config):
+        return
+
+    # Modify the default capture sizes for Qwen3-MoE models on dp settings.
+    # This is mainly because the performance of _npu_paged_attention might
+    # degrade on special shapes.
+    # TODO(Angazenn): we will remove this once _npu_paged_attention is fully
+    # replaced by npu_fused_infer_attention_score which does not contain
+    # such bugs.
+    parallel_config = vllm_config.parallel_config
+    is_qwen3_moe_full_dp = (
+        model_config.hf_config.model_type == "qwen3_moe"
+        and parallel_config.tensor_parallel_size == 1
+        and parallel_config.data_parallel_size > 1)
+    if not is_qwen3_moe_full_dp:
+        return
+
+    compilation_config = vllm_config.compilation_config
+    max_capture_size = vllm_config.scheduler_config.cuda_graph_sizes[0]
+
+    # The hard-coded small sizes must not introduce a capture size larger
+    # than anything in the list being replaced (e.g. 20 when the bound is
+    # 16). The current list is the non-empty default one at this point, so
+    # max() is safe.
+    largest_default_size = max(compilation_config.cudagraph_capture_sizes)
+    new_cudagraph_capture_sizes = [
+        size for size in (1, 2, 5, 10, 15, 20)
+        if size <= largest_default_size
+    ]
+    new_cudagraph_capture_sizes.extend(range(24, max_capture_size + 1, 8))
+
+    # Assign before re-initialising: init_with_cudagraph_sizes is called
+    # right after so that the compilation config is refreshed from the
+    # new list.
+    compilation_config.cudagraph_capture_sizes = new_cudagraph_capture_sizes
+    compilation_config.init_with_cudagraph_sizes(new_cudagraph_capture_sizes)
+
+
 def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
    """Update ACL graph capture sizes based on hardware limitations"""
    # NOTE: Currently, we can only capture 1800 graphs at most,