From b7df04de9ba1bade98ebd77f86f0853c98dd4855 Mon Sep 17 00:00:00 2001
From: lilinsiman
Date: Wed, 10 Sep 2025 22:50:48 +0800
Subject: [PATCH] debug_aclgraph_sizes_capture (#2827)

### What this PR does / why we need it?
1. Solved the problem that in the Qwen3 Moe model case, opening DP would use
   an extra stream, causing ACLgraph sizes capture error
2. After experimentation, it was found that in many cases, some operators
   would occupy more streams than expected. Therefore, the buffer area for
   streams in ACLgraph was not large enough. After discussion, extra 120
   streams were added as buffer.

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
ut

- vLLM version: main
- vLLM main: https://github.com/vllm-project/vllm/commit/0ae43dbf8cb28a299ae724fc742b0c5bcddea868

Signed-off-by: lilinsiman
---
 tests/ut/test_utils.py |  4 ++--
 vllm_ascend/utils.py   | 14 +++++++++++---
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/tests/ut/test_utils.py b/tests/ut/test_utils.py
index 99f821a..508bb2a 100644
--- a/tests/ut/test_utils.py
+++ b/tests/ut/test_utils.py
@@ -259,7 +259,7 @@ class TestUtils(TestBase):
             utils.update_aclgraph_sizes(test_vllm_config)
             del os.environ['HCCL_OP_EXPANSION_MODE']
         self.assertEqual(
-            147,
+            138,
             len(test_vllm_config.compilation_config.cudagraph_capture_sizes))
 
         test_vllm_config.speculative_config = mock.MagicMock()
@@ -272,7 +272,7 @@ class TestUtils(TestBase):
             utils.update_aclgraph_sizes(test_vllm_config)
             del os.environ['HCCL_OP_EXPANSION_MODE']
         self.assertEqual(
-            120,
+            112,
             len(test_vllm_config.compilation_config.cudagraph_capture_sizes))
 
         # max_num_batch_sizes >= len(original_sizes)
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index a61a5af..33e1e01 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -40,12 +40,13 @@ if TYPE_CHECKING:
 else:
     VllmConfig = None
 
-# NOTE: Currently, we can only capture 1920 graphs at most,
+# NOTE: Currently, we can only capture 1800 graphs at most,
 # due to the limitation of ACL graph. This number is bounded by
-# the number of streams, which is 2048, we save 128 streams
+# the number of streams, which is 2048, we save 248 streams
 # as a buffer.
 # Maximum number of graphs that can be captured by ACL Graph
-MAX_CAPTURE_SIZE = 1920
+# TODO: Find out whether we need to solve allreduce function
+MAX_CAPTURE_SIZE = 1800
 
 ASCEND_QUANTIZATION_METHOD = "ascend"
 SOC_VERSION_INFERENCE_SERIES = ["Ascend310P3"]
@@ -320,6 +321,8 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
     # TODO: Find out whether we need to take into account the pp_size
     parallel_factor = 1 + num_comm_groups + int(
         parallel_config.enable_expert_parallel)
+    if is_moe_model(vllm_config):
+        parallel_factor += (parallel_config.data_parallel_size > 1)
     # Calculate maximum supported batch sizes considering model architecture on the A2 Hardware Device
     # Assume the following case:
     # MAX_CAPTURE_SIZE = 1920, num_hidden_layers = 48, data_parallel_size is 1, tensor_parallel_size is 4,
@@ -583,3 +586,8 @@ def matmul_allreduce_enable() -> bool:
 
 def dense_optim_enable() -> bool:
     return envs_ascend.VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE
+
+
+def is_moe_model(vllm_config: VllmConfig):
+    config = vllm_config.model_config.hf_config
+    return any('experts' in key.lower() for key in config.to_dict())