[BugFix] Fix ACLgraph bug in Qwen3_32b_int8 case (#3204)

### What this PR does / why we need it? 1. Fixes the issue where capturing the graph sizes failed for the Qwen3-32b-int8 model when aclgraph, dp1, and tp4 were enabled. 2. Raises an exception when sizes capture fails and provides a solution. 3. Adds this common problem to the FAQ doc. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? UT. - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/releases/v0.11.0 Signed-off-by: lilinsiman <lilinsiman@gmail.com>
2025-09-28 17:44:04 +08:00
parent a86ece5e39
commit 1705501ae2
4 changed files with 47 additions and 14 deletions
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -40,14 +40,6 @@ if TYPE_CHECKING:
 else:
    VllmConfig = None

-# NOTE: Currently, we can only capture 1800 graphs at most,
-# due to the limitation of ACL graph. This number is bounded by
-# the number of streams, which is 2048, we save 248 streams
-# as a buffer.
-# Maximum number of graphs that can be captured by ACL Graph
-# TODO: Find out whether we need to solve allreduce function
-MAX_CAPTURE_SIZE = 1800
-
 ASCEND_QUANTIZATION_METHOD = "ascend"
 SOC_VERSION_INFERENCE_SERIES = ["Ascend310P3"]
 REGISTERED_ASCEND_OPS = {}
@@ -293,6 +285,14 @@ def get_max_hidden_layers(hf_config) -> int:

 def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
    """Update ACL graph capture sizes based on hardware limitations"""
+    # NOTE: Currently, we can only capture 1800 graphs at most,
+    # due to the limitation of ACL graph. This number is bounded by
+    # the number of streams, which is 2048, we save 248 streams
+    # as a buffer.
+    # Maximum number of graphs that can be captured by ACL Graph
+    # TODO: Find out whether we need to solve allreduce function
+    MAX_CAPTURE_SIZE = 1800
+
    # Store original configuration and temporarily clear it
    compilation_config = vllm_config.compilation_config
    original_sizes, compilation_config.cudagraph_capture_sizes = \
@@ -326,6 +326,11 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
                    "multistream_overlap_shared_expert", False))
        if is_moe_model(vllm_config):
            parallel_factor += (parallel_config.data_parallel_size > 1)
+        else:
+            # When AIV mode is enabled, the allreduce operator of a dense
+            # model occupies additional streams, so reserve a buffer for them here.
+            MAX_CAPTURE_SIZE = MAX_CAPTURE_SIZE - parallel_factor * resources_per_graph
+
        # Calculate maximum supported batch sizes considering model architecture on the A2 Hardware Device
        # Assume the following case:
        # MAX_CAPTURE_SIZE = 1920, num_hidden_layers = 48, data_parallel_size is 1, tensor_parallel_size is 4,