[BugFix] Fix ACLgraph bug in Qwen3_32b_int8 case (#3204)

### What this PR does / why we need it?
1. Fixed the issue where sizes capture failed for the Qwen3-32b-int8 model when aclgraph, dp1, and tp4 were enabled.
2. Added an error log, emitted when sizes capture fails, that explains the likely cause and provides recommended solutions; the original exception is then re-raised.
3. Added this common problem to the FAQ doc.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Unit tests (UT).

- vLLM version: v0.10.2
- vLLM main: https://github.com/vllm-project/vllm/commit/releases/v0.11.0

Signed-off-by: lilinsiman <lilinsiman@gmail.com>
2025-09-28 17:44:04 +08:00
parent a86ece5e39
commit 1705501ae2
4 changed files with 47 additions and 14 deletions
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -3418,10 +3418,23 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                aclgraph_runtime_mode = aclgraph_mode.mixed_mode()

                compilation_cases = list(reversed(self.aclgraph_batch_sizes))
-                self._capture_aclgraphs(
-                    compilation_cases,
-                    aclgraph_runtime_mode=aclgraph_runtime_mode,
-                    uniform_decode=False)
+
+                try:
+                    self._capture_aclgraphs(
+                        compilation_cases,
+                        aclgraph_runtime_mode=aclgraph_runtime_mode,
+                        uniform_decode=False)
+                except Exception as e:
+                    logger.error(
+                        f"ACLgraph sizes capture fail: {type(e).__name__}:\n"
+                        "ACLgraph has insufficient available streams to capture the configured number of sizes. "
+                        "Please verify both the availability of adequate streams and the appropriateness of the configured size count.\n\n"
+                        "Recommended solutions:\n"
+                        "1. Manually configure the compilation_config parameter "
+                        "with a reduced set of sizes: '{\"cudagraph_capture_sizes\":[size1, size2, size3, ...]}'.\n"
+                        "2. Utilize ACLgraph's full graph mode as an alternative to the piece-wise approach.\n\n"
+                        f"{str(e)}")
+                    raise

            if aclgraph_mode.decode_mode() == CUDAGraphMode.FULL and \
                aclgraph_mode.separate_routine():