From ac963f1519959f61074359926efaf2f4e4fc19fb Mon Sep 17 00:00:00 2001 From: Yizhou <136800916+yiz-liu@users.noreply.github.com> Date: Wed, 28 Jan 2026 16:34:20 +0800 Subject: [PATCH] [Fix] Adds CUDA graph stats to execution state (#6331) ### What this PR does / why we need it? Adds a CUDA graph profiling stats field to the execution state and updates the NPU model runner to set, unpack, and forward those stats during execution. This preserves CUDA graph metrics across state transitions, improving observability for later use and diagnostics. ### Does this PR introduce _any_ user-facing change? Enable this by setting ```python llm = LLM( ... disable_log_stats=False, cudagraph_metrics=True, ... ) ``` or pass `--cudagraph-metrics`, and make sure not to disable log stats. After this, you should be able to see something like this, which is really helpful for some light debugging: ``` [loggers.py:257] Engine 000: Avg prompt throughput: 32.3 tokens/s, Avg generation throughput: 114.4 tokens/s, Running: 4 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.1%, Prefix cache hit rate: 0.0% [cuda_graph.py:117] **CUDAGraph Config Settings:** [cuda_graph.py:117] [cuda_graph.py:117] - Mode: FULL_DECODE_ONLY [cuda_graph.py:117] - Capture sizes: [1, 2, 4, 8, 16, 24, 32] [cuda_graph.py:117] [cuda_graph.py:117] **CUDAGraph Stats:** [cuda_graph.py:117] [cuda_graph.py:117] | Unpadded Tokens | Padded Tokens | Num Paddings | Runtime Mode | Count | [cuda_graph.py:117] |-----------------|---------------|--------------|--------------|-------| [cuda_graph.py:117] | 4 | 4 | 0 | FULL | 18 | [cuda_graph.py:117] | 5 | 5 | 0 | NONE | 1 | [cuda_graph.py:117] | 1 | 1 | 0 | FULL | 1 | [cuda_graph.py:117] | 18 | 18 | 0 | NONE | 1 | ``` ### How was this patch tested? None. 
- vLLM version: v0.14.1 - vLLM main: https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd Signed-off-by: Yizhou Liu --- vllm_ascend/worker/model_runner_v1.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 3b5f2812..6848a697 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -192,6 +192,7 @@ class ExecuteModelState(NamedTuple): attn_metadata: "PerLayerAttnMetadata" positions: torch.Tensor ec_connector_output: "ECConnectorOutput | None" + cudagraph_stats: CUDAGraphStat | None class NPUModelRunner(GPUModelRunner): @@ -1353,6 +1354,7 @@ class NPUModelRunner(GPUModelRunner): attn_metadata, positions, ec_connector_output, + cudagraph_stats, ) self.kv_connector_output = kv_connector_output return None @@ -1389,6 +1391,7 @@ class NPUModelRunner(GPUModelRunner): attn_metadata, positions, ec_connector_output, + cudagraph_stats, ) = self.execute_model_state # Clear ephemeral state. self.execute_model_state = None @@ -1466,6 +1469,7 @@ class NPUModelRunner(GPUModelRunner): ec_connector_output=ec_connector_output if self.supports_mm_inputs else None, + cudagraph_stats=cudagraph_stats, ) durations = ProfileExecuteDuration().pop_captured_sync()