diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 3b5f2812..6848a697 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -192,6 +192,7 @@ class ExecuteModelState(NamedTuple): attn_metadata: "PerLayerAttnMetadata" positions: torch.Tensor ec_connector_output: "ECConnectorOutput | None" + cudagraph_stats: CUDAGraphStat | None class NPUModelRunner(GPUModelRunner): @@ -1353,6 +1354,7 @@ class NPUModelRunner(GPUModelRunner): attn_metadata, positions, ec_connector_output, + cudagraph_stats, ) self.kv_connector_output = kv_connector_output return None @@ -1389,6 +1391,7 @@ class NPUModelRunner(GPUModelRunner): attn_metadata, positions, ec_connector_output, + cudagraph_stats, ) = self.execute_model_state # Clear ephemeral state. self.execute_model_state = None @@ -1466,6 +1469,7 @@ class NPUModelRunner(GPUModelRunner): ec_connector_output=ec_connector_output if self.supports_mm_inputs else None, + cudagraph_stats=cudagraph_stats, ) durations = ProfileExecuteDuration().pop_captured_sync()