diff --git a/docs/source/developer_guide/evaluation/profile_execute_duration.md b/docs/source/developer_guide/evaluation/profile_execute_duration.md
index 8989bf9..3b37df6 100644
--- a/docs/source/developer_guide/evaluation/profile_execute_duration.md
+++ b/docs/source/developer_guide/evaluation/profile_execute_duration.md
@@ -9,6 +9,11 @@ The execution duration of each stage (including pre/post-processing, model forwa
 * Use the non-blocking API `ProfileExecuteDuration().capture_async` to set observation points asynchronously when you need to observe the execution duration.
 * Use the blocking API `ProfileExecuteDuration().pop_captured_sync` at an appropriate time to get and print the execution durations of all observed stages.
 
+**We have instrumented the key inference stages (including pre-processing, model forward pass, etc.) for execute duration profiling. Execute the script as follows:**
+```
+VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE=1 python3 vllm-ascend/examples/offline_inference_npu.py
+```
+
 ## Example Output
 
 ```
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 51ffe1e..9b0a860 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1317,8 +1317,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 for tag, duration in durations.items()
             ]
             captured_name = "Decode" if self.attn_state == AscendAttentionState.DecodeOnly else "Prefill"
-            print(f"Profile execute duration [{captured_name}]:",
-                  " ".join(dr_str))
+            logger.info("Profile execute duration [%s]:%s", captured_name,
+                        " ".join(dr_str))
 
         return model_runner_output
 