diff --git a/.github/workflows/scripts/config.yaml b/.github/workflows/scripts/config.yaml index 61327fff..87000b0d 100644 --- a/.github/workflows/scripts/config.yaml +++ b/.github/workflows/scripts/config.yaml @@ -33,8 +33,6 @@ e2e-singlecard: estimated_time: 300 - name: tests/e2e/singlecard/test_multistream_overlap_shared_expert.py estimated_time: 200 - - name: tests/e2e/singlecard/test_profile_execute_duration.py - estimated_time: 10 - name: tests/e2e/singlecard/test_quantization.py estimated_time: 200 - name: tests/e2e/singlecard/test_sampler.py diff --git a/docs/source/developer_guide/performance_and_debug/index.md b/docs/source/developer_guide/performance_and_debug/index.md index e0eec55d..c4981cec 100644 --- a/docs/source/developer_guide/performance_and_debug/index.md +++ b/docs/source/developer_guide/performance_and_debug/index.md @@ -4,7 +4,6 @@ :caption: Performance and Debug :maxdepth: 1 performance_benchmark -profile_execute_duration optimization_and_tuning service_profiling_guide msprobe_guide diff --git a/docs/source/developer_guide/performance_and_debug/profile_execute_duration.md b/docs/source/developer_guide/performance_and_debug/profile_execute_duration.md deleted file mode 100644 index 1e01dc42..00000000 --- a/docs/source/developer_guide/performance_and_debug/profile_execute_duration.md +++ /dev/null @@ -1,41 +0,0 @@ -# Profile Execute Duration - -The execution duration of each stage (including pre/post-processing, model forward, etc.) usually needs to be captured during a complete inference process. Typically, this is done by using `torch.npu.synchronize()` and obtaining CPU timestamps, which increases the performance overhead of host/device synchronization. - -**To reduce the performance overhead, we add this feature, using the NPU event timestamp mechanism to observe the device execution time asynchronously.** - -## Usage - -* Use the environment variable `VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE` to enable this feature. -* Use the non-blocking API `ProfileExecuteDuration().capture_async` to set observation points asynchronously when you need to observe the execution duration. -* Use the blocking API `ProfileExecuteDuration().pop_captured_sync` at an appropriate time to get and print the execution durations of all observed stages. - -**We have instrumented the key inference stages (including pre-processing, model forward pass, etc.) for execution duration profiling. 
Execute the script as follows:** - -```shell -VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE=1 python3 vllm-ascend/examples/offline_inference_npu.py -``` - -## Example Output - -```shell -5691:(IntegratedWorker pid=1502285) Profile execute duration [Decode]: [post process]:14.17ms [prepare input and forward]:9.57ms [forward]:4.14ms -5695:(IntegratedWorker pid=1502285) Profile execute duration [Decode]: [post process]:14.29ms [prepare input and forward]:10.19ms [forward]:4.14ms -5697:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.81ms [prepare input and forward]:10.29ms [forward]:3.99ms -5701:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.10ms [prepare input and forward]:10.62ms [forward]:4.33ms -5705:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.65ms [prepare input and forward]:9.58ms [forward]:4.20ms -5709:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.43ms [prepare input and forward]:9.88ms [forward]:4.20ms -5711:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post process]:14.89ms [prepare input and forward]:10.49ms [forward]:4.19ms -5715:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post process]:14.14ms [prepare input and forward]:11.21ms [forward]:4.18ms -5719:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post process]:14.71ms [prepare input and forward]:10.15ms [forward]:4.42ms -5723:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post process]:14.62ms [prepare input and forward]:10.31ms [forward]:4.25ms -5725:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:14.12ms [prepare input and forward]:10.33ms [forward]:4.24ms -5729:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:14.58ms [prepare input and forward]:10.85ms [forward]:4.32ms -5733:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:14.32ms [prepare input and forward]:9.79ms [forward]:4.28ms -5737:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:15.06ms [prepare input and forward]:9.89ms [forward]:4.32ms -5739:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:14.62ms [prepare input and forward]:10.48ms [forward]:4.27ms -5743:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:14.60ms [prepare input and forward]:10.71ms [forward]:4.61ms -5747:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:14.21ms [prepare input and forward]:10.10ms [forward]:4.52ms -5751:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:15.03ms [prepare input and forward]:10.00ms [forward]:4.42ms - -``` diff --git a/tests/e2e/singlecard/test_profile_execute_duration.py b/tests/e2e/singlecard/test_profile_execute_duration.py deleted file mode 100644 index 465db7d7..00000000 --- a/tests/e2e/singlecard/test_profile_execute_duration.py +++ /dev/null @@ -1,71 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import gc -import os -import time -from unittest.mock import patch - -import torch -import vllm # noqa: F401 - -from vllm_ascend.utils import ProfileExecuteDuration - - -@patch.dict(os.environ, {"VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE": "1"}) -def test_execue_duration_enabled_discrepancy(): - a = torch.randn(10000, 10000).npu() - b = torch.randn(10000, 10000).npu() - - # warmup - torch.matmul(a, b) - torch.npu.synchronize() - - cpu_start = time.perf_counter() - with ProfileExecuteDuration().capture_async("forward"): - torch.matmul(a, b) - torch.npu.synchronize() - cpu_duration = (time.perf_counter() - cpu_start) * 1000 - npu_durations = ProfileExecuteDuration().pop_captured_sync() - assert npu_durations and 'forward' in npu_durations - assert not ProfileExecuteDuration._observations - - # Assert discrepancy between CPU and NPU duration is within 50% roughly - diff = abs(cpu_duration - npu_durations['forward']) / max( - cpu_duration, npu_durations['forward']) - assert diff <= 0.5, ( - f"CPU={cpu_duration:.2f}ms, NPU={npu_durations['forward']:.2f}ms") - - gc.collect() - torch.npu.empty_cache() - torch.npu.reset_peak_memory_stats() - - -def test_execue_duration_disabled(): - a = torch.randn(100, 100).npu() - b = torch.randn(100, 100).npu() - - with ProfileExecuteDuration().capture_async("forward"): - torch.matmul(a, b) - torch.npu.synchronize() - npu_durations = ProfileExecuteDuration().pop_captured_sync() - assert not npu_durations - - gc.collect() - torch.npu.empty_cache() - torch.npu.reset_peak_memory_stats() diff --git a/tests/ut/test_utils.py b/tests/ut/test_utils.py index 6219e686..9e4bd623 100644 --- a/tests/ut/test_utils.py +++ b/tests/ut/test_utils.py @@ -246,56 +246,3 @@ class TestUtils(TestBase): utils.register_ascend_customop() self.assertEqual(mock_customop.register_oot.call_count, len(REGISTERED_ASCEND_OPS)) - - -class TestProfileExecuteDuration(TestBase): - - def setUp(self): - utils.ProfileExecuteDuration._instance = None - utils.ProfileExecuteDuration._observations = [] - utils.ProfileExecuteDuration._lock = Lock() - - def test_singleton_creation(self): - instance1 = utils.ProfileExecuteDuration() - self.assertIsNotNone(instance1) - self.assertIs(instance1, utils.ProfileExecuteDuration._instance) - - instance2 = utils.ProfileExecuteDuration() - self.assertIs(instance1, instance2) - - def test_thread_safety(self): - from threading import Thread - - instances = [] - - def create_instance(): - instances.append(utils.ProfileExecuteDuration()) - - threads = [Thread(target=create_instance) for _ in range(10)] - for t in threads: - t.start() - for t in threads: - t.join() - - first_instance = instances[0] - for instance in instances[1:]: - self.assertIs(first_instance, instance) - - def test_atexit_registration(self): - with mock.patch('atexit.register') as mock_register: - instance = utils.ProfileExecuteDuration() - mock_register.assert_called_once_with(instance.destroy) - - def test_lock_usage(self): - original_lock = utils.ProfileExecuteDuration._lock - - with mock.patch.object(utils.ProfileExecuteDuration, - '_lock', - wraps=original_lock) as mock_lock: - 
utils.ProfileExecuteDuration() - mock_lock.__enter__.assert_called() - mock_lock.__exit__.assert_called() - - def test_observations_initialization(self): - instance = utils.ProfileExecuteDuration() - self.assertEqual(instance._observations, []) diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py index 8298a5f5..3e5c4cc7 100644 --- a/vllm_ascend/envs.py +++ b/vllm_ascend/envs.py @@ -60,11 +60,6 @@ env_variables: dict[str, Callable[[], Any]] = { # In this case, developers need to set this value to "0.9.0" to make sure # that the correct package is installed. "VLLM_VERSION": lambda: os.getenv("VLLM_VERSION", None), - # Whether to enable the model execute time observe profile. Disable it when - # running vllm ascend in production environment. - "VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE": lambda: bool( - int(os.getenv("VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE", "0")) - ), # Some models are optimized by vllm ascend. While in some case, e.g. rlhf # training, the optimized model may not be suitable. In this case, set this # value to False to disable the optimized model. diff --git a/vllm_ascend/profiling_config.py b/vllm_ascend/profiling_config.py index f5c25dfc..f4ac3c96 100644 --- a/vllm_ascend/profiling_config.py +++ b/vllm_ascend/profiling_config.py @@ -98,10 +98,6 @@ SERVICE_PROFILING_SYMBOLS_YAML = """ name: _prepare_inputs domain: ModelExecute -- symbol: vllm_ascend.utils:ProfileExecuteDuration.capture_async - min_version: "0.9.1" - handler: msserviceprofiler.vllm_profiler.vllm_v1.model_hookers:capture_async - # ===== Request Lifecycle ===== - symbol: vllm.v1.engine.async_llm:AsyncLLM.add_request min_version: "0.9.1" diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py index 314d745e..173d37ee 100644 --- a/vllm_ascend/spec_decode/mtp_proposer.py +++ b/vllm_ascend/spec_decode/mtp_proposer.py @@ -10,6 +10,7 @@ from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.spec_decode.eagle import PADDING_SLOT_ID +from vllm.v1.utils import record_function_or_nullcontext from vllm_ascend.ascend_forward_context import set_ascend_forward_context from vllm_ascend.attention.attention_v1 import AscendAttentionState @@ -17,7 +18,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata from vllm_ascend.compilation.acl_graph import ACLGraphWrapper from vllm_ascend.ops.rotary_embedding import get_cos_and_sin_mla from vllm_ascend.spec_decode.eagle_proposer import EagleProposer -from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable, vllm_version_is +from vllm_ascend.utils import lmhead_tp_enable, vllm_version_is class MtpProposer(EagleProposer): @@ -311,7 +312,7 @@ class MtpProposer(EagleProposer): batch_descriptor=batch_descriptor, num_actual_tokens=num_tokens, is_draft_model=True): - with ProfileExecuteDuration().capture_async('mtp_forward'): + with record_function_or_nullcontext('mtp_forward'): model_kwargs = {} model_kwargs["attn_metadata"] = attn_metadata input_ids = self.input_ids[:num_input_tokens] diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 3d8f6abe..f45d1440 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -23,7 +23,7 @@ import atexit import functools import math import os -from contextlib import contextmanager, nullcontext +from contextlib import nullcontext from enum import Enum from functools import lru_cache from threading import Lock @@ -32,7 +32,6 @@ from 
typing import TYPE_CHECKING, Any import torch import torch_npu # noqa: F401 from packaging.version import InvalidVersion, Version -from torch_npu.npu.streams import Event from vllm.logger import logger from vllm.sequence import IntermediateTensors @@ -562,53 +561,6 @@ def dispose_tensor(x: torch.Tensor): x.set_(torch.empty((0,), device=x.device, dtype=x.dtype)) -class ProfileExecuteDuration: - _instance = None - _observations: list[tuple[str, Event, Event]] = [] - _lock = Lock() - - def __new__(cls): - with cls._lock: - if cls._instance is None: - cls._instance = super().__new__(cls) - atexit.register(cls._instance.destroy) - return cls._instance - - def destroy(self): - with self._lock: - self._observations.clear() - - @contextmanager - def capture_async(self, duration_tag: str): - if not envs_ascend.VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE: - yield - return - - observe_start = Event(enable_timing=True) - observe_start.record() - try: - yield - finally: - observe_end = Event(enable_timing=True) - observe_end.record() - with self._lock: - self._observations.append((duration_tag, observe_start, observe_end)) - - def pop_captured_sync(self) -> dict: - """Pop and synchronize all events in the observation list""" - durations: dict[str, float] = {} - if not envs_ascend.VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE: - return durations - - while self._observations: - with self._lock: - tag, observe_start, observe_end = self._observations.pop() - observe_end.synchronize() - durations[tag] = observe_start.elapsed_time(observe_end) - - return durations - - def register_ascend_customop(vllm_config: VllmConfig | None = None): """Register Ascend CustomOP diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 26286dfe..df92ecdd 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -72,6 +72,7 @@ from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.ngram_proposer import NgramProposer from vllm.v1.spec_decode.suffix_decoding import SuffixDecodingProposer from vllm.v1.structured_output.utils import apply_grammar_bitmask +from vllm.v1.utils import record_function_or_nullcontext from vllm.v1.worker.gpu_model_runner import (AsyncGPUModelRunnerOutput, GPUModelRunner) from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorOutput @@ -104,11 +105,11 @@ from vllm_ascend.spec_decode import get_spec_decode_method from vllm_ascend.spec_decode.eagle_proposer import EagleProposer from vllm_ascend.spec_decode.medusa_proposer import MedusaProposer from vllm_ascend.spec_decode.mtp_proposer import MtpProposer -from vllm_ascend.utils import (AscendDeviceType, ProfileExecuteDuration, +from vllm_ascend.utils import (AscendDeviceType, enable_sp, get_ascend_device_type, is_drafter_moe_model, is_moe_model, lmhead_tp_enable, maybe_trans_nz, - set_weight_prefetch_method, vllm_version_is) + set_weight_prefetch_method) from vllm_ascend.worker.npu_input_batch import NPUInputBatch from vllm_ascend.worker.pcp_utils import PCPManager @@ -1104,7 +1105,7 @@ class NPUModelRunner(GPUModelRunner): ): scheduler_output = deepcopy(scheduler_output) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens - with ProfileExecuteDuration().capture_async("prepare input"): + with record_function_or_nullcontext("prepare input"): with self.synchronize_input_prep(): # Update persistent batch states. 
self._update_states(scheduler_output) @@ -1268,7 +1269,7 @@ class NPUModelRunner(GPUModelRunner): ) # Run forward pass - with ProfileExecuteDuration().capture_async("forward"): + with record_function_or_nullcontext("forward"): with ( set_ascend_forward_context( attn_metadata, @@ -1286,7 +1287,7 @@ class NPUModelRunner(GPUModelRunner): hidden_states = self._model_forward( num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs) - with (ProfileExecuteDuration().capture_async("post process")): + with record_function_or_nullcontext("post process"): if self.pcp_size > 1: # NOTE we must `slice` hidden_states because pcp_allgather_restore_idx # ignores the padding from CUDA Graph. @@ -1408,7 +1409,7 @@ class NPUModelRunner(GPUModelRunner): self.input_batch, logits) logits = logits.to(self.device).to(logits_dtype) - with ProfileExecuteDuration().capture_async("Sample"): + with record_function_or_nullcontext("sample_token"): sampler_output = self._sample(logits, spec_decode_metadata) def propose_draft_token_ids(sampled_token_ids): @@ -1444,7 +1445,7 @@ class NPUModelRunner(GPUModelRunner): spec_decode_metadata, ) - with ProfileExecuteDuration().capture_async("Draft"): + with record_function_or_nullcontext("draft_token"): if self.speculative_config: use_padded_batch_for_eagle = self.speculative_config and \ self.speculative_config.use_eagle() and \ @@ -1474,15 +1475,6 @@ class NPUModelRunner(GPUModelRunner): cudagraph_stats=cudagraph_stats, ) - durations = ProfileExecuteDuration().pop_captured_sync() - if durations: - dr_str = [ - f"[{tag}]:{duration:.2f}ms" - for tag, duration in durations.items() - ] - captured_name = "Decode" if self.attn_state == AscendAttentionState.DecodeOnly else "Prefill" - logger.info("Profile execute duration [%s]:%s", captured_name, - " ".join(dr_str)) if self.dynamic_eplb: self.eplb_updator.forward_end()
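
For reference, a minimal sketch of the context-manager pattern the call sites above switch to. This is illustrative only: the real helper is imported from `vllm.v1.utils`, and its gating condition may differ; `PROFILING_ENABLED` here is a hypothetical stand-in, not an actual vLLM flag.

```python
# Illustrative sketch of a record_function-or-nullcontext helper, assuming it
# wraps torch.profiler.record_function when profiling is active and otherwise
# degrades to a no-op, so call sites can stay unconditional.
from contextlib import nullcontext

import torch

PROFILING_ENABLED = False  # hypothetical switch, not a real vLLM setting


def record_function_or_nullcontext(name: str):
    """Return a torch.profiler.record_function scope when profiling is on,
    otherwise a no-op context manager."""
    if PROFILING_ENABLED:
        return torch.profiler.record_function(name)
    return nullcontext()


# Usage mirroring the call sites in the hunks above:
with record_function_or_nullcontext("forward"):
    pass  # the model forward pass would run here
```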