diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index 6de4cd1..d4286bd 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -90,7 +90,7 @@ # # * Worker Patch: # =============== -# ** File: worker/patch_common/patch_metrics.py ** +# ** File: worker/patch_0_8_4/patch_metrics.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.spec_decode.metrics.AsyncMetricsCollector.init_tensors` and # `vllm.spec_decode.metrics.AsyncMetricsCollector._copy_rejsample_metrics_async` @@ -104,7 +104,9 @@ # Future Plan: # Revert it when the related pr is merged in vllm. # -# 2. `vllm.spec_decode.metrics.AsyncMetricsCollector.maybe_collect_rejsample_metrics` +# ** File: worker/patch_common/patch_metrics.py ** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.spec_decode.metrics.AsyncMetricsCollector.maybe_collect_rejsample_metrics` # Why: # There are cuda hard code (current_platform.is_cuda_alike()) in # `AsyncMetricsCollector.maybe_collect_rejsample_metrics` diff --git a/vllm_ascend/patch/worker/patch_0_8_4/__init__.py b/vllm_ascend/patch/worker/patch_0_8_4/__init__.py index 2ed088b..2465b3f 100644 --- a/vllm_ascend/patch/worker/patch_0_8_4/__init__.py +++ b/vllm_ascend/patch/worker/patch_0_8_4/__init__.py @@ -13,4 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# \ No newline at end of file +# + +import vllm_ascend.patch.worker.patch_0_8_4.patch_metrics # noqa diff --git a/vllm_ascend/patch/worker/patch_0_8_4/patch_metrics.py b/vllm_ascend/patch/worker/patch_0_8_4/patch_metrics.py new file mode 100644 index 0000000..4ba223f --- /dev/null +++ b/vllm_ascend/patch/worker/patch_0_8_4/patch_metrics.py @@ -0,0 +1,60 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Callable, Optional, Union + +import torch +import torch_npu +from vllm.spec_decode.metrics import (AsyncMetricsCollector, + SpecDecodeWorkerMetrics) + +Timer = Callable[[], float] + +# TODO: Revert this patch once the hard-coded CUDA checks are removed from vLLM. +# init_tensors: checks for 'npu' instead of 'cuda'; stream comes from torch_npu. +# maybe_collect_rejsample_metrics: drops the current_platform.is_cuda_alike() check. + + +def init_tensors(self, + rank: int, + device_type: Union[torch.device, str] = 'npu') -> None: + self._rank = rank + if isinstance(device_type, torch.device): + device_type = device_type.type + if device_type == 'npu': + self._copy_stream = torch_npu.npu.Stream() + + +def maybe_collect_rejsample_metrics( + self, k: int) -> Optional[SpecDecodeWorkerMetrics]: + + # A copy started by an earlier call is pending: clear it, then collect and return. + if self._in_flight_copy is not None: + ready_event = self._in_flight_copy + self._in_flight_copy = None + return self._collect_rejsample_metrics(k, ready_event) + + # Nothing in flight: start a new async copy if the collector says it should. 
+ if self._should_collect_rejsample_metrics(self._timer()): + assert self._in_flight_copy is None + self._in_flight_copy = self._copy_rejsample_metrics_async() + + return None + + +AsyncMetricsCollector.init_tensors = init_tensors +AsyncMetricsCollector.maybe_collect_rejsample_metrics = maybe_collect_rejsample_metrics diff --git a/vllm_ascend/patch/worker/patch_common/patch_metrics.py b/vllm_ascend/patch/worker/patch_common/patch_metrics.py index 685755f..6d1f2dc 100644 --- a/vllm_ascend/patch/worker/patch_common/patch_metrics.py +++ b/vllm_ascend/patch/worker/patch_common/patch_metrics.py @@ -15,46 +15,13 @@ # limitations under the License. # -from typing import Callable, Optional, Union +from typing import Callable import torch -import torch_npu -from vllm.spec_decode.metrics import (AsyncMetricsCollector, - SpecDecodeWorkerMetrics) +from vllm.spec_decode.metrics import AsyncMetricsCollector Timer = Callable[[], float] -# TODO: revert this patch when the cuda hard code is removed in vllm -# init_tensors: Modified the hard-coded cuda judgment logic to npu; -# maybe_collect_rejsample_metrics: Removed the check for current_platform.is_cuda_alike() - - -def init_tensors(self, - rank: int, - device_type: Union[torch.device, str] = 'npu') -> None: - self._rank = rank - if isinstance(device_type, torch.device): - device_type = device_type.type - if device_type == 'npu': - self._copy_stream = torch_npu.npu.Stream() - - -def maybe_collect_rejsample_metrics( - self, k: int) -> Optional[SpecDecodeWorkerMetrics]: - - # If a copy was initiated in the previous call, collect and return. - if self._in_flight_copy is not None: - ready_event = self._in_flight_copy - self._in_flight_copy = None - return self._collect_rejsample_metrics(k, ready_event) - - # Otherwise, check if we should start a new copy. 
- if self._should_collect_rejsample_metrics(self._timer()): - assert self._in_flight_copy is None - self._in_flight_copy = self._copy_rejsample_metrics_async() - - return None - def _copy_rejsample_metrics_async(self) -> torch.npu.Event: """ @@ -83,6 +50,4 @@ def _copy_rejsample_metrics_async(self) -> torch.npu.Event: return aggregate_metrics_ready -AsyncMetricsCollector.init_tensors = init_tensors -AsyncMetricsCollector.maybe_collect_rejsample_metrics = maybe_collect_rejsample_metrics AsyncMetricsCollector._copy_rejsample_metrics_async = _copy_rejsample_metrics_async