[CI] upgrade vllm to 0.8.5 (#715)

1. Upgrade vllm to 0.8.5
2. Drop 0.8.4 support
3. Keep the docs pinned to 0.8.4rc2 until 0.8.5 is released

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Author: wangxiyuan
Date: 2025-04-30 09:15:50 +08:00
Committed by: GitHub
Parent: 95e7aa4736
Commit: f8350569e6
20 changed files with 48 additions and 579 deletions


@@ -18,8 +18,8 @@
 from vllm_ascend.utils import vllm_version_is

 # Import specific patches for different versions
-if vllm_version_is("0.8.4"):
-    from vllm_ascend.patch.worker import patch_0_8_4  # noqa: F401
+if vllm_version_is("0.8.5"):
+    from vllm_ascend.patch.worker import patch_0_8_5  # noqa: F401
     from vllm_ascend.patch.worker import patch_common  # noqa: F401
 else:
     from vllm_ascend.patch.worker import patch_common  # noqa: F401

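For reference, the gate above dispatches on the installed vllm release string. A minimal sketch of such a check follows; the real vllm_ascend.utils.vllm_version_is may differ in detail (for example, in how pre-release suffixes like rc2 are compared):

import vllm


def vllm_version_is(target: str) -> bool:
    # Sketch only: compare the installed vllm release against a target
    # such as "0.8.5". An exact-equality check is assumed here, which is
    # not necessarily how the actual helper is written.
    return vllm.__version__ == target
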

@@ -1,59 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Callable, Optional, Union

import torch
from vllm.spec_decode.metrics import (AsyncMetricsCollector,
                                      SpecDecodeWorkerMetrics)

Timer = Callable[[], float]


# TODO: revert this patch when the cuda hard code is removed in vllm
# init_tensors: Modified the hard-coded cuda judgment logic to npu;
# maybe_collect_rejsample_metrics: Removed the check for current_platform.is_cuda_alike()
def init_tensors(self,
                 rank: int,
                 device_type: Union[torch.device, str] = 'npu') -> None:
    self._rank = rank
    if isinstance(device_type, torch.device):
        device_type = device_type.type
    if device_type == 'npu':
        self._copy_stream = torch.npu.Stream()


def maybe_collect_rejsample_metrics(
        self, k: int) -> Optional[SpecDecodeWorkerMetrics]:
    # If a copy was initiated in the previous call, collect and return.
    if self._in_flight_copy is not None:
        ready_event = self._in_flight_copy
        self._in_flight_copy = None
        return self._collect_rejsample_metrics(k, ready_event)

    # Otherwise, check if we should start a new copy.
    if self._should_collect_rejsample_metrics(self._timer()):
        assert self._in_flight_copy is None
        self._in_flight_copy = self._copy_rejsample_metrics_async()

    return None


AsyncMetricsCollector.init_tensors = init_tensors
AsyncMetricsCollector.maybe_collect_rejsample_metrics = maybe_collect_rejsample_metrics

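The deleted file above uses the standard monkey-patch pattern: define a module-level replacement with the same signature, then assign it over the method on the imported class so every existing call site picks up the new behavior at import time. A self-contained illustration of that pattern (Collector and npu_init_tensors are hypothetical names, not vllm APIs):

class Collector:

    def init_tensors(self, rank: int, device_type: str = 'cuda') -> None:
        print(f'original: rank={rank}, device={device_type}')


def npu_init_tensors(self, rank: int, device_type: str = 'npu') -> None:
    # Replacement body: same signature, npu default instead of cuda.
    print(f'patched: rank={rank}, device={device_type}')


# Applied once at import time, like the AsyncMetricsCollector patch above.
Collector.init_tensors = npu_init_tensors
Collector().init_tensors(0)  # prints: patched: rank=0, device=npu
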

@@ -1,30 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker


def _configure_model_sampler_for_spec_decode(self):
    (self.scorer_worker.model_runner.model.sampler.include_gpu_probs_tensor
     ) = True
    (self.scorer_worker.model_runner.model.sampler.
     should_modify_greedy_probs_inplace) = True
    self.proposer_worker.set_include_gpu_probs_tensor()
    self.proposer_worker.set_should_modify_greedy_probs_inplace()


SpecDecodeWorker._configure_model_sampler_for_spec_decode = _configure_model_sampler_for_spec_decode


@@ -1,71 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/triton_utils/importing.py
#
import importlib
import sys
import types
from importlib.util import find_spec

from vllm.logger import logger

HAS_TRITON = (
    find_spec("triton") is not None
    or find_spec("pytorch-triton-xpu") is not None  # Not compatible
)

if not HAS_TRITON:
    logger.info("Triton not installed or not compatible; certain GPU-related"
                " functions will not be available.")

    class TritonPlaceholder(types.ModuleType):

        def __init__(self):
            super().__init__("triton")
            self.jit = self._dummy_decorator("jit")
            self.autotune = self._dummy_decorator("autotune")
            self.heuristics = self._dummy_decorator("heuristics")
            self.language = TritonLanguagePlaceholder()
            self.__spec__ = importlib.machinery.ModuleSpec(
                name="triton", loader=None, origin="placeholder")
            logger.warning_once(
                "Triton is not installed. Using dummy decorators. "
                "Install it via `pip install triton` to enable kernel"
                " compilation.")

        def _dummy_decorator(self, name):

            def decorator(func=None, **kwargs):
                if func is None:
                    return lambda f: f
                return func

            return decorator

    class TritonLanguagePlaceholder(types.ModuleType):

        def __init__(self):
            super().__init__("triton.language")
            self.constexpr = None
            self.dtype = None

    sys.modules['triton'] = TritonPlaceholder()
    sys.modules['triton.language'] = TritonLanguagePlaceholder()

    if 'triton' in sys.modules:
        logger.info("Triton module has been replaced with a placeholder.")

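The point of the placeholder is that downstream code decorating kernels with @triton.jit still imports and runs as plain Python instead of failing at import time. A small sketch, assuming triton is genuinely absent so the placeholder module is what gets imported:

import triton  # resolves to the TritonPlaceholder registered above


@triton.jit
def add(x, y):
    return x + y


print(add(1, 2))  # 3 -- the dummy decorator returned the function unchanged
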

@@ -14,6 +14,3 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import vllm_ascend.patch.worker.patch_0_8_4.patch_metrics  # noqa
-import vllm_ascend.patch.worker.patch_0_8_4.patch_tritonplaceholder  # noqa


@@ -22,7 +22,6 @@ from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.sequence import ExecuteModelRequest
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
-from vllm_ascend.utils import vllm_version_is
 from vllm_ascend.worker.draft_model_runner import TP1DraftModelRunner
@@ -93,16 +92,14 @@ def set_include_gpu_probs_tensor(self) -> None:
     # Need include_gpu_probs_tensor for MultiStepWorker
     if hasattr(self.model_runner.model, "sampler"):
         self.model_runner.model.sampler.include_gpu_probs_tensor = True
-    if not vllm_version_is("0.8.4"):
-        self.model_runner.sampler.include_gpu_probs_tensor = True
+    self.model_runner.sampler.include_gpu_probs_tensor = True


 def set_should_modify_greedy_probs_inplace(self) -> None:
     if hasattr(self.model_runner.model, "sampler"):
         self.model_runner.model.sampler.should_modify_greedy_probs_inplace = (
             True)
-    if not vllm_version_is("0.8.4"):
-        self.model_runner.sampler.should_modify_greedy_probs_inplace = True
+    self.model_runner.sampler.should_modify_greedy_probs_inplace = True


 MultiStepWorker.sampler_output = torch.inference_mode()(sampler_output)
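Note the final line: torch.inference_mode() returns an object that works as both a context manager and a decorator, so calling it on an existing function yields a wrapper that disables autograd tracking for every call. A standalone sketch of the same wrapping trick (double is an illustrative name, not taken from the patch):

import torch


def double(x: torch.Tensor) -> torch.Tensor:
    return x * 2


double_fast = torch.inference_mode()(double)  # same wrapping as above

x = torch.ones(2, requires_grad=True)
print(double_fast(x).requires_grad)  # False: computed under inference mode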