[Refactor] refactor patch module (#3555)

### What this PR does / why we need it? We noticed that `patch_main` is never used. Usually a patch applies to all versions, and if it is only needed for a specific version, we can use `vllm_version_is` instead. So let's remove the unnecessary sub-folders in the patch module to make its layout clearer. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-21 20:19:46 +08:00
parent 0c6349610e
commit 13e8e75143
23 changed files with 33 additions and 234 deletions
--- a/vllm_ascend/patch/worker/init.py
+++ b/vllm_ascend/patch/worker/init.py
@@ -15,5 +15,15 @@
 # limitations under the License.
 #

-from vllm_ascend.patch.worker import patch_common  # noqa: F401
-from vllm_ascend.patch.worker import patch_main  # noqa: F401
+from vllm.triton_utils import HAS_TRITON
+
+if HAS_TRITON:
+    import vllm_ascend.patch.worker.patch_triton
+
+# isort: off
+import vllm_ascend.patch.worker.patch_distributed  # noqa
+import vllm_ascend.patch.worker.patch_logits  # noqa
+import vllm_ascend.patch.worker.patch_roberta  # noqa
+import vllm_ascend.patch.worker.patch_weight_loader  # noqa
+import vllm_ascend.patch.worker.patch_multimodal_merge  # noqa
+import vllm_ascend.patch.worker.patch_minicpm  # noqa
--- a/vllm_ascend/patch/worker/patch_common/init.py
+++ b/vllm_ascend/patch/worker/patch_common/init.py
@@ -1,29 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from vllm.triton_utils import HAS_TRITON
-
-if HAS_TRITON:
-    import vllm_ascend.patch.worker.patch_common.patch_triton
-
-# isort: off
-import vllm_ascend.patch.worker.patch_common.patch_distributed  # noqa
-import vllm_ascend.patch.worker.patch_common.patch_logits  # noqa
-import vllm_ascend.patch.worker.patch_common.patch_roberta  # noqa
-import vllm_ascend.patch.worker.patch_common.patch_weight_loader  # noqa
-import vllm_ascend.patch.worker.patch_common.patch_multimodal_merge  # noqa
-import vllm_ascend.patch.worker.patch_common.patch_minicpm  # noqa
--- a/vllm_ascend/patch/worker/patch_common/patch_attention_selector.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_attention_selector.py
@@ -1,100 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# mypy: ignore-errors
-from functools import cache
-from typing import Optional
-
-import torch
-import vllm
-import vllm.envs as envs
-from vllm.attention.backends.abstract import AttentionBackend
-from vllm.attention.selector import (backend_name_to_enum,
-                                     get_global_forced_attn_backend)
-from vllm.platforms import _Backend, current_platform
-from vllm.utils import resolve_obj_by_qualname
-
-
-def get_attn_backend(  # type: ignore[misc]
-    head_size: int,
-    dtype: torch.dtype,
-    kv_cache_dtype: Optional[str],
-    block_size: int,
-    use_mla: bool = False,
-    use_sfa: bool = False,
-    has_sink: bool = False,
-) -> type[AttentionBackend]:
-    """Selects which attention backend to use and lazily imports it."""
-    # Accessing envs.* behind an @lru_cache decorator can cause the wrong
-    # value to be returned from the cache if the value changes between calls.
-    # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
-    # private function.
-    return _cached_get_attn_backend(
-        head_size=head_size,
-        dtype=dtype,
-        kv_cache_dtype=kv_cache_dtype,
-        block_size=block_size,
-        use_v1=envs.VLLM_USE_V1,
-        use_mla=use_mla,
-        use_sfa=use_sfa,
-        has_sink=has_sink,
-    )
-
-
-@cache
-def _cached_get_attn_backend(
-    head_size: int,
-    dtype: torch.dtype,
-    kv_cache_dtype: Optional[str],
-    block_size: int,
-    use_v1: bool = False,
-    use_mla: bool = False,
-    use_sfa: bool = False,
-    has_sink: bool = False,
-    use_sparse: bool = False,
-) -> type[AttentionBackend]:
-    # Check whether a particular choice of backend was
-    # previously forced.
-    #
-    # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND
-    # ENVIRONMENT VARIABLE.
-    selected_backend = None
-    backend_by_global_setting: Optional[_Backend] = (
-        get_global_forced_attn_backend())
-    if backend_by_global_setting is not None:
-        selected_backend = backend_by_global_setting
-    else:
-        # Check the environment variable and override if specified
-        backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
-        if backend_by_env_var is not None:
-            selected_backend = backend_name_to_enum(backend_by_env_var)
-            if selected_backend is None:
-                raise ValueError(
-                    f"Invalid attention backend: '{backend_by_env_var}'. "
-                    f"Valid backends are: {list(_Backend.__members__.keys())}")
-
-    # get device-specific attn_backend
-    attention_cls = current_platform.get_attn_backend_cls(
-        selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1,
-        use_mla, use_sfa, has_sink)
-    if not attention_cls:
-        raise ValueError(
-            f"Invalid attention backend for {current_platform.device_name}")
-    return resolve_obj_by_qualname(attention_cls)
-
-
-vllm.attention.get_attn_backend = get_attn_backend
-vllm.attention.selector._cached_get_attn_backend = _cached_get_attn_backend
--- a/vllm_ascend/patch/worker/patch_common/patch_distributed.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_distributed.py
--- a/vllm_ascend/patch/worker/patch_common/patch_logits.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_logits.py
--- a/vllm_ascend/patch/worker/patch_main/init.py
+++ b/vllm_ascend/patch/worker/patch_main/init.py
@@ -1,16 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
--- a/vllm_ascend/patch/worker/patch_common/patch_minicpm.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_minicpm.py
--- a/vllm_ascend/patch/worker/patch_common/patch_multimodal_merge.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_multimodal_merge.py
--- a/vllm_ascend/patch/worker/patch_common/patch_roberta.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_roberta.py
--- a/vllm_ascend/patch/worker/patch_common/patch_triton.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_triton.py
@@ -13,4 +13,4 @@ vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_update = causal
 vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_fn = causal_conv1d_fn
 vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_rule_fwd_kernel = fused_recurrent_gated_delta_rule_fwd_kernel
 vllm.model_executor.layers.fla.ops.layernorm_guard.LayerNormFn = LayerNormFn
-vllm.model_executor.layers.fla.ops.chunk.chunk_gated_delta_rule = torch_chunk_gated_delta_rule
+vllm.model_executor.layers.fla.ops.chunk.chunk_gated_delta_rule = torch_chunk_gated_delta_rule
--- a/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py