[Refactor] refactor patch module (#3555)

### What this PR does / why we need it?
We noticed that `patch_main` is never used. Patches usually apply to all supported vLLM versions, and when a patch is needed only for a specific version, `vllm_version_is` can be used instead. So let's remove the unused sub-folders in the patch module to make its layout clearer.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-21 20:19:46 +08:00
parent 0c6349610e
commit 13e8e75143
23 changed files with 33 additions and 234 deletions
--- a/vllm_ascend/patch/__init__.py
+++ b/vllm_ascend/patch/__init__.py
@@ -23,11 +23,6 @@
 #           `vllm_ascend.utils.adapt_patch(is_global_patch=False)` in
 #           each worker's `__init__` function.
 #
-# Then in each kind of patch, there are three folders:
-# - patch_0_10_0: contains the patches applied when vllm version is 0.10.0.
-# - patch_main: contains the patches applied when vllm version is main branch.
-# - patch_common: contains the patches applied in both 0.10.0 and main branch.
-#
 # Once a new patch is added in vllm-ascend, please add the patch description into this file as well.
 # ----------------------------------------------------------------------------------

@@ -35,7 +30,7 @@
 # --------------------------------
 # * Platform Patch:
 # =================
-# ** File: platform/patch_common/patch_distributed.py**
+# ** File: platform/patch_distributed.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.config.ParallelConfig.get_next_dp_init_port`
 #    Why:
@@ -56,7 +51,7 @@
 #    Future Plan:
 #       Find a better way to support tensor alignment for 310p without this patch.
 #
-# ** File: worker/patch_common/patch_multimodal_merge.py**
+# ** File: worker/patch_multimodal_merge.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.model_executor.models.utils._merge_multimodal_embeddings`
 #    Why:
@@ -70,7 +65,7 @@
 #
 # * Worker Patch:
 # ===============
-# ** File: worker/patch_common/patch_minicpm.py **
+# ** File: worker/patch_minicpm.py **
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.model_executor.models.minicpm.MiniCPMAttention.forward`
 #    Why:
@@ -84,7 +79,7 @@
 #    Future Plan:
 #       Keep this patch in vllm-ascend.
 #
-# ** File: worker/patch_common/patch_distributed.py **
+# ** File: worker/patch_distributed.py **
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.distributed.parallel_state.GroupCoordinator`
 #   (1) __init__()
@@ -120,7 +115,7 @@
 #       - https://github.com/vllm-project/vllm/pull/21591
 #    Future Plan:
 #       Revert it when vLLM merge #21591 and release new version
-# ** File: worker/patch_common/patch_logits.py **
+# ** File: worker/patch_logits.py **
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm._custom_ops.apply_repetition_penalties`
 #    Why:
--- a/vllm_ascend/patch/platform/__init__.py
+++ b/vllm_ascend/patch/platform/__init__.py
@@ -14,5 +14,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from vllm_ascend.patch.platform import patch_common  # noqa: F401
-from vllm_ascend.patch.platform import patch_main  # noqa: F401
+import os
+
+import vllm_ascend.patch.platform.patch_config  # noqa
+import vllm_ascend.patch.platform.patch_distributed  # noqa
+import vllm_ascend.patch.platform.patch_mamba_config  # noqa
+
+if os.getenv("DYNAMIC_EPLB", False) or os.getenv("EXPERT_MAP_RECORD", False):
+    import vllm_ascend.patch.platform.patch_multiproc_executor  # noqa
--- a/vllm_ascend/patch/platform/patch_common/__init__.py
+++ b/vllm_ascend/patch/platform/patch_common/__init__.py
@@ -1,40 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import os
-
-from vllm.logger import logger
-
-import vllm_ascend.patch.platform.patch_common.patch_config  # noqa
-import vllm_ascend.patch.platform.patch_common.patch_distributed  # noqa
-import vllm_ascend.patch.platform.patch_common.patch_mamba_config  # noqa
-
-
-def patch_v1_executor():
-    try:
-        dynamic_eplb = os.getenv("DYNAMIC_EPLB", False) or os.getenv(
-            "EXPERT_MAP_RECORD", False)
-        if dynamic_eplb:
-            import vllm_ascend.patch.platform.patch_common.patch_multiproc_executor  # noqa
-        else:
-            logger.warning("Do not patch v1 executor.")
-    except RuntimeError as e:
-        logger.warning(
-            f"Fail to patch v1 executor, please add environment params DYNAMIC_EPLB or EXPERT_MAP_RECORD : {e}"
-        )
-
-
-patch_v1_executor()
--- a/vllm_ascend/patch/platform/patch_common/patch_config.py
+++ b/vllm_ascend/patch/platform/patch_common/patch_config.py
--- a/vllm_ascend/patch/platform/patch_common/patch_distributed.py
+++ b/vllm_ascend/patch/platform/patch_common/patch_distributed.py
--- a/vllm_ascend/patch/platform/patch_main/__init__.py
+++ b/vllm_ascend/patch/platform/patch_main/__init__.py
@@ -1,16 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
--- a/vllm_ascend/patch/platform/patch_common/patch_mamba_config.py
+++ b/vllm_ascend/patch/platform/patch_common/patch_mamba_config.py
--- a/vllm_ascend/patch/platform/patch_common/patch_multiproc_executor.py
+++ b/vllm_ascend/patch/platform/patch_common/patch_multiproc_executor.py
--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -15,5 +15,15 @@
 # limitations under the License.
 #

-from vllm_ascend.patch.worker import patch_common  # noqa: F401
-from vllm_ascend.patch.worker import patch_main  # noqa: F401
+from vllm.triton_utils import HAS_TRITON
+
+if HAS_TRITON:
+    import vllm_ascend.patch.worker.patch_triton
+
+# isort: off
+import vllm_ascend.patch.worker.patch_distributed  # noqa
+import vllm_ascend.patch.worker.patch_logits  # noqa
+import vllm_ascend.patch.worker.patch_roberta  # noqa
+import vllm_ascend.patch.worker.patch_weight_loader  # noqa
+import vllm_ascend.patch.worker.patch_multimodal_merge  # noqa
+import vllm_ascend.patch.worker.patch_minicpm  # noqa
--- a/vllm_ascend/patch/worker/patch_common/__init__.py
+++ b/vllm_ascend/patch/worker/patch_common/__init__.py
@@ -1,29 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from vllm.triton_utils import HAS_TRITON
-
-if HAS_TRITON:
-    import vllm_ascend.patch.worker.patch_common.patch_triton
-
-# isort: off
-import vllm_ascend.patch.worker.patch_common.patch_distributed  # noqa
-import vllm_ascend.patch.worker.patch_common.patch_logits  # noqa
-import vllm_ascend.patch.worker.patch_common.patch_roberta  # noqa
-import vllm_ascend.patch.worker.patch_common.patch_weight_loader  # noqa
-import vllm_ascend.patch.worker.patch_common.patch_multimodal_merge  # noqa
-import vllm_ascend.patch.worker.patch_common.patch_minicpm  # noqa
--- a/vllm_ascend/patch/worker/patch_common/patch_attention_selector.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_attention_selector.py
@@ -1,100 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# mypy: ignore-errors
-from functools import cache
-from typing import Optional
-
-import torch
-import vllm
-import vllm.envs as envs
-from vllm.attention.backends.abstract import AttentionBackend
-from vllm.attention.selector import (backend_name_to_enum,
-                                     get_global_forced_attn_backend)
-from vllm.platforms import _Backend, current_platform
-from vllm.utils import resolve_obj_by_qualname
-
-
-def get_attn_backend(  # type: ignore[misc]
-    head_size: int,
-    dtype: torch.dtype,
-    kv_cache_dtype: Optional[str],
-    block_size: int,
-    use_mla: bool = False,
-    use_sfa: bool = False,
-    has_sink: bool = False,
-) -> type[AttentionBackend]:
-    """Selects which attention backend to use and lazily imports it."""
-    # Accessing envs.* behind an @lru_cache decorator can cause the wrong
-    # value to be returned from the cache if the value changes between calls.
-    # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
-    # private function.
-    return _cached_get_attn_backend(
-        head_size=head_size,
-        dtype=dtype,
-        kv_cache_dtype=kv_cache_dtype,
-        block_size=block_size,
-        use_v1=envs.VLLM_USE_V1,
-        use_mla=use_mla,
-        use_sfa=use_sfa,
-        has_sink=has_sink,
-    )
-
-
-@cache
-def _cached_get_attn_backend(
-    head_size: int,
-    dtype: torch.dtype,
-    kv_cache_dtype: Optional[str],
-    block_size: int,
-    use_v1: bool = False,
-    use_mla: bool = False,
-    use_sfa: bool = False,
-    has_sink: bool = False,
-    use_sparse: bool = False,
-) -> type[AttentionBackend]:
-    # Check whether a particular choice of backend was
-    # previously forced.
-    #
-    # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND
-    # ENVIRONMENT VARIABLE.
-    selected_backend = None
-    backend_by_global_setting: Optional[_Backend] = (
-        get_global_forced_attn_backend())
-    if backend_by_global_setting is not None:
-        selected_backend = backend_by_global_setting
-    else:
-        # Check the environment variable and override if specified
-        backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
-        if backend_by_env_var is not None:
-            selected_backend = backend_name_to_enum(backend_by_env_var)
-            if selected_backend is None:
-                raise ValueError(
-                    f"Invalid attention backend: '{backend_by_env_var}'. "
-                    f"Valid backends are: {list(_Backend.__members__.keys())}")
-
-    # get device-specific attn_backend
-    attention_cls = current_platform.get_attn_backend_cls(
-        selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1,
-        use_mla, use_sfa, has_sink)
-    if not attention_cls:
-        raise ValueError(
-            f"Invalid attention backend for {current_platform.device_name}")
-    return resolve_obj_by_qualname(attention_cls)
-
-
-vllm.attention.get_attn_backend = get_attn_backend
-vllm.attention.selector._cached_get_attn_backend = _cached_get_attn_backend
--- a/vllm_ascend/patch/worker/patch_common/patch_distributed.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_distributed.py
--- a/vllm_ascend/patch/worker/patch_common/patch_logits.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_logits.py
--- a/vllm_ascend/patch/worker/patch_main/__init__.py
+++ b/vllm_ascend/patch/worker/patch_main/__init__.py
@@ -1,16 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
--- a/vllm_ascend/patch/worker/patch_common/patch_minicpm.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_minicpm.py
--- a/vllm_ascend/patch/worker/patch_common/patch_multimodal_merge.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_multimodal_merge.py
--- a/vllm_ascend/patch/worker/patch_common/patch_roberta.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_roberta.py
--- a/vllm_ascend/patch/worker/patch_common/patch_triton.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_triton.py
@@ -13,4 +13,4 @@ vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_update = causal
 vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_fn = causal_conv1d_fn
 vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_rule_fwd_kernel = fused_recurrent_gated_delta_rule_fwd_kernel
 vllm.model_executor.layers.fla.ops.layernorm_guard.LayerNormFn = LayerNormFn
-vllm.model_executor.layers.fla.ops.chunk.chunk_gated_delta_rule = torch_chunk_gated_delta_rule
+vllm.model_executor.layers.fla.ops.chunk.chunk_gated_delta_rule = torch_chunk_gated_delta_rule
--- a/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py
--- a/vllm_ascend/torchair/torchair_model_runner.py
+++ b/vllm_ascend/torchair/torchair_model_runner.py
@@ -390,7 +390,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
        if is_310p():
            # on 300I Duo platform, we need to patch broadcast. however, this patch will be
            # overwritten by patch_for_hcom in torchair. so we need to re-patch it here.
-            from vllm_ascend.patch.platform.patch_common.patch_distributed import \
+            from vllm_ascend.patch.platform.patch_distributed import \
                communication_adaptation_310p
            communication_adaptation_310p()