diff --git a/docs/source/developer_guide/feature_guide/patch.md b/docs/source/developer_guide/feature_guide/patch.md index 19bb288..ca8e000 100644 --- a/docs/source/developer_guide/feature_guide/patch.md +++ b/docs/source/developer_guide/feature_guide/patch.md @@ -20,13 +20,9 @@ In `vllm_ascend/patch`, you can see the code structure as follows: vllm_ascend ├── patch │ ├── platform -│ │ ├── patch_0_9_2 -│ │ ├── patch_common -│ │ ├── patch_main +│ │ ├── patch_xxx.py │ ├── worker -│ │ ├── patch_0_9_2 -│ │ ├── patch_common -│ │ ├── patch_main +│ │ ├── patch_yyy.py └─────────── ``` @@ -36,19 +32,13 @@ vllm_ascend - **worker**: The patch code in this directory is for patching the code in vLLM worker process. It's called by `vllm_ascend/worker/worker_v1::NPUWorker::__init__` when the vLLM worker process is initialized. - For both online and offline mode, vLLM engine core process calls the worker patch here `vllm/vllm/worker/worker_base.py::WorkerWrapperBase.init_worker` when initializing the worker process. -In both **platform** and **worker** folder, there are several patch modules. They are used for patching different version of vLLM. - -- `patch_0_10_0`: This module is used for patching vLLM 0.10.0. The version is always the nearest version of vLLM. Once vLLM is released, we will drop this patch module and bump to a new version. For example, `patch_0_10_0` is used for patching vLLM 0.10.0. -- `patch_main`: This module is used for patching the code in vLLM main branch. -- `patch_common`: This module is used for patching both vLLM 0.10.0 and vLLM main branch. - ## How to write a patch Before writing a patch, following the principle above, we should patch the least code. If it's necessary, we can patch the code in either **platform** and **worker** folder. Here is an example to patch `distributed` module in vLLM. 1. Decide which version of vLLM we should patch. For example, after analysis, here we want to patch both 0.10.0 and main of vLLM. 2. Decide which process we should patch. For example, here `distributed` belongs to the vLLM main process, so we should patch `platform`. -3. Create the patch file in the right folder. The file should be named as `patch_{module_name}.py`. The example here is `vllm_ascend/patch/platform/patch_common/patch_distributed.py`. +3. Create the patch file in the right folder. The file should be named as `patch_{module_name}.py`. The example here is `vllm_ascend/patch/platform/patch_distributed.py`. 4. Write your patch code in the new file. Here is an example: ```python @@ -61,7 +51,7 @@ Before writing a patch, following the principle above, we should patch the least vllm.distributed.parallel_state.destroy_model_parallel = patch_destroy_model_parallel ``` -5. Import the patch file in `__init__.py`. In this example, add `import vllm_ascend.patch.platform.patch_common.patch_distributed` into `vllm_ascend/patch/platform/patch_common/__init__.py`. +5. Import the patch file in `__init__.py`. In this example, add `import vllm_ascend.patch.platform.patch_distributed` into `vllm_ascend/patch/platform/__init__.py`. 6. Add the description of the patch in `vllm_ascend/patch/__init__.py`. The description format is as follows: ``` diff --git a/tests/ut/patch/worker/patch_common/test_patch_distributed.py b/tests/ut/patch/worker/patch_common/test_patch_distributed.py index f3d9509..d6de489 100644 --- a/tests/ut/patch/worker/patch_common/test_patch_distributed.py +++ b/tests/ut/patch/worker/patch_common/test_patch_distributed.py @@ -19,8 +19,7 @@ import torch from vllm.distributed.parallel_state import GroupCoordinator from tests.ut.base import TestBase -from vllm_ascend.patch.worker.patch_common.patch_distributed import \ - GroupCoordinatorPatch +from vllm_ascend.patch.worker.patch_distributed import GroupCoordinatorPatch class TestPatchDistributed(TestBase): diff --git a/tests/ut/patch/worker/patch_common/test_patch_minicpm.py b/tests/ut/patch/worker/patch_common/test_patch_minicpm.py index 47d1957..9a63d0e 100644 --- a/tests/ut/patch/worker/patch_common/test_patch_minicpm.py +++ b/tests/ut/patch/worker/patch_common/test_patch_minicpm.py @@ -18,7 +18,7 @@ from unittest.mock import MagicMock import torch from tests.ut.base import TestBase -from vllm_ascend.patch.worker.patch_common.patch_minicpm import forward +from vllm_ascend.patch.worker.patch_minicpm import forward class TestPatchMiniCPM(TestBase): diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index f76d881..7d3725f 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -23,11 +23,6 @@ # `vllm_ascend.utils.adapt_patch(is_global_patch=False)` in # each worker's `__init__` function. # -# Then in each kind of patch, there are three folders: -# - patch_0_10_0: contains the patches applied when vllm version is 0.10.0. -# - patch_main: contains the patches applied when vllm version is main branch. -# - patch_common: contains the patches applied in both 0.10.0 and main branch. -# # Once a new patch is added in vllm-ascend, please add the patch description into this file as well. # ---------------------------------------------------------------------------------- @@ -35,7 +30,7 @@ # -------------------------------- # * Platform Patch: # ================= -# ** File: platform/patch_common/patch_distributed.py** +# ** File: platform/patch_distributed.py** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.config.ParallelConfig.get_next_dp_init_port` # Why: @@ -56,7 +51,7 @@ # Future Plan: # Find a better way to support tensor alignment for 310p without this patch. # -# ** File: worker/patch_common/patch_multimodal_merge.py** +# ** File: worker/patch_multimodal_merge.py** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.model_executor.models.utils._merge_multimodal_embeddings` # Why: @@ -70,7 +65,7 @@ # # * Worker Patch: # =============== -# ** File: worker/patch_common/patch_minicpm.py ** +# ** File: worker/patch_minicpm.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.model_executor.models.minicpm.MiniCPMAttention.forward` # Why: @@ -84,7 +79,7 @@ # Future Plan: # Keep this patch in vllm-ascend. # -# ** File: worker/patch_common/patch_distributed.py ** +# ** File: worker/patch_distributed.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.distributed.parallel_state.GroupCoordinator` # (1) __init__() @@ -120,7 +115,7 @@ # - https://github.com/vllm-project/vllm/pull/21591 # Future Plan: # Revert it when vLLM merge #21591 and release new version -# ** File: worker/patch_common/patch_logits.py ** +# ** File: worker/patch_logits.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm._custom_ops.apply_repetition_penalties` # Why: diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py index c0772a8..5f15993 100644 --- a/vllm_ascend/patch/platform/__init__.py +++ b/vllm_ascend/patch/platform/__init__.py @@ -14,5 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from vllm_ascend.patch.platform import patch_common # noqa: F401 -from vllm_ascend.patch.platform import patch_main # noqa: F401 +import os + +import vllm_ascend.patch.platform.patch_config # noqa +import vllm_ascend.patch.platform.patch_distributed # noqa +import vllm_ascend.patch.platform.patch_mamba_config # noqa + +if os.getenv("DYNAMIC_EPLB", False) or os.getenv("EXPERT_MAP_RECORD", False): + import vllm_ascend.patch.platform.patch_multiproc_executor # noqa diff --git a/vllm_ascend/patch/platform/patch_common/__init__.py b/vllm_ascend/patch/platform/patch_common/__init__.py deleted file mode 100644 index 11d0b10..0000000 --- a/vllm_ascend/patch/platform/patch_common/__init__.py +++ /dev/null @@ -1,40 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import os - -from vllm.logger import logger - -import vllm_ascend.patch.platform.patch_common.patch_config # noqa -import vllm_ascend.patch.platform.patch_common.patch_distributed # noqa -import vllm_ascend.patch.platform.patch_common.patch_mamba_config # noqa - - -def patch_v1_executor(): - try: - dynamic_eplb = os.getenv("DYNAMIC_EPLB", False) or os.getenv( - "EXPERT_MAP_RECORD", False) - if dynamic_eplb: - import vllm_ascend.patch.platform.patch_common.patch_multiproc_executor # noqa - else: - logger.warning("Do not patch v1 executor.") - except RuntimeError as e: - logger.warning( - f"Fail to patch v1 executor, please add environment params DYNAMIC_EPLB or EXPERT_MAP_RECORD : {e}" - ) - - -patch_v1_executor() diff --git a/vllm_ascend/patch/platform/patch_common/patch_config.py b/vllm_ascend/patch/platform/patch_config.py similarity index 100% rename from vllm_ascend/patch/platform/patch_common/patch_config.py rename to vllm_ascend/patch/platform/patch_config.py diff --git a/vllm_ascend/patch/platform/patch_common/patch_distributed.py b/vllm_ascend/patch/platform/patch_distributed.py similarity index 100% rename from vllm_ascend/patch/platform/patch_common/patch_distributed.py rename to vllm_ascend/patch/platform/patch_distributed.py diff --git a/vllm_ascend/patch/platform/patch_main/__init__.py b/vllm_ascend/patch/platform/patch_main/__init__.py deleted file mode 100644 index 116c73c..0000000 --- a/vllm_ascend/patch/platform/patch_main/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/vllm_ascend/patch/platform/patch_common/patch_mamba_config.py b/vllm_ascend/patch/platform/patch_mamba_config.py similarity index 100% rename from vllm_ascend/patch/platform/patch_common/patch_mamba_config.py rename to vllm_ascend/patch/platform/patch_mamba_config.py diff --git a/vllm_ascend/patch/platform/patch_common/patch_multiproc_executor.py b/vllm_ascend/patch/platform/patch_multiproc_executor.py similarity index 100% rename from vllm_ascend/patch/platform/patch_common/patch_multiproc_executor.py rename to vllm_ascend/patch/platform/patch_multiproc_executor.py diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py index d294f14..fa7d195 100644 --- a/vllm_ascend/patch/worker/__init__.py +++ b/vllm_ascend/patch/worker/__init__.py @@ -15,5 +15,15 @@ # limitations under the License. # -from vllm_ascend.patch.worker import patch_common # noqa: F401 -from vllm_ascend.patch.worker import patch_main # noqa: F401 +from vllm.triton_utils import HAS_TRITON + +if HAS_TRITON: + import vllm_ascend.patch.worker.patch_triton + +# isort: off +import vllm_ascend.patch.worker.patch_distributed # noqa +import vllm_ascend.patch.worker.patch_logits # noqa +import vllm_ascend.patch.worker.patch_roberta # noqa +import vllm_ascend.patch.worker.patch_weight_loader # noqa +import vllm_ascend.patch.worker.patch_multimodal_merge # noqa +import vllm_ascend.patch.worker.patch_minicpm # noqa diff --git a/vllm_ascend/patch/worker/patch_common/__init__.py b/vllm_ascend/patch/worker/patch_common/__init__.py deleted file mode 100644 index bed7e92..0000000 --- a/vllm_ascend/patch/worker/patch_common/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from vllm.triton_utils import HAS_TRITON - -if HAS_TRITON: - import vllm_ascend.patch.worker.patch_common.patch_triton - -# isort: off -import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa -import vllm_ascend.patch.worker.patch_common.patch_logits # noqa -import vllm_ascend.patch.worker.patch_common.patch_roberta # noqa -import vllm_ascend.patch.worker.patch_common.patch_weight_loader # noqa -import vllm_ascend.patch.worker.patch_common.patch_multimodal_merge # noqa -import vllm_ascend.patch.worker.patch_common.patch_minicpm # noqa diff --git a/vllm_ascend/patch/worker/patch_common/patch_attention_selector.py b/vllm_ascend/patch/worker/patch_common/patch_attention_selector.py deleted file mode 100644 index 3bea9d4..0000000 --- a/vllm_ascend/patch/worker/patch_common/patch_attention_selector.py +++ /dev/null @@ -1,100 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# mypy: ignore-errors -from functools import cache -from typing import Optional - -import torch -import vllm -import vllm.envs as envs -from vllm.attention.backends.abstract import AttentionBackend -from vllm.attention.selector import (backend_name_to_enum, - get_global_forced_attn_backend) -from vllm.platforms import _Backend, current_platform -from vllm.utils import resolve_obj_by_qualname - - -def get_attn_backend( # type: ignore[misc] - head_size: int, - dtype: torch.dtype, - kv_cache_dtype: Optional[str], - block_size: int, - use_mla: bool = False, - use_sfa: bool = False, - has_sink: bool = False, -) -> type[AttentionBackend]: - """Selects which attention backend to use and lazily imports it.""" - # Accessing envs.* behind an @lru_cache decorator can cause the wrong - # value to be returned from the cache if the value changes between calls. - # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the - # private function. - return _cached_get_attn_backend( - head_size=head_size, - dtype=dtype, - kv_cache_dtype=kv_cache_dtype, - block_size=block_size, - use_v1=envs.VLLM_USE_V1, - use_mla=use_mla, - use_sfa=use_sfa, - has_sink=has_sink, - ) - - -@cache -def _cached_get_attn_backend( - head_size: int, - dtype: torch.dtype, - kv_cache_dtype: Optional[str], - block_size: int, - use_v1: bool = False, - use_mla: bool = False, - use_sfa: bool = False, - has_sink: bool = False, - use_sparse: bool = False, -) -> type[AttentionBackend]: - # Check whether a particular choice of backend was - # previously forced. - # - # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND - # ENVIRONMENT VARIABLE. - selected_backend = None - backend_by_global_setting: Optional[_Backend] = ( - get_global_forced_attn_backend()) - if backend_by_global_setting is not None: - selected_backend = backend_by_global_setting - else: - # Check the environment variable and override if specified - backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND - if backend_by_env_var is not None: - selected_backend = backend_name_to_enum(backend_by_env_var) - if selected_backend is None: - raise ValueError( - f"Invalid attention backend: '{backend_by_env_var}'. " - f"Valid backends are: {list(_Backend.__members__.keys())}") - - # get device-specific attn_backend - attention_cls = current_platform.get_attn_backend_cls( - selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1, - use_mla, use_sfa, has_sink) - if not attention_cls: - raise ValueError( - f"Invalid attention backend for {current_platform.device_name}") - return resolve_obj_by_qualname(attention_cls) - - -vllm.attention.get_attn_backend = get_attn_backend -vllm.attention.selector._cached_get_attn_backend = _cached_get_attn_backend diff --git a/vllm_ascend/patch/worker/patch_common/patch_distributed.py b/vllm_ascend/patch/worker/patch_distributed.py similarity index 100% rename from vllm_ascend/patch/worker/patch_common/patch_distributed.py rename to vllm_ascend/patch/worker/patch_distributed.py diff --git a/vllm_ascend/patch/worker/patch_common/patch_logits.py b/vllm_ascend/patch/worker/patch_logits.py similarity index 100% rename from vllm_ascend/patch/worker/patch_common/patch_logits.py rename to vllm_ascend/patch/worker/patch_logits.py diff --git a/vllm_ascend/patch/worker/patch_main/__init__.py b/vllm_ascend/patch/worker/patch_main/__init__.py deleted file mode 100644 index 2ed088b..0000000 --- a/vllm_ascend/patch/worker/patch_main/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# \ No newline at end of file diff --git a/vllm_ascend/patch/worker/patch_common/patch_minicpm.py b/vllm_ascend/patch/worker/patch_minicpm.py similarity index 100% rename from vllm_ascend/patch/worker/patch_common/patch_minicpm.py rename to vllm_ascend/patch/worker/patch_minicpm.py diff --git a/vllm_ascend/patch/worker/patch_common/patch_multimodal_merge.py b/vllm_ascend/patch/worker/patch_multimodal_merge.py similarity index 100% rename from vllm_ascend/patch/worker/patch_common/patch_multimodal_merge.py rename to vllm_ascend/patch/worker/patch_multimodal_merge.py diff --git a/vllm_ascend/patch/worker/patch_common/patch_roberta.py b/vllm_ascend/patch/worker/patch_roberta.py similarity index 100% rename from vllm_ascend/patch/worker/patch_common/patch_roberta.py rename to vllm_ascend/patch/worker/patch_roberta.py diff --git a/vllm_ascend/patch/worker/patch_common/patch_triton.py b/vllm_ascend/patch/worker/patch_triton.py similarity index 96% rename from vllm_ascend/patch/worker/patch_common/patch_triton.py rename to vllm_ascend/patch/worker/patch_triton.py index 8904054..cc550cc 100644 --- a/vllm_ascend/patch/worker/patch_common/patch_triton.py +++ b/vllm_ascend/patch/worker/patch_triton.py @@ -13,4 +13,4 @@ vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_update = causal vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_fn = causal_conv1d_fn vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_rule_fwd_kernel = fused_recurrent_gated_delta_rule_fwd_kernel vllm.model_executor.layers.fla.ops.layernorm_guard.LayerNormFn = LayerNormFn -vllm.model_executor.layers.fla.ops.chunk.chunk_gated_delta_rule = torch_chunk_gated_delta_rule \ No newline at end of file +vllm.model_executor.layers.fla.ops.chunk.chunk_gated_delta_rule = torch_chunk_gated_delta_rule diff --git a/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py b/vllm_ascend/patch/worker/patch_weight_loader.py similarity index 100% rename from vllm_ascend/patch/worker/patch_common/patch_weight_loader.py rename to vllm_ascend/patch/worker/patch_weight_loader.py diff --git a/vllm_ascend/torchair/torchair_model_runner.py b/vllm_ascend/torchair/torchair_model_runner.py index fbdb42b..7978b58 100644 --- a/vllm_ascend/torchair/torchair_model_runner.py +++ b/vllm_ascend/torchair/torchair_model_runner.py @@ -390,7 +390,7 @@ class NPUTorchairModelRunner(NPUModelRunner): if is_310p(): # on 300I Duo platform, we need to patch broadcast. however, this patch will be # overwritten by patch_for_hcom in torchair. so we need to re-patch it here. - from vllm_ascend.patch.platform.patch_common.patch_distributed import \ + from vllm_ascend.patch.platform.patch_distributed import \ communication_adaptation_310p communication_adaptation_310p()