From 532f7a82f2700e1d7e83c233e70f61793bc2beeb Mon Sep 17 00:00:00 2001
From: wangxiyuan
Date: Thu, 26 Feb 2026 14:45:33 +0800
Subject: [PATCH] [Patch][Misc] Cleanup and update patches (#6802)

### What this PR does / why we need it?
This PR performs a cleanup and update of the patch mechanism in `vllm-ascend`.

- Removes the obsolete patch `patch_deepseek.py`.
- Updates the central patch documentation in `vllm_ascend/patch/__init__.py` to reflect this removal and the newly added patches, re-numbering and re-organizing the patch list for better clarity.

### Does this PR introduce _any_ user-facing change?
No. These are internal changes to the patching mechanism and should not affect users.

### How was this patch tested?
CI passed with newly added and existing tests.

- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/83b47f67b1dfad505606070ae4d9f83e50ad4ebd

Signed-off-by: wangxiyuan
---
 vllm_ascend/patch/__init__.py              | 101 ++++++++++++++-------
 vllm_ascend/patch/worker/__init__.py       |   1 -
 vllm_ascend/patch/worker/patch_deepseek.py |  54 -----------
 3 files changed, 67 insertions(+), 89 deletions(-)
 delete mode 100644 vllm_ascend/patch/worker/patch_deepseek.py

diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py
index 78a111bd..73c9c586 100644
--- a/vllm_ascend/patch/__init__.py
+++ b/vllm_ascend/patch/__init__.py
@@ -97,8 +97,8 @@
 # * Worker Patch:
 # ===============
 #
-# ** 1. File: worker/patch_distributed.py **
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# ** 1. File: worker/patch_distributed.py**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.distributed.parallel_state.GroupCoordinator`
 # Why:
 # vllm doesn't support all_to_all for GroupCoordinator.
@@ -112,7 +112,7 @@
 # Remove this patch when the refactor of all2all manager is done.
 # Remove this patch when vLLM support all_reduce as customop.
 #
-# ** 3. File: worker/patch_multimodal_merge.py**
+# ** 2. File: worker/patch_multimodal_merge.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.models.utils._merge_multimodal_embeddings`
 # Why:
@@ -124,9 +124,10 @@
 # Future Plan:
 # Identify this pattern in torch-npu and remove this patch.
 #
-# ** 4. File: worker/patch_roberta.py **
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 1. `vllm.model_executor.models.bert `
+# ** 3. File: worker/patch_bert.py**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.model_executor.models.bert._encode_token_type_ids`
+# `vllm.model_executor.models.bert._decode_token_type_ids`
 # Why:
 # shift operation in `_encode_token_type_ids` and `_decode_token_type_ids` cannot run in ascend aclgraph mode
 # How:
@@ -136,7 +137,7 @@
 # Future Plan:
 # Revert this when CANN support shift aclnn operation
 #
-# ** 5. File: worker/patch_triton.py**
+# ** 4. File: worker/patch_triton.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.layers.mamba.ops`, `vllm.model_executor.layers.fla.ops`,
 # `vllm.v1.worker.gpu.sample.gumbel.gumbel_sample`
@@ -149,7 +150,7 @@
 # Future Plan:
 # Remove this patch when vLLM support the dispatch function.
 #
-# ** 6. File: worker/patch_qwen3_next_mtp.py**
+# ** 5. File: worker/patch_qwen3_next_mtp.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.v1.worker.utils.bind_kv_cache`
 # Why:
@@ -162,7 +163,22 @@
 # Future Plan:
 # Remove this patch after discussing with vllm community and adapting bind_kv_cache to npu.
 #
-# ** 7. File: worker/patch_module.py**
+# ** 6. File: worker/patch_rejection_sampler.py**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.v1.sample.rejection_sampler`
+# Why:
+# - some functions from `rejection_sampler` are not supported or slow on npu.
+# How:
+# - add npu_top_k_top_p to 'apply_sampling_constraints' func
+# - add custom triton kernel to `expand_batch_to_tokens` and `rejection_sample`
+# Related PR (if no, explain why):
+# No related PR; this needs vLLM to support triton ops dispatch.
+# Future Plan:
+# 1. make these functions class methods of RejectionSampler, create AscendRejectionSampler
+# to override them, then delete the patch file `worker/patch_rejection_sampler.py`.
+# 2. make these functions custom ops, then remove AscendRejectionSampler
+#
+# ** 7. File: worker/patch_module.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.v1.attention.backends.gdn_attn.torch.argsort`
 # Why:
@@ -178,23 +194,7 @@
 # Remove this patch when bool is supported in 'torch.argsort' func of npu.
 # Make 'torch.argsort' in `vllm.v1.attention.backends.gdn_attn` be stable.
 #
-# ** 8. File: worker/patch_rejection_sampler.py**
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 1. `vllm.v1.sample.rejection_sampler`
-# Why:
-# - some functions from `rejection_sampler` are not supported or slow on npu.
-# How:
-# - add npu_top_k_top_p to 'apply_sampling_constraints' func
-# - add custom triton kernel to `expand_batch_to_tokens` and `rejection_sample`
-# Related PR (if no, explain why):
-# https://github.com/vllm-project/vllm/pull/874
-# https://github.com/vllm-project/vllm/pull/4849
-# Future Plan:
-# 1. make these functions as class func of RejectionSampler, create AscendRejectionSampler
-# to override them, then delete the patch file `worker/patch_rejection_sampler.py`.
-# 2. make these functions as costom op, then remove AscendRejectionSampler
-#
-# ** 9.File: worker/patch_qwen3_next.py**
+# ** 8. File: worker/patch_qwen3_next.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.models.qwen3_next.Qwen3NextGatedDeltaNet.forward`
 # Why:
@@ -206,9 +206,7 @@
 # Future Plan:
 # Remove this patch when vLLM support these operators.
 #
-# ** 10. File: worker/patch_qwen3_next.py**
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 1. `vllm.model_executor.models.qwen3_next.Qwen3NextGatedDeltaNet._forward_core`
+# 2. `vllm.model_executor.models.qwen3_next.Qwen3NextGatedDeltaNet._forward_core`
 # Why:
 # triton ops fused_recurrent_gated_delta_rule and fused_gdn_gating in vLLM perform not good on NPU.
 # How:
@@ -218,7 +216,7 @@
 # Future Plan:
 # Remove this patch when vLLM support these operators.
 #
-# 2. `vllm.model_executor.models.qwen3_next.Qwen3NextGatedDeltaNet._forward_core`
+# 3. `vllm.model_executor.models.qwen3_next.Qwen3NextGatedDeltaNet._forward_core`
 # Why:
 # The Qwen3Next GatedDeltaNet _forward_core cannot directly add custom operators.
 # How:
@@ -228,7 +226,17 @@
 # Future Plan:
 # Remove this patch when vLLM support these operators.
 #
-# ** 11. File: worker/patch_v2_eagle.py**
+# ** 9. File: worker/patch_huanyuan_vl.py**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.transformers_utils.processors.hunyuan_vl.HunYuanVLProcessor.__call__`
+# Why:
+# The `add_special_tokens` parameter is not supported by default in the processor.
+# How:
+# Remove the `add_special_tokens` parameter from kwargs before calling the original method.
+# Future Plan:
+# Remove this patch when vLLM aligns with the latest processor implementation.
+#
+# ** 10. File: worker/patch_v2_eagle.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.v1.worker.gpu.spec_decode.eagle.EagleSpeculator.propose`
 # Why:
@@ -240,7 +248,7 @@
 # Future Plan:
 # Remove this patch when cann fix the gather bug.
 #
-# ** 12. File: worker/patch_unquantized_gemm.py**
+# ** 11. File: worker/patch_unquantized_gemm.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.layers.utils.default_unquantized_gemm`
 # Why:
@@ -250,7 +258,7 @@
 # Future Plan:
 # Remove this patch when vLLM support the operator as customop.
 #
-# ** 13. File: worker/patch_npugraph_ex_triton.py**
+# ** 12. File: worker/patch_npugraph_ex_triton.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `torchair.core._concrete_graph.ValuePack`,
 # `torchair.npu_fx_compiler._unpack_meta`,
@@ -263,7 +271,8 @@
 # https://gitcode.com/Ascend/torchair/pull/2575
 # Future Plan:
 # Remove this patch when the PTA version used by vllm-ascend has been upgraded.
-# ** 14. File: worker/patch_v2_uva.py**
+#
+# ** 13. File: worker/patch_v2_uva.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.v1.worker.gpu.states.UvaBuffer`
 # Why:
@@ -272,3 +281,27 @@
 # make UvaBuffer a dummy class, mimic the interface of vllm UvaBuffer.
 # Future Plan:
 # Remove this patch when NPU support UVA.
+#
+# ** 14. File: worker/patch_kimi_k25.py**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.model_executor.models.kimi_k25_vit.Learnable2DInterpPosEmbDivided_fixed.forward`
+# Why:
+# The forward method uses interpolate ops that are not supported on NPU.
+# How:
+# Replace it with a new forward that uses CPU for interpolate when shapes mismatch,
+# and use get_rope_shape to handle the rope shape interpolation.
+# Future Plan:
+# Remove this patch when vLLM aligns with the latest main.
+#
+# ** 15. File: worker/patch_routed_experts_capturer.py**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.model_executor.layers.fused_moe.routed_experts_capturer.RoutedExpertsCapturer.init_buffer`
+# Why:
+# The `_device_buffer` initialization in vLLM uses a hardcoded `device="cuda"`,
+# which doesn't work on NPU.
+# How:
+# Replace `device="cuda"` with `device=current_platform.device_name` to support NPU.
+# Related PR (if no, explain why):
+# https://github.com/vllm-project/vllm/pull/34336
+# Future Plan:
+# Remove this patch when vLLM merges the PR.
diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py
index 183f1846..ad2429d1 100644
--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -29,7 +29,6 @@ import vllm_ascend.patch.worker.patch_multimodal_merge  # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next  # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next_mtp  # noqa
 import vllm_ascend.patch.worker.patch_rejection_sampler  # noqa
-import vllm_ascend.patch.worker.patch_qwen3_next  # noqa
 import vllm_ascend.patch.worker.patch_v2_eagle  # noqa
 import vllm_ascend.patch.worker.patch_v2_uva  # noqa
 import vllm_ascend.patch.worker.patch_huanyuan_vl  # noqa
diff --git a/vllm_ascend/patch/worker/patch_deepseek.py b/vllm_ascend/patch/worker/patch_deepseek.py
deleted file mode 100644
index 26ef9ca1..00000000
--- a/vllm_ascend/patch/worker/patch_deepseek.py
+++ /dev/null
@@ -1,54 +0,0 @@
-from itertools import islice
-
-import torch
-from vllm.distributed import get_pp_group
-from vllm.model_executor.models.deepseek_v2 import DeepseekV2Model, _get_llama_4_scaling
-from vllm.sequence import IntermediateTensors
-
-
-def forward(
-    self,
-    input_ids,
-    positions,
-    intermediate_tensors,
-    inputs_embeds,
-):
-    if get_pp_group().is_first_rank:
-        if inputs_embeds is not None:
-            hidden_states = inputs_embeds
-        else:
-            hidden_states = self.embed_input_ids(input_ids)
-        residual = None
-    else:
-        assert intermediate_tensors is not None
-        hidden_states = intermediate_tensors["hidden_states"]
-        residual = intermediate_tensors["residual"]
-
-    # Compute llama 4 scaling once per forward pass if enabled
-    # Note(wxy): This is a hack fix to avoid graph mode error for torch 2.8
-    # We'll find a better way to remove this patch.
-    try:
-        llama_4_scaling_config = self.config.llama_4_scaling
-    except AttributeError:
-        llama_4_scaling_config = None
-    llama_4_scaling: torch.Tensor | None
-    if llama_4_scaling_config is not None:
-        llama_4_scaling = _get_llama_4_scaling(
-            original_max_position_embeddings=llama_4_scaling_config["original_max_position_embeddings"],
-            scaling_beta=llama_4_scaling_config["beta"],
-            positions=positions,
-        )
-    else:
-        llama_4_scaling = None
-
-    for layer in islice(self.layers, self.start_layer, self.end_layer):
-        hidden_states, residual = layer(positions, hidden_states, residual, llama_4_scaling)
-
-    if not get_pp_group().is_last_rank:
-        return IntermediateTensors({"hidden_states": hidden_states, "residual": residual})
-
-    hidden_states, _ = self.norm(hidden_states, residual)
-    return hidden_states
-
-
-DeepseekV2Model.forward = forward
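
For readers new to the mechanism documented above: every worker patch follows the same import-time monkey-patching pattern. A `patch_*.py` module defines a replacement function and rebinds the corresponding vLLM attribute as a side effect of importing `vllm_ascend.patch.worker` (see `vllm_ascend/patch/worker/__init__.py`), exactly as the now-removed `patch_deepseek.py` did with `DeepseekV2Model.forward`. The sketch below only illustrates that pattern; `DemoSampler` and its `sample` method are hypothetical stand-ins, not real vLLM symbols.

```python
# Minimal sketch of the import-time monkey-patch pattern used by the worker
# patches. `DemoSampler` is a hypothetical stand-in for a vLLM class.
import torch


class DemoSampler:
    """Stand-in for a vLLM class whose method needs an NPU-friendly override."""

    def sample(self, logits: torch.Tensor) -> torch.Tensor:
        return torch.argmax(logits, dim=-1)


# Keep a reference to the original implementation so the patch can fall back to it.
_original_sample = DemoSampler.sample


def _npu_friendly_sample(self, logits: torch.Tensor) -> torch.Tensor:
    # On devices other than NPU, defer to the original implementation.
    if logits.device.type != "npu":
        return _original_sample(self, logits)
    # On NPU, route through an op that is known to work well there.
    return torch.topk(logits, k=1, dim=-1).indices.squeeze(-1)


# Rebinding the attribute at import time is what each patch_*.py module does;
# importing vllm_ascend.patch.worker is enough for the override to take effect.
DemoSampler.sample = _npu_friendly_sample
```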