diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py
index 94f68006..b7010b73 100644
--- a/vllm_ascend/patch/__init__.py
+++ b/vllm_ascend/patch/__init__.py
@@ -42,7 +42,7 @@
 #    Future Plan:
 #       Find a better way to support tensor alignment for 310p without this patch.
 #
-# ** 3. File: platform/patch_mamba_config.py**
+# ** 2. File: platform/patch_mamba_config.py**
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.model_executor.models.config.HybridAttentionMambaModelConfig.verify_and_update_config`
 #    Why:
@@ -54,7 +54,7 @@
 #    Future Plan:
 #       Remove this patch when vLLM merges the PR.
 #
-# ** 4. File: platform/patch_multiproc_executor.py**
+# ** 3. File: platform/patch_multiproc_executor.py**
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.v1.executor.multiproc_executor.MultiprocExecutor`
 #    Why:
@@ -67,7 +67,7 @@
 #    Future Plan:
 #       Remove this patch when vLLM fix the issue.
 #
-# ** 5. File: platform/patch_sched_yield.py**
+# ** 4. File: platform/patch_sched_yield.py**
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.distributed.utils.USE_SCHED_YIELD`
 #    Why:
@@ -79,7 +79,7 @@
 #    Future Plan:
 #       Remove this patch when vLLM merge the PR.
 #
-# ** 6. File: platform/patch_balance_schedule.py**
+# ** 5. File: platform/patch_balance_schedule.py**
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.v1.engine.core.EngineCoreProc.run_engine_core`
 #      `vllm.v1.core.sched.scheduler.Scheduler`
@@ -97,20 +97,7 @@
 # * Worker Patch:
 # ===============
 #
-# ** 1. File: worker/patch_deepseek.py **
-#    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#   1. `DeepseekV2Model.forward`
-#    Why:
-#       getattr(self.config, "llama_4_scaling", None) will raise AttributeError
-#       on npu with graph mode.
-#    How:
-#       catch the AttributeError and set llama_4_scaling to None.
-#    Related PR (if no, explain why):
-#       No, this is a bug in vLLM Ascend
-#    Future Plan:
-#       Find the root cause of this bug and fix it in vLLM Ascend.
-#
-# ** 2. File: worker/patch_distributed.py **
+# ** 1. File: worker/patch_distributed.py **
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.distributed.parallel_state.GroupCoordinator`
 #    Why:
@@ -125,7 +112,7 @@
 #       Remove this patch when the refactor of all2all manager is done.
 #       Remove this patch when vLLM support all_reduce as customop.
 #
-# ** 3. File: worker/patch_minicpm.py **
+# ** 2. File: worker/patch_minicpm.py **
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.model_executor.models.minicpm.MiniCPMAttention.forward`
 #    Why:
@@ -139,7 +126,7 @@
 #    Future Plan:
 #       Keep this patch in vllm-ascend.
 #
-# ** 4. File: worker/patch_multimodal_merge.py**
+# ** 3. File: worker/patch_multimodal_merge.py**
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.model_executor.models.utils._merge_multimodal_embeddings`
 #    Why:
@@ -151,7 +138,7 @@
 #    Future Plan:
 #       Identify this pattern in torch-npu and remove this patch.
 #
-# ** 5. File: worker/patch_roberta.py **
+# ** 4. File: worker/patch_roberta.py **
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.model_executor.models.bert `
 #    Why:
@@ -163,7 +150,7 @@
 #    Future Plan:
 #       Revert this when CANN support shift aclnn operation
 #
-# ** 6. File: worker/patch_triton.py**
+# ** 5. File: worker/patch_triton.py**
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.model_executor.layers.mamba.ops`, `vllm.model_executor.layers.fla.ops`,
 #      `vllm.v1.worker.gpu.sample.gumbel.gumbel_sample`
@@ -176,7 +163,7 @@
 #    Future Plan:
 #       Remove this patch when vLLM support the dispatch function.
 #
-# ** 7. File: worker/patch_qwen3_next_mtp.py**
+# ** 6. File: worker/patch_qwen3_next_mtp.py**
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.v1.worker.utils.bind_kv_cache`
 #    Why:
@@ -189,7 +176,7 @@
 #    Future Plan:
 #       Remove this patch after discussing with vllm community and adapting bind_kv_cache to npu.
 #
-# ** 8. File: worker/patch_module.py**
+# ** 7. File: worker/patch_module.py**
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.v1.attention.backends.gdn_attn.torch.argsort`
 #    Why:
@@ -205,7 +192,7 @@
 #       Remove this patch when bool is supported in 'torch.argsort' func of npu.
 #       Make 'torch.argsort' in `vllm.v1.attention.backends.gdn_attn` be stable.
 #
-# ** 9. File: worker/patch_rejection_sampler.py**
+# ** 8. File: worker/patch_rejection_sampler.py**
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.v1.sample.rejection_sampler`
 #    Why:
@@ -221,7 +208,7 @@
 #          to override them, then delete the patch file `worker/patch_rejection_sampler.py`.
 #       2. make these functions as costom op, then remove AscendRejectionSampler
 #
-# ** 10.File: worker/patch_qwen3_next.py**
+# ** 9.File: worker/patch_qwen3_next.py**
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.model_executor.models.qwen3_next.Qwen3NextGatedDeltaNet.forward`
 #    Why:
@@ -233,7 +220,7 @@
 #    Future Plan:
 #       Remove this patch when vLLM support these operators.
 #
-# ** 11. File: worker/patch_qwen3_next.py**
+# ** 10. File: worker/patch_qwen3_next.py**
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.model_executor.models.qwen3_next.Qwen3NextGatedDeltaNet._forward_core`
 #    Why:
@@ -255,7 +242,7 @@
 #    Future Plan:
 #       Remove this patch when vLLM support these operators.
 #
-# ** 12. File: worker/patch_v2_eagle.py**
+# ** 11. File: worker/patch_v2_eagle.py**
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.v1.worker.gpu.spec_decode.eagle.EagleSpeculator.propose`
 #    Why:
@@ -267,7 +254,7 @@
 #    Future Plan:
 #       Remove this patch when cann fix the gather bug.
 #
-# ** 13. File: worker/patch_unquantized_gemm.py**
+# ** 12. File: worker/patch_unquantized_gemm.py**
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.model_executor.layers.utils.default_unquantized_gemm`
 #    Why:
diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py
index 0bb2e927..d214dbad 100644
--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -25,7 +25,6 @@ import vllm_ascend.patch.platform.patch_sched_yield  # noqa
 import vllm_ascend.patch.worker.patch_unquantized_gemm  # noqa
 import vllm_ascend.patch.worker.patch_bert  # noqa
 import vllm_ascend.patch.worker.patch_distributed  # noqa
-import vllm_ascend.patch.worker.patch_deepseek  # noqa
 import vllm_ascend.patch.worker.patch_multimodal_merge  # noqa
 import vllm_ascend.patch.worker.patch_minicpm  # noqa
 import vllm_ascend.patch.worker.patch_rope  # noqa
diff --git a/vllm_ascend/patch/worker/patch_deepseek.py b/vllm_ascend/patch/worker/patch_deepseek.py
deleted file mode 100644
index 0578f90b..00000000
--- a/vllm_ascend/patch/worker/patch_deepseek.py
+++ /dev/null
@@ -1,60 +0,0 @@
-from itertools import islice
-
-import torch
-from vllm.distributed import get_pp_group
-from vllm.model_executor.models.deepseek_v2 import (DeepseekV2Model,
-                                                    _get_llama_4_scaling)
-from vllm.sequence import IntermediateTensors
-
-
-def forward(
-    self,
-    input_ids,
-    positions,
-    intermediate_tensors,
-    inputs_embeds,
-):
-    if get_pp_group().is_first_rank:
-        if inputs_embeds is not None:
-            hidden_states = inputs_embeds
-        else:
-            hidden_states = self.embed_input_ids(input_ids)
-        residual = None
-    else:
-        assert intermediate_tensors is not None
-        hidden_states = intermediate_tensors["hidden_states"]
-        residual = intermediate_tensors["residual"]
-
-    # Compute llama 4 scaling once per forward pass if enabled
-    # Note(wxy): This is a hack fix to avoid graph mode error for torch 2.8
-    # We'll find a better way to remove this patch.
-    try:
-        llama_4_scaling_config = getattr(self.config, "llama_4_scaling")
-    except AttributeError:
-        llama_4_scaling_config = None
-    llama_4_scaling: torch.Tensor | None
-    if llama_4_scaling_config is not None:
-        llama_4_scaling = _get_llama_4_scaling(
-            original_max_position_embeddings=llama_4_scaling_config[
-                "original_max_position_embeddings"],
-            scaling_beta=llama_4_scaling_config["beta"],
-            positions=positions,
-        )
-    else:
-        llama_4_scaling = None
-
-    for layer in islice(self.layers, self.start_layer, self.end_layer):
-        hidden_states, residual = layer(positions, hidden_states, residual,
-                                        llama_4_scaling)
-
-    if not get_pp_group().is_last_rank:
-        return IntermediateTensors({
-            "hidden_states": hidden_states,
-            "residual": residual
-        })
-
-    hidden_states, _ = self.norm(hidden_states, residual)
-    return hidden_states
-
-
-DeepseekV2Model.forward = forward
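For context, the `worker/patch_*.py` modules listed in the header comments above take effect purely by being imported: each one rebinds an attribute on an upstream vLLM class at import time, exactly as the deleted `patch_deepseek.py` ended with `DeepseekV2Model.forward = forward`. The snippet below is a minimal, self-contained sketch of that mechanism using hypothetical stand-in names (`UpstreamModel`, `patched_forward`), not real vLLM classes; dropping the corresponding import from `worker/__init__.py`, as this change does for `patch_deepseek`, is all that is needed to fall back to the upstream implementation.

# Hypothetical illustration of the import-time monkey-patch pattern used by
# the vllm_ascend.patch.worker.patch_* modules (all names are stand-ins).


class UpstreamModel:
    """Stand-in for an upstream vLLM model class."""

    def forward(self, x: int) -> int:
        return x


def patched_forward(self: "UpstreamModel", x: int) -> int:
    # Replacement behaviour; the real patches substitute NPU-friendly code here.
    return x * 2


# Runs when the patch module is imported (see the import list in
# vllm_ascend/patch/worker/__init__.py). Removing that import leaves
# UpstreamModel.forward untouched, i.e. the upstream implementation applies.
UpstreamModel.forward = patched_forward

if __name__ == "__main__":
    assert UpstreamModel().forward(3) == 6  # the patched method is in effect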