[Patch][Misc] Cleanup and update patches (#6802)

### What this PR does / why we need it?

This PR performs a cleanup and update of the patch mechanism in
`vllm-ascend`.

- Removes the obsolete `patch_deepseek.py` patch and a duplicated
`patch_qwen3_next` import.
- Updates the central patch documentation in
`vllm_ascend/patch/__init__.py` to reflect the removals, re-numbering and
re-organizing the patch list for clarity. For context, a sketch of the
pattern these patch modules follow is shown below.
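
For context, the modules under `vllm_ascend/patch/` apply their changes as a
side effect of being imported. A minimal sketch of the pattern (the
`patched_forward` wrapper and its body are hypothetical illustrations, not
code from this repo):

```python
# Hypothetical minimal patch module illustrating the import-time
# monkey-patching pattern used by the vllm_ascend patch modules.
from vllm.model_executor.models.deepseek_v2 import DeepseekV2Model

# Keep a handle to the original so the patch can delegate to it.
_original_forward = DeepseekV2Model.forward


def patched_forward(self, *args, **kwargs):
    # Ascend-specific adjustments would go here, before or after
    # delegating to the original implementation.
    return _original_forward(self, *args, **kwargs)


# Importing this module (e.g. from patch/worker/__init__.py) installs the
# override; removing a patch means deleting the module and its import.
DeepseekV2Model.forward = patched_forward
```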

### Does this PR introduce _any_ user-facing change?

No. These are internal changes to the patching mechanism and should not
affect users.

### How was this patch tested?

CI passed with newly added and existing tests.

- vLLM version: v0.15.0
- vLLM main: 83b47f67b1

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Commit: 532f7a82f2 (parent: c9d05d10aa)
Author: wangxiyuan
Committed by: GitHub
Date: 2026-02-26 14:45:33 +08:00
3 changed files with 67 additions and 89 deletions

vllm_ascend/patch/worker/__init__.py

@@ -29,7 +29,6 @@ import vllm_ascend.patch.worker.patch_multimodal_merge # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next_mtp # noqa
 import vllm_ascend.patch.worker.patch_rejection_sampler # noqa
-import vllm_ascend.patch.worker.patch_qwen3_next # noqa
 import vllm_ascend.patch.worker.patch_v2_eagle # noqa
 import vllm_ascend.patch.worker.patch_v2_uva # noqa
 import vllm_ascend.patch.worker.patch_huanyuan_vl # noqa

vllm_ascend/patch/worker/patch_deepseek.py

@@ -1,54 +0,0 @@
from itertools import islice

import torch
from vllm.distributed import get_pp_group
from vllm.model_executor.models.deepseek_v2 import (DeepseekV2Model,
                                                    _get_llama_4_scaling)
from vllm.sequence import IntermediateTensors


def forward(
    self,
    input_ids,
    positions,
    intermediate_tensors,
    inputs_embeds,
):
    if get_pp_group().is_first_rank:
        if inputs_embeds is not None:
            hidden_states = inputs_embeds
        else:
            hidden_states = self.embed_input_ids(input_ids)
        residual = None
    else:
        assert intermediate_tensors is not None
        hidden_states = intermediate_tensors["hidden_states"]
        residual = intermediate_tensors["residual"]

    # Compute llama 4 scaling once per forward pass if enabled
    # Note(wxy): This is a hack fix to avoid graph mode error for torch 2.8
    # We'll find a better way to remove this patch.
    try:
        llama_4_scaling_config = self.config.llama_4_scaling
    except AttributeError:
        llama_4_scaling_config = None

    llama_4_scaling: torch.Tensor | None
    if llama_4_scaling_config is not None:
        llama_4_scaling = _get_llama_4_scaling(
            original_max_position_embeddings=llama_4_scaling_config[
                "original_max_position_embeddings"],
            scaling_beta=llama_4_scaling_config["beta"],
            positions=positions,
        )
    else:
        llama_4_scaling = None

    for layer in islice(self.layers, self.start_layer, self.end_layer):
        hidden_states, residual = layer(positions, hidden_states, residual,
                                        llama_4_scaling)

    if not get_pp_group().is_last_rank:
        return IntermediateTensors({
            "hidden_states": hidden_states,
            "residual": residual,
        })

    hidden_states, _ = self.norm(hidden_states, residual)
    return hidden_states


# Replace the upstream forward with the patched version at import time.
DeepseekV2Model.forward = forward
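
As an aside on the deleted patch above: the try/except AttributeError lookup
of the optional `llama_4_scaling` config entry is equivalent to a single
`getattr` call. A minimal sketch of that alternative (not code from this PR):

    # Equivalent, more compact lookup of the optional config attribute:
    llama_4_scaling_config = getattr(self.config, "llama_4_scaling", None)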