[Patch][Misc] Cleanup and update patches (#6802)

### What this PR does / why we need it?

This PR performs a cleanup and update of the patch mechanism in
`vllm-ascend`.

- Removes the obsolete `patch_deepseek.py` patch and a duplicated
`patch_qwen3_next` import.
- Updates the central patch documentation in
`vllm_ascend/patch/__init__.py` to reflect the removals, re-numbering and
re-organizing the patch list for clarity. For context, a sketch of the
pattern these patch modules follow is shown below.
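
For context, the modules under `vllm_ascend/patch/` apply their changes as a
side effect of being imported. A minimal sketch of the pattern (the
`patched_forward` wrapper and its body are hypothetical illustrations, not
code from this repo):

```python
# Hypothetical minimal patch module illustrating the import-time
# monkey-patching pattern used by the vllm_ascend patch modules.
from vllm.model_executor.models.deepseek_v2 import DeepseekV2Model

# Keep a handle to the original so the patch can delegate to it.
_original_forward = DeepseekV2Model.forward


def patched_forward(self, *args, **kwargs):
    # Ascend-specific adjustments would go here, before or after
    # delegating to the original implementation.
    return _original_forward(self, *args, **kwargs)


# Importing this module (e.g. from patch/worker/__init__.py) installs the
# override; removing a patch means deleting the module and its import.
DeepseekV2Model.forward = patched_forward
```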

### Does this PR introduce _any_ user-facing change?

No. These are internal changes to the patching mechanism and should not
affect users.

### How was this patch tested?

CI passed with newly added and existing tests.

- vLLM version: v0.15.0
- vLLM main: 83b47f67b1

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Commit: 532f7a82f2 (parent: c9d05d10aa)
Author: wangxiyuan
Committed by: GitHub
Date: 2026-02-26 14:45:33 +08:00
3 changed files with 67 additions and 89 deletions

vllm_ascend/patch/worker/__init__.py

@@ -29,7 +29,6 @@ import vllm_ascend.patch.worker.patch_multimodal_merge # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next_mtp # noqa
 import vllm_ascend.patch.worker.patch_rejection_sampler # noqa
-import vllm_ascend.patch.worker.patch_qwen3_next # noqa
 import vllm_ascend.patch.worker.patch_v2_eagle # noqa
 import vllm_ascend.patch.worker.patch_v2_uva # noqa
 import vllm_ascend.patch.worker.patch_huanyuan_vl # noqa

vllm_ascend/patch/worker/patch_deepseek.py

@@ -1,54 +0,0 @@
from itertools import islice

import torch
from vllm.distributed import get_pp_group
from vllm.model_executor.models.deepseek_v2 import (DeepseekV2Model,
                                                    _get_llama_4_scaling)
from vllm.sequence import IntermediateTensors


def forward(
    self,
    input_ids,
    positions,
    intermediate_tensors,
    inputs_embeds,
):
    if get_pp_group().is_first_rank:
        if inputs_embeds is not None:
            hidden_states = inputs_embeds
        else:
            hidden_states = self.embed_input_ids(input_ids)
        residual = None
    else:
        assert intermediate_tensors is not None
        hidden_states = intermediate_tensors["hidden_states"]
        residual = intermediate_tensors["residual"]

    # Compute llama 4 scaling once per forward pass if enabled
    # Note(wxy): This is a hack fix to avoid graph mode error for torch 2.8
    # We'll find a better way to remove this patch.
    try:
        llama_4_scaling_config = self.config.llama_4_scaling
    except AttributeError:
        llama_4_scaling_config = None

    llama_4_scaling: torch.Tensor | None
    if llama_4_scaling_config is not None:
        llama_4_scaling = _get_llama_4_scaling(
            original_max_position_embeddings=llama_4_scaling_config[
                "original_max_position_embeddings"],
            scaling_beta=llama_4_scaling_config["beta"],
            positions=positions,
        )
    else:
        llama_4_scaling = None

    for layer in islice(self.layers, self.start_layer, self.end_layer):
        hidden_states, residual = layer(positions, hidden_states, residual,
                                        llama_4_scaling)

    if not get_pp_group().is_last_rank:
        return IntermediateTensors({
            "hidden_states": hidden_states,
            "residual": residual,
        })

    hidden_states, _ = self.norm(hidden_states, residual)
    return hidden_states


# Replace the upstream forward with the patched version at import time.
DeepseekV2Model.forward = forward
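
As an aside on the deleted patch above: the try/except AttributeError lookup
of the optional `llama_4_scaling` config entry is equivalent to a single
`getattr` call. A minimal sketch of that alternative (not code from this PR):

    # Equivalent, more compact lookup of the optional config attribute:
    llama_4_scaling_config = getattr(self.config, "llama_4_scaling", None)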