Files
xc-llm-ascend/vllm_ascend/patch/worker/patch_deepseek.py
wangxiyuan 3f4c0ea0a0 upgrade vLLM to 0.12.0 tag (#4647)
Upgrade vLLM to v0.12.0 tag

- vLLM version: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
- vLLM main:
86e178f7c4

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-12-03 23:43:05 +08:00

61 lines
1.9 KiB
Python

from itertools import islice
import torch
from vllm.distributed import get_pp_group
from vllm.model_executor.models.deepseek_v2 import (DeepseekV2Model,
_get_llama_4_scaling)
from vllm.sequence import IntermediateTensors
def forward(
self,
input_ids,
positions,
intermediate_tensors,
inputs_embeds,
):
if get_pp_group().is_first_rank:
if inputs_embeds is not None:
hidden_states = inputs_embeds
else:
hidden_states = self.embed_input_ids(input_ids)
residual = None
else:
assert intermediate_tensors is not None
hidden_states = intermediate_tensors["hidden_states"]
residual = intermediate_tensors["residual"]
# Compute llama 4 scaling once per forward pass if enabled
# Note(wxy): This is a hack fix to avoid graph mode error for torch 2.8
# We'll find a better way to remove this patch.
try:
llama_4_scaling_config = getattr(self.config, "llama_4_scaling")
except AttributeError:
llama_4_scaling_config = None
llama_4_scaling: torch.Tensor | None
if llama_4_scaling_config is not None:
llama_4_scaling = _get_llama_4_scaling(
original_max_position_embeddings=llama_4_scaling_config[
"original_max_position_embeddings"],
scaling_beta=llama_4_scaling_config["beta"],
positions=positions,
)
else:
llama_4_scaling = None
for layer in islice(self.layers, self.start_layer, self.end_layer):
hidden_states, residual = layer(positions, hidden_states, residual,
llama_4_scaling)
if not get_pp_group().is_last_rank:
return IntermediateTensors({
"hidden_states": hidden_states,
"residual": residual
})
hidden_states, _ = self.norm(hidden_states, residual)
return hidden_states
DeepseekV2Model.forward = forward