### What this PR does / why we need it?
**Scope of Changes**:

| File Path |
| :--- |
| `vllm_ascend/ops/layer_shard_linear.py` |
| `vllm_ascend/ops/linear.py` |
| `vllm_ascend/ops/linear_op.py` |
| `vllm_ascend/worker/worker.py` |
| `vllm_ascend/patch/worker/patch_bert.py` |
| `vllm_ascend/patch/worker/patch_deepseek.py` |
| `vllm_ascend/patch/worker/patch_distributed.py` |
| `vllm_ascend/patch/worker/patch_module.py` |
| `vllm_ascend/patch/worker/patch_multimodal_merge.py` |
| `vllm_ascend/patch/worker/patch_qwen3_next.py` |
| `vllm_ascend/patch/worker/patch_qwen3_next_mtp.py` |
| `vllm_ascend/patch/worker/patch_rejection_sampler.py` |
| `vllm_ascend/patch/worker/patch_rope.py` |
| `vllm_ascend/patch/worker/patch_triton.py` |
| `vllm_ascend/patch/worker/patch_unquantized_gemm.py` |
| `vllm_ascend/patch/worker/patch_v2_egale.py` |
| `vllm_ascend/worker/npu_input_batch.py` |
| `vllm_ascend/worker/v2/aclgraph_utils.py` |
| `vllm_ascend/worker/v2/attn_utils.py` |
| `vllm_ascend/worker/v2/model_runner.py` |
| `vllm_ascend/worker/v2/sample/gumbel.py` |
| `vllm_ascend/worker/v2/sample/penalties.py` |
| `vllm_ascend/worker/v2/sample/sampler.py` |
| `vllm_ascend/worker/v2/spec_decode/__init__.py` |
| `vllm_ascend/worker/v2/spec_decode/eagle.py` |
| `vllm_ascend/worker/v2/states.py` |
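For context, each `patch_*` module listed above takes effect at import time through module-level reassignment of an upstream vLLM symbol. A minimal sketch of the activation pattern follows; the aggregator location is an assumption for illustration, not necessarily the actual vllm_ascend layout:

```python
# Hypothetical aggregator: importing a patch module runs its module-level
# reassignments (e.g. `DeepseekV2Model.forward = forward` in
# patch_deepseek.py, reproduced at the end of this description).
import vllm_ascend.patch.worker.patch_deepseek  # noqa: F401
```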
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.14.0
- vLLM main: d68209402d
Signed-off-by: MrZ20 <2609716663@qq.com>
Signed-off-by: SILONG ZENG <2609716663@qq.com>
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
Full content of `vllm_ascend/patch/worker/patch_deepseek.py` (55 lines, Python):
```python
from itertools import islice

import torch
from vllm.distributed import get_pp_group
from vllm.model_executor.models.deepseek_v2 import DeepseekV2Model, _get_llama_4_scaling
from vllm.sequence import IntermediateTensors


def forward(
    self,
    input_ids,
    positions,
    intermediate_tensors,
    inputs_embeds,
):
    if get_pp_group().is_first_rank:
        if inputs_embeds is not None:
            hidden_states = inputs_embeds
        else:
            hidden_states = self.embed_input_ids(input_ids)
        residual = None
    else:
        assert intermediate_tensors is not None
        hidden_states = intermediate_tensors["hidden_states"]
        residual = intermediate_tensors["residual"]

    # Compute llama 4 scaling once per forward pass if enabled
    # Note(wxy): This is a hack fix to avoid graph mode error for torch 2.8
    # We'll find a better way to remove this patch.
    try:
        llama_4_scaling_config = self.config.llama_4_scaling
    except AttributeError:
        llama_4_scaling_config = None
    llama_4_scaling: torch.Tensor | None
    if llama_4_scaling_config is not None:
        llama_4_scaling = _get_llama_4_scaling(
            original_max_position_embeddings=llama_4_scaling_config["original_max_position_embeddings"],
            scaling_beta=llama_4_scaling_config["beta"],
            positions=positions,
        )
    else:
        llama_4_scaling = None

    for layer in islice(self.layers, self.start_layer, self.end_layer):
        hidden_states, residual = layer(positions, hidden_states, residual, llama_4_scaling)

    if not get_pp_group().is_last_rank:
        return IntermediateTensors({"hidden_states": hidden_states, "residual": residual})

    hidden_states, _ = self.norm(hidden_states, residual)
    return hidden_states


DeepseekV2Model.forward = forward
```
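The final line is what makes the patch take effect: rebinding `forward` on the class replaces the method for every `DeepseekV2Model` instance, whether created before or after the patch. A minimal self-contained sketch of the same pattern; the `Base`/`patched_forward` names are illustrative, not vLLM code:

```python
# Minimal sketch of class-level monkey patching (hypothetical names):
# rebinding a function on the class swaps the method for all instances.
class Base:
    def forward(self, x: int) -> int:
        return x + 1


def patched_forward(self, x: int) -> int:
    # The replacement is bound like a normal method and receives `self`.
    return (x + 1) * 2


existing = Base()
Base.forward = patched_forward  # same mechanism as DeepseekV2Model.forward = forward

assert existing.forward(3) == 8  # instances created before the patch use it
assert Base().forward(3) == 8    # new instances do too
```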