upgrade vLLM to 0.12.0 tag (#4647)
Upgrade vLLM to v0.12.0 tag
- vLLM version: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
- vLLM main: 86e178f7c4
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
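As a quick sanity check after the upgrade, the installed package version can be read directly. A minimal sketch, assuming the standard vllm.__version__ attribute and that the v0.12.0 tag reports a "0.12.0" version string:

import vllm

# Expect the version string that corresponds to the v0.12.0 tag named above.
assert vllm.__version__.startswith("0.12.0"), vllm.__version__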
@@ -23,6 +23,7 @@ if HAS_TRITON:
 # isort: off
 import vllm_ascend.patch.platform.patch_sched_yield # noqa
 import vllm_ascend.patch.worker.patch_distributed # noqa
+import vllm_ascend.patch.worker.patch_deepseek # noqa
 import vllm_ascend.patch.worker.patch_roberta # noqa
 import vllm_ascend.patch.worker.patch_weight_loader # noqa
 import vllm_ascend.patch.worker.patch_multimodal_merge # noqa
vllm_ascend/patch/worker/patch_deepseek.py (new file, 60 lines)
@@ -0,0 +1,60 @@
+from itertools import islice
+
+import torch
+from vllm.distributed import get_pp_group
+from vllm.model_executor.models.deepseek_v2 import (DeepseekV2Model,
+                                                    _get_llama_4_scaling)
+from vllm.sequence import IntermediateTensors
+
+
+def forward(
+    self,
+    input_ids,
+    positions,
+    intermediate_tensors,
+    inputs_embeds,
+):
+    if get_pp_group().is_first_rank:
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.embed_input_ids(input_ids)
+        residual = None
+    else:
+        assert intermediate_tensors is not None
+        hidden_states = intermediate_tensors["hidden_states"]
+        residual = intermediate_tensors["residual"]
+
+    # Compute llama 4 scaling once per forward pass if enabled
+    # Note(wxy): This is a hack fix to avoid graph mode error for torch 2.8
+    # We'll find a better way to remove this patch.
+    try:
+        llama_4_scaling_config = getattr(self.config, "llama_4_scaling")
+    except AttributeError:
+        llama_4_scaling_config = None
+    llama_4_scaling: torch.Tensor | None
+    if llama_4_scaling_config is not None:
+        llama_4_scaling = _get_llama_4_scaling(
+            original_max_position_embeddings=llama_4_scaling_config[
+                "original_max_position_embeddings"],
+            scaling_beta=llama_4_scaling_config["beta"],
+            positions=positions,
+        )
+    else:
+        llama_4_scaling = None
+
+    for layer in islice(self.layers, self.start_layer, self.end_layer):
+        hidden_states, residual = layer(positions, hidden_states, residual,
+                                        llama_4_scaling)
+
+    if not get_pp_group().is_last_rank:
+        return IntermediateTensors({
+            "hidden_states": hidden_states,
+            "residual": residual
+        })
+
+    hidden_states, _ = self.norm(hidden_states, residual)
+    return hidden_states
+
+
+DeepseekV2Model.forward = forward
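This module works as a monkey patch: assigning the module-level forward onto DeepseekV2Model at import time replaces the upstream method for every model instance constructed afterwards, which is why the single import added in the first hunk above is enough to activate it. A minimal sketch of verifying the rebinding, assuming vllm and vllm_ascend are importable in the current environment (the assert is illustrative only):

import vllm_ascend.patch.worker.patch_deepseek  # noqa: F401  (import applies the patch)
from vllm.model_executor.models.deepseek_v2 import DeepseekV2Model

# After the import side effect, the bound function originates from the patch module.
assert DeepseekV2Model.forward.__module__ == "vllm_ascend.patch.worker.patch_deepseek"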
@@ -159,7 +159,8 @@ class NPUPlatform(Platform):
         compilation_config.splitting_ops = []
 
         compilation_config.cudagraph_num_of_warmups = 1
-        compilation_config.pass_config.enable_fusion = False
+        compilation_config.pass_config.fuse_norm_quant = False
+        compilation_config.pass_config.fuse_act_quant = False
 
         if compilation_config.mode not in [
                 CompilationMode.NONE, CompilationMode.VLLM_COMPILE
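The platform code here forces the graph-fusion passes off on NPU rather than leaving them to user configuration. A hedged runtime check of the resulting flags; the fuse_norm_quant and fuse_act_quant names are taken from this diff and may not exist on earlier vLLM releases, hence the getattr fallbacks:

from vllm.config import get_current_vllm_config

def npu_fusion_passes_disabled() -> bool:
    # Read the pass flags the NPU platform hook is expected to have forced off.
    pass_config = get_current_vllm_config().compilation_config.pass_config
    return not (getattr(pass_config, "fuse_norm_quant", False)
                or getattr(pass_config, "fuse_act_quant", False))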
@@ -194,7 +195,7 @@ class NPUPlatform(Platform):
         # to ascend ops && hardwares. We update these sizes here to improve
         # default performance.
         update_default_aclgraph_sizes(vllm_config)
-        # TODO delete graph size update here when compilation_config.pass_config.enable_sequence_parallelism
+        # TODO delete graph size update here when compilation_config.pass_config.enable_sp
         # is supported by vllm-ascend.
         if vllm_config.parallel_config.tensor_parallel_size > 1 and not vllm_config.model_config.enforce_eager and \
                 enable_sp(vllm_config):
@@ -315,8 +315,8 @@ class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer):
             eps=config.rms_norm_eps)
 
         self.enable_sequence_parallelism = (
-            vllm_config.compilation_config.pass_config.
-            enable_sequence_parallelism if vllm_config is not None else False)
+            vllm_config.compilation_config.pass_config.enable_sp
+            if vllm_config is not None else False)
 
     def forward(
         self,
@@ -488,7 +488,7 @@ class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
-        self.enable_sequence_parallelism = vllm_config.compilation_config.pass_config.enable_sequence_parallelism
+        self.enable_sequence_parallelism = vllm_config.compilation_config.pass_config.enable_sp
         # Set MoE hyperparameters
         self.expert_weights: list[torch.Tensor] = []
@@ -773,8 +773,7 @@ def enable_sp(vllm_config=None, enable_shared_expert_dp: bool = False) -> bool:
             from vllm.config import get_current_vllm_config
             vllm_config = get_current_vllm_config()
         _ENABLE_SP = (
-            vllm_config.compilation_config.pass_config.
-            enable_sequence_parallelism
+            vllm_config.compilation_config.pass_config.enable_sp
             or envs_ascend.VLLM_ASCEND_ENABLE_FLASHCOMM1
             # Flash comm 1 should be enabled by env VLLM_ASCEND_ENABLE_FLASHCOMM1
             # We retain the env VLLM_ASCEND_ENABLE_FLASHCOMM here for backward compatibility.
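Because this release renames the pass-config field from enable_sequence_parallelism to enable_sp, code that must run against both the old and the new vLLM can read the flag defensively. A small compatibility sketch, assuming only the attribute name changed between releases (the helper name is illustrative):

def sp_pass_enabled(vllm_config) -> bool:
    # Prefer the new field name, fall back to the pre-0.12 one.
    pass_config = vllm_config.compilation_config.pass_config
    return bool(getattr(pass_config, "enable_sp",
                        getattr(pass_config, "enable_sequence_parallelism", False)))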