upgrade vLLM to 0.12.0 tag (#4647)
Upgrade vLLM to v0.12.0 tag
- vLLM version: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
- vLLM main: 86e178f7c4
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
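As a quick sanity check after the upgrade, the installed package version can be read directly. A minimal sketch, assuming the standard vllm.__version__ attribute and that the v0.12.0 tag reports a "0.12.0" version string:

import vllm

# Expect the version string that corresponds to the v0.12.0 tag named above.
assert vllm.__version__.startswith("0.12.0"), vllm.__version__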
@@ -23,6 +23,7 @@ if HAS_TRITON:
 # isort: off
 import vllm_ascend.patch.platform.patch_sched_yield # noqa
 import vllm_ascend.patch.worker.patch_distributed # noqa
+import vllm_ascend.patch.worker.patch_deepseek # noqa
 import vllm_ascend.patch.worker.patch_roberta # noqa
 import vllm_ascend.patch.worker.patch_weight_loader # noqa
 import vllm_ascend.patch.worker.patch_multimodal_merge # noqa
vllm_ascend/patch/worker/patch_deepseek.py (new file, 60 lines)
@@ -0,0 +1,60 @@
+from itertools import islice
+
+import torch
+from vllm.distributed import get_pp_group
+from vllm.model_executor.models.deepseek_v2 import (DeepseekV2Model,
+                                                    _get_llama_4_scaling)
+from vllm.sequence import IntermediateTensors
+
+
+def forward(
+    self,
+    input_ids,
+    positions,
+    intermediate_tensors,
+    inputs_embeds,
+):
+    if get_pp_group().is_first_rank:
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.embed_input_ids(input_ids)
+        residual = None
+    else:
+        assert intermediate_tensors is not None
+        hidden_states = intermediate_tensors["hidden_states"]
+        residual = intermediate_tensors["residual"]
+
+    # Compute llama 4 scaling once per forward pass if enabled
+    # Note(wxy): This is a hack fix to avoid graph mode error for torch 2.8
+    # We'll find a better way to remove this patch.
+    try:
+        llama_4_scaling_config = getattr(self.config, "llama_4_scaling")
+    except AttributeError:
+        llama_4_scaling_config = None
+    llama_4_scaling: torch.Tensor | None
+    if llama_4_scaling_config is not None:
+        llama_4_scaling = _get_llama_4_scaling(
+            original_max_position_embeddings=llama_4_scaling_config[
+                "original_max_position_embeddings"],
+            scaling_beta=llama_4_scaling_config["beta"],
+            positions=positions,
+        )
+    else:
+        llama_4_scaling = None
+
+    for layer in islice(self.layers, self.start_layer, self.end_layer):
+        hidden_states, residual = layer(positions, hidden_states, residual,
+                                        llama_4_scaling)
+
+    if not get_pp_group().is_last_rank:
+        return IntermediateTensors({
+            "hidden_states": hidden_states,
+            "residual": residual
+        })
+
+    hidden_states, _ = self.norm(hidden_states, residual)
+    return hidden_states
+
+
+DeepseekV2Model.forward = forward
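This module works as a monkey patch: assigning the module-level forward onto DeepseekV2Model at import time replaces the upstream method for every model instance constructed afterwards, which is why the single import added in the first hunk above is enough to activate it. A minimal sketch of verifying the rebinding, assuming vllm and vllm_ascend are importable in the current environment (the assert is illustrative only):

import vllm_ascend.patch.worker.patch_deepseek  # noqa: F401  (import applies the patch)
from vllm.model_executor.models.deepseek_v2 import DeepseekV2Model

# After the import side effect, the bound function originates from the patch module.
assert DeepseekV2Model.forward.__module__ == "vllm_ascend.patch.worker.patch_deepseek"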
@@ -159,7 +159,8 @@ class NPUPlatform(Platform):
         compilation_config.splitting_ops = []
 
         compilation_config.cudagraph_num_of_warmups = 1
-        compilation_config.pass_config.enable_fusion = False
+        compilation_config.pass_config.fuse_norm_quant = False
+        compilation_config.pass_config.fuse_act_quant = False
 
         if compilation_config.mode not in [
                 CompilationMode.NONE, CompilationMode.VLLM_COMPILE
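The platform code here forces the graph-fusion passes off on NPU rather than leaving them to user configuration. A hedged runtime check of the resulting flags; the fuse_norm_quant and fuse_act_quant names are taken from this diff and may not exist on earlier vLLM releases, hence the getattr fallbacks:

from vllm.config import get_current_vllm_config

def npu_fusion_passes_disabled() -> bool:
    # Read the pass flags the NPU platform hook is expected to have forced off.
    pass_config = get_current_vllm_config().compilation_config.pass_config
    return not (getattr(pass_config, "fuse_norm_quant", False)
                or getattr(pass_config, "fuse_act_quant", False))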
@@ -194,7 +195,7 @@ class NPUPlatform(Platform):
         # to ascend ops && hardwares. We update these sizes here to improve
         # default performance.
         update_default_aclgraph_sizes(vllm_config)
-        # TODO delete graph size update here when compilation_config.pass_config.enable_sequence_parallelism
+        # TODO delete graph size update here when compilation_config.pass_config.enable_sp
         # is supported by vllm-ascend.
         if vllm_config.parallel_config.tensor_parallel_size > 1 and not vllm_config.model_config.enforce_eager and \
                 enable_sp(vllm_config):
@@ -315,8 +315,8 @@ class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer):
             eps=config.rms_norm_eps)
 
         self.enable_sequence_parallelism = (
-            vllm_config.compilation_config.pass_config.
-            enable_sequence_parallelism if vllm_config is not None else False)
+            vllm_config.compilation_config.pass_config.enable_sp
+            if vllm_config is not None else False)
 
     def forward(
         self,
@@ -488,7 +488,7 @@ class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
-        self.enable_sequence_parallelism = vllm_config.compilation_config.pass_config.enable_sequence_parallelism
+        self.enable_sequence_parallelism = vllm_config.compilation_config.pass_config.enable_sp
         # Set MoE hyperparameters
         self.expert_weights: list[torch.Tensor] = []
@@ -773,8 +773,7 @@ def enable_sp(vllm_config=None, enable_shared_expert_dp: bool = False) -> bool:
             from vllm.config import get_current_vllm_config
             vllm_config = get_current_vllm_config()
         _ENABLE_SP = (
-            vllm_config.compilation_config.pass_config.
-            enable_sequence_parallelism
+            vllm_config.compilation_config.pass_config.enable_sp
             or envs_ascend.VLLM_ASCEND_ENABLE_FLASHCOMM1
             # Flash comm 1 should be enabled by env VLLM_ASCEND_ENABLE_FLASHCOMM1
             # We retain the env VLLM_ASCEND_ENABLE_FLASHCOMM here for backward compatibility.
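Because this release renames the pass-config field from enable_sequence_parallelism to enable_sp, code that must run against both the old and the new vLLM can read the flag defensively. A small compatibility sketch, assuming only the attribute name changed between releases (the helper name is illustrative):

def sp_pass_enabled(vllm_config) -> bool:
    # Prefer the new field name, fall back to the pre-0.12 one.
    pass_config = vllm_config.compilation_config.pass_config
    return bool(getattr(pass_config, "enable_sp",
                        getattr(pass_config, "enable_sequence_parallelism", False)))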