[npugraph_ex] enable npugraph_ex by default (#6664)

### What this PR does / why we need it?

This pull request enables the `npugraph_ex` backend by default to
improve performance on Ascend NPUs, as proposed in the
[RFC](https://github.com/vllm-project/vllm-ascend/issues/6214).


### Does this PR introduce _any_ user-facing change?

Yes. `npugraph_ex` is now enabled by default. Users can disable it by
setting `enable: false` in the `npugraph_ex_config` section of the
`additional_config`.
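
For illustration, a minimal sketch of opting out from Python (the model name is a placeholder; the `additional_config` layout mirrors the E2E test further down):

```python
# Sketch only: explicitly opt out of the npugraph_ex backend now that it defaults to on.
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # placeholder model name
    additional_config={
        "npugraph_ex_config": {
            "enable": False,
        }
    },
)
```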

### How was this patch tested?

CI passed. The changes are covered by existing and new E2E tests
(`test_aclgraph_accuracy.py`) and unit tests (`test_ascend_config.py`)
that have been updated to reflect the new default behavior. The tests
verify correctness and consistency with `npugraph_ex` enabled and
disabled, as well as with the new static kernel option.

Signed-off-by: huyuanquan1 <huyuanquan1@huawei.com>
Co-authored-by: huyuanquan1 <huyuanquan1@huawei.com>
iiiklw authored on 2026-02-12 08:44:06 +08:00; committed by GitHub
parent b86ea66b0a
commit a0315f6697
10 changed files with 159 additions and 9 deletions


@@ -94,7 +94,7 @@ The details of each configuration option are as follows:
| Name | Type | Default | Description |
|------------------------| ---- |---------|----------------------------------------------------------------------------------------|
| `enable` | bool | `False` | Whether to enable npugraph_ex backend. |
| `enable` | bool | `True` | Whether to enable npugraph_ex backend. |
| `enable_static_kernel` | bool | `False` | Whether to enable static kernel. Suitable for scenarios where shape changes are minimal and some time is available for static kernel compilation. |
| `fuse_norm_quant` | bool | `True` | Whether to enable fuse_norm_quant pass. |
| `fuse_qknorm_rope` | bool | `True` | Whether to enable fuse_qknorm_rope pass. If Triton is not in the environment, set it to False. |
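
For reference, a hedged sketch of an `additional_config` that spells out the options documented in this table with their post-change defaults (with `vllm serve`, the same structure can typically be passed as JSON via `--additional-config`):

```python
# Sketch of the documented npugraph_ex options with their post-PR defaults.
additional_config = {
    "npugraph_ex_config": {
        "enable": True,                 # new default introduced by this PR
        "enable_static_kernel": False,  # only worthwhile when shapes barely change
        "fuse_norm_quant": True,
        "fuse_qknorm_rope": True,       # set to False if Triton is not available
    }
}
```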


@@ -147,6 +147,11 @@ def test_full_decode_only_res_consistency(cur_case: LLMTestCase, monkeypatch):
"cudagraph_mode": "FULL_DECODE_ONLY"
},
"quantization": cur_case.quantization,
"additional_config": {
"npugraph_ex_config": {
"enable": False
}
},
}
gen_and_valid(runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,


@@ -67,7 +67,7 @@ class TestAscendConfig(TestBase):
self.assertTrue(ascend_config.multistream_overlap_shared_expert)
npugraph_ex_config = ascend_config.npugraph_ex_config
self.assertFalse(npugraph_ex_config.enable)
self.assertTrue(npugraph_ex_config.enable)
self.assertFalse(npugraph_ex_config.enable_static_kernel)
ascend_compilation_config = ascend_config.ascend_compilation_config


@@ -260,7 +260,7 @@ class NpugraphExConfig:
def __init__(
self,
enable: bool = False,
enable: bool = True,
enable_static_kernel: bool = False,
fuse_norm_quant: bool = True,
fuse_qknorm_rope: bool = True,
@@ -274,7 +274,7 @@ class NpugraphExConfig:
enable (bool): Whether to enable npugraph_ex backend.
When set to True, the Fx graph generated by Dymano will be
optimized and compiled by the npugraph_ex backend.
Default: False
Default: True
enable_static_kernel (bool): Whether to enable static kernel.
Static kernel is suitable for scenarios with purely static shapes
or minimal shape changes, and can improve network performance.


@@ -88,7 +88,7 @@ def npugraph_ex_compile(
# that can trigger the compilation of static kernel. If this configuration is
# not applied, new shapes will trigger the compilation of static kernels,
# affecting program execution.
num_spec_tokens = vllm_config.speculative_config.num_speculative_token if vllm_config.speculative_config else 0
num_spec_tokens = vllm_config.speculative_config.num_speculative_tokens if vllm_config.speculative_config else 0
uniform_decode_query_len = num_spec_tokens + 1
max_num_tokens = vllm_config.scheduler_config.max_num_seqs * uniform_decode_query_len
decode_cudagraph_batch_sizes = [

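As a worked illustration of the shape bookkeeping above (the concrete numbers are assumptions, not values from this PR):

```python
# Illustrative calculation only; num_spec_tokens and max_num_seqs are assumed values.
num_spec_tokens = 1                                       # e.g. one speculative (MTP) token per step
uniform_decode_query_len = num_spec_tokens + 1            # 2 tokens queried per sequence during decode
max_num_seqs = 128
max_num_tokens = max_num_seqs * uniform_decode_query_len  # 256, the largest uniform decode batch
# Declaring these sizes up front lets static kernels be compiled ahead of time,
# so a previously unseen shape does not trigger compilation in the middle of a run.
```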

@@ -19,6 +19,7 @@
from torch import fx as fx
from vllm.config import VllmConfig
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.15.0"):
@@ -55,18 +56,19 @@ class NpuGraphEXPassManager:
def configure(self, config: VllmConfig):
# By default, we enable the graph fusion and quantization fusion pass.
self.npugraph_ex_config: dict = config.additional_config.get("npugraph_ex_config", {})
if self.npugraph_ex_config.get("fuse_norm_quant", True):
self.npugraph_ex_config = get_ascend_config().npugraph_ex_config
if self.npugraph_ex_config.fuse_norm_quant:
from .npugraph_ex_passes.graphex_norm_quant_fusion_pass import GraphEXAddRMSNormFusionPass
self.passes.append(GraphEXAddRMSNormFusionPass(config))
if self.npugraph_ex_config.get("fuse_qknorm_rope", True):
if self.npugraph_ex_config.fuse_qknorm_rope:
from .npugraph_ex_passes.graphex_qknorm_rope_fusion_pass import GraphEXQKNormRopeFusionPass
self.passes.append(GraphEXQKNormRopeFusionPass(config))
if self.npugraph_ex_config.get("fuse_allreduce_rms", True):
if self.npugraph_ex_config.fuse_allreduce_rms:
from .npugraph_ex_passes.graphex_allreduce_rmsnorm_fusion_pass import GraphEXMatmulAllReduceAddRMSNormPass
self.passes.append(GraphEXMatmulAllReduceAddRMSNormPass(config))


@@ -249,3 +249,17 @@
# make unquantized_gemm as a customop.
# Future Plan:
# Remove this patch when vLLM support the operator as customop.
#
# ** 13. File: worker/patch_npugraph_ex_triton.py**
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `torchair.core._concrete_graph.ValuePack`,
# `torchair.npu_fx_compiler._unpack_meta`,
# `torchair.npu_fx_compiler._NpuGraphConverter._unpack_npu`
# Why:
# In the Triton scenario, the npugraph_ex backend needs to process the ValuePack of the input parameters.
# How:
# Supplement the relevant processing logic through patches.
# Related PR (if no, explain why):
# https://gitcode.com/Ascend/torchair/pull/2575
# Future Plan:
# Remove this patch when the PTA version used by vllm-ascend has been upgraded.


@@ -33,3 +33,4 @@ import vllm_ascend.patch.worker.patch_rejection_sampler # noqa
import vllm_ascend.patch.worker.patch_qwen3_next # noqa
import vllm_ascend.patch.worker.patch_v2_egale # noqa
import vllm_ascend.patch.worker.patch_huanyuan_vl # noqa
import vllm_ascend.patch.worker.patch_npugraph_ex_triton # noqa


@@ -0,0 +1,116 @@
#
# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import importlib
import sys
import torch
import torchair
from torch._subclasses.fake_tensor import FakeTensor
from torchair.core._concrete_graph import _is_symlist
from torchair.npu_fx_compiler import _unpack_meta_list
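
# Drop-in replacement for torchair.core._concrete_graph.ValuePack (installed below):
# pairs a "meta" value (fake/symbolic tensor) with its NPU-graph counterpart and
# adds __getitem__ so a dict-typed meta can be indexed directly.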
class ValuePack:
def __init__(self, meta, npu_meta=None) -> None:
self._meta = meta
self._npu_meta = meta if npu_meta is None else npu_meta
@property
def meta(self):
return self._meta
@property
def npu(self):
return self._npu_meta
def __getitem__(self, key):
if isinstance(self._meta, dict):
return self._meta.get(key)
raise ValueError(f"Unsupported meta type for ValuePack __getitem__, key:{key}, type: {type(self._meta)}")
def __repr__(self) -> str:
if isinstance(self._meta, FakeTensor):
meta_str = f"FakeTensor(dtype={self._meta.dtype}, size={list(self._meta.size())}"
elif isinstance(self._meta, torch.Tensor):
meta_str = f"torch.Tensor(dtype={self._meta.dtype}, size={list(self._meta.size())}"
elif isinstance(self._meta, torch.SymInt):
meta_str = f"torch.SymInt({self._meta})"
else:
try:
meta_str = f"{type(self._meta)}({self._meta})"
except Exception:
meta_str = f"{type(self._meta)}"
return f"Pack(meta:{meta_str} npu:{self._npu_meta})"
def _unpack_meta(args, kwargs):
unpacked_args = []
unpacked_kwargs = {}
def _get_meta_part(arg):
if isinstance(arg, (list, tuple)) and any(isinstance(v, ValuePack) for v in arg):
return _unpack_meta_list(arg)
elif isinstance(arg, dict):
return {k: v.meta if isinstance(v, ValuePack) else v for k, v in arg.items()}
elif isinstance(arg, ValuePack):
return arg.meta
else:
return arg
for arg in args:
unpacked_args.append(_get_meta_part(arg))
for key, value in kwargs.items():
unpacked_kwargs[key] = _get_meta_part(value)
return list(unpacked_args), unpacked_kwargs
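
# Bound as _NpuGraphConverter._unpack_npu (see the assignment at the bottom of this
# file): extracts the NPU-graph side of args/kwargs, converting symint lists through
# self._graph.parse_symlist and unwrapping ValuePacks in lists/tuples and dict values.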
def _unpack_npu(self, args, kwargs):
unpacked = []
unpacked_kwargs = {}
def _get_npu_part(arg):
if isinstance(arg, (list, tuple)) and len(arg):
if _is_symlist(arg):
arg = self._graph.parse_symlist(arg)
else:
arg = [(v.npu if isinstance(v, ValuePack) else v) for v in arg]
return arg
elif isinstance(arg, dict):
return {k: v.npu if isinstance(v, ValuePack) else v for k, v in arg.items()}
elif isinstance(arg, ValuePack):
return arg.npu
else:
return arg
for arg in args:
unpacked.append(_get_npu_part(arg))
for key, value in kwargs.items():
unpacked_kwargs[key] = _get_npu_part(value)
return unpacked, unpacked_kwargs
torchair.core._concrete_graph.ValuePack = ValuePack
# The ValuePack class is referenced in these two modules, and after the patch, these two modules need to be reloaded.
importlib.reload(sys.modules["torchair.fx_summary"])
importlib.reload(sys.modules["torchair.npu_fx_compiler"])
torchair.npu_fx_compiler._unpack_meta = _unpack_meta
torchair.npu_fx_compiler._NpuGraphConverter._unpack_npu = _unpack_npu


@@ -210,6 +210,18 @@ class NPUPlatform(Platform):
"{new_compile_ranges_split_points} for matmul and allreduce fusion"
)
npugraph_ex_config = ascend_config.npugraph_ex_config
if npugraph_ex_config and npugraph_ex_config.fuse_allreduce_rms:
from vllm_ascend.compilation.passes.allreduce_rmsnorm_fusion_pass import ALLREDUCE_NORM_FUSE_THREHOLD
new_compile_ranges_split_points = vllm_config.compilation_config.compile_ranges_split_points
new_compile_ranges_split_points.append(ALLREDUCE_NORM_FUSE_THREHOLD)
new_compile_ranges_split_points = sorted(new_compile_ranges_split_points)
vllm_config.compilation_config.compile_ranges_split_points = new_compile_ranges_split_points
logger.debug(
"set compile_ranges_split_points to {new_compile_ranges_split_points} for matmul and allreduce fusion"
)
elif model_config and hasattr(model_config.hf_text_config, "index_topk"):
vllm_config.cache_config.cache_dtype = str(model_config.dtype).replace("torch.", "")