[Feature]refactor the npugraph_ex config, support online-infer with static kernel (#5775)

### What this PR does / why we need it?

This is a part of https://github.com/vllm-project/vllm-ascend/issues/4715#issue-3694310762

1. Refactor the npugraph_ex config and modify the default configuration of the static kernel; the new default value of static kernel is false.
2. Support online-infer with static kernel.
3. Fix the issue where manually modifying FX graphs caused an abnormal model return type, and remove the related redundant code.

### Does this PR introduce _any_ user-facing change?

Yes, the new config of npugraph_ex is as follows:

```
additional_config={
    "npugraph_ex_config": {
        "enable": True,
        "enable_static_kernel": False
    }
}
```

### How was this patch tested?

```
vllm serve /data/DeepSeek-V3.1-Terminus-w4a8 \
--host 0.0.0.0 \
--port 8004 \
--data-parallel-size 4 \
--tensor-parallel-size 4 \
--quantization ascend \
--seed 1024 \
--served-model-name deepseek_v3 \
--enable-expert-parallel \
--max-num-seqs 48 \
--max-model-len 40000 \
--async-scheduling \
--max-num-batched-tokens 9000 \
--trust-remote-code \
--no-enable-prefix-caching \
--speculative-config '{"num_speculative_tokens": 3, "method":"deepseek_mtp","disable_padded_drafter_batch": false}' \
--gpu-memory-utilization 0.9 \
--compilation-config '{"cudagraph_capture_sizes":[4,32,64,112,160,176,192], "cudagraph_mode": "FULL_DECODE_ONLY"}' \
--additional-config \
'{"enable_shared_expert_dp": true,"multistream_overlap_shared_expert": true,"npugraph_ex_config":{"enable":true}}'
```

- vLLM version: v0.13.0
- vLLM main: 2f4e6548ef

---------

Signed-off-by: chencangtao <chencangtao@huawei.com>
Signed-off-by: ChenCangtao <50493711+ChenCangtao@users.noreply.github.com>
Co-authored-by: chencangtao <chencangtao@huawei.com>
2026-01-20 21:31:38 +08:00
parent 0c0514579f
commit 6c30f8bf87
6 changed files with 91 additions and 17 deletions
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -102,7 +102,8 @@ class AscendConfig:
        from vllm_ascend.utils import get_flashcomm2_config_and_validate

        self.flashcomm2_oproj_tensor_parallel_size = get_flashcomm2_config_and_validate(self, vllm_config)
-        self.enable_npugraph_ex = additional_config.get("enable_npugraph_ex", False)
+        npugraph_ex_config = additional_config.get("npugraph_ex_config", {})
+        self.npugraph_ex_config = NpugraphExConfig(**npugraph_ex_config)
        # We find that _npu_paged_attention still performs better than
        # npu_fused_infer_attention_score in some cases. We allow to execute
        # _npu_paged_attention in this cases. This should be removed once
@@ -211,6 +212,36 @@ class AscendFusionConfig:
        self.fusion_ops_gmmswigluquant = fusion_ops_gmmswigluquant


+class NpugraphExConfig:
+    """
+    Configuration for controlling the behavior of the npugraph_ex backend.
+
+    This class provides a way to configure whether to use the npugraph_ex backend and the static kernel.
+    These configurations can directly impact the performance and behavior of models deployed on Ascend platforms.
+    """
+
+    def __init__(self, enable: bool = False, enable_static_kernel: bool = False, **kwargs):
+        """
+        Initialize the configuration.
+
+        Args:
+            enable (bool): Whether to enable the npugraph_ex backend.
+                When set to True, the FX graph generated by Dynamo will be
+                optimized and compiled by the npugraph_ex backend.
+                Default: False
+            enable_static_kernel (bool): Whether to enable static kernel.
+                Static kernel is suitable for scenarios with purely static shapes
+                or minimal shape changes, and can improve network performance.
+                When set to True, graph capture compiles operator binary files
+                with the corresponding shapes based on the current batch_size,
+                which usually takes some time.
+                Default: False
+            **kwargs: Extra keys accepted for forward compatibility and configuration extension; currently ignored.
+        """
+        self.enable = enable
+        self.enable_static_kernel = enable_static_kernel
+
+
 class XliteGraphConfig:
    """
    Configuration Object for xlite_graph_config from additional_config