[Feature] Refactor the npugraph_ex config and support online inference with static kernel (#5775)

### What this PR does / why we need it? This is part of https://github.com/vllm-project/vllm-ascend/issues/4715#issue-3694310762 1. Refactor the npugraph_ex config and change the default configuration of the static kernel; the new default value of the static kernel option is false. 2. Support online inference with the static kernel. 3. Fix the issue where manually modifying FX graphs caused an abnormal model return type, and remove the related redundant code. ### Does this PR introduce _any_ user-facing change? Yes, the new config of npugraph_ex is as follows: ``` additional_config={ "npugraph_ex_config": { "enable": True, "enable_static_kernel": False } } ``` ### How was this patch tested? ``` vllm serve /data/DeepSeek-V3.1-Terminus-w4a8 \ --host 0.0.0.0 \ --port 8004 \ --data-parallel-size 4 \ --tensor-parallel-size 4 \ --quantization ascend \ --seed 1024 \ --served-model-name deepseek_v3 \ --enable-expert-parallel \ --max-num-seqs 48 \ --max-model-len 40000 \ --async-scheduling \ --max-num-batched-tokens 9000 \ --trust-remote-code \ --no-enable-prefix-caching \ --speculative-config '{"num_speculative_tokens": 3, "method":"deepseek_mtp","disable_padded_drafter_batch": false}' \ --gpu-memory-utilization 0.9 \ --compilation-config '{"cudagraph_capture_sizes":[4,32,64,112,160,176,192], "cudagraph_mode": "FULL_DECODE_ONLY"}' \ --additional-config \ '{"enable_shared_expert_dp": true,"multistream_overlap_shared_expert": true,"npugraph_ex_config":{"enable":true}}' ``` - vLLM version: v0.13.0 - vLLM main: 2f4e6548ef --------- Signed-off-by: chencangtao <chencangtao@huawei.com> Signed-off-by: ChenCangtao <50493711+ChenCangtao@users.noreply.github.com> Co-authored-by: chencangtao <chencangtao@huawei.com>
2026-01-20 21:31:38 +08:00
parent 0c0514579f
commit 6c30f8bf87
6 changed files with 91 additions and 17 deletions
--- a/tests/ut/test_ascend_config.py
+++ b/tests/ut/test_ascend_config.py
@@ -65,7 +65,10 @@ class TestAscendConfig(TestBase):
        ascend_config = init_ascend_config(test_vllm_config)
        self.assertEqual(ascend_config.eplb_config.num_redundant_experts, 2)
        self.assertTrue(ascend_config.multistream_overlap_shared_expert)
-        self.assertFalse(ascend_config.enable_npugraph_ex)
+
+        npugraph_ex_config = ascend_config.npugraph_ex_config
+        self.assertFalse(npugraph_ex_config.enable)
+        self.assertFalse(npugraph_ex_config.enable_static_kernel)

        ascend_compilation_config = ascend_config.ascend_compilation_config
        self.assertFalse(ascend_compilation_config.fuse_norm_quant)
@@ -79,11 +82,16 @@ class TestAscendConfig(TestBase):
    def test_init_ascend_config_enable_npugraph_ex(self, mock_fix_incompatible_config):
        test_vllm_config = VllmConfig()
        test_vllm_config.additional_config = {
-            "enable_npugraph_ex": True,
-            "refresh": True,
+            "npugraph_ex_config": {
+                "enable": True,
+                "enable_static_kernel": True
+            },
+            "refresh": True
        }
-        ascend_config = init_ascend_config(test_vllm_config)
-        self.assertTrue(ascend_config.enable_npugraph_ex)
+        npugraph_ex_config = init_ascend_config(
+            test_vllm_config).npugraph_ex_config
+        self.assertTrue(npugraph_ex_config.enable)
+        self.assertTrue(npugraph_ex_config.enable_static_kernel)

    @_clean_up_ascend_config
    @patch("vllm_ascend.platform.NPUPlatform._fix_incompatible_config")