From f2990f7741b2e6dd66c2308a63db5277ed86ec4b Mon Sep 17 00:00:00 2001 From: ChenCangtao <50493711+ChenCangtao@users.noreply.github.com> Date: Fri, 30 Jan 2026 16:24:48 +0800 Subject: [PATCH] [e2e Test][npugraph_ex]add static kernel e2e test case (#6320) ### What this PR does / why we need it? Added an E2E test case for the scenario of enabling a static kernel for npugraph_ex, monitoring its compilation and unloading process. Also fixed the previously existing spelling errors - vLLM version: v0.14.1 - vLLM main: https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd --------- Signed-off-by: chencangtao Co-authored-by: chencangtao --- .../e2e/singlecard/test_aclgraph_accuracy.py | 32 +++++++++++++++++++ vllm_ascend/compilation/compiler_interface.py | 4 +-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/tests/e2e/singlecard/test_aclgraph_accuracy.py b/tests/e2e/singlecard/test_aclgraph_accuracy.py index 76ac04c7..0899fa33 100644 --- a/tests/e2e/singlecard/test_aclgraph_accuracy.py +++ b/tests/e2e/singlecard/test_aclgraph_accuracy.py @@ -16,6 +16,7 @@ # import pytest +import os from tests.e2e.singlecard.utils import (PROMPTS_LONG, PROMPTS_SHORT, LLMTestCase, gen_and_valid) @@ -133,3 +134,34 @@ def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch): prompts=cur_case.prompts, sampling_params=cur_case.sampling_params, golden_answers=cur_case.golden_answers) + +# The accuracy has already been verified in the previous test case. +# This test case is used to check whether the functionality works properly +# after enabling the static kernel and whether it is uninstalled as expected. 
+@pytest.mark.parametrize("cur_case", [CASE_QWEN_EX])
+def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch):
+    monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
+    runner_kwargs = {
+        "model_name": cur_case.model,
+        "quantization": cur_case.quantization,
+        "max_model_len": 1024,
+        "compilation_config": {
+            "cudagraph_capture_sizes": [4, 8],
+            "cudagraph_mode": "FULL_DECODE_ONLY"
+        },
+        "additional_config": {
+            "npugraph_ex_config": {
+                "enable": True,
+                "enable_static_kernel": True,
+            }
+        },
+    }
+    gen_and_valid(runner_kwargs=runner_kwargs,
+                  prompts=cur_case.prompts,
+                  sampling_params=cur_case.sampling_params,
+                  golden_answers=cur_case.golden_answers)
+
+    # Check whether the static kernel is properly uninstalled
+    ascend_home_path = os.environ["ASCEND_HOME_PATH"]
+    static_kernel_install_path = os.path.join(ascend_home_path, 'opp/static_kernel/ai_core')
+    assert not os.path.exists(static_kernel_install_path)
diff --git a/vllm_ascend/compilation/compiler_interface.py b/vllm_ascend/compilation/compiler_interface.py
index 7a029121..cefac33d 100644
--- a/vllm_ascend/compilation/compiler_interface.py
+++ b/vllm_ascend/compilation/compiler_interface.py
@@ -90,10 +90,10 @@ def npugraph_ex_compile(
     # affecting program execution.
     num_spec_tokens = vllm_config.speculative_config.num_speculative_token if vllm_config.speculative_config else 0
     uniform_decode_query_len = num_spec_tokens + 1
-    max_num_tokens = vllm_config.scheduler_config.max_num_seq * uniform_decode_query_len
+    max_num_tokens = vllm_config.scheduler_config.max_num_seqs * uniform_decode_query_len
     decode_cudagraph_batch_sizes = [
         x
-        for x in vllm_config.compilation_config.cudagraph_capture_size
+        for x in vllm_config.compilation_config.cudagraph_capture_sizes
         if max_num_tokens >= x >= uniform_decode_query_len
     ]
     config.experimental_config.aclgraph._aclnn_static_shape_kernel_sym_value_range = decode_cudagraph_batch_sizes