enable npugraph_ex (#5120)

### What this PR does / why we need it? This PR exposes the enabling switch for npugraph_ex to better facilitate subsequent optimization. ### Does this PR introduce _any_ user-facing change? Previously, setting the enable_npugraph_ex switch raised a NotImplementedError; that check has now been removed so the switch can be turned on, to better facilitate subsequent optimization efforts. Basic functionalities are available in CANN and torch_npu for Q3, while advanced optimizations will depend on the Q4 release. ### How was this patch tested? llm = LLM( model=model, enforce_eager=False, additional_config={ "enable_npugraph_ex": True }, compilation_config={ "cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [16], }, ) - vLLM version: v0.12.0 - vLLM main: ad32e3e19c --------- Signed-off-by: p00465316 <panchao13@huawei.com> Co-authored-by: p00465316 <panchao13@huawei.com> Co-authored-by: weijinqian0 <1184188277@qq.com>
2025-12-18 09:08:40 +08:00
parent 39bdd4cfaa
commit 8069442b41
4 changed files with 107 additions and 13 deletions
--- a/tests/e2e/singlecard/test_aclgraph_accuracy.py
+++ b/tests/e2e/singlecard/test_aclgraph_accuracy.py
@@ -194,6 +194,104 @@ def test_output_between_eager_and_full_decode_only(
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
def test_output_between_eager_and_fullgraph_npugraph_ex(
    model: str,
    max_tokens: int,
) -> None:
    """Check graph-mode generation with ``enable_npugraph_ex`` against fixed answers.

    The model is run with ``enforce_eager=False``, ``cudagraph_mode`` set to
    ``FULL_DECODE_ONLY`` and ``additional_config={"enable_npugraph_ex": True}``.
    The generated texts are then compared, via ``check_outputs_equal``, with
    answers hard-coded in this test. No eager run happens here: the
    "vllm_eager_outputs" side of the comparison is the hard-coded list.

    Args:
        model: Model identifier taken from ``MODELS``.
        max_tokens: Maximum number of tokens to generate per prompt.
    """
    # Make sure HCCL_OP_EXPANSION_MODE is not set for this run.
    os.environ.pop('HCCL_OP_EXPANSION_MODE', None)

    # NOTE: Randomly fill the prompt with the requested amount for
    # the specified capture shape to prevent accuracy issues caused by padding
    #
    # NOTE(review): adjacent string literals below are concatenated without
    # separating spaces (e.g. "step by step.The last line ..."). They are left
    # untouched because the hard-coded expected answers further down are
    # compared against generations for these exact prompt strings.
    prompts = [
        ('Solve the following math problem step by step.'
         'The last line of your response should be of the form Answer: '
         '$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
         'In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$'
         'be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$,'
         '$\\angle BDC = 90^\\circ$. Suppose $AD = 1$ and $\\frac{BD}{CD} = \\frac{3}{2}$.'
         'If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$,'
         'where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$.'
         ),
        ('Solve the following math problem step by step.'
         'The last line of your response should be of the form Answer: '
         '$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
         'Let $ABCD$ be a unit square in the plane. Points $X$ and $Y$ are chosen'
         'independently and uniformly at random on the perimeter of $ABCD$.'
         'If the expected value of the area of triangle $\\triangle AXY$'
         'can be expressed as $\\frac{m}{n}$, for relatively prime positive'
         'integers $m$ and $n$, compute $m+n$.'),
        ('Solve the following math problem step by step.'
         'The last line of your response should be of the form Answer: '
         '$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
         'Let $a, b, c$ be distinct numbers such that the equations $x^2 + ax + 1 = 0$'
         'and $x^2 + bx + c = 0$ have a common real root, and the equations $x^2 + x + a = 0$'
         'and $x^2 + cx + b = 0$ also have a common real root.'
         'Compute the sum $a + b + c$.')
    ]

    # Expected generations, one per prompt, for the non-DeepSeek models.
    vllm_aclgraph_qwen_answers = [
        ' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
        " \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle formed by two random points on a square's perimeter is",
        ' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can'
    ]
    # Expected generations, one per prompt, for DeepSeek-V2-Lite-W8A8.
    vllm_aclgraph_ds_answers = [
        '\n\nSelect an assignment template',
        '\n\nSelect an assignment template',
        '\n\nSelect an assignment template'
    ]

    # temperature=0.0 with top_k=1 requests greedy decoding.
    sampling_params = SamplingParams(max_tokens=max_tokens,
                                     n=1,
                                     temperature=0.0,
                                     top_p=1.0,
                                     top_k=1)

    # Only the compilation config, the quantization flag and the expected
    # answers differ between the two model families; everything else passed
    # to VllmRunner is shared.
    if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
        runner_kwargs = {
            "compilation_config": {
                "cudagraph_mode": "FULL_DECODE_ONLY"
            },
            "quantization": "ascend",
        }
        expected_answers = vllm_aclgraph_ds_answers
    else:
        runner_kwargs = {
            "compilation_config": {
                "cudagraph_capture_sizes": [4, 8, 32, 64],
                "cudagraph_mode": "FULL_DECODE_ONLY"
            },
        }
        expected_answers = vllm_aclgraph_qwen_answers

    with VllmRunner(
            model,
            max_model_len=1024,
            enforce_eager=False,
            additional_config={"enable_npugraph_ex": True},
            **runner_kwargs,
    ) as runner:
        vllm_aclgraph_outputs = runner.model.generate(prompts, sampling_params)

    # check_outputs_equal is fed (ids, text) pairs; the first completion's
    # index stands in for the ids on both sides.
    vllm_aclgraph_outputs_list = [
        ([output.outputs[0].index], output.outputs[0].text)
        for output in vllm_aclgraph_outputs
    ]
    vllm_eager_outputs_list = [([0], answer) for answer in expected_answers]

    check_outputs_equal(
        outputs_0_lst=vllm_eager_outputs_list,
        outputs_1_lst=vllm_aclgraph_outputs_list,
        name_0="vllm_eager_outputs",
        name_1="vllm_aclgraph_outputs",
    )
 def test_aclgraph_enable():
    # Generally, this test does not belong in e2e, but it is a good way to check
    # whether aclgraph is enabled in a real environment
--- a/tests/ut/test_ascend_config.py
+++ b/tests/ut/test_ascend_config.py
@@ -64,13 +64,13 @@ class TestAscendConfig(TestBase):
    @_clean_up_ascend_config
    def test_init_ascend_config_enable_npugraph_ex(self):
-        with self.assertRaises(NotImplementedError):
+        test_vllm_config = VllmConfig()
-            test_vllm_config = VllmConfig()
+        test_vllm_config.additional_config = {
-            test_vllm_config.additional_config = {
+            "enable_npugraph_ex": True,
-                "enable_npugraph_ex": True,
+            "refresh": True,
-                "refresh": True,
+        }
-            }
+        ascend_config = init_ascend_config(test_vllm_config)
-            init_ascend_config(test_vllm_config)
+        self.assertTrue(ascend_config.enable_npugraph_ex)
    @_clean_up_ascend_config
    def test_get_ascend_config(self):
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -150,10 +150,6 @@ class AscendConfig:
            self, vllm_config)
        self.enable_npugraph_ex = additional_config.get(
            "enable_npugraph_ex", False)
        if self.enable_npugraph_ex:
            raise NotImplementedError(
                "This feature is still in the experiment and will be supported soon."
            )
        # We find that _npu_paged_attention still performs better than
        # npu_fused_infer_attention_score in some cases. We allow to execute
        # _npu_paged_attention in this cases. This should be removed once
--- a/vllm_ascend/compilation/compiler_interface.py
+++ b/vllm_ascend/compilation/compiler_interface.py
@@ -89,13 +89,13 @@ def npugraph_ex_compile(
                                              tuple,
                                              args=([return_value], ))
        output_node.args = (tuple_node, )
-        fx_graph.recompile()
+        graph.recompile()
    import torchair
    # TODO: use a better way to lazy register replacement, instead of import one by one
    # As an example, we directly import here to register replacement.
-    import vllm_ascend.compilation.npugraph_ex_passes.add_rms_norm_quant  # noqa
+    # import vllm_ascend.compilation.npugraph_ex_passes.add_rms_norm_quant  # noqa
    torch.npu.set_compile_mode(jit_compile=False)
    config = torchair.CompilerConfig()