enable npugraph_ex (#5120)

### What this PR does / why we need it?
We will expose the enabling switch for npugraph_ex to better facilitate
subsequent optimization.

### Does this PR introduce _any_ user-facing change?
Previously, the enable_npugraph_ex switch would trigger an error; now we
have removed the error reporting mechanism to better facilitate
subsequent optimization efforts.
Basic functionalities are available in CANN and torch_npu for Q3, while
advanced optimizations will depend on the Q4 release.

### How was this patch tested?
llm = LLM(
    model=model,
    enforce_eager=False,
    additional_config={
        "enable_npugraph_ex": True
    },
    compilation_config={
        "cudagraph_mode": "FULL_DECODE_ONLY",
        "cudagraph_capture_sizes": [16],
    },
)


- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c

---------

Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
Co-authored-by: weijinqian0 <1184188277@qq.com>
This commit is contained in:
panchao-hub
2025-12-18 09:08:40 +08:00
committed by GitHub
parent 39bdd4cfaa
commit 8069442b41
4 changed files with 107 additions and 13 deletions

View File

@@ -194,6 +194,104 @@ def test_output_between_eager_and_full_decode_only(
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
def test_output_between_eager_and_fullgraph_npugraph_ex(
    model: str,
    max_tokens: int,
) -> None:
    """Check npugraph_ex full-graph decode output against recorded answers.

    Runs *model* with ``enable_npugraph_ex`` turned on in FULL_DECODE_ONLY
    cudagraph mode and compares the generated text with pre-recorded
    expected answers (one answer set for the quantized DeepSeek model, one
    for the Qwen models).

    Args:
        model: model identifier supplied by the ``MODELS`` parametrization.
        max_tokens: generation budget per prompt (parametrized, 32).
    """
    # HCCL_OP_EXPANSION_MODE can change collective-op behaviour between
    # runs; drop it so results are reproducible. pop() with a default
    # replaces the original membership-test-then-delete dance.
    os.environ.pop('HCCL_OP_EXPANSION_MODE', None)
    # NOTE: Randomly fill the prompt with the requested amount for
    # the specified capture shape to prevent accuracy issues caused by padding
    prompts = [
        ('Solve the following math problem step by step.'
         'The last line of your response should be of the form Answer: '
         '$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
         'In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$'
         'be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$,'
         '$\\angle BDC = 90^\\circ$. Suppose $AD = 1$ and $\\frac{BD}{CD} = \\frac{3}{2}$.'
         'If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$,'
         'where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$.'
         ),
        ('Solve the following math problem step by step.'
         'The last line of your response should be of the form Answer: '
         '$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
         'Let $ABCD$ be a unit square in the plane. Points $X$ and $Y$ are chosen'
         'independently and uniformly at random on the perimeter of $ABCD$.'
         'If the expected value of the area of triangle $\\triangle AXY$'
         'can be expressed as $\\frac{m}{n}$, for relatively prime positive'
         'integers $m$ and $n$, compute $m+n$.'),
        ('Solve the following math problem step by step.'
         'The last line of your response should be of the form Answer: '
         '$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
         'Let $a, b, c$ be distinct numbers such that the equations $x^2 + ax + 1 = 0$'
         'and $x^2 + bx + c = 0$ have a common real root, and the equations $x^2 + x + a = 0$'
         'and $x^2 + cx + b = 0$ also have a common real root.'
         'Compute the sum $a + b + c$.')
    ]
    # Golden answers captured from a known-good eager run, per model family.
    vllm_aclgraph_qwen_answers = [
        ' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
        " \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle formed by two random points on a square's perimeter is",
        ' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can'
    ]
    vllm_aclgraph_ds_answers = [
        '\n\nSelect an assignment template',
        '\n\nSelect an assignment template',
        '\n\nSelect an assignment template'
    ]
    # Greedy decoding (temperature 0, top_k 1) keeps outputs deterministic.
    sampling_params = SamplingParams(max_tokens=max_tokens,
                                     n=1,
                                     temperature=0.0,
                                     top_p=1.0,
                                     top_k=1)
    # Only the runner kwargs differ between the quantized DeepSeek model and
    # the default (Qwen) path; the runner setup and generation are shared.
    if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
        extra_runner_kwargs = {
            "compilation_config": {"cudagraph_mode": "FULL_DECODE_ONLY"},
            "quantization": "ascend",
        }
    else:
        extra_runner_kwargs = {
            "compilation_config": {
                "cudagraph_capture_sizes": [4, 8, 32, 64],
                "cudagraph_mode": "FULL_DECODE_ONLY"
            },
        }
    with VllmRunner(
            model,
            max_model_len=1024,
            enforce_eager=False,
            additional_config={"enable_npugraph_ex": True},
            **extra_runner_kwargs,
    ) as runner:
        vllm_aclgraph_outputs = runner.model.generate(prompts, sampling_params)
    vllm_aclgraph_outputs_list = [
        ([output.outputs[0].index], output.outputs[0].text)
        for output in vllm_aclgraph_outputs
    ]
    # Fixed: the original assigned an empty list that was immediately
    # overwritten (dead store); build the expected list in one expression.
    vllm_eager_outputs_list = ([
        ([0], answer) for answer in vllm_aclgraph_ds_answers
    ] if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8" else [
        ([0], answer) for answer in vllm_aclgraph_qwen_answers
    ])
    check_outputs_equal(
        outputs_0_lst=vllm_eager_outputs_list,
        outputs_1_lst=vllm_aclgraph_outputs_list,
        name_0="vllm_eager_outputs",
        name_1="vllm_aclgraph_outputs",
    )
def test_aclgraph_enable():
# Generally, this test does not belong in e2e, but it is a good way to check
# whether aclgraph is enabled in a real environment

View File

@@ -64,13 +64,13 @@ class TestAscendConfig(TestBase):
@_clean_up_ascend_config
def test_init_ascend_config_enable_npugraph_ex(self):
with self.assertRaises(NotImplementedError):
test_vllm_config = VllmConfig()
test_vllm_config.additional_config = {
"enable_npugraph_ex": True,
"refresh": True,
}
init_ascend_config(test_vllm_config)
ascend_config = init_ascend_config(test_vllm_config)
self.assertTrue(ascend_config.enable_npugraph_ex)
@_clean_up_ascend_config
def test_get_ascend_config(self):

View File

@@ -150,10 +150,6 @@ class AscendConfig:
self, vllm_config)
self.enable_npugraph_ex = additional_config.get(
"enable_npugraph_ex", False)
if self.enable_npugraph_ex:
raise NotImplementedError(
"This feature is still in the experiment and will be supported soon."
)
# We find that _npu_paged_attention still performs better than
# npu_fused_infer_attention_score in some cases. We allow to execute
# _npu_paged_attention in this cases. This should be removed once

View File

@@ -89,13 +89,13 @@ def npugraph_ex_compile(
tuple,
args=([return_value], ))
output_node.args = (tuple_node, )
fx_graph.recompile()
graph.recompile()
import torchair
# TODO: use a better way to lazy register replacement, instead of import one by one
# As an example, we directly import here to register replacement.
import vllm_ascend.compilation.npugraph_ex_passes.add_rms_norm_quant # noqa
# import vllm_ascend.compilation.npugraph_ex_passes.add_rms_norm_quant # noqa
torch.npu.set_compile_mode(jit_compile=False)
config = torchair.CompilerConfig()