# Patch overview: unified diff touching four files. The newlines inside the hunks were
# flattened in this copy, so the physical line breaks below fall at arbitrary points.
#
#   * tests/e2e/singlecard/test_aclgraph_accuracy.py
#       Adds test_output_between_eager_and_fullgraph_npugraph_ex, parametrized over MODELS
#       with max_tokens=32. The test removes HCCL_OP_EXPANSION_MODE from os.environ when it
#       is set, runs three fixed math prompts through VllmRunner with enforce_eager=False,
#       cudagraph_mode "FULL_DECODE_ONLY" and additional_config {"enable_npugraph_ex": True},
#       sampling with temperature=0.0, top_p=1.0, top_k=1, and hands the generated text plus
#       hard-coded expected strings to check_outputs_equal.
#       The "vllm-ascend/DeepSeek-V2-Lite-W8A8" branch also passes quantization="ascend";
#       the other branch also sets cudagraph_capture_sizes [4, 8, 32, 64].
#   * tests/ut/test_ascend_config.py
#       test_init_ascend_config_enable_npugraph_ex no longer expects NotImplementedError;
#       it now asserts ascend_config.enable_npugraph_ex via assertTrue.
#   * vllm_ascend/ascend_config.py
#       Drops the NotImplementedError that was raised when enable_npugraph_ex is set.
#   * vllm_ascend/compilation/compiler_interface.py
#       Calls graph.recompile() instead of fx_graph.recompile() and comments out the
#       add_rms_norm_quant import.
#
# NOTE(review): in the new e2e test, vllm_eager_outputs_list / name_0="vllm_eager_outputs"
#   hold the hard-coded expected answers; no eager-mode run is visible in the added lines.
# NOTE(review): `vllm_eager_outputs_list = []` is overwritten by the very next statement.
# NOTE(review): adjacent prompt literals are concatenated without a separating space
#   (e.g. '...step by step.' directly followed by 'The last line...'). Presumably the
#   expected answers were recorded with these exact prompts -- confirm before changing them.
# NOTE(review): the "NOTE: Randomly fill the prompt..." comment does not match anything the
#   added test does with its prompts (they are fixed literals) -- confirm or drop it.
# NOTE(review): the add_rms_norm_quant import is commented out rather than removed, while
#   the TODO lines above it still describe importing to register a replacement -- confirm intent.
# NOTE(review): this hunk does not show where `graph` is bound in npugraph_ex_compile --
#   verify it is the object whose output node is rewritten just before recompile().
diff --git a/tests/e2e/singlecard/test_aclgraph_accuracy.py b/tests/e2e/singlecard/test_aclgraph_accuracy.py index e217f0bb..9eb68894 100644 --- a/tests/e2e/singlecard/test_aclgraph_accuracy.py +++ b/tests/e2e/singlecard/test_aclgraph_accuracy.py @@ -194,6 +194,104 @@ def test_output_between_eager_and_full_decode_only( ) +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("max_tokens", [32]) +def test_output_between_eager_and_fullgraph_npugraph_ex( + model: str, + max_tokens: int, +) -> None: + if 'HCCL_OP_EXPANSION_MODE' in os.environ: + del os.environ['HCCL_OP_EXPANSION_MODE'] + # NOTE: Randomly fill the prompt with the requested amount for + # the specified capture shape to prevent accuracy issues caused by padding + prompts = [ + ('Solve the following math problem step by step.' + 'The last line of your response should be of the form Answer: ' + '$Answer (without quotes) where $Answer is the answer to the problem.\n\n' + 'In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$' + 'be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$,' + '$\\angle BDC = 90^\\circ$. Suppose $AD = 1$ and $\\frac{BD}{CD} = \\frac{3}{2}$.' + 'If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$,' + 'where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$.' + ), + ('Solve the following math problem step by step.' + 'The last line of your response should be of the form Answer: ' + '$Answer (without quotes) where $Answer is the answer to the problem.\n\n' + 'Let $ABCD$ be a unit square in the plane. Points $X$ and $Y$ are chosen' + 'independently and uniformly at random on the perimeter of $ABCD$.' + 'If the expected value of the area of triangle $\\triangle AXY$' + 'can be expressed as $\\frac{m}{n}$, for relatively prime positive' + 'integers $m$ and $n$, compute $m+n$.'), + ('Solve the following math problem step by step.' 
+ 'The last line of your response should be of the form Answer: ' + '$Answer (without quotes) where $Answer is the answer to the problem.\n\n' + 'Let $a, b, c$ be distinct numbers such that the equations $x^2 + ax + 1 = 0$' + 'and $x^2 + bx + c = 0$ have a common real root, and the equations $x^2 + x + a = 0$' + 'and $x^2 + cx + b = 0$ also have a common real root.' + 'Compute the sum $a + b + c$.') + ] + vllm_aclgraph_qwen_answers = [ + ' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the', + " \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle formed by two random points on a square's perimeter is", + ' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can' + ] + + vllm_aclgraph_ds_answers = [ + '\n\nSelect an assignment template', + '\n\nSelect an assignment template', + '\n\nSelect an assignment template' + ] + + sampling_params = SamplingParams(max_tokens=max_tokens, + n=1, + temperature=0.0, + top_p=1.0, + top_k=1) + if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8": + with VllmRunner( + model, + max_model_len=1024, + enforce_eager=False, + compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"}, + additional_config={"enable_npugraph_ex": True}, + quantization="ascend", + ) as runner: + vllm_aclgraph_outputs = runner.model.generate( + prompts, sampling_params) + + else: + with VllmRunner( + model, + max_model_len=1024, + enforce_eager=False, + compilation_config={ + "cudagraph_capture_sizes": [4, 8, 32, 64], + "cudagraph_mode": "FULL_DECODE_ONLY" + }, + additional_config={"enable_npugraph_ex": True}, + ) as runner: + vllm_aclgraph_outputs = runner.model.generate( + prompts, sampling_params) + + vllm_aclgraph_outputs_list = [] + for output in vllm_aclgraph_outputs: + vllm_aclgraph_outputs_list.append( + ([output.outputs[0].index], output.outputs[0].text)) + 
vllm_eager_outputs_list = [] + vllm_eager_outputs_list = ([ + ([0], answer) for answer in vllm_aclgraph_ds_answers + ] if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8" else [ + ([0], answer) for answer in vllm_aclgraph_qwen_answers + ]) + + check_outputs_equal( + outputs_0_lst=vllm_eager_outputs_list, + outputs_1_lst=vllm_aclgraph_outputs_list, + name_0="vllm_eager_outputs", + name_1="vllm_aclgraph_outputs", + ) + + def test_aclgraph_enable(): # Generally, this test is not belong to e2e, but it is a good way to check if # aclgraph is enabled in real environment diff --git a/tests/ut/test_ascend_config.py b/tests/ut/test_ascend_config.py index 0367b45a..1a337dea 100644 --- a/tests/ut/test_ascend_config.py +++ b/tests/ut/test_ascend_config.py @@ -64,13 +64,13 @@ class TestAscendConfig(TestBase): @_clean_up_ascend_config def test_init_ascend_config_enable_npugraph_ex(self): - with self.assertRaises(NotImplementedError): - test_vllm_config = VllmConfig() - test_vllm_config.additional_config = { - "enable_npugraph_ex": True, - "refresh": True, - } - init_ascend_config(test_vllm_config) + test_vllm_config = VllmConfig() + test_vllm_config.additional_config = { + "enable_npugraph_ex": True, + "refresh": True, + } + ascend_config = init_ascend_config(test_vllm_config) + self.assertTrue(ascend_config.enable_npugraph_ex) @_clean_up_ascend_config def test_get_ascend_config(self): diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index e9b9b1f6..c6524a78 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -150,10 +150,6 @@ class AscendConfig: self, vllm_config) self.enable_npugraph_ex = additional_config.get( "enable_npugraph_ex", False) - if self.enable_npugraph_ex: - raise NotImplementedError( - "This feature is still in the experiment and will be supported soon." - ) # We find that _npu_paged_attention still performs better than # npu_fused_infer_attention_score in some cases. 
We allow to execute # _npu_paged_attention in this cases. This should be removed once diff --git a/vllm_ascend/compilation/compiler_interface.py b/vllm_ascend/compilation/compiler_interface.py index 4bb7deae..1c706806 100644 --- a/vllm_ascend/compilation/compiler_interface.py +++ b/vllm_ascend/compilation/compiler_interface.py @@ -89,13 +89,13 @@ def npugraph_ex_compile( tuple, args=([return_value], )) output_node.args = (tuple_node, ) - fx_graph.recompile() + graph.recompile() import torchair # TODO: use a better way to lazy register replacement, instead of import one by one # As an example, we directly import here to register replacement. - import vllm_ascend.compilation.npugraph_ex_passes.add_rms_norm_quant # noqa + # import vllm_ascend.compilation.npugraph_ex_passes.add_rms_norm_quant # noqa torch.npu.set_compile_mode(jit_compile=False) config = torchair.CompilerConfig()