enable npugraph_ex (#5120)

### What this PR does / why we need it?
We will expose the enabling switch for npugraph_ex to better facilitate
subsequent optimization.

### Does this PR introduce _any_ user-facing change?
Previously, the enable_npugraph_ex switch would trigger an error; now we
have removed the error reporting mechanism to better facilitate
subsequent optimization efforts.
Basic functionalities are available in CANN and torch_npu for Q3, while
advanced optimizations will depend on the Q4 release.

### How was this patch tested?
llm = LLM(
    model=model,
    enforce_eager=False,
    additional_config={
        "enable_npugraph_ex": True
    },
    compilation_config={
        "cudagraph_mode": "FULL_DECODE_ONLY",
        "cudagraph_capture_sizes": [16],
    },
)


- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c

---------

Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
Co-authored-by: weijinqian0 <1184188277@qq.com>
This commit is contained in:
panchao-hub
2025-12-18 09:08:40 +08:00
committed by GitHub
parent 39bdd4cfaa
commit 8069442b41
4 changed files with 107 additions and 13 deletions

View File

@@ -194,6 +194,104 @@ def test_output_between_eager_and_full_decode_only(
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
def test_output_between_eager_and_fullgraph_npugraph_ex(
    model: str,
    max_tokens: int,
) -> None:
    """Check npugraph_ex full-graph decode output against recorded answers.

    Runs *model* with ``enable_npugraph_ex`` turned on in FULL_DECODE_ONLY
    cudagraph mode and compares the generated text with pre-recorded
    expected answers (one answer set for the quantized DeepSeek model, one
    for the Qwen models).

    Args:
        model: model identifier supplied by the ``MODELS`` parametrization.
        max_tokens: generation budget per prompt (parametrized, 32).
    """
    # HCCL_OP_EXPANSION_MODE can change collective-op behaviour between
    # runs; drop it so results are reproducible. pop() with a default
    # replaces the original membership-test-then-delete dance.
    os.environ.pop('HCCL_OP_EXPANSION_MODE', None)
    # NOTE: Randomly fill the prompt with the requested amount for
    # the specified capture shape to prevent accuracy issues caused by padding
    prompts = [
        ('Solve the following math problem step by step.'
         'The last line of your response should be of the form Answer: '
         '$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
         'In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$'
         'be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$,'
         '$\\angle BDC = 90^\\circ$. Suppose $AD = 1$ and $\\frac{BD}{CD} = \\frac{3}{2}$.'
         'If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$,'
         'where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$.'
         ),
        ('Solve the following math problem step by step.'
         'The last line of your response should be of the form Answer: '
         '$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
         'Let $ABCD$ be a unit square in the plane. Points $X$ and $Y$ are chosen'
         'independently and uniformly at random on the perimeter of $ABCD$.'
         'If the expected value of the area of triangle $\\triangle AXY$'
         'can be expressed as $\\frac{m}{n}$, for relatively prime positive'
         'integers $m$ and $n$, compute $m+n$.'),
        ('Solve the following math problem step by step.'
         'The last line of your response should be of the form Answer: '
         '$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
         'Let $a, b, c$ be distinct numbers such that the equations $x^2 + ax + 1 = 0$'
         'and $x^2 + bx + c = 0$ have a common real root, and the equations $x^2 + x + a = 0$'
         'and $x^2 + cx + b = 0$ also have a common real root.'
         'Compute the sum $a + b + c$.')
    ]
    # Golden answers captured from a known-good eager run, per model family.
    vllm_aclgraph_qwen_answers = [
        ' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
        " \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle formed by two random points on a square's perimeter is",
        ' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can'
    ]
    vllm_aclgraph_ds_answers = [
        '\n\nSelect an assignment template',
        '\n\nSelect an assignment template',
        '\n\nSelect an assignment template'
    ]
    # Greedy decoding (temperature 0, top_k 1) keeps outputs deterministic.
    sampling_params = SamplingParams(max_tokens=max_tokens,
                                     n=1,
                                     temperature=0.0,
                                     top_p=1.0,
                                     top_k=1)
    # Only the runner kwargs differ between the quantized DeepSeek model and
    # the default (Qwen) path; the runner setup and generation are shared.
    if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
        extra_runner_kwargs = {
            "compilation_config": {"cudagraph_mode": "FULL_DECODE_ONLY"},
            "quantization": "ascend",
        }
    else:
        extra_runner_kwargs = {
            "compilation_config": {
                "cudagraph_capture_sizes": [4, 8, 32, 64],
                "cudagraph_mode": "FULL_DECODE_ONLY"
            },
        }
    with VllmRunner(
            model,
            max_model_len=1024,
            enforce_eager=False,
            additional_config={"enable_npugraph_ex": True},
            **extra_runner_kwargs,
    ) as runner:
        vllm_aclgraph_outputs = runner.model.generate(prompts, sampling_params)
    vllm_aclgraph_outputs_list = [
        ([output.outputs[0].index], output.outputs[0].text)
        for output in vllm_aclgraph_outputs
    ]
    # Fixed: the original assigned an empty list that was immediately
    # overwritten (dead store); build the expected list in one expression.
    vllm_eager_outputs_list = ([
        ([0], answer) for answer in vllm_aclgraph_ds_answers
    ] if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8" else [
        ([0], answer) for answer in vllm_aclgraph_qwen_answers
    ])
    check_outputs_equal(
        outputs_0_lst=vllm_eager_outputs_list,
        outputs_1_lst=vllm_aclgraph_outputs_list,
        name_0="vllm_eager_outputs",
        name_1="vllm_aclgraph_outputs",
    )
def test_aclgraph_enable():
# Generally, this test does not belong in e2e, but it is a good way to check
# whether aclgraph is enabled in a real environment

View File

@@ -64,13 +64,13 @@ class TestAscendConfig(TestBase):
@_clean_up_ascend_config
def test_init_ascend_config_enable_npugraph_ex(self):
with self.assertRaises(NotImplementedError):
test_vllm_config = VllmConfig()
test_vllm_config.additional_config = {
"enable_npugraph_ex": True,
"refresh": True,
}
init_ascend_config(test_vllm_config)
ascend_config = init_ascend_config(test_vllm_config)
self.assertTrue(ascend_config.enable_npugraph_ex)
@_clean_up_ascend_config
def test_get_ascend_config(self):

View File

@@ -150,10 +150,6 @@ class AscendConfig:
self, vllm_config)
self.enable_npugraph_ex = additional_config.get(
"enable_npugraph_ex", False)
if self.enable_npugraph_ex:
raise NotImplementedError(
"This feature is still in the experiment and will be supported soon."
)
# We find that _npu_paged_attention still performs better than
# npu_fused_infer_attention_score in some cases. We allow to execute
# _npu_paged_attention in this cases. This should be removed once

View File

@@ -89,13 +89,13 @@ def npugraph_ex_compile(
tuple,
args=([return_value], ))
output_node.args = (tuple_node, )
fx_graph.recompile()
graph.recompile()
import torchair
# TODO: use a better way to lazy register replacement, instead of import one by one
# As an example, we directly import here to register replacement.
import vllm_ascend.compilation.npugraph_ex_passes.add_rms_norm_quant # noqa
# import vllm_ascend.compilation.npugraph_ex_passes.add_rms_norm_quant # noqa
torch.npu.set_compile_mode(jit_compile=False)
config = torchair.CompilerConfig()