enable npugraph_ex (#5120)
### What this PR does / why we need it?
This PR exposes the enabling switch for npugraph_ex (`enable_npugraph_ex`)
so that subsequent optimization work can build on it.
### Does this PR introduce _any_ user-facing change?
Previously, turning on the enable_npugraph_ex switch raised an error. This
PR removes that error so the switch can actually be enabled, which
facilitates subsequent optimization efforts.
Basic functionalities are available in CANN and torch_npu for Q3, while
advanced optimizations will depend on the Q4 release.
### How was this patch tested?
llm = LLM(
    model=model,
    enforce_eager=False,
    additional_config={
        "enable_npugraph_ex": True
    },
    compilation_config={
        "cudagraph_mode": "FULL_DECODE_ONLY",
        "cudagraph_capture_sizes": [16],
    },
)
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
Co-authored-by: weijinqian0 <1184188277@qq.com>
This commit is contained in:
@@ -194,6 +194,104 @@ def test_output_between_eager_and_full_decode_only(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
def test_output_between_eager_and_fullgraph_npugraph_ex(
    model: str,
    max_tokens: int,
) -> None:
    """Check FULL_DECODE_ONLY + npugraph_ex generations against references.

    The model is run with ``enforce_eager=False``, ``cudagraph_mode`` set to
    ``FULL_DECODE_ONLY`` and ``enable_npugraph_ex`` turned on, using greedy
    sampling (``temperature=0.0``, ``top_k=1``). The generated text for each
    prompt is then compared with a hard-coded reference answer list selected
    by model name.

    Args:
        model: Model identifier, parametrized over ``MODELS``.
        max_tokens: Maximum number of tokens to generate per prompt.
    """
    # Drop HCCL_OP_EXPANSION_MODE if it is set. The variable is not restored
    # afterwards, so the removal stays in effect for the rest of the process.
    # NOTE(review): presumably this setting conflicts with the graph mode under
    # test -- confirm the reason with the author.
    os.environ.pop('HCCL_OP_EXPANSION_MODE', None)

    # NOTE: Randomly fill the prompt with the requested amount for
    # the specified capture shape to prevent accuracy issues caused by padding
    prompts = [
        ('Solve the following math problem step by step.'
         'The last line of your response should be of the form Answer: '
         '$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
         'In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$'
         'be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$,'
         '$\\angle BDC = 90^\\circ$. Suppose $AD = 1$ and $\\frac{BD}{CD} = \\frac{3}{2}$.'
         'If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$,'
         'where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$.'
         ),
        ('Solve the following math problem step by step.'
         'The last line of your response should be of the form Answer: '
         '$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
         'Let $ABCD$ be a unit square in the plane. Points $X$ and $Y$ are chosen'
         'independently and uniformly at random on the perimeter of $ABCD$.'
         'If the expected value of the area of triangle $\\triangle AXY$'
         'can be expressed as $\\frac{m}{n}$, for relatively prime positive'
         'integers $m$ and $n$, compute $m+n$.'),
        ('Solve the following math problem step by step.'
         'The last line of your response should be of the form Answer: '
         '$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
         'Let $a, b, c$ be distinct numbers such that the equations $x^2 + ax + 1 = 0$'
         'and $x^2 + bx + c = 0$ have a common real root, and the equations $x^2 + x + a = 0$'
         'and $x^2 + cx + b = 0$ also have a common real root.'
         'Compute the sum $a + b + c$.')
    ]

    # Hard-coded reference completions, one per prompt, for the non-DeepSeek
    # models in MODELS.
    vllm_aclgraph_qwen_answers = [
        ' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
        " \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle formed by two random points on a square's perimeter is",
        ' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can'
    ]

    # Hard-coded reference completions for the DeepSeek-V2-Lite-W8A8 model.
    vllm_aclgraph_ds_answers = [
        '\n\nSelect an assignment template',
        '\n\nSelect an assignment template',
        '\n\nSelect an assignment template'
    ]

    # Greedy decoding so the output can be compared with fixed references.
    sampling_params = SamplingParams(max_tokens=max_tokens,
                                     n=1,
                                     temperature=0.0,
                                     top_p=1.0,
                                     top_k=1)

    is_deepseek = model == "vllm-ascend/DeepSeek-V2-Lite-W8A8"

    # Only the compilation config and the quantization flag differ between the
    # two model families; everything else is shared below.
    if is_deepseek:
        model_specific_kwargs = {
            "compilation_config": {"cudagraph_mode": "FULL_DECODE_ONLY"},
            "quantization": "ascend",
        }
    else:
        model_specific_kwargs = {
            "compilation_config": {
                "cudagraph_capture_sizes": [4, 8, 32, 64],
                "cudagraph_mode": "FULL_DECODE_ONLY"
            },
        }

    with VllmRunner(
            model,
            max_model_len=1024,
            enforce_eager=False,
            additional_config={"enable_npugraph_ex": True},
            **model_specific_kwargs,
    ) as runner:
        vllm_aclgraph_outputs = runner.model.generate(prompts,
                                                      sampling_params)

    # Reduce each request output to the (ids, text) pair that
    # check_outputs_equal is given for comparison.
    vllm_aclgraph_outputs_list = [
        ([output.outputs[0].index], output.outputs[0].text)
        for output in vllm_aclgraph_outputs
    ]

    # Despite the "eager" name, the expected side is built from the hard-coded
    # reference answers above, not from an eager-mode run.
    reference_answers = (vllm_aclgraph_ds_answers
                         if is_deepseek else vllm_aclgraph_qwen_answers)
    vllm_eager_outputs_list = [([0], answer) for answer in reference_answers]

    check_outputs_equal(
        outputs_0_lst=vllm_eager_outputs_list,
        outputs_1_lst=vllm_aclgraph_outputs_list,
        name_0="vllm_eager_outputs",
        name_1="vllm_aclgraph_outputs",
    )
|
||||
|
||||
|
||||
def test_aclgraph_enable():
|
||||
# Generally, this test does not belong to e2e, but it is a good way to check if
|
||||
# aclgraph is enabled in real environment
|
||||
|
||||
Reference in New Issue
Block a user