enable npugraph_ex (#5120)
### What this PR does / why we need it?
This PR exposes the `enable_npugraph_ex` switch so that npugraph_ex can
actually be turned on, which makes subsequent optimization work easier.
### Does this PR introduce _any_ user-facing change?
Previously, setting `enable_npugraph_ex` to true raised a
`NotImplementedError`; this PR removes that check so the option can be
enabled, which facilitates subsequent optimization efforts.
Basic functionality is available in the Q3 releases of CANN and torch_npu,
while advanced optimizations will depend on the Q4 release.
### How was this patch tested?
llm = LLM(
    model=model,
    enforce_eager=False,
    additional_config={
        "enable_npugraph_ex": True
    },
    compilation_config={
        "cudagraph_mode": "FULL_DECODE_ONLY",
        "cudagraph_capture_sizes": [16],
    },
)
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
Co-authored-by: weijinqian0 <1184188277@qq.com>
This commit is contained in:
@@ -194,6 +194,104 @@ def test_output_between_eager_and_full_decode_only(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
|
@pytest.mark.parametrize("max_tokens", [32])
|
||||||
|
def test_output_between_eager_and_fullgraph_npugraph_ex(
|
||||||
|
model: str,
|
||||||
|
max_tokens: int,
|
||||||
|
) -> None:
|
||||||
|
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
|
||||||
|
del os.environ['HCCL_OP_EXPANSION_MODE']
|
||||||
|
# NOTE: Randomly fill the prompt with the requested amount for
|
||||||
|
# the specified capture shape to prevent accuracy issues caused by padding
|
||||||
|
prompts = [
|
||||||
|
('Solve the following math problem step by step.'
|
||||||
|
'The last line of your response should be of the form Answer: '
|
||||||
|
'$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
|
||||||
|
'In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$'
|
||||||
|
'be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$,'
|
||||||
|
'$\\angle BDC = 90^\\circ$. Suppose $AD = 1$ and $\\frac{BD}{CD} = \\frac{3}{2}$.'
|
||||||
|
'If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$,'
|
||||||
|
'where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$.'
|
||||||
|
),
|
||||||
|
('Solve the following math problem step by step.'
|
||||||
|
'The last line of your response should be of the form Answer: '
|
||||||
|
'$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
|
||||||
|
'Let $ABCD$ be a unit square in the plane. Points $X$ and $Y$ are chosen'
|
||||||
|
'independently and uniformly at random on the perimeter of $ABCD$.'
|
||||||
|
'If the expected value of the area of triangle $\\triangle AXY$'
|
||||||
|
'can be expressed as $\\frac{m}{n}$, for relatively prime positive'
|
||||||
|
'integers $m$ and $n$, compute $m+n$.'),
|
||||||
|
('Solve the following math problem step by step.'
|
||||||
|
'The last line of your response should be of the form Answer: '
|
||||||
|
'$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
|
||||||
|
'Let $a, b, c$ be distinct numbers such that the equations $x^2 + ax + 1 = 0$'
|
||||||
|
'and $x^2 + bx + c = 0$ have a common real root, and the equations $x^2 + x + a = 0$'
|
||||||
|
'and $x^2 + cx + b = 0$ also have a common real root.'
|
||||||
|
'Compute the sum $a + b + c$.')
|
||||||
|
]
|
||||||
|
vllm_aclgraph_qwen_answers = [
|
||||||
|
' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
|
||||||
|
" \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle formed by two random points on a square's perimeter is",
|
||||||
|
' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can'
|
||||||
|
]
|
||||||
|
|
||||||
|
vllm_aclgraph_ds_answers = [
|
||||||
|
'\n\nSelect an assignment template',
|
||||||
|
'\n\nSelect an assignment template',
|
||||||
|
'\n\nSelect an assignment template'
|
||||||
|
]
|
||||||
|
|
||||||
|
sampling_params = SamplingParams(max_tokens=max_tokens,
|
||||||
|
n=1,
|
||||||
|
temperature=0.0,
|
||||||
|
top_p=1.0,
|
||||||
|
top_k=1)
|
||||||
|
if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
|
||||||
|
with VllmRunner(
|
||||||
|
model,
|
||||||
|
max_model_len=1024,
|
||||||
|
enforce_eager=False,
|
||||||
|
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
|
||||||
|
additional_config={"enable_npugraph_ex": True},
|
||||||
|
quantization="ascend",
|
||||||
|
) as runner:
|
||||||
|
vllm_aclgraph_outputs = runner.model.generate(
|
||||||
|
prompts, sampling_params)
|
||||||
|
|
||||||
|
else:
|
||||||
|
with VllmRunner(
|
||||||
|
model,
|
||||||
|
max_model_len=1024,
|
||||||
|
enforce_eager=False,
|
||||||
|
compilation_config={
|
||||||
|
"cudagraph_capture_sizes": [4, 8, 32, 64],
|
||||||
|
"cudagraph_mode": "FULL_DECODE_ONLY"
|
||||||
|
},
|
||||||
|
additional_config={"enable_npugraph_ex": True},
|
||||||
|
) as runner:
|
||||||
|
vllm_aclgraph_outputs = runner.model.generate(
|
||||||
|
prompts, sampling_params)
|
||||||
|
|
||||||
|
vllm_aclgraph_outputs_list = []
|
||||||
|
for output in vllm_aclgraph_outputs:
|
||||||
|
vllm_aclgraph_outputs_list.append(
|
||||||
|
([output.outputs[0].index], output.outputs[0].text))
|
||||||
|
vllm_eager_outputs_list = []
|
||||||
|
vllm_eager_outputs_list = ([
|
||||||
|
([0], answer) for answer in vllm_aclgraph_ds_answers
|
||||||
|
] if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8" else [
|
||||||
|
([0], answer) for answer in vllm_aclgraph_qwen_answers
|
||||||
|
])
|
||||||
|
|
||||||
|
check_outputs_equal(
|
||||||
|
outputs_0_lst=vllm_eager_outputs_list,
|
||||||
|
outputs_1_lst=vllm_aclgraph_outputs_list,
|
||||||
|
name_0="vllm_eager_outputs",
|
||||||
|
name_1="vllm_aclgraph_outputs",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_aclgraph_enable():
|
def test_aclgraph_enable():
|
||||||
# Generally, this test is not belong to e2e, but it is a good way to check if
|
# Generally, this test is not belong to e2e, but it is a good way to check if
|
||||||
# aclgraph is enabled in real environment
|
# aclgraph is enabled in real environment
|
||||||
|
|||||||
@@ -64,13 +64,13 @@ class TestAscendConfig(TestBase):
|
|||||||
|
|
||||||
@_clean_up_ascend_config
|
@_clean_up_ascend_config
|
||||||
def test_init_ascend_config_enable_npugraph_ex(self):
|
def test_init_ascend_config_enable_npugraph_ex(self):
|
||||||
with self.assertRaises(NotImplementedError):
|
test_vllm_config = VllmConfig()
|
||||||
test_vllm_config = VllmConfig()
|
test_vllm_config.additional_config = {
|
||||||
test_vllm_config.additional_config = {
|
"enable_npugraph_ex": True,
|
||||||
"enable_npugraph_ex": True,
|
"refresh": True,
|
||||||
"refresh": True,
|
}
|
||||||
}
|
ascend_config = init_ascend_config(test_vllm_config)
|
||||||
init_ascend_config(test_vllm_config)
|
self.assertTrue(ascend_config.enable_npugraph_ex)
|
||||||
|
|
||||||
@_clean_up_ascend_config
|
@_clean_up_ascend_config
|
||||||
def test_get_ascend_config(self):
|
def test_get_ascend_config(self):
|
||||||
|
|||||||
@@ -150,10 +150,6 @@ class AscendConfig:
|
|||||||
self, vllm_config)
|
self, vllm_config)
|
||||||
self.enable_npugraph_ex = additional_config.get(
|
self.enable_npugraph_ex = additional_config.get(
|
||||||
"enable_npugraph_ex", False)
|
"enable_npugraph_ex", False)
|
||||||
if self.enable_npugraph_ex:
|
|
||||||
raise NotImplementedError(
|
|
||||||
"This feature is still in the experiment and will be supported soon."
|
|
||||||
)
|
|
||||||
# We find that _npu_paged_attention still performs better than
|
# We find that _npu_paged_attention still performs better than
|
||||||
# npu_fused_infer_attention_score in some cases. We allow to execute
|
# npu_fused_infer_attention_score in some cases. We allow to execute
|
||||||
# _npu_paged_attention in this cases. This should be removed once
|
# _npu_paged_attention in this cases. This should be removed once
|
||||||
|
|||||||
@@ -89,13 +89,13 @@ def npugraph_ex_compile(
|
|||||||
tuple,
|
tuple,
|
||||||
args=([return_value], ))
|
args=([return_value], ))
|
||||||
output_node.args = (tuple_node, )
|
output_node.args = (tuple_node, )
|
||||||
fx_graph.recompile()
|
graph.recompile()
|
||||||
|
|
||||||
import torchair
|
import torchair
|
||||||
|
|
||||||
# TODO: use a better way to lazy register replacement, instead of import one by one
|
# TODO: use a better way to lazy register replacement, instead of import one by one
|
||||||
# As an example, we directly import here to register replacement.
|
# As an example, we directly import here to register replacement.
|
||||||
import vllm_ascend.compilation.npugraph_ex_passes.add_rms_norm_quant # noqa
|
# import vllm_ascend.compilation.npugraph_ex_passes.add_rms_norm_quant # noqa
|
||||||
|
|
||||||
torch.npu.set_compile_mode(jit_compile=False)
|
torch.npu.set_compile_mode(jit_compile=False)
|
||||||
config = torchair.CompilerConfig()
|
config = torchair.CompilerConfig()
|
||||||
|
|||||||
Reference in New Issue
Block a user