### What this PR does / why we need it?
This PR optimizes the `split_qkv_rmsnorm_rope` operator by introducing a
new Triton kernel, `split_qkv_rmsnorm_rope_prefill_kernel`, for the
prefill stage (i.e., large batch sizes). The implementation now
dynamically selects between the existing decode kernel and the new
prefill kernel based on the batch size, which improves performance for
large batch scenarios.
Additionally, the RoPE implementation is updated to support partial
rotation dimensions (`rope_dim`), making the operator more flexible.
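For illustration, here is a minimal sketch of the selection rule described above (the dispatch function and core-count names are hypothetical placeholders, not the actual vllm-ascend symbols; only `split_qkv_rmsnorm_rope_prefill_kernel` is the real new kernel name):

```python
# Hypothetical sketch of the dispatch rule, not the actual vllm-ascend code:
# the prefill kernel is chosen when the batch size exceeds the number of
# available vector cores, as described in this PR.
def select_split_qkv_kernel(batch_size: int, num_vector_cores: int) -> str:
    if batch_size > num_vector_cores:
        # Prefill / large batch: use the new kernel introduced by this PR.
        return "split_qkv_rmsnorm_rope_prefill_kernel"
    # Decode / small batch: keep the existing decode kernel.
    return "decode_kernel"


# Example with an assumed 48 vector cores: a 512-row prefill batch takes the
# new kernel, while an 8-request decode step stays on the existing one.
assert select_split_qkv_kernel(512, 48) == "split_qkv_rmsnorm_rope_prefill_kernel"
assert select_split_qkv_kernel(8, 48) == "decode_kernel"
```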
### Does this PR introduce _any_ user-facing change?
No. This is a performance optimization and is not expected to introduce
any user-facing changes.
### How was this patch tested?
CI should pass with existing tests. The new prefill path is triggered
when the batch size is larger than the number of available vector cores.
The partial RoPE feature can be tested by passing the `rope_dim`
argument.
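To make the partial-rotation behavior concrete, below is a generic PyTorch sketch (an assumption for illustration, not the vllm-ascend kernel itself; the real op may use a different rotation layout): only the first `rope_dim` channels of each head are rotated, and the remaining channels pass through unchanged.

```python
import torch


def apply_partial_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor,
                       rope_dim: int) -> torch.Tensor:
    """Rotate only the first `rope_dim` channels of each head (generic sketch).

    x:   [..., head_dim] query or key tensor
    cos: [..., rope_dim // 2] precomputed cos table for the rotated part
    sin: [..., rope_dim // 2] precomputed sin table for the rotated part
    """
    rot, pass_through = x[..., :rope_dim], x[..., rope_dim:]
    x1, x2 = rot[..., : rope_dim // 2], rot[..., rope_dim // 2:]
    rotated = torch.cat((x1 * cos - x2 * sin, x2 * cos + x1 * sin), dim=-1)
    # Channels beyond rope_dim are left untouched.
    return torch.cat((rotated, pass_through), dim=-1)
```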
- vLLM version: v0.15.0
- vLLM main: 83b47f67b1
---------
Signed-off-by: guzhiyong <guzhiyong5@h-partners.com>
Signed-off-by: frank <2547457096@qq.com>
Co-authored-by: guzhiyong <guzhiyong5@h-partners.com>
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# ruff: noqa: E501

import os

import pytest

from tests.e2e.singlecard.utils import PROMPTS_LONG, PROMPTS_SHORT, LLMTestCase, gen_and_valid

CASE_QWEN_ACLGRAPH = LLMTestCase(
    model="Qwen/Qwen3-0.6B",
    prompts=PROMPTS_SHORT,
    golden_answers=[
        " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
        " the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president",
        " Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of",
        " not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and",
    ],
)

CASE_DS_ACLGRAPH = LLMTestCase(
    model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
    quantization="ascend",
    prompts=PROMPTS_SHORT,
    golden_answers=[
        "\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2",
        " a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the",
        " Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art",
        " here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of",
    ],
)

CASE_QWEN_FULL = LLMTestCase(
    model="Qwen/Qwen3-0.6B",
    prompts=PROMPTS_SHORT,
    golden_answers=[
        " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
        " the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president",
        " Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of",
        " not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and",
    ],
)

CASE_DS_FULL = LLMTestCase(
    model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
    quantization="ascend",
    prompts=PROMPTS_SHORT,
    golden_answers=[
        "\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2",
        " a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the",
        " Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art",
        " here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of",
    ],
)

CASE_QWEN_FULL_DECODE_ONLY = LLMTestCase(
    model="Qwen/Qwen3-0.6B",
    prompts=PROMPTS_LONG,
    golden_answers=[
        " \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the",
        " \n\nTo solve this problem, we can use the following approach: Let $P$ be the perimeter of the square. Then, the expected value of the area",
        " \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can",
    ],
)

CASE_DS_FULL_DECODE_ONLY = LLMTestCase(
    model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
    quantization="ascend",
    prompts=PROMPTS_LONG,
    golden_answers=[
        "\n\nSelect an assignment template",
        "\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use",
        "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations",
    ],
)

CASE_QWEN_EX = LLMTestCase(
    model="Qwen/Qwen3-0.6B",
    prompts=PROMPTS_LONG,
    golden_answers=[
        " \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the",
        " \n\nTo solve this problem, we can use the following approach: Let $P$ be the perimeter of the square. Then, the expected value of the area",
        " \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can",
    ],
)

CASE_DS_EX = LLMTestCase(
    model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
    quantization="ascend",
    prompts=PROMPTS_LONG,
    golden_answers=[
        "\n\nSelect an assignment template",
        "\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use",
        "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations",
    ],
)


@pytest.mark.parametrize("cur_case", [CASE_QWEN_ACLGRAPH, CASE_DS_ACLGRAPH])
def test_piecewise_res_consistency(cur_case: LLMTestCase):
    runner_kwargs = {
        "model_name": cur_case.model,
        "max_model_len": 1024,
        "cudagraph_capture_sizes": [1, 2, 4, 8],
        "quantization": cur_case.quantization,
    }
    gen_and_valid(
        runner_kwargs=runner_kwargs,
        prompts=cur_case.prompts,
        sampling_params=cur_case.sampling_params,
        golden_answers=cur_case.golden_answers,
    )


@pytest.mark.parametrize("cur_case", [CASE_QWEN_FULL, CASE_DS_FULL])
def test_full_res_consistency(cur_case: LLMTestCase, monkeypatch):
    monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
    runner_kwargs = {
        "model_name": cur_case.model,
        "max_model_len": 1024,
        "compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"},
        "quantization": cur_case.quantization,
    }
    gen_and_valid(
        runner_kwargs=runner_kwargs,
        prompts=cur_case.prompts,
        sampling_params=cur_case.sampling_params,
        golden_answers=cur_case.golden_answers,
    )


@pytest.mark.parametrize("cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY])
def test_full_decode_only_res_consistency(cur_case: LLMTestCase, monkeypatch):
    monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
    runner_kwargs = {
        "model_name": cur_case.model,
        "max_model_len": 1024,
        "compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"},
        "quantization": cur_case.quantization,
        "additional_config": {"ascend_compilation_config": {"enable_npugraph_ex": False}},
    }
    gen_and_valid(
        runner_kwargs=runner_kwargs,
        prompts=cur_case.prompts,
        sampling_params=cur_case.sampling_params,
        golden_answers=cur_case.golden_answers,
    )


@pytest.mark.parametrize("cur_case", [CASE_QWEN_EX, CASE_DS_EX])
def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch):
    monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
    runner_kwargs = {
        "model_name": cur_case.model,
        "quantization": cur_case.quantization,
        "max_model_len": 1024,
        "compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"},
        "additional_config": {"ascend_compilation_config": {"enable_npugraph_ex": True}},
    }
    gen_and_valid(
        runner_kwargs=runner_kwargs,
        prompts=cur_case.prompts,
        sampling_params=cur_case.sampling_params,
        golden_answers=cur_case.golden_answers,
    )


# The accuracy has already been verified in the previous test case.
# This test case is used to check whether the functionality works properly
# after enabling the static kernel and whether it is uninstalled as expected.
@pytest.mark.parametrize("cur_case", [CASE_QWEN_EX])
def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch):
    monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
    runner_kwargs = {
        "model_name": cur_case.model,
        "quantization": cur_case.quantization,
        "max_model_len": 1024,
        "compilation_config": {"cudagraph_capture_sizes": [4, 8], "cudagraph_mode": "FULL_DECODE_ONLY"},
        "additional_config": {
            "ascend_compilation_config": {
                "enable_npugraph_ex": True,
                "enable_static_kernel": True,
            }
        },
    }
    gen_and_valid(
        runner_kwargs=runner_kwargs,
        prompts=cur_case.prompts,
        sampling_params=cur_case.sampling_params,
        golden_answers=cur_case.golden_answers,
    )

    # Check whether the static kernel is properly uninstalled
    ascend_home_path = os.environ["ASCEND_HOME_PATH"]
    static_kernel_install_path = os.path.join(ascend_home_path, "opp/static_kernel/ai_core")
    assert not os.path.exists(static_kernel_install_path)