[Fusion] [Graph] Add qknorm rope fusion operator (#4711)
### What this PR does / why we need it?
This PR adds a `qkv_rmsnorm_rope` operator and introduces a graph fusion
pass for `qknorm_rope` operations. The implementation includes a new
configuration flag, a pattern-matching pass built on
`torch._inductor.pattern_matcher`, and a custom Triton kernel for the
fused operation (a reference sketch of the fused computation is shown below).
Co-authored-by: Angazenn <supperccell@163.com>
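For orientation, here is a minimal unfused PyTorch reference of the computation being fused: per-head RMSNorm on the query/key projections followed by rotary position embedding. The shapes, the rotate-half convention, and the function names below are illustrative assumptions, not the exact contract of the fused Triton kernel in this PR.

```python
import torch


def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # RMSNorm over the last (head_dim) axis, computed in fp32 for stability.
    variance = x.float().pow(2).mean(dim=-1, keepdim=True)
    return (x.float() * torch.rsqrt(variance + eps)).type_as(x) * weight


def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # Standard "rotate half" used by non-interleaved RoPE.
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


def qknorm_rope_reference(q, k, q_weight, k_weight, cos, sin):
    """Unfused reference: RMSNorm(q), RMSNorm(k), then apply RoPE to both."""
    q, k = rms_norm(q, q_weight), rms_norm(k, k_weight)
    q = q * cos + rotate_half(q) * sin
    k = k * cos + rotate_half(k) * sin
    return q, k


# Example shapes: [num_tokens, num_heads, head_dim]
q = torch.randn(4, 8, 128)
k = torch.randn(4, 2, 128)
cos, sin = torch.randn(4, 1, 128), torch.randn(4, 1, 128)
q_out, k_out = qknorm_rope_reference(q, k, torch.ones(128), torch.ones(128), cos, sin)
```

The pass registered through `torch._inductor.pattern_matcher` looks for this kind of unfused subgraph in the compiled graph and rewrites it into a single call to the fused kernel, avoiding the intermediate reads and writes of q and k.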
### Does this PR introduce _any_ user-facing change?
Yes, this PR adds a new `additional_config` option (see the usage sketch below).
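A hedged usage sketch, assuming the option is passed through the existing `additional_config` engine argument; the key name `enable_qknorm_rope_fusion` is a placeholder, so check the PR's config code for the actual name.

```python
from vllm import LLM

# Hypothetical key name; the exact flag introduced by this PR may differ.
llm = LLM(
    model="Qwen/Qwen3-8B",
    additional_config={"enable_qknorm_rope_fusion": True},
)
```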
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: wxsIcey <1790571317@qq.com>
@@ -84,12 +84,6 @@ from vllm.v1.worker.utils import AttentionGroup
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config
-from vllm_ascend.ascend_forward_context import (MoECommType,
-                                                get_mc2_tokens_capacity,
-                                                select_moe_comm_method,
-                                                set_ascend_forward_context,
-                                                set_cos_and_sin, set_mc2_mask,
-                                                set_mc2_tokens_capacity)
 from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
@@ -111,6 +105,7 @@ from vllm_ascend.eplb.core.eplb_utils import EPLBParamUtils
 from vllm_ascend.eplb.core.eplb_worker import EplbProcess
 from vllm_ascend.eplb.eplb_updator import EplbUpdator
 from vllm_ascend.eplb.utils import model_register
+from vllm_ascend.ops.rotary_embedding import set_cos_and_sin, update_cos_sin
 from vllm_ascend.ops.weight_prefetch import WeightPrefetchMethod
 from vllm_ascend.patch.worker.patch_module import patch_torch_npu_argsort
 from vllm_ascend.sample.logits_processor import build_logitsprocs
@@ -125,6 +120,10 @@ from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
                                is_moe_model, lmhead_tp_enable, vllm_version_is)
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch
+
+from vllm_ascend.ascend_forward_context import ( # isort: skip
+    MoECommType, get_mc2_tokens_capacity, select_moe_comm_method,
+    set_ascend_forward_context, set_mc2_mask, set_mc2_tokens_capacity)
 
 if TYPE_CHECKING:
     import xgrammar as xgr # type: ignore[import-untyped]
     from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
@@ -1122,6 +1121,9 @@ class NPUModelRunner(GPUModelRunner):
                 for layer_name in attn_group.layer_names:
                     attn_metadata[layer_name] = attn_metadata_i
 
+        # update global cos, sin
+        update_cos_sin(positions)
+
         if lmhead_tp_enable():
             max_num_reqs_across_dp = self.max_num_reqs * self.uniform_decode_query_len
             logits_indices = nn.functional.pad(
@@ -2084,6 +2086,9 @@ class NPUModelRunner(GPUModelRunner):
         else:
             positions = self.positions.gpu[:num_tokens_padded]
 
+        # update global cos, sin
+        update_cos_sin(positions)
+
         if get_pp_group().is_first_rank:
             intermediate_tensors = None
         else:
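The hunks above import `set_cos_and_sin`/`update_cos_sin` from `vllm_ascend.ops.rotary_embedding` instead of `vllm_ascend.ascend_forward_context`, and refresh the global cos/sin tables from the current batch's position ids before each forward pass so downstream kernels can read them directly. A minimal sketch of that caching idea, with illustrative names and shapes that do not mirror the actual vllm_ascend implementation:

```python
from typing import Optional

import torch

_COS_CACHE: Optional[torch.Tensor] = None
_SIN_CACHE: Optional[torch.Tensor] = None


def set_cos_and_sin_tables(max_positions: int, head_dim: int, base: float = 10000.0) -> None:
    """Precompute full-length rotary cos/sin tables once at startup."""
    global _COS_CACHE, _SIN_CACHE
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
    freqs = torch.outer(torch.arange(max_positions).float(), inv_freq)
    emb = torch.cat((freqs, freqs), dim=-1)  # [max_positions, head_dim]
    _COS_CACHE, _SIN_CACHE = emb.cos(), emb.sin()


def update_cos_sin_for_step(positions: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    """Gather per-token cos/sin rows for the current batch's position ids."""
    return _COS_CACHE[positions], _SIN_CACHE[positions]


set_cos_and_sin_tables(max_positions=4096, head_dim=128)
cos, sin = update_cos_sin_for_step(torch.tensor([0, 1, 2, 7]))
```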