[Performance]Optimize DeepSeekOCR2 RelPosAttention and CustomQwen2Decoder (#7737)

### What this PR does / why we need it?
Optimize DeepSeekOCR2 RelPosAttention and CustomQwen2Decoder, and add
documentation for DeepSeekOCR2 (DeepSeekOCR2.md).

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
- vllm 0.18.0
- vllm-ascend main

1. Mask creation: `_create_custom_4d_mask` took 141ms49us620ns -->
`_create_npu_optimized_mask` takes 1ms227us780ns
2. conv2d: 27ms --> matmul: <1ms
3. RelPosAttention: sdpa --> prompt_flash_attention

---------

Signed-off-by: Wangbei25 <wangbei41@huawie.com>
Signed-off-by: Wangbei25 <wangbei41@huawei.com>
Co-authored-by: Wangbei25 <wangbei41@huawie.com>
This commit is contained in:
Wangbei25
2026-03-31 14:49:29 +08:00
committed by GitHub
parent 2a0a588311
commit 4f259d4fd8
4 changed files with 432 additions and 2 deletions

View File

@@ -604,7 +604,7 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None):
from vllm.model_executor.custom_op import CustomOp
from vllm_ascend.ops.activation import AscendQuickGELU, AscendSiluAndMul
from vllm_ascend.ops.conv import AscendConv2dLayer, AscendConv3dLayer
from vllm_ascend.ops.conv import AscendConv3dLayer
from vllm_ascend.ops.fused_moe.fused_moe import AscendFusedMoE, AscendSharedFusedMoE
from vllm_ascend.ops.layernorm import AscendGemmaRMSNorm, AscendRMSNorm, AscendRMSNormGated
from vllm_ascend.ops.linear import (
@@ -616,6 +616,7 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None):
)
from vllm_ascend.ops.mla import AscendMultiHeadLatentAttention
from vllm_ascend.ops.mm_encoder_attention import AscendMMEncoderAttention
from vllm_ascend.ops.rel_pos_attention import AscendRelPosAttention
from vllm_ascend.ops.rotary_embedding import (
AscendApplyRotaryEmb,
AscendDeepseekScalingRotaryEmbedding,
@@ -653,8 +654,8 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None):
"MMEncoderAttention": AscendMMEncoderAttention,
"ApplyRotaryEmb": AscendApplyRotaryEmb,
"RMSNormGated": AscendRMSNormGated,
"Conv2dLayer": AscendConv2dLayer,
"Conv3dLayer": AscendConv3dLayer,
"RelPosAttention": AscendRelPosAttention,
}
# 310P: override selected ops with 310P implementations (keep minimal changes outside _310p)