[Performance] Optimize DeepSeekOCR2 RelPosAttention and CustomQwen2Decoder (#7737)

### What this PR does / why we need it?

Optimize DeepSeekOCR2 RelPosAttention and CustomQwen2Decoder, and add the DeepSeekOCR2.md doc.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vllm 0.18.0
- vllm-ascend main

1. Mask creation: `_create_custom_4d_mask` took 141ms49us620ns --> `_create_npu_optimized_mask` takes 1ms227us780ns
2. conv2d: 27ms --> matmul: <1ms
3. RelPosAttention: sdpa --> prompt_flash_attention

---------

Signed-off-by: Wangbei25 <wangbei41@huawie.com>
Signed-off-by: Wangbei25 <wangbei41@huawei.com>
Co-authored-by: Wangbei25 <wangbei41@huawie.com>
2026-03-31 14:49:29 +08:00
parent 2a0a588311
commit 4f259d4fd8
4 changed files with 432 additions and 2 deletions
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -604,7 +604,7 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None):
    from vllm.model_executor.custom_op import CustomOp

    from vllm_ascend.ops.activation import AscendQuickGELU, AscendSiluAndMul
-    from vllm_ascend.ops.conv import AscendConv2dLayer, AscendConv3dLayer
+    from vllm_ascend.ops.conv import AscendConv3dLayer
    from vllm_ascend.ops.fused_moe.fused_moe import AscendFusedMoE, AscendSharedFusedMoE
    from vllm_ascend.ops.layernorm import AscendGemmaRMSNorm, AscendRMSNorm, AscendRMSNormGated
    from vllm_ascend.ops.linear import (
@@ -616,6 +616,7 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None):
    )
    from vllm_ascend.ops.mla import AscendMultiHeadLatentAttention
    from vllm_ascend.ops.mm_encoder_attention import AscendMMEncoderAttention
+    from vllm_ascend.ops.rel_pos_attention import AscendRelPosAttention
    from vllm_ascend.ops.rotary_embedding import (
        AscendApplyRotaryEmb,
        AscendDeepseekScalingRotaryEmbedding,
@@ -653,8 +654,8 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None):
        "MMEncoderAttention": AscendMMEncoderAttention,
        "ApplyRotaryEmb": AscendApplyRotaryEmb,
        "RMSNormGated": AscendRMSNormGated,
-        "Conv2dLayer": AscendConv2dLayer,
        "Conv3dLayer": AscendConv3dLayer,
+        "RelPosAttention": AscendRelPosAttention,
    }

    # 310P: override selected ops with 310P implementations (keep minimal changes outside _310p)