[Performance] Optimize DeepSeekOCR2 RelPosAttention and CustomQwen2Decoder (#7737)
### What this PR does / why we need it?
Optimize DeepSeekOCR2 RelPosAttention and CustomQwen2Decoder, and add documentation in DeepSeekOCR2.md.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
- vllm 0.18.0
- vllm-ascend main

1. Mask creation: `_create_custom_4d_mask` took 141ms49us620ns --> `_create_npu_optimized_mask` takes 1ms227us780ns
2. conv2d: 27ms --> matmul: <1ms
3. RelPosAttention: sdpa --> prompt_flash_attention

---------

Signed-off-by: Wangbei25 <wangbei41@huawie.com>
Signed-off-by: Wangbei25 <wangbei41@huawei.com>
Co-authored-by: Wangbei25 <wangbei41@huawie.com>
This commit is contained in:
@@ -604,7 +604,7 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None):
|
||||
from vllm.model_executor.custom_op import CustomOp
|
||||
|
||||
from vllm_ascend.ops.activation import AscendQuickGELU, AscendSiluAndMul
|
||||
from vllm_ascend.ops.conv import AscendConv2dLayer, AscendConv3dLayer
|
||||
from vllm_ascend.ops.conv import AscendConv3dLayer
|
||||
from vllm_ascend.ops.fused_moe.fused_moe import AscendFusedMoE, AscendSharedFusedMoE
|
||||
from vllm_ascend.ops.layernorm import AscendGemmaRMSNorm, AscendRMSNorm, AscendRMSNormGated
|
||||
from vllm_ascend.ops.linear import (
|
||||
@@ -616,6 +616,7 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None):
|
||||
)
|
||||
from vllm_ascend.ops.mla import AscendMultiHeadLatentAttention
|
||||
from vllm_ascend.ops.mm_encoder_attention import AscendMMEncoderAttention
|
||||
from vllm_ascend.ops.rel_pos_attention import AscendRelPosAttention
|
||||
from vllm_ascend.ops.rotary_embedding import (
|
||||
AscendApplyRotaryEmb,
|
||||
AscendDeepseekScalingRotaryEmbedding,
|
||||
@@ -653,8 +654,8 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None):
|
||||
"MMEncoderAttention": AscendMMEncoderAttention,
|
||||
"ApplyRotaryEmb": AscendApplyRotaryEmb,
|
||||
"RMSNormGated": AscendRMSNormGated,
|
||||
"Conv2dLayer": AscendConv2dLayer,
|
||||
"Conv3dLayer": AscendConv3dLayer,
|
||||
"RelPosAttention": AscendRelPosAttention,
|
||||
}
|
||||
|
||||
# 310P: override selected ops with 310P implementations (keep minimal changes outside _310p)
|
||||
|
||||
Reference in New Issue
Block a user