add mlp tp optimize (#2120)

### What this PR does / why we need it?
For dense models, skipping tensor parallelism (TP) in the attention module and
applying TP only to the MLP module eliminates the allreduce operations in the
attention module, thereby reducing communication overhead. However, this
approach increases memory usage, so the environment variable
VLLM_ASCEND_ENABLE_MLP_OPTIMIZE is used to gate this optimization.
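
Why MLP-only TP needs just one collective: with a column-parallel up-projection and a row-parallel down-projection, each rank's partial output sums to the full result, so a single allreduce at the end of the MLP suffices while the attention module stays replicated. Below is a minimal NumPy sketch of that identity; the shapes, the ReLU stand-in for the gated activation, and all variable names are illustrative assumptions, not vLLM-Ascend internals.

```python
# Sketch: MLP tensor parallelism simulated with array slices on one process.
import numpy as np

rng = np.random.default_rng(0)
tp = 2                      # simulated tensor-parallel world size
hidden, inter = 8, 16       # hidden size and MLP intermediate size

x = rng.standard_normal((4, hidden))          # token activations
w_up = rng.standard_normal((hidden, inter))   # column-parallel weight
w_down = rng.standard_normal((inter, hidden)) # row-parallel weight

# Reference: unsharded MLP (no parallelism).
ref = np.maximum(x @ w_up, 0.0) @ w_down      # ReLU stands in for the gating

# Sharded: each "rank" holds a column slice of w_up and a row slice of w_down.
partials = []
for rank in range(tp):
    cols = slice(rank * inter // tp, (rank + 1) * inter // tp)
    h = np.maximum(x @ w_up[:, cols], 0.0)    # local activation shard
    partials.append(h @ w_down[cols, :])      # local partial output

# The "allreduce": summing the per-rank partials recovers the full output.
out = sum(partials)
assert np.allclose(ref, out)
print("MLP-only TP matches the unsharded reference after one allreduce")
```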

- vLLM main: b17109beea

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Author: sherie
Date: 2025-08-21 09:22:07 +08:00
Committed by: GitHub
Parent: 973a7cfdf0
Commit: 3fb80ee356
6 changed files with 729 additions and 2 deletions


```diff
@@ -475,9 +475,20 @@ def register_ascend_customop():
     from vllm.model_executor.custom_op import CustomOp
     from vllm_ascend.ops.activation import AscendQuickGELU, AscendSiluAndMul
+    from vllm_ascend.ops.linear import (AscendMlpColumnParallelLinear,
+                                        AscendMlpMergedColumnParallelLinear,
+                                        AscendMlpRowParallelLinear)
     CustomOp.register_oot(_decorated_op_cls=AscendQuickGELU, name="QuickGELU")
     CustomOp.register_oot(_decorated_op_cls=AscendSiluAndMul,
                           name="SiluAndMul")
+    if envs_ascend.VLLM_ASCEND_ENABLE_MLP_OPTIMIZE:
+        CustomOp.register_oot(_decorated_op_cls=AscendMlpColumnParallelLinear,
+                              name="ColumnParallelLinear")
+        CustomOp.register_oot(_decorated_op_cls=AscendMlpRowParallelLinear,
+                              name="RowParallelLinear")
+        CustomOp.register_oot(
+            _decorated_op_cls=AscendMlpMergedColumnParallelLinear,
+            name="MergedColumnParallelLinear")
     from vllm_ascend.ops.layernorm import AscendRMSNorm
     CustomOp.register_oot(_decorated_op_cls=AscendRMSNorm, name="RMSNorm")
```
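
The registration above relies on vLLM's out-of-tree (OOT) custom-op mechanism: a plugin registers its own class under the name the engine uses to look the op up, so model code that asks for `RowParallelLinear` transparently gets the Ascend variant when the flag is set. A minimal sketch of that lookup pattern follows; it uses stand-in classes and a hypothetical `resolve` helper, not vLLM's actual CustomOp implementation.

```python
# Sketch of a name -> class override registry, the idea behind register_oot.
from typing import Dict, Type

_OOT_REGISTRY: Dict[str, Type] = {}

def register_oot(cls: Type, name: str) -> None:
    """Register `cls` as the override for the op called `name`."""
    _OOT_REGISTRY[name] = cls

def resolve(name: str, default: Type) -> Type:
    """Return the registered override for `name`, or the stock class."""
    return _OOT_REGISTRY.get(name, default)

class RowParallelLinear:                              # stand-in stock layer
    pass

class AscendMlpRowParallelLinear(RowParallelLinear):  # plugin override
    pass

register_oot(AscendMlpRowParallelLinear, name="RowParallelLinear")
assert resolve("RowParallelLinear", RowParallelLinear) is AscendMlpRowParallelLinear
```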