[main] flashcomm_v1 optim in Qwen Dense Models (#2802)

### What this PR does / why we need it?
Apply the Flashcomm_v1 optimization to Qwen dense models.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with the newly added and existing tests.

- vLLM version: v0.10.1.1
- vLLM main:
5e537f45b4

Co-authored-by: 1024daniel <xxltju324@gmail.com>
This commit is contained in:
rjg-lyh
2025-09-08 22:52:24 +08:00
committed by GitHub
parent 4df8df5b94
commit 1bbb20ea13
11 changed files with 362 additions and 20 deletions

View File

@@ -493,6 +493,7 @@ def register_ascend_customop():
from vllm_ascend.ops.activation import AscendQuickGELU, AscendSiluAndMul
from vllm_ascend.ops.linear import (AscendColumnParallelLinear,
AscendMergedColumnParallelLinear,
AscendQKVParallelLinear,
AscendRowParallelLinear)
from vllm_ascend.ops.rotary_embedding import (
AscendDeepseekScalingRotaryEmbedding, AscendRotaryEmbedding)
@@ -510,6 +511,8 @@ def register_ascend_customop():
name="RowParallelLinear")
CustomOp.register_oot(_decorated_op_cls=AscendMergedColumnParallelLinear,
name="MergedColumnParallelLinear")
CustomOp.register_oot(_decorated_op_cls=AscendQKVParallelLinear,
name="QKVParallelLinear")
CustomOp.register_oot(
_decorated_op_cls=AscendDeepseekScalingRotaryEmbedding,
name="DeepseekScalingRotaryEmbedding")
@@ -572,3 +575,7 @@ def mlp_tp_enable() -> bool:
def matmul_allreduce_enable() -> bool:
    """Report the ``VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE`` setting.

    Returns:
        The value of ``envs_ascend.VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE``,
        read at call time.
    """
    # NOTE(review): presumably a boolean feature flag, per the annotation;
    # confirm against the definition in envs_ascend.
    enabled = envs_ascend.VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE
    return enabled
def dense_optim_enable() -> bool:
    """Report the ``VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE`` setting.

    Returns:
        The value of ``envs_ascend.VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE``,
        read at call time.
    """
    # NOTE(review): presumably a boolean feature flag, per the annotation;
    # confirm against the definition in envs_ascend.
    enabled = envs_ascend.VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE
    return enabled