From a47aa4da2f87dfd2792ef13d31476eccd27685bf Mon Sep 17 00:00:00 2001
From: hwhaokun
Date: Thu, 11 Dec 2025 17:02:21 +0800
Subject: [PATCH] [feat] apply flashcomm1 on bailing (#4868)

### What this PR does / why we need it?
This PR adjusts the layer prefix matching rules for tensor parallelism
(column/row parallel ops) to fit Bailing model's naming conventions (adding
"query_key_value" for column parallel and "attention.dense" for row parallel),
enabling flashcomm1 to work properly on the Bailing model.

### Does this PR introduce _any_ user-facing change?
No

- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

Signed-off-by: hwhaokun
---
 vllm_ascend/ops/linear_op.py | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/vllm_ascend/ops/linear_op.py b/vllm_ascend/ops/linear_op.py
index 2bffa44c..65affe8b 100644
--- a/vllm_ascend/ops/linear_op.py
+++ b/vllm_ascend/ops/linear_op.py
@@ -610,12 +610,16 @@ def _get_column_parallel_op(
     if enable_sp():
         if "shared_expert" in prefix:
             return None
-        if "gate_up_proj" in prefix:
-            return SequenceColumnParallelOp(layer)
-        if "in_proj" in prefix:
-            return SequenceColumnParallelOp(layer)
-        if "qkv_proj" in prefix or "conv1d" in prefix:
-            return SequenceColumnParallelOp(layer)
+        sp_column_prefix = [
+            "gate_up_proj",  # first MLP of most LLMs
+            "in_proj",  # gated deltanet of Qwen3 Next
+            "qkv_proj",  # qkv linear of most LLMs
+            "conv1d",  # gated deltanet of Qwen3 Next
+            "query_key_value",  # qkv linear of Bailing
+        ]
+        for a_prefix in sp_column_prefix:
+            if a_prefix in prefix:
+                return SequenceColumnParallelOp(layer)
 
     return None
 
@@ -637,8 +641,15 @@ def _get_row_parallel_op(
     if enable_sp():
         if "shared_expert" in prefix:
             return None
-        if "o_proj" in prefix or "out_proj" in prefix or "down_proj" in prefix:
-            return SequenceRowParallelOp(layer)
+        sp_row_prefixes = [
+            "o_proj",  # attn output linear of most LLMs
+            "out_proj",  # attn output linear of Qwen3 Next
+            "down_proj",  # second MLP of most LLMs
+            "attention.dense",  # attn output linear of Bailing
+        ]
+        for a_prefix in sp_row_prefixes:
+            if a_prefix in prefix:
+                return SequenceRowParallelOp(layer)
 
     return None