[feat] apply flashcomm1 on bailing (#4868)

### What this PR does / why we need it?

This PR adjusts the layer-prefix matching rules used to select the tensor-parallel ops (column/row parallel ops) so that they fit the Bailing model's naming conventions (adding "query_key_value" for column parallel and "attention.dense" for row parallel), enabling flashcomm1 to work properly on the Bailing model.

### Does this PR introduce _any_ user-facing change?

No

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

Signed-off-by: hwhaokun <haokun0405@163.com>
2025-12-11 17:02:21 +08:00
parent 2f965d8339
commit a47aa4da2f
1 changed files with 19 additions and 8 deletions
--- a/vllm_ascend/ops/linear_op.py
+++ b/vllm_ascend/ops/linear_op.py
@@ -610,12 +610,16 @@ def _get_column_parallel_op(
    if enable_sp():
        if "shared_expert" in prefix:
            return None
-        if "gate_up_proj" in prefix:
+        sp_column_prefix = [
-            return SequenceColumnParallelOp(layer)
+            "gate_up_proj",  # first MLP of most LLMs 
-        if "in_proj" in prefix:
+            "in_proj",  # gated deltanet of Qwen3 Next
-            return SequenceColumnParallelOp(layer)
+            "qkv_proj",  # qkv linear of most LLMs
-        if "qkv_proj" in prefix or "conv1d" in prefix:
+            "conv1d",  # gated deltanet of Qwen3 Next
-            return SequenceColumnParallelOp(layer)
+            "query_key_value",  # qkv linear of Bailing
        ]
        for a_prefix in sp_column_prefix:
            if a_prefix in prefix:
                return SequenceColumnParallelOp(layer)
    return None
@@ -637,8 +641,15 @@ def _get_row_parallel_op(
    if enable_sp():
        if "shared_expert" in prefix:
            return None
-        if "o_proj" in prefix or "out_proj" in prefix or "down_proj" in prefix:
+        sp_row_prefixes = [
-            return SequenceRowParallelOp(layer)
+            "o_proj",  # attn output linear of most LLMs
            "out_proj",  # attn output linear of Qwen3 Next
            "down_proj",  # second MLP of most LLMs
            "attention.dense",  # attn output linear of Bailing
        ]
        for a_prefix in sp_row_prefixes:
            if a_prefix in prefix:
                return SequenceRowParallelOp(layer)
    return None