[feat] apply flashcomm1 on bailing (#4868)
### What this PR does / why we need it?
This PR adjusts the layer prefix matching rules for tensor parallelism
(column/row parallel ops) to fit the Bailing model's naming conventions:
it adds "query_key_value" to the column-parallel prefixes and
"attention.dense" to the row-parallel prefixes. This enables flashcomm1
to work properly on the Bailing model.
### Does this PR introduce _any_ user-facing change?
No
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
Signed-off-by: hwhaokun <haokun0405@163.com>
This commit is contained in:
@@ -610,12 +610,16 @@ def _get_column_parallel_op(
|
|||||||
if enable_sp():
|
if enable_sp():
|
||||||
if "shared_expert" in prefix:
|
if "shared_expert" in prefix:
|
||||||
return None
|
return None
|
||||||
if "gate_up_proj" in prefix:
|
sp_column_prefix = [
|
||||||
return SequenceColumnParallelOp(layer)
|
"gate_up_proj", # first MLP of most LLMs
|
||||||
if "in_proj" in prefix:
|
"in_proj", # gated deltanet of Qwen3 Next
|
||||||
return SequenceColumnParallelOp(layer)
|
"qkv_proj", # qkv linear of most LLMs
|
||||||
if "qkv_proj" in prefix or "conv1d" in prefix:
|
"conv1d", # gated deltanet of Qwen3 Next
|
||||||
return SequenceColumnParallelOp(layer)
|
"query_key_value", # qkv linear of Bailing
|
||||||
|
]
|
||||||
|
for a_prefix in sp_column_prefix:
|
||||||
|
if a_prefix in prefix:
|
||||||
|
return SequenceColumnParallelOp(layer)
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -637,8 +641,15 @@ def _get_row_parallel_op(
|
|||||||
if enable_sp():
|
if enable_sp():
|
||||||
if "shared_expert" in prefix:
|
if "shared_expert" in prefix:
|
||||||
return None
|
return None
|
||||||
if "o_proj" in prefix or "out_proj" in prefix or "down_proj" in prefix:
|
sp_row_prefixes = [
|
||||||
return SequenceRowParallelOp(layer)
|
"o_proj", # attn output linear of most LLMs
|
||||||
|
"out_proj", # attn output linear of Qwen3 Next
|
||||||
|
"down_proj", # second MLP of most LLMs
|
||||||
|
"attention.dense", # attn output linear of Bailing
|
||||||
|
]
|
||||||
|
for a_prefix in sp_row_prefixes:
|
||||||
|
if a_prefix in prefix:
|
||||||
|
return SequenceRowParallelOp(layer)
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user