[Model] Add LongCat-Flash (#3833)

### What this PR does / why we need it?
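Add LongCat-Flash model support. Among other changes, this registers a `longcat_flash` entry in `packed_modules_model_mapping` and passes `routed_scaling_factor` through `AscendFusedMoEMethod.apply` to the underlying quantization method.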
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
CI passed

- vLLM version: v0.13.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: chuyuelin <923822139@qq.com>
Co-authored-by: chuyuelin <chuyuelin1@huawei.com>
This commit is contained in:
Chu Yuelin
2025-12-31 17:06:55 +08:00
committed by GitHub
parent 03679cf1d3
commit d07d8a4535
8 changed files with 79 additions and 14 deletions

View File

@@ -298,6 +298,12 @@ packed_modules_model_mapping = {
"experts":
["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
},
"longcat_flash": {
"gate_up_proj": ["gate_proj", "up_proj"],
"experts":
["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
"fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"]
},
}
@@ -514,6 +520,7 @@ class AscendFusedMoEMethod(FusedMoEMethodBase):
num_expert_group: Optional[int] = None,
custom_routing_function: Optional[Callable] = None,
scoring_func: str = "softmax",
routed_scaling_factor: float = 1.0,
e_score_correction_bias: Optional[torch.Tensor] = None,
is_prefill: bool = True,
enable_force_load_balance: bool = False,
@@ -524,9 +531,9 @@ class AscendFusedMoEMethod(FusedMoEMethodBase):
return self.quant_method.apply(
layer, x, router_logits, top_k, renormalize, use_grouped_topk,
global_num_experts, expert_map, topk_group, num_expert_group,
custom_routing_function, scoring_func, e_score_correction_bias,
is_prefill, enable_force_load_balance, log2phy,
global_redundant_expert_num, **kwargs)
custom_routing_function, scoring_func, routed_scaling_factor,
e_score_correction_bias, is_prefill, enable_force_load_balance,
log2phy, global_redundant_expert_num, **kwargs)
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
if hasattr(self.quant_method, "process_weights_after_loading"):