[Model] Add LongCat-Flash (#3833)

### What this PR does / why we need it?

Add LongCat-Flash support.

### Does this PR introduce _any_ user-facing change?

N/A

### How was this patch tested?

CI passed

- vLLM version: v0.13.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: chuyuelin <923822139@qq.com>
Co-authored-by: chuyuelin <chuyuelin1@huawei.com>
2025-12-31 17:06:55 +08:00
parent 03679cf1d3
commit d07d8a4535
8 changed files with 79 additions and 14 deletions
--- a/vllm_ascend/ops/mla.py
+++ b/vllm_ascend/ops/mla.py
@@ -94,8 +94,6 @@ class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper):
        hf_config = get_current_vllm_config().model_config.hf_config
        self.enable_shared_expert_dp = get_ascend_config(
        ).enable_shared_expert_dp
-        self.debug_layer_idx = int(self.prefix.split(".")[-2])
-        self.first_k_dense_replace = hf_config.first_k_dense_replace
        self.tp_size = get_tensor_model_parallel_world_size()
        self.layers = hf_config.num_hidden_layers
        if mla_modules.indexer is not None: