[Model] Add LongCat-Flash (#3833)

### What this PR does / why we need it?
Add LongCat-Flash model support. LongCat-Flash carries two attention modules per decoder layer, so the NPU model runner now passes a `num_attn_module` count through to `bind_kv_cache` when binding KV caches.
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
CI passed

- vLLM version: v0.13.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: chuyuelin <923822139@qq.com>
Co-authored-by: chuyuelin <chuyuelin1@huawei.com>
Author: Chu Yuelin
Date: 2025-12-31 17:06:55 +08:00
Committed by: GitHub
Parent: 03679cf1d3
Commit: d07d8a4535
8 changed files with 79 additions and 14 deletions


```diff
@@ -2240,9 +2240,10 @@ class NPUModelRunner(GPUModelRunner):
                 kv_caches[layer_name] = kv_caches[target_layer_name]
 
         from vllm.v1.worker.utils import bind_kv_cache
+        num_attn_module = 2 if self.model_config.hf_config.model_type == "longcat_flash" else 1
         bind_kv_cache(kv_caches,
                       self.compilation_config.static_forward_context,
-                      self.kv_caches)
+                      self.kv_caches, num_attn_module)
         return kv_caches
 
     def _allocate_kv_cache_tensors(
```
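
For context, here is a minimal sketch of what a binding helper that accepts `num_attn_module` could look like. This is not the actual `vllm.v1.worker.utils.bind_kv_cache` implementation; the function name `bind_kv_cache_sketch`, the `layers.<i>.` naming assumption, and the forward-context layout are illustrative assumptions inferred from the call site in the diff above.

```python
import re
from typing import Any


def bind_kv_cache_sketch(
    kv_caches: dict[str, Any],
    forward_context: dict[str, Any],
    runner_kv_caches: list[Any],
    num_attn_module: int = 1,
) -> None:
    """Hypothetical sketch: bind each attention layer's KV cache tensor
    and keep a flat, layer-ordered list for the model runner."""

    def decoder_layer_index(name: str) -> int:
        # Assumes layer names embed their decoder-layer index,
        # e.g. "model.layers.7.self_attn.attn" -> 7.
        match = re.search(r"layers\.(\d+)\.", name)
        assert match is not None, f"unexpected layer name: {name}"
        return int(match.group(1))

    # With num_attn_module > 1, every decoder layer contributes that many
    # attention layer names, so the flat list must hold
    # num_layers * num_attn_module tensors in a deterministic order.
    ordered = sorted(kv_caches, key=lambda n: (decoder_layer_index(n), n))
    assert len(ordered) % num_attn_module == 0

    runner_kv_caches.clear()
    for name in ordered:
        # Point the per-layer attention module at its cache tensor, and
        # keep a flat ordered view for the runner.
        forward_context[name].kv_cache = kv_caches[name]
        runner_kv_caches.append(kv_caches[name])
```

The reason for threading `num_attn_module` through is that models like `longcat_flash` register two attention layer names per decoder layer, so any code that sizes or indexes the flat `self.kv_caches` list by decoder layer must scale by that factor.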