[Model] Add LongCat-Flash (#3833)
### What this PR does / why we need it?
Add LongCat-Flash support.
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
CI passed
- vLLM version: v0.13.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: chuyuelin <923822139@qq.com>
Co-authored-by: chuyuelin <chuyuelin1@huawei.com>
This commit is contained in:
@@ -2240,9 +2240,10 @@ class NPUModelRunner(GPUModelRunner):
|
||||
kv_caches[layer_name] = kv_caches[target_layer_name]
|
||||
|
||||
from vllm.v1.worker.utils import bind_kv_cache
|
||||
num_attn_module = 2 if self.model_config.hf_config.model_type == "longcat_flash" else 1
|
||||
bind_kv_cache(kv_caches,
|
||||
self.compilation_config.static_forward_context,
|
||||
self.kv_caches)
|
||||
self.kv_caches, num_attn_module)
|
||||
return kv_caches
|
||||
|
||||
def _allocate_kv_cache_tensors(
|
||||
|
||||
Reference in New Issue
Block a user