diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index bac7345a..84264698 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -3241,7 +3241,6 @@ class NPUModelRunner(GPUModelRunner): cache_sparse_c8=self.use_sparse_c8_indexer, ) elif spec := attn_module.get_kv_cache_spec(self.vllm_config): - assert isinstance(spec, MLAAttentionSpec) from vllm.v1.kv_cache_interface import MLAAttentionSpec as AscendMLAAttentionSpec if getattr(attn_module.impl, "fa_quant_layer", False): head_size = attn_module.head_size + attn_module.qk_rope_head_dim