diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 537cd8e..5d993e0 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -28,8 +28,6 @@ from vllm_ascend.worker.npu_input_batch import InputBatch if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput -_ALLOWED_NUM_QUERIES_PER_KV = [32, 64, 128] - class AscendMLABackend(AttentionBackend): @@ -548,15 +546,6 @@ class AscendMLAImpl(MLAAttentionImpl): self.spec_token_num = speculative_config.num_speculative_tokens assert self.spec_token_num > 0 - # TODO: support numHeads / numKvHeads < 16 in MLA kernel - if self.torchair_graph_enabled: - assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \ - ("The allowed number of queries per kv when enabling both MLA and Graph mode" - " only support {32, 64, 128}, Thus this is not supported for DeepSeek-V2-Lite," - " as it only has 16 attention heads. And if you're using DeepSeek-V3 or DeepSeek-R1," - " please make sure after the tensor parallel split, num_heads / num_kv_heads in " - "{32, 64, 128}.") - def _v_up_proj_and_o_proj(self, x, enable_multistream_mla: bool = False): # Convert from (B, N, L) to (N, B, L) x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1)