[BugFix] Fix the problem that torchair doesn't support tp > 4. (#1508)
This PR removes the restriction that TP cannot be greater than 4 in the
torchair scenario, because the latest version of CANN has fixed this
bug.
- vLLM version: v0.10.0
- vLLM main:
04ff4be310
Signed-off-by: whx-sjtu <2952154980@qq.com>
This commit is contained in:
@@ -28,8 +28,6 @@ from vllm_ascend.worker.npu_input_batch import InputBatch
|
||||
if TYPE_CHECKING:
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
|
||||
_ALLOWED_NUM_QUERIES_PER_KV = [32, 64, 128]
|
||||
|
||||
|
||||
class AscendMLABackend(AttentionBackend):
|
||||
|
||||
@@ -548,15 +546,6 @@ class AscendMLAImpl(MLAAttentionImpl):
|
||||
self.spec_token_num = speculative_config.num_speculative_tokens
|
||||
assert self.spec_token_num > 0
|
||||
|
||||
# TODO: support numHeads / numKvHeads < 16 in MLA kernel
|
||||
if self.torchair_graph_enabled:
|
||||
assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \
|
||||
("The allowed number of queries per kv when enabling both MLA and Graph mode"
|
||||
" only support {32, 64, 128}, Thus this is not supported for DeepSeek-V2-Lite,"
|
||||
" as it only has 16 attention heads. And if you're using DeepSeek-V3 or DeepSeek-R1,"
|
||||
" please make sure after the tensor parallel split, num_heads / num_kv_heads in "
|
||||
"{32, 64, 128}.")
|
||||
|
||||
def _v_up_proj_and_o_proj(self, x, enable_multistream_mla: bool = False):
|
||||
# Convert from (B, N, L) to (N, B, L)
|
||||
x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1)
|
||||
|
||||
Reference in New Issue
Block a user