Ascend attention backend (PA & MLA) (#7722)

Co-authored-by: Maksim <makcum888e@mail.ru>
Co-authored-by: VDV1985 <vladdv85@mail.ru>
This commit is contained in:
ronnie_zheng
2025-07-03 19:23:19 +03:00
committed by GitHub
parent b58226510f
commit 1e0e549766
17 changed files with 842 additions and 16 deletions

View File

@@ -380,6 +380,12 @@ class ServerArgs:
)
self.disable_cuda_graph = True
if self.attention_backend == "ascend":
logger.warning(
"At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
)
self.page_size = 128
# Choose grammar backend
if self.grammar_backend is None:
self.grammar_backend = "xgrammar"
@@ -1113,6 +1119,7 @@ class ServerArgs:
"flashmla",
"intel_amx",
"torch_native",
"ascend",
"triton",
],
default=ServerArgs.attention_backend,