Ascend attention backend(PA&MLA) (#7722)
Co-authored-by: Maksim <makcum888e@mail.ru>
Co-authored-by: VDV1985 <vladdv85@mail.ru>
This commit is contained in:
@@ -380,6 +380,12 @@ class ServerArgs:
|
||||
)
|
||||
self.disable_cuda_graph = True
|
||||
|
||||
if self.attention_backend == "ascend":
|
||||
logger.warning(
|
||||
"At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
|
||||
)
|
||||
self.page_size = 128
|
||||
|
||||
# Choose grammar backend
|
||||
if self.grammar_backend is None:
|
||||
self.grammar_backend = "xgrammar"
|
||||
@@ -1113,6 +1119,7 @@ class ServerArgs:
|
||||
"flashmla",
|
||||
"intel_amx",
|
||||
"torch_native",
|
||||
"ascend",
|
||||
"triton",
|
||||
],
|
||||
default=ServerArgs.attention_backend,
|
||||
|
||||
Reference in New Issue
Block a user