Ascend attention backend (PA & MLA) (#7722)

Co-authored-by: Maksim <makcum888e@mail.ru>
Co-authored-by: VDV1985 <vladdv85@mail.ru>
2025-07-03 19:23:19 +03:00
parent b58226510f
commit 1e0e549766
17 changed files with 842 additions and 16 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -380,6 +380,12 @@ class ServerArgs:
            )
            self.disable_cuda_graph = True

+        if self.attention_backend == "ascend":
+            logger.warning(
+                "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
+            )
+            self.page_size = 128
+
        # Choose grammar backend
        if self.grammar_backend is None:
            self.grammar_backend = "xgrammar"
@@ -1113,6 +1119,7 @@ class ServerArgs:
                "flashmla",
                "intel_amx",
                "torch_native",
+                "ascend",
                "triton",
            ],
            default=ServerArgs.attention_backend,