ROCm: Flex Attention Enablement with custom backends (#4178)

Co-authored-by: linsun12 <linsun12@amd.com>
This commit is contained in:
HAI
2025-03-07 04:38:53 -08:00
committed by GitHub
parent c827c671f7
commit 0beea4503f
7 changed files with 1434 additions and 35 deletions

View File

@@ -710,13 +710,23 @@ class ServerArgs:
)
# Kernel backend
-        parser.add_argument(
-            "--attention-backend",
-            type=str,
-            choices=["flashinfer", "triton", "torch_native"],
-            default=ServerArgs.attention_backend,
-            help="Choose the kernels for attention layers.",
-        )
+        if is_hip():
+            parser.add_argument(
+                "--attention-backend",
+                type=str,
+                choices=["triton", "torch_native", "aiter", "aiter_decode"],
+                default=ServerArgs.attention_backend,
+                help="Choose the kernels for attention layers.",
+            )
+        else:
+            parser.add_argument(
+                "--attention-backend",
+                type=str,
+                choices=["flashinfer", "triton", "torch_native"],
+                default=ServerArgs.attention_backend,
+                help="Choose the kernels for attention layers.",
+            )
parser.add_argument(
"--sampling-backend",
type=str,