AITER backend extension and workload optimizations (#6838)

Co-authored-by: wunhuang <wunhuang@amd.com> Co-authored-by: Hubert Lu <Hubert.Lu@amd.com>
2025-06-05 23:00:18 -07:00
parent 562f279a2d
commit b819381fec
12 changed files with 583 additions and 164 deletions
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -355,6 +355,15 @@ class ModelRunner:
                # MLA architecture
                if is_hopper_with_cuda_12_3():
                    server_args.attention_backend = "fa3"
+                elif _is_hip:
+                    head_num = self.model_config.get_num_kv_heads(self.tp_size)
+                    # TODO: aiter currently only supports a head number of 16 or 128.
+                    if (
+                        head_num == 128 or head_num == 16
+                    ) and self.spec_algorithm.is_none():
+                        server_args.attention_backend = "aiter"
+                    else:
+                        server_args.attention_backend = "triton"
                else:
                    server_args.attention_backend = "triton"
            logger.info(
@@ -363,6 +372,7 @@ class ModelRunner:
        elif self.use_mla_backend:
            if server_args.device != "cpu":
                if server_args.attention_backend in [
+                    "aiter",
                    "flashinfer",
                    "fa3",
                    "triton",