Integrate Triton MoE kernel (#7689)

Co-authored-by: luoyuan.luo <luoyuan.luo@antgroup.com>
This commit is contained in:
Yuan Luo
2025-07-07 11:05:49 +08:00
committed by GitHub
parent ea3e7ffec7
commit 253454de9b
7 changed files with 697 additions and 54 deletions

View File

@@ -222,6 +222,7 @@ class ServerArgs:
disable_chunked_prefix_cache: bool = False
disable_fast_image_processor: bool = False
enable_return_hidden_states: bool = False
enable_triton_kernel_moe: bool = False
warmups: Optional[str] = None
# Debug tensor dumps
@@ -1554,6 +1555,11 @@ class ServerArgs:
action="store_true",
help="Enable returning hidden states with responses.",
)
parser.add_argument(
"--enable-triton-kernel-moe",
action="store_true",
help="Use triton moe grouped gemm kernel.",
)
parser.add_argument(
"--warmups",
type=str,