Integrate Triton MoE kernel (#7689)

Co-authored-by: luoyuan.luo <luoyuan.luo@antgroup.com>
This commit is contained in:
Yuan Luo
2025-07-07 11:05:49 +08:00
committed by GitHub
parent ea3e7ffec7
commit 253454de9b
7 changed files with 697 additions and 54 deletions

View File

@@ -222,6 +222,7 @@ class ServerArgs:
disable_chunked_prefix_cache: bool = False
disable_fast_image_processor: bool = False
enable_return_hidden_states: bool = False
enable_triton_kernel_moe: bool = False
warmups: Optional[str] = None
# Debug tensor dumps
@@ -1554,6 +1555,11 @@ class ServerArgs:
action="store_true",
help="Enable returning hidden states with responses.",
)
parser.add_argument(
"--enable-triton-kernel-moe",
action="store_true",
help="Use triton moe grouped gemm kernel.",
)
parser.add_argument(
"--warmups",
type=str,