diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json index 6a976788f..4d4b752fa 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 1, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 1, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 32, "kpack": 2 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json index 6a976788f..4d4b752fa 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 1, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 1, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 32, "kpack": 2 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json index 6a976788f..4d4b752fa 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 1, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 1, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 32, "kpack": 2 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json index 0a46390b2..a218fc406 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json index 0a46390b2..a218fc406 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json index 0a46390b2..a218fc406 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json index 91011e64c..3682cc548 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 32, "kpack": 2 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json index 91011e64c..3682cc548 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 32, "kpack": 2 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json index 91011e64c..3682cc548 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 32, "kpack": 2 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json index bb17743b6..d7f14d665 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json index bb17743b6..d7f14d665 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json index bb17743b6..d7f14d665 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json index f807d4a5a..21742854c 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 32, "kpack": 2 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json index f807d4a5a..21742854c 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 32, "kpack": 2 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json index f807d4a5a..21742854c 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 32, "kpack": 2 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json index 92c41a28b..d9d2f5eac 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -79,7 +79,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -90,7 +90,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -101,7 +101,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -112,7 +112,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -123,7 +123,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -134,7 +134,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -145,7 +145,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -156,7 +156,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -167,7 +167,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json index 92c41a28b..d9d2f5eac 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -79,7 +79,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -90,7 +90,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -101,7 +101,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -112,7 +112,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -123,7 +123,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -134,7 +134,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -145,7 +145,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -156,7 +156,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -167,7 +167,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json index 92c41a28b..d9d2f5eac 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -79,7 +79,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -90,7 +90,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -101,7 +101,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -112,7 +112,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -123,7 +123,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -134,7 +134,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -145,7 +145,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -156,7 +156,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -167,7 +167,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 2, "matrix_instr_nonkdim": 16, "kpack": 2