diff --git a/benchmark/kernels/quantization/tuning_block_wise_fp8.py b/benchmark/kernels/quantization/tuning_block_wise_fp8.py index 07bdb4bf1..b2df304f0 100644 --- a/benchmark/kernels/quantization/tuning_block_wise_fp8.py +++ b/benchmark/kernels/quantization/tuning_block_wise_fp8.py @@ -23,8 +23,13 @@ import torch import triton from tqdm import tqdm -from sglang.srt.layers.quantization.fp8_kernel import _w8a8_block_fp8_matmul -from sglang.srt.utils import get_device_name +from sglang.srt.layers.quantization.fp8_kernel import ( + _w8a8_block_fp8_matmul, + _w8a8_block_fp8_matmul_unrolledx4, +) +from sglang.srt.utils import get_device_core_count, get_device_name, is_hip + +is_hip_ = is_hip() DTYPE_MAP = { "float32": torch.float32, @@ -80,7 +85,19 @@ def w8a8_block_fp8_matmul( triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), ) - _w8a8_block_fp8_matmul[grid]( + # Use manually unrolledx4 kernel on AMD GPU when the grid size is small. + # Empirical testing shows the sweet spot lies when it's less than the # of + # compute units available on the device. + num_workgroups = triton.cdiv(M, config["BLOCK_SIZE_M"]) * triton.cdiv( + N, config["BLOCK_SIZE_N"] + ) + kernel = ( + _w8a8_block_fp8_matmul_unrolledx4 + if (is_hip_ == True and num_workgroups <= get_device_core_count()) + else _w8a8_block_fp8_matmul + ) + + kernel[grid]( A, B, C, @@ -107,14 +124,15 @@ def w8a8_block_fp8_matmul( return C -def get_configs_compute_bound(): +def get_rocm_configs_compute_bound(): configs = [] - for num_stages in [2, 3, 4, 5]: - for block_m in [16, 32, 64, 128, 256]: - for block_k in [64, 128]: - for block_n in [32, 64, 128, 256]: + waves_per_eu_range = 0 + for num_stages in [2]: + for block_m in [32, 64, 128, 256]: + for block_k in [32, 64, 128, 256]: + for block_n in [16, 32, 64, 128, 256]: for num_warps in [4, 8]: - for group_size in [1, 16, 32, 64]: + for group_size in [1, 4, 8, 16, 32]: configs.append( { "BLOCK_SIZE_M": block_m, @@ -123,11 +141,36 @@ def get_configs_compute_bound(): "GROUP_SIZE_M": group_size, "num_warps": num_warps, "num_stages": num_stages, + "waves_per_eu": waves_per_eu_range, } ) return configs +def get_configs_compute_bound(): + configs = [] + if is_hip_: + configs = get_rocm_configs_compute_bound() + else: + for num_stages in [2, 3, 4, 5]: + for block_m in [16, 32, 64, 128, 256]: + for block_k in [64, 128]: + for block_n in [32, 64, 128, 256]: + for num_warps in [4, 8]: + for group_size in [1, 16, 32, 64]: + configs.append( + { + "BLOCK_SIZE_M": block_m, + "BLOCK_SIZE_N": block_n, + "BLOCK_SIZE_K": block_k, + "GROUP_SIZE_M": group_size, + "num_warps": num_warps, + "num_stages": num_stages, + } + ) + return configs + + def get_weight_shapes(tp_size): # NOTE(HandH1998): The weight shapes only works for DeepSeek-V3. Modify them, if you tune for another different model. # cannot TP @@ -190,14 +233,18 @@ def benchmark_config( def tune(M, N, K, block_size, out_dtype, search_space): factor_for_scale = 1e-2 - fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_info = torch.finfo(torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn) fp8_max, fp8_min = fp8_info.max, fp8_info.min A_fp32 = (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max - A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to( + torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn + ) B_fp32 = (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max - B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to( + torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn + ) block_n, block_k = block_size[0], block_size[1] n_tiles = (N + block_n - 1) // block_n diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json index a7be90051..e25d0492d 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -72,10 +72,10 @@ "waves_per_eu": 0 }, "64": { - "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 diff --git a/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json index c098ef2db..3633e363f 100644 --- a/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +++ b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -1,15 +1,6 @@ { "1": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, - "2": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, @@ -17,35 +8,44 @@ "num_stages": 2, "waves_per_eu": 0 }, - "4": { - "BLOCK_SIZE_M": 64, + "2": { + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "16": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "24": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, @@ -54,10 +54,10 @@ "waves_per_eu": 0 }, "32": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -66,7 +66,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -75,7 +75,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -102,43 +102,43 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "512": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "1024": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "1536": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 8, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "2048": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 8, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -147,16 +147,16 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "4096": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 diff --git a/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json index 6f5adbb93..80d23e969 100644 --- a/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +++ b/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -1,33 +1,15 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, - "4": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, - "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, @@ -35,29 +17,47 @@ "num_stages": 2, "waves_per_eu": 0 }, - "16": { - "BLOCK_SIZE_M": 64, + "4": { + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "24": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "32": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -75,7 +75,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -93,23 +93,23 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "256": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "512": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, @@ -118,27 +118,27 @@ }, "1024": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "1536": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "2048": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -147,7 +147,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -156,7 +156,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 diff --git a/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json index 4225c78eb..a5518d979 100644 --- a/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +++ b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -1,63 +1,63 @@ { "1": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "4": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "16": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "24": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "32": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -66,43 +66,43 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "64": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, - "96": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, "128": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "256": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -117,17 +117,17 @@ "waves_per_eu": 0 }, "1024": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "1536": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, @@ -138,7 +138,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 diff --git a/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json index 5e6789d00..44c67c008 100644 --- a/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +++ b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -1,63 +1,63 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "2": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "4": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "16": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "24": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "32": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -66,7 +66,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -75,32 +75,32 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 8, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "96": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "128": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "256": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, @@ -109,7 +109,7 @@ }, "512": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, @@ -120,16 +120,16 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "1536": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -138,7 +138,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -147,7 +147,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -156,7 +156,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 diff --git a/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json index 49ac14d2a..09502f05c 100644 --- a/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +++ b/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -1,15 +1,15 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "2": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, @@ -18,7 +18,7 @@ "waves_per_eu": 0 }, "4": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, @@ -27,16 +27,16 @@ "waves_per_eu": 0 }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "16": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, @@ -45,7 +45,7 @@ "waves_per_eu": 0 }, "24": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, @@ -54,28 +54,28 @@ "waves_per_eu": 0 }, "32": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "48": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "64": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -84,7 +84,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -93,7 +93,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -117,8 +117,8 @@ "waves_per_eu": 0 }, "1024": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, @@ -126,8 +126,8 @@ "waves_per_eu": 0 }, "1536": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_warps": 4, @@ -136,27 +136,27 @@ }, "2048": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, - "3072": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, "4096": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 diff --git a/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json index dcbb0efc5..7fd942e3e 100644 --- a/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +++ b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -1,15 +1,15 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "2": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, @@ -18,7 +18,7 @@ "waves_per_eu": 0 }, "4": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, @@ -27,34 +27,34 @@ "waves_per_eu": 0 }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "16": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "24": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "32": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, @@ -63,19 +63,19 @@ "waves_per_eu": 0 }, "48": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "64": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -84,7 +84,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -93,7 +93,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -102,7 +102,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -111,32 +111,32 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 8, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "1024": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "1536": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "2048": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, @@ -144,19 +144,19 @@ "waves_per_eu": 0 }, "3072": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "4096": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json index dfe5c1e43..72ab3dff2 100644 --- a/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -1,33 +1,33 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "2": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "4": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, @@ -36,7 +36,7 @@ "waves_per_eu": 0 }, "16": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, @@ -45,19 +45,19 @@ "waves_per_eu": 0 }, "24": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "32": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -66,14 +66,14 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "64": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, @@ -81,8 +81,8 @@ "waves_per_eu": 0 }, "96": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, @@ -90,17 +90,17 @@ "waves_per_eu": 0 }, "128": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 8, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "256": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, @@ -118,7 +118,7 @@ }, "1024": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, @@ -127,7 +127,7 @@ }, "1536": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json index a87f5de1b..37208577d 100644 --- a/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -1,6 +1,33 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, @@ -8,17 +35,26 @@ "num_stages": 2, "waves_per_eu": 0 }, - "2": { - "BLOCK_SIZE_M": 64, + "16": { + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, - "4": { - "BLOCK_SIZE_M": 64, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, @@ -26,45 +62,9 @@ "num_stages": 2, "waves_per_eu": 0 }, - "8": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, - "16": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, - "24": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, - "32": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, "48": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, @@ -72,8 +72,8 @@ "waves_per_eu": 0 }, "64": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, @@ -81,8 +81,8 @@ "waves_per_eu": 0 }, "96": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, @@ -91,7 +91,7 @@ }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, @@ -99,8 +99,8 @@ "waves_per_eu": 0 }, "256": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, @@ -111,14 +111,14 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "1024": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, @@ -127,9 +127,9 @@ }, "1536": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -138,7 +138,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -147,7 +147,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -156,7 +156,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json index 468f9e78d..d8cc0f896 100644 --- a/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -1,61 +1,61 @@ { "1": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, - "4": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, "8": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "16": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "24": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "32": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, @@ -64,59 +64,14 @@ }, "48": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "64": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, - "96": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, - "128": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, - "256": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, - "512": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, - "1024": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, @@ -125,20 +80,65 @@ "num_stages": 2, "waves_per_eu": 0 }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, "1536": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "2048": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -156,7 +156,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0