[ROCm] Add ROCm tuning config to block gemm and Re-tune for AMD Radeon Graphics (#3418)
Co-authored-by: Bruce Xue <yigex@xilinx.com> Co-authored-by: HAI <hixiao@gmail.com>
This commit is contained in:
@@ -23,8 +23,13 @@ import torch
|
|||||||
import triton
|
import triton
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from sglang.srt.layers.quantization.fp8_kernel import _w8a8_block_fp8_matmul
|
from sglang.srt.layers.quantization.fp8_kernel import (
|
||||||
from sglang.srt.utils import get_device_name
|
_w8a8_block_fp8_matmul,
|
||||||
|
_w8a8_block_fp8_matmul_unrolledx4,
|
||||||
|
)
|
||||||
|
from sglang.srt.utils import get_device_core_count, get_device_name, is_hip
|
||||||
|
|
||||||
|
is_hip_ = is_hip()
|
||||||
|
|
||||||
DTYPE_MAP = {
|
DTYPE_MAP = {
|
||||||
"float32": torch.float32,
|
"float32": torch.float32,
|
||||||
@@ -80,7 +85,19 @@ def w8a8_block_fp8_matmul(
|
|||||||
triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
|
triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
|
||||||
)
|
)
|
||||||
|
|
||||||
_w8a8_block_fp8_matmul[grid](
|
# Use manually unrolledx4 kernel on AMD GPU when the grid size is small.
|
||||||
|
# Empirical testing shows the sweet spot lies when it's less than the # of
|
||||||
|
# compute units available on the device.
|
||||||
|
num_workgroups = triton.cdiv(M, config["BLOCK_SIZE_M"]) * triton.cdiv(
|
||||||
|
N, config["BLOCK_SIZE_N"]
|
||||||
|
)
|
||||||
|
kernel = (
|
||||||
|
_w8a8_block_fp8_matmul_unrolledx4
|
||||||
|
if (is_hip_ == True and num_workgroups <= get_device_core_count())
|
||||||
|
else _w8a8_block_fp8_matmul
|
||||||
|
)
|
||||||
|
|
||||||
|
kernel[grid](
|
||||||
A,
|
A,
|
||||||
B,
|
B,
|
||||||
C,
|
C,
|
||||||
@@ -107,14 +124,15 @@ def w8a8_block_fp8_matmul(
|
|||||||
return C
|
return C
|
||||||
|
|
||||||
|
|
||||||
def get_configs_compute_bound():
|
def get_rocm_configs_compute_bound():
|
||||||
configs = []
|
configs = []
|
||||||
for num_stages in [2, 3, 4, 5]:
|
waves_per_eu_range = 0
|
||||||
for block_m in [16, 32, 64, 128, 256]:
|
for num_stages in [2]:
|
||||||
for block_k in [64, 128]:
|
for block_m in [32, 64, 128, 256]:
|
||||||
for block_n in [32, 64, 128, 256]:
|
for block_k in [32, 64, 128, 256]:
|
||||||
|
for block_n in [16, 32, 64, 128, 256]:
|
||||||
for num_warps in [4, 8]:
|
for num_warps in [4, 8]:
|
||||||
for group_size in [1, 16, 32, 64]:
|
for group_size in [1, 4, 8, 16, 32]:
|
||||||
configs.append(
|
configs.append(
|
||||||
{
|
{
|
||||||
"BLOCK_SIZE_M": block_m,
|
"BLOCK_SIZE_M": block_m,
|
||||||
@@ -123,11 +141,36 @@ def get_configs_compute_bound():
|
|||||||
"GROUP_SIZE_M": group_size,
|
"GROUP_SIZE_M": group_size,
|
||||||
"num_warps": num_warps,
|
"num_warps": num_warps,
|
||||||
"num_stages": num_stages,
|
"num_stages": num_stages,
|
||||||
|
"waves_per_eu": waves_per_eu_range,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
return configs
|
return configs
|
||||||
|
|
||||||
|
|
||||||
|
def get_configs_compute_bound():
    """Build the list of candidate kernel configs to try during tuning.

    When the module-level ``is_hip_`` flag is truthy, the search space is
    delegated entirely to ``get_rocm_configs_compute_bound()``. Otherwise
    every combination of the parameter lists below is produced.

    Returns:
        list[dict]: one dict per candidate. On the non-ROCm path each dict
        has the keys ``BLOCK_SIZE_M``, ``BLOCK_SIZE_N``, ``BLOCK_SIZE_K``,
        ``GROUP_SIZE_M``, ``num_warps`` and ``num_stages``.
    """
    # Guard clause: the ROCm search space is owned by its own helper, so no
    # empty list needs to be created first.
    if is_hip_:
        return get_rocm_configs_compute_bound()

    # Function-scope import keeps the block self-contained.
    from itertools import product

    # The factor order mirrors the original loop nesting (outermost first),
    # so candidates are generated in exactly the same sequence as before.
    search_space = product(
        [2, 3, 4, 5],  # num_stages
        [16, 32, 64, 128, 256],  # block_m
        [64, 128],  # block_k
        [32, 64, 128, 256],  # block_n
        [4, 8],  # num_warps
        [1, 16, 32, 64],  # group_size
    )
    return [
        {
            "BLOCK_SIZE_M": block_m,
            "BLOCK_SIZE_N": block_n,
            "BLOCK_SIZE_K": block_k,
            "GROUP_SIZE_M": group_size,
            "num_warps": num_warps,
            "num_stages": num_stages,
        }
        for num_stages, block_m, block_k, block_n, num_warps, group_size in search_space
    ]
|
||||||
|
|
||||||
|
|
||||||
def get_weight_shapes(tp_size):
|
def get_weight_shapes(tp_size):
|
||||||
# NOTE(HandH1998): These weight shapes only work for DeepSeek-V3. Modify them if you tune for a different model.
|
# NOTE(HandH1998): These weight shapes only work for DeepSeek-V3. Modify them if you tune for a different model.
|
||||||
# cannot TP
|
# cannot TP
|
||||||
@@ -190,14 +233,18 @@ def benchmark_config(
|
|||||||
|
|
||||||
def tune(M, N, K, block_size, out_dtype, search_space):
|
def tune(M, N, K, block_size, out_dtype, search_space):
|
||||||
factor_for_scale = 1e-2
|
factor_for_scale = 1e-2
|
||||||
fp8_info = torch.finfo(torch.float8_e4m3fn)
|
fp8_info = torch.finfo(torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn)
|
||||||
fp8_max, fp8_min = fp8_info.max, fp8_info.min
|
fp8_max, fp8_min = fp8_info.max, fp8_info.min
|
||||||
|
|
||||||
A_fp32 = (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max
|
A_fp32 = (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max
|
||||||
A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
|
A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(
|
||||||
|
torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn
|
||||||
|
)
|
||||||
|
|
||||||
B_fp32 = (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max
|
B_fp32 = (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max
|
||||||
B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
|
B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(
|
||||||
|
torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn
|
||||||
|
)
|
||||||
|
|
||||||
block_n, block_k = block_size[0], block_size[1]
|
block_n, block_k = block_size[0], block_size[1]
|
||||||
n_tiles = (N + block_n - 1) // block_n
|
n_tiles = (N + block_n - 1) // block_n
|
||||||
|
|||||||
@@ -72,10 +72,10 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"64": {
|
"64": {
|
||||||
"BLOCK_SIZE_M": 256,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 128,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
|
|||||||
@@ -1,15 +1,6 @@
|
|||||||
{
|
{
|
||||||
"1": {
|
"1": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
|
||||||
"BLOCK_SIZE_K": 128,
|
|
||||||
"GROUP_SIZE_M": 4,
|
|
||||||
"num_warps": 4,
|
|
||||||
"num_stages": 2,
|
|
||||||
"waves_per_eu": 0
|
|
||||||
},
|
|
||||||
"2": {
|
|
||||||
"BLOCK_SIZE_M": 64,
|
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 8,
|
"GROUP_SIZE_M": 8,
|
||||||
@@ -17,35 +8,44 @@
|
|||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"4": {
|
"2": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 32,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 16,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"8": {
|
"8": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 16,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"16": {
|
"16": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 16,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"24": {
|
"24": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 32,
|
"GROUP_SIZE_M": 32,
|
||||||
@@ -54,10 +54,10 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"32": {
|
"32": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -66,7 +66,7 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 16,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -75,7 +75,7 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 32,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -102,43 +102,43 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"512": {
|
"512": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"1024": {
|
"1024": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"1536": {
|
"1536": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 64,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 8,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"2048": {
|
"2048": {
|
||||||
"BLOCK_SIZE_M": 128,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 32,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 8,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -147,16 +147,16 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 128,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 16,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"4096": {
|
"4096": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 64,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 16,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
|
|||||||
@@ -1,33 +1,15 @@
|
|||||||
{
|
{
|
||||||
"1": {
|
"1": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 32,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"2": {
|
"2": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
|
||||||
"BLOCK_SIZE_K": 128,
|
|
||||||
"GROUP_SIZE_M": 1,
|
|
||||||
"num_warps": 4,
|
|
||||||
"num_stages": 2,
|
|
||||||
"waves_per_eu": 0
|
|
||||||
},
|
|
||||||
"4": {
|
|
||||||
"BLOCK_SIZE_M": 64,
|
|
||||||
"BLOCK_SIZE_N": 16,
|
|
||||||
"BLOCK_SIZE_K": 128,
|
|
||||||
"GROUP_SIZE_M": 1,
|
|
||||||
"num_warps": 4,
|
|
||||||
"num_stages": 2,
|
|
||||||
"waves_per_eu": 0
|
|
||||||
},
|
|
||||||
"8": {
|
|
||||||
"BLOCK_SIZE_M": 64,
|
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 32,
|
"GROUP_SIZE_M": 32,
|
||||||
@@ -35,29 +17,47 @@
|
|||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"16": {
|
"4": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 16,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 16,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"24": {
|
"24": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"32": {
|
"32": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -75,7 +75,7 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -93,23 +93,23 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"256": {
|
"256": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 128,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 32,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"512": {
|
"512": {
|
||||||
"BLOCK_SIZE_M": 128,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
@@ -118,27 +118,27 @@
|
|||||||
},
|
},
|
||||||
"1024": {
|
"1024": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 64,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"1536": {
|
"1536": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 128,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"2048": {
|
"2048": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 64,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -147,7 +147,7 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 128,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 16,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -156,7 +156,7 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 128,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
|
|||||||
@@ -1,63 +1,63 @@
|
|||||||
{
|
{
|
||||||
"1": {
|
"1": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 32,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 16,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"2": {
|
"2": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 64,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 16,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"4": {
|
"4": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 64,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"8": {
|
"8": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"16": {
|
"16": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 16,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"24": {
|
"24": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 64,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"32": {
|
"32": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -66,43 +66,43 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 32,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"64": {
|
"64": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 4,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 32,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"96": {
|
|
||||||
"BLOCK_SIZE_M": 64,
|
|
||||||
"BLOCK_SIZE_N": 16,
|
|
||||||
"BLOCK_SIZE_K": 128,
|
|
||||||
"GROUP_SIZE_M": 1,
|
|
||||||
"num_warps": 4,
|
|
||||||
"num_stages": 2,
|
|
||||||
"waves_per_eu": 0
|
|
||||||
},
|
|
||||||
"128": {
|
"128": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 16,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"256": {
|
"256": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -117,17 +117,17 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"1024": {
|
"1024": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 128,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 16,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"1536": {
|
"1536": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 64,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
@@ -138,7 +138,7 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 128,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
|
|||||||
@@ -1,63 +1,63 @@
|
|||||||
{
|
{
|
||||||
"1": {
|
"1": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"2": {
|
"2": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"4": {
|
"4": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 16,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"8": {
|
"8": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"16": {
|
"16": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 16,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"24": {
|
"24": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"32": {
|
"32": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -66,7 +66,7 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -75,32 +75,32 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 8,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"96": {
|
"96": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"128": {
|
"128": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 16,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"256": {
|
"256": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 128,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 64,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
@@ -109,7 +109,7 @@
|
|||||||
},
|
},
|
||||||
"512": {
|
"512": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 64,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
@@ -120,16 +120,16 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 128,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"1536": {
|
"1536": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 64,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 32,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -138,7 +138,7 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 128,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -147,7 +147,7 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 128,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -156,7 +156,7 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 128,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
|
|||||||
@@ -1,15 +1,15 @@
|
|||||||
{
|
{
|
||||||
"1": {
|
"1": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"2": {
|
"2": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
@@ -18,7 +18,7 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"4": {
|
"4": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
@@ -27,16 +27,16 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"8": {
|
"8": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"16": {
|
"16": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
@@ -45,7 +45,7 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"24": {
|
"24": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
@@ -54,28 +54,28 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"32": {
|
"32": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"48": {
|
"48": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"64": {
|
"64": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -84,7 +84,7 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -93,7 +93,7 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 16,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -117,8 +117,8 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"1024": {
|
"1024": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 16,
|
"GROUP_SIZE_M": 16,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
@@ -126,8 +126,8 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"1536": {
|
"1536": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 128,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 32,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 8,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
@@ -136,27 +136,27 @@
|
|||||||
},
|
},
|
||||||
"2048": {
|
"2048": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"3072": {
|
|
||||||
"BLOCK_SIZE_M": 64,
|
|
||||||
"BLOCK_SIZE_N": 64,
|
|
||||||
"BLOCK_SIZE_K": 128,
|
|
||||||
"GROUP_SIZE_M": 32,
|
|
||||||
"num_warps": 4,
|
|
||||||
"num_stages": 2,
|
|
||||||
"waves_per_eu": 0
|
|
||||||
},
|
|
||||||
"4096": {
|
"4096": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 64,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 32,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
|
|||||||
@@ -1,15 +1,15 @@
|
|||||||
{
|
{
|
||||||
"1": {
|
"1": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 32,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"2": {
|
"2": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
@@ -18,7 +18,7 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"4": {
|
"4": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
@@ -27,34 +27,34 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"8": {
|
"8": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 16,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"16": {
|
"16": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"24": {
|
"24": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"32": {
|
"32": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
@@ -63,19 +63,19 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"48": {
|
"48": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"64": {
|
"64": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -84,7 +84,7 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -93,7 +93,7 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 16,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -102,7 +102,7 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 16,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -111,32 +111,32 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 8,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"1024": {
|
"1024": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"1536": {
|
"1536": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 128,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 32,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 16,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"2048": {
|
"2048": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 16,
|
"GROUP_SIZE_M": 16,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
@@ -144,19 +144,19 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"3072": {
|
"3072": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 128,
|
||||||
"BLOCK_SIZE_N": 64,
|
"BLOCK_SIZE_N": 32,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 16,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"4096": {
|
"4096": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 64,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 32,
|
"GROUP_SIZE_M": 16,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
|
|||||||
@@ -1,33 +1,33 @@
|
|||||||
{
|
{
|
||||||
"1": {
|
"1": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 16,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"2": {
|
"2": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"4": {
|
"4": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"8": {
|
"8": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
@@ -36,7 +36,7 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"16": {
|
"16": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
@@ -45,19 +45,19 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"24": {
|
"24": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 16,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"32": {
|
"32": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -66,14 +66,14 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 16,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"64": {
|
"64": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
@@ -81,8 +81,8 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"96": {
|
"96": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
@@ -90,17 +90,17 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"128": {
|
"128": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 8,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"256": {
|
"256": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 64,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
@@ -118,7 +118,7 @@
|
|||||||
},
|
},
|
||||||
"1024": {
|
"1024": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 64,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
@@ -127,7 +127,7 @@
|
|||||||
},
|
},
|
||||||
"1536": {
|
"1536": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 64,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
|
|||||||
@@ -1,6 +1,33 @@
|
|||||||
{
|
{
|
||||||
"1": {
|
"1": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 16,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 16,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 16,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 16,
|
"GROUP_SIZE_M": 16,
|
||||||
@@ -8,17 +35,26 @@
|
|||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"2": {
|
"16": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"4": {
|
"24": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 16,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
@@ -26,45 +62,9 @@
|
|||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"8": {
|
|
||||||
"BLOCK_SIZE_M": 64,
|
|
||||||
"BLOCK_SIZE_N": 16,
|
|
||||||
"BLOCK_SIZE_K": 128,
|
|
||||||
"GROUP_SIZE_M": 1,
|
|
||||||
"num_warps": 4,
|
|
||||||
"num_stages": 2,
|
|
||||||
"waves_per_eu": 0
|
|
||||||
},
|
|
||||||
"16": {
|
|
||||||
"BLOCK_SIZE_M": 64,
|
|
||||||
"BLOCK_SIZE_N": 16,
|
|
||||||
"BLOCK_SIZE_K": 128,
|
|
||||||
"GROUP_SIZE_M": 1,
|
|
||||||
"num_warps": 4,
|
|
||||||
"num_stages": 2,
|
|
||||||
"waves_per_eu": 0
|
|
||||||
},
|
|
||||||
"24": {
|
|
||||||
"BLOCK_SIZE_M": 64,
|
|
||||||
"BLOCK_SIZE_N": 16,
|
|
||||||
"BLOCK_SIZE_K": 128,
|
|
||||||
"GROUP_SIZE_M": 1,
|
|
||||||
"num_warps": 4,
|
|
||||||
"num_stages": 2,
|
|
||||||
"waves_per_eu": 0
|
|
||||||
},
|
|
||||||
"32": {
|
|
||||||
"BLOCK_SIZE_M": 64,
|
|
||||||
"BLOCK_SIZE_N": 16,
|
|
||||||
"BLOCK_SIZE_K": 128,
|
|
||||||
"GROUP_SIZE_M": 1,
|
|
||||||
"num_warps": 4,
|
|
||||||
"num_stages": 2,
|
|
||||||
"waves_per_eu": 0
|
|
||||||
},
|
|
||||||
"48": {
|
"48": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
@@ -72,8 +72,8 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"64": {
|
"64": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
@@ -81,8 +81,8 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"96": {
|
"96": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
@@ -91,7 +91,7 @@
|
|||||||
},
|
},
|
||||||
"128": {
|
"128": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 64,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
@@ -99,8 +99,8 @@
|
|||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"256": {
|
"256": {
|
||||||
"BLOCK_SIZE_M": 128,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 32,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
@@ -111,14 +111,14 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 128,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"1024": {
|
"1024": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 64,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
@@ -127,9 +127,9 @@
|
|||||||
},
|
},
|
||||||
"1536": {
|
"1536": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 64,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -138,7 +138,7 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 128,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -147,7 +147,7 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 128,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -156,7 +156,7 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 128,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
|
|||||||
@@ -1,61 +1,61 @@
|
|||||||
{
|
{
|
||||||
"1": {
|
"1": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 32,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"2": {
|
"2": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 32,
|
"GROUP_SIZE_M": 32,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"4": {
|
|
||||||
"BLOCK_SIZE_M": 64,
|
|
||||||
"BLOCK_SIZE_N": 16,
|
|
||||||
"BLOCK_SIZE_K": 128,
|
|
||||||
"GROUP_SIZE_M": 1,
|
|
||||||
"num_warps": 4,
|
|
||||||
"num_stages": 2,
|
|
||||||
"waves_per_eu": 0
|
|
||||||
},
|
|
||||||
"8": {
|
"8": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 64,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 16,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"16": {
|
"16": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 32,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 16,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"24": {
|
"24": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 32,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 16,
|
"GROUP_SIZE_M": 8,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"32": {
|
"32": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 32,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 16,
|
"GROUP_SIZE_M": 16,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
@@ -64,59 +64,14 @@
|
|||||||
},
|
},
|
||||||
"48": {
|
"48": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 32,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 16,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"64": {
|
"64": {
|
||||||
"BLOCK_SIZE_M": 64,
|
|
||||||
"BLOCK_SIZE_N": 16,
|
|
||||||
"BLOCK_SIZE_K": 128,
|
|
||||||
"GROUP_SIZE_M": 16,
|
|
||||||
"num_warps": 4,
|
|
||||||
"num_stages": 2,
|
|
||||||
"waves_per_eu": 0
|
|
||||||
},
|
|
||||||
"96": {
|
|
||||||
"BLOCK_SIZE_M": 64,
|
|
||||||
"BLOCK_SIZE_N": 16,
|
|
||||||
"BLOCK_SIZE_K": 128,
|
|
||||||
"GROUP_SIZE_M": 1,
|
|
||||||
"num_warps": 4,
|
|
||||||
"num_stages": 2,
|
|
||||||
"waves_per_eu": 0
|
|
||||||
},
|
|
||||||
"128": {
|
|
||||||
"BLOCK_SIZE_M": 64,
|
|
||||||
"BLOCK_SIZE_N": 32,
|
|
||||||
"BLOCK_SIZE_K": 128,
|
|
||||||
"GROUP_SIZE_M": 1,
|
|
||||||
"num_warps": 4,
|
|
||||||
"num_stages": 2,
|
|
||||||
"waves_per_eu": 0
|
|
||||||
},
|
|
||||||
"256": {
|
|
||||||
"BLOCK_SIZE_M": 64,
|
|
||||||
"BLOCK_SIZE_N": 32,
|
|
||||||
"BLOCK_SIZE_K": 128,
|
|
||||||
"GROUP_SIZE_M": 1,
|
|
||||||
"num_warps": 4,
|
|
||||||
"num_stages": 2,
|
|
||||||
"waves_per_eu": 0
|
|
||||||
},
|
|
||||||
"512": {
|
|
||||||
"BLOCK_SIZE_M": 128,
|
|
||||||
"BLOCK_SIZE_N": 32,
|
|
||||||
"BLOCK_SIZE_K": 128,
|
|
||||||
"GROUP_SIZE_M": 32,
|
|
||||||
"num_warps": 4,
|
|
||||||
"num_stages": 2,
|
|
||||||
"waves_per_eu": 0
|
|
||||||
},
|
|
||||||
"1024": {
|
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 64,
|
"BLOCK_SIZE_N": 64,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
@@ -125,20 +80,65 @@
|
|||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 4,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0
|
||||||
|
},
|
||||||
"1536": {
|
"1536": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 64,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
},
|
},
|
||||||
"2048": {
|
"2048": {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 32,
|
||||||
"BLOCK_SIZE_N": 128,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
@@ -156,7 +156,7 @@
|
|||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
"BLOCK_SIZE_N": 128,
|
"BLOCK_SIZE_N": 128,
|
||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 2,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0
|
"waves_per_eu": 0
|
||||||
|
|||||||
Reference in New Issue
Block a user