27727 lines
459 KiB
JSON
27727 lines
459 KiB
JSON
{
|
|
"nvidia_h200": {
|
|
"intermediate_2048_numtokens_256": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_256": {
|
|
"block_sizes": [
|
|
32,
|
|
512
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"default": {
|
|
"block_sizes": [
|
|
1,
|
|
512
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"tensor_descriptor",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_256": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_256": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_256": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_7688_numtokens_256": {
|
|
"block_sizes": [
|
|
8,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_256": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_1": {
|
|
"block_sizes": [
|
|
1,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_1": {
|
|
"block_sizes": [
|
|
1,
|
|
1
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_1": {
|
|
"block_sizes": [
|
|
1,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_1": {
|
|
"block_sizes": [
|
|
1,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_1": {
|
|
"block_sizes": [
|
|
1,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_1": {
|
|
"block_sizes": [
|
|
1,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_2": {
|
|
"block_sizes": [
|
|
2,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_2": {
|
|
"block_sizes": [
|
|
1,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_2": {
|
|
"block_sizes": [
|
|
2,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_2": {
|
|
"block_sizes": [
|
|
1,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_2": {
|
|
"block_sizes": [
|
|
1,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_2": {
|
|
"block_sizes": [
|
|
1,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_4": {
|
|
"block_sizes": [
|
|
1,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_4": {
|
|
"block_sizes": [
|
|
1,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_4": {
|
|
"block_sizes": [
|
|
4,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_4": {
|
|
"block_sizes": [
|
|
1,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_4": {
|
|
"block_sizes": [
|
|
1,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_4": {
|
|
"block_sizes": [
|
|
4,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_8": {
|
|
"block_sizes": [
|
|
8,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_8": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_8": {
|
|
"block_sizes": [
|
|
2,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_8": {
|
|
"block_sizes": [
|
|
4,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_8": {
|
|
"block_sizes": [
|
|
8,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_8": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_16": {
|
|
"block_sizes": [
|
|
16,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_16": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_16": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_16": {
|
|
"block_sizes": [
|
|
4,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_16": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_16": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_24": {
|
|
"block_sizes": [
|
|
16,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
"last"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_24": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_24": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_24": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_24": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_24": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_32": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_32": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_32": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_32": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_32": {
|
|
"block_sizes": [
|
|
16,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_32": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_40": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_40": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_40": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_40": {
|
|
"block_sizes": [
|
|
64,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_40": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_40": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_48": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_48": {
|
|
"block_sizes": [
|
|
8,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_48": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_48": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_48": {
|
|
"block_sizes": [
|
|
16,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_48": {
|
|
"block_sizes": [
|
|
64,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_56": {
|
|
"block_sizes": [
|
|
2,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_56": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_56": {
|
|
"block_sizes": [
|
|
32,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_56": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_56": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
"last"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_56": {
|
|
"block_sizes": [
|
|
64,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_64": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_64": {
|
|
"block_sizes": [
|
|
4,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_64": {
|
|
"block_sizes": [
|
|
2,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_64": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_64": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_64": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_72": {
|
|
"block_sizes": [
|
|
4,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_72": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_72": {
|
|
"block_sizes": [
|
|
64,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_72": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_72": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_72": {
|
|
"block_sizes": [
|
|
128,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_80": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_80": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_80": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_80": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_80": {
|
|
"block_sizes": [
|
|
64,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_80": {
|
|
"block_sizes": [
|
|
32,
|
|
512
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_88": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_88": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_88": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"tensor_descriptor",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_88": {
|
|
"block_sizes": [
|
|
128,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"first",
|
|
"last"
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_88": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_88": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_96": {
|
|
"block_sizes": [
|
|
128,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_96": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_96": {
|
|
"block_sizes": [
|
|
16,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 3,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_96": {
|
|
"block_sizes": [
|
|
64,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_96": {
|
|
"block_sizes": [
|
|
64,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_96": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_104": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_104": {
|
|
"block_sizes": [
|
|
64,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_104": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_104": {
|
|
"block_sizes": [
|
|
8,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_104": {
|
|
"block_sizes": [
|
|
128,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_104": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_112": {
|
|
"block_sizes": [
|
|
32,
|
|
1024
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_112": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_112": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_112": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_112": {
|
|
"block_sizes": [
|
|
16,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_112": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_120": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_120": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_120": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_120": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_120": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_120": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_128": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_128": {
|
|
"block_sizes": [
|
|
128,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_128": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_128": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_128": {
|
|
"block_sizes": [
|
|
128,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_128": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 3,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_136": {
|
|
"block_sizes": [
|
|
128,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 3,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_136": {
|
|
"block_sizes": [
|
|
8,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_136": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_136": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_136": {
|
|
"block_sizes": [
|
|
16,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 3,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_136": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_144": {
|
|
"block_sizes": [
|
|
8,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_144": {
|
|
"block_sizes": [
|
|
256,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_144": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_144": {
|
|
"block_sizes": [
|
|
128,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_144": {
|
|
"block_sizes": [
|
|
32,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_144": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_152": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_152": {
|
|
"block_sizes": [
|
|
16,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_152": {
|
|
"block_sizes": [
|
|
64,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_152": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_152": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_152": {
|
|
"block_sizes": [
|
|
64,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_160": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_160": {
|
|
"block_sizes": [
|
|
128,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_160": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_160": {
|
|
"block_sizes": [
|
|
64,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_160": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_160": {
|
|
"block_sizes": [
|
|
128,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_168": {
|
|
"block_sizes": [
|
|
128,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_168": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_168": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_168": {
|
|
"block_sizes": [
|
|
64,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_168": {
|
|
"block_sizes": [
|
|
64,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_168": {
|
|
"block_sizes": [
|
|
32,
|
|
512
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_176": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_176": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_176": {
|
|
"block_sizes": [
|
|
4,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_176": {
|
|
"block_sizes": [
|
|
8,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_176": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_176": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_184": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_184": {
|
|
"block_sizes": [
|
|
8,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_184": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_184": {
|
|
"block_sizes": [
|
|
8,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_184": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_184": {
|
|
"block_sizes": [
|
|
16,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
"last"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_192": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_192": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_192": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_192": {
|
|
"block_sizes": [
|
|
4,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_192": {
|
|
"block_sizes": [
|
|
32,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_192": {
|
|
"block_sizes": [
|
|
8,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_200": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_200": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_200": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_200": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_200": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_200": {
|
|
"block_sizes": [
|
|
16,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_208": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_208": {
|
|
"block_sizes": [
|
|
64,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_208": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_208": {
|
|
"block_sizes": [
|
|
256,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_208": {
|
|
"block_sizes": [
|
|
64,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_208": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_216": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_216": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_216": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_216": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_216": {
|
|
"block_sizes": [
|
|
32,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_216": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_224": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_224": {
|
|
"block_sizes": [
|
|
64,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_224": {
|
|
"block_sizes": [
|
|
64,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_224": {
|
|
"block_sizes": [
|
|
16,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_224": {
|
|
"block_sizes": [
|
|
256,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_224": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_232": {
|
|
"block_sizes": [
|
|
16,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_232": {
|
|
"block_sizes": [
|
|
64,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_232": {
|
|
"block_sizes": [
|
|
16,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_232": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_232": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_232": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_240": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_240": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_240": {
|
|
"block_sizes": [
|
|
16,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_240": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_240": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_240": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_248": {
|
|
"block_sizes": [
|
|
16,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_248": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_248": {
|
|
"block_sizes": [
|
|
256,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_248": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_248": {
|
|
"block_sizes": [
|
|
64,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_248": {
|
|
"block_sizes": [
|
|
64,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_272": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_272": {
|
|
"block_sizes": [
|
|
8,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_272": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_272": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_272": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_272": {
|
|
"block_sizes": [
|
|
64,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_288": {
|
|
"block_sizes": [
|
|
4,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_288": {
|
|
"block_sizes": [
|
|
8,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_288": {
|
|
"block_sizes": [
|
|
64,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_288": {
|
|
"block_sizes": [
|
|
128,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_288": {
|
|
"block_sizes": [
|
|
256,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_288": {
|
|
"block_sizes": [
|
|
16,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_304": {
|
|
"block_sizes": [
|
|
8,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_304": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_304": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_304": {
|
|
"block_sizes": [
|
|
8,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_304": {
|
|
"block_sizes": [
|
|
64,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_304": {
|
|
"block_sizes": [
|
|
64,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_320": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_320": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_320": {
|
|
"block_sizes": [
|
|
512,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_320": {
|
|
"block_sizes": [
|
|
64,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_320": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_320": {
|
|
"block_sizes": [
|
|
128,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_336": {
|
|
"block_sizes": [
|
|
2,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_336": {
|
|
"block_sizes": [
|
|
8,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_336": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_336": {
|
|
"block_sizes": [
|
|
64,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_336": {
|
|
"block_sizes": [
|
|
8,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_336": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_352": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_352": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_352": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_352": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_352": {
|
|
"block_sizes": [
|
|
8,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_352": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_368": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_368": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_368": {
|
|
"block_sizes": [
|
|
8,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_368": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_368": {
|
|
"block_sizes": [
|
|
32,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_368": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_384": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_384": {
|
|
"block_sizes": [
|
|
64,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_384": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_384": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_384": {
|
|
"block_sizes": [
|
|
8,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_384": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_400": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_400": {
|
|
"block_sizes": [
|
|
8,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_400": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_400": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_400": {
|
|
"block_sizes": [
|
|
256,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_400": {
|
|
"block_sizes": [
|
|
8,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_416": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_416": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_416": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_416": {
|
|
"block_sizes": [
|
|
128,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_416": {
|
|
"block_sizes": [
|
|
64,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_416": {
|
|
"block_sizes": [
|
|
32,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_432": {
|
|
"block_sizes": [
|
|
16,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_432": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_432": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_432": {
|
|
"block_sizes": [
|
|
16,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_432": {
|
|
"block_sizes": [
|
|
16,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_432": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_448": {
|
|
"block_sizes": [
|
|
4,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_448": {
|
|
"block_sizes": [
|
|
8,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_448": {
|
|
"block_sizes": [
|
|
4,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_448": {
|
|
"block_sizes": [
|
|
32,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_448": {
|
|
"block_sizes": [
|
|
16,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_448": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_464": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_464": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_464": {
|
|
"block_sizes": [
|
|
16,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_464": {
|
|
"block_sizes": [
|
|
8,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_464": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_464": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_480": {
|
|
"block_sizes": [
|
|
4,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_480": {
|
|
"block_sizes": [
|
|
4,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
"first"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_480": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_480": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_480": {
|
|
"block_sizes": [
|
|
64,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_480": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_496": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_496": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_496": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_496": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_496": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_496": {
|
|
"block_sizes": [
|
|
256,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_512": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_512": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_512": {
|
|
"block_sizes": [
|
|
128,
|
|
512
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_512": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_512": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_512": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat",
|
|
"range_warp_specializes": []
|
|
}
|
|
},
|
|
"nvidia_h100": {
|
|
"intermediate_2048_numtokens_256": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_256": {
|
|
"block_sizes": [
|
|
32,
|
|
512
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"default": {
|
|
"block_sizes": [
|
|
1,
|
|
512
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"tensor_descriptor",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_256": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_256": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_256": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_7688_numtokens_256": {
|
|
"block_sizes": [
|
|
8,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_256": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_1": {
|
|
"block_sizes": [
|
|
1,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_1": {
|
|
"block_sizes": [
|
|
1,
|
|
1
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_1": {
|
|
"block_sizes": [
|
|
1,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_1": {
|
|
"block_sizes": [
|
|
1,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_1": {
|
|
"block_sizes": [
|
|
1,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_1": {
|
|
"block_sizes": [
|
|
1,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_2": {
|
|
"block_sizes": [
|
|
2,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_2": {
|
|
"block_sizes": [
|
|
1,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_2": {
|
|
"block_sizes": [
|
|
2,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_2": {
|
|
"block_sizes": [
|
|
1,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_2": {
|
|
"block_sizes": [
|
|
1,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_2": {
|
|
"block_sizes": [
|
|
1,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_4": {
|
|
"block_sizes": [
|
|
1,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_4": {
|
|
"block_sizes": [
|
|
1,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_4": {
|
|
"block_sizes": [
|
|
4,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_4": {
|
|
"block_sizes": [
|
|
1,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_4": {
|
|
"block_sizes": [
|
|
1,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_4": {
|
|
"block_sizes": [
|
|
4,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_8": {
|
|
"block_sizes": [
|
|
8,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_8": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_8": {
|
|
"block_sizes": [
|
|
2,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_8": {
|
|
"block_sizes": [
|
|
4,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_8": {
|
|
"block_sizes": [
|
|
8,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_8": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_16": {
|
|
"block_sizes": [
|
|
16,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_16": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_16": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_16": {
|
|
"block_sizes": [
|
|
4,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_16": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_16": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_24": {
|
|
"block_sizes": [
|
|
16,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
"last"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_24": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_24": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_24": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_24": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_24": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_32": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_32": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_32": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_32": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_32": {
|
|
"block_sizes": [
|
|
16,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_32": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_40": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_40": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_40": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_40": {
|
|
"block_sizes": [
|
|
64,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_40": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_40": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_48": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_48": {
|
|
"block_sizes": [
|
|
8,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_48": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_48": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_48": {
|
|
"block_sizes": [
|
|
16,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_48": {
|
|
"block_sizes": [
|
|
64,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_56": {
|
|
"block_sizes": [
|
|
2,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_56": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_56": {
|
|
"block_sizes": [
|
|
32,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_56": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_56": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
"last"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_56": {
|
|
"block_sizes": [
|
|
64,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_64": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_64": {
|
|
"block_sizes": [
|
|
4,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_64": {
|
|
"block_sizes": [
|
|
2,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_64": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_64": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_64": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_72": {
|
|
"block_sizes": [
|
|
4,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_72": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_72": {
|
|
"block_sizes": [
|
|
64,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_72": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_72": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_72": {
|
|
"block_sizes": [
|
|
128,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_80": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_80": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_80": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_80": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_80": {
|
|
"block_sizes": [
|
|
64,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_80": {
|
|
"block_sizes": [
|
|
32,
|
|
512
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_88": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_88": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_88": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"tensor_descriptor",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_88": {
|
|
"block_sizes": [
|
|
128,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"first",
|
|
"last"
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_88": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_88": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_96": {
|
|
"block_sizes": [
|
|
128,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_96": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_96": {
|
|
"block_sizes": [
|
|
16,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 3,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_96": {
|
|
"block_sizes": [
|
|
64,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_96": {
|
|
"block_sizes": [
|
|
64,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_96": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_104": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_104": {
|
|
"block_sizes": [
|
|
64,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_104": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_104": {
|
|
"block_sizes": [
|
|
8,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_104": {
|
|
"block_sizes": [
|
|
128,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_104": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_112": {
|
|
"block_sizes": [
|
|
32,
|
|
1024
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_112": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_112": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_112": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_112": {
|
|
"block_sizes": [
|
|
16,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_112": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_120": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_120": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_120": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_120": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_120": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_120": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_128": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_128": {
|
|
"block_sizes": [
|
|
128,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_128": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_128": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_128": {
|
|
"block_sizes": [
|
|
128,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_128": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 3,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_136": {
|
|
"block_sizes": [
|
|
128,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 3,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_136": {
|
|
"block_sizes": [
|
|
8,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_136": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_136": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_136": {
|
|
"block_sizes": [
|
|
16,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 3,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_136": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_144": {
|
|
"block_sizes": [
|
|
8,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_144": {
|
|
"block_sizes": [
|
|
256,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_144": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_144": {
|
|
"block_sizes": [
|
|
128,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_144": {
|
|
"block_sizes": [
|
|
32,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_144": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_152": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_152": {
|
|
"block_sizes": [
|
|
16,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_152": {
|
|
"block_sizes": [
|
|
64,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_152": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_152": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_152": {
|
|
"block_sizes": [
|
|
64,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_160": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_160": {
|
|
"block_sizes": [
|
|
128,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_160": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_160": {
|
|
"block_sizes": [
|
|
64,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_160": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_160": {
|
|
"block_sizes": [
|
|
128,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_168": {
|
|
"block_sizes": [
|
|
128,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_168": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_168": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_168": {
|
|
"block_sizes": [
|
|
64,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_168": {
|
|
"block_sizes": [
|
|
64,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_168": {
|
|
"block_sizes": [
|
|
32,
|
|
512
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_176": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_176": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_176": {
|
|
"block_sizes": [
|
|
4,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_176": {
|
|
"block_sizes": [
|
|
8,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_176": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_176": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_184": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_184": {
|
|
"block_sizes": [
|
|
8,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_184": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_184": {
|
|
"block_sizes": [
|
|
8,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_184": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_184": {
|
|
"block_sizes": [
|
|
16,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
"last"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_192": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_192": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_192": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_192": {
|
|
"block_sizes": [
|
|
4,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_192": {
|
|
"block_sizes": [
|
|
32,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_192": {
|
|
"block_sizes": [
|
|
8,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_200": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_200": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_200": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_200": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_200": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_200": {
|
|
"block_sizes": [
|
|
16,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_208": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_208": {
|
|
"block_sizes": [
|
|
64,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_208": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_208": {
|
|
"block_sizes": [
|
|
256,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_208": {
|
|
"block_sizes": [
|
|
64,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_208": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_216": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_216": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_216": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_216": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_216": {
|
|
"block_sizes": [
|
|
32,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_216": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_224": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_224": {
|
|
"block_sizes": [
|
|
64,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_224": {
|
|
"block_sizes": [
|
|
64,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_224": {
|
|
"block_sizes": [
|
|
16,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_224": {
|
|
"block_sizes": [
|
|
256,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_224": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_232": {
|
|
"block_sizes": [
|
|
16,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_232": {
|
|
"block_sizes": [
|
|
64,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_232": {
|
|
"block_sizes": [
|
|
16,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_232": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_232": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_232": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_240": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_240": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_240": {
|
|
"block_sizes": [
|
|
16,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_240": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_240": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_240": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_248": {
|
|
"block_sizes": [
|
|
16,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_248": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_248": {
|
|
"block_sizes": [
|
|
256,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_248": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_248": {
|
|
"block_sizes": [
|
|
64,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_248": {
|
|
"block_sizes": [
|
|
64,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_272": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_272": {
|
|
"block_sizes": [
|
|
8,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_272": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
1,
|
|
0
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_272": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_272": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_272": {
|
|
"block_sizes": [
|
|
64,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_288": {
|
|
"block_sizes": [
|
|
4,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_288": {
|
|
"block_sizes": [
|
|
8,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_288": {
|
|
"block_sizes": [
|
|
64,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_288": {
|
|
"block_sizes": [
|
|
128,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_288": {
|
|
"block_sizes": [
|
|
256,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_288": {
|
|
"block_sizes": [
|
|
16,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_304": {
|
|
"block_sizes": [
|
|
8,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_304": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_304": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_304": {
|
|
"block_sizes": [
|
|
8,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_304": {
|
|
"block_sizes": [
|
|
64,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_304": {
|
|
"block_sizes": [
|
|
64,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_320": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_320": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_320": {
|
|
"block_sizes": [
|
|
512,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_320": {
|
|
"block_sizes": [
|
|
64,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_320": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_320": {
|
|
"block_sizes": [
|
|
128,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_336": {
|
|
"block_sizes": [
|
|
2,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_336": {
|
|
"block_sizes": [
|
|
8,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_336": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_336": {
|
|
"block_sizes": [
|
|
64,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_336": {
|
|
"block_sizes": [
|
|
8,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_336": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_352": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_352": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_352": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_352": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_352": {
|
|
"block_sizes": [
|
|
8,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_352": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_368": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_368": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_368": {
|
|
"block_sizes": [
|
|
8,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_368": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_368": {
|
|
"block_sizes": [
|
|
32,
|
|
4
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_368": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_384": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_384": {
|
|
"block_sizes": [
|
|
64,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_384": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_384": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_384": {
|
|
"block_sizes": [
|
|
8,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_384": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_400": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_400": {
|
|
"block_sizes": [
|
|
8,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_400": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_400": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_400": {
|
|
"block_sizes": [
|
|
256,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_400": {
|
|
"block_sizes": [
|
|
8,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_416": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_416": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_416": {
|
|
"block_sizes": [
|
|
64,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_416": {
|
|
"block_sizes": [
|
|
128,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_416": {
|
|
"block_sizes": [
|
|
64,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_416": {
|
|
"block_sizes": [
|
|
32,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_432": {
|
|
"block_sizes": [
|
|
16,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_432": {
|
|
"block_sizes": [
|
|
32,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_432": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_432": {
|
|
"block_sizes": [
|
|
16,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_432": {
|
|
"block_sizes": [
|
|
16,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_432": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_448": {
|
|
"block_sizes": [
|
|
4,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_448": {
|
|
"block_sizes": [
|
|
8,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_448": {
|
|
"block_sizes": [
|
|
4,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_448": {
|
|
"block_sizes": [
|
|
32,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_448": {
|
|
"block_sizes": [
|
|
16,
|
|
256
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_448": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"last",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_464": {
|
|
"block_sizes": [
|
|
32,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_464": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_464": {
|
|
"block_sizes": [
|
|
16,
|
|
64
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_464": {
|
|
"block_sizes": [
|
|
8,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_464": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"first"
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_464": {
|
|
"block_sizes": [
|
|
128,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_480": {
|
|
"block_sizes": [
|
|
4,
|
|
16
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_480": {
|
|
"block_sizes": [
|
|
4,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
"first"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_480": {
|
|
"block_sizes": [
|
|
8,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_480": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_480": {
|
|
"block_sizes": [
|
|
64,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_480": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"last",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_496": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_496": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"first",
|
|
""
|
|
],
|
|
"num_warps": 8,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_496": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
4
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 2,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_496": {
|
|
"block_sizes": [
|
|
32,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"tensor_descriptor"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_496": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 4,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_496": {
|
|
"block_sizes": [
|
|
256,
|
|
8
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2048_numtokens_512": {
|
|
"block_sizes": [
|
|
32,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
"last"
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_2880_numtokens_512": {
|
|
"block_sizes": [
|
|
16,
|
|
32
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_4096_numtokens_512": {
|
|
"block_sizes": [
|
|
128,
|
|
512
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 16,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_8192_numtokens_512": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
false
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 32,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_11008_numtokens_512": {
|
|
"block_sizes": [
|
|
32,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
1
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_warp_specializes": [],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 1,
|
|
"indexing": [
|
|
"tensor_descriptor",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat"
|
|
},
|
|
"intermediate_14336_numtokens_512": {
|
|
"block_sizes": [
|
|
16,
|
|
128
|
|
],
|
|
"loop_orders": [
|
|
[
|
|
0,
|
|
1
|
|
]
|
|
],
|
|
"flatten_loops": [
|
|
true
|
|
],
|
|
"l2_groupings": [
|
|
2
|
|
],
|
|
"range_unroll_factors": [
|
|
0
|
|
],
|
|
"range_num_stages": [
|
|
0
|
|
],
|
|
"range_multi_buffers": [
|
|
null
|
|
],
|
|
"range_flattens": [
|
|
null
|
|
],
|
|
"load_eviction_policies": [
|
|
"first",
|
|
"",
|
|
""
|
|
],
|
|
"num_warps": 1,
|
|
"num_stages": 2,
|
|
"indexing": [
|
|
"pointer",
|
|
"pointer",
|
|
"pointer",
|
|
"pointer"
|
|
],
|
|
"pid_type": "flat",
|
|
"range_warp_specializes": []
|
|
}
|
|
}
|
|
}
|