vulkan: Use larger workgroups for mul_mat_vec when M is small (#15355)
* vulkan: Use larger workgroups for mul_mat_vec when M is small Also use subgroup instructions for (part of) the reduction when supported. Without this, the more expensive reductions would eat into the benefits of the larger workgroups. * update heuristic for amd/intel Co-authored-by: 0cc4m <picard12@live.de> --------- Co-authored-by: 0cc4m <picard12@live.de>
This commit is contained in:
@@ -472,6 +472,9 @@ void process_shaders() {
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f32_f32_subgroup", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f16_f32_subgroup", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
|
||||
|
||||
string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
|
||||
|
||||
// Dequant shaders
|
||||
@@ -789,6 +792,18 @@ void write_output_files() {
|
||||
fputs(data.c_str(), src);
|
||||
fputs(len.c_str(), src);
|
||||
}
|
||||
|
||||
for (const std::string& btype : {"f16", "f32"}) {
|
||||
for (const auto& tname : type_names) {
|
||||
fprintf(hdr, "extern unsigned char *arr_dmmv_%s_%s_f32_data[2];\n", tname.c_str(), btype.c_str());
|
||||
fprintf(hdr, "extern uint64_t arr_dmmv_%s_%s_f32_len[2];\n", tname.c_str(), btype.c_str());
|
||||
std::string data = "unsigned char *arr_dmmv_" + tname + "_" + btype + "_f32_data[2] = {mul_mat_vec_" + tname + "_" + btype + "_f32_data, mul_mat_vec_" + tname + "_" + btype + "_f32_subgroup_data};\n";
|
||||
std::string len = "uint64_t arr_dmmv_" + tname + "_" + btype + "_f32_len[2] = {mul_mat_vec_" + tname + "_" + btype + "_f32_len, mul_mat_vec_" + tname + "_" + btype + "_f32_subgroup_len};\n";
|
||||
fputs(data.c_str(), src);
|
||||
fputs(len.c_str(), src);
|
||||
}
|
||||
}
|
||||
|
||||
fclose(hdr);
|
||||
fclose(src);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user