{
|
|
"results": {
|
|
"mmlu": {
|
|
"acc,none": 0.6782509614015098,
|
|
"acc_stderr,none": 0.0037024105374157067,
|
|
"alias": "mmlu"
|
|
},
|
|
"mmlu_humanities": {
|
|
"acc,none": 0.5787460148777895,
|
|
"acc_stderr,none": 0.006609876050554653,
|
|
"alias": " - humanities"
|
|
},
|
|
"mmlu_formal_logic": {
|
|
"alias": " - formal_logic",
|
|
"acc,none": 0.5873015873015873,
|
|
"acc_stderr,none": 0.04403438954768176
|
|
},
|
|
"mmlu_high_school_european_history": {
|
|
"alias": " - high_school_european_history",
|
|
"acc,none": 0.7818181818181819,
|
|
"acc_stderr,none": 0.032250781083062896
|
|
},
|
|
"mmlu_high_school_us_history": {
|
|
"alias": " - high_school_us_history",
|
|
"acc,none": 0.8480392156862745,
|
|
"acc_stderr,none": 0.025195658428931792
|
|
},
|
|
"mmlu_high_school_world_history": {
|
|
"alias": " - high_school_world_history",
|
|
"acc,none": 0.8312236286919831,
|
|
"acc_stderr,none": 0.02438140683258622
|
|
},
|
|
"mmlu_international_law": {
|
|
"alias": " - international_law",
|
|
"acc,none": 0.7768595041322314,
|
|
"acc_stderr,none": 0.03800754475228733
|
|
},
|
|
"mmlu_jurisprudence": {
|
|
"alias": " - jurisprudence",
|
|
"acc,none": 0.7870370370370371,
|
|
"acc_stderr,none": 0.0395783547198098
|
|
},
|
|
"mmlu_logical_fallacies": {
|
|
"alias": " - logical_fallacies",
|
|
"acc,none": 0.8282208588957055,
|
|
"acc_stderr,none": 0.029634717272371037
|
|
},
|
|
"mmlu_moral_disputes": {
|
|
"alias": " - moral_disputes",
|
|
"acc,none": 0.7109826589595376,
|
|
"acc_stderr,none": 0.024405173935783234
|
|
},
|
|
"mmlu_moral_scenarios": {
|
|
"alias": " - moral_scenarios",
|
|
"acc,none": 0.2837988826815642,
|
|
"acc_stderr,none": 0.015078358970751772
|
|
},
|
|
"mmlu_philosophy": {
|
|
"alias": " - philosophy",
|
|
"acc,none": 0.7331189710610932,
|
|
"acc_stderr,none": 0.025122637608816643
|
|
},
|
|
"mmlu_prehistory": {
|
|
"alias": " - prehistory",
|
|
"acc,none": 0.7623456790123457,
|
|
"acc_stderr,none": 0.02368359183700856
|
|
},
|
|
"mmlu_professional_law": {
|
|
"alias": " - professional_law",
|
|
"acc,none": 0.47392438070404175,
|
|
"acc_stderr,none": 0.012752858346533133
|
|
},
|
|
"mmlu_world_religions": {
|
|
"alias": " - world_religions",
|
|
"acc,none": 0.783625730994152,
|
|
"acc_stderr,none": 0.031581495393387324
|
|
},
|
|
"mmlu_other": {
|
|
"acc,none": 0.7212745413582233,
|
|
"acc_stderr,none": 0.00782940794550645,
|
|
"alias": " - other"
|
|
},
|
|
"mmlu_business_ethics": {
|
|
"alias": " - business_ethics",
|
|
"acc,none": 0.73,
|
|
"acc_stderr,none": 0.044619604333847394
|
|
},
|
|
"mmlu_clinical_knowledge": {
|
|
"alias": " - clinical_knowledge",
|
|
"acc,none": 0.7547169811320755,
|
|
"acc_stderr,none": 0.026480357179895685
|
|
},
|
|
"mmlu_college_medicine": {
|
|
"alias": " - college_medicine",
|
|
"acc,none": 0.7109826589595376,
|
|
"acc_stderr,none": 0.034564257450869995
|
|
},
|
|
"mmlu_global_facts": {
|
|
"alias": " - global_facts",
|
|
"acc,none": 0.41,
|
|
"acc_stderr,none": 0.049431107042371025
|
|
},
|
|
"mmlu_human_aging": {
|
|
"alias": " - human_aging",
|
|
"acc,none": 0.6771300448430493,
|
|
"acc_stderr,none": 0.031381476375754995
|
|
},
|
|
"mmlu_management": {
|
|
"alias": " - management",
|
|
"acc,none": 0.8446601941747572,
|
|
"acc_stderr,none": 0.03586594738573974
|
|
},
|
|
"mmlu_marketing": {
|
|
"alias": " - marketing",
|
|
"acc,none": 0.8547008547008547,
|
|
"acc_stderr,none": 0.0230866350868414
|
|
},
|
|
"mmlu_medical_genetics": {
|
|
"alias": " - medical_genetics",
|
|
"acc,none": 0.75,
|
|
"acc_stderr,none": 0.04351941398892446
|
|
},
|
|
"mmlu_miscellaneous": {
|
|
"alias": " - miscellaneous",
|
|
"acc,none": 0.7969348659003831,
|
|
"acc_stderr,none": 0.01438552507661157
|
|
},
|
|
"mmlu_nutrition": {
|
|
"alias": " - nutrition",
|
|
"acc,none": 0.7483660130718954,
|
|
"acc_stderr,none": 0.02484801826387519
|
|
},
|
|
"mmlu_professional_accounting": {
|
|
"alias": " - professional_accounting",
|
|
"acc,none": 0.549645390070922,
|
|
"acc_stderr,none": 0.02968010556502904
|
|
},
|
|
"mmlu_professional_medicine": {
|
|
"alias": " - professional_medicine",
|
|
"acc,none": 0.7169117647058824,
|
|
"acc_stderr,none": 0.02736586113151381
|
|
},
|
|
"mmlu_virology": {
|
|
"alias": " - virology",
|
|
"acc,none": 0.5301204819277109,
|
|
"acc_stderr,none": 0.03885425420866767
|
|
},
|
|
"mmlu_social_sciences": {
|
|
"acc,none": 0.7838804029899252,
|
|
"acc_stderr,none": 0.007332607468558831,
|
|
"alias": " - social sciences"
|
|
},
|
|
"mmlu_econometrics": {
|
|
"alias": " - econometrics",
|
|
"acc,none": 0.6403508771929824,
|
|
"acc_stderr,none": 0.04514496132873633
|
|
},
|
|
"mmlu_high_school_geography": {
|
|
"alias": " - high_school_geography",
|
|
"acc,none": 0.8434343434343434,
|
|
"acc_stderr,none": 0.025890520358141454
|
|
},
|
|
"mmlu_high_school_government_and_politics": {
|
|
"alias": " - high_school_government_and_politics",
|
|
"acc,none": 0.8704663212435233,
|
|
"acc_stderr,none": 0.024233532297758723
|
|
},
|
|
"mmlu_high_school_macroeconomics": {
|
|
"alias": " - high_school_macroeconomics",
|
|
"acc,none": 0.7461538461538462,
|
|
"acc_stderr,none": 0.022066054378726257
|
|
},
|
|
"mmlu_high_school_microeconomics": {
|
|
"alias": " - high_school_microeconomics",
|
|
"acc,none": 0.8067226890756303,
|
|
"acc_stderr,none": 0.02564947026588919
|
|
},
|
|
"mmlu_high_school_psychology": {
|
|
"alias": " - high_school_psychology",
|
|
"acc,none": 0.8715596330275229,
|
|
"acc_stderr,none": 0.014344977542914313
|
|
},
|
|
"mmlu_human_sexuality": {
|
|
"alias": " - human_sexuality",
|
|
"acc,none": 0.7557251908396947,
|
|
"acc_stderr,none": 0.037683359597287434
|
|
},
|
|
"mmlu_professional_psychology": {
|
|
"alias": " - professional_psychology",
|
|
"acc,none": 0.7124183006535948,
|
|
"acc_stderr,none": 0.018311653053648222
|
|
},
|
|
"mmlu_public_relations": {
|
|
"alias": " - public_relations",
|
|
"acc,none": 0.7181818181818181,
|
|
"acc_stderr,none": 0.043091187099464585
|
|
},
|
|
"mmlu_security_studies": {
|
|
"alias": " - security_studies",
|
|
"acc,none": 0.7428571428571429,
|
|
"acc_stderr,none": 0.027979823538744546
|
|
},
|
|
"mmlu_sociology": {
|
|
"alias": " - sociology",
|
|
"acc,none": 0.8407960199004975,
|
|
"acc_stderr,none": 0.02587064676616914
|
|
},
|
|
"mmlu_us_foreign_policy": {
|
|
"alias": " - us_foreign_policy",
|
|
"acc,none": 0.81,
|
|
"acc_stderr,none": 0.03942772444036625
|
|
},
|
|
"mmlu_stem": {
|
|
"acc,none": 0.681255946717412,
|
|
"acc_stderr,none": 0.007991359841827956,
|
|
"alias": " - stem"
|
|
},
|
|
"mmlu_abstract_algebra": {
|
|
"alias": " - abstract_algebra",
|
|
"acc,none": 0.45,
|
|
"acc_stderr,none": 0.05
|
|
},
|
|
"mmlu_anatomy": {
|
|
"alias": " - anatomy",
|
|
"acc,none": 0.6148148148148148,
|
|
"acc_stderr,none": 0.042039210401562783
|
|
},
|
|
"mmlu_astronomy": {
|
|
"alias": " - astronomy",
|
|
"acc,none": 0.7894736842105263,
|
|
"acc_stderr,none": 0.033176727875331574
|
|
},
|
|
"mmlu_college_biology": {
|
|
"alias": " - college_biology",
|
|
"acc,none": 0.8333333333333334,
|
|
"acc_stderr,none": 0.031164899666948614
|
|
},
|
|
"mmlu_college_chemistry": {
|
|
"alias": " - college_chemistry",
|
|
"acc,none": 0.54,
|
|
"acc_stderr,none": 0.05009082659620333
|
|
},
|
|
"mmlu_college_computer_science": {
|
|
"alias": " - college_computer_science",
|
|
"acc,none": 0.69,
|
|
"acc_stderr,none": 0.04648231987117316
|
|
},
|
|
"mmlu_college_mathematics": {
|
|
"alias": " - college_mathematics",
|
|
"acc,none": 0.58,
|
|
"acc_stderr,none": 0.049604496374885836
|
|
},
|
|
"mmlu_college_physics": {
|
|
"alias": " - college_physics",
|
|
"acc,none": 0.5392156862745098,
|
|
"acc_stderr,none": 0.04959859966384181
|
|
},
|
|
"mmlu_computer_security": {
|
|
"alias": " - computer_security",
|
|
"acc,none": 0.8,
|
|
"acc_stderr,none": 0.04020151261036846
|
|
},
|
|
"mmlu_conceptual_physics": {
|
|
"alias": " - conceptual_physics",
|
|
"acc,none": 0.7702127659574468,
|
|
"acc_stderr,none": 0.027501752944412417
|
|
},
|
|
"mmlu_electrical_engineering": {
|
|
"alias": " - electrical_engineering",
|
|
"acc,none": 0.7655172413793103,
|
|
"acc_stderr,none": 0.035306258743465914
|
|
},
|
|
"mmlu_elementary_mathematics": {
|
|
"alias": " - elementary_mathematics",
|
|
"acc,none": 0.656084656084656,
|
|
"acc_stderr,none": 0.024464426625596444
|
|
},
|
|
"mmlu_high_school_biology": {
|
|
"alias": " - high_school_biology",
|
|
"acc,none": 0.8612903225806452,
|
|
"acc_stderr,none": 0.019662961321414027
|
|
},
|
|
"mmlu_high_school_chemistry": {
|
|
"alias": " - high_school_chemistry",
|
|
"acc,none": 0.7044334975369458,
|
|
"acc_stderr,none": 0.032104944337514575
|
|
},
|
|
"mmlu_high_school_computer_science": {
|
|
"alias": " - high_school_computer_science",
|
|
"acc,none": 0.86,
|
|
"acc_stderr,none": 0.034873508801977704
|
|
},
|
|
"mmlu_high_school_mathematics": {
|
|
"alias": " - high_school_mathematics",
|
|
"acc,none": 0.42592592592592593,
|
|
"acc_stderr,none": 0.030149135601365947
|
|
},
|
|
"mmlu_high_school_physics": {
|
|
"alias": " - high_school_physics",
|
|
"acc,none": 0.6291390728476821,
|
|
"acc_stderr,none": 0.03943966699183629
|
|
},
|
|
"mmlu_high_school_statistics": {
|
|
"alias": " - high_school_statistics",
|
|
"acc,none": 0.7129629629629629,
|
|
"acc_stderr,none": 0.030851992993257013
|
|
},
|
|
"mmlu_machine_learning": {
|
|
"alias": " - machine_learning",
|
|
"acc,none": 0.5714285714285714,
|
|
"acc_stderr,none": 0.04697113923010212
|
|
}
|
|
},
|
|
"configs": {
|
|
"mmlu_abstract_algebra": {
|
|
"task": "mmlu_abstract_algebra",
|
|
"task_alias": "abstract_algebra",
|
|
"tag": "mmlu_stem_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "abstract_algebra",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_anatomy": {
|
|
"task": "mmlu_anatomy",
|
|
"task_alias": "anatomy",
|
|
"tag": "mmlu_stem_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "anatomy",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about anatomy.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_astronomy": {
|
|
"task": "mmlu_astronomy",
|
|
"task_alias": "astronomy",
|
|
"tag": "mmlu_stem_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "astronomy",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about astronomy.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_business_ethics": {
|
|
"task": "mmlu_business_ethics",
|
|
"task_alias": "business_ethics",
|
|
"tag": "mmlu_other_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "business_ethics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about business ethics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_clinical_knowledge": {
|
|
"task": "mmlu_clinical_knowledge",
|
|
"task_alias": "clinical_knowledge",
|
|
"tag": "mmlu_other_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "clinical_knowledge",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_college_biology": {
|
|
"task": "mmlu_college_biology",
|
|
"task_alias": "college_biology",
|
|
"tag": "mmlu_stem_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "college_biology",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about college biology.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_college_chemistry": {
|
|
"task": "mmlu_college_chemistry",
|
|
"task_alias": "college_chemistry",
|
|
"tag": "mmlu_stem_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "college_chemistry",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about college chemistry.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_college_computer_science": {
|
|
"task": "mmlu_college_computer_science",
|
|
"task_alias": "college_computer_science",
|
|
"tag": "mmlu_stem_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "college_computer_science",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about college computer science.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_college_mathematics": {
|
|
"task": "mmlu_college_mathematics",
|
|
"task_alias": "college_mathematics",
|
|
"tag": "mmlu_stem_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "college_mathematics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about college mathematics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_college_medicine": {
|
|
"task": "mmlu_college_medicine",
|
|
"task_alias": "college_medicine",
|
|
"tag": "mmlu_other_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "college_medicine",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about college medicine.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_college_physics": {
|
|
"task": "mmlu_college_physics",
|
|
"task_alias": "college_physics",
|
|
"tag": "mmlu_stem_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "college_physics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about college physics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_computer_security": {
|
|
"task": "mmlu_computer_security",
|
|
"task_alias": "computer_security",
|
|
"tag": "mmlu_stem_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "computer_security",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about computer security.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_conceptual_physics": {
|
|
"task": "mmlu_conceptual_physics",
|
|
"task_alias": "conceptual_physics",
|
|
"tag": "mmlu_stem_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "conceptual_physics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_econometrics": {
|
|
"task": "mmlu_econometrics",
|
|
"task_alias": "econometrics",
|
|
"tag": "mmlu_social_sciences_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "econometrics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about econometrics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_electrical_engineering": {
|
|
"task": "mmlu_electrical_engineering",
|
|
"task_alias": "electrical_engineering",
|
|
"tag": "mmlu_stem_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "electrical_engineering",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_elementary_mathematics": {
|
|
"task": "mmlu_elementary_mathematics",
|
|
"task_alias": "elementary_mathematics",
|
|
"tag": "mmlu_stem_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "elementary_mathematics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_formal_logic": {
|
|
"task": "mmlu_formal_logic",
|
|
"task_alias": "formal_logic",
|
|
"tag": "mmlu_humanities_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "formal_logic",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about formal logic.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_global_facts": {
|
|
"task": "mmlu_global_facts",
|
|
"task_alias": "global_facts",
|
|
"tag": "mmlu_other_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "global_facts",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about global facts.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_high_school_biology": {
|
|
"task": "mmlu_high_school_biology",
|
|
"task_alias": "high_school_biology",
|
|
"tag": "mmlu_stem_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "high_school_biology",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school biology.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_high_school_chemistry": {
|
|
"task": "mmlu_high_school_chemistry",
|
|
"task_alias": "high_school_chemistry",
|
|
"tag": "mmlu_stem_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "high_school_chemistry",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_high_school_computer_science": {
|
|
"task": "mmlu_high_school_computer_science",
|
|
"task_alias": "high_school_computer_science",
|
|
"tag": "mmlu_stem_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "high_school_computer_science",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school computer science.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_high_school_european_history": {
|
|
"task": "mmlu_high_school_european_history",
|
|
"task_alias": "high_school_european_history",
|
|
"tag": "mmlu_humanities_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "high_school_european_history",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school european history.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_high_school_geography": {
|
|
"task": "mmlu_high_school_geography",
|
|
"task_alias": "high_school_geography",
|
|
"tag": "mmlu_social_sciences_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "high_school_geography",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school geography.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_high_school_government_and_politics": {
|
|
"task": "mmlu_high_school_government_and_politics",
|
|
"task_alias": "high_school_government_and_politics",
|
|
"tag": "mmlu_social_sciences_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "high_school_government_and_politics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_high_school_macroeconomics": {
|
|
"task": "mmlu_high_school_macroeconomics",
|
|
"task_alias": "high_school_macroeconomics",
|
|
"tag": "mmlu_social_sciences_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "high_school_macroeconomics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_high_school_mathematics": {
|
|
"task": "mmlu_high_school_mathematics",
|
|
"task_alias": "high_school_mathematics",
|
|
"tag": "mmlu_stem_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "high_school_mathematics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_high_school_microeconomics": {
|
|
"task": "mmlu_high_school_microeconomics",
|
|
"task_alias": "high_school_microeconomics",
|
|
"tag": "mmlu_social_sciences_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "high_school_microeconomics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_high_school_physics": {
|
|
"task": "mmlu_high_school_physics",
|
|
"task_alias": "high_school_physics",
|
|
"tag": "mmlu_stem_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "high_school_physics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school physics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_high_school_psychology": {
|
|
"task": "mmlu_high_school_psychology",
|
|
"task_alias": "high_school_psychology",
|
|
"tag": "mmlu_social_sciences_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "high_school_psychology",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school psychology.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_high_school_statistics": {
|
|
"task": "mmlu_high_school_statistics",
|
|
"task_alias": "high_school_statistics",
|
|
"tag": "mmlu_stem_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "high_school_statistics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school statistics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_high_school_us_history": {
|
|
"task": "mmlu_high_school_us_history",
|
|
"task_alias": "high_school_us_history",
|
|
"tag": "mmlu_humanities_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "high_school_us_history",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school us history.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_high_school_world_history": {
|
|
"task": "mmlu_high_school_world_history",
|
|
"task_alias": "high_school_world_history",
|
|
"tag": "mmlu_humanities_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "high_school_world_history",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school world history.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_human_aging": {
|
|
"task": "mmlu_human_aging",
|
|
"task_alias": "human_aging",
|
|
"tag": "mmlu_other_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "human_aging",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about human aging.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_human_sexuality": {
|
|
"task": "mmlu_human_sexuality",
|
|
"task_alias": "human_sexuality",
|
|
"tag": "mmlu_social_sciences_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "human_sexuality",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about human sexuality.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_international_law": {
|
|
"task": "mmlu_international_law",
|
|
"task_alias": "international_law",
|
|
"tag": "mmlu_humanities_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "international_law",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about international law.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_jurisprudence": {
|
|
"task": "mmlu_jurisprudence",
|
|
"task_alias": "jurisprudence",
|
|
"tag": "mmlu_humanities_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "jurisprudence",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_logical_fallacies": {
|
|
"task": "mmlu_logical_fallacies",
|
|
"task_alias": "logical_fallacies",
|
|
"tag": "mmlu_humanities_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "logical_fallacies",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_machine_learning": {
|
|
"task": "mmlu_machine_learning",
|
|
"task_alias": "machine_learning",
|
|
"tag": "mmlu_stem_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "machine_learning",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about machine learning.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_management": {
|
|
"task": "mmlu_management",
|
|
"task_alias": "management",
|
|
"tag": "mmlu_other_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "management",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about management.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_marketing": {
|
|
"task": "mmlu_marketing",
|
|
"task_alias": "marketing",
|
|
"tag": "mmlu_other_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "marketing",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about marketing.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_medical_genetics": {
|
|
"task": "mmlu_medical_genetics",
|
|
"task_alias": "medical_genetics",
|
|
"tag": "mmlu_other_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "medical_genetics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about medical genetics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_miscellaneous": {
|
|
"task": "mmlu_miscellaneous",
|
|
"task_alias": "miscellaneous",
|
|
"tag": "mmlu_other_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "miscellaneous",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_moral_disputes": {
|
|
"task": "mmlu_moral_disputes",
|
|
"task_alias": "moral_disputes",
|
|
"tag": "mmlu_humanities_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "moral_disputes",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about moral disputes.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_moral_scenarios": {
|
|
"task": "mmlu_moral_scenarios",
|
|
"task_alias": "moral_scenarios",
|
|
"tag": "mmlu_humanities_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "moral_scenarios",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_nutrition": {
|
|
"task": "mmlu_nutrition",
|
|
"task_alias": "nutrition",
|
|
"tag": "mmlu_other_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "nutrition",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about nutrition.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_philosophy": {
|
|
"task": "mmlu_philosophy",
|
|
"task_alias": "philosophy",
|
|
"tag": "mmlu_humanities_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "philosophy",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about philosophy.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_prehistory": {
|
|
"task": "mmlu_prehistory",
|
|
"task_alias": "prehistory",
|
|
"tag": "mmlu_humanities_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "prehistory",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about prehistory.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_professional_accounting": {
|
|
"task": "mmlu_professional_accounting",
|
|
"task_alias": "professional_accounting",
|
|
"tag": "mmlu_other_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "professional_accounting",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about professional accounting.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_professional_law": {
|
|
"task": "mmlu_professional_law",
|
|
"task_alias": "professional_law",
|
|
"tag": "mmlu_humanities_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "professional_law",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about professional law.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_professional_medicine": {
|
|
"task": "mmlu_professional_medicine",
|
|
"task_alias": "professional_medicine",
|
|
"tag": "mmlu_other_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "professional_medicine",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about professional medicine.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_professional_psychology": {
|
|
"task": "mmlu_professional_psychology",
|
|
"task_alias": "professional_psychology",
|
|
"tag": "mmlu_social_sciences_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "professional_psychology",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about professional psychology.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_public_relations": {
|
|
"task": "mmlu_public_relations",
|
|
"task_alias": "public_relations",
|
|
"tag": "mmlu_social_sciences_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "public_relations",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about public relations.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_security_studies": {
|
|
"task": "mmlu_security_studies",
|
|
"task_alias": "security_studies",
|
|
"tag": "mmlu_social_sciences_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "security_studies",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about security studies.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_sociology": {
|
|
"task": "mmlu_sociology",
|
|
"task_alias": "sociology",
|
|
"tag": "mmlu_social_sciences_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "sociology",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about sociology.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_us_foreign_policy": {
|
|
"task": "mmlu_us_foreign_policy",
|
|
"task_alias": "us_foreign_policy",
|
|
"tag": "mmlu_social_sciences_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "us_foreign_policy",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_virology": {
|
|
"task": "mmlu_virology",
|
|
"task_alias": "virology",
|
|
"tag": "mmlu_other_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "virology",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about virology.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
},
|
|
"mmlu_world_religions": {
|
|
"task": "mmlu_world_religions",
|
|
"task_alias": "world_religions",
|
|
"tag": "mmlu_humanities_tasks",
|
|
"dataset_path": "cais/mmlu",
|
|
"dataset_name": "world_religions",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"unsafe_code": false,
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about world religions.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"split": "dev",
|
|
"process_docs": null,
|
|
"fewshot_indices": null,
|
|
"samples": null,
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"doc_to_target": "answer",
|
|
"gen_prefix": null,
|
|
"fewshot_delimiter": "\n\n",
|
|
"target_delimiter": " "
|
|
},
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0,
|
|
"pretrained": "hadadxyz/Qwen3-4B-Diversity"
|
|
}
|
|
}
|
|
},
|
|
"versions": {
|
|
"mmlu": 2,
|
|
"mmlu_abstract_algebra": 1.0,
|
|
"mmlu_anatomy": 1.0,
|
|
"mmlu_astronomy": 1.0,
|
|
"mmlu_business_ethics": 1.0,
|
|
"mmlu_clinical_knowledge": 1.0,
|
|
"mmlu_college_biology": 1.0,
|
|
"mmlu_college_chemistry": 1.0,
|
|
"mmlu_college_computer_science": 1.0,
|
|
"mmlu_college_mathematics": 1.0,
|
|
"mmlu_college_medicine": 1.0,
|
|
"mmlu_college_physics": 1.0,
|
|
"mmlu_computer_security": 1.0,
|
|
"mmlu_conceptual_physics": 1.0,
|
|
"mmlu_econometrics": 1.0,
|
|
"mmlu_electrical_engineering": 1.0,
|
|
"mmlu_elementary_mathematics": 1.0,
|
|
"mmlu_formal_logic": 1.0,
|
|
"mmlu_global_facts": 1.0,
|
|
"mmlu_high_school_biology": 1.0,
|
|
"mmlu_high_school_chemistry": 1.0,
|
|
"mmlu_high_school_computer_science": 1.0,
|
|
"mmlu_high_school_european_history": 1.0,
|
|
"mmlu_high_school_geography": 1.0,
|
|
"mmlu_high_school_government_and_politics": 1.0,
|
|
"mmlu_high_school_macroeconomics": 1.0,
|
|
"mmlu_high_school_mathematics": 1.0,
|
|
"mmlu_high_school_microeconomics": 1.0,
|
|
"mmlu_high_school_physics": 1.0,
|
|
"mmlu_high_school_psychology": 1.0,
|
|
"mmlu_high_school_statistics": 1.0,
|
|
"mmlu_high_school_us_history": 1.0,
|
|
"mmlu_high_school_world_history": 1.0,
|
|
"mmlu_human_aging": 1.0,
|
|
"mmlu_human_sexuality": 1.0,
|
|
"mmlu_humanities": 2,
|
|
"mmlu_international_law": 1.0,
|
|
"mmlu_jurisprudence": 1.0,
|
|
"mmlu_logical_fallacies": 1.0,
|
|
"mmlu_machine_learning": 1.0,
|
|
"mmlu_management": 1.0,
|
|
"mmlu_marketing": 1.0,
|
|
"mmlu_medical_genetics": 1.0,
|
|
"mmlu_miscellaneous": 1.0,
|
|
"mmlu_moral_disputes": 1.0,
|
|
"mmlu_moral_scenarios": 1.0,
|
|
"mmlu_nutrition": 1.0,
|
|
"mmlu_other": 2,
|
|
"mmlu_philosophy": 1.0,
|
|
"mmlu_prehistory": 1.0,
|
|
"mmlu_professional_accounting": 1.0,
|
|
"mmlu_professional_law": 1.0,
|
|
"mmlu_professional_medicine": 1.0,
|
|
"mmlu_professional_psychology": 1.0,
|
|
"mmlu_public_relations": 1.0,
|
|
"mmlu_security_studies": 1.0,
|
|
"mmlu_social_sciences": 2,
|
|
"mmlu_sociology": 1.0,
|
|
"mmlu_stem": 2,
|
|
"mmlu_us_foreign_policy": 1.0,
|
|
"mmlu_virology": 1.0,
|
|
"mmlu_world_religions": 1.0
|
|
},
|
|
"n-shot": {
|
|
"mmlu_abstract_algebra": 0,
|
|
"mmlu_anatomy": 0,
|
|
"mmlu_astronomy": 0,
|
|
"mmlu_business_ethics": 0,
|
|
"mmlu_clinical_knowledge": 0,
|
|
"mmlu_college_biology": 0,
|
|
"mmlu_college_chemistry": 0,
|
|
"mmlu_college_computer_science": 0,
|
|
"mmlu_college_mathematics": 0,
|
|
"mmlu_college_medicine": 0,
|
|
"mmlu_college_physics": 0,
|
|
"mmlu_computer_security": 0,
|
|
"mmlu_conceptual_physics": 0,
|
|
"mmlu_econometrics": 0,
|
|
"mmlu_electrical_engineering": 0,
|
|
"mmlu_elementary_mathematics": 0,
|
|
"mmlu_formal_logic": 0,
|
|
"mmlu_global_facts": 0,
|
|
"mmlu_high_school_biology": 0,
|
|
"mmlu_high_school_chemistry": 0,
|
|
"mmlu_high_school_computer_science": 0,
|
|
"mmlu_high_school_european_history": 0,
|
|
"mmlu_high_school_geography": 0,
|
|
"mmlu_high_school_government_and_politics": 0,
|
|
"mmlu_high_school_macroeconomics": 0,
|
|
"mmlu_high_school_mathematics": 0,
|
|
"mmlu_high_school_microeconomics": 0,
|
|
"mmlu_high_school_physics": 0,
|
|
"mmlu_high_school_psychology": 0,
|
|
"mmlu_high_school_statistics": 0,
|
|
"mmlu_high_school_us_history": 0,
|
|
"mmlu_high_school_world_history": 0,
|
|
"mmlu_human_aging": 0,
|
|
"mmlu_human_sexuality": 0,
|
|
"mmlu_international_law": 0,
|
|
"mmlu_jurisprudence": 0,
|
|
"mmlu_logical_fallacies": 0,
|
|
"mmlu_machine_learning": 0,
|
|
"mmlu_management": 0,
|
|
"mmlu_marketing": 0,
|
|
"mmlu_medical_genetics": 0,
|
|
"mmlu_miscellaneous": 0,
|
|
"mmlu_moral_disputes": 0,
|
|
"mmlu_moral_scenarios": 0,
|
|
"mmlu_nutrition": 0,
|
|
"mmlu_philosophy": 0,
|
|
"mmlu_prehistory": 0,
|
|
"mmlu_professional_accounting": 0,
|
|
"mmlu_professional_law": 0,
|
|
"mmlu_professional_medicine": 0,
|
|
"mmlu_professional_psychology": 0,
|
|
"mmlu_public_relations": 0,
|
|
"mmlu_security_studies": 0,
|
|
"mmlu_sociology": 0,
|
|
"mmlu_us_foreign_policy": 0,
|
|
"mmlu_virology": 0,
|
|
"mmlu_world_religions": 0
|
|
},
|
|
"higher_is_better": {
|
|
"mmlu": {
|
|
"acc": true
|
|
},
|
|
"mmlu_abstract_algebra": {
|
|
"acc": true
|
|
},
|
|
"mmlu_anatomy": {
|
|
"acc": true
|
|
},
|
|
"mmlu_astronomy": {
|
|
"acc": true
|
|
},
|
|
"mmlu_business_ethics": {
|
|
"acc": true
|
|
},
|
|
"mmlu_clinical_knowledge": {
|
|
"acc": true
|
|
},
|
|
"mmlu_college_biology": {
|
|
"acc": true
|
|
},
|
|
"mmlu_college_chemistry": {
|
|
"acc": true
|
|
},
|
|
"mmlu_college_computer_science": {
|
|
"acc": true
|
|
},
|
|
"mmlu_college_mathematics": {
|
|
"acc": true
|
|
},
|
|
"mmlu_college_medicine": {
|
|
"acc": true
|
|
},
|
|
"mmlu_college_physics": {
|
|
"acc": true
|
|
},
|
|
"mmlu_computer_security": {
|
|
"acc": true
|
|
},
|
|
"mmlu_conceptual_physics": {
|
|
"acc": true
|
|
},
|
|
"mmlu_econometrics": {
|
|
"acc": true
|
|
},
|
|
"mmlu_electrical_engineering": {
|
|
"acc": true
|
|
},
|
|
"mmlu_elementary_mathematics": {
|
|
"acc": true
|
|
},
|
|
"mmlu_formal_logic": {
|
|
"acc": true
|
|
},
|
|
"mmlu_global_facts": {
|
|
"acc": true
|
|
},
|
|
"mmlu_high_school_biology": {
|
|
"acc": true
|
|
},
|
|
"mmlu_high_school_chemistry": {
|
|
"acc": true
|
|
},
|
|
"mmlu_high_school_computer_science": {
|
|
"acc": true
|
|
},
|
|
"mmlu_high_school_european_history": {
|
|
"acc": true
|
|
},
|
|
"mmlu_high_school_geography": {
|
|
"acc": true
|
|
},
|
|
"mmlu_high_school_government_and_politics": {
|
|
"acc": true
|
|
},
|
|
"mmlu_high_school_macroeconomics": {
|
|
"acc": true
|
|
},
|
|
"mmlu_high_school_mathematics": {
|
|
"acc": true
|
|
},
|
|
"mmlu_high_school_microeconomics": {
|
|
"acc": true
|
|
},
|
|
"mmlu_high_school_physics": {
|
|
"acc": true
|
|
},
|
|
"mmlu_high_school_psychology": {
|
|
"acc": true
|
|
},
|
|
"mmlu_high_school_statistics": {
|
|
"acc": true
|
|
},
|
|
"mmlu_high_school_us_history": {
|
|
"acc": true
|
|
},
|
|
"mmlu_high_school_world_history": {
|
|
"acc": true
|
|
},
|
|
"mmlu_human_aging": {
|
|
"acc": true
|
|
},
|
|
"mmlu_human_sexuality": {
|
|
"acc": true
|
|
},
|
|
"mmlu_humanities": {
|
|
"acc": true
|
|
},
|
|
"mmlu_international_law": {
|
|
"acc": true
|
|
},
|
|
"mmlu_jurisprudence": {
|
|
"acc": true
|
|
},
|
|
"mmlu_logical_fallacies": {
|
|
"acc": true
|
|
},
|
|
"mmlu_machine_learning": {
|
|
"acc": true
|
|
},
|
|
"mmlu_management": {
|
|
"acc": true
|
|
},
|
|
"mmlu_marketing": {
|
|
"acc": true
|
|
},
|
|
"mmlu_medical_genetics": {
|
|
"acc": true
|
|
},
|
|
"mmlu_miscellaneous": {
|
|
"acc": true
|
|
},
|
|
"mmlu_moral_disputes": {
|
|
"acc": true
|
|
},
|
|
"mmlu_moral_scenarios": {
|
|
"acc": true
|
|
},
|
|
"mmlu_nutrition": {
|
|
"acc": true
|
|
},
|
|
"mmlu_other": {
|
|
"acc": true
|
|
},
|
|
"mmlu_philosophy": {
|
|
"acc": true
|
|
},
|
|
"mmlu_prehistory": {
|
|
"acc": true
|
|
},
|
|
"mmlu_professional_accounting": {
|
|
"acc": true
|
|
},
|
|
"mmlu_professional_law": {
|
|
"acc": true
|
|
},
|
|
"mmlu_professional_medicine": {
|
|
"acc": true
|
|
},
|
|
"mmlu_professional_psychology": {
|
|
"acc": true
|
|
},
|
|
"mmlu_public_relations": {
|
|
"acc": true
|
|
},
|
|
"mmlu_security_studies": {
|
|
"acc": true
|
|
},
|
|
"mmlu_social_sciences": {
|
|
"acc": true
|
|
},
|
|
"mmlu_sociology": {
|
|
"acc": true
|
|
},
|
|
"mmlu_stem": {
|
|
"acc": true
|
|
},
|
|
"mmlu_us_foreign_policy": {
|
|
"acc": true
|
|
},
|
|
"mmlu_virology": {
|
|
"acc": true
|
|
},
|
|
"mmlu_world_religions": {
|
|
"acc": true
|
|
}
|
|
},
|
|
"group_subtasks": {
|
|
"mmlu_humanities": [
|
|
"mmlu_formal_logic",
|
|
"mmlu_high_school_european_history",
|
|
"mmlu_high_school_us_history",
|
|
"mmlu_high_school_world_history",
|
|
"mmlu_international_law",
|
|
"mmlu_jurisprudence",
|
|
"mmlu_logical_fallacies",
|
|
"mmlu_moral_disputes",
|
|
"mmlu_moral_scenarios",
|
|
"mmlu_philosophy",
|
|
"mmlu_prehistory",
|
|
"mmlu_professional_law",
|
|
"mmlu_world_religions"
|
|
],
|
|
"mmlu_social_sciences": [
|
|
"mmlu_econometrics",
|
|
"mmlu_high_school_geography",
|
|
"mmlu_high_school_government_and_politics",
|
|
"mmlu_high_school_macroeconomics",
|
|
"mmlu_high_school_microeconomics",
|
|
"mmlu_high_school_psychology",
|
|
"mmlu_human_sexuality",
|
|
"mmlu_professional_psychology",
|
|
"mmlu_public_relations",
|
|
"mmlu_security_studies",
|
|
"mmlu_sociology",
|
|
"mmlu_us_foreign_policy"
|
|
],
|
|
"mmlu_other": [
|
|
"mmlu_business_ethics",
|
|
"mmlu_clinical_knowledge",
|
|
"mmlu_college_medicine",
|
|
"mmlu_global_facts",
|
|
"mmlu_human_aging",
|
|
"mmlu_management",
|
|
"mmlu_marketing",
|
|
"mmlu_medical_genetics",
|
|
"mmlu_miscellaneous",
|
|
"mmlu_nutrition",
|
|
"mmlu_professional_accounting",
|
|
"mmlu_professional_medicine",
|
|
"mmlu_virology"
|
|
],
|
|
"mmlu_stem": [
|
|
"mmlu_abstract_algebra",
|
|
"mmlu_anatomy",
|
|
"mmlu_astronomy",
|
|
"mmlu_college_biology",
|
|
"mmlu_college_chemistry",
|
|
"mmlu_college_computer_science",
|
|
"mmlu_college_mathematics",
|
|
"mmlu_college_physics",
|
|
"mmlu_computer_security",
|
|
"mmlu_conceptual_physics",
|
|
"mmlu_electrical_engineering",
|
|
"mmlu_elementary_mathematics",
|
|
"mmlu_high_school_biology",
|
|
"mmlu_high_school_chemistry",
|
|
"mmlu_high_school_computer_science",
|
|
"mmlu_high_school_mathematics",
|
|
"mmlu_high_school_physics",
|
|
"mmlu_high_school_statistics",
|
|
"mmlu_machine_learning"
|
|
],
|
|
"mmlu": [
|
|
"mmlu_stem",
|
|
"mmlu_other",
|
|
"mmlu_social_sciences",
|
|
"mmlu_humanities"
|
|
]
|
|
},
|
|
"model": "hadadxyz/Qwen3-4B-Diversity"
|
|
} |