3211 lines
102 KiB
JSON
3211 lines
102 KiB
JSON
[
|
|
{
|
|
"results": {
|
|
"arc_challenge": {
|
|
"acc,none": 0.6006825938566553,
|
|
"acc_stderr,none": 0.014312094557946705,
|
|
"acc_norm,none": 0.6271331058020477,
|
|
"acc_norm_stderr,none": 0.014131176760131165,
|
|
"alias": "arc_challenge"
|
|
}
|
|
},
|
|
"configs": {
|
|
"arc_challenge": {
|
|
"task": "arc_challenge",
|
|
"group": [
|
|
"ai2_arc"
|
|
],
|
|
"dataset_path": "ai2_arc",
|
|
"dataset_name": "ARC-Challenge",
|
|
"training_split": "train",
|
|
"validation_split": "validation",
|
|
"test_split": "test",
|
|
"doc_to_text": "Question: {{question}}\nAnswer:",
|
|
"doc_to_target": "{{choices.label.index(answerKey)}}",
|
|
"doc_to_choice": "{{choices.text}}",
|
|
"description": "",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"num_fewshot": 25,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
},
|
|
{
|
|
"metric": "acc_norm",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": true,
|
|
"doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
|
|
"metadata": {
|
|
"version": 1
|
|
}
|
|
}
|
|
},
|
|
"versions": {
|
|
"arc_challenge": "Yaml"
|
|
},
|
|
"n-shot": {
|
|
"arc_challenge": 25
|
|
},
|
|
"config": {
|
|
"model": "vllm",
|
|
"model_args": "pretrained=cognitivecomputations/dolphin-2.2-mistral-7b,tensor_parallel_size=4,dtype=auto,trust_remote_code=True,gpu_memory_utilization=0.8",
|
|
"batch_size": "8",
|
|
"batch_sizes": [],
|
|
"device": null,
|
|
"use_cache": null,
|
|
"limit": null,
|
|
"bootstrap_iters": 100000,
|
|
"gen_kwargs": null
|
|
},
|
|
"git_hash": "46c79664"
|
|
},
|
|
{
|
|
"results": {
|
|
"gsm8k": {
|
|
"exact_match,get-answer": 0.5458680818802123,
|
|
"exact_match_stderr,get-answer": 0.013714410945264554,
|
|
"alias": "gsm8k"
|
|
}
|
|
},
|
|
"configs": {
|
|
"gsm8k": {
|
|
"task": "gsm8k",
|
|
"group": [
|
|
"math_word_problems"
|
|
],
|
|
"dataset_path": "gsm8k",
|
|
"dataset_name": "main",
|
|
"training_split": "train",
|
|
"test_split": "test",
|
|
"fewshot_split": "train",
|
|
"doc_to_text": "Question: {{question}}\nAnswer:",
|
|
"doc_to_target": "{{answer}}",
|
|
"description": "",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "exact_match",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true,
|
|
"ignore_case": true,
|
|
"ignore_punctuation": false,
|
|
"regexes_to_ignore": [
|
|
",",
|
|
"\\$",
|
|
"(?s).*#### "
|
|
]
|
|
}
|
|
],
|
|
"output_type": "generate_until",
|
|
"generation_kwargs": {
|
|
"until": [
|
|
"\n\n",
|
|
"Question:"
|
|
],
|
|
"do_sample": false,
|
|
"temperature": 0
|
|
},
|
|
"repeats": 1,
|
|
"filter_list": [
|
|
{
|
|
"name": "get-answer",
|
|
"filter": [
|
|
{
|
|
"function": "regex",
|
|
"regex_pattern": "#### (\\-?[0-9\\.\\,]+)"
|
|
},
|
|
{
|
|
"function": "take_first"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1
|
|
}
|
|
}
|
|
},
|
|
"versions": {
|
|
"gsm8k": "Yaml"
|
|
},
|
|
"n-shot": {
|
|
"gsm8k": 5
|
|
},
|
|
"config": {
|
|
"model": "vllm",
|
|
"model_args": "pretrained=cognitivecomputations/dolphin-2.2-mistral-7b,tensor_parallel_size=4,dtype=auto,trust_remote_code=True,gpu_memory_utilization=0.8",
|
|
"batch_size": "8",
|
|
"batch_sizes": [],
|
|
"device": null,
|
|
"use_cache": null,
|
|
"limit": null,
|
|
"bootstrap_iters": 100000,
|
|
"gen_kwargs": null
|
|
},
|
|
"git_hash": "46c79664"
|
|
},
|
|
{
|
|
"results": {
|
|
"hellaswag": {
|
|
"acc,none": 0.6486755626369249,
|
|
"acc_stderr,none": 0.004764084597176902,
|
|
"acc_norm,none": 0.839573790081657,
|
|
"acc_norm_stderr,none": 0.0036625082723308246,
|
|
"alias": "hellaswag"
|
|
}
|
|
},
|
|
"configs": {
|
|
"hellaswag": {
|
|
"task": "hellaswag",
|
|
"group": [
|
|
"multiple_choice"
|
|
],
|
|
"dataset_path": "hellaswag",
|
|
"training_split": "train",
|
|
"validation_split": "validation",
|
|
"process_docs": "<function process_docs at 0x7ff8ef44ef20>",
|
|
"doc_to_text": "{{query}}",
|
|
"doc_to_target": "{{label}}",
|
|
"doc_to_choice": "choices",
|
|
"description": "",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"num_fewshot": 10,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
},
|
|
{
|
|
"metric": "acc_norm",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1
|
|
}
|
|
}
|
|
},
|
|
"versions": {
|
|
"hellaswag": "Yaml"
|
|
},
|
|
"n-shot": {
|
|
"hellaswag": 10
|
|
},
|
|
"config": {
|
|
"model": "vllm",
|
|
"model_args": "pretrained=cognitivecomputations/dolphin-2.2-mistral-7b,tensor_parallel_size=4,dtype=auto,trust_remote_code=True,gpu_memory_utilization=0.8",
|
|
"batch_size": "8",
|
|
"batch_sizes": [],
|
|
"device": null,
|
|
"use_cache": null,
|
|
"limit": null,
|
|
"bootstrap_iters": 100000,
|
|
"gen_kwargs": null
|
|
},
|
|
"git_hash": "46c79664"
|
|
},
|
|
{
|
|
"results": {
|
|
"mmlu": {
|
|
"acc,none": 0.6177182737501781,
|
|
"acc_stderr,none": 0.12146933986032843,
|
|
"alias": "mmlu"
|
|
},
|
|
"mmlu_humanities": {
|
|
"alias": " - humanities",
|
|
"acc,none": 0.5742826780021254,
|
|
"acc_stderr,none": 0.10969794763314945
|
|
},
|
|
"mmlu_formal_logic": {
|
|
"alias": " - formal_logic",
|
|
"acc,none": 0.3888888888888889,
|
|
"acc_stderr,none": 0.04360314860077459
|
|
},
|
|
"mmlu_high_school_european_history": {
|
|
"alias": " - high_school_european_history",
|
|
"acc,none": 0.7878787878787878,
|
|
"acc_stderr,none": 0.03192271569548301
|
|
},
|
|
"mmlu_high_school_us_history": {
|
|
"alias": " - high_school_us_history",
|
|
"acc,none": 0.75,
|
|
"acc_stderr,none": 0.03039153369274154
|
|
},
|
|
"mmlu_high_school_world_history": {
|
|
"alias": " - high_school_world_history",
|
|
"acc,none": 0.7679324894514767,
|
|
"acc_stderr,none": 0.027479744550808517
|
|
},
|
|
"mmlu_international_law": {
|
|
"alias": " - international_law",
|
|
"acc,none": 0.7851239669421488,
|
|
"acc_stderr,none": 0.03749492448709699
|
|
},
|
|
"mmlu_jurisprudence": {
|
|
"alias": " - jurisprudence",
|
|
"acc,none": 0.7870370370370371,
|
|
"acc_stderr,none": 0.039578354719809784
|
|
},
|
|
"mmlu_logical_fallacies": {
|
|
"alias": " - logical_fallacies",
|
|
"acc,none": 0.7055214723926381,
|
|
"acc_stderr,none": 0.03581165790474082
|
|
},
|
|
"mmlu_moral_disputes": {
|
|
"alias": " - moral_disputes",
|
|
"acc,none": 0.7167630057803468,
|
|
"acc_stderr,none": 0.02425790170532337
|
|
},
|
|
"mmlu_moral_scenarios": {
|
|
"alias": " - moral_scenarios",
|
|
"acc,none": 0.4044692737430168,
|
|
"acc_stderr,none": 0.01641444091729315
|
|
},
|
|
"mmlu_philosophy": {
|
|
"alias": " - philosophy",
|
|
"acc,none": 0.6913183279742765,
|
|
"acc_stderr,none": 0.02623696588115326
|
|
},
|
|
"mmlu_prehistory": {
|
|
"alias": " - prehistory",
|
|
"acc,none": 0.7129629629629629,
|
|
"acc_stderr,none": 0.02517104191530968
|
|
},
|
|
"mmlu_professional_law": {
|
|
"alias": " - professional_law",
|
|
"acc,none": 0.45241199478487615,
|
|
"acc_stderr,none": 0.012712265105889133
|
|
},
|
|
"mmlu_world_religions": {
|
|
"alias": " - world_religions",
|
|
"acc,none": 0.8362573099415205,
|
|
"acc_stderr,none": 0.028380919596145866
|
|
},
|
|
"mmlu_other": {
|
|
"alias": " - other",
|
|
"acc,none": 0.6842613453492115,
|
|
"acc_stderr,none": 0.10567866943210723
|
|
},
|
|
"mmlu_business_ethics": {
|
|
"alias": " - business_ethics",
|
|
"acc,none": 0.57,
|
|
"acc_stderr,none": 0.049756985195624284
|
|
},
|
|
"mmlu_clinical_knowledge": {
|
|
"alias": " - clinical_knowledge",
|
|
"acc,none": 0.6641509433962264,
|
|
"acc_stderr,none": 0.029067220146644823
|
|
},
|
|
"mmlu_college_medicine": {
|
|
"alias": " - college_medicine",
|
|
"acc,none": 0.6069364161849711,
|
|
"acc_stderr,none": 0.0372424959581773
|
|
},
|
|
"mmlu_global_facts": {
|
|
"alias": " - global_facts",
|
|
"acc,none": 0.38,
|
|
"acc_stderr,none": 0.04878317312145633
|
|
},
|
|
"mmlu_human_aging": {
|
|
"alias": " - human_aging",
|
|
"acc,none": 0.6816143497757847,
|
|
"acc_stderr,none": 0.03126580522513713
|
|
},
|
|
"mmlu_management": {
|
|
"alias": " - management",
|
|
"acc,none": 0.7572815533980582,
|
|
"acc_stderr,none": 0.04245022486384495
|
|
},
|
|
"mmlu_marketing": {
|
|
"alias": " - marketing",
|
|
"acc,none": 0.8675213675213675,
|
|
"acc_stderr,none": 0.022209309073165616
|
|
},
|
|
"mmlu_medical_genetics": {
|
|
"alias": " - medical_genetics",
|
|
"acc,none": 0.74,
|
|
"acc_stderr,none": 0.044084400227680794
|
|
},
|
|
"mmlu_miscellaneous": {
|
|
"alias": " - miscellaneous",
|
|
"acc,none": 0.7931034482758621,
|
|
"acc_stderr,none": 0.014485656041669173
|
|
},
|
|
"mmlu_nutrition": {
|
|
"alias": " - nutrition",
|
|
"acc,none": 0.7058823529411765,
|
|
"acc_stderr,none": 0.02609016250427905
|
|
},
|
|
"mmlu_professional_accounting": {
|
|
"alias": " - professional_accounting",
|
|
"acc,none": 0.46808510638297873,
|
|
"acc_stderr,none": 0.029766675075873866
|
|
},
|
|
"mmlu_professional_medicine": {
|
|
"alias": " - professional_medicine",
|
|
"acc,none": 0.6764705882352942,
|
|
"acc_stderr,none": 0.02841820861940675
|
|
},
|
|
"mmlu_virology": {
|
|
"alias": " - virology",
|
|
"acc,none": 0.5421686746987951,
|
|
"acc_stderr,none": 0.038786267710023595
|
|
},
|
|
"mmlu_social_sciences": {
|
|
"alias": " - social_sciences",
|
|
"acc,none": 0.7227819304517387,
|
|
"acc_stderr,none": 0.0710141347586875
|
|
},
|
|
"mmlu_econometrics": {
|
|
"alias": " - econometrics",
|
|
"acc,none": 0.47368421052631576,
|
|
"acc_stderr,none": 0.046970851366478626
|
|
},
|
|
"mmlu_high_school_geography": {
|
|
"alias": " - high_school_geography",
|
|
"acc,none": 0.7777777777777778,
|
|
"acc_stderr,none": 0.029620227874790465
|
|
},
|
|
"mmlu_high_school_government_and_politics": {
|
|
"alias": " - high_school_government_and_politics",
|
|
"acc,none": 0.8549222797927462,
|
|
"acc_stderr,none": 0.02541634309630644
|
|
},
|
|
"mmlu_high_school_macroeconomics": {
|
|
"alias": " - high_school_macroeconomics",
|
|
"acc,none": 0.6333333333333333,
|
|
"acc_stderr,none": 0.02443301646605246
|
|
},
|
|
"mmlu_high_school_microeconomics": {
|
|
"alias": " - high_school_microeconomics",
|
|
"acc,none": 0.6596638655462185,
|
|
"acc_stderr,none": 0.030778057422931673
|
|
},
|
|
"mmlu_high_school_psychology": {
|
|
"alias": " - high_school_psychology",
|
|
"acc,none": 0.8165137614678899,
|
|
"acc_stderr,none": 0.01659525971039932
|
|
},
|
|
"mmlu_human_sexuality": {
|
|
"alias": " - human_sexuality",
|
|
"acc,none": 0.7480916030534351,
|
|
"acc_stderr,none": 0.03807387116306085
|
|
},
|
|
"mmlu_professional_psychology": {
|
|
"alias": " - professional_psychology",
|
|
"acc,none": 0.6486928104575164,
|
|
"acc_stderr,none": 0.019312676065786554
|
|
},
|
|
"mmlu_public_relations": {
|
|
"alias": " - public_relations",
|
|
"acc,none": 0.6636363636363637,
|
|
"acc_stderr,none": 0.04525393596302505
|
|
},
|
|
"mmlu_security_studies": {
|
|
"alias": " - security_studies",
|
|
"acc,none": 0.726530612244898,
|
|
"acc_stderr,none": 0.028535560337128438
|
|
},
|
|
"mmlu_sociology": {
|
|
"alias": " - sociology",
|
|
"acc,none": 0.835820895522388,
|
|
"acc_stderr,none": 0.026193923544454156
|
|
},
|
|
"mmlu_us_foreign_policy": {
|
|
"alias": " - us_foreign_policy",
|
|
"acc,none": 0.88,
|
|
"acc_stderr,none": 0.03265986323710905
|
|
},
|
|
"mmlu_stem": {
|
|
"alias": " - stem",
|
|
"acc,none": 0.5144307009197588,
|
|
"acc_stderr,none": 0.13040661655168
|
|
},
|
|
"mmlu_abstract_algebra": {
|
|
"alias": " - abstract_algebra",
|
|
"acc,none": 0.3,
|
|
"acc_stderr,none": 0.046056618647183814
|
|
},
|
|
"mmlu_anatomy": {
|
|
"alias": " - anatomy",
|
|
"acc,none": 0.6296296296296297,
|
|
"acc_stderr,none": 0.041716541613545426
|
|
},
|
|
"mmlu_astronomy": {
|
|
"alias": " - astronomy",
|
|
"acc,none": 0.6381578947368421,
|
|
"acc_stderr,none": 0.03910525752849724
|
|
},
|
|
"mmlu_college_biology": {
|
|
"alias": " - college_biology",
|
|
"acc,none": 0.7013888888888888,
|
|
"acc_stderr,none": 0.03827052357950756
|
|
},
|
|
"mmlu_college_chemistry": {
|
|
"alias": " - college_chemistry",
|
|
"acc,none": 0.41,
|
|
"acc_stderr,none": 0.049431107042371025
|
|
},
|
|
"mmlu_college_computer_science": {
|
|
"alias": " - college_computer_science",
|
|
"acc,none": 0.5,
|
|
"acc_stderr,none": 0.050251890762960605
|
|
},
|
|
"mmlu_college_mathematics": {
|
|
"alias": " - college_mathematics",
|
|
"acc,none": 0.33,
|
|
"acc_stderr,none": 0.04725815626252604
|
|
},
|
|
"mmlu_college_physics": {
|
|
"alias": " - college_physics",
|
|
"acc,none": 0.4019607843137255,
|
|
"acc_stderr,none": 0.04878608714466996
|
|
},
|
|
"mmlu_computer_security": {
|
|
"alias": " - computer_security",
|
|
"acc,none": 0.79,
|
|
"acc_stderr,none": 0.040936018074033256
|
|
},
|
|
"mmlu_conceptual_physics": {
|
|
"alias": " - conceptual_physics",
|
|
"acc,none": 0.5617021276595745,
|
|
"acc_stderr,none": 0.03243618636108101
|
|
},
|
|
"mmlu_electrical_engineering": {
|
|
"alias": " - electrical_engineering",
|
|
"acc,none": 0.5448275862068965,
|
|
"acc_stderr,none": 0.04149886942192118
|
|
},
|
|
"mmlu_elementary_mathematics": {
|
|
"alias": " - elementary_mathematics",
|
|
"acc,none": 0.3941798941798942,
|
|
"acc_stderr,none": 0.025167982333894143
|
|
},
|
|
"mmlu_high_school_biology": {
|
|
"alias": " - high_school_biology",
|
|
"acc,none": 0.7677419354838709,
|
|
"acc_stderr,none": 0.024022256130308235
|
|
},
|
|
"mmlu_high_school_chemistry": {
|
|
"alias": " - high_school_chemistry",
|
|
"acc,none": 0.5024630541871922,
|
|
"acc_stderr,none": 0.035179450386910616
|
|
},
|
|
"mmlu_high_school_computer_science": {
|
|
"alias": " - high_school_computer_science",
|
|
"acc,none": 0.66,
|
|
"acc_stderr,none": 0.04760952285695237
|
|
},
|
|
"mmlu_high_school_mathematics": {
|
|
"alias": " - high_school_mathematics",
|
|
"acc,none": 0.34814814814814815,
|
|
"acc_stderr,none": 0.02904560029061626
|
|
},
|
|
"mmlu_high_school_physics": {
|
|
"alias": " - high_school_physics",
|
|
"acc,none": 0.2913907284768212,
|
|
"acc_stderr,none": 0.03710185726119996
|
|
},
|
|
"mmlu_high_school_statistics": {
|
|
"alias": " - high_school_statistics",
|
|
"acc,none": 0.49537037037037035,
|
|
"acc_stderr,none": 0.03409825519163572
|
|
},
|
|
"mmlu_machine_learning": {
|
|
"alias": " - machine_learning",
|
|
"acc,none": 0.48214285714285715,
|
|
"acc_stderr,none": 0.047427623612430116
|
|
}
|
|
},
|
|
"groups": {
|
|
"mmlu": {
|
|
"acc,none": 0.6177182737501781,
|
|
"acc_stderr,none": 0.12146933986032843,
|
|
"alias": "mmlu"
|
|
},
|
|
"mmlu_humanities": {
|
|
"alias": " - humanities",
|
|
"acc,none": 0.5742826780021254,
|
|
"acc_stderr,none": 0.10969794763314945
|
|
},
|
|
"mmlu_other": {
|
|
"alias": " - other",
|
|
"acc,none": 0.6842613453492115,
|
|
"acc_stderr,none": 0.10567866943210723
|
|
},
|
|
"mmlu_social_sciences": {
|
|
"alias": " - social_sciences",
|
|
"acc,none": 0.7227819304517387,
|
|
"acc_stderr,none": 0.0710141347586875
|
|
},
|
|
"mmlu_stem": {
|
|
"alias": " - stem",
|
|
"acc,none": 0.5144307009197588,
|
|
"acc_stderr,none": 0.13040661655168
|
|
}
|
|
},
|
|
"configs": {
|
|
"mmlu_abstract_algebra": {
|
|
"task": "mmlu_abstract_algebra",
|
|
"task_alias": "abstract_algebra",
|
|
"group": "mmlu_stem",
|
|
"group_alias": "stem",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "abstract_algebra",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_anatomy": {
|
|
"task": "mmlu_anatomy",
|
|
"task_alias": "anatomy",
|
|
"group": "mmlu_stem",
|
|
"group_alias": "stem",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "anatomy",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about anatomy.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_astronomy": {
|
|
"task": "mmlu_astronomy",
|
|
"task_alias": "astronomy",
|
|
"group": "mmlu_stem",
|
|
"group_alias": "stem",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "astronomy",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about astronomy.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_business_ethics": {
|
|
"task": "mmlu_business_ethics",
|
|
"task_alias": "business_ethics",
|
|
"group": "mmlu_other",
|
|
"group_alias": "other",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "business_ethics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about business ethics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_clinical_knowledge": {
|
|
"task": "mmlu_clinical_knowledge",
|
|
"task_alias": "clinical_knowledge",
|
|
"group": "mmlu_other",
|
|
"group_alias": "other",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "clinical_knowledge",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_college_biology": {
|
|
"task": "mmlu_college_biology",
|
|
"task_alias": "college_biology",
|
|
"group": "mmlu_stem",
|
|
"group_alias": "stem",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "college_biology",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about college biology.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_college_chemistry": {
|
|
"task": "mmlu_college_chemistry",
|
|
"task_alias": "college_chemistry",
|
|
"group": "mmlu_stem",
|
|
"group_alias": "stem",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "college_chemistry",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about college chemistry.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_college_computer_science": {
|
|
"task": "mmlu_college_computer_science",
|
|
"task_alias": "college_computer_science",
|
|
"group": "mmlu_stem",
|
|
"group_alias": "stem",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "college_computer_science",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about college computer science.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_college_mathematics": {
|
|
"task": "mmlu_college_mathematics",
|
|
"task_alias": "college_mathematics",
|
|
"group": "mmlu_stem",
|
|
"group_alias": "stem",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "college_mathematics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about college mathematics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_college_medicine": {
|
|
"task": "mmlu_college_medicine",
|
|
"task_alias": "college_medicine",
|
|
"group": "mmlu_other",
|
|
"group_alias": "other",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "college_medicine",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about college medicine.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_college_physics": {
|
|
"task": "mmlu_college_physics",
|
|
"task_alias": "college_physics",
|
|
"group": "mmlu_stem",
|
|
"group_alias": "stem",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "college_physics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about college physics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_computer_security": {
|
|
"task": "mmlu_computer_security",
|
|
"task_alias": "computer_security",
|
|
"group": "mmlu_stem",
|
|
"group_alias": "stem",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "computer_security",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about computer security.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_conceptual_physics": {
|
|
"task": "mmlu_conceptual_physics",
|
|
"task_alias": "conceptual_physics",
|
|
"group": "mmlu_stem",
|
|
"group_alias": "stem",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "conceptual_physics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_econometrics": {
|
|
"task": "mmlu_econometrics",
|
|
"task_alias": "econometrics",
|
|
"group": "mmlu_social_sciences",
|
|
"group_alias": "social_sciences",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "econometrics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about econometrics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_electrical_engineering": {
|
|
"task": "mmlu_electrical_engineering",
|
|
"task_alias": "electrical_engineering",
|
|
"group": "mmlu_stem",
|
|
"group_alias": "stem",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "electrical_engineering",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_elementary_mathematics": {
|
|
"task": "mmlu_elementary_mathematics",
|
|
"task_alias": "elementary_mathematics",
|
|
"group": "mmlu_stem",
|
|
"group_alias": "stem",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "elementary_mathematics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_formal_logic": {
|
|
"task": "mmlu_formal_logic",
|
|
"task_alias": "formal_logic",
|
|
"group": "mmlu_humanities",
|
|
"group_alias": "humanities",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "formal_logic",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about formal logic.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_global_facts": {
|
|
"task": "mmlu_global_facts",
|
|
"task_alias": "global_facts",
|
|
"group": "mmlu_other",
|
|
"group_alias": "other",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "global_facts",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about global facts.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_high_school_biology": {
|
|
"task": "mmlu_high_school_biology",
|
|
"task_alias": "high_school_biology",
|
|
"group": "mmlu_stem",
|
|
"group_alias": "stem",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "high_school_biology",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school biology.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_high_school_chemistry": {
|
|
"task": "mmlu_high_school_chemistry",
|
|
"task_alias": "high_school_chemistry",
|
|
"group": "mmlu_stem",
|
|
"group_alias": "stem",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "high_school_chemistry",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_high_school_computer_science": {
|
|
"task": "mmlu_high_school_computer_science",
|
|
"task_alias": "high_school_computer_science",
|
|
"group": "mmlu_stem",
|
|
"group_alias": "stem",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "high_school_computer_science",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school computer science.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_high_school_european_history": {
|
|
"task": "mmlu_high_school_european_history",
|
|
"task_alias": "high_school_european_history",
|
|
"group": "mmlu_humanities",
|
|
"group_alias": "humanities",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "high_school_european_history",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school european history.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_high_school_geography": {
|
|
"task": "mmlu_high_school_geography",
|
|
"task_alias": "high_school_geography",
|
|
"group": "mmlu_social_sciences",
|
|
"group_alias": "social_sciences",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "high_school_geography",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school geography.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_high_school_government_and_politics": {
|
|
"task": "mmlu_high_school_government_and_politics",
|
|
"task_alias": "high_school_government_and_politics",
|
|
"group": "mmlu_social_sciences",
|
|
"group_alias": "social_sciences",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "high_school_government_and_politics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_high_school_macroeconomics": {
|
|
"task": "mmlu_high_school_macroeconomics",
|
|
"task_alias": "high_school_macroeconomics",
|
|
"group": "mmlu_social_sciences",
|
|
"group_alias": "social_sciences",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "high_school_macroeconomics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_high_school_mathematics": {
|
|
"task": "mmlu_high_school_mathematics",
|
|
"task_alias": "high_school_mathematics",
|
|
"group": "mmlu_stem",
|
|
"group_alias": "stem",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "high_school_mathematics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_high_school_microeconomics": {
|
|
"task": "mmlu_high_school_microeconomics",
|
|
"task_alias": "high_school_microeconomics",
|
|
"group": "mmlu_social_sciences",
|
|
"group_alias": "social_sciences",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "high_school_microeconomics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_high_school_physics": {
|
|
"task": "mmlu_high_school_physics",
|
|
"task_alias": "high_school_physics",
|
|
"group": "mmlu_stem",
|
|
"group_alias": "stem",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "high_school_physics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school physics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_high_school_psychology": {
|
|
"task": "mmlu_high_school_psychology",
|
|
"task_alias": "high_school_psychology",
|
|
"group": "mmlu_social_sciences",
|
|
"group_alias": "social_sciences",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "high_school_psychology",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school psychology.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_high_school_statistics": {
|
|
"task": "mmlu_high_school_statistics",
|
|
"task_alias": "high_school_statistics",
|
|
"group": "mmlu_stem",
|
|
"group_alias": "stem",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "high_school_statistics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school statistics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_high_school_us_history": {
|
|
"task": "mmlu_high_school_us_history",
|
|
"task_alias": "high_school_us_history",
|
|
"group": "mmlu_humanities",
|
|
"group_alias": "humanities",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "high_school_us_history",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school us history.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_high_school_world_history": {
|
|
"task": "mmlu_high_school_world_history",
|
|
"task_alias": "high_school_world_history",
|
|
"group": "mmlu_humanities",
|
|
"group_alias": "humanities",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "high_school_world_history",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about high school world history.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_human_aging": {
|
|
"task": "mmlu_human_aging",
|
|
"task_alias": "human_aging",
|
|
"group": "mmlu_other",
|
|
"group_alias": "other",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "human_aging",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about human aging.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_human_sexuality": {
|
|
"task": "mmlu_human_sexuality",
|
|
"task_alias": "human_sexuality",
|
|
"group": "mmlu_social_sciences",
|
|
"group_alias": "social_sciences",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "human_sexuality",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about human sexuality.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_international_law": {
|
|
"task": "mmlu_international_law",
|
|
"task_alias": "international_law",
|
|
"group": "mmlu_humanities",
|
|
"group_alias": "humanities",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "international_law",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about international law.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_jurisprudence": {
|
|
"task": "mmlu_jurisprudence",
|
|
"task_alias": "jurisprudence",
|
|
"group": "mmlu_humanities",
|
|
"group_alias": "humanities",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "jurisprudence",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_logical_fallacies": {
|
|
"task": "mmlu_logical_fallacies",
|
|
"task_alias": "logical_fallacies",
|
|
"group": "mmlu_humanities",
|
|
"group_alias": "humanities",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "logical_fallacies",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_machine_learning": {
|
|
"task": "mmlu_machine_learning",
|
|
"task_alias": "machine_learning",
|
|
"group": "mmlu_stem",
|
|
"group_alias": "stem",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "machine_learning",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about machine learning.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_management": {
|
|
"task": "mmlu_management",
|
|
"task_alias": "management",
|
|
"group": "mmlu_other",
|
|
"group_alias": "other",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "management",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about management.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_marketing": {
|
|
"task": "mmlu_marketing",
|
|
"task_alias": "marketing",
|
|
"group": "mmlu_other",
|
|
"group_alias": "other",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "marketing",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about marketing.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_medical_genetics": {
|
|
"task": "mmlu_medical_genetics",
|
|
"task_alias": "medical_genetics",
|
|
"group": "mmlu_other",
|
|
"group_alias": "other",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "medical_genetics",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about medical genetics.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_miscellaneous": {
|
|
"task": "mmlu_miscellaneous",
|
|
"task_alias": "miscellaneous",
|
|
"group": "mmlu_other",
|
|
"group_alias": "other",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "miscellaneous",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_moral_disputes": {
|
|
"task": "mmlu_moral_disputes",
|
|
"task_alias": "moral_disputes",
|
|
"group": "mmlu_humanities",
|
|
"group_alias": "humanities",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "moral_disputes",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about moral disputes.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_moral_scenarios": {
|
|
"task": "mmlu_moral_scenarios",
|
|
"task_alias": "moral_scenarios",
|
|
"group": "mmlu_humanities",
|
|
"group_alias": "humanities",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "moral_scenarios",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_nutrition": {
|
|
"task": "mmlu_nutrition",
|
|
"task_alias": "nutrition",
|
|
"group": "mmlu_other",
|
|
"group_alias": "other",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "nutrition",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about nutrition.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_philosophy": {
|
|
"task": "mmlu_philosophy",
|
|
"task_alias": "philosophy",
|
|
"group": "mmlu_humanities",
|
|
"group_alias": "humanities",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "philosophy",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about philosophy.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_prehistory": {
|
|
"task": "mmlu_prehistory",
|
|
"task_alias": "prehistory",
|
|
"group": "mmlu_humanities",
|
|
"group_alias": "humanities",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "prehistory",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about prehistory.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_professional_accounting": {
|
|
"task": "mmlu_professional_accounting",
|
|
"task_alias": "professional_accounting",
|
|
"group": "mmlu_other",
|
|
"group_alias": "other",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "professional_accounting",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about professional accounting.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_professional_law": {
|
|
"task": "mmlu_professional_law",
|
|
"task_alias": "professional_law",
|
|
"group": "mmlu_humanities",
|
|
"group_alias": "humanities",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "professional_law",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about professional law.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_professional_medicine": {
|
|
"task": "mmlu_professional_medicine",
|
|
"task_alias": "professional_medicine",
|
|
"group": "mmlu_other",
|
|
"group_alias": "other",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "professional_medicine",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about professional medicine.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_professional_psychology": {
|
|
"task": "mmlu_professional_psychology",
|
|
"task_alias": "professional_psychology",
|
|
"group": "mmlu_social_sciences",
|
|
"group_alias": "social_sciences",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "professional_psychology",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about professional psychology.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_public_relations": {
|
|
"task": "mmlu_public_relations",
|
|
"task_alias": "public_relations",
|
|
"group": "mmlu_social_sciences",
|
|
"group_alias": "social_sciences",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "public_relations",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about public relations.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_security_studies": {
|
|
"task": "mmlu_security_studies",
|
|
"task_alias": "security_studies",
|
|
"group": "mmlu_social_sciences",
|
|
"group_alias": "social_sciences",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "security_studies",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about security studies.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_sociology": {
|
|
"task": "mmlu_sociology",
|
|
"task_alias": "sociology",
|
|
"group": "mmlu_social_sciences",
|
|
"group_alias": "social_sciences",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "sociology",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about sociology.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_us_foreign_policy": {
|
|
"task": "mmlu_us_foreign_policy",
|
|
"task_alias": "us_foreign_policy",
|
|
"group": "mmlu_social_sciences",
|
|
"group_alias": "social_sciences",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "us_foreign_policy",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_virology": {
|
|
"task": "mmlu_virology",
|
|
"task_alias": "virology",
|
|
"group": "mmlu_other",
|
|
"group_alias": "other",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "virology",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about virology.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
},
|
|
"mmlu_world_religions": {
|
|
"task": "mmlu_world_religions",
|
|
"task_alias": "world_religions",
|
|
"group": "mmlu_humanities",
|
|
"group_alias": "humanities",
|
|
"dataset_path": "hails/mmlu_no_train",
|
|
"dataset_name": "world_religions",
|
|
"test_split": "test",
|
|
"fewshot_split": "dev",
|
|
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
|
|
"doc_to_target": "answer",
|
|
"doc_to_choice": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"description": "The following are multiple choice questions (with answers) about world religions.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 0
|
|
}
|
|
}
|
|
},
|
|
"versions": {
|
|
"mmlu": "N/A",
|
|
"mmlu_abstract_algebra": "Yaml",
|
|
"mmlu_anatomy": "Yaml",
|
|
"mmlu_astronomy": "Yaml",
|
|
"mmlu_business_ethics": "Yaml",
|
|
"mmlu_clinical_knowledge": "Yaml",
|
|
"mmlu_college_biology": "Yaml",
|
|
"mmlu_college_chemistry": "Yaml",
|
|
"mmlu_college_computer_science": "Yaml",
|
|
"mmlu_college_mathematics": "Yaml",
|
|
"mmlu_college_medicine": "Yaml",
|
|
"mmlu_college_physics": "Yaml",
|
|
"mmlu_computer_security": "Yaml",
|
|
"mmlu_conceptual_physics": "Yaml",
|
|
"mmlu_econometrics": "Yaml",
|
|
"mmlu_electrical_engineering": "Yaml",
|
|
"mmlu_elementary_mathematics": "Yaml",
|
|
"mmlu_formal_logic": "Yaml",
|
|
"mmlu_global_facts": "Yaml",
|
|
"mmlu_high_school_biology": "Yaml",
|
|
"mmlu_high_school_chemistry": "Yaml",
|
|
"mmlu_high_school_computer_science": "Yaml",
|
|
"mmlu_high_school_european_history": "Yaml",
|
|
"mmlu_high_school_geography": "Yaml",
|
|
"mmlu_high_school_government_and_politics": "Yaml",
|
|
"mmlu_high_school_macroeconomics": "Yaml",
|
|
"mmlu_high_school_mathematics": "Yaml",
|
|
"mmlu_high_school_microeconomics": "Yaml",
|
|
"mmlu_high_school_physics": "Yaml",
|
|
"mmlu_high_school_psychology": "Yaml",
|
|
"mmlu_high_school_statistics": "Yaml",
|
|
"mmlu_high_school_us_history": "Yaml",
|
|
"mmlu_high_school_world_history": "Yaml",
|
|
"mmlu_human_aging": "Yaml",
|
|
"mmlu_human_sexuality": "Yaml",
|
|
"mmlu_humanities": "N/A",
|
|
"mmlu_international_law": "Yaml",
|
|
"mmlu_jurisprudence": "Yaml",
|
|
"mmlu_logical_fallacies": "Yaml",
|
|
"mmlu_machine_learning": "Yaml",
|
|
"mmlu_management": "Yaml",
|
|
"mmlu_marketing": "Yaml",
|
|
"mmlu_medical_genetics": "Yaml",
|
|
"mmlu_miscellaneous": "Yaml",
|
|
"mmlu_moral_disputes": "Yaml",
|
|
"mmlu_moral_scenarios": "Yaml",
|
|
"mmlu_nutrition": "Yaml",
|
|
"mmlu_other": "N/A",
|
|
"mmlu_philosophy": "Yaml",
|
|
"mmlu_prehistory": "Yaml",
|
|
"mmlu_professional_accounting": "Yaml",
|
|
"mmlu_professional_law": "Yaml",
|
|
"mmlu_professional_medicine": "Yaml",
|
|
"mmlu_professional_psychology": "Yaml",
|
|
"mmlu_public_relations": "Yaml",
|
|
"mmlu_security_studies": "Yaml",
|
|
"mmlu_social_sciences": "N/A",
|
|
"mmlu_sociology": "Yaml",
|
|
"mmlu_stem": "N/A",
|
|
"mmlu_us_foreign_policy": "Yaml",
|
|
"mmlu_virology": "Yaml",
|
|
"mmlu_world_religions": "Yaml"
|
|
},
|
|
"n-shot": {
|
|
"mmlu": 0,
|
|
"mmlu_abstract_algebra": 5,
|
|
"mmlu_anatomy": 5,
|
|
"mmlu_astronomy": 5,
|
|
"mmlu_business_ethics": 5,
|
|
"mmlu_clinical_knowledge": 5,
|
|
"mmlu_college_biology": 5,
|
|
"mmlu_college_chemistry": 5,
|
|
"mmlu_college_computer_science": 5,
|
|
"mmlu_college_mathematics": 5,
|
|
"mmlu_college_medicine": 5,
|
|
"mmlu_college_physics": 5,
|
|
"mmlu_computer_security": 5,
|
|
"mmlu_conceptual_physics": 5,
|
|
"mmlu_econometrics": 5,
|
|
"mmlu_electrical_engineering": 5,
|
|
"mmlu_elementary_mathematics": 5,
|
|
"mmlu_formal_logic": 5,
|
|
"mmlu_global_facts": 5,
|
|
"mmlu_high_school_biology": 5,
|
|
"mmlu_high_school_chemistry": 5,
|
|
"mmlu_high_school_computer_science": 5,
|
|
"mmlu_high_school_european_history": 5,
|
|
"mmlu_high_school_geography": 5,
|
|
"mmlu_high_school_government_and_politics": 5,
|
|
"mmlu_high_school_macroeconomics": 5,
|
|
"mmlu_high_school_mathematics": 5,
|
|
"mmlu_high_school_microeconomics": 5,
|
|
"mmlu_high_school_physics": 5,
|
|
"mmlu_high_school_psychology": 5,
|
|
"mmlu_high_school_statistics": 5,
|
|
"mmlu_high_school_us_history": 5,
|
|
"mmlu_high_school_world_history": 5,
|
|
"mmlu_human_aging": 5,
|
|
"mmlu_human_sexuality": 5,
|
|
"mmlu_humanities": 5,
|
|
"mmlu_international_law": 5,
|
|
"mmlu_jurisprudence": 5,
|
|
"mmlu_logical_fallacies": 5,
|
|
"mmlu_machine_learning": 5,
|
|
"mmlu_management": 5,
|
|
"mmlu_marketing": 5,
|
|
"mmlu_medical_genetics": 5,
|
|
"mmlu_miscellaneous": 5,
|
|
"mmlu_moral_disputes": 5,
|
|
"mmlu_moral_scenarios": 5,
|
|
"mmlu_nutrition": 5,
|
|
"mmlu_other": 5,
|
|
"mmlu_philosophy": 5,
|
|
"mmlu_prehistory": 5,
|
|
"mmlu_professional_accounting": 5,
|
|
"mmlu_professional_law": 5,
|
|
"mmlu_professional_medicine": 5,
|
|
"mmlu_professional_psychology": 5,
|
|
"mmlu_public_relations": 5,
|
|
"mmlu_security_studies": 5,
|
|
"mmlu_social_sciences": 5,
|
|
"mmlu_sociology": 5,
|
|
"mmlu_stem": 5,
|
|
"mmlu_us_foreign_policy": 5,
|
|
"mmlu_virology": 5,
|
|
"mmlu_world_religions": 5
|
|
},
|
|
"config": {
|
|
"model": "vllm",
|
|
"model_args": "pretrained=cognitivecomputations/dolphin-2.2-mistral-7b,tensor_parallel_size=4,dtype=auto,trust_remote_code=True,gpu_memory_utilization=0.8",
|
|
"batch_size": "8",
|
|
"batch_sizes": [],
|
|
"device": null,
|
|
"use_cache": null,
|
|
"limit": null,
|
|
"bootstrap_iters": 100000,
|
|
"gen_kwargs": null
|
|
},
|
|
"git_hash": "46c79664"
|
|
},
|
|
{
|
|
"results": {
|
|
"truthfulqa": {
|
|
"bleu_max,none": 18.554335172009438,
|
|
"bleu_max_stderr,none": 0.4870146522868547,
|
|
"bleu_acc,none": 0.4700122399020808,
|
|
"bleu_acc_stderr,none": 0.0003052705076523414,
|
|
"bleu_diff,none": 1.796550472428361,
|
|
"bleu_diff_stderr,none": 0.36884836086068395,
|
|
"rouge1_max,none": 43.34592888439128,
|
|
"rouge1_max_stderr,none": 0.7203233909280009,
|
|
"rouge1_acc,none": 0.48592411260709917,
|
|
"rouge1_acc_stderr,none": 0.00030612974190453806,
|
|
"rouge1_diff,none": 2.669412598334855,
|
|
"rouge1_diff_stderr,none": 0.7470375221454185,
|
|
"rouge2_max,none": 29.031317372267626,
|
|
"rouge2_max_stderr,none": 0.8811402776932646,
|
|
"rouge2_acc,none": 0.3929008567931457,
|
|
"rouge2_acc_stderr,none": 0.000292315898926905,
|
|
"rouge2_diff,none": 2.4113364755020705,
|
|
"rouge2_diff_stderr,none": 0.8330187812762287,
|
|
"rougeL_max,none": 39.92961806205577,
|
|
"rougeL_max_stderr,none": 0.7405383762139712,
|
|
"rougeL_acc,none": 0.46511627906976744,
|
|
"rougeL_acc_stderr,none": 0.000304881281879978,
|
|
"rougeL_diff,none": 2.3038129032098413,
|
|
"rougeL_diff_stderr,none": 0.7567374331096873,
|
|
"acc,none": 0.43053037942609285,
|
|
"acc_stderr,none": 0.05531283099906769,
|
|
"alias": "truthfulqa"
|
|
},
|
|
"truthfulqa_gen": {
|
|
"bleu_max,none": 18.554335172009438,
|
|
"bleu_max_stderr,none": 0.6978643509213339,
|
|
"bleu_acc,none": 0.4700122399020808,
|
|
"bleu_acc_stderr,none": 0.01747199209169754,
|
|
"bleu_diff,none": 1.796550472428361,
|
|
"bleu_diff_stderr,none": 0.6073288737254997,
|
|
"rouge1_max,none": 43.34592888439128,
|
|
"rouge1_max_stderr,none": 0.8487186759627721,
|
|
"rouge1_acc,none": 0.48592411260709917,
|
|
"rouge1_acc_stderr,none": 0.017496563717042786,
|
|
"rouge1_diff,none": 2.669412598334855,
|
|
"rouge1_diff_stderr,none": 0.8643133240587111,
|
|
"rouge2_max,none": 29.031317372267626,
|
|
"rouge2_max_stderr,none": 0.9386907252621944,
|
|
"rouge2_acc,none": 0.3929008567931457,
|
|
"rouge2_acc_stderr,none": 0.017097248285233065,
|
|
"rouge2_diff,none": 2.4113364755020705,
|
|
"rouge2_diff_stderr,none": 0.9126986256570285,
|
|
"rougeL_max,none": 39.92961806205577,
|
|
"rougeL_max_stderr,none": 0.8605453946271349,
|
|
"rougeL_acc,none": 0.46511627906976744,
|
|
"rougeL_acc_stderr,none": 0.01746084997587397,
|
|
"rougeL_diff,none": 2.3038129032098413,
|
|
"rougeL_diff_stderr,none": 0.8699065657354744,
|
|
"alias": " - truthfulqa_gen"
|
|
},
|
|
"truthfulqa_mc1": {
|
|
"acc,none": 0.37454100367197063,
|
|
"acc_stderr,none": 0.016943535128405303,
|
|
"alias": " - truthfulqa_mc1"
|
|
},
|
|
"truthfulqa_mc2": {
|
|
"acc,none": 0.5425091309343373,
|
|
"acc_stderr,none": 0.015548945177533002,
|
|
"alias": " - truthfulqa_mc2"
|
|
}
|
|
},
|
|
"groups": {
|
|
"truthfulqa": {
|
|
"bleu_max,none": 18.554335172009438,
|
|
"bleu_max_stderr,none": 0.4870146522868547,
|
|
"bleu_acc,none": 0.4700122399020808,
|
|
"bleu_acc_stderr,none": 0.0003052705076523414,
|
|
"bleu_diff,none": 1.796550472428361,
|
|
"bleu_diff_stderr,none": 0.36884836086068395,
|
|
"rouge1_max,none": 43.34592888439128,
|
|
"rouge1_max_stderr,none": 0.7203233909280009,
|
|
"rouge1_acc,none": 0.48592411260709917,
|
|
"rouge1_acc_stderr,none": 0.00030612974190453806,
|
|
"rouge1_diff,none": 2.669412598334855,
|
|
"rouge1_diff_stderr,none": 0.7470375221454185,
|
|
"rouge2_max,none": 29.031317372267626,
|
|
"rouge2_max_stderr,none": 0.8811402776932646,
|
|
"rouge2_acc,none": 0.3929008567931457,
|
|
"rouge2_acc_stderr,none": 0.000292315898926905,
|
|
"rouge2_diff,none": 2.4113364755020705,
|
|
"rouge2_diff_stderr,none": 0.8330187812762287,
|
|
"rougeL_max,none": 39.92961806205577,
|
|
"rougeL_max_stderr,none": 0.7405383762139712,
|
|
"rougeL_acc,none": 0.46511627906976744,
|
|
"rougeL_acc_stderr,none": 0.000304881281879978,
|
|
"rougeL_diff,none": 2.3038129032098413,
|
|
"rougeL_diff_stderr,none": 0.7567374331096873,
|
|
"acc,none": 0.43053037942609285,
|
|
"acc_stderr,none": 0.05531283099906769,
|
|
"alias": "truthfulqa"
|
|
}
|
|
},
|
|
"configs": {
|
|
"truthfulqa_gen": {
|
|
"task": "truthfulqa_gen",
|
|
"group": [
|
|
"truthfulqa"
|
|
],
|
|
"dataset_path": "truthful_qa",
|
|
"dataset_name": "generation",
|
|
"validation_split": "validation",
|
|
"process_docs": "<function process_docs_gen at 0x7f2a97c39260>",
|
|
"doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question}}",
|
|
"doc_to_target": " ",
|
|
"process_results": "<function process_results_gen at 0x7f2a97c398a0>",
|
|
"description": "",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "bleu_max",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
},
|
|
{
|
|
"metric": "bleu_acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
},
|
|
{
|
|
"metric": "bleu_diff",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
},
|
|
{
|
|
"metric": "rouge1_max",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
},
|
|
{
|
|
"metric": "rouge1_acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
},
|
|
{
|
|
"metric": "rouge1_diff",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
},
|
|
{
|
|
"metric": "rouge2_max",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
},
|
|
{
|
|
"metric": "rouge2_acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
},
|
|
{
|
|
"metric": "rouge2_diff",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
},
|
|
{
|
|
"metric": "rougeL_max",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
},
|
|
{
|
|
"metric": "rougeL_acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
},
|
|
{
|
|
"metric": "rougeL_diff",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "generate_until",
|
|
"generation_kwargs": {
|
|
"until": [
|
|
"\n\n"
|
|
],
|
|
"do_sample": false
|
|
},
|
|
"repeats": 1,
|
|
"should_decontaminate": true,
|
|
"doc_to_decontamination_query": "question",
|
|
"metadata": {
|
|
"version": 2
|
|
}
|
|
},
|
|
"truthfulqa_mc1": {
|
|
"task": "truthfulqa_mc1",
|
|
"group": [
|
|
"truthfulqa"
|
|
],
|
|
"dataset_path": "truthful_qa",
|
|
"dataset_name": "multiple_choice",
|
|
"validation_split": "validation",
|
|
"doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}",
|
|
"doc_to_target": 0,
|
|
"doc_to_choice": "{{mc1_targets.choices}}",
|
|
"description": "",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": true,
|
|
"doc_to_decontamination_query": "question",
|
|
"metadata": {
|
|
"version": 2
|
|
}
|
|
},
|
|
"truthfulqa_mc2": {
|
|
"task": "truthfulqa_mc2",
|
|
"group": [
|
|
"truthfulqa"
|
|
],
|
|
"dataset_path": "truthful_qa",
|
|
"dataset_name": "multiple_choice",
|
|
"validation_split": "validation",
|
|
"doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}",
|
|
"doc_to_target": 0,
|
|
"doc_to_choice": "{{mc2_targets.choices}}",
|
|
"process_results": "<function process_results_mc2 at 0x7f2a97c39b20>",
|
|
"description": "",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"num_fewshot": 0,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": true,
|
|
"doc_to_decontamination_query": "question",
|
|
"metadata": {
|
|
"version": 2
|
|
}
|
|
}
|
|
},
|
|
"versions": {
|
|
"truthfulqa": "N/A",
|
|
"truthfulqa_gen": "Yaml",
|
|
"truthfulqa_mc1": "Yaml",
|
|
"truthfulqa_mc2": "Yaml"
|
|
},
|
|
"n-shot": {
|
|
"truthfulqa": 0,
|
|
"truthfulqa_gen": 0,
|
|
"truthfulqa_mc1": 0,
|
|
"truthfulqa_mc2": 0
|
|
},
|
|
"config": {
|
|
"model": "vllm",
|
|
"model_args": "pretrained=cognitivecomputations/dolphin-2.2-mistral-7b,tensor_parallel_size=4,dtype=auto,trust_remote_code=True,gpu_memory_utilization=0.8",
|
|
"batch_size": "8",
|
|
"batch_sizes": [],
|
|
"device": null,
|
|
"use_cache": null,
|
|
"limit": null,
|
|
"bootstrap_iters": 100000,
|
|
"gen_kwargs": null
|
|
},
|
|
"git_hash": "46c79664"
|
|
},
|
|
{
|
|
"results": {
|
|
"winogrande": {
|
|
"acc,none": 0.7505919494869772,
|
|
"acc_stderr,none": 0.012160189196930685,
|
|
"alias": "winogrande"
|
|
}
|
|
},
|
|
"configs": {
|
|
"winogrande": {
|
|
"task": "winogrande",
|
|
"dataset_path": "winogrande",
|
|
"dataset_name": "winogrande_xl",
|
|
"training_split": "train",
|
|
"validation_split": "validation",
|
|
"doc_to_text": "<function doc_to_text at 0x7fc282eca700>",
|
|
"doc_to_target": "<function doc_to_target at 0x7fc282ecaa20>",
|
|
"doc_to_choice": "<function doc_to_choice at 0x7fc282ecad40>",
|
|
"description": "",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"num_fewshot": 5,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "mean",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "multiple_choice",
|
|
"repeats": 1,
|
|
"should_decontaminate": true,
|
|
"doc_to_decontamination_query": "sentence",
|
|
"metadata": {
|
|
"version": 1
|
|
}
|
|
}
|
|
},
|
|
"versions": {
|
|
"winogrande": "Yaml"
|
|
},
|
|
"n-shot": {
|
|
"winogrande": 5
|
|
},
|
|
"config": {
|
|
"model": "vllm",
|
|
"model_args": "pretrained=cognitivecomputations/dolphin-2.2-mistral-7b,tensor_parallel_size=4,dtype=auto,trust_remote_code=True,gpu_memory_utilization=0.8",
|
|
"batch_size": "8",
|
|
"batch_sizes": [],
|
|
"device": null,
|
|
"use_cache": null,
|
|
"limit": null,
|
|
"bootstrap_iters": 100000,
|
|
"gen_kwargs": null
|
|
},
|
|
"git_hash": "46c79664"
|
|
}
|
|
]
|