[ { "task": "hellaswag", "benchmark": "HellaSwag", "metric": "acc_norm", "score": 0.26707827126070505, "shots": 0, "runtime_sec": 318.67, "status": "success" }, { "task": "piqa", "benchmark": "PIQA", "metric": "acc_norm", "score": 0.5386289445048966, "shots": 0, "runtime_sec": 38.85, "status": "success" }, { "task": "winogrande", "benchmark": "WinoGrande", "metric": "acc", "score": 0.5067087608524072, "shots": 0, "runtime_sec": 23.73, "status": "success" }, { "task": "boolq", "benchmark": "BoolQ", "metric": "acc", "score": 0.40214067278287463, "shots": 0, "runtime_sec": 144.8, "status": "success" }, { "task": "arc_easy", "benchmark": "ARC-Easy", "metric": "acc_norm", "score": 0.3468013468013468, "shots": 0, "runtime_sec": 51.41, "status": "success" }, { "task": "arc_challenge", "benchmark": "ARC-Challenge", "metric": "acc_norm", "score": 0.25597269624573377, "shots": 0, "runtime_sec": 37.69, "status": "success" }, { "task": "openbookqa", "benchmark": "OpenBookQA", "metric": "acc_norm", "score": 0.25, "shots": 0, "runtime_sec": 21.14, "status": "success" }, { "task": "commonsense_qa", "benchmark": "CommonsenseQA", "metric": "acc", "score": 0.2031122031122031, "shots": 0, "runtime_sec": 27.66, "status": "success" }, { "task": "lambada_openai", "benchmark": "LAMBADA", "metric": "acc", "score": 0.0023287405394915583, "shots": 0, "runtime_sec": 96.28, "status": "success" }, { "task": "blimp", "benchmark": "BLiMP", "metric": "acc", "score": 0.5923432835820895, "shots": 0, "runtime_sec": 354.79, "status": "success" }, { "task": "mmlu", "benchmark": "MMLU", "metric": "acc", "score": 0.23892607890613873, "shots": 0, "runtime_sec": 388.62, "status": "success" }, { "task": "wikitext", "benchmark": "WikiText-2", "metric": "word_perplexity", "score": 12524.42105099034, "shots": 0, "runtime_sec": 182.89, "status": "success" }, { "task": "wikitext", "benchmark": "WikiText-2", "metric": "byte_perplexity", "score": 5.838498405241562, "shots": 0, "runtime_sec": 181.42, "status": "success" }, { "task": "sciq", "benchmark": "SciQ", "metric": "acc_norm", "score": 0.356, "shots": 0, "runtime_sec": 87.15, "status": "success" }, { "task": "copa", "benchmark": "COPA", "metric": "acc", "score": 0.64, "shots": 0, "runtime_sec": 17.21, "status": "success" }, { "task": "race", "benchmark": "RACE", "metric": "acc", "score": 0.23157894736842105, "shots": 0, "runtime_sec": 334.7, "status": "success" }, { "task": "swag", "benchmark": "SWAG", "metric": "acc_norm", "score": 0.2912626212136359, "shots": 0, "runtime_sec": 252.0, "status": "success" }, { "task": "truthfulqa_mc2", "benchmark": "TruthfulQA MC2", "metric": "acc", "score": 0.48740972804833826, "shots": 0, "runtime_sec": 126.29, "status": "success" } ]