| bigbench_causal_judgement | 0 | multiple_choice_grade | 0.5684 | ± | 0.0360 |
| bigbench_date_understanding | 0 | multiple_choice_grade | 0.6612 | ± | 0.0247 |
| bigbench_disambiguation_qa | 0 | multiple_choice_grade | 0.4380 | ± | 0.0309 |
| bigbench_geometric_shapes | 0 | multiple_choice_grade | 0.2173 | ± | 0.0218 |
| | | exact_str_match | 0.0000 | ± | 0.0000 |
| bigbench_logical_deduction_five_objects | 0 | multiple_choice_grade | 0.3320 | ± | 0.0211 |
| bigbench_logical_deduction_seven_objects | 0 | multiple_choice_grade | 0.2243 | ± | 0.0158 |
| bigbench_logical_deduction_three_objects | 0 | multiple_choice_grade | 0.5667 | ± | 0.0287 |
| bigbench_movie_recommendation | 0 | multiple_choice_grade | 0.4260 | ± | 0.0221 |
| bigbench_navigate | 0 | multiple_choice_grade | 0.5310 | ± | 0.0158 |
| bigbench_reasoning_about_colored_objects | 0 | multiple_choice_grade | 0.7230 | ± | 0.0100 |
| bigbench_ruin_names | 0 | multiple_choice_grade | 0.5379 | ± | 0.0236 |
| bigbench_salient_translation_error_detection | 0 | multiple_choice_grade | 0.2956 | ± | 0.0145 |
| bigbench_snarks | 0 | multiple_choice_grade | 0.6961 | ± | 0.0343 |
| bigbench_sports_understanding | 0 | multiple_choice_grade | 0.7424 | ± | 0.0139 |
| bigbench_temporal_sequences | 0 | multiple_choice_grade | 0.4690 | ± | 0.0158 |
| bigbench_tracking_shuffled_objects_five_objects | 0 | multiple_choice_grade | 0.2304 | ± | 0.0119 |
| bigbench_tracking_shuffled_objects_seven_objects | 0 | multiple_choice_grade | 0.1880 | ± | 0.0093 |
| bigbench_tracking_shuffled_objects_three_objects | 0 | multiple_choice_grade | 0.5667 | ± | 0.0287 |