| task |
dataset |
metrics |
source |
| type |
name |
| text-generation |
Text Generation |
|
| name |
type |
config |
split |
args |
| AI2 Reasoning Challenge (25-Shot) |
ai2_arc |
ARC-Challenge |
test |
|
|
| type |
value |
name |
| acc_norm |
24.74 |
normalized accuracy |
|
|
|
|
| task |
dataset |
metrics |
source |
| type |
name |
| text-generation |
Text Generation |
|
| name |
type |
split |
args |
| HellaSwag (10-Shot) |
hellaswag |
validation |
|
|
| type |
value |
name |
| acc_norm |
35.29 |
normalized accuracy |
|
|
|
|
| task |
dataset |
metrics |
source |
| type |
name |
| text-generation |
Text Generation |
|
| name |
type |
config |
split |
args |
| MMLU (5-Shot) |
cais/mmlu |
all |
test |
|
|
| type |
value |
name |
| acc |
26.13 |
accuracy |
|
|
|
|
| task |
dataset |
metrics |
source |
| type |
name |
| text-generation |
Text Generation |
|
| name |
type |
config |
split |
args |
| TruthfulQA (0-shot) |
truthful_qa |
multiple_choice |
validation |
|
|
|
|
|
| task |
dataset |
metrics |
source |
| type |
name |
| text-generation |
Text Generation |
|
| name |
type |
config |
split |
args |
| Winogrande (5-shot) |
winogrande |
winogrande_xl |
validation |
|
|
| type |
value |
name |
| acc |
51.3 |
accuracy |
|
|
|
|
| task |
dataset |
metrics |
source |
| type |
name |
| text-generation |
Text Generation |
|
| name |
type |
config |
split |
args |
| GSM8k (5-shot) |
gsm8k |
main |
test |
|
|
| type |
value |
name |
| acc |
0.0 |
accuracy |
|
|
|
|
| task |
dataset |
metrics |
source |
| type |
name |
| text-generation |
Text Generation |
|
| name |
type |
args |
| IFEval (0-Shot) |
HuggingFaceH4/ifeval |
|
|
| type |
value |
name |
| inst_level_strict_acc and prompt_level_strict_acc |
15.75 |
strict accuracy |
|
|
|
|
| task |
dataset |
metrics |
source |
| type |
name |
| text-generation |
Text Generation |
|
| name |
type |
args |
| BBH (3-Shot) |
BBH |
|
|
| type |
value |
name |
| acc_norm |
3.17 |
normalized accuracy |
|
|
|
|
| task |
dataset |
metrics |
source |
| type |
name |
| text-generation |
Text Generation |
|
| name |
type |
args |
| MATH Lvl 5 (4-Shot) |
hendrycks/competition_math |
|
|
| type |
value |
name |
| exact_match |
0.0 |
exact match |
|
|
|
|
| task |
dataset |
metrics |
source |
| type |
name |
| text-generation |
Text Generation |
|
| name |
type |
args |
| GPQA (0-shot) |
Idavidrein/gpqa |
|
|
| type |
value |
name |
| acc_norm |
1.01 |
acc_norm |
|
|
|
|
| task |
dataset |
metrics |
source |
| type |
name |
| text-generation |
Text Generation |
|
| name |
type |
args |
| MuSR (0-shot) |
TAUR-Lab/MuSR |
|
|
| type |
value |
name |
| acc_norm |
3.17 |
acc_norm |
|
|
|
|
| task |
dataset |
metrics |
source |
| type |
name |
| text-generation |
Text Generation |
|
| name |
type |
config |
split |
args |
| MMLU-PRO (5-shot) |
TIGER-Lab/MMLU-Pro |
main |
test |
|
|
| type |
value |
name |
| acc |
1.51 |
accuracy |
|
|
|
|