191 lines
9.6 KiB
YAML
191 lines
9.6 KiB
YAML
evaluations:
|
|
arc_challenge_poly_pt_acc: 0.32905982905982906
|
|
arc_challenge_poly_pt_acc_norm: 0.37435897435897436
|
|
arc_challenge_poly_pt_acc_norm_stderr: 0.014154661190814505
|
|
arc_challenge_poly_pt_acc_stderr: 0.013742700308521677
|
|
arc_challenge_poly_pt_alias: arc_challenge_poly_pt
|
|
assin2_rte_acc,all: 0.610702614379085
|
|
assin2_rte_acc_stderr,all: 0.006976455542871003
|
|
assin2_rte_alias: assin2_rte
|
|
assin2_rte_f1_macro,all: 0.5493894012220668
|
|
assin2_rte_f1_macro_stderr,all: 0.007313213532417135
|
|
assin2_sts_alias: assin2_sts
|
|
assin2_sts_mse,all: 1.954166666666667
|
|
assin2_sts_mse_stderr,all: N/A
|
|
assin2_sts_pearson,all: 0.06429110147600466
|
|
assin2_sts_pearson_stderr,all: 0.013460910678684006
|
|
assin_entailment_acc: 0.70675
|
|
assin_entailment_acc_stderr: 0.007199067024031941
|
|
assin_entailment_alias: assin_entailment
|
|
assin_paraphrase_acc: 0.72675
|
|
assin_paraphrase_acc_stderr: 0.007046880444991351
|
|
assin_paraphrase_alias: assin_paraphrase
|
|
belebele_por_Latn_acc: 0.5388888888888889
|
|
belebele_por_Latn_acc_norm: 0.5388888888888889
|
|
belebele_por_Latn_acc_norm_stderr: 0.016625417583086437
|
|
belebele_por_Latn_acc_stderr: 0.016625417583086437
|
|
belebele_por_Latn_alias: belebele_por_Latn
|
|
bluex_acc,all: 0.46870653685674546
|
|
bluex_acc,exam_id__UNICAMP_2018: 0.46296296296296297
|
|
bluex_acc,exam_id__UNICAMP_2019: 0.46
|
|
bluex_acc,exam_id__UNICAMP_2020: 0.5636363636363636
|
|
bluex_acc,exam_id__UNICAMP_2021_1: 0.3695652173913043
|
|
bluex_acc,exam_id__UNICAMP_2021_2: 0.37254901960784315
|
|
bluex_acc,exam_id__UNICAMP_2022: 0.5641025641025641
|
|
bluex_acc,exam_id__UNICAMP_2023: 0.6046511627906976
|
|
bluex_acc,exam_id__UNICAMP_2024: 0.5777777777777777
|
|
bluex_acc,exam_id__USP_2018: 0.37037037037037035
|
|
bluex_acc,exam_id__USP_2019: 0.475
|
|
bluex_acc,exam_id__USP_2020: 0.4642857142857143
|
|
bluex_acc,exam_id__USP_2021: 0.34615384615384615
|
|
bluex_acc,exam_id__USP_2022: 0.3877551020408163
|
|
bluex_acc,exam_id__USP_2023: 0.5227272727272727
|
|
bluex_acc,exam_id__USP_2024: 0.5609756097560976
|
|
bluex_acc_stderr,all: 0.010751763481122825
|
|
bluex_acc_stderr,exam_id__UNICAMP_2018: 0.03908140507987583
|
|
bluex_acc_stderr,exam_id__UNICAMP_2019: 0.04086284322684593
|
|
bluex_acc_stderr,exam_id__UNICAMP_2020: 0.03854268045176616
|
|
bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.041049198501562786
|
|
bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.03903627297829125
|
|
bluex_acc_stderr,exam_id__UNICAMP_2022: 0.046067534061712136
|
|
bluex_acc_stderr,exam_id__UNICAMP_2023: 0.043110276875963825
|
|
bluex_acc_stderr,exam_id__UNICAMP_2024: 0.04235722643358554
|
|
bluex_acc_stderr,exam_id__USP_2018: 0.03788178159383037
|
|
bluex_acc_stderr,exam_id__USP_2019: 0.04557332896428892
|
|
bluex_acc_stderr,exam_id__USP_2020: 0.038341658891473986
|
|
bluex_acc_stderr,exam_id__USP_2021: 0.038139467533341624
|
|
bluex_acc_stderr,exam_id__USP_2022: 0.040012931518371306
|
|
bluex_acc_stderr,exam_id__USP_2023: 0.043422447984322925
|
|
bluex_acc_stderr,exam_id__USP_2024: 0.044825815810456374
|
|
bluex_alias: bluex
|
|
calame_pt_acc: 0.5867052023121387
|
|
calame_pt_acc_stderr: 0.01081012929476997
|
|
calame_pt_alias: calame_pt
|
|
calame_pt_perplexity: 7.2359833214160085
|
|
calame_pt_perplexity_stderr: 0.42482617041660564
|
|
enem_challenge_acc,all: 0.5514345696291113
|
|
enem_challenge_acc,exam_id__2009: 0.5043478260869565
|
|
enem_challenge_acc,exam_id__2010: 0.5726495726495726
|
|
enem_challenge_acc,exam_id__2011: 0.6153846153846154
|
|
enem_challenge_acc,exam_id__2012: 0.5344827586206896
|
|
enem_challenge_acc,exam_id__2013: 0.5833333333333334
|
|
enem_challenge_acc,exam_id__2014: 0.5688073394495413
|
|
enem_challenge_acc,exam_id__2015: 0.4789915966386555
|
|
enem_challenge_acc,exam_id__2016: 0.5371900826446281
|
|
enem_challenge_acc,exam_id__2016_2: 0.5284552845528455
|
|
enem_challenge_acc,exam_id__2017: 0.5086206896551724
|
|
enem_challenge_acc,exam_id__2022: 0.5639097744360902
|
|
enem_challenge_acc,exam_id__2023: 0.6148148148148148
|
|
enem_challenge_acc_stderr,all: 0.007618623435000518
|
|
enem_challenge_acc_stderr,exam_id__2009: 0.026949045711548218
|
|
enem_challenge_acc_stderr,exam_id__2010: 0.026429279577735547
|
|
enem_challenge_acc_stderr,exam_id__2011: 0.02589822342752356
|
|
enem_challenge_acc_stderr,exam_id__2012: 0.02669879193984667
|
|
enem_challenge_acc_stderr,exam_id__2013: 0.02740882906544383
|
|
enem_challenge_acc_stderr,exam_id__2014: 0.027336979010138398
|
|
enem_challenge_acc_stderr,exam_id__2015: 0.026385065690201895
|
|
enem_challenge_acc_stderr,exam_id__2016: 0.026088144599797948
|
|
enem_challenge_acc_stderr,exam_id__2016_2: 0.02606861159558674
|
|
enem_challenge_acc_stderr,exam_id__2017: 0.02684138658677631
|
|
enem_challenge_acc_stderr,exam_id__2022: 0.02485425227418429
|
|
enem_challenge_acc_stderr,exam_id__2023: 0.024259924778206914
|
|
enem_challenge_alias: enem
|
|
faquad_nli_acc,all: 0.7846153846153846
|
|
faquad_nli_acc_stderr,all: 0.011396120309131366
|
|
faquad_nli_alias: faquad_nli
|
|
faquad_nli_f1_macro,all: 0.4396551724137931
|
|
faquad_nli_f1_macro_stderr,all: 0.00357969847290883
|
|
global_piqa_completions_por_latn_braz_acc: 0.79
|
|
global_piqa_completions_por_latn_braz_acc_bytes: 0.75
|
|
global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.04351941398892446
|
|
global_piqa_completions_por_latn_braz_acc_norm: 0.74
|
|
global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.0440844002276808
|
|
global_piqa_completions_por_latn_braz_acc_stderr: 0.040936018074033236
|
|
global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz
|
|
hatebr_offensive_acc,all: 0.7964285714285714
|
|
hatebr_offensive_acc_stderr,all: 0.007613133172324561
|
|
hatebr_offensive_alias: hatebr_offensive_binary
|
|
hatebr_offensive_f1_macro,all: 0.7963528271484649
|
|
hatebr_offensive_f1_macro_stderr,all: 0.00761763073122653
|
|
hellaswag_poly_pt_acc: 0.3776140426915159
|
|
hellaswag_poly_pt_acc_norm: 0.48434283237620546
|
|
hellaswag_poly_pt_acc_norm_stderr: 0.005202393220555622
|
|
hellaswag_poly_pt_acc_stderr: 0.005046614926940191
|
|
hellaswag_poly_pt_alias: hellaswag_poly_pt
|
|
lambada_poly_pt_acc: 0.4513875412381137
|
|
lambada_poly_pt_acc_stderr: 0.006932975888368315
|
|
lambada_poly_pt_alias: lambada_poly_pt
|
|
lambada_poly_pt_perplexity: 15.604144991512424
|
|
lambada_poly_pt_perplexity_stderr: 0.5333901055397442
|
|
mmlu_poly_pt_acc: 0.39680276193335334
|
|
mmlu_poly_pt_acc_stderr: 0.0042385372220186505
|
|
mmlu_poly_pt_alias: mmlu_poly_pt
|
|
oab_exams_acc,all: 0.40364464692482915
|
|
oab_exams_acc,exam_id__2010-01: 0.4
|
|
oab_exams_acc,exam_id__2010-02: 0.4
|
|
oab_exams_acc,exam_id__2011-03: 0.3434343434343434
|
|
oab_exams_acc,exam_id__2011-04: 0.4125
|
|
oab_exams_acc,exam_id__2011-05: 0.425
|
|
oab_exams_acc,exam_id__2012-06: 0.375
|
|
oab_exams_acc,exam_id__2012-06a: 0.475
|
|
oab_exams_acc,exam_id__2012-07: 0.3125
|
|
oab_exams_acc,exam_id__2012-08: 0.425
|
|
oab_exams_acc,exam_id__2012-09: 0.3246753246753247
|
|
oab_exams_acc,exam_id__2013-10: 0.425
|
|
oab_exams_acc,exam_id__2013-11: 0.4375
|
|
oab_exams_acc,exam_id__2013-12: 0.4375
|
|
oab_exams_acc,exam_id__2014-13: 0.425
|
|
oab_exams_acc,exam_id__2014-14: 0.4125
|
|
oab_exams_acc,exam_id__2014-15: 0.41025641025641024
|
|
oab_exams_acc,exam_id__2015-16: 0.35
|
|
oab_exams_acc,exam_id__2015-17: 0.48717948717948717
|
|
oab_exams_acc,exam_id__2015-18: 0.3375
|
|
oab_exams_acc,exam_id__2016-19: 0.44871794871794873
|
|
oab_exams_acc,exam_id__2016-20: 0.3875
|
|
oab_exams_acc,exam_id__2016-20a: 0.3625
|
|
oab_exams_acc,exam_id__2016-21: 0.425
|
|
oab_exams_acc,exam_id__2017-22: 0.425
|
|
oab_exams_acc,exam_id__2017-23: 0.425
|
|
oab_exams_acc,exam_id__2017-24: 0.3875
|
|
oab_exams_acc,exam_id__2018-25: 0.4375
|
|
oab_exams_acc_stderr,all: 0.006053332133276562
|
|
oab_exams_acc_stderr,exam_id__2010-01: 0.030610161516106096
|
|
oab_exams_acc_stderr,exam_id__2010-02: 0.028296630437280906
|
|
oab_exams_acc_stderr,exam_id__2011-03: 0.027549621066797744
|
|
oab_exams_acc_stderr,exam_id__2011-04: 0.031730143003018005
|
|
oab_exams_acc_stderr,exam_id__2011-05: 0.03187425289992289
|
|
oab_exams_acc_stderr,exam_id__2012-06: 0.031303363618806604
|
|
oab_exams_acc_stderr,exam_id__2012-06a: 0.03204492958151872
|
|
oab_exams_acc_stderr,exam_id__2012-07: 0.030024899757931458
|
|
oab_exams_acc_stderr,exam_id__2012-08: 0.03186228006495766
|
|
oab_exams_acc_stderr,exam_id__2012-09: 0.030794179748498546
|
|
oab_exams_acc_stderr,exam_id__2013-10: 0.03186391626480471
|
|
oab_exams_acc_stderr,exam_id__2013-11: 0.03202492621379354
|
|
oab_exams_acc_stderr,exam_id__2013-12: 0.03195252134301974
|
|
oab_exams_acc_stderr,exam_id__2014-13: 0.03180998638392341
|
|
oab_exams_acc_stderr,exam_id__2014-14: 0.03183014975650971
|
|
oab_exams_acc_stderr,exam_id__2014-15: 0.03207677106125773
|
|
oab_exams_acc_stderr,exam_id__2015-16: 0.030649958521189145
|
|
oab_exams_acc_stderr,exam_id__2015-17: 0.03266206847243709
|
|
oab_exams_acc_stderr,exam_id__2015-18: 0.030453791802761587
|
|
oab_exams_acc_stderr,exam_id__2016-19: 0.03245421470850268
|
|
oab_exams_acc_stderr,exam_id__2016-20: 0.03147299166016239
|
|
oab_exams_acc_stderr,exam_id__2016-20a: 0.031016128861774545
|
|
oab_exams_acc_stderr,exam_id__2016-21: 0.03173265714315922
|
|
oab_exams_acc_stderr,exam_id__2017-22: 0.03191749762684417
|
|
oab_exams_acc_stderr,exam_id__2017-23: 0.0320020087581464
|
|
oab_exams_acc_stderr,exam_id__2017-24: 0.031539234776498976
|
|
oab_exams_acc_stderr,exam_id__2018-25: 0.03202176928915031
|
|
oab_exams_alias: oab_exams
|
|
portuguese_hate_speech_acc,all: 0.43478260869565216
|
|
portuguese_hate_speech_acc_stderr,all: 0.012048917796997311
|
|
portuguese_hate_speech_alias: portuguese_hate_speech_binary
|
|
portuguese_hate_speech_f1_macro,all: 0.42838929145469706
|
|
portuguese_hate_speech_f1_macro_stderr,all: 0.011986624066160297
|
|
tweetsentbr_acc,all: 0.4736318407960199
|
|
tweetsentbr_acc_stderr,all: 0.007893686144300348
|
|
tweetsentbr_alias: tweetsentbr
|
|
tweetsentbr_f1_macro,all: 0.2678826295357763
|
|
tweetsentbr_f1_macro_stderr,all: 0.005779014212286808
|
|
step: 50000
|