208 lines
10 KiB
YAML
208 lines
10 KiB
YAML
|
|
evaluations:
|
||
|
|
arc_challenge_poly_pt_acc: 0.30427350427350425
|
||
|
|
arc_challenge_poly_pt_acc_norm: 0.32735042735042735
|
||
|
|
arc_challenge_poly_pt_acc_norm_stderr: 0.013724408490743929
|
||
|
|
arc_challenge_poly_pt_acc_stderr: 0.013456870841978025
|
||
|
|
arc_challenge_poly_pt_alias: arc_challenge_poly_pt
|
||
|
|
assin2_rte_acc,all: 0.5
|
||
|
|
assin2_rte_acc_stderr,all: 0.007138073526203373
|
||
|
|
assin2_rte_alias: assin2_rte
|
||
|
|
assin2_rte_f1_macro,all: 0.3333333333333333
|
||
|
|
assin2_rte_f1_macro_stderr,all: 0.003173025139437921
|
||
|
|
assin2_sts_alias: assin2_sts
|
||
|
|
assin2_sts_mse,all: 3.059456699346405
|
||
|
|
assin2_sts_mse_stderr,all: N/A
|
||
|
|
assin2_sts_pearson,all: 0.0276213074062244
|
||
|
|
assin2_sts_pearson_stderr,all: 0.011665375504884477
|
||
|
|
assin_entailment_acc: 0.7335
|
||
|
|
assin_entailment_acc_stderr: 0.006991541883910775
|
||
|
|
assin_entailment_alias: assin_entailment
|
||
|
|
assin_paraphrase_acc: 0.71575
|
||
|
|
assin_paraphrase_acc_stderr: 0.0071327206100355
|
||
|
|
assin_paraphrase_alias: assin_paraphrase
|
||
|
|
belebele_por_Latn_acc: 0.3611111111111111
|
||
|
|
belebele_por_Latn_acc_norm: 0.3611111111111111
|
||
|
|
belebele_por_Latn_acc_norm_stderr: 0.016019658270537297
|
||
|
|
belebele_por_Latn_acc_stderr: 0.016019658270537297
|
||
|
|
belebele_por_Latn_alias: belebele_por_Latn
|
||
|
|
bluex_acc,all: 0.34492350486787204
|
||
|
|
bluex_acc,exam_id__UNICAMP_2018: 0.4074074074074074
|
||
|
|
bluex_acc,exam_id__UNICAMP_2019: 0.28
|
||
|
|
bluex_acc,exam_id__UNICAMP_2020: 0.36363636363636365
|
||
|
|
bluex_acc,exam_id__UNICAMP_2021_1: 0.2391304347826087
|
||
|
|
bluex_acc,exam_id__UNICAMP_2021_2: 0.23529411764705882
|
||
|
|
bluex_acc,exam_id__UNICAMP_2022: 0.2564102564102564
|
||
|
|
bluex_acc,exam_id__UNICAMP_2023: 0.5581395348837209
|
||
|
|
bluex_acc,exam_id__UNICAMP_2024: 0.4
|
||
|
|
bluex_acc,exam_id__USP_2018: 0.3148148148148148
|
||
|
|
bluex_acc,exam_id__USP_2019: 0.3
|
||
|
|
bluex_acc,exam_id__USP_2020: 0.30357142857142855
|
||
|
|
bluex_acc,exam_id__USP_2021: 0.36538461538461536
|
||
|
|
bluex_acc,exam_id__USP_2022: 0.3673469387755102
|
||
|
|
bluex_acc,exam_id__USP_2023: 0.38636363636363635
|
||
|
|
bluex_acc,exam_id__USP_2024: 0.4146341463414634
|
||
|
|
bluex_acc_stderr,all: 0.010242775047627772
|
||
|
|
bluex_acc_stderr,exam_id__UNICAMP_2018: 0.038534741152469736
|
||
|
|
bluex_acc_stderr,exam_id__UNICAMP_2019: 0.0367961057296462
|
||
|
|
bluex_acc_stderr,exam_id__UNICAMP_2020: 0.037368794971719924
|
||
|
|
bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.036222183210060134
|
||
|
|
bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.034079376824662555
|
||
|
|
bluex_acc_stderr,exam_id__UNICAMP_2022: 0.04027891922030821
|
||
|
|
bluex_acc_stderr,exam_id__UNICAMP_2023: 0.04379564728920204
|
||
|
|
bluex_acc_stderr,exam_id__UNICAMP_2024: 0.04209744896628822
|
||
|
|
bluex_acc_stderr,exam_id__USP_2018: 0.03647519527201769
|
||
|
|
bluex_acc_stderr,exam_id__USP_2019: 0.04181463130976304
|
||
|
|
bluex_acc_stderr,exam_id__USP_2020: 0.03539764734011866
|
||
|
|
bluex_acc_stderr,exam_id__USP_2021: 0.03859991685452418
|
||
|
|
bluex_acc_stderr,exam_id__USP_2022: 0.0397939306907615
|
||
|
|
bluex_acc_stderr,exam_id__USP_2023: 0.042529993265009204
|
||
|
|
bluex_acc_stderr,exam_id__USP_2024: 0.04456715995150719
|
||
|
|
bluex_alias: bluex
|
||
|
|
calame_pt_acc: 0.09489402697495183
|
||
|
|
calame_pt_acc_stderr: 0.006433689590297553
|
||
|
|
calame_pt_alias: calame_pt
|
||
|
|
calame_pt_perplexity: 8435.632130447184
|
||
|
|
calame_pt_perplexity_stderr: 963.9665411064852
|
||
|
|
enem_challenge_acc,all: 0.3198040587823653
|
||
|
|
enem_challenge_acc,exam_id__2009: 0.3130434782608696
|
||
|
|
enem_challenge_acc,exam_id__2010: 0.38461538461538464
|
||
|
|
enem_challenge_acc,exam_id__2011: 0.29914529914529914
|
||
|
|
enem_challenge_acc,exam_id__2012: 0.31896551724137934
|
||
|
|
enem_challenge_acc,exam_id__2013: 0.37037037037037035
|
||
|
|
enem_challenge_acc,exam_id__2014: 0.3394495412844037
|
||
|
|
enem_challenge_acc,exam_id__2015: 0.3025210084033613
|
||
|
|
enem_challenge_acc,exam_id__2016: 0.3140495867768595
|
||
|
|
enem_challenge_acc,exam_id__2016_2: 0.2764227642276423
|
||
|
|
enem_challenge_acc,exam_id__2017: 0.25
|
||
|
|
enem_challenge_acc,exam_id__2022: 0.3233082706766917
|
||
|
|
enem_challenge_acc,exam_id__2023: 0.34814814814814815
|
||
|
|
enem_challenge_acc_stderr,all: 0.007148951828535609
|
||
|
|
enem_challenge_acc_stderr,exam_id__2009: 0.024960208775102345
|
||
|
|
enem_challenge_acc_stderr,exam_id__2010: 0.025924496341894532
|
||
|
|
enem_challenge_acc_stderr,exam_id__2011: 0.02449152244622838
|
||
|
|
enem_challenge_acc_stderr,exam_id__2012: 0.024889831032253663
|
||
|
|
enem_challenge_acc_stderr,exam_id__2013: 0.02682596737556379
|
||
|
|
enem_challenge_acc_stderr,exam_id__2014: 0.026210250439613336
|
||
|
|
enem_challenge_acc_stderr,exam_id__2015: 0.02434000254473757
|
||
|
|
enem_challenge_acc_stderr,exam_id__2016: 0.024414986986171384
|
||
|
|
enem_challenge_acc_stderr,exam_id__2016_2: 0.023342214704782527
|
||
|
|
enem_challenge_acc_stderr,exam_id__2017: 0.02314953920904528
|
||
|
|
enem_challenge_acc_stderr,exam_id__2022: 0.023367019181975722
|
||
|
|
enem_challenge_acc_stderr,exam_id__2023: 0.023749430869741607
|
||
|
|
enem_challenge_alias: enem
|
||
|
|
faquad_nli_acc,all: 0.7846153846153846
|
||
|
|
faquad_nli_acc_stderr,all: 0.011396120309131366
|
||
|
|
faquad_nli_alias: faquad_nli
|
||
|
|
faquad_nli_f1_macro,all: 0.4396551724137931
|
||
|
|
faquad_nli_f1_macro_stderr,all: 0.00357969847290883
|
||
|
|
global_piqa_completions_por_latn_braz_acc: 0.7
|
||
|
|
global_piqa_completions_por_latn_braz_acc_bytes: 0.69
|
||
|
|
global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.046482319871173176
|
||
|
|
global_piqa_completions_por_latn_braz_acc_norm: 0.68
|
||
|
|
global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.046882617226215076
|
||
|
|
global_piqa_completions_por_latn_braz_acc_stderr: 0.04605661864718383
|
||
|
|
global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz
|
||
|
|
gsm8k_pt_alias: gsm8k_pt
|
||
|
|
gsm8k_pt_exact_match,flexible-extract: 0.1461187214611872
|
||
|
|
gsm8k_pt_exact_match,strict-match: 0.0
|
||
|
|
gsm8k_pt_exact_match_stderr,flexible-extract: 0.009748085159664667
|
||
|
|
gsm8k_pt_exact_match_stderr,strict-match: 0.0
|
||
|
|
hatebr_offensive_acc,all: 0.5328571428571428
|
||
|
|
hatebr_offensive_acc_stderr,all: 0.009452579861644791
|
||
|
|
hatebr_offensive_alias: hatebr_offensive_binary
|
||
|
|
hatebr_offensive_f1_macro,all: 0.4054916057180851
|
||
|
|
hatebr_offensive_f1_macro_stderr,all: 0.007877082274553134
|
||
|
|
hellaswag_poly_pt_acc: 0.3781558131975295
|
||
|
|
hellaswag_poly_pt_acc_norm: 0.47209881894029687
|
||
|
|
hellaswag_poly_pt_acc_norm_stderr: 0.005196835630828483
|
||
|
|
hellaswag_poly_pt_acc_stderr: 0.005048035343611143
|
||
|
|
hellaswag_poly_pt_alias: hellaswag_poly_pt
|
||
|
|
humaneval_instruct_alias: humaneval_instruct
|
||
|
|
humaneval_instruct_pass@1,create_test: 0.0
|
||
|
|
humaneval_instruct_pass@1_stderr,create_test: 0.0
|
||
|
|
ifeval_pt_alias: ifeval_pt
|
||
|
|
ifeval_pt_inst_level_loose_acc: 0.3930232558139535
|
||
|
|
ifeval_pt_inst_level_loose_acc_stderr: N/A
|
||
|
|
ifeval_pt_inst_level_strict_acc: 0.3302325581395349
|
||
|
|
ifeval_pt_inst_level_strict_acc_stderr: N/A
|
||
|
|
ifeval_pt_prompt_level_loose_acc: 0.27666666666666667
|
||
|
|
ifeval_pt_prompt_level_loose_acc_stderr: 0.025870931391123536
|
||
|
|
ifeval_pt_prompt_level_strict_acc: 0.21
|
||
|
|
ifeval_pt_prompt_level_strict_acc_stderr: 0.023555243542102446
|
||
|
|
lambada_poly_pt_acc: 0.20861633999611875
|
||
|
|
lambada_poly_pt_acc_stderr: 0.005660825573438258
|
||
|
|
lambada_poly_pt_alias: lambada_poly_pt
|
||
|
|
lambada_poly_pt_perplexity: 861.6919525698232
|
||
|
|
lambada_poly_pt_perplexity_stderr: 63.082803555905926
|
||
|
|
mmlu_poly_pt_acc: 0.3607775442809967
|
||
|
|
mmlu_poly_pt_acc_stderr: 0.004160492531275285
|
||
|
|
mmlu_poly_pt_alias: mmlu_poly_pt
|
||
|
|
oab_exams_acc,all: 0.27015945330296126
|
||
|
|
oab_exams_acc,exam_id__2010-01: 0.24705882352941178
|
||
|
|
oab_exams_acc,exam_id__2010-02: 0.28
|
||
|
|
oab_exams_acc,exam_id__2011-03: 0.24242424242424243
|
||
|
|
oab_exams_acc,exam_id__2011-04: 0.225
|
||
|
|
oab_exams_acc,exam_id__2011-05: 0.325
|
||
|
|
oab_exams_acc,exam_id__2012-06: 0.3
|
||
|
|
oab_exams_acc,exam_id__2012-06a: 0.2125
|
||
|
|
oab_exams_acc,exam_id__2012-07: 0.3125
|
||
|
|
oab_exams_acc,exam_id__2012-08: 0.1375
|
||
|
|
oab_exams_acc,exam_id__2012-09: 0.22077922077922077
|
||
|
|
oab_exams_acc,exam_id__2013-10: 0.2375
|
||
|
|
oab_exams_acc,exam_id__2013-11: 0.275
|
||
|
|
oab_exams_acc,exam_id__2013-12: 0.2375
|
||
|
|
oab_exams_acc,exam_id__2014-13: 0.2875
|
||
|
|
oab_exams_acc,exam_id__2014-14: 0.25
|
||
|
|
oab_exams_acc,exam_id__2014-15: 0.21794871794871795
|
||
|
|
oab_exams_acc,exam_id__2015-16: 0.2875
|
||
|
|
oab_exams_acc,exam_id__2015-17: 0.3333333333333333
|
||
|
|
oab_exams_acc,exam_id__2015-18: 0.275
|
||
|
|
oab_exams_acc,exam_id__2016-19: 0.2692307692307692
|
||
|
|
oab_exams_acc,exam_id__2016-20: 0.3625
|
||
|
|
oab_exams_acc,exam_id__2016-20a: 0.275
|
||
|
|
oab_exams_acc,exam_id__2016-21: 0.3
|
||
|
|
oab_exams_acc,exam_id__2017-22: 0.3375
|
||
|
|
oab_exams_acc,exam_id__2017-23: 0.3
|
||
|
|
oab_exams_acc,exam_id__2017-24: 0.3
|
||
|
|
oab_exams_acc,exam_id__2018-25: 0.25
|
||
|
|
oab_exams_acc_stderr,all: 0.0054708493847973475
|
||
|
|
oab_exams_acc_stderr,exam_id__2010-01: 0.027072546734021444
|
||
|
|
oab_exams_acc_stderr,exam_id__2010-02: 0.025924135072639538
|
||
|
|
oab_exams_acc_stderr,exam_id__2011-03: 0.024845779837305654
|
||
|
|
oab_exams_acc_stderr,exam_id__2011-04: 0.026955276691079964
|
||
|
|
oab_exams_acc_stderr,exam_id__2011-05: 0.03024661215761147
|
||
|
|
oab_exams_acc_stderr,exam_id__2012-06: 0.02951246277176116
|
||
|
|
oab_exams_acc_stderr,exam_id__2012-06a: 0.026389734380509827
|
||
|
|
oab_exams_acc_stderr,exam_id__2012-07: 0.029841237684623958
|
||
|
|
oab_exams_acc_stderr,exam_id__2012-08: 0.0221709705988969
|
||
|
|
oab_exams_acc_stderr,exam_id__2012-09: 0.027302999732294067
|
||
|
|
oab_exams_acc_stderr,exam_id__2013-10: 0.0274908309628061
|
||
|
|
oab_exams_acc_stderr,exam_id__2013-11: 0.02882257223036154
|
||
|
|
oab_exams_acc_stderr,exam_id__2013-12: 0.027413917931195794
|
||
|
|
oab_exams_acc_stderr,exam_id__2014-13: 0.02917450516650155
|
||
|
|
oab_exams_acc_stderr,exam_id__2014-14: 0.027979401755101777
|
||
|
|
oab_exams_acc_stderr,exam_id__2014-15: 0.027011736935613068
|
||
|
|
oab_exams_acc_stderr,exam_id__2015-16: 0.029203723510899393
|
||
|
|
oab_exams_acc_stderr,exam_id__2015-17: 0.030881077024925714
|
||
|
|
oab_exams_acc_stderr,exam_id__2015-18: 0.0288121979710083
|
||
|
|
oab_exams_acc_stderr,exam_id__2016-19: 0.029017747344098694
|
||
|
|
oab_exams_acc_stderr,exam_id__2016-20: 0.030947316988827907
|
||
|
|
oab_exams_acc_stderr,exam_id__2016-20a: 0.02885616149691637
|
||
|
|
oab_exams_acc_stderr,exam_id__2016-21: 0.029466001599879478
|
||
|
|
oab_exams_acc_stderr,exam_id__2017-22: 0.0304433823531397
|
||
|
|
oab_exams_acc_stderr,exam_id__2017-23: 0.0295029507416334
|
||
|
|
oab_exams_acc_stderr,exam_id__2017-24: 0.029614631804279756
|
||
|
|
oab_exams_acc_stderr,exam_id__2018-25: 0.028064299041816978
|
||
|
|
oab_exams_alias: oab_exams
|
||
|
|
portuguese_hate_speech_acc,all: 0.30552291421856637
|
||
|
|
portuguese_hate_speech_acc_stderr,all: 0.0111342367647186
|
||
|
|
portuguese_hate_speech_alias: portuguese_hate_speech_binary
|
||
|
|
portuguese_hate_speech_f1_macro,all: 0.24331655796529078
|
||
|
|
portuguese_hate_speech_f1_macro_stderr,all: 0.007523479961403228
|
||
|
|
tweetsentbr_acc,all: 0.34626865671641793
|
||
|
|
tweetsentbr_acc_stderr,all: 0.007507813204647027
|
||
|
|
tweetsentbr_alias: tweetsentbr
|
||
|
|
tweetsentbr_f1_macro,all: 0.24098176805099736
|
||
|
|
tweetsentbr_f1_macro_stderr,all: 0.005874676993024155
|
||
|
|
step: 3595
|