Files
Tucano2-qwen-0.5B-Think/evals.yaml

208 lines
10 KiB
YAML
Raw Normal View History

evaluations:
arc_challenge_poly_pt_acc: 0.30427350427350425
arc_challenge_poly_pt_acc_norm: 0.32735042735042735
arc_challenge_poly_pt_acc_norm_stderr: 0.013724408490743929
arc_challenge_poly_pt_acc_stderr: 0.013456870841978025
arc_challenge_poly_pt_alias: arc_challenge_poly_pt
assin2_rte_acc,all: 0.5
assin2_rte_acc_stderr,all: 0.007138073526203373
assin2_rte_alias: assin2_rte
assin2_rte_f1_macro,all: 0.3333333333333333
assin2_rte_f1_macro_stderr,all: 0.003173025139437921
assin2_sts_alias: assin2_sts
assin2_sts_mse,all: 3.059456699346405
assin2_sts_mse_stderr,all: N/A
assin2_sts_pearson,all: 0.0276213074062244
assin2_sts_pearson_stderr,all: 0.011665375504884477
assin_entailment_acc: 0.7335
assin_entailment_acc_stderr: 0.006991541883910775
assin_entailment_alias: assin_entailment
assin_paraphrase_acc: 0.71575
assin_paraphrase_acc_stderr: 0.0071327206100355
assin_paraphrase_alias: assin_paraphrase
belebele_por_Latn_acc: 0.3611111111111111
belebele_por_Latn_acc_norm: 0.3611111111111111
belebele_por_Latn_acc_norm_stderr: 0.016019658270537297
belebele_por_Latn_acc_stderr: 0.016019658270537297
belebele_por_Latn_alias: belebele_por_Latn
bluex_acc,all: 0.34492350486787204
bluex_acc,exam_id__UNICAMP_2018: 0.4074074074074074
bluex_acc,exam_id__UNICAMP_2019: 0.28
bluex_acc,exam_id__UNICAMP_2020: 0.36363636363636365
bluex_acc,exam_id__UNICAMP_2021_1: 0.2391304347826087
bluex_acc,exam_id__UNICAMP_2021_2: 0.23529411764705882
bluex_acc,exam_id__UNICAMP_2022: 0.2564102564102564
bluex_acc,exam_id__UNICAMP_2023: 0.5581395348837209
bluex_acc,exam_id__UNICAMP_2024: 0.4
bluex_acc,exam_id__USP_2018: 0.3148148148148148
bluex_acc,exam_id__USP_2019: 0.3
bluex_acc,exam_id__USP_2020: 0.30357142857142855
bluex_acc,exam_id__USP_2021: 0.36538461538461536
bluex_acc,exam_id__USP_2022: 0.3673469387755102
bluex_acc,exam_id__USP_2023: 0.38636363636363635
bluex_acc,exam_id__USP_2024: 0.4146341463414634
bluex_acc_stderr,all: 0.010242775047627772
bluex_acc_stderr,exam_id__UNICAMP_2018: 0.038534741152469736
bluex_acc_stderr,exam_id__UNICAMP_2019: 0.0367961057296462
bluex_acc_stderr,exam_id__UNICAMP_2020: 0.037368794971719924
bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.036222183210060134
bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.034079376824662555
bluex_acc_stderr,exam_id__UNICAMP_2022: 0.04027891922030821
bluex_acc_stderr,exam_id__UNICAMP_2023: 0.04379564728920204
bluex_acc_stderr,exam_id__UNICAMP_2024: 0.04209744896628822
bluex_acc_stderr,exam_id__USP_2018: 0.03647519527201769
bluex_acc_stderr,exam_id__USP_2019: 0.04181463130976304
bluex_acc_stderr,exam_id__USP_2020: 0.03539764734011866
bluex_acc_stderr,exam_id__USP_2021: 0.03859991685452418
bluex_acc_stderr,exam_id__USP_2022: 0.0397939306907615
bluex_acc_stderr,exam_id__USP_2023: 0.042529993265009204
bluex_acc_stderr,exam_id__USP_2024: 0.04456715995150719
bluex_alias: bluex
calame_pt_acc: 0.09489402697495183
calame_pt_acc_stderr: 0.006433689590297553
calame_pt_alias: calame_pt
calame_pt_perplexity: 8435.632130447184
calame_pt_perplexity_stderr: 963.9665411064852
enem_challenge_acc,all: 0.3198040587823653
enem_challenge_acc,exam_id__2009: 0.3130434782608696
enem_challenge_acc,exam_id__2010: 0.38461538461538464
enem_challenge_acc,exam_id__2011: 0.29914529914529914
enem_challenge_acc,exam_id__2012: 0.31896551724137934
enem_challenge_acc,exam_id__2013: 0.37037037037037035
enem_challenge_acc,exam_id__2014: 0.3394495412844037
enem_challenge_acc,exam_id__2015: 0.3025210084033613
enem_challenge_acc,exam_id__2016: 0.3140495867768595
enem_challenge_acc,exam_id__2016_2: 0.2764227642276423
enem_challenge_acc,exam_id__2017: 0.25
enem_challenge_acc,exam_id__2022: 0.3233082706766917
enem_challenge_acc,exam_id__2023: 0.34814814814814815
enem_challenge_acc_stderr,all: 0.007148951828535609
enem_challenge_acc_stderr,exam_id__2009: 0.024960208775102345
enem_challenge_acc_stderr,exam_id__2010: 0.025924496341894532
enem_challenge_acc_stderr,exam_id__2011: 0.02449152244622838
enem_challenge_acc_stderr,exam_id__2012: 0.024889831032253663
enem_challenge_acc_stderr,exam_id__2013: 0.02682596737556379
enem_challenge_acc_stderr,exam_id__2014: 0.026210250439613336
enem_challenge_acc_stderr,exam_id__2015: 0.02434000254473757
enem_challenge_acc_stderr,exam_id__2016: 0.024414986986171384
enem_challenge_acc_stderr,exam_id__2016_2: 0.023342214704782527
enem_challenge_acc_stderr,exam_id__2017: 0.02314953920904528
enem_challenge_acc_stderr,exam_id__2022: 0.023367019181975722
enem_challenge_acc_stderr,exam_id__2023: 0.023749430869741607
enem_challenge_alias: enem
faquad_nli_acc,all: 0.7846153846153846
faquad_nli_acc_stderr,all: 0.011396120309131366
faquad_nli_alias: faquad_nli
faquad_nli_f1_macro,all: 0.4396551724137931
faquad_nli_f1_macro_stderr,all: 0.00357969847290883
global_piqa_completions_por_latn_braz_acc: 0.7
global_piqa_completions_por_latn_braz_acc_bytes: 0.69
global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.046482319871173176
global_piqa_completions_por_latn_braz_acc_norm: 0.68
global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.046882617226215076
global_piqa_completions_por_latn_braz_acc_stderr: 0.04605661864718383
global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz
gsm8k_pt_alias: gsm8k_pt
gsm8k_pt_exact_match,flexible-extract: 0.1461187214611872
gsm8k_pt_exact_match,strict-match: 0.0
gsm8k_pt_exact_match_stderr,flexible-extract: 0.009748085159664667
gsm8k_pt_exact_match_stderr,strict-match: 0.0
hatebr_offensive_acc,all: 0.5328571428571428
hatebr_offensive_acc_stderr,all: 0.009452579861644791
hatebr_offensive_alias: hatebr_offensive_binary
hatebr_offensive_f1_macro,all: 0.4054916057180851
hatebr_offensive_f1_macro_stderr,all: 0.007877082274553134
hellaswag_poly_pt_acc: 0.3781558131975295
hellaswag_poly_pt_acc_norm: 0.47209881894029687
hellaswag_poly_pt_acc_norm_stderr: 0.005196835630828483
hellaswag_poly_pt_acc_stderr: 0.005048035343611143
hellaswag_poly_pt_alias: hellaswag_poly_pt
humaneval_instruct_alias: humaneval_instruct
humaneval_instruct_pass@1,create_test: 0.0
humaneval_instruct_pass@1_stderr,create_test: 0.0
ifeval_pt_alias: ifeval_pt
ifeval_pt_inst_level_loose_acc: 0.3930232558139535
ifeval_pt_inst_level_loose_acc_stderr: N/A
ifeval_pt_inst_level_strict_acc: 0.3302325581395349
ifeval_pt_inst_level_strict_acc_stderr: N/A
ifeval_pt_prompt_level_loose_acc: 0.27666666666666667
ifeval_pt_prompt_level_loose_acc_stderr: 0.025870931391123536
ifeval_pt_prompt_level_strict_acc: 0.21
ifeval_pt_prompt_level_strict_acc_stderr: 0.023555243542102446
lambada_poly_pt_acc: 0.20861633999611875
lambada_poly_pt_acc_stderr: 0.005660825573438258
lambada_poly_pt_alias: lambada_poly_pt
lambada_poly_pt_perplexity: 861.6919525698232
lambada_poly_pt_perplexity_stderr: 63.082803555905926
mmlu_poly_pt_acc: 0.3607775442809967
mmlu_poly_pt_acc_stderr: 0.004160492531275285
mmlu_poly_pt_alias: mmlu_poly_pt
oab_exams_acc,all: 0.27015945330296126
oab_exams_acc,exam_id__2010-01: 0.24705882352941178
oab_exams_acc,exam_id__2010-02: 0.28
oab_exams_acc,exam_id__2011-03: 0.24242424242424243
oab_exams_acc,exam_id__2011-04: 0.225
oab_exams_acc,exam_id__2011-05: 0.325
oab_exams_acc,exam_id__2012-06: 0.3
oab_exams_acc,exam_id__2012-06a: 0.2125
oab_exams_acc,exam_id__2012-07: 0.3125
oab_exams_acc,exam_id__2012-08: 0.1375
oab_exams_acc,exam_id__2012-09: 0.22077922077922077
oab_exams_acc,exam_id__2013-10: 0.2375
oab_exams_acc,exam_id__2013-11: 0.275
oab_exams_acc,exam_id__2013-12: 0.2375
oab_exams_acc,exam_id__2014-13: 0.2875
oab_exams_acc,exam_id__2014-14: 0.25
oab_exams_acc,exam_id__2014-15: 0.21794871794871795
oab_exams_acc,exam_id__2015-16: 0.2875
oab_exams_acc,exam_id__2015-17: 0.3333333333333333
oab_exams_acc,exam_id__2015-18: 0.275
oab_exams_acc,exam_id__2016-19: 0.2692307692307692
oab_exams_acc,exam_id__2016-20: 0.3625
oab_exams_acc,exam_id__2016-20a: 0.275
oab_exams_acc,exam_id__2016-21: 0.3
oab_exams_acc,exam_id__2017-22: 0.3375
oab_exams_acc,exam_id__2017-23: 0.3
oab_exams_acc,exam_id__2017-24: 0.3
oab_exams_acc,exam_id__2018-25: 0.25
oab_exams_acc_stderr,all: 0.0054708493847973475
oab_exams_acc_stderr,exam_id__2010-01: 0.027072546734021444
oab_exams_acc_stderr,exam_id__2010-02: 0.025924135072639538
oab_exams_acc_stderr,exam_id__2011-03: 0.024845779837305654
oab_exams_acc_stderr,exam_id__2011-04: 0.026955276691079964
oab_exams_acc_stderr,exam_id__2011-05: 0.03024661215761147
oab_exams_acc_stderr,exam_id__2012-06: 0.02951246277176116
oab_exams_acc_stderr,exam_id__2012-06a: 0.026389734380509827
oab_exams_acc_stderr,exam_id__2012-07: 0.029841237684623958
oab_exams_acc_stderr,exam_id__2012-08: 0.0221709705988969
oab_exams_acc_stderr,exam_id__2012-09: 0.027302999732294067
oab_exams_acc_stderr,exam_id__2013-10: 0.0274908309628061
oab_exams_acc_stderr,exam_id__2013-11: 0.02882257223036154
oab_exams_acc_stderr,exam_id__2013-12: 0.027413917931195794
oab_exams_acc_stderr,exam_id__2014-13: 0.02917450516650155
oab_exams_acc_stderr,exam_id__2014-14: 0.027979401755101777
oab_exams_acc_stderr,exam_id__2014-15: 0.027011736935613068
oab_exams_acc_stderr,exam_id__2015-16: 0.029203723510899393
oab_exams_acc_stderr,exam_id__2015-17: 0.030881077024925714
oab_exams_acc_stderr,exam_id__2015-18: 0.0288121979710083
oab_exams_acc_stderr,exam_id__2016-19: 0.029017747344098694
oab_exams_acc_stderr,exam_id__2016-20: 0.030947316988827907
oab_exams_acc_stderr,exam_id__2016-20a: 0.02885616149691637
oab_exams_acc_stderr,exam_id__2016-21: 0.029466001599879478
oab_exams_acc_stderr,exam_id__2017-22: 0.0304433823531397
oab_exams_acc_stderr,exam_id__2017-23: 0.0295029507416334
oab_exams_acc_stderr,exam_id__2017-24: 0.029614631804279756
oab_exams_acc_stderr,exam_id__2018-25: 0.028064299041816978
oab_exams_alias: oab_exams
portuguese_hate_speech_acc,all: 0.30552291421856637
portuguese_hate_speech_acc_stderr,all: 0.0111342367647186
portuguese_hate_speech_alias: portuguese_hate_speech_binary
portuguese_hate_speech_f1_macro,all: 0.24331655796529078
portuguese_hate_speech_f1_macro_stderr,all: 0.007523479961403228
tweetsentbr_acc,all: 0.34626865671641793
tweetsentbr_acc_stderr,all: 0.007507813204647027
tweetsentbr_alias: tweetsentbr
tweetsentbr_f1_macro,all: 0.24098176805099736
tweetsentbr_f1_macro_stderr,all: 0.005874676993024155
step: 3595