Files
Tucano2-qwen-0.5B-Base/evals.yaml
ModelHub XC 5dcf73b2aa 初始化项目,由ModelHub XC社区提供模型
Model: Polygl0t/Tucano2-qwen-0.5B-Base
Source: Original Platform
2026-05-31 00:50:30 +08:00

191 lines
9.6 KiB
YAML

evaluations:
arc_challenge_poly_pt_acc: 0.32905982905982906
arc_challenge_poly_pt_acc_norm: 0.37435897435897436
arc_challenge_poly_pt_acc_norm_stderr: 0.014154661190814505
arc_challenge_poly_pt_acc_stderr: 0.013742700308521677
arc_challenge_poly_pt_alias: arc_challenge_poly_pt
assin2_rte_acc,all: 0.610702614379085
assin2_rte_acc_stderr,all: 0.006976455542871003
assin2_rte_alias: assin2_rte
assin2_rte_f1_macro,all: 0.5493894012220668
assin2_rte_f1_macro_stderr,all: 0.007313213532417135
assin2_sts_alias: assin2_sts
assin2_sts_mse,all: 1.954166666666667
assin2_sts_mse_stderr,all: N/A
assin2_sts_pearson,all: 0.06429110147600466
assin2_sts_pearson_stderr,all: 0.013460910678684006
assin_entailment_acc: 0.70675
assin_entailment_acc_stderr: 0.007199067024031941
assin_entailment_alias: assin_entailment
assin_paraphrase_acc: 0.72675
assin_paraphrase_acc_stderr: 0.007046880444991351
assin_paraphrase_alias: assin_paraphrase
belebele_por_Latn_acc: 0.5388888888888889
belebele_por_Latn_acc_norm: 0.5388888888888889
belebele_por_Latn_acc_norm_stderr: 0.016625417583086437
belebele_por_Latn_acc_stderr: 0.016625417583086437
belebele_por_Latn_alias: belebele_por_Latn
bluex_acc,all: 0.46870653685674546
bluex_acc,exam_id__UNICAMP_2018: 0.46296296296296297
bluex_acc,exam_id__UNICAMP_2019: 0.46
bluex_acc,exam_id__UNICAMP_2020: 0.5636363636363636
bluex_acc,exam_id__UNICAMP_2021_1: 0.3695652173913043
bluex_acc,exam_id__UNICAMP_2021_2: 0.37254901960784315
bluex_acc,exam_id__UNICAMP_2022: 0.5641025641025641
bluex_acc,exam_id__UNICAMP_2023: 0.6046511627906976
bluex_acc,exam_id__UNICAMP_2024: 0.5777777777777777
bluex_acc,exam_id__USP_2018: 0.37037037037037035
bluex_acc,exam_id__USP_2019: 0.475
bluex_acc,exam_id__USP_2020: 0.4642857142857143
bluex_acc,exam_id__USP_2021: 0.34615384615384615
bluex_acc,exam_id__USP_2022: 0.3877551020408163
bluex_acc,exam_id__USP_2023: 0.5227272727272727
bluex_acc,exam_id__USP_2024: 0.5609756097560976
bluex_acc_stderr,all: 0.010751763481122825
bluex_acc_stderr,exam_id__UNICAMP_2018: 0.03908140507987583
bluex_acc_stderr,exam_id__UNICAMP_2019: 0.04086284322684593
bluex_acc_stderr,exam_id__UNICAMP_2020: 0.03854268045176616
bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.041049198501562786
bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.03903627297829125
bluex_acc_stderr,exam_id__UNICAMP_2022: 0.046067534061712136
bluex_acc_stderr,exam_id__UNICAMP_2023: 0.043110276875963825
bluex_acc_stderr,exam_id__UNICAMP_2024: 0.04235722643358554
bluex_acc_stderr,exam_id__USP_2018: 0.03788178159383037
bluex_acc_stderr,exam_id__USP_2019: 0.04557332896428892
bluex_acc_stderr,exam_id__USP_2020: 0.038341658891473986
bluex_acc_stderr,exam_id__USP_2021: 0.038139467533341624
bluex_acc_stderr,exam_id__USP_2022: 0.040012931518371306
bluex_acc_stderr,exam_id__USP_2023: 0.043422447984322925
bluex_acc_stderr,exam_id__USP_2024: 0.044825815810456374
bluex_alias: bluex
calame_pt_acc: 0.5867052023121387
calame_pt_acc_stderr: 0.01081012929476997
calame_pt_alias: calame_pt
calame_pt_perplexity: 7.2359833214160085
calame_pt_perplexity_stderr: 0.42482617041660564
enem_challenge_acc,all: 0.5514345696291113
enem_challenge_acc,exam_id__2009: 0.5043478260869565
enem_challenge_acc,exam_id__2010: 0.5726495726495726
enem_challenge_acc,exam_id__2011: 0.6153846153846154
enem_challenge_acc,exam_id__2012: 0.5344827586206896
enem_challenge_acc,exam_id__2013: 0.5833333333333334
enem_challenge_acc,exam_id__2014: 0.5688073394495413
enem_challenge_acc,exam_id__2015: 0.4789915966386555
enem_challenge_acc,exam_id__2016: 0.5371900826446281
enem_challenge_acc,exam_id__2016_2: 0.5284552845528455
enem_challenge_acc,exam_id__2017: 0.5086206896551724
enem_challenge_acc,exam_id__2022: 0.5639097744360902
enem_challenge_acc,exam_id__2023: 0.6148148148148148
enem_challenge_acc_stderr,all: 0.007618623435000518
enem_challenge_acc_stderr,exam_id__2009: 0.026949045711548218
enem_challenge_acc_stderr,exam_id__2010: 0.026429279577735547
enem_challenge_acc_stderr,exam_id__2011: 0.02589822342752356
enem_challenge_acc_stderr,exam_id__2012: 0.02669879193984667
enem_challenge_acc_stderr,exam_id__2013: 0.02740882906544383
enem_challenge_acc_stderr,exam_id__2014: 0.027336979010138398
enem_challenge_acc_stderr,exam_id__2015: 0.026385065690201895
enem_challenge_acc_stderr,exam_id__2016: 0.026088144599797948
enem_challenge_acc_stderr,exam_id__2016_2: 0.02606861159558674
enem_challenge_acc_stderr,exam_id__2017: 0.02684138658677631
enem_challenge_acc_stderr,exam_id__2022: 0.02485425227418429
enem_challenge_acc_stderr,exam_id__2023: 0.024259924778206914
enem_challenge_alias: enem
faquad_nli_acc,all: 0.7846153846153846
faquad_nli_acc_stderr,all: 0.011396120309131366
faquad_nli_alias: faquad_nli
faquad_nli_f1_macro,all: 0.4396551724137931
faquad_nli_f1_macro_stderr,all: 0.00357969847290883
global_piqa_completions_por_latn_braz_acc: 0.79
global_piqa_completions_por_latn_braz_acc_bytes: 0.75
global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.04351941398892446
global_piqa_completions_por_latn_braz_acc_norm: 0.74
global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.0440844002276808
global_piqa_completions_por_latn_braz_acc_stderr: 0.040936018074033236
global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz
hatebr_offensive_acc,all: 0.7964285714285714
hatebr_offensive_acc_stderr,all: 0.007613133172324561
hatebr_offensive_alias: hatebr_offensive_binary
hatebr_offensive_f1_macro,all: 0.7963528271484649
hatebr_offensive_f1_macro_stderr,all: 0.00761763073122653
hellaswag_poly_pt_acc: 0.3776140426915159
hellaswag_poly_pt_acc_norm: 0.48434283237620546
hellaswag_poly_pt_acc_norm_stderr: 0.005202393220555622
hellaswag_poly_pt_acc_stderr: 0.005046614926940191
hellaswag_poly_pt_alias: hellaswag_poly_pt
lambada_poly_pt_acc: 0.4513875412381137
lambada_poly_pt_acc_stderr: 0.006932975888368315
lambada_poly_pt_alias: lambada_poly_pt
lambada_poly_pt_perplexity: 15.604144991512424
lambada_poly_pt_perplexity_stderr: 0.5333901055397442
mmlu_poly_pt_acc: 0.39680276193335334
mmlu_poly_pt_acc_stderr: 0.0042385372220186505
mmlu_poly_pt_alias: mmlu_poly_pt
oab_exams_acc,all: 0.40364464692482915
oab_exams_acc,exam_id__2010-01: 0.4
oab_exams_acc,exam_id__2010-02: 0.4
oab_exams_acc,exam_id__2011-03: 0.3434343434343434
oab_exams_acc,exam_id__2011-04: 0.4125
oab_exams_acc,exam_id__2011-05: 0.425
oab_exams_acc,exam_id__2012-06: 0.375
oab_exams_acc,exam_id__2012-06a: 0.475
oab_exams_acc,exam_id__2012-07: 0.3125
oab_exams_acc,exam_id__2012-08: 0.425
oab_exams_acc,exam_id__2012-09: 0.3246753246753247
oab_exams_acc,exam_id__2013-10: 0.425
oab_exams_acc,exam_id__2013-11: 0.4375
oab_exams_acc,exam_id__2013-12: 0.4375
oab_exams_acc,exam_id__2014-13: 0.425
oab_exams_acc,exam_id__2014-14: 0.4125
oab_exams_acc,exam_id__2014-15: 0.41025641025641024
oab_exams_acc,exam_id__2015-16: 0.35
oab_exams_acc,exam_id__2015-17: 0.48717948717948717
oab_exams_acc,exam_id__2015-18: 0.3375
oab_exams_acc,exam_id__2016-19: 0.44871794871794873
oab_exams_acc,exam_id__2016-20: 0.3875
oab_exams_acc,exam_id__2016-20a: 0.3625
oab_exams_acc,exam_id__2016-21: 0.425
oab_exams_acc,exam_id__2017-22: 0.425
oab_exams_acc,exam_id__2017-23: 0.425
oab_exams_acc,exam_id__2017-24: 0.3875
oab_exams_acc,exam_id__2018-25: 0.4375
oab_exams_acc_stderr,all: 0.006053332133276562
oab_exams_acc_stderr,exam_id__2010-01: 0.030610161516106096
oab_exams_acc_stderr,exam_id__2010-02: 0.028296630437280906
oab_exams_acc_stderr,exam_id__2011-03: 0.027549621066797744
oab_exams_acc_stderr,exam_id__2011-04: 0.031730143003018005
oab_exams_acc_stderr,exam_id__2011-05: 0.03187425289992289
oab_exams_acc_stderr,exam_id__2012-06: 0.031303363618806604
oab_exams_acc_stderr,exam_id__2012-06a: 0.03204492958151872
oab_exams_acc_stderr,exam_id__2012-07: 0.030024899757931458
oab_exams_acc_stderr,exam_id__2012-08: 0.03186228006495766
oab_exams_acc_stderr,exam_id__2012-09: 0.030794179748498546
oab_exams_acc_stderr,exam_id__2013-10: 0.03186391626480471
oab_exams_acc_stderr,exam_id__2013-11: 0.03202492621379354
oab_exams_acc_stderr,exam_id__2013-12: 0.03195252134301974
oab_exams_acc_stderr,exam_id__2014-13: 0.03180998638392341
oab_exams_acc_stderr,exam_id__2014-14: 0.03183014975650971
oab_exams_acc_stderr,exam_id__2014-15: 0.03207677106125773
oab_exams_acc_stderr,exam_id__2015-16: 0.030649958521189145
oab_exams_acc_stderr,exam_id__2015-17: 0.03266206847243709
oab_exams_acc_stderr,exam_id__2015-18: 0.030453791802761587
oab_exams_acc_stderr,exam_id__2016-19: 0.03245421470850268
oab_exams_acc_stderr,exam_id__2016-20: 0.03147299166016239
oab_exams_acc_stderr,exam_id__2016-20a: 0.031016128861774545
oab_exams_acc_stderr,exam_id__2016-21: 0.03173265714315922
oab_exams_acc_stderr,exam_id__2017-22: 0.03191749762684417
oab_exams_acc_stderr,exam_id__2017-23: 0.0320020087581464
oab_exams_acc_stderr,exam_id__2017-24: 0.031539234776498976
oab_exams_acc_stderr,exam_id__2018-25: 0.03202176928915031
oab_exams_alias: oab_exams
portuguese_hate_speech_acc,all: 0.43478260869565216
portuguese_hate_speech_acc_stderr,all: 0.012048917796997311
portuguese_hate_speech_alias: portuguese_hate_speech_binary
portuguese_hate_speech_f1_macro,all: 0.42838929145469706
portuguese_hate_speech_f1_macro_stderr,all: 0.011986624066160297
tweetsentbr_acc,all: 0.4736318407960199
tweetsentbr_acc_stderr,all: 0.007893686144300348
tweetsentbr_alias: tweetsentbr
tweetsentbr_f1_macro,all: 0.2678826295357763
tweetsentbr_f1_macro_stderr,all: 0.005779014212286808
step: 50000