{ "results": { "arc_challenge": { "alias": "arc_challenge", "acc,none": 0.4525, "acc_stderr,none": 0.024918098926991643, "acc_norm,none": 0.4975, "acc_norm_stderr,none": 0.0250309958227734 }, "arc_easy": { "alias": "arc_easy", "acc,none": 0.7625, "acc_stderr,none": 0.02130420258115865, "acc_norm,none": 0.755, "acc_norm_stderr,none": 0.02153129097913246 }, "hellaswag": { "alias": "hellaswag", "acc,none": 0.4925, "acc_stderr,none": 0.025028492535438325, "acc_norm,none": 0.6225, "acc_norm_stderr,none": 0.024268431488608636 }, "kmmlu": { "acc,none": 0.47556707712248864, "acc_stderr,none": 0.003918983222456166, "alias": "kmmlu" }, "kmmlu_applied_science": { "acc,none": 0.45875, "acc_stderr,none": 0.007101063857525891, "alias": " - kmmlu_applied_science" }, "kmmlu_aviation_engineering_and_maintenance": { "alias": " - kmmlu_aviation_engineering_and_maintenance", "acc,none": 0.4475, "acc_stderr,none": 0.024892941194307603 }, "kmmlu_electronics_engineering": { "alias": " - kmmlu_electronics_engineering", "acc,none": 0.65, "acc_stderr,none": 0.023878346647046 }, "kmmlu_energy_management": { "alias": " - kmmlu_energy_management", "acc,none": 0.4, "acc_stderr,none": 0.02452557357939856 }, "kmmlu_environmental_science": { "alias": " - kmmlu_environmental_science", "acc,none": 0.3875, "acc_stderr,none": 0.02438947500927543 }, "kmmlu_gas_technology_and_engineering": { "alias": " - kmmlu_gas_technology_and_engineering", "acc,none": 0.3775, "acc_stderr,none": 0.02426843148860864 }, "kmmlu_geomatics": { "alias": " - kmmlu_geomatics", "acc,none": 0.4325, "acc_stderr,none": 0.024802162065186362 }, "kmmlu_industrial_engineer": { "alias": " - kmmlu_industrial_engineer", "acc,none": 0.4275, "acc_stderr,none": 0.024766769210836766 }, "kmmlu_machine_design_and_manufacturing": { "alias": " - kmmlu_machine_design_and_manufacturing", "acc,none": 0.52, "acc_stderr,none": 0.025011275652681887 }, "kmmlu_maritime_engineering": { "alias": " - kmmlu_maritime_engineering", "acc,none": 0.405, "acc_stderr,none": 0.024575340657273674 }, "kmmlu_nondestructive_testing": { "alias": " - kmmlu_nondestructive_testing", "acc,none": 0.4825, "acc_stderr,none": 0.025015972341295333 }, "kmmlu_railway_and_automotive_engineering": { "alias": " - kmmlu_railway_and_automotive_engineering", "acc,none": 0.3875, "acc_stderr,none": 0.02438947500927542 }, "kmmlu_telecommunications_and_wireless_technology": { "alias": " - kmmlu_telecommunications_and_wireless_technology", "acc,none": 0.5875, "acc_stderr,none": 0.024645036407943802 }, "kmmlu_humss": { "acc,none": 0.4805860805860806, "acc_stderr,none": 0.009419825503999339, "alias": " - kmmlu_humss" }, "kmmlu_accounting": { "alias": " - kmmlu_accounting", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956912 }, "kmmlu_criminal_law": { "alias": " - kmmlu_criminal_law", "acc,none": 0.39, "acc_stderr,none": 0.03457567623250011 }, "kmmlu_economics": { "alias": " - kmmlu_economics", "acc,none": 0.5615384615384615, "acc_stderr,none": 0.04368784779071991 }, "kmmlu_education": { "alias": " - kmmlu_education", "acc,none": 0.65, "acc_stderr,none": 0.047937248544110196 }, "kmmlu_korean_history": { "alias": " - kmmlu_korean_history", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284 }, "kmmlu_law": { "alias": " - kmmlu_law", "acc,none": 0.3875, "acc_stderr,none": 0.024389475009275435 }, "kmmlu_management": { "alias": " - kmmlu_management", "acc,none": 0.53, "acc_stderr,none": 0.02498621173652297 }, "kmmlu_political_science_and_sociology": { "alias": " - kmmlu_political_science_and_sociology", 
"acc,none": 0.5466666666666666, "acc_stderr,none": 0.028789526978043094 }, "kmmlu_psychology": { "alias": " - kmmlu_psychology", "acc,none": 0.4275, "acc_stderr,none": 0.02476676921083677 }, "kmmlu_social_welfare": { "alias": " - kmmlu_social_welfare", "acc,none": 0.585, "acc_stderr,none": 0.02466695454685353 }, "kmmlu_taxation": { "alias": " - kmmlu_taxation", "acc,none": 0.435, "acc_stderr,none": 0.03514328173714407 }, "kmmlu_other": { "acc,none": 0.4772222222222222, "acc_stderr,none": 0.008073884461069719, "alias": " - kmmlu_other" }, "kmmlu_agricultural_sciences": { "alias": " - kmmlu_agricultural_sciences", "acc,none": 0.3625, "acc_stderr,none": 0.024066207238097725 }, "kmmlu_construction": { "alias": " - kmmlu_construction", "acc,none": 0.3925, "acc_stderr,none": 0.024445927747963316 }, "kmmlu_fashion": { "alias": " - kmmlu_fashion", "acc,none": 0.4575, "acc_stderr,none": 0.024940719189394073 }, "kmmlu_food_processing": { "alias": " - kmmlu_food_processing", "acc,none": 0.39, "acc_stderr,none": 0.024418038445046374 }, "kmmlu_health": { "alias": " - kmmlu_health", "acc,none": 0.63, "acc_stderr,none": 0.048523658709391 }, "kmmlu_interior_architecture_and_design": { "alias": " - kmmlu_interior_architecture_and_design", "acc,none": 0.6025, "acc_stderr,none": 0.024499693108404712 }, "kmmlu_marketing": { "alias": " - kmmlu_marketing", "acc,none": 0.76, "acc_stderr,none": 0.021380899352993952 }, "kmmlu_patent": { "alias": " - kmmlu_patent", "acc,none": 0.46, "acc_stderr,none": 0.05009082659620332 }, "kmmlu_public_safety": { "alias": " - kmmlu_public_safety", "acc,none": 0.4025, "acc_stderr,none": 0.024550788746396206 }, "kmmlu_real_estate": { "alias": " - kmmlu_real_estate", "acc,none": 0.485, "acc_stderr,none": 0.03542810683297719 }, "kmmlu_refrigerating_machinery": { "alias": " - kmmlu_refrigerating_machinery", "acc,none": 0.4125, "acc_stderr,none": 0.024645036407943802 }, "kmmlu_stem": { "acc,none": 0.4897674418604651, "acc_stderr,none": 0.007312394370135803, "alias": " - kmmlu_stem" }, "kmmlu_biology": { "alias": " - kmmlu_biology", "acc,none": 0.3225, "acc_stderr,none": 0.023400926978618723 }, "kmmlu_chemical_engineering": { "alias": " - kmmlu_chemical_engineering", "acc,none": 0.4875, "acc_stderr,none": 0.025023485209500245 }, "kmmlu_chemistry": { "alias": " - kmmlu_chemistry", "acc,none": 0.5175, "acc_stderr,none": 0.02501597234129533 }, "kmmlu_civil_engineering": { "alias": " - kmmlu_civil_engineering", "acc,none": 0.3825, "acc_stderr,none": 0.024330316186072946 }, "kmmlu_computer_science": { "alias": " - kmmlu_computer_science", "acc,none": 0.75, "acc_stderr,none": 0.021677749238103 }, "kmmlu_ecology": { "alias": " - kmmlu_ecology", "acc,none": 0.5425, "acc_stderr,none": 0.024940719189394077 }, "kmmlu_electrical_engineering": { "alias": " - kmmlu_electrical_engineering", "acc,none": 0.355, "acc_stderr,none": 0.023955629410456463 }, "kmmlu_information_technology": { "alias": " - kmmlu_information_technology", "acc,none": 0.75, "acc_stderr,none": 0.021677749238103 }, "kmmlu_materials_engineering": { "alias": " - kmmlu_materials_engineering", "acc,none": 0.495, "acc_stderr,none": 0.025030057119361453 }, "kmmlu_math": { "alias": " - kmmlu_math", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.027262027336984396 }, "kmmlu_mechanical_engineering": { "alias": " - kmmlu_mechanical_engineering", "acc,none": 0.4125, "acc_stderr,none": 0.024645036407943802 }, "kobest_boolq": { "alias": "kobest_boolq", "acc,none": 0.6675, "acc_stderr,none": 0.023584952830141535, "f1,none": 
0.6247575383530242, "f1_stderr,none": "N/A" }, "kobest_copa": { "alias": "kobest_copa", "acc,none": 0.6475, "acc_stderr,none": 0.023917346710791564, "f1,none": 0.6473920138042275, "f1_stderr,none": "N/A" }, "kobest_hellaswag": { "alias": "kobest_hellaswag", "acc,none": 0.44, "acc_stderr,none": 0.02485042976789583, "f1,none": 0.4328647077786627, "f1_stderr,none": "N/A", "acc_norm,none": 0.5825, "acc_norm_stderr,none": 0.024688218756390913 }, "mmlu": { "acc,none": 0.7404266255461321, "acc_stderr,none": 0.003869340083262106, "alias": "mmlu" }, "mmlu_humanities": { "acc,none": 0.6931079323797139, "acc_stderr,none": 0.0077779673157217745, "alias": " - humanities" }, "mmlu_formal_logic": { "alias": " - formal_logic", "acc,none": 0.5793650793650794, "acc_stderr,none": 0.04415438226743745 }, "mmlu_high_school_european_history": { "alias": " - high_school_european_history", "acc,none": 0.7818181818181819, "acc_stderr,none": 0.03225078108306289 }, "mmlu_high_school_us_history": { "alias": " - high_school_us_history", "acc,none": 0.8284313725490197, "acc_stderr,none": 0.02646056956124065 }, "mmlu_high_school_world_history": { "alias": " - high_school_world_history", "acc,none": 0.8438818565400844, "acc_stderr,none": 0.023627159460318684 }, "mmlu_international_law": { "alias": " - international_law", "acc,none": 0.8016528925619835, "acc_stderr,none": 0.03640118271990946 }, "mmlu_jurisprudence": { "alias": " - jurisprudence", "acc,none": 0.7962962962962963, "acc_stderr,none": 0.03893542518824847 }, "mmlu_logical_fallacies": { "alias": " - logical_fallacies", "acc,none": 0.8404907975460123, "acc_stderr,none": 0.02876748172598387 }, "mmlu_moral_disputes": { "alias": " - moral_disputes", "acc,none": 0.7543352601156069, "acc_stderr,none": 0.023176298203992 }, "mmlu_moral_scenarios": { "alias": " - moral_scenarios", "acc,none": 0.3475, "acc_stderr,none": 0.023838625698390636 }, "mmlu_philosophy": { "alias": " - philosophy", "acc,none": 0.7588424437299035, "acc_stderr,none": 0.02429659403476343 }, "mmlu_prehistory": { "alias": " - prehistory", "acc,none": 0.7870370370370371, "acc_stderr,none": 0.02277971908873339 }, "mmlu_professional_law": { "alias": " - professional_law", "acc,none": 0.53, "acc_stderr,none": 0.02498621173652297 }, "mmlu_world_religions": { "alias": " - world_religions", "acc,none": 0.8070175438596491, "acc_stderr,none": 0.030267457554898458 }, "mmlu_other": { "acc,none": 0.7437591776798825, "acc_stderr,none": 0.008056333552095894, "alias": " - other" }, "mmlu_business_ethics": { "alias": " - business_ethics", "acc,none": 0.75, "acc_stderr,none": 0.04351941398892446 }, "mmlu_clinical_knowledge": { "alias": " - clinical_knowledge", "acc,none": 0.7773584905660378, "acc_stderr,none": 0.0256042334708991 }, "mmlu_college_medicine": { "alias": " - college_medicine", "acc,none": 0.7341040462427746, "acc_stderr,none": 0.03368762932259431 }, "mmlu_global_facts": { "alias": " - global_facts", "acc,none": 0.43, "acc_stderr,none": 0.04975698519562429 }, "mmlu_human_aging": { "alias": " - human_aging", "acc,none": 0.7488789237668162, "acc_stderr,none": 0.02910522083322461 }, "mmlu_management": { "alias": " - management", "acc,none": 0.8932038834951457, "acc_stderr,none": 0.030581088928331356 }, "mmlu_marketing": { "alias": " - marketing", "acc,none": 0.9145299145299145, "acc_stderr,none": 0.018315891685625862 }, "mmlu_medical_genetics": { "alias": " - medical_genetics", "acc,none": 0.8, "acc_stderr,none": 0.04020151261036846 }, "mmlu_miscellaneous": { "alias": " - miscellaneous", "acc,none": 0.82, 
"acc_stderr,none": 0.01923342954415769 }, "mmlu_nutrition": { "alias": " - nutrition", "acc,none": 0.7745098039215687, "acc_stderr,none": 0.023929155517351277 }, "mmlu_professional_accounting": { "alias": " - professional_accounting", "acc,none": 0.5709219858156028, "acc_stderr,none": 0.02952591430255856 }, "mmlu_professional_medicine": { "alias": " - professional_medicine", "acc,none": 0.7757352941176471, "acc_stderr,none": 0.025336848563332365 }, "mmlu_virology": { "alias": " - virology", "acc,none": 0.5120481927710844, "acc_stderr,none": 0.03891364495835817 }, "mmlu_social_sciences": { "acc,none": 0.8202205882352941, "acc_stderr,none": 0.007248431086566561, "alias": " - social sciences" }, "mmlu_econometrics": { "alias": " - econometrics", "acc,none": 0.6578947368421053, "acc_stderr,none": 0.04462917535336937 }, "mmlu_high_school_geography": { "alias": " - high_school_geography", "acc,none": 0.8737373737373737, "acc_stderr,none": 0.02366435940288024 }, "mmlu_high_school_government_and_politics": { "alias": " - high_school_government_and_politics", "acc,none": 0.8756476683937824, "acc_stderr,none": 0.023814477086593556 }, "mmlu_high_school_macroeconomics": { "alias": " - high_school_macroeconomics", "acc,none": 0.8076923076923077, "acc_stderr,none": 0.019982347208637292 }, "mmlu_high_school_microeconomics": { "alias": " - high_school_microeconomics", "acc,none": 0.8991596638655462, "acc_stderr,none": 0.019559663430480802 }, "mmlu_high_school_psychology": { "alias": " - high_school_psychology", "acc,none": 0.9025, "acc_stderr,none": 0.0148504449187799 }, "mmlu_human_sexuality": { "alias": " - human_sexuality", "acc,none": 0.7862595419847328, "acc_stderr,none": 0.035954616117746904 }, "mmlu_professional_psychology": { "alias": " - professional_psychology", "acc,none": 0.7475, "acc_stderr,none": 0.0217495282695941 }, "mmlu_public_relations": { "alias": " - public_relations", "acc,none": 0.6818181818181818, "acc_stderr,none": 0.04461272175910509 }, "mmlu_security_studies": { "alias": " - security_studies", "acc,none": 0.7673469387755102, "acc_stderr,none": 0.02704925791589618 }, "mmlu_sociology": { "alias": " - sociology", "acc,none": 0.845771144278607, "acc_stderr,none": 0.02553843336857833 }, "mmlu_us_foreign_policy": { "alias": " - us_foreign_policy", "acc,none": 0.89, "acc_stderr,none": 0.03144660377352203 }, "mmlu_stem": { "acc,none": 0.7148747224865207, "acc_stderr,none": 0.007751851248299227, "alias": " - stem" }, "mmlu_abstract_algebra": { "alias": " - abstract_algebra", "acc,none": 0.47, "acc_stderr,none": 0.050161355804659205 }, "mmlu_anatomy": { "alias": " - anatomy", "acc,none": 0.6888888888888889, "acc_stderr,none": 0.03999262876617723 }, "mmlu_astronomy": { "alias": " - astronomy", "acc,none": 0.8421052631578947, "acc_stderr,none": 0.02967416752010141 }, "mmlu_college_biology": { "alias": " - college_biology", "acc,none": 0.8402777777777778, "acc_stderr,none": 0.030635578972093267 }, "mmlu_college_chemistry": { "alias": " - college_chemistry", "acc,none": 0.56, "acc_stderr,none": 0.049888765156985884 }, "mmlu_college_computer_science": { "alias": " - college_computer_science", "acc,none": 0.66, "acc_stderr,none": 0.04760952285695237 }, "mmlu_college_mathematics": { "alias": " - college_mathematics", "acc,none": 0.52, "acc_stderr,none": 0.050211673156867795 }, "mmlu_college_physics": { "alias": " - college_physics", "acc,none": 0.5686274509803921, "acc_stderr,none": 0.04928099597287534 }, "mmlu_computer_security": { "alias": " - computer_security", "acc,none": 0.83, 
"acc_stderr,none": 0.0377525168068637 }, "mmlu_conceptual_physics": { "alias": " - conceptual_physics", "acc,none": 0.7957446808510639, "acc_stderr,none": 0.026355158413349428 }, "mmlu_electrical_engineering": { "alias": " - electrical_engineering", "acc,none": 0.7517241379310344, "acc_stderr,none": 0.036001056927277716 }, "mmlu_elementary_mathematics": { "alias": " - elementary_mathematics", "acc,none": 0.7116402116402116, "acc_stderr,none": 0.023330654054535903 }, "mmlu_high_school_biology": { "alias": " - high_school_biology", "acc,none": 0.9161290322580645, "acc_stderr,none": 0.015769027496775653 }, "mmlu_high_school_chemistry": { "alias": " - high_school_chemistry", "acc,none": 0.7192118226600985, "acc_stderr,none": 0.03161856335358611 }, "mmlu_high_school_computer_science": { "alias": " - high_school_computer_science", "acc,none": 0.87, "acc_stderr,none": 0.03379976689896309 }, "mmlu_high_school_mathematics": { "alias": " - high_school_mathematics", "acc,none": 0.5222222222222223, "acc_stderr,none": 0.030455413985678408 }, "mmlu_high_school_physics": { "alias": " - high_school_physics", "acc,none": 0.6754966887417219, "acc_stderr,none": 0.038227469376587525 }, "mmlu_high_school_statistics": { "alias": " - high_school_statistics", "acc,none": 0.7222222222222222, "acc_stderr,none": 0.030546745264953185 }, "mmlu_machine_learning": { "alias": " - machine_learning", "acc,none": 0.6160714285714286, "acc_stderr,none": 0.04616143075028546 }, "winogrande": { "alias": "winogrande", "acc,none": 0.7375, "acc_stderr,none": 0.022027196108925243 } }, "groups": { "kmmlu": { "acc,none": 0.47556707712248864, "acc_stderr,none": 0.003918983222456166, "alias": "kmmlu" }, "kmmlu_applied_science": { "acc,none": 0.45875, "acc_stderr,none": 0.007101063857525891, "alias": " - kmmlu_applied_science" }, "kmmlu_humss": { "acc,none": 0.4805860805860806, "acc_stderr,none": 0.009419825503999339, "alias": " - kmmlu_humss" }, "kmmlu_other": { "acc,none": 0.4772222222222222, "acc_stderr,none": 0.008073884461069719, "alias": " - kmmlu_other" }, "kmmlu_stem": { "acc,none": 0.4897674418604651, "acc_stderr,none": 0.007312394370135803, "alias": " - kmmlu_stem" }, "mmlu": { "acc,none": 0.7404266255461321, "acc_stderr,none": 0.003869340083262106, "alias": "mmlu" }, "mmlu_humanities": { "acc,none": 0.6931079323797139, "acc_stderr,none": 0.0077779673157217745, "alias": " - humanities" }, "mmlu_other": { "acc,none": 0.7437591776798825, "acc_stderr,none": 0.008056333552095894, "alias": " - other" }, "mmlu_social_sciences": { "acc,none": 0.8202205882352941, "acc_stderr,none": 0.007248431086566561, "alias": " - social sciences" }, "mmlu_stem": { "acc,none": 0.7148747224865207, "acc_stderr,none": 0.007751851248299227, "alias": " - stem" } }, "group_subtasks": { "mmlu_humanities": [ "mmlu_formal_logic", "mmlu_high_school_european_history", "mmlu_high_school_us_history", "mmlu_high_school_world_history", "mmlu_international_law", "mmlu_jurisprudence", "mmlu_logical_fallacies", "mmlu_moral_disputes", "mmlu_moral_scenarios", "mmlu_philosophy", "mmlu_prehistory", "mmlu_professional_law", "mmlu_world_religions" ], "mmlu_social_sciences": [ "mmlu_econometrics", "mmlu_high_school_geography", "mmlu_high_school_government_and_politics", "mmlu_high_school_macroeconomics", "mmlu_high_school_microeconomics", "mmlu_high_school_psychology", "mmlu_human_sexuality", "mmlu_professional_psychology", "mmlu_public_relations", "mmlu_security_studies", "mmlu_sociology", "mmlu_us_foreign_policy" ], "mmlu_other": [ "mmlu_business_ethics", 
"mmlu_clinical_knowledge", "mmlu_college_medicine", "mmlu_global_facts", "mmlu_human_aging", "mmlu_management", "mmlu_marketing", "mmlu_medical_genetics", "mmlu_miscellaneous", "mmlu_nutrition", "mmlu_professional_accounting", "mmlu_professional_medicine", "mmlu_virology" ], "mmlu_stem": [ "mmlu_abstract_algebra", "mmlu_anatomy", "mmlu_astronomy", "mmlu_college_biology", "mmlu_college_chemistry", "mmlu_college_computer_science", "mmlu_college_mathematics", "mmlu_college_physics", "mmlu_computer_security", "mmlu_conceptual_physics", "mmlu_electrical_engineering", "mmlu_elementary_mathematics", "mmlu_high_school_biology", "mmlu_high_school_chemistry", "mmlu_high_school_computer_science", "mmlu_high_school_mathematics", "mmlu_high_school_physics", "mmlu_high_school_statistics", "mmlu_machine_learning" ], "mmlu": [ "mmlu_stem", "mmlu_other", "mmlu_social_sciences", "mmlu_humanities" ], "hellaswag": [], "arc_easy": [], "arc_challenge": [], "winogrande": [], "kmmlu_humss": [ "kmmlu_accounting", "kmmlu_criminal_law", "kmmlu_economics", "kmmlu_education", "kmmlu_korean_history", "kmmlu_law", "kmmlu_management", "kmmlu_political_science_and_sociology", "kmmlu_psychology", "kmmlu_social_welfare", "kmmlu_taxation" ], "kmmlu_applied_science": [ "kmmlu_aviation_engineering_and_maintenance", "kmmlu_electronics_engineering", "kmmlu_energy_management", "kmmlu_environmental_science", "kmmlu_gas_technology_and_engineering", "kmmlu_geomatics", "kmmlu_industrial_engineer", "kmmlu_machine_design_and_manufacturing", "kmmlu_maritime_engineering", "kmmlu_nondestructive_testing", "kmmlu_railway_and_automotive_engineering", "kmmlu_telecommunications_and_wireless_technology" ], "kmmlu_other": [ "kmmlu_agricultural_sciences", "kmmlu_construction", "kmmlu_fashion", "kmmlu_food_processing", "kmmlu_health", "kmmlu_interior_architecture_and_design", "kmmlu_marketing", "kmmlu_patent", "kmmlu_public_safety", "kmmlu_real_estate", "kmmlu_refrigerating_machinery" ], "kmmlu_stem": [ "kmmlu_biology", "kmmlu_chemical_engineering", "kmmlu_chemistry", "kmmlu_civil_engineering", "kmmlu_computer_science", "kmmlu_ecology", "kmmlu_electrical_engineering", "kmmlu_information_technology", "kmmlu_materials_engineering", "kmmlu_math", "kmmlu_mechanical_engineering" ], "kmmlu": [ "kmmlu_stem", "kmmlu_other", "kmmlu_applied_science", "kmmlu_humss" ], "kobest_boolq": [], "kobest_copa": [], "kobest_hellaswag": [] }, "configs": { "arc_challenge": { "task": "arc_challenge", "tag": [ "ai2_arc" ], "dataset_path": "allenai/ai2_arc", "dataset_name": "ARC-Challenge", "training_split": "train", "validation_split": "validation", "test_split": "test", "doc_to_text": "Question: {{question}}\nAnswer:", "doc_to_target": "{{choices.label.index(answerKey)}}", "unsafe_code": false, "doc_to_choice": "{{choices.text}}", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": null, "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "Question: {{question}}\nAnswer:", "doc_to_choice": "{{choices.text}}", "doc_to_target": "{{choices.label.index(answerKey)}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", "metadata": { 
"version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "arc_easy": { "task": "arc_easy", "tag": [ "ai2_arc" ], "dataset_path": "allenai/ai2_arc", "dataset_name": "ARC-Easy", "training_split": "train", "validation_split": "validation", "test_split": "test", "doc_to_text": "Question: {{question}}\nAnswer:", "doc_to_target": "{{choices.label.index(answerKey)}}", "unsafe_code": false, "doc_to_choice": "{{choices.text}}", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": null, "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "Question: {{question}}\nAnswer:", "doc_to_choice": "{{choices.text}}", "doc_to_target": "{{choices.label.index(answerKey)}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "hellaswag": { "task": "hellaswag", "tag": [ "multiple_choice" ], "dataset_path": "Rowan/hellaswag", "training_split": "train", "validation_split": "validation", "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", "doc_to_text": "{{query}}", "doc_to_target": "{{label}}", "unsafe_code": false, "doc_to_choice": "choices", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": null, "process_docs": "", "fewshot_indices": null, "samples": null, "doc_to_text": "{{query}}", "doc_to_choice": "choices", "doc_to_target": "{{label}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_accounting": { "task": "kmmlu_accounting", "tag": "kmmlu_humss_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Accounting", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_agricultural_sciences": { "task": "kmmlu_agricultural_sciences", "tag": "kmmlu_other_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Agricultural-Sciences", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_aviation_engineering_and_maintenance": { "task": "kmmlu_aviation_engineering_and_maintenance", "tag": "kmmlu_applied_science_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Aviation-Engineering-and-Maintenance", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_biology": { "task": "kmmlu_biology", "tag": "kmmlu_stem_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Biology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_chemical_engineering": { "task": "kmmlu_chemical_engineering", "tag": "kmmlu_stem_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Chemical-Engineering", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_chemistry": { "task": "kmmlu_chemistry", "tag": "kmmlu_stem_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Chemistry", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_civil_engineering": { "task": "kmmlu_civil_engineering", "tag": "kmmlu_stem_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Civil-Engineering", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_computer_science": { "task": "kmmlu_computer_science", "tag": "kmmlu_stem_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Computer-Science", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_construction": { "task": "kmmlu_construction", "tag": "kmmlu_other_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Construction", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_criminal_law": { "task": "kmmlu_criminal_law", "tag": "kmmlu_humss_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Criminal-Law", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_ecology": { "task": "kmmlu_ecology", "tag": "kmmlu_stem_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Ecology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_economics": { "task": "kmmlu_economics", "tag": "kmmlu_humss_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Economics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_education": { "task": "kmmlu_education", "tag": "kmmlu_humss_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Education", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_electrical_engineering": { "task": "kmmlu_electrical_engineering", "tag": "kmmlu_stem_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Electrical-Engineering", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_electronics_engineering": { "task": "kmmlu_electronics_engineering", "tag": "kmmlu_applied_science_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Electronics-Engineering", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_energy_management": { "task": "kmmlu_energy_management", "tag": "kmmlu_applied_science_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Energy-Management", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_environmental_science": { "task": "kmmlu_environmental_science", "tag": "kmmlu_applied_science_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Environmental-Science", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_fashion": { "task": "kmmlu_fashion", "tag": "kmmlu_other_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Fashion", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_food_processing": { "task": "kmmlu_food_processing", "tag": "kmmlu_other_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Food-Processing", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_gas_technology_and_engineering": { "task": "kmmlu_gas_technology_and_engineering", "tag": "kmmlu_applied_science_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Gas-Technology-and-Engineering", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_geomatics": { "task": "kmmlu_geomatics", "tag": "kmmlu_applied_science_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Geomatics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_health": { "task": "kmmlu_health", "tag": "kmmlu_other_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Health", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_industrial_engineer": { "task": "kmmlu_industrial_engineer", "tag": "kmmlu_applied_science_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Industrial-Engineer", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_information_technology": { "task": "kmmlu_information_technology", "tag": "kmmlu_stem_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Information-Technology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_interior_architecture_and_design": { "task": "kmmlu_interior_architecture_and_design", "tag": "kmmlu_other_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Interior-Architecture-and-Design", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_korean_history": { "task": "kmmlu_korean_history", "tag": "kmmlu_humss_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Korean-History", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_law": { "task": "kmmlu_law", "tag": "kmmlu_humss_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Law", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_machine_design_and_manufacturing": { "task": "kmmlu_machine_design_and_manufacturing", "tag": "kmmlu_applied_science_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Machine-Design-and-Manufacturing", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_management": { "task": "kmmlu_management", "tag": "kmmlu_humss_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Management", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_maritime_engineering": { "task": "kmmlu_maritime_engineering", "tag": "kmmlu_applied_science_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Maritime-Engineering", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_marketing": { "task": "kmmlu_marketing", "tag": "kmmlu_other_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Marketing", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_materials_engineering": { "task": "kmmlu_materials_engineering", "tag": "kmmlu_stem_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Materials-Engineering", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_math": { "task": "kmmlu_math", "tag": "kmmlu_stem_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Math", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_mechanical_engineering": { "task": "kmmlu_mechanical_engineering", "tag": "kmmlu_stem_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Mechanical-Engineering", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_nondestructive_testing": { "task": "kmmlu_nondestructive_testing", "tag": "kmmlu_applied_science_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Nondestructive-Testing", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_patent": { "task": "kmmlu_patent", "tag": "kmmlu_other_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Patent", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_political_science_and_sociology": { "task": "kmmlu_political_science_and_sociology", "tag": "kmmlu_humss_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Political-Science-and-Sociology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_psychology": { "task": "kmmlu_psychology", "tag": "kmmlu_humss_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Psychology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_public_safety": { "task": "kmmlu_public_safety", "tag": "kmmlu_other_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Public-Safety", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_railway_and_automotive_engineering": { "task": "kmmlu_railway_and_automotive_engineering", "tag": "kmmlu_applied_science_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Railway-and-Automotive-Engineering", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_real_estate": { "task": "kmmlu_real_estate", "tag": "kmmlu_other_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Real-Estate", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_refrigerating_machinery": { "task": "kmmlu_refrigerating_machinery", "tag": "kmmlu_other_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Refrigerating-Machinery", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_social_welfare": { "task": "kmmlu_social_welfare", "tag": "kmmlu_humss_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Social-Welfare", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_taxation": { "task": "kmmlu_taxation", "tag": "kmmlu_humss_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Taxation", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kmmlu_telecommunications_and_wireless_technology": { "task": "kmmlu_telecommunications_and_wireless_technology", "tag": "kmmlu_applied_science_tasks", "dataset_path": "HAERAE-HUB/KMMLU", "dataset_name": "Telecommunications-and-Wireless-Technology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", "doc_to_target": "{{answer-1}}", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "{{answer-1}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 2.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kobest_boolq": { "task": "kobest_boolq", "dataset_path": "skt/kobest_v1", "dataset_name": "boolq", "training_split": "train", "validation_split": "validation", "test_split": "test", "doc_to_text": "{{paragraph}} 질문: {{question}} 답변: ", "doc_to_target": "{{label}}", "unsafe_code": false, "doc_to_choice": [ "아니오", "예" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": null, "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{paragraph}} 질문: {{question}} 답변: ", "doc_to_choice": [ "아니오", "예" ], "doc_to_target": "{{label}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "f1", "aggregation": "def macro_f1_score(items):\n from sklearn.metrics import f1_score\n\n unzipped_list = list(zip(*items))\n golds = unzipped_list[0]\n preds = unzipped_list[1]\n fscore = f1_score(golds, preds, average=\"macro\")\n return fscore\n", "average": "macro", "hf_evaluate": true, "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kobest_copa": { "task": "kobest_copa", "dataset_path": "skt/kobest_v1", "dataset_name": "copa", "training_split": "train", "validation_split": "validation", "test_split": "test", "doc_to_text": "def copa_doc_to_text(doc: dict) -> str:\n connector = {\"원인\": \" 왜냐하면\", \"결과\": \" 그래서\"}[doc[\"question\"].strip()]\n return f\"\"\"{doc[\"premise\"]} {connector}\"\"\"\n", "doc_to_target": "def copa_doc_to_target(doc: dict) -> str:\n correct_choice = doc[\"alternative_1\"] if doc[\"label\"] == 0 else doc[\"alternative_2\"]\n return f\"\"\"{correct_choice}\"\"\"\n", "unsafe_code": false, "doc_to_choice": "def copa_doc_to_choice(doc: dict) -> list:\n return [f\"\"\"{doc[\"alternative_1\"]}\"\"\", f\"\"\"{doc[\"alternative_2\"]}\"\"\"]\n", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": null, "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "", "doc_to_choice": "", "doc_to_target": "", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "f1", "aggregation": "def macro_f1_score(items):\n from sklearn.metrics import f1_score\n\n unzipped_list = list(zip(*items))\n golds = unzipped_list[0]\n preds = unzipped_list[1]\n fscore = f1_score(golds, preds, average=\"macro\")\n return fscore\n", "average": "macro", "hf_evaluate": true, "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "kobest_hellaswag": { "task": "kobest_hellaswag", "dataset_path": "skt/kobest_v1", 
"dataset_name": "hellaswag", "training_split": "train", "validation_split": "validation", "test_split": "test", "process_docs": "def hellaswag_process_doc(doc: Dataset) -> Dataset:\n def preprocessor(dataset):\n return {\n \"query\": f\"\"\"문장: {dataset[\"context\"]}\"\"\",\n \"choices\": [\n dataset[\"ending_1\"],\n dataset[\"ending_2\"],\n dataset[\"ending_3\"],\n dataset[\"ending_4\"],\n ],\n \"gold\": int(dataset[\"label\"]),\n }\n\n return doc.map(preprocessor)\n", "doc_to_text": "{{query}}", "doc_to_target": "{{label}}", "unsafe_code": false, "doc_to_choice": "choices", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": null, "process_docs": "", "fewshot_indices": null, "samples": null, "doc_to_text": "{{query}}", "doc_to_choice": "choices", "doc_to_target": "{{label}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true }, { "metric": "f1", "aggregation": "def macro_f1_score(items):\n from sklearn.metrics import f1_score\n\n unzipped_list = list(zip(*items))\n golds = unzipped_list[0]\n preds = unzipped_list[1]\n fscore = f1_score(golds, preds, average=\"macro\")\n return fscore\n", "average": "macro", "hf_evaluate": true, "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_abstract_algebra": { "task": "mmlu_abstract_algebra", "task_alias": "abstract_algebra", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_anatomy": { "task": "mmlu_anatomy", "task_alias": "anatomy", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "anatomy", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_astronomy": { "task": "mmlu_astronomy", "task_alias": "astronomy", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "astronomy", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_business_ethics": { "task": "mmlu_business_ethics", "task_alias": "business_ethics", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "business_ethics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_clinical_knowledge": { "task": "mmlu_clinical_knowledge", "task_alias": "clinical_knowledge", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "clinical_knowledge", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_college_biology": { "task": "mmlu_college_biology", "task_alias": "college_biology", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "college_biology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college biology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_college_chemistry": { "task": "mmlu_college_chemistry", "task_alias": "college_chemistry", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "college_chemistry", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_college_computer_science": { "task": "mmlu_college_computer_science", "task_alias": "college_computer_science", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "college_computer_science", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_college_mathematics": { "task": "mmlu_college_mathematics", "task_alias": "college_mathematics", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "college_mathematics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_college_medicine": { "task": "mmlu_college_medicine", "task_alias": "college_medicine", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "college_medicine", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_college_physics": { "task": "mmlu_college_physics", "task_alias": "college_physics", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "college_physics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college physics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_computer_security": { "task": "mmlu_computer_security", "task_alias": "computer_security", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "computer_security", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about computer security.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_conceptual_physics": { "task": "mmlu_conceptual_physics", "task_alias": "conceptual_physics", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "conceptual_physics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_econometrics": { "task": "mmlu_econometrics", "task_alias": "econometrics", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "econometrics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_electrical_engineering": { "task": "mmlu_electrical_engineering", "task_alias": "electrical_engineering", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "electrical_engineering", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_elementary_mathematics": { "task": "mmlu_elementary_mathematics", "task_alias": "elementary_mathematics", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "elementary_mathematics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_formal_logic": { "task": "mmlu_formal_logic", "task_alias": "formal_logic", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "formal_logic", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_global_facts": { "task": "mmlu_global_facts", "task_alias": "global_facts", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "global_facts", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about global facts.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_high_school_biology": { "task": "mmlu_high_school_biology", "task_alias": "high_school_biology", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_biology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_high_school_chemistry": { "task": "mmlu_high_school_chemistry", "task_alias": "high_school_chemistry", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_chemistry", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_high_school_computer_science": { "task": "mmlu_high_school_computer_science", "task_alias": "high_school_computer_science", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_computer_science", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_high_school_european_history": { "task": "mmlu_high_school_european_history", "task_alias": "high_school_european_history", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_european_history", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_high_school_geography": { "task": "mmlu_high_school_geography", "task_alias": "high_school_geography", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_geography", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_high_school_government_and_politics": { "task": "mmlu_high_school_government_and_politics", "task_alias": "high_school_government_and_politics", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_government_and_politics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_high_school_macroeconomics": { "task": "mmlu_high_school_macroeconomics", "task_alias": "high_school_macroeconomics", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_macroeconomics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_high_school_mathematics": { "task": "mmlu_high_school_mathematics", "task_alias": "high_school_mathematics", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_mathematics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_high_school_microeconomics": { "task": "mmlu_high_school_microeconomics", "task_alias": "high_school_microeconomics", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_microeconomics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_high_school_physics": { "task": "mmlu_high_school_physics", "task_alias": "high_school_physics", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_physics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_high_school_psychology": { "task": "mmlu_high_school_psychology", "task_alias": "high_school_psychology", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_psychology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_high_school_statistics": { "task": "mmlu_high_school_statistics", "task_alias": "high_school_statistics", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_statistics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_high_school_us_history": { "task": "mmlu_high_school_us_history", "task_alias": "high_school_us_history", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_us_history", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_high_school_world_history": { "task": "mmlu_high_school_world_history", "task_alias": "high_school_world_history", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_world_history", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_human_aging": { "task": "mmlu_human_aging", "task_alias": "human_aging", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "human_aging", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about human aging.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_human_sexuality": { "task": "mmlu_human_sexuality", "task_alias": "human_sexuality", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "human_sexuality", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_international_law": { "task": "mmlu_international_law", "task_alias": "international_law", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "international_law", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about international law.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_jurisprudence": { "task": "mmlu_jurisprudence", "task_alias": "jurisprudence", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "jurisprudence", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_logical_fallacies": { "task": "mmlu_logical_fallacies", "task_alias": "logical_fallacies", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "logical_fallacies", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_machine_learning": { "task": "mmlu_machine_learning", "task_alias": "machine_learning", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "machine_learning", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_management": { "task": "mmlu_management", "task_alias": "management", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "management", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about management.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_marketing": { "task": "mmlu_marketing", "task_alias": "marketing", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "marketing", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about marketing.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_medical_genetics": { "task": "mmlu_medical_genetics", "task_alias": "medical_genetics", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "medical_genetics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_miscellaneous": { "task": "mmlu_miscellaneous", "task_alias": "miscellaneous", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "miscellaneous", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_moral_disputes": { "task": "mmlu_moral_disputes", "task_alias": "moral_disputes", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "moral_disputes", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_moral_scenarios": { "task": "mmlu_moral_scenarios", "task_alias": "moral_scenarios", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "moral_scenarios", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_nutrition": { "task": "mmlu_nutrition", "task_alias": "nutrition", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "nutrition", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_philosophy": { "task": "mmlu_philosophy", "task_alias": "philosophy", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "philosophy", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_prehistory": { "task": "mmlu_prehistory", "task_alias": "prehistory", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "prehistory", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_professional_accounting": { "task": "mmlu_professional_accounting", "task_alias": "professional_accounting", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "professional_accounting", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_professional_law": { "task": "mmlu_professional_law", "task_alias": "professional_law", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "professional_law", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about professional law.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_professional_medicine": { "task": "mmlu_professional_medicine", "task_alias": "professional_medicine", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "professional_medicine", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_professional_psychology": { "task": "mmlu_professional_psychology", "task_alias": "professional_psychology", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "professional_psychology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_public_relations": { "task": "mmlu_public_relations", "task_alias": "public_relations", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "public_relations", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about public relations.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_security_studies": { "task": "mmlu_security_studies", "task_alias": "security_studies", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "security_studies", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about security studies.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_sociology": { "task": "mmlu_sociology", "task_alias": "sociology", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "sociology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about sociology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_us_foreign_policy": { "task": "mmlu_us_foreign_policy", "task_alias": "us_foreign_policy", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "us_foreign_policy", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_virology": { "task": "mmlu_virology", "task_alias": "virology", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "virology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about virology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "mmlu_world_religions": { "task": "mmlu_world_religions", "task_alias": "world_religions", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "world_religions", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about world religions.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "split": "dev", "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_choice": [ "A", "B", "C", "D" ], "doc_to_target": "answer", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } }, "winogrande": { "task": "winogrande", "dataset_path": "allenai/winogrande", "dataset_name": "winogrande_xl", "training_split": "train", "validation_split": "validation", "doc_to_text": "def doc_to_text(doc):\n answer_to_num = {\"1\": 0, \"2\": 1}\n return answer_to_num[doc[\"answer\"]]\n", "doc_to_target": "def doc_to_target(doc):\n idx = doc[\"sentence\"].index(\"_\") + 1\n return doc[\"sentence\"][idx:].strip()\n", "unsafe_code": false, "doc_to_choice": "def doc_to_choice(doc):\n idx = doc[\"sentence\"].index(\"_\")\n options = [doc[\"option1\"], doc[\"option2\"]]\n return [doc[\"sentence\"][:idx] + opt for opt in options]\n", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "default", "split": null, "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "", "doc_to_choice": "", "doc_to_target": "", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " " }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "sentence", "metadata": { "version": 1.0, "pretrained": "unsloth/Qwen3-4B-Base", "trust_remote_code": true } } }, "versions": { "arc_challenge": 1.0, "arc_easy": 1.0, "hellaswag": 1.0, "kmmlu": 2.0, "kmmlu_accounting": 2.0, "kmmlu_agricultural_sciences": 2.0, "kmmlu_applied_science": 2.0, "kmmlu_aviation_engineering_and_maintenance": 2.0, "kmmlu_biology": 2.0, "kmmlu_chemical_engineering": 2.0, "kmmlu_chemistry": 2.0, "kmmlu_civil_engineering": 2.0, "kmmlu_computer_science": 2.0, "kmmlu_construction": 2.0, "kmmlu_criminal_law": 2.0, "kmmlu_ecology": 2.0, "kmmlu_economics": 2.0, "kmmlu_education": 2.0, "kmmlu_electrical_engineering": 2.0, "kmmlu_electronics_engineering": 2.0, "kmmlu_energy_management": 2.0, "kmmlu_environmental_science": 2.0, "kmmlu_fashion": 2.0, "kmmlu_food_processing": 2.0, "kmmlu_gas_technology_and_engineering": 2.0, "kmmlu_geomatics": 2.0, "kmmlu_health": 2.0, "kmmlu_humss": 2.0, "kmmlu_industrial_engineer": 2.0, "kmmlu_information_technology": 2.0, "kmmlu_interior_architecture_and_design": 2.0, "kmmlu_korean_history": 2.0, "kmmlu_law": 2.0, "kmmlu_machine_design_and_manufacturing": 2.0, "kmmlu_management": 2.0, "kmmlu_maritime_engineering": 2.0, "kmmlu_marketing": 2.0, "kmmlu_materials_engineering": 2.0, "kmmlu_math": 2.0, "kmmlu_mechanical_engineering": 2.0, "kmmlu_nondestructive_testing": 2.0, "kmmlu_other": 2.0, "kmmlu_patent": 2.0, 
"kmmlu_political_science_and_sociology": 2.0, "kmmlu_psychology": 2.0, "kmmlu_public_safety": 2.0, "kmmlu_railway_and_automotive_engineering": 2.0, "kmmlu_real_estate": 2.0, "kmmlu_refrigerating_machinery": 2.0, "kmmlu_social_welfare": 2.0, "kmmlu_stem": 2.0, "kmmlu_taxation": 2.0, "kmmlu_telecommunications_and_wireless_technology": 2.0, "kobest_boolq": 1.0, "kobest_copa": 1.0, "kobest_hellaswag": 1.0, "mmlu": 2, "mmlu_abstract_algebra": 1.0, "mmlu_anatomy": 1.0, "mmlu_astronomy": 1.0, "mmlu_business_ethics": 1.0, "mmlu_clinical_knowledge": 1.0, "mmlu_college_biology": 1.0, "mmlu_college_chemistry": 1.0, "mmlu_college_computer_science": 1.0, "mmlu_college_mathematics": 1.0, "mmlu_college_medicine": 1.0, "mmlu_college_physics": 1.0, "mmlu_computer_security": 1.0, "mmlu_conceptual_physics": 1.0, "mmlu_econometrics": 1.0, "mmlu_electrical_engineering": 1.0, "mmlu_elementary_mathematics": 1.0, "mmlu_formal_logic": 1.0, "mmlu_global_facts": 1.0, "mmlu_high_school_biology": 1.0, "mmlu_high_school_chemistry": 1.0, "mmlu_high_school_computer_science": 1.0, "mmlu_high_school_european_history": 1.0, "mmlu_high_school_geography": 1.0, "mmlu_high_school_government_and_politics": 1.0, "mmlu_high_school_macroeconomics": 1.0, "mmlu_high_school_mathematics": 1.0, "mmlu_high_school_microeconomics": 1.0, "mmlu_high_school_physics": 1.0, "mmlu_high_school_psychology": 1.0, "mmlu_high_school_statistics": 1.0, "mmlu_high_school_us_history": 1.0, "mmlu_high_school_world_history": 1.0, "mmlu_human_aging": 1.0, "mmlu_human_sexuality": 1.0, "mmlu_humanities": 2, "mmlu_international_law": 1.0, "mmlu_jurisprudence": 1.0, "mmlu_logical_fallacies": 1.0, "mmlu_machine_learning": 1.0, "mmlu_management": 1.0, "mmlu_marketing": 1.0, "mmlu_medical_genetics": 1.0, "mmlu_miscellaneous": 1.0, "mmlu_moral_disputes": 1.0, "mmlu_moral_scenarios": 1.0, "mmlu_nutrition": 1.0, "mmlu_other": 2, "mmlu_philosophy": 1.0, "mmlu_prehistory": 1.0, "mmlu_professional_accounting": 1.0, "mmlu_professional_law": 1.0, "mmlu_professional_medicine": 1.0, "mmlu_professional_psychology": 1.0, "mmlu_public_relations": 1.0, "mmlu_security_studies": 1.0, "mmlu_social_sciences": 2, "mmlu_sociology": 1.0, "mmlu_stem": 2, "mmlu_us_foreign_policy": 1.0, "mmlu_virology": 1.0, "mmlu_world_religions": 1.0, "winogrande": 1.0 }, "n-shot": { "arc_challenge": 0, "arc_easy": 0, "hellaswag": 0, "kmmlu_accounting": 0, "kmmlu_agricultural_sciences": 0, "kmmlu_aviation_engineering_and_maintenance": 0, "kmmlu_biology": 0, "kmmlu_chemical_engineering": 0, "kmmlu_chemistry": 0, "kmmlu_civil_engineering": 0, "kmmlu_computer_science": 0, "kmmlu_construction": 0, "kmmlu_criminal_law": 0, "kmmlu_ecology": 0, "kmmlu_economics": 0, "kmmlu_education": 0, "kmmlu_electrical_engineering": 0, "kmmlu_electronics_engineering": 0, "kmmlu_energy_management": 0, "kmmlu_environmental_science": 0, "kmmlu_fashion": 0, "kmmlu_food_processing": 0, "kmmlu_gas_technology_and_engineering": 0, "kmmlu_geomatics": 0, "kmmlu_health": 0, "kmmlu_industrial_engineer": 0, "kmmlu_information_technology": 0, "kmmlu_interior_architecture_and_design": 0, "kmmlu_korean_history": 0, "kmmlu_law": 0, "kmmlu_machine_design_and_manufacturing": 0, "kmmlu_management": 0, "kmmlu_maritime_engineering": 0, "kmmlu_marketing": 0, "kmmlu_materials_engineering": 0, "kmmlu_math": 0, "kmmlu_mechanical_engineering": 0, "kmmlu_nondestructive_testing": 0, "kmmlu_patent": 0, "kmmlu_political_science_and_sociology": 0, "kmmlu_psychology": 0, "kmmlu_public_safety": 0, "kmmlu_railway_and_automotive_engineering": 0, 
"kmmlu_real_estate": 0, "kmmlu_refrigerating_machinery": 0, "kmmlu_social_welfare": 0, "kmmlu_taxation": 0, "kmmlu_telecommunications_and_wireless_technology": 0, "kobest_boolq": 0, "kobest_copa": 0, "kobest_hellaswag": 0, "mmlu_abstract_algebra": 0, "mmlu_anatomy": 0, "mmlu_astronomy": 0, "mmlu_business_ethics": 0, "mmlu_clinical_knowledge": 0, "mmlu_college_biology": 0, "mmlu_college_chemistry": 0, "mmlu_college_computer_science": 0, "mmlu_college_mathematics": 0, "mmlu_college_medicine": 0, "mmlu_college_physics": 0, "mmlu_computer_security": 0, "mmlu_conceptual_physics": 0, "mmlu_econometrics": 0, "mmlu_electrical_engineering": 0, "mmlu_elementary_mathematics": 0, "mmlu_formal_logic": 0, "mmlu_global_facts": 0, "mmlu_high_school_biology": 0, "mmlu_high_school_chemistry": 0, "mmlu_high_school_computer_science": 0, "mmlu_high_school_european_history": 0, "mmlu_high_school_geography": 0, "mmlu_high_school_government_and_politics": 0, "mmlu_high_school_macroeconomics": 0, "mmlu_high_school_mathematics": 0, "mmlu_high_school_microeconomics": 0, "mmlu_high_school_physics": 0, "mmlu_high_school_psychology": 0, "mmlu_high_school_statistics": 0, "mmlu_high_school_us_history": 0, "mmlu_high_school_world_history": 0, "mmlu_human_aging": 0, "mmlu_human_sexuality": 0, "mmlu_international_law": 0, "mmlu_jurisprudence": 0, "mmlu_logical_fallacies": 0, "mmlu_machine_learning": 0, "mmlu_management": 0, "mmlu_marketing": 0, "mmlu_medical_genetics": 0, "mmlu_miscellaneous": 0, "mmlu_moral_disputes": 0, "mmlu_moral_scenarios": 0, "mmlu_nutrition": 0, "mmlu_philosophy": 0, "mmlu_prehistory": 0, "mmlu_professional_accounting": 0, "mmlu_professional_law": 0, "mmlu_professional_medicine": 0, "mmlu_professional_psychology": 0, "mmlu_public_relations": 0, "mmlu_security_studies": 0, "mmlu_sociology": 0, "mmlu_us_foreign_policy": 0, "mmlu_virology": 0, "mmlu_world_religions": 0, "winogrande": 0 }, "higher_is_better": { "arc_challenge": { "acc": true, "acc_norm": true }, "arc_easy": { "acc": true, "acc_norm": true }, "hellaswag": { "acc": true, "acc_norm": true }, "kmmlu": { "acc": true }, "kmmlu_accounting": { "acc": true }, "kmmlu_agricultural_sciences": { "acc": true }, "kmmlu_applied_science": { "acc": true }, "kmmlu_aviation_engineering_and_maintenance": { "acc": true }, "kmmlu_biology": { "acc": true }, "kmmlu_chemical_engineering": { "acc": true }, "kmmlu_chemistry": { "acc": true }, "kmmlu_civil_engineering": { "acc": true }, "kmmlu_computer_science": { "acc": true }, "kmmlu_construction": { "acc": true }, "kmmlu_criminal_law": { "acc": true }, "kmmlu_ecology": { "acc": true }, "kmmlu_economics": { "acc": true }, "kmmlu_education": { "acc": true }, "kmmlu_electrical_engineering": { "acc": true }, "kmmlu_electronics_engineering": { "acc": true }, "kmmlu_energy_management": { "acc": true }, "kmmlu_environmental_science": { "acc": true }, "kmmlu_fashion": { "acc": true }, "kmmlu_food_processing": { "acc": true }, "kmmlu_gas_technology_and_engineering": { "acc": true }, "kmmlu_geomatics": { "acc": true }, "kmmlu_health": { "acc": true }, "kmmlu_humss": { "acc": true }, "kmmlu_industrial_engineer": { "acc": true }, "kmmlu_information_technology": { "acc": true }, "kmmlu_interior_architecture_and_design": { "acc": true }, "kmmlu_korean_history": { "acc": true }, "kmmlu_law": { "acc": true }, "kmmlu_machine_design_and_manufacturing": { "acc": true }, "kmmlu_management": { "acc": true }, "kmmlu_maritime_engineering": { "acc": true }, "kmmlu_marketing": { "acc": true }, "kmmlu_materials_engineering": { "acc": true 
}, "kmmlu_math": { "acc": true }, "kmmlu_mechanical_engineering": { "acc": true }, "kmmlu_nondestructive_testing": { "acc": true }, "kmmlu_other": { "acc": true }, "kmmlu_patent": { "acc": true }, "kmmlu_political_science_and_sociology": { "acc": true }, "kmmlu_psychology": { "acc": true }, "kmmlu_public_safety": { "acc": true }, "kmmlu_railway_and_automotive_engineering": { "acc": true }, "kmmlu_real_estate": { "acc": true }, "kmmlu_refrigerating_machinery": { "acc": true }, "kmmlu_social_welfare": { "acc": true }, "kmmlu_stem": { "acc": true }, "kmmlu_taxation": { "acc": true }, "kmmlu_telecommunications_and_wireless_technology": { "acc": true }, "kobest_boolq": { "acc": true, "f1": true }, "kobest_copa": { "acc": true, "f1": true }, "kobest_hellaswag": { "acc": true, "acc_norm": true, "f1": true }, "mmlu": { "acc": true }, "mmlu_abstract_algebra": { "acc": true }, "mmlu_anatomy": { "acc": true }, "mmlu_astronomy": { "acc": true }, "mmlu_business_ethics": { "acc": true }, "mmlu_clinical_knowledge": { "acc": true }, "mmlu_college_biology": { "acc": true }, "mmlu_college_chemistry": { "acc": true }, "mmlu_college_computer_science": { "acc": true }, "mmlu_college_mathematics": { "acc": true }, "mmlu_college_medicine": { "acc": true }, "mmlu_college_physics": { "acc": true }, "mmlu_computer_security": { "acc": true }, "mmlu_conceptual_physics": { "acc": true }, "mmlu_econometrics": { "acc": true }, "mmlu_electrical_engineering": { "acc": true }, "mmlu_elementary_mathematics": { "acc": true }, "mmlu_formal_logic": { "acc": true }, "mmlu_global_facts": { "acc": true }, "mmlu_high_school_biology": { "acc": true }, "mmlu_high_school_chemistry": { "acc": true }, "mmlu_high_school_computer_science": { "acc": true }, "mmlu_high_school_european_history": { "acc": true }, "mmlu_high_school_geography": { "acc": true }, "mmlu_high_school_government_and_politics": { "acc": true }, "mmlu_high_school_macroeconomics": { "acc": true }, "mmlu_high_school_mathematics": { "acc": true }, "mmlu_high_school_microeconomics": { "acc": true }, "mmlu_high_school_physics": { "acc": true }, "mmlu_high_school_psychology": { "acc": true }, "mmlu_high_school_statistics": { "acc": true }, "mmlu_high_school_us_history": { "acc": true }, "mmlu_high_school_world_history": { "acc": true }, "mmlu_human_aging": { "acc": true }, "mmlu_human_sexuality": { "acc": true }, "mmlu_humanities": { "acc": true }, "mmlu_international_law": { "acc": true }, "mmlu_jurisprudence": { "acc": true }, "mmlu_logical_fallacies": { "acc": true }, "mmlu_machine_learning": { "acc": true }, "mmlu_management": { "acc": true }, "mmlu_marketing": { "acc": true }, "mmlu_medical_genetics": { "acc": true }, "mmlu_miscellaneous": { "acc": true }, "mmlu_moral_disputes": { "acc": true }, "mmlu_moral_scenarios": { "acc": true }, "mmlu_nutrition": { "acc": true }, "mmlu_other": { "acc": true }, "mmlu_philosophy": { "acc": true }, "mmlu_prehistory": { "acc": true }, "mmlu_professional_accounting": { "acc": true }, "mmlu_professional_law": { "acc": true }, "mmlu_professional_medicine": { "acc": true }, "mmlu_professional_psychology": { "acc": true }, "mmlu_public_relations": { "acc": true }, "mmlu_security_studies": { "acc": true }, "mmlu_social_sciences": { "acc": true }, "mmlu_sociology": { "acc": true }, "mmlu_stem": { "acc": true }, "mmlu_us_foreign_policy": { "acc": true }, "mmlu_virology": { "acc": true }, "mmlu_world_religions": { "acc": true }, "winogrande": { "acc": true } }, "n-samples": { "kobest_hellaswag": { "original": 500, "effective": 400 }, 
"kobest_copa": { "original": 1000, "effective": 400 }, "kobest_boolq": { "original": 1404, "effective": 400 }, "kmmlu_biology": { "original": 1000, "effective": 400 }, "kmmlu_chemical_engineering": { "original": 1000, "effective": 400 }, "kmmlu_chemistry": { "original": 600, "effective": 400 }, "kmmlu_civil_engineering": { "original": 1000, "effective": 400 }, "kmmlu_computer_science": { "original": 1000, "effective": 400 }, "kmmlu_ecology": { "original": 1000, "effective": 400 }, "kmmlu_electrical_engineering": { "original": 1000, "effective": 400 }, "kmmlu_information_technology": { "original": 1000, "effective": 400 }, "kmmlu_materials_engineering": { "original": 1000, "effective": 400 }, "kmmlu_math": { "original": 300, "effective": 300 }, "kmmlu_mechanical_engineering": { "original": 1000, "effective": 400 }, "kmmlu_agricultural_sciences": { "original": 1000, "effective": 400 }, "kmmlu_construction": { "original": 1000, "effective": 400 }, "kmmlu_fashion": { "original": 1000, "effective": 400 }, "kmmlu_food_processing": { "original": 1000, "effective": 400 }, "kmmlu_health": { "original": 100, "effective": 100 }, "kmmlu_interior_architecture_and_design": { "original": 1000, "effective": 400 }, "kmmlu_marketing": { "original": 1000, "effective": 400 }, "kmmlu_patent": { "original": 100, "effective": 100 }, "kmmlu_public_safety": { "original": 1000, "effective": 400 }, "kmmlu_real_estate": { "original": 200, "effective": 200 }, "kmmlu_refrigerating_machinery": { "original": 1000, "effective": 400 }, "kmmlu_aviation_engineering_and_maintenance": { "original": 1000, "effective": 400 }, "kmmlu_electronics_engineering": { "original": 1000, "effective": 400 }, "kmmlu_energy_management": { "original": 1000, "effective": 400 }, "kmmlu_environmental_science": { "original": 1000, "effective": 400 }, "kmmlu_gas_technology_and_engineering": { "original": 1000, "effective": 400 }, "kmmlu_geomatics": { "original": 1000, "effective": 400 }, "kmmlu_industrial_engineer": { "original": 1000, "effective": 400 }, "kmmlu_machine_design_and_manufacturing": { "original": 1000, "effective": 400 }, "kmmlu_maritime_engineering": { "original": 600, "effective": 400 }, "kmmlu_nondestructive_testing": { "original": 1000, "effective": 400 }, "kmmlu_railway_and_automotive_engineering": { "original": 1000, "effective": 400 }, "kmmlu_telecommunications_and_wireless_technology": { "original": 1000, "effective": 400 }, "kmmlu_accounting": { "original": 100, "effective": 100 }, "kmmlu_criminal_law": { "original": 200, "effective": 200 }, "kmmlu_economics": { "original": 130, "effective": 130 }, "kmmlu_education": { "original": 100, "effective": 100 }, "kmmlu_korean_history": { "original": 100, "effective": 100 }, "kmmlu_law": { "original": 1000, "effective": 400 }, "kmmlu_management": { "original": 1000, "effective": 400 }, "kmmlu_political_science_and_sociology": { "original": 300, "effective": 300 }, "kmmlu_psychology": { "original": 1000, "effective": 400 }, "kmmlu_social_welfare": { "original": 1000, "effective": 400 }, "kmmlu_taxation": { "original": 200, "effective": 200 }, "winogrande": { "original": 1267, "effective": 400 }, "arc_challenge": { "original": 1172, "effective": 400 }, "arc_easy": { "original": 2376, "effective": 400 }, "hellaswag": { "original": 10042, "effective": 400 }, "mmlu_abstract_algebra": { "original": 100, "effective": 100 }, "mmlu_anatomy": { "original": 135, "effective": 135 }, "mmlu_astronomy": { "original": 152, "effective": 152 }, "mmlu_college_biology": { "original": 144, "effective": 
144 }, "mmlu_college_chemistry": { "original": 100, "effective": 100 }, "mmlu_college_computer_science": { "original": 100, "effective": 100 }, "mmlu_college_mathematics": { "original": 100, "effective": 100 }, "mmlu_college_physics": { "original": 102, "effective": 102 }, "mmlu_computer_security": { "original": 100, "effective": 100 }, "mmlu_conceptual_physics": { "original": 235, "effective": 235 }, "mmlu_electrical_engineering": { "original": 145, "effective": 145 }, "mmlu_elementary_mathematics": { "original": 378, "effective": 378 }, "mmlu_high_school_biology": { "original": 310, "effective": 310 }, "mmlu_high_school_chemistry": { "original": 203, "effective": 203 }, "mmlu_high_school_computer_science": { "original": 100, "effective": 100 }, "mmlu_high_school_mathematics": { "original": 270, "effective": 270 }, "mmlu_high_school_physics": { "original": 151, "effective": 151 }, "mmlu_high_school_statistics": { "original": 216, "effective": 216 }, "mmlu_machine_learning": { "original": 112, "effective": 112 }, "mmlu_business_ethics": { "original": 100, "effective": 100 }, "mmlu_clinical_knowledge": { "original": 265, "effective": 265 }, "mmlu_college_medicine": { "original": 173, "effective": 173 }, "mmlu_global_facts": { "original": 100, "effective": 100 }, "mmlu_human_aging": { "original": 223, "effective": 223 }, "mmlu_management": { "original": 103, "effective": 103 }, "mmlu_marketing": { "original": 234, "effective": 234 }, "mmlu_medical_genetics": { "original": 100, "effective": 100 }, "mmlu_miscellaneous": { "original": 783, "effective": 400 }, "mmlu_nutrition": { "original": 306, "effective": 306 }, "mmlu_professional_accounting": { "original": 282, "effective": 282 }, "mmlu_professional_medicine": { "original": 272, "effective": 272 }, "mmlu_virology": { "original": 166, "effective": 166 }, "mmlu_econometrics": { "original": 114, "effective": 114 }, "mmlu_high_school_geography": { "original": 198, "effective": 198 }, "mmlu_high_school_government_and_politics": { "original": 193, "effective": 193 }, "mmlu_high_school_macroeconomics": { "original": 390, "effective": 390 }, "mmlu_high_school_microeconomics": { "original": 238, "effective": 238 }, "mmlu_high_school_psychology": { "original": 545, "effective": 400 }, "mmlu_human_sexuality": { "original": 131, "effective": 131 }, "mmlu_professional_psychology": { "original": 612, "effective": 400 }, "mmlu_public_relations": { "original": 110, "effective": 110 }, "mmlu_security_studies": { "original": 245, "effective": 245 }, "mmlu_sociology": { "original": 201, "effective": 201 }, "mmlu_us_foreign_policy": { "original": 100, "effective": 100 }, "mmlu_formal_logic": { "original": 126, "effective": 126 }, "mmlu_high_school_european_history": { "original": 165, "effective": 165 }, "mmlu_high_school_us_history": { "original": 204, "effective": 204 }, "mmlu_high_school_world_history": { "original": 237, "effective": 237 }, "mmlu_international_law": { "original": 121, "effective": 121 }, "mmlu_jurisprudence": { "original": 108, "effective": 108 }, "mmlu_logical_fallacies": { "original": 163, "effective": 163 }, "mmlu_moral_disputes": { "original": 346, "effective": 346 }, "mmlu_moral_scenarios": { "original": 895, "effective": 400 }, "mmlu_philosophy": { "original": 311, "effective": 311 }, "mmlu_prehistory": { "original": 324, "effective": 324 }, "mmlu_professional_law": { "original": 1534, "effective": 400 }, "mmlu_world_religions": { "original": 171, "effective": 171 } }, "config": { "model": "hf", "model_args": { "pretrained": 
"unsloth/Qwen3-4B-Base", "trust_remote_code": true }, "model_num_parameters": 4022468096, "model_dtype": "torch.bfloat16", "model_revision": "main", "model_sha": "0573b584bc6b32adc84bb9c91bf9b71bea71fc40", "batch_size": "12", "batch_sizes": [], "device": "cuda:0", "use_cache": null, "limit": 400.0, "bootstrap_iters": 100000, "gen_kwargs": {}, "random_seed": 0, "numpy_seed": 1234, "torch_seed": 1234, "fewshot_seed": 1234 }, "git_hash": "0ce43af", "date": 1775962695.520946, "pretty_env_info": "PyTorch version: 2.9.0+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 4.1.0\nLibc version: glibc-2.35\n\nPython version: 3.11.14 | packaged by conda-forge | (main, Oct 13 2025, 14:09:32) [GCC 14.3.0] (64-bit runtime)\nPython platform: Linux-6.8.0-90-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: GPU 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition\nNvidia driver version: 590.48.01\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_precompiled.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_runtime_compiled.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_graph.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_heuristic.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops.so.9.8.0\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 192\nOn-line CPU(s) list: 0-191\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7642 48-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 2\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2300.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4600.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sev sev_es ibpb_exit_to_user\nVirtualization: AMD-V\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47,96-143\nNUMA node1 CPU(s): 48-95,144-191\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not 
affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT enabled with STIBP protection\nVulnerability Spec rstack overflow: Mitigation; Safe RET\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nVulnerability Vmscape: Mitigation; IBPB before exit to userspace\n\nVersions of relevant libraries:\n[pip3] executorch==1.0.1\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.17.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] optree==0.17.0\n[pip3] pytorch_tokenizers==1.0.1\n[pip3] torch==2.9.0+cu128\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torch-stoi==0.2.3\n[pip3] torchao==0.14.0\n[pip3] torchaudio==2.9.0+cu128\n[pip3] torchcodec==0.9.1\n[pip3] torchelastic==0.2.2\n[pip3] torchvision==0.24.0+cu128\n[pip3] triton==3.5.0\n[pip3] triton_kernels==1.0.0\n[conda] No relevant packages", "transformers_version": "5.5.3", "lm_eval_version": "0.4.11", "upper_git_hash": null, "tokenizer_pad_token": [ "<|vision_pad|>", "151654" ], "tokenizer_eos_token": [ "<|endoftext|>", "151643" ], "tokenizer_bos_token": [ null, "None" ], "eot_token_id": 151643, "max_length": 32768, "task_hashes": {}, "model_source": "hf", "model_name": "unsloth/Qwen3-4B-Base", "model_name_sanitized": "unsloth__Qwen3-4B-Base", "system_instruction": null, "system_instruction_sha": null, "fewshot_as_multiturn": null, "chat_template": null, "chat_template_sha": null, "total_evaluation_time_seconds": "573.7631184216589" }
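

Every MMLU subtask above shares the same Jinja2 doc_to_text template and letter-choice scoring. Below is a minimal sketch of how one prompt/continuation pair is formed; the template string, doc_to_choice, doc_to_target, and target_delimiter are copied from the configs above, while the example question dict is invented for illustration.

from jinja2 import Template  # lm-eval renders doc_to_text templates with Jinja2

# Template copied verbatim from the MMLU task configs above.
DOC_TO_TEXT = Template(
    "{{question.strip()}}\n"
    "A. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\n"
    "Answer:"
)
CHOICES = ["A", "B", "C", "D"]  # doc_to_choice
TARGET_DELIMITER = " "          # target_delimiter

# Hypothetical document, shaped like a cais/mmlu row.
doc = {
    "question": " What is 2 + 2? ",
    "choices": ["3", "4", "5", "6"],
    "answer": 1,  # doc_to_target: an index into CHOICES
}

context = DOC_TO_TEXT.render(**doc)
# One scored continuation per choice; the gold one is CHOICES[doc["answer"]].
continuations = [TARGET_DELIMITER + c for c in CHOICES]
print(context)                             # question plus lettered options, ending "Answer:"
print(repr(continuations[doc["answer"]]))  # ' B'

Since num_fewshot is 0 for every task in this run, only the per-subject description string precedes the question; the fewshot_delimiter "\n\n" would only come into play between in-context examples in a few-shot run.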
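

The winogrande entry stores its preprocessing as Python source rather than a Jinja template. The three functions below are copied verbatim from that config; the example row is invented to show what they produce on a winogrande_xl-style document.

# Functions verbatim from the winogrande config above.
def doc_to_text(doc):
    answer_to_num = {"1": 0, "2": 1}
    return answer_to_num[doc["answer"]]

def doc_to_target(doc):
    idx = doc["sentence"].index("_") + 1
    return doc["sentence"][idx:].strip()

def doc_to_choice(doc):
    idx = doc["sentence"].index("_")
    options = [doc["option1"], doc["option2"]]
    return [doc["sentence"][:idx] + opt for opt in options]

# Hypothetical winogrande_xl-style row.
doc = {
    "sentence": "The trophy does not fit in the suitcase because the _ is too big.",
    "option1": "trophy",
    "option2": "suitcase",
    "answer": "1",
}

print(doc_to_choice(doc))  # two contexts, one per option filled into the blank
print(doc_to_target(doc))  # shared continuation after the blank: 'is too big.'
print(doc_to_text(doc))    # gold option index: 0

So each option becomes its own context and the common tail after "_" is the scored continuation; doc_to_text here returns the gold index (0 or 1) rather than a prompt. Note that winogrande is the one task above with should_decontaminate set to true, using the raw sentence field as the decontamination query.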
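

The "n-samples" block explains the magnitude of the error bars: with limit set to 400, each task's effective size is min(original, 400), and the reported acc_stderr values are consistent with the standard error of a mean over that many 0/1 outcomes. A sanity-check sketch under that assumption (the formula is the ordinary sample standard error of the mean; the accuracy value below is made up):

import math

def acc_stderr(p: float, n: int) -> float:
    """Standard error of the mean of n Bernoulli(p) outcomes,
    using the unbiased sample variance (divide by n - 1)."""
    return math.sqrt(p * (1.0 - p) / (n - 1))

# e.g. a hypothetical accuracy of 0.45 over an effective 400 samples
print(round(acc_stderr(0.45, 400), 4))  # ~0.0249

The bootstrap_iters value of 100000 in the config presumably applies to metrics without a closed-form stderr, such as the f1 reported for the kobest tasks; plain acc uses the mean stderr directly.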
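

The trailing "config" block is enough to re-run this evaluation. A sketch using lm-eval 0.4.x's Python entry point; the task list here is an assumption (abridged; group names such as "mmlu" and "kmmlu" expand to the subtasks above), and the seed keyword values match those recorded in the config, which are the harness defaults.

# Sketch of reproducing this run; substitute the tasks you actually need.
from lm_eval import simple_evaluate

results = simple_evaluate(
    model="hf",
    model_args="pretrained=unsloth/Qwen3-4B-Base,trust_remote_code=True",
    tasks=["arc_easy", "arc_challenge", "hellaswag", "winogrande",
           "mmlu", "kmmlu", "kobest_boolq", "kobest_copa", "kobest_hellaswag"],
    num_fewshot=0,
    batch_size=12,
    device="cuda:0",
    limit=400,  # caps each task at 400 docs, giving the "effective" counts above
    random_seed=0,
    numpy_random_seed=1234,
    torch_random_seed=1234,
    fewshot_random_seed=1234,
)
print(results["results"]["arc_easy"])

The lm_eval command-line tool exposes the same knobs via --model, --model_args, --tasks, --num_fewshot, --batch_size, --device and --limit.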