commit 67c081bc2ec6615ed6214a3e1d0e2e8bba2ba585
Author: ModelHub XC
Date:   Fri May 1 18:43:25 2026 +0800

    Initialize the project; model provided by the ModelHub XC community
    Model: alwaysgood/QWEN3-4B-CPT
    Source: Original Platform

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..4769e9f
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,38 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-1477/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3f53990
--- /dev/null
+++ b/README.md
@@ -0,0 +1,59 @@
+---
+base_model: unsloth/Qwen3-4B-Base
+library_name: transformers
+model_name: checkpoints
+tags:
+- generated_from_trainer
+- sft
+- unsloth
+- trl
+license: license
+---
+
+# Model Card for checkpoints
+
+This model is a fine-tuned version of [unsloth/Qwen3-4B-Base](https://huggingface.co/unsloth/Qwen3-4B-Base).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+# The generated card left the model id as "None"; "alwaysgood/QWEN3-4B-CPT" is the repo id recorded in this commit.
+# Note: this is a CPT/base checkpoint, so the chat-style input below assumes a chat template is available.
+generator = pipeline("text-generation", model="alwaysgood/QWEN3-4B-CPT", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+[Visualize in Weights & Biases](https://wandb.ai/hiloong/mono-cpt/runs/sxp4zkdr)
+
+
+This model was trained with SFT.
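+
+Since the training objective is a per-token cross-entropy, the final `eval_loss` of ~1.70 recorded in `all_results.json` corresponds to a held-out perplexity of roughly exp(1.70) ≈ 5.5. A minimal sketch of that conversion, using the value from this repo's `all_results.json`:
+
+```python
+import math
+
+eval_loss = 1.7002116441726685  # final "eval_loss" from all_results.json
+perplexity = math.exp(eval_loss)  # per-token perplexity on the eval split
+print(round(perplexity, 2))  # ~5.48
+```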
+
+### Framework versions
+
+- TRL: 0.24.0
+- Transformers: 5.5.3
+- PyTorch: 2.9.0+cu128
+- Datasets: 4.3.0
+- Tokenizers: 0.22.2
+
+## Citations
+
+
+
+Cite TRL as:
+
+```bibtex
+@misc{vonwerra2022trl,
+    title = {{TRL: Transformer Reinforcement Learning}},
+    author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+    year = 2020,
+    journal = {GitHub repository},
+    publisher = {GitHub},
+    howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```
\ No newline at end of file
diff --git a/all_results.json b/all_results.json
new file mode 100644
index 0000000..409268f
--- /dev/null
+++ b/all_results.json
@@ -0,0 +1,12 @@
+{
+    "epoch": 1.0,
+    "eval_loss": 1.7002116441726685,
+    "eval_runtime": 173.1669,
+    "eval_samples_per_second": 5.526,
+    "eval_steps_per_second": 0.693,
+    "total_flos": 2.103177196962902e+18,
+    "train_loss": 1.7256613558986822,
+    "train_runtime": 29239.084,
+    "train_samples_per_second": 1.616,
+    "train_steps_per_second": 0.051
+}
\ No newline at end of file
diff --git a/checkpoint-1477/config.json b/checkpoint-1477/config.json
new file mode 100644
index 0000000..cbfe8ec
--- /dev/null
+++ b/checkpoint-1477/config.json
@@ -0,0 +1,74 @@
+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "bfloat16",
+  "eos_token_id": 151643,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 36,
+  "model_name": "unsloth/Qwen3-4B-Base",
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151669,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.5.3",
+  "unsloth_fixed": true,
+  "unsloth_version": "2026.4.4",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
diff --git a/checkpoint-1477/generation_config.json b/checkpoint-1477/generation_config.json
new file mode 100644
index 0000000..43f602b
--- /dev/null
+++ b/checkpoint-1477/generation_config.json
@@ -0,0 +1,9 @@
+{
+  "eos_token_id": [
+    151643
+  ],
+  "max_length": 32768,
+  "max_new_tokens": 2048,
+  "pad_token_id": 151669,
+  "transformers_version": "5.5.3"
+}
diff --git a/checkpoint-1477/model.safetensors b/checkpoint-1477/model.safetensors
new file mode 100644
index 0000000..3e5ceb1
--- /dev/null
+++ b/checkpoint-1477/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid 
sha256:a3a4a2abfa757af075d0fa804b2093ad46c4d9bc4a227a070907a885eea69e97 +size 8044982080 diff --git a/checkpoint-1477/optimizer.pt b/checkpoint-1477/optimizer.pt new file mode 100644 index 0000000..0469685 --- /dev/null +++ b/checkpoint-1477/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa38e6bbd0738ba63086bd0369ac5ef53d96f94c1d3bf63cd8286ec8d324f12e +size 14534393422 diff --git a/checkpoint-1477/rng_state.pth b/checkpoint-1477/rng_state.pth new file mode 100644 index 0000000..435e005 --- /dev/null +++ b/checkpoint-1477/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8e2011629d8bed3ef560fa11175cac55684c4e12a72634bb24abf767b6c7399 +size 14645 diff --git a/checkpoint-1477/scheduler.pt b/checkpoint-1477/scheduler.pt new file mode 100644 index 0000000..5f84341 --- /dev/null +++ b/checkpoint-1477/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41b1610c26267a8ba984a88363b2c05d3a4b232638c92e564e21a007ac9f4fc1 +size 1465 diff --git a/checkpoint-1477/tokenizer.json b/checkpoint-1477/tokenizer.json new file mode 100644 index 0000000..73037fe --- /dev/null +++ b/checkpoint-1477/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45c4ffda6666cf6d75d0b1f961f25964e2a52a62e78aaecb2f458e9ba9824112 +size 11422840 diff --git a/checkpoint-1477/tokenizer_config.json b/checkpoint-1477/tokenizer_config.json new file mode 100644 index 0000000..d450540 --- /dev/null +++ b/checkpoint-1477/tokenizer_config.json @@ -0,0 +1,15 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|PAD_TOKEN|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/checkpoint-1477/trainer_state.json b/checkpoint-1477/trainer_state.json new file mode 100644 index 0000000..a7c293f --- /dev/null +++ b/checkpoint-1477/trainer_state.json @@ -0,0 +1,1087 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1477, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006771626883358727, + "grad_norm": 1.5234375, + "learning_rate": 6.081081081081082e-07, + "loss": 1.8358331680297852, + "step": 10 + }, + { + "epoch": 0.013543253766717453, + "grad_norm": 1.5078125, + "learning_rate": 1.2837837837837838e-06, + "loss": 1.840726089477539, + "step": 20 + }, + { + "epoch": 0.02031488065007618, + "grad_norm": 1.0859375, + "learning_rate": 1.9594594594594595e-06, + "loss": 1.8267410278320313, + "step": 30 + }, + { + "epoch": 0.027086507533434907, + "grad_norm": 1.1640625, + "learning_rate": 2.6351351351351353e-06, + "loss": 1.8383310317993165, + "step": 40 + }, + { + "epoch": 0.03385813441679363, + "grad_norm": 1.0859375, + "learning_rate": 3.310810810810811e-06, + "loss": 1.8384885787963867, + "step": 50 + }, + { + "epoch": 0.04062976130015236, + "grad_norm": 1.03125, + "learning_rate": 3.986486486486487e-06, + "loss": 1.8087802886962892, + "step": 60 + }, + { + "epoch": 0.04740138818351109, + "grad_norm": 1.015625, + "learning_rate": 4.6621621621621625e-06, + "loss": 1.8259227752685547, + "step": 70 + }, + { + "epoch": 0.05417301506686981, + "grad_norm": 1.046875, + 
"learning_rate": 5.337837837837838e-06, + "loss": 1.8241001129150392, + "step": 80 + }, + { + "epoch": 0.06094464195022854, + "grad_norm": 0.96484375, + "learning_rate": 6.013513513513514e-06, + "loss": 1.82220516204834, + "step": 90 + }, + { + "epoch": 0.06771626883358726, + "grad_norm": 0.953125, + "learning_rate": 6.689189189189191e-06, + "loss": 1.7921783447265625, + "step": 100 + }, + { + "epoch": 0.074487895716946, + "grad_norm": 0.9296875, + "learning_rate": 7.3648648648648655e-06, + "loss": 1.797548484802246, + "step": 110 + }, + { + "epoch": 0.08125952260030472, + "grad_norm": 0.89453125, + "learning_rate": 8.040540540540541e-06, + "loss": 1.7889528274536133, + "step": 120 + }, + { + "epoch": 0.08803114948366345, + "grad_norm": 0.90234375, + "learning_rate": 8.716216216216217e-06, + "loss": 1.7663179397583009, + "step": 130 + }, + { + "epoch": 0.09480277636702218, + "grad_norm": 0.89453125, + "learning_rate": 9.391891891891893e-06, + "loss": 1.7635225296020507, + "step": 140 + }, + { + "epoch": 0.1015744032503809, + "grad_norm": 0.91015625, + "learning_rate": 9.999986030219255e-06, + "loss": 1.7774492263793946, + "step": 150 + }, + { + "epoch": 0.10834603013373963, + "grad_norm": 0.91796875, + "learning_rate": 9.998309750982693e-06, + "loss": 1.7622718811035156, + "step": 160 + }, + { + "epoch": 0.11511765701709836, + "grad_norm": 0.890625, + "learning_rate": 9.993840588849743e-06, + "loss": 1.7750001907348634, + "step": 170 + }, + { + "epoch": 0.12188928390045708, + "grad_norm": 0.890625, + "learning_rate": 9.986581041033881e-06, + "loss": 1.767216110229492, + "step": 180 + }, + { + "epoch": 0.1286609107838158, + "grad_norm": 0.921875, + "learning_rate": 9.976535163919757e-06, + "loss": 1.7609657287597655, + "step": 190 + }, + { + "epoch": 0.13543253766717453, + "grad_norm": 0.87109375, + "learning_rate": 9.96370857079661e-06, + "loss": 1.7535722732543946, + "step": 200 + }, + { + "epoch": 0.14220416455053325, + "grad_norm": 0.86328125, + "learning_rate": 9.948108428721782e-06, + "loss": 1.7395360946655274, + "step": 210 + }, + { + "epoch": 0.148975791433892, + "grad_norm": 0.88671875, + "learning_rate": 9.92974345451598e-06, + "loss": 1.7465991973876953, + "step": 220 + }, + { + "epoch": 0.15574741831725072, + "grad_norm": 0.87890625, + "learning_rate": 9.908623909892651e-06, + "loss": 1.7506902694702149, + "step": 230 + }, + { + "epoch": 0.16251904520060945, + "grad_norm": 0.8984375, + "learning_rate": 9.884761595724068e-06, + "loss": 1.7368896484375, + "step": 240 + }, + { + "epoch": 0.16929067208396817, + "grad_norm": 0.8671875, + "learning_rate": 9.858169845447417e-06, + "loss": 1.7515613555908203, + "step": 250 + }, + { + "epoch": 0.1760622989673269, + "grad_norm": 0.85546875, + "learning_rate": 9.828863517614533e-06, + "loss": 1.7509956359863281, + "step": 260 + }, + { + "epoch": 0.1828339258506856, + "grad_norm": 0.9140625, + "learning_rate": 9.796858987589462e-06, + "loss": 1.753628921508789, + "step": 270 + }, + { + "epoch": 0.18960555273404436, + "grad_norm": 0.85546875, + "learning_rate": 9.762174138398456e-06, + "loss": 1.7379936218261718, + "step": 280 + }, + { + "epoch": 0.19637717961740309, + "grad_norm": 0.88671875, + "learning_rate": 9.724828350737574e-06, + "loss": 1.7442964553833007, + "step": 290 + }, + { + "epoch": 0.2031488065007618, + "grad_norm": 0.87109375, + "learning_rate": 9.684842492143399e-06, + "loss": 1.7366142272949219, + "step": 300 + }, + { + "epoch": 0.20992043338412053, + "grad_norm": 0.84765625, + "learning_rate": 9.642238905333e-06, + 
"loss": 1.7396051406860351, + "step": 310 + }, + { + "epoch": 0.21669206026747925, + "grad_norm": 0.87109375, + "learning_rate": 9.597041395719573e-06, + "loss": 1.732611083984375, + "step": 320 + }, + { + "epoch": 0.22346368715083798, + "grad_norm": 0.8828125, + "learning_rate": 9.549275218110818e-06, + "loss": 1.7453182220458985, + "step": 330 + }, + { + "epoch": 0.23023531403419673, + "grad_norm": 0.875, + "learning_rate": 9.498967062597403e-06, + "loss": 1.7297761917114258, + "step": 340 + }, + { + "epoch": 0.23700694091755545, + "grad_norm": 0.875, + "learning_rate": 9.446145039639486e-06, + "loss": 1.728118324279785, + "step": 350 + }, + { + "epoch": 0.24377856780091417, + "grad_norm": 0.890625, + "learning_rate": 9.390838664359539e-06, + "loss": 1.7387624740600587, + "step": 360 + }, + { + "epoch": 0.2505501946842729, + "grad_norm": 0.85546875, + "learning_rate": 9.333078840050331e-06, + "loss": 1.7364713668823242, + "step": 370 + }, + { + "epoch": 0.2573218215676316, + "grad_norm": 0.8828125, + "learning_rate": 9.27289784090723e-06, + "loss": 1.7236080169677734, + "step": 380 + }, + { + "epoch": 0.26409344845099036, + "grad_norm": 0.890625, + "learning_rate": 9.210329293994495e-06, + "loss": 1.7224924087524414, + "step": 390 + }, + { + "epoch": 0.27086507533434906, + "grad_norm": 0.8671875, + "learning_rate": 9.145408160455642e-06, + "loss": 1.7099193572998046, + "step": 400 + }, + { + "epoch": 0.2776367022177078, + "grad_norm": 0.8515625, + "learning_rate": 9.078170715978353e-06, + "loss": 1.737176513671875, + "step": 410 + }, + { + "epoch": 0.2844083291010665, + "grad_norm": 0.9140625, + "learning_rate": 9.008654530524883e-06, + "loss": 1.73763427734375, + "step": 420 + }, + { + "epoch": 0.29117995598442525, + "grad_norm": 0.85546875, + "learning_rate": 8.936898447339257e-06, + "loss": 1.7290821075439453, + "step": 430 + }, + { + "epoch": 0.297951582867784, + "grad_norm": 0.8984375, + "learning_rate": 8.86294256124301e-06, + "loss": 1.7403568267822265, + "step": 440 + }, + { + "epoch": 0.3047232097511427, + "grad_norm": 0.859375, + "learning_rate": 8.786828196231584e-06, + "loss": 1.7217792510986327, + "step": 450 + }, + { + "epoch": 0.31149483663450145, + "grad_norm": 0.87109375, + "learning_rate": 8.708597882383908e-06, + "loss": 1.7103708267211915, + "step": 460 + }, + { + "epoch": 0.31826646351786014, + "grad_norm": 0.91796875, + "learning_rate": 8.62829533209805e-06, + "loss": 1.7208784103393555, + "step": 470 + }, + { + "epoch": 0.3250380904012189, + "grad_norm": 0.859375, + "learning_rate": 8.545965415666254e-06, + "loss": 1.7223230361938477, + "step": 480 + }, + { + "epoch": 0.33180971728457764, + "grad_norm": 0.8671875, + "learning_rate": 8.46165413620295e-06, + "loss": 1.719701385498047, + "step": 490 + }, + { + "epoch": 0.33858134416793634, + "grad_norm": 0.85546875, + "learning_rate": 8.375408603939827e-06, + "loss": 1.721092987060547, + "step": 500 + }, + { + "epoch": 0.33858134416793634, + "eval_loss": 1.7143864631652832, + "eval_runtime": 177.179, + "eval_samples_per_second": 5.401, + "eval_steps_per_second": 0.677, + "step": 500 + }, + { + "epoch": 0.3453529710512951, + "grad_norm": 0.859375, + "learning_rate": 8.287277009902237e-06, + "loss": 1.7325265884399415, + "step": 510 + }, + { + "epoch": 0.3521245979346538, + "grad_norm": 0.83984375, + "learning_rate": 8.197308598981731e-06, + "loss": 1.7298921585083007, + "step": 520 + }, + { + "epoch": 0.35889622481801253, + "grad_norm": 0.8828125, + "learning_rate": 8.105553642419708e-06, + "loss": 1.6982412338256836, 
+ "step": 530 + }, + { + "epoch": 0.3656678517013712, + "grad_norm": 0.91015625, + "learning_rate": 8.012063409717578e-06, + "loss": 1.7173789978027343, + "step": 540 + }, + { + "epoch": 0.37243947858473, + "grad_norm": 0.875, + "learning_rate": 7.916890139989147e-06, + "loss": 1.724541473388672, + "step": 550 + }, + { + "epoch": 0.3792111054680887, + "grad_norm": 0.859375, + "learning_rate": 7.820087012771184e-06, + "loss": 1.701674461364746, + "step": 560 + }, + { + "epoch": 0.3859827323514474, + "grad_norm": 0.85546875, + "learning_rate": 7.721708118308556e-06, + "loss": 1.7177881240844726, + "step": 570 + }, + { + "epoch": 0.39275435923480617, + "grad_norm": 0.87890625, + "learning_rate": 7.621808427330447e-06, + "loss": 1.6985021591186524, + "step": 580 + }, + { + "epoch": 0.39952598611816487, + "grad_norm": 0.87109375, + "learning_rate": 7.5204437603346224e-06, + "loss": 1.709127426147461, + "step": 590 + }, + { + "epoch": 0.4062976130015236, + "grad_norm": 0.88671875, + "learning_rate": 7.417670756396863e-06, + "loss": 1.7201419830322267, + "step": 600 + }, + { + "epoch": 0.41306923988488237, + "grad_norm": 0.8984375, + "learning_rate": 7.313546841522998e-06, + "loss": 1.7153247833251952, + "step": 610 + }, + { + "epoch": 0.41984086676824106, + "grad_norm": 0.875, + "learning_rate": 7.2081301965612435e-06, + "loss": 1.707881546020508, + "step": 620 + }, + { + "epoch": 0.4266124936515998, + "grad_norm": 0.87109375, + "learning_rate": 7.10147972469275e-06, + "loss": 1.7271339416503906, + "step": 630 + }, + { + "epoch": 0.4333841205349585, + "grad_norm": 1.3515625, + "learning_rate": 6.993655018518541e-06, + "loss": 1.7222976684570312, + "step": 640 + }, + { + "epoch": 0.44015574741831726, + "grad_norm": 0.85546875, + "learning_rate": 6.884716326761218e-06, + "loss": 1.7006675720214843, + "step": 650 + }, + { + "epoch": 0.44692737430167595, + "grad_norm": 0.87109375, + "learning_rate": 6.774724520600069e-06, + "loss": 1.6978439331054687, + "step": 660 + }, + { + "epoch": 0.4536990011850347, + "grad_norm": 0.87890625, + "learning_rate": 6.663741059658337e-06, + "loss": 1.7124168395996093, + "step": 670 + }, + { + "epoch": 0.46047062806839345, + "grad_norm": 0.87890625, + "learning_rate": 6.551827957661722e-06, + "loss": 1.7023361206054688, + "step": 680 + }, + { + "epoch": 0.46724225495175215, + "grad_norm": 0.86328125, + "learning_rate": 6.439047747787242e-06, + "loss": 1.700748825073242, + "step": 690 + }, + { + "epoch": 0.4740138818351109, + "grad_norm": 0.85546875, + "learning_rate": 6.325463447721852e-06, + "loss": 1.6977190017700194, + "step": 700 + }, + { + "epoch": 0.4807855087184696, + "grad_norm": 0.8984375, + "learning_rate": 6.211138524450347e-06, + "loss": 1.7250362396240235, + "step": 710 + }, + { + "epoch": 0.48755713560182834, + "grad_norm": 0.90234375, + "learning_rate": 6.096136858792193e-06, + "loss": 1.7249008178710938, + "step": 720 + }, + { + "epoch": 0.4943287624851871, + "grad_norm": 0.8671875, + "learning_rate": 5.980522709707132e-06, + "loss": 1.7153186798095703, + "step": 730 + }, + { + "epoch": 0.5011003893685458, + "grad_norm": 0.8828125, + "learning_rate": 5.864360678389497e-06, + "loss": 1.6841873168945312, + "step": 740 + }, + { + "epoch": 0.5078720162519045, + "grad_norm": 0.8515625, + "learning_rate": 5.747715672171295e-06, + "loss": 1.7151117324829102, + "step": 750 + }, + { + "epoch": 0.5146436431352632, + "grad_norm": 0.95703125, + "learning_rate": 5.630652868254229e-06, + "loss": 1.704267692565918, + "step": 760 + }, + { + "epoch": 
0.521415270018622, + "grad_norm": 0.88671875, + "learning_rate": 5.51323767729093e-06, + "loss": 1.7240329742431642, + "step": 770 + }, + { + "epoch": 0.5281868969019807, + "grad_norm": 0.87890625, + "learning_rate": 5.395535706835744e-06, + "loss": 1.7058921813964845, + "step": 780 + }, + { + "epoch": 0.5349585237853395, + "grad_norm": 0.8828125, + "learning_rate": 5.27761272468549e-06, + "loss": 1.6999113082885742, + "step": 790 + }, + { + "epoch": 0.5417301506686981, + "grad_norm": 0.9140625, + "learning_rate": 5.159534622130695e-06, + "loss": 1.7173538208007812, + "step": 800 + }, + { + "epoch": 0.5485017775520569, + "grad_norm": 0.85546875, + "learning_rate": 5.04136737713781e-06, + "loss": 1.706464958190918, + "step": 810 + }, + { + "epoch": 0.5552734044354156, + "grad_norm": 0.84765625, + "learning_rate": 4.923177017483002e-06, + "loss": 1.7123580932617188, + "step": 820 + }, + { + "epoch": 0.5620450313187744, + "grad_norm": 0.84765625, + "learning_rate": 4.805029583858115e-06, + "loss": 1.7076505661010741, + "step": 830 + }, + { + "epoch": 0.568816658202133, + "grad_norm": 0.87109375, + "learning_rate": 4.686991092969408e-06, + "loss": 1.7007432937622071, + "step": 840 + }, + { + "epoch": 0.5755882850854918, + "grad_norm": 0.83984375, + "learning_rate": 4.569127500649701e-06, + "loss": 1.7156892776489259, + "step": 850 + }, + { + "epoch": 0.5823599119688505, + "grad_norm": 0.85546875, + "learning_rate": 4.4515046650045316e-06, + "loss": 1.6989547729492187, + "step": 860 + }, + { + "epoch": 0.5891315388522093, + "grad_norm": 0.859375, + "learning_rate": 4.334188309612923e-06, + "loss": 1.701683235168457, + "step": 870 + }, + { + "epoch": 0.595903165735568, + "grad_norm": 0.875, + "learning_rate": 4.217243986803315e-06, + "loss": 1.7004409790039063, + "step": 880 + }, + { + "epoch": 0.6026747926189266, + "grad_norm": 0.88671875, + "learning_rate": 4.100737041025188e-06, + "loss": 1.727794075012207, + "step": 890 + }, + { + "epoch": 0.6094464195022854, + "grad_norm": 0.89453125, + "learning_rate": 3.984732572336837e-06, + "loss": 1.6976716995239258, + "step": 900 + }, + { + "epoch": 0.6162180463856441, + "grad_norm": 0.89453125, + "learning_rate": 3.869295400029714e-06, + "loss": 1.6927717208862305, + "step": 910 + }, + { + "epoch": 0.6229896732690029, + "grad_norm": 0.84375, + "learning_rate": 3.754490026409637e-06, + "loss": 1.6997186660766601, + "step": 920 + }, + { + "epoch": 0.6297613001523616, + "grad_norm": 0.93359375, + "learning_rate": 3.6403806007551373e-06, + "loss": 1.7196897506713866, + "step": 930 + }, + { + "epoch": 0.6365329270357203, + "grad_norm": 0.83203125, + "learning_rate": 3.527030883473055e-06, + "loss": 1.7054462432861328, + "step": 940 + }, + { + "epoch": 0.643304553919079, + "grad_norm": 0.890625, + "learning_rate": 3.414504210471421e-06, + "loss": 1.7200759887695312, + "step": 950 + }, + { + "epoch": 0.6500761808024378, + "grad_norm": 0.890625, + "learning_rate": 3.302863457769544e-06, + "loss": 1.6951274871826172, + "step": 960 + }, + { + "epoch": 0.6568478076857965, + "grad_norm": 0.90625, + "learning_rate": 3.192171006365061e-06, + "loss": 1.7151849746704102, + "step": 970 + }, + { + "epoch": 0.6636194345691553, + "grad_norm": 0.8984375, + "learning_rate": 3.0824887073775877e-06, + "loss": 1.713322067260742, + "step": 980 + }, + { + "epoch": 0.6703910614525139, + "grad_norm": 0.83984375, + "learning_rate": 2.973877847488451e-06, + "loss": 1.7172536849975586, + "step": 990 + }, + { + "epoch": 0.6771626883358727, + "grad_norm": 0.859375, + "learning_rate": 
2.8663991146958064e-06, + "loss": 1.7149576187133788, + "step": 1000 + }, + { + "epoch": 0.6771626883358727, + "eval_loss": 1.7007688283920288, + "eval_runtime": 165.432, + "eval_samples_per_second": 5.785, + "eval_steps_per_second": 0.725, + "step": 1000 + }, + { + "epoch": 0.6839343152192314, + "grad_norm": 0.90625, + "learning_rate": 2.7601125644042777e-06, + "loss": 1.714142417907715, + "step": 1010 + }, + { + "epoch": 0.6907059421025902, + "grad_norm": 0.859375, + "learning_rate": 2.6550775858680793e-06, + "loss": 1.7104360580444335, + "step": 1020 + }, + { + "epoch": 0.6974775689859489, + "grad_norm": 0.90234375, + "learning_rate": 2.551352869006338e-06, + "loss": 1.7032684326171874, + "step": 1030 + }, + { + "epoch": 0.7042491958693076, + "grad_norm": 0.86328125, + "learning_rate": 2.4489963716092096e-06, + "loss": 1.701323890686035, + "step": 1040 + }, + { + "epoch": 0.7110208227526663, + "grad_norm": 0.890625, + "learning_rate": 2.348065286953048e-06, + "loss": 1.7169862747192384, + "step": 1050 + }, + { + "epoch": 0.7177924496360251, + "grad_norm": 0.87890625, + "learning_rate": 2.2486160118427958e-06, + "loss": 1.701096534729004, + "step": 1060 + }, + { + "epoch": 0.7245640765193838, + "grad_norm": 0.88671875, + "learning_rate": 2.1507041150993813e-06, + "loss": 1.700172233581543, + "step": 1070 + }, + { + "epoch": 0.7313357034027425, + "grad_norm": 0.859375, + "learning_rate": 2.054384306509794e-06, + "loss": 1.7045093536376954, + "step": 1080 + }, + { + "epoch": 0.7381073302861012, + "grad_norm": 0.859375, + "learning_rate": 1.9597104062571337e-06, + "loss": 1.7091920852661133, + "step": 1090 + }, + { + "epoch": 0.74487895716946, + "grad_norm": 0.86328125, + "learning_rate": 1.8667353148477547e-06, + "loss": 1.7001871109008788, + "step": 1100 + }, + { + "epoch": 0.7516505840528187, + "grad_norm": 0.85546875, + "learning_rate": 1.7755109835522938e-06, + "loss": 1.7016315460205078, + "step": 1110 + }, + { + "epoch": 0.7584222109361775, + "grad_norm": 0.87890625, + "learning_rate": 1.6860883853770848e-06, + "loss": 1.7196449279785155, + "step": 1120 + }, + { + "epoch": 0.7651938378195361, + "grad_norm": 0.89453125, + "learning_rate": 1.5985174865822146e-06, + "loss": 1.701955223083496, + "step": 1130 + }, + { + "epoch": 0.7719654647028948, + "grad_norm": 0.85546875, + "learning_rate": 1.5128472187620886e-06, + "loss": 1.703407096862793, + "step": 1140 + }, + { + "epoch": 0.7787370915862536, + "grad_norm": 0.875, + "learning_rate": 1.4291254515041592e-06, + "loss": 1.7057323455810547, + "step": 1150 + }, + { + "epoch": 0.7855087184696123, + "grad_norm": 0.8828125, + "learning_rate": 1.3473989656410413e-06, + "loss": 1.6963571548461913, + "step": 1160 + }, + { + "epoch": 0.7922803453529711, + "grad_norm": 0.8671875, + "learning_rate": 1.2677134271110082e-06, + "loss": 1.7136796951293944, + "step": 1170 + }, + { + "epoch": 0.7990519722363297, + "grad_norm": 0.89453125, + "learning_rate": 1.1901133614414352e-06, + "loss": 1.7095062255859375, + "step": 1180 + }, + { + "epoch": 0.8058235991196885, + "grad_norm": 0.875, + "learning_rate": 1.114642128869473e-06, + "loss": 1.7052017211914063, + "step": 1190 + }, + { + "epoch": 0.8125952260030472, + "grad_norm": 0.8984375, + "learning_rate": 1.0413419001138525e-06, + "loss": 1.7166055679321288, + "step": 1200 + }, + { + "epoch": 0.819366852886406, + "grad_norm": 0.87890625, + "learning_rate": 9.702536328113305e-07, + "loss": 1.7042055130004883, + "step": 1210 + }, + { + "epoch": 0.8261384797697647, + "grad_norm": 0.8671875, + 
"learning_rate": 9.014170486309875e-07, + "loss": 1.6885286331176759, + "step": 1220 + }, + { + "epoch": 0.8329101066531234, + "grad_norm": 0.84375, + "learning_rate": 8.348706110791238e-07, + "loss": 1.7065910339355468, + "step": 1230 + }, + { + "epoch": 0.8396817335364821, + "grad_norm": 0.87109375, + "learning_rate": 7.706515040071854e-07, + "loss": 1.6999498367309571, + "step": 1240 + }, + { + "epoch": 0.8464533604198409, + "grad_norm": 0.8828125, + "learning_rate": 7.08795610834706e-07, + "loss": 1.7021600723266601, + "step": 1250 + }, + { + "epoch": 0.8532249873031996, + "grad_norm": 0.87890625, + "learning_rate": 6.493374944988984e-07, + "loss": 1.722920799255371, + "step": 1260 + }, + { + "epoch": 0.8599966141865584, + "grad_norm": 0.8671875, + "learning_rate": 5.923103781420708e-07, + "loss": 1.7148597717285157, + "step": 1270 + }, + { + "epoch": 0.866768241069917, + "grad_norm": 0.890625, + "learning_rate": 5.377461265476868e-07, + "loss": 1.7151250839233398, + "step": 1280 + }, + { + "epoch": 0.8735398679532758, + "grad_norm": 0.8671875, + "learning_rate": 4.856752283354277e-07, + "loss": 1.7023918151855468, + "step": 1290 + }, + { + "epoch": 0.8803114948366345, + "grad_norm": 0.8671875, + "learning_rate": 4.3612677892519496e-07, + "loss": 1.7045417785644532, + "step": 1300 + }, + { + "epoch": 0.8870831217199933, + "grad_norm": 0.86328125, + "learning_rate": 3.891284642796045e-07, + "loss": 1.7008039474487304, + "step": 1310 + }, + { + "epoch": 0.8938547486033519, + "grad_norm": 0.8671875, + "learning_rate": 3.447065454340198e-07, + "loss": 1.7126380920410156, + "step": 1320 + }, + { + "epoch": 0.9006263754867107, + "grad_norm": 0.88671875, + "learning_rate": 3.028858438227966e-07, + "loss": 1.7127569198608399, + "step": 1330 + }, + { + "epoch": 0.9073980023700694, + "grad_norm": 0.86328125, + "learning_rate": 2.636897274099187e-07, + "loss": 1.7151193618774414, + "step": 1340 + }, + { + "epoch": 0.9141696292534282, + "grad_norm": 0.8515625, + "learning_rate": 2.2714009763178945e-07, + "loss": 1.704157829284668, + "step": 1350 + }, + { + "epoch": 0.9209412561367869, + "grad_norm": 0.87890625, + "learning_rate": 1.932573771594648e-07, + "loss": 1.7036989212036133, + "step": 1360 + }, + { + "epoch": 0.9277128830201455, + "grad_norm": 0.8671875, + "learning_rate": 1.6206049848716765e-07, + "loss": 1.7044996261596679, + "step": 1370 + }, + { + "epoch": 0.9344845099035043, + "grad_norm": 1.109375, + "learning_rate": 1.3356689335346728e-07, + "loss": 1.7029462814331056, + "step": 1380 + }, + { + "epoch": 0.941256136786863, + "grad_norm": 0.91015625, + "learning_rate": 1.0779248300102352e-07, + "loss": 1.7133670806884767, + "step": 1390 + }, + { + "epoch": 0.9480277636702218, + "grad_norm": 0.859375, + "learning_rate": 8.475166928034684e-08, + "loss": 1.6992549896240234, + "step": 1400 + }, + { + "epoch": 0.9547993905535805, + "grad_norm": 0.85546875, + "learning_rate": 6.445732660254056e-08, + "loss": 1.7066579818725587, + "step": 1410 + }, + { + "epoch": 0.9615710174369392, + "grad_norm": 0.9140625, + "learning_rate": 4.692079474552691e-08, + "loss": 1.6963106155395509, + "step": 1420 + }, + { + "epoch": 0.9683426443202979, + "grad_norm": 0.8515625, + "learning_rate": 3.2151872517767194e-08, + "loss": 1.7118385314941407, + "step": 1430 + }, + { + "epoch": 0.9751142712036567, + "grad_norm": 0.84375, + "learning_rate": 2.0158812283030403e-08, + "loss": 1.6870197296142577, + "step": 1440 + }, + { + "epoch": 0.9818858980870154, + "grad_norm": 0.87109375, + "learning_rate": 
1.094831534925289e-08, + "loss": 1.7051671981811523, + "step": 1450 + }, + { + "epoch": 0.9886575249703742, + "grad_norm": 0.86328125, + "learning_rate": 4.5255282240802554e-09, + "loss": 1.7082006454467773, + "step": 1460 + }, + { + "epoch": 0.9954291518537328, + "grad_norm": 0.8828125, + "learning_rate": 8.940397391787869e-10, + "loss": 1.707107162475586, + "step": 1470 + }, + { + "epoch": 1.0, + "eval_loss": 1.7002202272415161, + "eval_runtime": 169.1979, + "eval_samples_per_second": 5.656, + "eval_steps_per_second": 0.709, + "step": 1477 + } + ], + "logging_steps": 10, + "max_steps": 1477, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.103177196962902e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1477/training_args.bin b/checkpoint-1477/training_args.bin new file mode 100644 index 0000000..70b9526 --- /dev/null +++ b/checkpoint-1477/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:021e20fabb8f12442e13effbcc63f0a47b25ed87f82c678b87ee5792f87ef9bc +size 5777 diff --git a/config.json b/config.json new file mode 100644 index 0000000..cbfe8ec --- /dev/null +++ b/config.json @@ -0,0 +1,74 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "bfloat16", + "eos_token_id": 151643, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_name": "unsloth/Qwen3-4B-Base", + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151669, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.5.3", + "unsloth_fixed": true, + "unsloth_version": "2026.4.4", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/eval/eval_results_final.json b/eval/eval_results_final.json new file mode 100644 index 0000000..6243158 --- /dev/null +++ b/eval/eval_results_final.json @@ -0,0 +1,15657 @@ +{ + "model_path": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "ppl": null, + "base_ppl": null, + "benchmarks": { + "cpt": { + "results": { + "arc_challenge": { + "alias": "arc_challenge", + "acc,none": 0.4825, + "acc_stderr,none": 
0.025015972341295333, + "acc_norm,none": 0.5325, + "acc_norm_stderr,none": 0.024978374105060028 + }, + "arc_easy": { + "alias": "arc_easy", + "acc,none": 0.78, + "acc_stderr,none": 0.020738254217024313, + "acc_norm,none": 0.795, + "acc_norm_stderr,none": 0.020210359883399975 + }, + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.4975, + "acc_stderr,none": 0.025030995822773405, + "acc_norm,none": 0.63, + "acc_norm_stderr,none": 0.024170447375168467 + }, + "kmmlu": { + "acc,none": 0.4692806221646144, + "acc_stderr,none": 0.0039182515413587, + "alias": "kmmlu" + }, + "kmmlu_applied_science": { + "acc,none": 0.45375, + "acc_stderr,none": 0.007111885914543827, + "alias": " - kmmlu_applied_science" + }, + "kmmlu_aviation_engineering_and_maintenance": { + "alias": " - kmmlu_aviation_engineering_and_maintenance", + "acc,none": 0.46, + "acc_stderr,none": 0.024951079956135092 + }, + "kmmlu_electronics_engineering": { + "alias": " - kmmlu_electronics_engineering", + "acc,none": 0.6275, + "acc_stderr,none": 0.0242038000082031 + }, + "kmmlu_energy_management": { + "alias": " - kmmlu_energy_management", + "acc,none": 0.395, + "acc_stderr,none": 0.0244731452227279 + }, + "kmmlu_environmental_science": { + "alias": " - kmmlu_environmental_science", + "acc,none": 0.37, + "acc_stderr,none": 0.024170447375168453 + }, + "kmmlu_gas_technology_and_engineering": { + "alias": " - kmmlu_gas_technology_and_engineering", + "acc,none": 0.405, + "acc_stderr,none": 0.024575340657273674 + }, + "kmmlu_geomatics": { + "alias": " - kmmlu_geomatics", + "acc,none": 0.425, + "acc_stderr,none": 0.024748104405776187 + }, + "kmmlu_industrial_engineer": { + "alias": " - kmmlu_industrial_engineer", + "acc,none": 0.4275, + "acc_stderr,none": 0.024766769210836766 + }, + "kmmlu_machine_design_and_manufacturing": { + "alias": " - kmmlu_machine_design_and_manufacturing", + "acc,none": 0.4975, + "acc_stderr,none": 0.025030995822773395 + }, + "kmmlu_maritime_engineering": { + "alias": " - kmmlu_maritime_engineering", + "acc,none": 0.4075, + "acc_stderr,none": 0.02459923129797198 + }, + "kmmlu_nondestructive_testing": { + "alias": " - kmmlu_nondestructive_testing", + "acc,none": 0.475, + "acc_stderr,none": 0.024999999999999994 + }, + "kmmlu_railway_and_automotive_engineering": { + "alias": " - kmmlu_railway_and_automotive_engineering", + "acc,none": 0.3825, + "acc_stderr,none": 0.024330316186072946 + }, + "kmmlu_telecommunications_and_wireless_technology": { + "alias": " - kmmlu_telecommunications_and_wireless_technology", + "acc,none": 0.5725, + "acc_stderr,none": 0.02476676921083677 + }, + "kmmlu_humss": { + "acc,none": 0.4776556776556777, + "acc_stderr,none": 0.00943997794327789, + "alias": " - kmmlu_humss" + }, + "kmmlu_accounting": { + "alias": " - kmmlu_accounting", + "acc,none": 0.5, + "acc_stderr,none": 0.050251890762960605 + }, + "kmmlu_criminal_law": { + "alias": " - kmmlu_criminal_law", + "acc,none": 0.39, + "acc_stderr,none": 0.03457567623250012 + }, + "kmmlu_economics": { + "alias": " - kmmlu_economics", + "acc,none": 0.5461538461538461, + "acc_stderr,none": 0.04383459241436368 + }, + "kmmlu_education": { + "alias": " - kmmlu_education", + "acc,none": 0.64, + "acc_stderr,none": 0.048241815132442176 + }, + "kmmlu_korean_history": { + "alias": " - kmmlu_korean_history", + "acc,none": 0.3, + "acc_stderr,none": 0.046056618647183814 + }, + "kmmlu_law": { + "alias": " - kmmlu_law", + "acc,none": 0.375, + "acc_stderr,none": 0.02423646044779629 + }, + "kmmlu_management": { + "alias": " - kmmlu_management", + "acc,none": 
0.5225, + "acc_stderr,none": 0.02500595167250431 + }, + "kmmlu_political_science_and_sociology": { + "alias": " - kmmlu_political_science_and_sociology", + "acc,none": 0.55, + "acc_stderr,none": 0.02877080459987894 + }, + "kmmlu_psychology": { + "alias": " - kmmlu_psychology", + "acc,none": 0.45, + "acc_stderr,none": 0.024905837706844923 + }, + "kmmlu_social_welfare": { + "alias": " - kmmlu_social_welfare", + "acc,none": 0.57, + "acc_stderr,none": 0.02478478796128207 + }, + "kmmlu_taxation": { + "alias": " - kmmlu_taxation", + "acc,none": 0.395, + "acc_stderr,none": 0.03465370682892271 + }, + "kmmlu_other": { + "acc,none": 0.4697222222222222, + "acc_stderr,none": 0.008043980393376315, + "alias": " - kmmlu_other" + }, + "kmmlu_agricultural_sciences": { + "alias": " - kmmlu_agricultural_sciences", + "acc,none": 0.3625, + "acc_stderr,none": 0.024066207238097735 + }, + "kmmlu_construction": { + "alias": " - kmmlu_construction", + "acc,none": 0.4, + "acc_stderr,none": 0.024525573579398552 + }, + "kmmlu_fashion": { + "alias": " - kmmlu_fashion", + "acc,none": 0.45, + "acc_stderr,none": 0.024905837706844923 + }, + "kmmlu_food_processing": { + "alias": " - kmmlu_food_processing", + "acc,none": 0.3675, + "acc_stderr,none": 0.024136399679191744 + }, + "kmmlu_health": { + "alias": " - kmmlu_health", + "acc,none": 0.58, + "acc_stderr,none": 0.049604496374885836 + }, + "kmmlu_interior_architecture_and_design": { + "alias": " - kmmlu_interior_architecture_and_design", + "acc,none": 0.6175, + "acc_stderr,none": 0.024330316186072936 + }, + "kmmlu_marketing": { + "alias": " - kmmlu_marketing", + "acc,none": 0.765, + "acc_stderr,none": 0.021226490755055 + }, + "kmmlu_patent": { + "alias": " - kmmlu_patent", + "acc,none": 0.42, + "acc_stderr,none": 0.049604496374885836 + }, + "kmmlu_public_safety": { + "alias": " - kmmlu_public_safety", + "acc,none": 0.38, + "acc_stderr,none": 0.024299715851758236 + }, + "kmmlu_real_estate": { + "alias": " - kmmlu_real_estate", + "acc,none": 0.45, + "acc_stderr,none": 0.03526639466921485 + }, + "kmmlu_refrigerating_machinery": { + "alias": " - kmmlu_refrigerating_machinery", + "acc,none": 0.41, + "acc_stderr,none": 0.02462246259333947 + }, + "kmmlu_stem": { + "acc,none": 0.48093023255813955, + "acc_stderr,none": 0.007306868046626305, + "alias": " - kmmlu_stem" + }, + "kmmlu_biology": { + "alias": " - kmmlu_biology", + "acc,none": 0.3125, + "acc_stderr,none": 0.023204644228784484 + }, + "kmmlu_chemical_engineering": { + "alias": " - kmmlu_chemical_engineering", + "acc,none": 0.4875, + "acc_stderr,none": 0.025023485209500245 + }, + "kmmlu_chemistry": { + "alias": " - kmmlu_chemistry", + "acc,none": 0.5175, + "acc_stderr,none": 0.025015972341295323 + }, + "kmmlu_civil_engineering": { + "alias": " - kmmlu_civil_engineering", + "acc,none": 0.3925, + "acc_stderr,none": 0.024445927747963322 + }, + "kmmlu_computer_science": { + "alias": " - kmmlu_computer_science", + "acc,none": 0.74, + "acc_stderr,none": 0.021959178349484305 + }, + "kmmlu_ecology": { + "alias": " - kmmlu_ecology", + "acc,none": 0.505, + "acc_stderr,none": 0.02503005711936146 + }, + "kmmlu_electrical_engineering": { + "alias": " - kmmlu_electrical_engineering", + "acc,none": 0.3425, + "acc_stderr,none": 0.02375700661717548 + }, + "kmmlu_information_technology": { + "alias": " - kmmlu_information_technology", + "acc,none": 0.7525, + "acc_stderr,none": 0.021605006729678956 + }, + "kmmlu_materials_engineering": { + "alias": " - kmmlu_materials_engineering", + "acc,none": 0.475, + "acc_stderr,none": 0.025 + }, + 
"kmmlu_math": { + "alias": " - kmmlu_math", + "acc,none": 0.3333333333333333, + "acc_stderr,none": 0.027262027336984393 + }, + "kmmlu_mechanical_engineering": { + "alias": " - kmmlu_mechanical_engineering", + "acc,none": 0.395, + "acc_stderr,none": 0.0244731452227279 + }, + "kobest_boolq": { + "alias": "kobest_boolq", + "acc,none": 0.755, + "acc_stderr,none": 0.02153129097913247, + "f1,none": 0.7379609080456697, + "f1_stderr,none": "N/A" + }, + "kobest_copa": { + "alias": "kobest_copa", + "acc,none": 0.6525, + "acc_stderr,none": 0.023838625698390636, + "f1,none": 0.6523935455233165, + "f1_stderr,none": "N/A" + }, + "kobest_hellaswag": { + "alias": "kobest_hellaswag", + "acc,none": 0.4325, + "acc_stderr,none": 0.024802162065186355, + "f1,none": 0.4264529493583016, + "f1_stderr,none": "N/A", + "acc_norm,none": 0.565, + "acc_norm_stderr,none": 0.024818892876375884 + }, + "mmlu": { + "acc,none": 0.7352865587252634, + "acc_stderr,none": 0.003887849176172822, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6862808842652796, + "acc_stderr,none": 0.0077616777391173045, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.5873015873015873, + "acc_stderr,none": 0.04403438954768177 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.7818181818181819, + "acc_stderr,none": 0.03225078108306289 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.8186274509803921, + "acc_stderr,none": 0.02704462171947408 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.8481012658227848, + "acc_stderr,none": 0.023363878096632453 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.8264462809917356, + "acc_stderr,none": 0.0345727283691767 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.8148148148148148, + "acc_stderr,none": 0.03755265865037183 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.8466257668711656, + "acc_stderr,none": 0.02831160144143859 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.7543352601156069, + "acc_stderr,none": 0.023176298203992005 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.3225, + "acc_stderr,none": 0.023400926978618716 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.7331189710610932, + "acc_stderr,none": 0.025122637608816636 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.7870370370370371, + "acc_stderr,none": 0.02277971908873339 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.5075, + "acc_stderr,none": 0.02502849253543831 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.8070175438596491, + "acc_stderr,none": 0.030267457554898458 + }, + "mmlu_other": { + "acc,none": 0.7415565345080763, + "acc_stderr,none": 0.008104267812218218, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.76, + "acc_stderr,none": 0.04292346959909282 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.769811320754717, + "acc_stderr,none": 0.025907897122408173 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.7456647398843931, + "acc_stderr,none": 0.0332055644308557 + }, + "mmlu_global_facts": { + "alias": " - 
global_facts", + "acc,none": 0.44, + "acc_stderr,none": 0.0498887651569859 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.7399103139013453, + "acc_stderr,none": 0.029442495585857473 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.8640776699029126, + "acc_stderr,none": 0.0339329572976101 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.8931623931623932, + "acc_stderr,none": 0.020237149008990932 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.8, + "acc_stderr,none": 0.04020151261036846 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.8225, + "acc_stderr,none": 0.019128489820344343 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.7777777777777778, + "acc_stderr,none": 0.02380518652488816 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.574468085106383, + "acc_stderr,none": 0.029494827600144366 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.7757352941176471, + "acc_stderr,none": 0.02533684856333236 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.5060240963855421, + "acc_stderr,none": 0.038922121953330446 + }, + "mmlu_social_sciences": { + "acc,none": 0.8158088235294118, + "acc_stderr,none": 0.007306038192044323, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.6578947368421053, + "acc_stderr,none": 0.04462917535336937 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.8585858585858586, + "acc_stderr,none": 0.02482590979334335 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.8704663212435233, + "acc_stderr,none": 0.024233532297758716 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.8076923076923077, + "acc_stderr,none": 0.019982347208637296 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.8991596638655462, + "acc_stderr,none": 0.019559663430480802 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.905, + "acc_stderr,none": 0.014679107277903242 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.7786259541984732, + "acc_stderr,none": 0.03641297081313729 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.74, + "acc_stderr,none": 0.02195917834948431 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.6727272727272727, + "acc_stderr,none": 0.0449429086625209 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.7428571428571429, + "acc_stderr,none": 0.027979823538744546 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.8557213930348259, + "acc_stderr,none": 0.02484575321230605 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.89, + "acc_stderr,none": 0.03144660377352203 + }, + "mmlu_stem": { + "acc,none": 0.7082143989850935, + "acc_stderr,none": 0.007816574368205405, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.46, + "acc_stderr,none": 0.05009082659620333 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.7111111111111111, + 
"acc_stderr,none": 0.0391545063041425 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.8486842105263158, + "acc_stderr,none": 0.029162631596843975 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.8263888888888888, + "acc_stderr,none": 0.03167473383795717 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.52, + "acc_stderr,none": 0.050211673156867795 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.68, + "acc_stderr,none": 0.04688261722621504 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.53, + "acc_stderr,none": 0.05016135580465919 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.5784313725490197, + "acc_stderr,none": 0.049135952012745045 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.83, + "acc_stderr,none": 0.03775251680686371 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.8, + "acc_stderr,none": 0.026148818018424506 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.7586206896551724, + "acc_stderr,none": 0.03565998174135302 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.6746031746031746, + "acc_stderr,none": 0.024130158299762613 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.9, + "acc_stderr,none": 0.017066403719657258 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.729064039408867, + "acc_stderr,none": 0.03127090713297698 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.85, + "acc_stderr,none": 0.0358870281282637 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.5296296296296297, + "acc_stderr,none": 0.030431963547936584 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.6754966887417219, + "acc_stderr,none": 0.03822746937658752 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.7037037037037037, + "acc_stderr,none": 0.031141447823536044 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.5892857142857143, + "acc_stderr,none": 0.04669510663875191 + }, + "winogrande": { + "alias": "winogrande", + "acc,none": 0.7225, + "acc_stderr,none": 0.022416302137144652 + } + }, + "groups": { + "kmmlu": { + "acc,none": 0.4692806221646144, + "acc_stderr,none": 0.0039182515413587, + "alias": "kmmlu" + }, + "kmmlu_applied_science": { + "acc,none": 0.45375, + "acc_stderr,none": 0.007111885914543827, + "alias": " - kmmlu_applied_science" + }, + "kmmlu_humss": { + "acc,none": 0.4776556776556777, + "acc_stderr,none": 0.00943997794327789, + "alias": " - kmmlu_humss" + }, + "kmmlu_other": { + "acc,none": 0.4697222222222222, + "acc_stderr,none": 0.008043980393376315, + "alias": " - kmmlu_other" + }, + "kmmlu_stem": { + "acc,none": 0.48093023255813955, + "acc_stderr,none": 0.007306868046626305, + "alias": " - kmmlu_stem" + }, + "mmlu": { + "acc,none": 0.7352865587252634, + "acc_stderr,none": 0.003887849176172822, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6862808842652796, + "acc_stderr,none": 0.0077616777391173045, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.7415565345080763, + 
"acc_stderr,none": 0.008104267812218218, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.8158088235294118, + "acc_stderr,none": 0.007306038192044323, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.7082143989850935, + "acc_stderr,none": 0.007816574368205405, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_formal_logic", + "mmlu_high_school_european_history", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_law", + "mmlu_world_religions" + ], + "mmlu_social_sciences": [ + "mmlu_econometrics", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_psychology", + "mmlu_human_sexuality", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy" + ], + "mmlu_other": [ + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_medicine", + "mmlu_global_facts", + "mmlu_human_aging", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_nutrition", + "mmlu_professional_accounting", + "mmlu_professional_medicine", + "mmlu_virology" + ], + "mmlu_stem": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_mathematics", + "mmlu_high_school_physics", + "mmlu_high_school_statistics", + "mmlu_machine_learning" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ], + "hellaswag": [], + "arc_easy": [], + "arc_challenge": [], + "winogrande": [], + "kmmlu_humss": [ + "kmmlu_accounting", + "kmmlu_criminal_law", + "kmmlu_economics", + "kmmlu_education", + "kmmlu_korean_history", + "kmmlu_law", + "kmmlu_management", + "kmmlu_political_science_and_sociology", + "kmmlu_psychology", + "kmmlu_social_welfare", + "kmmlu_taxation" + ], + "kmmlu_applied_science": [ + "kmmlu_aviation_engineering_and_maintenance", + "kmmlu_electronics_engineering", + "kmmlu_energy_management", + "kmmlu_environmental_science", + "kmmlu_gas_technology_and_engineering", + "kmmlu_geomatics", + "kmmlu_industrial_engineer", + "kmmlu_machine_design_and_manufacturing", + "kmmlu_maritime_engineering", + "kmmlu_nondestructive_testing", + "kmmlu_railway_and_automotive_engineering", + "kmmlu_telecommunications_and_wireless_technology" + ], + "kmmlu_other": [ + "kmmlu_agricultural_sciences", + "kmmlu_construction", + "kmmlu_fashion", + "kmmlu_food_processing", + "kmmlu_health", + "kmmlu_interior_architecture_and_design", + "kmmlu_marketing", + "kmmlu_patent", + "kmmlu_public_safety", + "kmmlu_real_estate", + "kmmlu_refrigerating_machinery" + ], + "kmmlu_stem": [ + "kmmlu_biology", + "kmmlu_chemical_engineering", + "kmmlu_chemistry", + "kmmlu_civil_engineering", + "kmmlu_computer_science", + "kmmlu_ecology", + "kmmlu_electrical_engineering", + "kmmlu_information_technology", + 
"kmmlu_materials_engineering", + "kmmlu_math", + "kmmlu_mechanical_engineering" + ], + "kmmlu": [ + "kmmlu_stem", + "kmmlu_other", + "kmmlu_applied_science", + "kmmlu_humss" + ], + "kobest_boolq": [], + "kobest_copa": [], + "kobest_hellaswag": [] + }, + "configs": { + "arc_challenge": { + "task": "arc_challenge", + "tag": [ + "ai2_arc" + ], + "dataset_path": "allenai/ai2_arc", + "dataset_name": "ARC-Challenge", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{choices.label.index(answerKey)}}", + "unsafe_code": false, + "doc_to_choice": "{{choices.text}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_choice": "{{choices.text}}", + "doc_to_target": "{{choices.label.index(answerKey)}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "arc_easy": { + "task": "arc_easy", + "tag": [ + "ai2_arc" + ], + "dataset_path": "allenai/ai2_arc", + "dataset_name": "ARC-Easy", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{choices.label.index(answerKey)}}", + "unsafe_code": false, + "doc_to_choice": "{{choices.text}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_choice": "{{choices.text}}", + "doc_to_target": "{{choices.label.index(answerKey)}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "Rowan/hellaswag", + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return 
out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": "", + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{query}}", + "doc_to_choice": "choices", + "doc_to_target": "{{label}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_accounting": { + "task": "kmmlu_accounting", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Accounting", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_agricultural_sciences": { + "task": "kmmlu_agricultural_sciences", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Agricultural-Sciences", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_aviation_engineering_and_maintenance": { + "task": "kmmlu_aviation_engineering_and_maintenance", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Aviation-Engineering-and-Maintenance", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_biology": { + "task": "kmmlu_biology", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Biology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_chemical_engineering": { + "task": "kmmlu_chemical_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Chemical-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_chemistry": { + "task": "kmmlu_chemistry", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Chemistry", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_civil_engineering": { + "task": "kmmlu_civil_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Civil-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_computer_science": { + "task": "kmmlu_computer_science", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Computer-Science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_construction": { + "task": "kmmlu_construction", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Construction", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_criminal_law": { + "task": "kmmlu_criminal_law", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Criminal-Law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_ecology": { + "task": "kmmlu_ecology", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Ecology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_economics": { + "task": "kmmlu_economics", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Economics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_education": { + "task": "kmmlu_education", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Education", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_electrical_engineering": { + "task": "kmmlu_electrical_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Electrical-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_electronics_engineering": { + "task": "kmmlu_electronics_engineering", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Electronics-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_energy_management": { + "task": "kmmlu_energy_management", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Energy-Management", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_environmental_science": { + "task": "kmmlu_environmental_science", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Environmental-Science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_fashion": { + "task": "kmmlu_fashion", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Fashion", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_food_processing": { + "task": "kmmlu_food_processing", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Food-Processing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_gas_technology_and_engineering": { + "task": "kmmlu_gas_technology_and_engineering", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Gas-Technology-and-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_geomatics": { + "task": "kmmlu_geomatics", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Geomatics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_health": { + "task": "kmmlu_health", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Health", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_industrial_engineer": { + "task": "kmmlu_industrial_engineer", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Industrial-Engineer", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_information_technology": { + "task": "kmmlu_information_technology", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Information-Technology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_interior_architecture_and_design": { + "task": "kmmlu_interior_architecture_and_design", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Interior-Architecture-and-Design", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_korean_history": { + "task": "kmmlu_korean_history", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Korean-History", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_law": { + "task": "kmmlu_law", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_machine_design_and_manufacturing": { + "task": "kmmlu_machine_design_and_manufacturing", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Machine-Design-and-Manufacturing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_management": { + "task": "kmmlu_management", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Management", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_maritime_engineering": { + "task": "kmmlu_maritime_engineering", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Maritime-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_marketing": { + "task": "kmmlu_marketing", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Marketing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_materials_engineering": { + "task": "kmmlu_materials_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Materials-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_math": { + "task": "kmmlu_math", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Math", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_mechanical_engineering": { + "task": "kmmlu_mechanical_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Mechanical-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_nondestructive_testing": { + "task": "kmmlu_nondestructive_testing", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Nondestructive-Testing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_patent": { + "task": "kmmlu_patent", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Patent", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_political_science_and_sociology": { + "task": "kmmlu_political_science_and_sociology", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Political-Science-and-Sociology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_psychology": { + "task": "kmmlu_psychology", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Psychology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_public_safety": { + "task": "kmmlu_public_safety", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Public-Safety", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_railway_and_automotive_engineering": { + "task": "kmmlu_railway_and_automotive_engineering", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Railway-and-Automotive-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_real_estate": { + "task": "kmmlu_real_estate", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Real-Estate", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_refrigerating_machinery": { + "task": "kmmlu_refrigerating_machinery", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Refrigerating-Machinery", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_social_welfare": { + "task": "kmmlu_social_welfare", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Social-Welfare", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_taxation": { + "task": "kmmlu_taxation", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Taxation", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_telecommunications_and_wireless_technology": { + "task": "kmmlu_telecommunications_and_wireless_technology", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Telecommunications-and-Wireless-Technology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kobest_boolq": { + "task": "kobest_boolq", + "dataset_path": "skt/kobest_v1", + "dataset_name": "boolq", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "doc_to_text": "{{paragraph}} 질문: {{question}} 답변: ", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": [ + "아니오", + "예" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{paragraph}} 질문: {{question}} 답변: ", + "doc_to_choice": [ + "아니오", + "예" + ], + "doc_to_target": "{{label}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "f1", + "aggregation": "def macro_f1_score(items):\n from sklearn.metrics import f1_score\n\n unzipped_list = list(zip(*items))\n golds = unzipped_list[0]\n preds = unzipped_list[1]\n fscore = f1_score(golds, preds, average=\"macro\")\n return fscore\n", + "average": "macro", + "hf_evaluate": true, + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kobest_copa": { + "task": "kobest_copa", + "dataset_path": "skt/kobest_v1", + "dataset_name": "copa", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "doc_to_text": "def copa_doc_to_text(doc: dict) -> str:\n connector = {\"원인\": \" 왜냐하면\", \"결과\": \" 그래서\"}[doc[\"question\"].strip()]\n return f\"\"\"{doc[\"premise\"]} {connector}\"\"\"\n", + "doc_to_target": "def copa_doc_to_target(doc: dict) -> str:\n correct_choice = doc[\"alternative_1\"] if doc[\"label\"] == 0 else doc[\"alternative_2\"]\n return f\"\"\"{correct_choice}\"\"\"\n", + "unsafe_code": false, + "doc_to_choice": "def copa_doc_to_choice(doc: dict) -> list:\n return [f\"\"\"{doc[\"alternative_1\"]}\"\"\", f\"\"\"{doc[\"alternative_2\"]}\"\"\"]\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "", + "doc_to_choice": "", + "doc_to_target": "", + "gen_prefix": null, + "fewshot_delimiter": 
"\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "f1", + "aggregation": "def macro_f1_score(items):\n from sklearn.metrics import f1_score\n\n unzipped_list = list(zip(*items))\n golds = unzipped_list[0]\n preds = unzipped_list[1]\n fscore = f1_score(golds, preds, average=\"macro\")\n return fscore\n", + "average": "macro", + "hf_evaluate": true, + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kobest_hellaswag": { + "task": "kobest_hellaswag", + "dataset_path": "skt/kobest_v1", + "dataset_name": "hellaswag", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "process_docs": "def hellaswag_process_doc(doc: Dataset) -> Dataset:\n def preprocessor(dataset):\n return {\n \"query\": f\"\"\"문장: {dataset[\"context\"]}\"\"\",\n \"choices\": [\n dataset[\"ending_1\"],\n dataset[\"ending_2\"],\n dataset[\"ending_3\"],\n dataset[\"ending_4\"],\n ],\n \"gold\": int(dataset[\"label\"]),\n }\n\n return doc.map(preprocessor)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": "", + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{query}}", + "doc_to_choice": "choices", + "doc_to_target": "{{label}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "f1", + "aggregation": "def macro_f1_score(items):\n from sklearn.metrics import f1_score\n\n unzipped_list = list(zip(*items))\n golds = unzipped_list[0]\n preds = unzipped_list[1]\n fscore = f1_score(golds, preds, average=\"macro\")\n return fscore\n", + "average": "macro", + "hf_evaluate": true, + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "winogrande": { + "task": "winogrande", + "dataset_path": "allenai/winogrande", + "dataset_name": "winogrande_xl", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "def doc_to_text(doc):\n answer_to_num = {\"1\": 0, \"2\": 1}\n return answer_to_num[doc[\"answer\"]]\n", + "doc_to_target": "def doc_to_target(doc):\n idx = doc[\"sentence\"].index(\"_\") + 1\n return doc[\"sentence\"][idx:].strip()\n", + "unsafe_code": false, + "doc_to_choice": "def doc_to_choice(doc):\n idx = doc[\"sentence\"].index(\"_\")\n options = [doc[\"option1\"], doc[\"option2\"]]\n return [doc[\"sentence\"][:idx] + opt for opt in options]\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "", + "doc_to_choice": "", + "doc_to_target": "", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "sentence", + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + } + }, + "versions": { + "arc_challenge": 1.0, + "arc_easy": 1.0, + "hellaswag": 1.0, + "kmmlu": 2.0, + "kmmlu_accounting": 2.0, + "kmmlu_agricultural_sciences": 2.0, + "kmmlu_applied_science": 2.0, + "kmmlu_aviation_engineering_and_maintenance": 2.0, + "kmmlu_biology": 2.0, + "kmmlu_chemical_engineering": 2.0, + "kmmlu_chemistry": 2.0, + "kmmlu_civil_engineering": 2.0, + "kmmlu_computer_science": 2.0, + "kmmlu_construction": 2.0, + "kmmlu_criminal_law": 2.0, + "kmmlu_ecology": 2.0, + "kmmlu_economics": 2.0, + "kmmlu_education": 2.0, + "kmmlu_electrical_engineering": 2.0, + "kmmlu_electronics_engineering": 2.0, + "kmmlu_energy_management": 2.0, + "kmmlu_environmental_science": 2.0, + "kmmlu_fashion": 2.0, + "kmmlu_food_processing": 2.0, + "kmmlu_gas_technology_and_engineering": 2.0, + "kmmlu_geomatics": 2.0, + "kmmlu_health": 2.0, + "kmmlu_humss": 2.0, + "kmmlu_industrial_engineer": 2.0, + "kmmlu_information_technology": 2.0, + "kmmlu_interior_architecture_and_design": 2.0, + "kmmlu_korean_history": 2.0, + "kmmlu_law": 2.0, + "kmmlu_machine_design_and_manufacturing": 2.0, + "kmmlu_management": 2.0, + "kmmlu_maritime_engineering": 2.0, + "kmmlu_marketing": 2.0, + "kmmlu_materials_engineering": 2.0, + "kmmlu_math": 2.0, + "kmmlu_mechanical_engineering": 2.0, + "kmmlu_nondestructive_testing": 2.0, + "kmmlu_other": 2.0, + "kmmlu_patent": 2.0, + "kmmlu_political_science_and_sociology": 2.0, + "kmmlu_psychology": 2.0, + "kmmlu_public_safety": 2.0, + "kmmlu_railway_and_automotive_engineering": 2.0, + "kmmlu_real_estate": 2.0, + 
"kmmlu_refrigerating_machinery": 2.0, + "kmmlu_social_welfare": 2.0, + "kmmlu_stem": 2.0, + "kmmlu_taxation": 2.0, + "kmmlu_telecommunications_and_wireless_technology": 2.0, + "kobest_boolq": 1.0, + "kobest_copa": 1.0, + "kobest_hellaswag": 1.0, + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0, + "winogrande": 1.0 + }, + "n-shot": { + "arc_challenge": 0, + "arc_easy": 0, + "hellaswag": 0, + "kmmlu_accounting": 0, + "kmmlu_agricultural_sciences": 0, + "kmmlu_aviation_engineering_and_maintenance": 0, + "kmmlu_biology": 0, + "kmmlu_chemical_engineering": 0, + "kmmlu_chemistry": 0, + "kmmlu_civil_engineering": 0, + "kmmlu_computer_science": 0, + "kmmlu_construction": 0, + "kmmlu_criminal_law": 0, + "kmmlu_ecology": 0, + "kmmlu_economics": 0, + "kmmlu_education": 0, + "kmmlu_electrical_engineering": 0, + "kmmlu_electronics_engineering": 0, + "kmmlu_energy_management": 0, + "kmmlu_environmental_science": 0, + "kmmlu_fashion": 0, + "kmmlu_food_processing": 0, + "kmmlu_gas_technology_and_engineering": 0, + "kmmlu_geomatics": 0, + "kmmlu_health": 0, + "kmmlu_industrial_engineer": 0, + "kmmlu_information_technology": 0, + "kmmlu_interior_architecture_and_design": 0, + "kmmlu_korean_history": 0, + "kmmlu_law": 0, + "kmmlu_machine_design_and_manufacturing": 0, + "kmmlu_management": 0, + "kmmlu_maritime_engineering": 0, + "kmmlu_marketing": 0, + "kmmlu_materials_engineering": 0, + "kmmlu_math": 0, + "kmmlu_mechanical_engineering": 0, + "kmmlu_nondestructive_testing": 0, + "kmmlu_patent": 0, + "kmmlu_political_science_and_sociology": 0, + "kmmlu_psychology": 0, + "kmmlu_public_safety": 0, + 
"kmmlu_railway_and_automotive_engineering": 0, + "kmmlu_real_estate": 0, + "kmmlu_refrigerating_machinery": 0, + "kmmlu_social_welfare": 0, + "kmmlu_taxation": 0, + "kmmlu_telecommunications_and_wireless_technology": 0, + "kobest_boolq": 0, + "kobest_copa": 0, + "kobest_hellaswag": 0, + "mmlu_abstract_algebra": 0, + "mmlu_anatomy": 0, + "mmlu_astronomy": 0, + "mmlu_business_ethics": 0, + "mmlu_clinical_knowledge": 0, + "mmlu_college_biology": 0, + "mmlu_college_chemistry": 0, + "mmlu_college_computer_science": 0, + "mmlu_college_mathematics": 0, + "mmlu_college_medicine": 0, + "mmlu_college_physics": 0, + "mmlu_computer_security": 0, + "mmlu_conceptual_physics": 0, + "mmlu_econometrics": 0, + "mmlu_electrical_engineering": 0, + "mmlu_elementary_mathematics": 0, + "mmlu_formal_logic": 0, + "mmlu_global_facts": 0, + "mmlu_high_school_biology": 0, + "mmlu_high_school_chemistry": 0, + "mmlu_high_school_computer_science": 0, + "mmlu_high_school_european_history": 0, + "mmlu_high_school_geography": 0, + "mmlu_high_school_government_and_politics": 0, + "mmlu_high_school_macroeconomics": 0, + "mmlu_high_school_mathematics": 0, + "mmlu_high_school_microeconomics": 0, + "mmlu_high_school_physics": 0, + "mmlu_high_school_psychology": 0, + "mmlu_high_school_statistics": 0, + "mmlu_high_school_us_history": 0, + "mmlu_high_school_world_history": 0, + "mmlu_human_aging": 0, + "mmlu_human_sexuality": 0, + "mmlu_international_law": 0, + "mmlu_jurisprudence": 0, + "mmlu_logical_fallacies": 0, + "mmlu_machine_learning": 0, + "mmlu_management": 0, + "mmlu_marketing": 0, + "mmlu_medical_genetics": 0, + "mmlu_miscellaneous": 0, + "mmlu_moral_disputes": 0, + "mmlu_moral_scenarios": 0, + "mmlu_nutrition": 0, + "mmlu_philosophy": 0, + "mmlu_prehistory": 0, + "mmlu_professional_accounting": 0, + "mmlu_professional_law": 0, + "mmlu_professional_medicine": 0, + "mmlu_professional_psychology": 0, + "mmlu_public_relations": 0, + "mmlu_security_studies": 0, + "mmlu_sociology": 0, + "mmlu_us_foreign_policy": 0, + "mmlu_virology": 0, + "mmlu_world_religions": 0, + "winogrande": 0 + }, + "higher_is_better": { + "arc_challenge": { + "acc": true, + "acc_norm": true + }, + "arc_easy": { + "acc": true, + "acc_norm": true + }, + "hellaswag": { + "acc": true, + "acc_norm": true + }, + "kmmlu": { + "acc": true + }, + "kmmlu_accounting": { + "acc": true + }, + "kmmlu_agricultural_sciences": { + "acc": true + }, + "kmmlu_applied_science": { + "acc": true + }, + "kmmlu_aviation_engineering_and_maintenance": { + "acc": true + }, + "kmmlu_biology": { + "acc": true + }, + "kmmlu_chemical_engineering": { + "acc": true + }, + "kmmlu_chemistry": { + "acc": true + }, + "kmmlu_civil_engineering": { + "acc": true + }, + "kmmlu_computer_science": { + "acc": true + }, + "kmmlu_construction": { + "acc": true + }, + "kmmlu_criminal_law": { + "acc": true + }, + "kmmlu_ecology": { + "acc": true + }, + "kmmlu_economics": { + "acc": true + }, + "kmmlu_education": { + "acc": true + }, + "kmmlu_electrical_engineering": { + "acc": true + }, + "kmmlu_electronics_engineering": { + "acc": true + }, + "kmmlu_energy_management": { + "acc": true + }, + "kmmlu_environmental_science": { + "acc": true + }, + "kmmlu_fashion": { + "acc": true + }, + "kmmlu_food_processing": { + "acc": true + }, + "kmmlu_gas_technology_and_engineering": { + "acc": true + }, + "kmmlu_geomatics": { + "acc": true + }, + "kmmlu_health": { + "acc": true + }, + "kmmlu_humss": { + "acc": true + }, + "kmmlu_industrial_engineer": { + "acc": true + }, + "kmmlu_information_technology": { + 
"acc": true + }, + "kmmlu_interior_architecture_and_design": { + "acc": true + }, + "kmmlu_korean_history": { + "acc": true + }, + "kmmlu_law": { + "acc": true + }, + "kmmlu_machine_design_and_manufacturing": { + "acc": true + }, + "kmmlu_management": { + "acc": true + }, + "kmmlu_maritime_engineering": { + "acc": true + }, + "kmmlu_marketing": { + "acc": true + }, + "kmmlu_materials_engineering": { + "acc": true + }, + "kmmlu_math": { + "acc": true + }, + "kmmlu_mechanical_engineering": { + "acc": true + }, + "kmmlu_nondestructive_testing": { + "acc": true + }, + "kmmlu_other": { + "acc": true + }, + "kmmlu_patent": { + "acc": true + }, + "kmmlu_political_science_and_sociology": { + "acc": true + }, + "kmmlu_psychology": { + "acc": true + }, + "kmmlu_public_safety": { + "acc": true + }, + "kmmlu_railway_and_automotive_engineering": { + "acc": true + }, + "kmmlu_real_estate": { + "acc": true + }, + "kmmlu_refrigerating_machinery": { + "acc": true + }, + "kmmlu_social_welfare": { + "acc": true + }, + "kmmlu_stem": { + "acc": true + }, + "kmmlu_taxation": { + "acc": true + }, + "kmmlu_telecommunications_and_wireless_technology": { + "acc": true + }, + "kobest_boolq": { + "acc": true, + "f1": true + }, + "kobest_copa": { + "acc": true, + "f1": true + }, + "kobest_hellaswag": { + "acc": true, + "acc_norm": true, + "f1": true + }, + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + 
"acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + }, + "winogrande": { + "acc": true + } + }, + "n-samples": { + "kobest_hellaswag": { + "original": 500, + "effective": 400 + }, + "kobest_copa": { + "original": 1000, + "effective": 400 + }, + "kobest_boolq": { + "original": 1404, + "effective": 400 + }, + "kmmlu_biology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_chemical_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_chemistry": { + "original": 600, + "effective": 400 + }, + "kmmlu_civil_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_computer_science": { + "original": 1000, + "effective": 400 + }, + "kmmlu_ecology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_electrical_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_information_technology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_materials_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_math": { + "original": 300, + "effective": 300 + }, + "kmmlu_mechanical_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_agricultural_sciences": { + "original": 1000, + "effective": 400 + }, + "kmmlu_construction": { + "original": 1000, + "effective": 400 + }, + "kmmlu_fashion": { + "original": 1000, + "effective": 400 + }, + "kmmlu_food_processing": { + "original": 1000, + "effective": 400 + }, + "kmmlu_health": { + "original": 100, + "effective": 100 + }, + "kmmlu_interior_architecture_and_design": { + "original": 1000, + "effective": 400 + }, + "kmmlu_marketing": { + "original": 1000, + "effective": 400 + }, + "kmmlu_patent": { + "original": 100, + "effective": 100 + }, + "kmmlu_public_safety": { + "original": 1000, + "effective": 400 + }, + "kmmlu_real_estate": { + "original": 200, + "effective": 200 + }, + "kmmlu_refrigerating_machinery": { + "original": 1000, + "effective": 400 + }, + "kmmlu_aviation_engineering_and_maintenance": { + "original": 1000, + "effective": 400 + }, + "kmmlu_electronics_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_energy_management": { + "original": 1000, + "effective": 400 + }, + "kmmlu_environmental_science": { + "original": 1000, + "effective": 400 + }, + "kmmlu_gas_technology_and_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_geomatics": { + "original": 1000, + "effective": 400 + }, + "kmmlu_industrial_engineer": { + "original": 1000, + "effective": 400 + }, + "kmmlu_machine_design_and_manufacturing": { + "original": 1000, + "effective": 400 + }, + "kmmlu_maritime_engineering": { + "original": 600, + "effective": 400 + }, + "kmmlu_nondestructive_testing": { + "original": 1000, + "effective": 400 + }, + "kmmlu_railway_and_automotive_engineering": { + "original": 1000, + "effective": 400 + }, + 
"kmmlu_telecommunications_and_wireless_technology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_accounting": { + "original": 100, + "effective": 100 + }, + "kmmlu_criminal_law": { + "original": 200, + "effective": 200 + }, + "kmmlu_economics": { + "original": 130, + "effective": 130 + }, + "kmmlu_education": { + "original": 100, + "effective": 100 + }, + "kmmlu_korean_history": { + "original": 100, + "effective": 100 + }, + "kmmlu_law": { + "original": 1000, + "effective": 400 + }, + "kmmlu_management": { + "original": 1000, + "effective": 400 + }, + "kmmlu_political_science_and_sociology": { + "original": 300, + "effective": 300 + }, + "kmmlu_psychology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_social_welfare": { + "original": 1000, + "effective": 400 + }, + "kmmlu_taxation": { + "original": 200, + "effective": 200 + }, + "winogrande": { + "original": 1267, + "effective": 400 + }, + "arc_challenge": { + "original": 1172, + "effective": 400 + }, + "arc_easy": { + "original": 2376, + "effective": 400 + }, + "hellaswag": { + "original": 10042, + "effective": 400 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 400 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_high_school_geography": { + "original": 198, + 
"effective": 198 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 400 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 400 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 400 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 400 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + } + }, + "config": { + "model": "hf", + "model_args": { + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + }, + "model_num_parameters": 4022468096, + "model_dtype": "torch.bfloat16", + "model_revision": "main", + "model_sha": "", + "batch_size": "12", + "batch_sizes": [], + "device": "cuda:0", + "use_cache": null, + "limit": 400.0, + "bootstrap_iters": 100000, + "gen_kwargs": {}, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "0ce43af", + "date": 1775962096.959724, + "pretty_env_info": "PyTorch version: 2.9.0+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 4.1.0\nLibc version: glibc-2.35\n\nPython version: 3.11.14 | packaged by conda-forge | (main, Oct 13 2025, 14:09:32) [GCC 14.3.0] (64-bit runtime)\nPython platform: Linux-6.8.0-90-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: GPU 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition\nNvidia driver version: 590.48.01\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_precompiled.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_runtime_compiled.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_graph.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_heuristic.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops.so.9.8.0\nIs XPU available: False\nHIP runtime 
version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 192\nOn-line CPU(s) list: 0-191\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7642 48-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 2\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2300.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4600.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sev sev_es ibpb_exit_to_user\nVirtualization: AMD-V\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47,96-143\nNUMA node1 CPU(s): 48-95,144-191\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT enabled with STIBP protection\nVulnerability Spec rstack overflow: Mitigation; Safe RET\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nVulnerability Vmscape: Mitigation; IBPB before exit to userspace\n\nVersions of relevant libraries:\n[pip3] executorch==1.0.1\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.17.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] optree==0.17.0\n[pip3] pytorch_tokenizers==1.0.1\n[pip3] torch==2.9.0+cu128\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torch-stoi==0.2.3\n[pip3] torchao==0.14.0\n[pip3] torchaudio==2.9.0+cu128\n[pip3] torchcodec==0.9.1\n[pip3] torchelastic==0.2.2\n[pip3] torchvision==0.24.0+cu128\n[pip3] 
triton==3.5.0\n[pip3] triton_kernels==1.0.0\n[conda] No relevant packages", + "transformers_version": "5.5.3", + "lm_eval_version": "0.4.11", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|PAD_TOKEN|>", + "151669" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "model_name_sanitized": "__home__unsloth__scp_stage1_cpt__artifacts__cpt_full_96gb_qwen3_4b__checkpoints", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": null, + "chat_template": null, + "chat_template_sha": null, + "total_evaluation_time_seconds": "580.1511918641627" + }, + "base": { + "results": { + "arc_challenge": { + "alias": "arc_challenge", + "acc,none": 0.4525, + "acc_stderr,none": 0.024918098926991643, + "acc_norm,none": 0.4975, + "acc_norm_stderr,none": 0.0250309958227734 + }, + "arc_easy": { + "alias": "arc_easy", + "acc,none": 0.7625, + "acc_stderr,none": 0.02130420258115865, + "acc_norm,none": 0.755, + "acc_norm_stderr,none": 0.02153129097913246 + }, + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.4925, + "acc_stderr,none": 0.025028492535438325, + "acc_norm,none": 0.6225, + "acc_norm_stderr,none": 0.024268431488608636 + }, + "kmmlu": { + "acc,none": 0.47556707712248864, + "acc_stderr,none": 0.003918983222456166, + "alias": "kmmlu" + }, + "kmmlu_applied_science": { + "acc,none": 0.45875, + "acc_stderr,none": 0.007101063857525891, + "alias": " - kmmlu_applied_science" + }, + "kmmlu_aviation_engineering_and_maintenance": { + "alias": " - kmmlu_aviation_engineering_and_maintenance", + "acc,none": 0.4475, + "acc_stderr,none": 0.024892941194307603 + }, + "kmmlu_electronics_engineering": { + "alias": " - kmmlu_electronics_engineering", + "acc,none": 0.65, + "acc_stderr,none": 0.023878346647046 + }, + "kmmlu_energy_management": { + "alias": " - kmmlu_energy_management", + "acc,none": 0.4, + "acc_stderr,none": 0.02452557357939856 + }, + "kmmlu_environmental_science": { + "alias": " - kmmlu_environmental_science", + "acc,none": 0.3875, + "acc_stderr,none": 0.02438947500927543 + }, + "kmmlu_gas_technology_and_engineering": { + "alias": " - kmmlu_gas_technology_and_engineering", + "acc,none": 0.3775, + "acc_stderr,none": 0.02426843148860864 + }, + "kmmlu_geomatics": { + "alias": " - kmmlu_geomatics", + "acc,none": 0.4325, + "acc_stderr,none": 0.024802162065186362 + }, + "kmmlu_industrial_engineer": { + "alias": " - kmmlu_industrial_engineer", + "acc,none": 0.4275, + "acc_stderr,none": 0.024766769210836766 + }, + "kmmlu_machine_design_and_manufacturing": { + "alias": " - kmmlu_machine_design_and_manufacturing", + "acc,none": 0.52, + "acc_stderr,none": 0.025011275652681887 + }, + "kmmlu_maritime_engineering": { + "alias": " - kmmlu_maritime_engineering", + "acc,none": 0.405, + "acc_stderr,none": 0.024575340657273674 + }, + "kmmlu_nondestructive_testing": { + "alias": " - kmmlu_nondestructive_testing", + "acc,none": 0.4825, + "acc_stderr,none": 0.025015972341295333 + }, + "kmmlu_railway_and_automotive_engineering": { + "alias": " - kmmlu_railway_and_automotive_engineering", + "acc,none": 0.3875, + "acc_stderr,none": 0.02438947500927542 + }, + "kmmlu_telecommunications_and_wireless_technology": { + "alias": " - kmmlu_telecommunications_and_wireless_technology", + "acc,none": 0.5875, + "acc_stderr,none": 
0.024645036407943802 + }, + "kmmlu_humss": { + "acc,none": 0.4805860805860806, + "acc_stderr,none": 0.009419825503999339, + "alias": " - kmmlu_humss" + }, + "kmmlu_accounting": { + "alias": " - kmmlu_accounting", + "acc,none": 0.49, + "acc_stderr,none": 0.05024183937956912 + }, + "kmmlu_criminal_law": { + "alias": " - kmmlu_criminal_law", + "acc,none": 0.39, + "acc_stderr,none": 0.03457567623250011 + }, + "kmmlu_economics": { + "alias": " - kmmlu_economics", + "acc,none": 0.5615384615384615, + "acc_stderr,none": 0.04368784779071991 + }, + "kmmlu_education": { + "alias": " - kmmlu_education", + "acc,none": 0.65, + "acc_stderr,none": 0.047937248544110196 + }, + "kmmlu_korean_history": { + "alias": " - kmmlu_korean_history", + "acc,none": 0.24, + "acc_stderr,none": 0.04292346959909284 + }, + "kmmlu_law": { + "alias": " - kmmlu_law", + "acc,none": 0.3875, + "acc_stderr,none": 0.024389475009275435 + }, + "kmmlu_management": { + "alias": " - kmmlu_management", + "acc,none": 0.53, + "acc_stderr,none": 0.02498621173652297 + }, + "kmmlu_political_science_and_sociology": { + "alias": " - kmmlu_political_science_and_sociology", + "acc,none": 0.5466666666666666, + "acc_stderr,none": 0.028789526978043094 + }, + "kmmlu_psychology": { + "alias": " - kmmlu_psychology", + "acc,none": 0.4275, + "acc_stderr,none": 0.02476676921083677 + }, + "kmmlu_social_welfare": { + "alias": " - kmmlu_social_welfare", + "acc,none": 0.585, + "acc_stderr,none": 0.02466695454685353 + }, + "kmmlu_taxation": { + "alias": " - kmmlu_taxation", + "acc,none": 0.435, + "acc_stderr,none": 0.03514328173714407 + }, + "kmmlu_other": { + "acc,none": 0.4772222222222222, + "acc_stderr,none": 0.008073884461069719, + "alias": " - kmmlu_other" + }, + "kmmlu_agricultural_sciences": { + "alias": " - kmmlu_agricultural_sciences", + "acc,none": 0.3625, + "acc_stderr,none": 0.024066207238097725 + }, + "kmmlu_construction": { + "alias": " - kmmlu_construction", + "acc,none": 0.3925, + "acc_stderr,none": 0.024445927747963316 + }, + "kmmlu_fashion": { + "alias": " - kmmlu_fashion", + "acc,none": 0.4575, + "acc_stderr,none": 0.024940719189394073 + }, + "kmmlu_food_processing": { + "alias": " - kmmlu_food_processing", + "acc,none": 0.39, + "acc_stderr,none": 0.024418038445046374 + }, + "kmmlu_health": { + "alias": " - kmmlu_health", + "acc,none": 0.63, + "acc_stderr,none": 0.048523658709391 + }, + "kmmlu_interior_architecture_and_design": { + "alias": " - kmmlu_interior_architecture_and_design", + "acc,none": 0.6025, + "acc_stderr,none": 0.024499693108404712 + }, + "kmmlu_marketing": { + "alias": " - kmmlu_marketing", + "acc,none": 0.76, + "acc_stderr,none": 0.021380899352993952 + }, + "kmmlu_patent": { + "alias": " - kmmlu_patent", + "acc,none": 0.46, + "acc_stderr,none": 0.05009082659620332 + }, + "kmmlu_public_safety": { + "alias": " - kmmlu_public_safety", + "acc,none": 0.4025, + "acc_stderr,none": 0.024550788746396206 + }, + "kmmlu_real_estate": { + "alias": " - kmmlu_real_estate", + "acc,none": 0.485, + "acc_stderr,none": 0.03542810683297719 + }, + "kmmlu_refrigerating_machinery": { + "alias": " - kmmlu_refrigerating_machinery", + "acc,none": 0.4125, + "acc_stderr,none": 0.024645036407943802 + }, + "kmmlu_stem": { + "acc,none": 0.4897674418604651, + "acc_stderr,none": 0.007312394370135803, + "alias": " - kmmlu_stem" + }, + "kmmlu_biology": { + "alias": " - kmmlu_biology", + "acc,none": 0.3225, + "acc_stderr,none": 0.023400926978618723 + }, + "kmmlu_chemical_engineering": { + "alias": " - kmmlu_chemical_engineering", + "acc,none": 0.4875, + 
"acc_stderr,none": 0.025023485209500245 + }, + "kmmlu_chemistry": { + "alias": " - kmmlu_chemistry", + "acc,none": 0.5175, + "acc_stderr,none": 0.02501597234129533 + }, + "kmmlu_civil_engineering": { + "alias": " - kmmlu_civil_engineering", + "acc,none": 0.3825, + "acc_stderr,none": 0.024330316186072946 + }, + "kmmlu_computer_science": { + "alias": " - kmmlu_computer_science", + "acc,none": 0.75, + "acc_stderr,none": 0.021677749238103 + }, + "kmmlu_ecology": { + "alias": " - kmmlu_ecology", + "acc,none": 0.5425, + "acc_stderr,none": 0.024940719189394077 + }, + "kmmlu_electrical_engineering": { + "alias": " - kmmlu_electrical_engineering", + "acc,none": 0.355, + "acc_stderr,none": 0.023955629410456463 + }, + "kmmlu_information_technology": { + "alias": " - kmmlu_information_technology", + "acc,none": 0.75, + "acc_stderr,none": 0.021677749238103 + }, + "kmmlu_materials_engineering": { + "alias": " - kmmlu_materials_engineering", + "acc,none": 0.495, + "acc_stderr,none": 0.025030057119361453 + }, + "kmmlu_math": { + "alias": " - kmmlu_math", + "acc,none": 0.3333333333333333, + "acc_stderr,none": 0.027262027336984396 + }, + "kmmlu_mechanical_engineering": { + "alias": " - kmmlu_mechanical_engineering", + "acc,none": 0.4125, + "acc_stderr,none": 0.024645036407943802 + }, + "kobest_boolq": { + "alias": "kobest_boolq", + "acc,none": 0.6675, + "acc_stderr,none": 0.023584952830141535, + "f1,none": 0.6247575383530242, + "f1_stderr,none": "N/A" + }, + "kobest_copa": { + "alias": "kobest_copa", + "acc,none": 0.6475, + "acc_stderr,none": 0.023917346710791564, + "f1,none": 0.6473920138042275, + "f1_stderr,none": "N/A" + }, + "kobest_hellaswag": { + "alias": "kobest_hellaswag", + "acc,none": 0.44, + "acc_stderr,none": 0.02485042976789583, + "f1,none": 0.4328647077786627, + "f1_stderr,none": "N/A", + "acc_norm,none": 0.5825, + "acc_norm_stderr,none": 0.024688218756390913 + }, + "mmlu": { + "acc,none": 0.7404266255461321, + "acc_stderr,none": 0.003869340083262106, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6931079323797139, + "acc_stderr,none": 0.0077779673157217745, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.5793650793650794, + "acc_stderr,none": 0.04415438226743745 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.7818181818181819, + "acc_stderr,none": 0.03225078108306289 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.8284313725490197, + "acc_stderr,none": 0.02646056956124065 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.8438818565400844, + "acc_stderr,none": 0.023627159460318684 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.8016528925619835, + "acc_stderr,none": 0.03640118271990946 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.7962962962962963, + "acc_stderr,none": 0.03893542518824847 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.8404907975460123, + "acc_stderr,none": 0.02876748172598387 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.7543352601156069, + "acc_stderr,none": 0.023176298203992 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.3475, + "acc_stderr,none": 0.023838625698390636 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.7588424437299035, + 
"acc_stderr,none": 0.02429659403476343 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.7870370370370371, + "acc_stderr,none": 0.02277971908873339 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.53, + "acc_stderr,none": 0.02498621173652297 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.8070175438596491, + "acc_stderr,none": 0.030267457554898458 + }, + "mmlu_other": { + "acc,none": 0.7437591776798825, + "acc_stderr,none": 0.008056333552095894, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.75, + "acc_stderr,none": 0.04351941398892446 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.7773584905660378, + "acc_stderr,none": 0.0256042334708991 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.7341040462427746, + "acc_stderr,none": 0.03368762932259431 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.43, + "acc_stderr,none": 0.04975698519562429 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.7488789237668162, + "acc_stderr,none": 0.02910522083322461 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.8932038834951457, + "acc_stderr,none": 0.030581088928331356 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.9145299145299145, + "acc_stderr,none": 0.018315891685625862 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.8, + "acc_stderr,none": 0.04020151261036846 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.82, + "acc_stderr,none": 0.01923342954415769 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.7745098039215687, + "acc_stderr,none": 0.023929155517351277 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.5709219858156028, + "acc_stderr,none": 0.02952591430255856 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.7757352941176471, + "acc_stderr,none": 0.025336848563332365 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.5120481927710844, + "acc_stderr,none": 0.03891364495835817 + }, + "mmlu_social_sciences": { + "acc,none": 0.8202205882352941, + "acc_stderr,none": 0.007248431086566561, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.6578947368421053, + "acc_stderr,none": 0.04462917535336937 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.8737373737373737, + "acc_stderr,none": 0.02366435940288024 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.8756476683937824, + "acc_stderr,none": 0.023814477086593556 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.8076923076923077, + "acc_stderr,none": 0.019982347208637292 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.8991596638655462, + "acc_stderr,none": 0.019559663430480802 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.9025, + "acc_stderr,none": 0.0148504449187799 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.7862595419847328, + "acc_stderr,none": 0.035954616117746904 + }, + 
"mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.7475, + "acc_stderr,none": 0.0217495282695941 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.6818181818181818, + "acc_stderr,none": 0.04461272175910509 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.7673469387755102, + "acc_stderr,none": 0.02704925791589618 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.845771144278607, + "acc_stderr,none": 0.02553843336857833 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.89, + "acc_stderr,none": 0.03144660377352203 + }, + "mmlu_stem": { + "acc,none": 0.7148747224865207, + "acc_stderr,none": 0.007751851248299227, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.47, + "acc_stderr,none": 0.050161355804659205 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.6888888888888889, + "acc_stderr,none": 0.03999262876617723 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.8421052631578947, + "acc_stderr,none": 0.02967416752010141 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.8402777777777778, + "acc_stderr,none": 0.030635578972093267 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.56, + "acc_stderr,none": 0.049888765156985884 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.66, + "acc_stderr,none": 0.04760952285695237 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.52, + "acc_stderr,none": 0.050211673156867795 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.5686274509803921, + "acc_stderr,none": 0.04928099597287534 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.83, + "acc_stderr,none": 0.0377525168068637 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.7957446808510639, + "acc_stderr,none": 0.026355158413349428 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.7517241379310344, + "acc_stderr,none": 0.036001056927277716 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.7116402116402116, + "acc_stderr,none": 0.023330654054535903 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.9161290322580645, + "acc_stderr,none": 0.015769027496775653 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.7192118226600985, + "acc_stderr,none": 0.03161856335358611 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.87, + "acc_stderr,none": 0.03379976689896309 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.5222222222222223, + "acc_stderr,none": 0.030455413985678408 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.6754966887417219, + "acc_stderr,none": 0.038227469376587525 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.7222222222222222, + "acc_stderr,none": 0.030546745264953185 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.6160714285714286, + "acc_stderr,none": 0.04616143075028546 + }, + 
"winogrande": { + "alias": "winogrande", + "acc,none": 0.7375, + "acc_stderr,none": 0.022027196108925243 + } + }, + "groups": { + "kmmlu": { + "acc,none": 0.47556707712248864, + "acc_stderr,none": 0.003918983222456166, + "alias": "kmmlu" + }, + "kmmlu_applied_science": { + "acc,none": 0.45875, + "acc_stderr,none": 0.007101063857525891, + "alias": " - kmmlu_applied_science" + }, + "kmmlu_humss": { + "acc,none": 0.4805860805860806, + "acc_stderr,none": 0.009419825503999339, + "alias": " - kmmlu_humss" + }, + "kmmlu_other": { + "acc,none": 0.4772222222222222, + "acc_stderr,none": 0.008073884461069719, + "alias": " - kmmlu_other" + }, + "kmmlu_stem": { + "acc,none": 0.4897674418604651, + "acc_stderr,none": 0.007312394370135803, + "alias": " - kmmlu_stem" + }, + "mmlu": { + "acc,none": 0.7404266255461321, + "acc_stderr,none": 0.003869340083262106, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6931079323797139, + "acc_stderr,none": 0.0077779673157217745, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.7437591776798825, + "acc_stderr,none": 0.008056333552095894, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.8202205882352941, + "acc_stderr,none": 0.007248431086566561, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.7148747224865207, + "acc_stderr,none": 0.007751851248299227, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_formal_logic", + "mmlu_high_school_european_history", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_law", + "mmlu_world_religions" + ], + "mmlu_social_sciences": [ + "mmlu_econometrics", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_psychology", + "mmlu_human_sexuality", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy" + ], + "mmlu_other": [ + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_medicine", + "mmlu_global_facts", + "mmlu_human_aging", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_nutrition", + "mmlu_professional_accounting", + "mmlu_professional_medicine", + "mmlu_virology" + ], + "mmlu_stem": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_mathematics", + "mmlu_high_school_physics", + "mmlu_high_school_statistics", + "mmlu_machine_learning" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ], + "hellaswag": [], + "arc_easy": [], + "arc_challenge": [], + "winogrande": [], + "kmmlu_humss": [ + "kmmlu_accounting", + "kmmlu_criminal_law", + "kmmlu_economics", + "kmmlu_education", + "kmmlu_korean_history", + "kmmlu_law", + "kmmlu_management", + "kmmlu_political_science_and_sociology", + "kmmlu_psychology", + 
"kmmlu_social_welfare", + "kmmlu_taxation" + ], + "kmmlu_applied_science": [ + "kmmlu_aviation_engineering_and_maintenance", + "kmmlu_electronics_engineering", + "kmmlu_energy_management", + "kmmlu_environmental_science", + "kmmlu_gas_technology_and_engineering", + "kmmlu_geomatics", + "kmmlu_industrial_engineer", + "kmmlu_machine_design_and_manufacturing", + "kmmlu_maritime_engineering", + "kmmlu_nondestructive_testing", + "kmmlu_railway_and_automotive_engineering", + "kmmlu_telecommunications_and_wireless_technology" + ], + "kmmlu_other": [ + "kmmlu_agricultural_sciences", + "kmmlu_construction", + "kmmlu_fashion", + "kmmlu_food_processing", + "kmmlu_health", + "kmmlu_interior_architecture_and_design", + "kmmlu_marketing", + "kmmlu_patent", + "kmmlu_public_safety", + "kmmlu_real_estate", + "kmmlu_refrigerating_machinery" + ], + "kmmlu_stem": [ + "kmmlu_biology", + "kmmlu_chemical_engineering", + "kmmlu_chemistry", + "kmmlu_civil_engineering", + "kmmlu_computer_science", + "kmmlu_ecology", + "kmmlu_electrical_engineering", + "kmmlu_information_technology", + "kmmlu_materials_engineering", + "kmmlu_math", + "kmmlu_mechanical_engineering" + ], + "kmmlu": [ + "kmmlu_stem", + "kmmlu_other", + "kmmlu_applied_science", + "kmmlu_humss" + ], + "kobest_boolq": [], + "kobest_copa": [], + "kobest_hellaswag": [] + }, + "configs": { + "arc_challenge": { + "task": "arc_challenge", + "tag": [ + "ai2_arc" + ], + "dataset_path": "allenai/ai2_arc", + "dataset_name": "ARC-Challenge", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{choices.label.index(answerKey)}}", + "unsafe_code": false, + "doc_to_choice": "{{choices.text}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_choice": "{{choices.text}}", + "doc_to_target": "{{choices.label.index(answerKey)}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "arc_easy": { + "task": "arc_easy", + "tag": [ + "ai2_arc" + ], + "dataset_path": "allenai/ai2_arc", + "dataset_name": "ARC-Easy", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{choices.label.index(answerKey)}}", + "unsafe_code": false, + "doc_to_choice": "{{choices.text}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_choice": "{{choices.text}}", + "doc_to_target": "{{choices.label.index(answerKey)}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + 
"metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "Rowan/hellaswag", + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": "", + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{query}}", + "doc_to_choice": "choices", + "doc_to_target": "{{label}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_accounting": { + "task": "kmmlu_accounting", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Accounting", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_agricultural_sciences": { + "task": "kmmlu_agricultural_sciences", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Agricultural-Sciences", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_aviation_engineering_and_maintenance": { + "task": "kmmlu_aviation_engineering_and_maintenance", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Aviation-Engineering-and-Maintenance", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_biology": { + "task": "kmmlu_biology", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Biology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_chemical_engineering": { + "task": "kmmlu_chemical_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Chemical-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_chemistry": { + "task": "kmmlu_chemistry", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Chemistry", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_civil_engineering": { + "task": "kmmlu_civil_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Civil-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_computer_science": { + "task": "kmmlu_computer_science", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Computer-Science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_construction": { + "task": "kmmlu_construction", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Construction", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_criminal_law": { + "task": "kmmlu_criminal_law", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Criminal-Law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_ecology": { + "task": "kmmlu_ecology", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Ecology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_economics": { + "task": "kmmlu_economics", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Economics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. 
{{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_education": { + "task": "kmmlu_education", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Education", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_electrical_engineering": { + "task": "kmmlu_electrical_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Electrical-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_electronics_engineering": { + "task": "kmmlu_electronics_engineering", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Electronics-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_energy_management": { + "task": "kmmlu_energy_management", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Energy-Management", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_environmental_science": { + "task": "kmmlu_environmental_science", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Environmental-Science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_fashion": { + "task": "kmmlu_fashion", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Fashion", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_food_processing": { + "task": "kmmlu_food_processing", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Food-Processing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_gas_technology_and_engineering": { + "task": "kmmlu_gas_technology_and_engineering", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Gas-Technology-and-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_geomatics": { + "task": "kmmlu_geomatics", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Geomatics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_health": { + "task": "kmmlu_health", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Health", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_industrial_engineer": { + "task": "kmmlu_industrial_engineer", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Industrial-Engineer", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_information_technology": { + "task": "kmmlu_information_technology", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Information-Technology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_interior_architecture_and_design": { + "task": "kmmlu_interior_architecture_and_design", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Interior-Architecture-and-Design", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_korean_history": { + "task": "kmmlu_korean_history", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Korean-History", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_law": { + "task": "kmmlu_law", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_machine_design_and_manufacturing": { + "task": "kmmlu_machine_design_and_manufacturing", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Machine-Design-and-Manufacturing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_management": { + "task": "kmmlu_management", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Management", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_maritime_engineering": { + "task": "kmmlu_maritime_engineering", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Maritime-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_marketing": { + "task": "kmmlu_marketing", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Marketing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_materials_engineering": { + "task": "kmmlu_materials_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Materials-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_math": { + "task": "kmmlu_math", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Math", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_mechanical_engineering": { + "task": "kmmlu_mechanical_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Mechanical-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_nondestructive_testing": { + "task": "kmmlu_nondestructive_testing", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Nondestructive-Testing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_patent": { + "task": "kmmlu_patent", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Patent", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_political_science_and_sociology": { + "task": "kmmlu_political_science_and_sociology", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Political-Science-and-Sociology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_psychology": { + "task": "kmmlu_psychology", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Psychology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_public_safety": { + "task": "kmmlu_public_safety", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Public-Safety", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_railway_and_automotive_engineering": { + "task": "kmmlu_railway_and_automotive_engineering", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Railway-and-Automotive-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_real_estate": { + "task": "kmmlu_real_estate", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Real-Estate", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_refrigerating_machinery": { + "task": "kmmlu_refrigerating_machinery", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Refrigerating-Machinery", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_social_welfare": { + "task": "kmmlu_social_welfare", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Social-Welfare", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_taxation": { + "task": "kmmlu_taxation", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Taxation", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_telecommunications_and_wireless_technology": { + "task": "kmmlu_telecommunications_and_wireless_technology", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Telecommunications-and-Wireless-Technology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kobest_boolq": { + "task": "kobest_boolq", + "dataset_path": "skt/kobest_v1", + "dataset_name": "boolq", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "doc_to_text": "{{paragraph}} 질문: {{question}} 답변: ", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": [ + "아니오", + "예" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{paragraph}} 질문: {{question}} 답변: ", + "doc_to_choice": [ + "아니오", + "예" + ], + "doc_to_target": "{{label}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "f1", + "aggregation": "def macro_f1_score(items):\n from sklearn.metrics import f1_score\n\n unzipped_list = list(zip(*items))\n golds = unzipped_list[0]\n preds = unzipped_list[1]\n fscore = f1_score(golds, preds, average=\"macro\")\n return fscore\n", + "average": "macro", + "hf_evaluate": true, + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kobest_copa": { + "task": "kobest_copa", + "dataset_path": "skt/kobest_v1", + "dataset_name": "copa", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "doc_to_text": "def copa_doc_to_text(doc: dict) -> str:\n connector = {\"원인\": \" 왜냐하면\", \"결과\": \" 그래서\"}[doc[\"question\"].strip()]\n return f\"\"\"{doc[\"premise\"]} {connector}\"\"\"\n", + "doc_to_target": "def copa_doc_to_target(doc: dict) -> str:\n correct_choice = doc[\"alternative_1\"] if doc[\"label\"] == 0 else doc[\"alternative_2\"]\n return f\"\"\"{correct_choice}\"\"\"\n", + "unsafe_code": false, + "doc_to_choice": "def copa_doc_to_choice(doc: dict) -> list:\n return [f\"\"\"{doc[\"alternative_1\"]}\"\"\", f\"\"\"{doc[\"alternative_2\"]}\"\"\"]\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "", + "doc_to_choice": "", + "doc_to_target": "", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + 
"aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "f1", + "aggregation": "def macro_f1_score(items):\n from sklearn.metrics import f1_score\n\n unzipped_list = list(zip(*items))\n golds = unzipped_list[0]\n preds = unzipped_list[1]\n fscore = f1_score(golds, preds, average=\"macro\")\n return fscore\n", + "average": "macro", + "hf_evaluate": true, + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kobest_hellaswag": { + "task": "kobest_hellaswag", + "dataset_path": "skt/kobest_v1", + "dataset_name": "hellaswag", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "process_docs": "def hellaswag_process_doc(doc: Dataset) -> Dataset:\n def preprocessor(dataset):\n return {\n \"query\": f\"\"\"문장: {dataset[\"context\"]}\"\"\",\n \"choices\": [\n dataset[\"ending_1\"],\n dataset[\"ending_2\"],\n dataset[\"ending_3\"],\n dataset[\"ending_4\"],\n ],\n \"gold\": int(dataset[\"label\"]),\n }\n\n return doc.map(preprocessor)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": "", + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{query}}", + "doc_to_choice": "choices", + "doc_to_target": "{{label}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "f1", + "aggregation": "def macro_f1_score(items):\n from sklearn.metrics import f1_score\n\n unzipped_list = list(zip(*items))\n golds = unzipped_list[0]\n preds = unzipped_list[1]\n fscore = f1_score(golds, preds, average=\"macro\")\n return fscore\n", + "average": "macro", + "hf_evaluate": true, + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. 
{{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. 
{{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. 
{{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "winogrande": { + "task": "winogrande", + "dataset_path": "allenai/winogrande", + "dataset_name": "winogrande_xl", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "def doc_to_text(doc):\n answer_to_num = {\"1\": 0, \"2\": 1}\n return answer_to_num[doc[\"answer\"]]\n", + "doc_to_target": "def doc_to_target(doc):\n idx = doc[\"sentence\"].index(\"_\") + 1\n return doc[\"sentence\"][idx:].strip()\n", + "unsafe_code": false, + "doc_to_choice": "def doc_to_choice(doc):\n idx = doc[\"sentence\"].index(\"_\")\n options = [doc[\"option1\"], doc[\"option2\"]]\n return [doc[\"sentence\"][:idx] + opt for opt in options]\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "", + "doc_to_choice": "", + "doc_to_target": "", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "sentence", + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + } + }, + "versions": { + "arc_challenge": 1.0, + "arc_easy": 1.0, + "hellaswag": 1.0, + "kmmlu": 2.0, + "kmmlu_accounting": 2.0, + "kmmlu_agricultural_sciences": 2.0, + "kmmlu_applied_science": 2.0, + "kmmlu_aviation_engineering_and_maintenance": 2.0, + 
"kmmlu_biology": 2.0, + "kmmlu_chemical_engineering": 2.0, + "kmmlu_chemistry": 2.0, + "kmmlu_civil_engineering": 2.0, + "kmmlu_computer_science": 2.0, + "kmmlu_construction": 2.0, + "kmmlu_criminal_law": 2.0, + "kmmlu_ecology": 2.0, + "kmmlu_economics": 2.0, + "kmmlu_education": 2.0, + "kmmlu_electrical_engineering": 2.0, + "kmmlu_electronics_engineering": 2.0, + "kmmlu_energy_management": 2.0, + "kmmlu_environmental_science": 2.0, + "kmmlu_fashion": 2.0, + "kmmlu_food_processing": 2.0, + "kmmlu_gas_technology_and_engineering": 2.0, + "kmmlu_geomatics": 2.0, + "kmmlu_health": 2.0, + "kmmlu_humss": 2.0, + "kmmlu_industrial_engineer": 2.0, + "kmmlu_information_technology": 2.0, + "kmmlu_interior_architecture_and_design": 2.0, + "kmmlu_korean_history": 2.0, + "kmmlu_law": 2.0, + "kmmlu_machine_design_and_manufacturing": 2.0, + "kmmlu_management": 2.0, + "kmmlu_maritime_engineering": 2.0, + "kmmlu_marketing": 2.0, + "kmmlu_materials_engineering": 2.0, + "kmmlu_math": 2.0, + "kmmlu_mechanical_engineering": 2.0, + "kmmlu_nondestructive_testing": 2.0, + "kmmlu_other": 2.0, + "kmmlu_patent": 2.0, + "kmmlu_political_science_and_sociology": 2.0, + "kmmlu_psychology": 2.0, + "kmmlu_public_safety": 2.0, + "kmmlu_railway_and_automotive_engineering": 2.0, + "kmmlu_real_estate": 2.0, + "kmmlu_refrigerating_machinery": 2.0, + "kmmlu_social_welfare": 2.0, + "kmmlu_stem": 2.0, + "kmmlu_taxation": 2.0, + "kmmlu_telecommunications_and_wireless_technology": 2.0, + "kobest_boolq": 1.0, + "kobest_copa": 1.0, + "kobest_hellaswag": 1.0, + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0, + "winogrande": 
1.0 + }, + "n-shot": { + "arc_challenge": 0, + "arc_easy": 0, + "hellaswag": 0, + "kmmlu_accounting": 0, + "kmmlu_agricultural_sciences": 0, + "kmmlu_aviation_engineering_and_maintenance": 0, + "kmmlu_biology": 0, + "kmmlu_chemical_engineering": 0, + "kmmlu_chemistry": 0, + "kmmlu_civil_engineering": 0, + "kmmlu_computer_science": 0, + "kmmlu_construction": 0, + "kmmlu_criminal_law": 0, + "kmmlu_ecology": 0, + "kmmlu_economics": 0, + "kmmlu_education": 0, + "kmmlu_electrical_engineering": 0, + "kmmlu_electronics_engineering": 0, + "kmmlu_energy_management": 0, + "kmmlu_environmental_science": 0, + "kmmlu_fashion": 0, + "kmmlu_food_processing": 0, + "kmmlu_gas_technology_and_engineering": 0, + "kmmlu_geomatics": 0, + "kmmlu_health": 0, + "kmmlu_industrial_engineer": 0, + "kmmlu_information_technology": 0, + "kmmlu_interior_architecture_and_design": 0, + "kmmlu_korean_history": 0, + "kmmlu_law": 0, + "kmmlu_machine_design_and_manufacturing": 0, + "kmmlu_management": 0, + "kmmlu_maritime_engineering": 0, + "kmmlu_marketing": 0, + "kmmlu_materials_engineering": 0, + "kmmlu_math": 0, + "kmmlu_mechanical_engineering": 0, + "kmmlu_nondestructive_testing": 0, + "kmmlu_patent": 0, + "kmmlu_political_science_and_sociology": 0, + "kmmlu_psychology": 0, + "kmmlu_public_safety": 0, + "kmmlu_railway_and_automotive_engineering": 0, + "kmmlu_real_estate": 0, + "kmmlu_refrigerating_machinery": 0, + "kmmlu_social_welfare": 0, + "kmmlu_taxation": 0, + "kmmlu_telecommunications_and_wireless_technology": 0, + "kobest_boolq": 0, + "kobest_copa": 0, + "kobest_hellaswag": 0, + "mmlu_abstract_algebra": 0, + "mmlu_anatomy": 0, + "mmlu_astronomy": 0, + "mmlu_business_ethics": 0, + "mmlu_clinical_knowledge": 0, + "mmlu_college_biology": 0, + "mmlu_college_chemistry": 0, + "mmlu_college_computer_science": 0, + "mmlu_college_mathematics": 0, + "mmlu_college_medicine": 0, + "mmlu_college_physics": 0, + "mmlu_computer_security": 0, + "mmlu_conceptual_physics": 0, + "mmlu_econometrics": 0, + "mmlu_electrical_engineering": 0, + "mmlu_elementary_mathematics": 0, + "mmlu_formal_logic": 0, + "mmlu_global_facts": 0, + "mmlu_high_school_biology": 0, + "mmlu_high_school_chemistry": 0, + "mmlu_high_school_computer_science": 0, + "mmlu_high_school_european_history": 0, + "mmlu_high_school_geography": 0, + "mmlu_high_school_government_and_politics": 0, + "mmlu_high_school_macroeconomics": 0, + "mmlu_high_school_mathematics": 0, + "mmlu_high_school_microeconomics": 0, + "mmlu_high_school_physics": 0, + "mmlu_high_school_psychology": 0, + "mmlu_high_school_statistics": 0, + "mmlu_high_school_us_history": 0, + "mmlu_high_school_world_history": 0, + "mmlu_human_aging": 0, + "mmlu_human_sexuality": 0, + "mmlu_international_law": 0, + "mmlu_jurisprudence": 0, + "mmlu_logical_fallacies": 0, + "mmlu_machine_learning": 0, + "mmlu_management": 0, + "mmlu_marketing": 0, + "mmlu_medical_genetics": 0, + "mmlu_miscellaneous": 0, + "mmlu_moral_disputes": 0, + "mmlu_moral_scenarios": 0, + "mmlu_nutrition": 0, + "mmlu_philosophy": 0, + "mmlu_prehistory": 0, + "mmlu_professional_accounting": 0, + "mmlu_professional_law": 0, + "mmlu_professional_medicine": 0, + "mmlu_professional_psychology": 0, + "mmlu_public_relations": 0, + "mmlu_security_studies": 0, + "mmlu_sociology": 0, + "mmlu_us_foreign_policy": 0, + "mmlu_virology": 0, + "mmlu_world_religions": 0, + "winogrande": 0 + }, + "higher_is_better": { + "arc_challenge": { + "acc": true, + "acc_norm": true + }, + "arc_easy": { + "acc": true, + "acc_norm": true + }, + "hellaswag": { + "acc": true, + 
"acc_norm": true + }, + "kmmlu": { + "acc": true + }, + "kmmlu_accounting": { + "acc": true + }, + "kmmlu_agricultural_sciences": { + "acc": true + }, + "kmmlu_applied_science": { + "acc": true + }, + "kmmlu_aviation_engineering_and_maintenance": { + "acc": true + }, + "kmmlu_biology": { + "acc": true + }, + "kmmlu_chemical_engineering": { + "acc": true + }, + "kmmlu_chemistry": { + "acc": true + }, + "kmmlu_civil_engineering": { + "acc": true + }, + "kmmlu_computer_science": { + "acc": true + }, + "kmmlu_construction": { + "acc": true + }, + "kmmlu_criminal_law": { + "acc": true + }, + "kmmlu_ecology": { + "acc": true + }, + "kmmlu_economics": { + "acc": true + }, + "kmmlu_education": { + "acc": true + }, + "kmmlu_electrical_engineering": { + "acc": true + }, + "kmmlu_electronics_engineering": { + "acc": true + }, + "kmmlu_energy_management": { + "acc": true + }, + "kmmlu_environmental_science": { + "acc": true + }, + "kmmlu_fashion": { + "acc": true + }, + "kmmlu_food_processing": { + "acc": true + }, + "kmmlu_gas_technology_and_engineering": { + "acc": true + }, + "kmmlu_geomatics": { + "acc": true + }, + "kmmlu_health": { + "acc": true + }, + "kmmlu_humss": { + "acc": true + }, + "kmmlu_industrial_engineer": { + "acc": true + }, + "kmmlu_information_technology": { + "acc": true + }, + "kmmlu_interior_architecture_and_design": { + "acc": true + }, + "kmmlu_korean_history": { + "acc": true + }, + "kmmlu_law": { + "acc": true + }, + "kmmlu_machine_design_and_manufacturing": { + "acc": true + }, + "kmmlu_management": { + "acc": true + }, + "kmmlu_maritime_engineering": { + "acc": true + }, + "kmmlu_marketing": { + "acc": true + }, + "kmmlu_materials_engineering": { + "acc": true + }, + "kmmlu_math": { + "acc": true + }, + "kmmlu_mechanical_engineering": { + "acc": true + }, + "kmmlu_nondestructive_testing": { + "acc": true + }, + "kmmlu_other": { + "acc": true + }, + "kmmlu_patent": { + "acc": true + }, + "kmmlu_political_science_and_sociology": { + "acc": true + }, + "kmmlu_psychology": { + "acc": true + }, + "kmmlu_public_safety": { + "acc": true + }, + "kmmlu_railway_and_automotive_engineering": { + "acc": true + }, + "kmmlu_real_estate": { + "acc": true + }, + "kmmlu_refrigerating_machinery": { + "acc": true + }, + "kmmlu_social_welfare": { + "acc": true + }, + "kmmlu_stem": { + "acc": true + }, + "kmmlu_taxation": { + "acc": true + }, + "kmmlu_telecommunications_and_wireless_technology": { + "acc": true + }, + "kobest_boolq": { + "acc": true, + "f1": true + }, + "kobest_copa": { + "acc": true, + "f1": true + }, + "kobest_hellaswag": { + "acc": true, + "acc_norm": true, + "f1": true + }, + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + 
"mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + }, + "winogrande": { + "acc": true + } + }, + "n-samples": { + "kobest_hellaswag": { + "original": 500, + "effective": 400 + }, + "kobest_copa": { + "original": 1000, + "effective": 400 + }, + "kobest_boolq": { + "original": 1404, + "effective": 400 + }, + "kmmlu_biology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_chemical_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_chemistry": { + "original": 600, + "effective": 400 + }, + "kmmlu_civil_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_computer_science": { + "original": 1000, + "effective": 400 + }, + "kmmlu_ecology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_electrical_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_information_technology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_materials_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_math": { + "original": 300, + "effective": 300 + }, + "kmmlu_mechanical_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_agricultural_sciences": { + "original": 1000, + "effective": 400 + }, + "kmmlu_construction": { + "original": 1000, + "effective": 400 + }, + "kmmlu_fashion": { + "original": 1000, + "effective": 400 + }, + "kmmlu_food_processing": { + "original": 1000, + "effective": 400 + }, + "kmmlu_health": { + "original": 100, + "effective": 100 + }, + "kmmlu_interior_architecture_and_design": { 
+ "original": 1000, + "effective": 400 + }, + "kmmlu_marketing": { + "original": 1000, + "effective": 400 + }, + "kmmlu_patent": { + "original": 100, + "effective": 100 + }, + "kmmlu_public_safety": { + "original": 1000, + "effective": 400 + }, + "kmmlu_real_estate": { + "original": 200, + "effective": 200 + }, + "kmmlu_refrigerating_machinery": { + "original": 1000, + "effective": 400 + }, + "kmmlu_aviation_engineering_and_maintenance": { + "original": 1000, + "effective": 400 + }, + "kmmlu_electronics_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_energy_management": { + "original": 1000, + "effective": 400 + }, + "kmmlu_environmental_science": { + "original": 1000, + "effective": 400 + }, + "kmmlu_gas_technology_and_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_geomatics": { + "original": 1000, + "effective": 400 + }, + "kmmlu_industrial_engineer": { + "original": 1000, + "effective": 400 + }, + "kmmlu_machine_design_and_manufacturing": { + "original": 1000, + "effective": 400 + }, + "kmmlu_maritime_engineering": { + "original": 600, + "effective": 400 + }, + "kmmlu_nondestructive_testing": { + "original": 1000, + "effective": 400 + }, + "kmmlu_railway_and_automotive_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_telecommunications_and_wireless_technology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_accounting": { + "original": 100, + "effective": 100 + }, + "kmmlu_criminal_law": { + "original": 200, + "effective": 200 + }, + "kmmlu_economics": { + "original": 130, + "effective": 130 + }, + "kmmlu_education": { + "original": 100, + "effective": 100 + }, + "kmmlu_korean_history": { + "original": 100, + "effective": 100 + }, + "kmmlu_law": { + "original": 1000, + "effective": 400 + }, + "kmmlu_management": { + "original": 1000, + "effective": 400 + }, + "kmmlu_political_science_and_sociology": { + "original": 300, + "effective": 300 + }, + "kmmlu_psychology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_social_welfare": { + "original": 1000, + "effective": 400 + }, + "kmmlu_taxation": { + "original": 200, + "effective": 200 + }, + "winogrande": { + "original": 1267, + "effective": 400 + }, + "arc_challenge": { + "original": 1172, + "effective": 400 + }, + "arc_easy": { + "original": 2376, + "effective": 400 + }, + "hellaswag": { + "original": 10042, + "effective": 400 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_mathematics": { + 
"original": 270, + "effective": 270 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 400 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 400 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 400 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 400 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 400 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + } + }, + "config": { + "model": "hf", + "model_args": { + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + }, + "model_num_parameters": 4022468096, + "model_dtype": "torch.bfloat16", + "model_revision": "main", + "model_sha": "0573b584bc6b32adc84bb9c91bf9b71bea71fc40", + "batch_size": "12", + "batch_sizes": [], + "device": "cuda:0", + "use_cache": null, + "limit": 400.0, + "bootstrap_iters": 100000, + "gen_kwargs": {}, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 
+ }, + "git_hash": "0ce43af", + "date": 1775962695.520946, + "pretty_env_info": "PyTorch version: 2.9.0+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 4.1.0\nLibc version: glibc-2.35\n\nPython version: 3.11.14 | packaged by conda-forge | (main, Oct 13 2025, 14:09:32) [GCC 14.3.0] (64-bit runtime)\nPython platform: Linux-6.8.0-90-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: GPU 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition\nNvidia driver version: 590.48.01\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_precompiled.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_runtime_compiled.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_graph.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_heuristic.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops.so.9.8.0\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 192\nOn-line CPU(s) list: 0-191\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7642 48-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 2\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2300.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4600.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sev sev_es ibpb_exit_to_user\nVirtualization: AMD-V\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47,96-143\nNUMA node1 CPU(s): 48-95,144-191\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT enabled with STIBP protection\nVulnerability Spec rstack overflow: Mitigation; Safe RET\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: 
Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nVulnerability Vmscape: Mitigation; IBPB before exit to userspace\n\nVersions of relevant libraries:\n[pip3] executorch==1.0.1\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.17.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] optree==0.17.0\n[pip3] pytorch_tokenizers==1.0.1\n[pip3] torch==2.9.0+cu128\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torch-stoi==0.2.3\n[pip3] torchao==0.14.0\n[pip3] torchaudio==2.9.0+cu128\n[pip3] torchcodec==0.9.1\n[pip3] torchelastic==0.2.2\n[pip3] torchvision==0.24.0+cu128\n[pip3] triton==3.5.0\n[pip3] triton_kernels==1.0.0\n[conda] No relevant packages",
+ "transformers_version": "5.5.3",
+ "lm_eval_version": "0.4.11",
+ "upper_git_hash": null,
+ "tokenizer_pad_token": [
+ "<|vision_pad|>",
+ "151654"
+ ],
+ "tokenizer_eos_token": [
+ "<|endoftext|>",
+ "151643"
+ ],
+ "tokenizer_bos_token": [
+ null,
+ "None"
+ ],
+ "eot_token_id": 151643,
+ "max_length": 32768,
+ "task_hashes": {},
+ "model_source": "hf",
+ "model_name": "unsloth/Qwen3-4B-Base",
+ "model_name_sanitized": "unsloth__Qwen3-4B-Base",
+ "system_instruction": null,
+ "system_instruction_sha": null,
+ "fewshot_as_multiturn": null,
+ "chat_template": null,
+ "chat_template_sha": null,
+ "total_evaluation_time_seconds": "573.7631184216589"
+ }
+ }
+}
\ No newline at end of file
diff --git a/eval/lm_eval/checkpoints/base/stdout.txt b/eval/lm_eval/checkpoints/base/stdout.txt
new file mode 100644
index 0000000..2190b6e
--- /dev/null
+++ b/eval/lm_eval/checkpoints/base/stdout.txt
@@ -0,0 +1,2765 @@
+2026-04-12:02:58:12 WARNING [config.evaluate_config:281] --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.
+2026-04-12:02:58:15 INFO [_cli.run:376] Selected Tasks: ['mmlu', 'hellaswag', 'arc_easy', 'arc_challenge', 'winogrande', 'kmmlu', 'kobest_boolq', 'kobest_copa', 'kobest_hellaswag']
+🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
+Unsloth: Your Flash Attention 2 installation seems to be broken. Using Xformers instead. No performance changes will be seen.
+🦥 Unsloth Zoo will now patch everything to make training faster!
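The results JSON above records the full lm-eval-harness invocation for the base model: backend, task list, seeds, batch size, and the `--limit 400` cap that the log explicitly warns about. A minimal reproduction sketch, assuming the lm-eval 0.4.x Python API (`lm_eval.simple_evaluate`) and a single CUDA device; the parameter values are copied from the "config" block, while the call itself is an assumption, not something this repo ships:

```python
# Sketch only: re-run the evaluation recorded in the results JSON above.
# Values come from its "config" block; the simple_evaluate signature is
# assumed from lm-eval 0.4.x and may differ in other versions.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=unsloth/Qwen3-4B-Base,trust_remote_code=True",
    tasks=["mmlu", "hellaswag", "arc_easy", "arc_challenge", "winogrande",
           "kmmlu", "kobest_boolq", "kobest_copa", "kobest_hellaswag"],
    num_fewshot=0,
    batch_size=12,
    device="cuda:0",
    limit=400,  # matches the logged warning: limited runs are for testing only
    random_seed=0,
    numpy_random_seed=1234,
    torch_random_seed=1234,
    fewshot_random_seed=1234,
)
print(results["results"])
```

The "n-samples" block above shows the practical effect of `limit=400`: any task with more than 400 test items is truncated (e.g. hellaswag 10042 → 400 effective), which is why the log insists these numbers should not be read as full-benchmark metrics.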
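The per-task "configs" entries that follow all share one zero-shot multiple-choice template for KMMLU. A small sketch of how the `doc_to_text` / `doc_to_target` pair in those configs turns a dataset row into a prompt and a gold choice; jinja2 is assumed, and the sample row is invented for illustration:

```python
# Sketch of the KMMLU templates used in the task configs below.
# The dataset row here is invented; field names match the configs.
from jinja2 import Template

DOC_TO_TEXT = "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:"
DOC_TO_CHOICE = ["A", "B", "C", "D"]

row = {"question": "물은 몇 도에서 끓는가?", "A": "50도", "B": "80도",
       "C": "100도", "D": "120도", "answer": 3}  # "answer" is 1-indexed

prompt = Template(DOC_TO_TEXT).render(**row)
# doc_to_target is "{{answer-1}}": the 1-indexed answer becomes a 0-based
# index into doc_to_choice, so the gold label for this row is "C".
gold = DOC_TO_CHOICE[row["answer"] - 1]
print(prompt, gold)
```

Under `output_type: multiple_choice`, the harness scores each choice letter as a continuation after "정답:" ("Answer:") and takes the highest-likelihood option, so accuracy is the only metric these tasks report.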
+ +Loading weights: 0%| | 0/398 [00:00 datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": "", + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{query}}", + "doc_to_choice": "choices", + "doc_to_target": "{{label}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_accounting": { + "task": "kmmlu_accounting", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Accounting", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_agricultural_sciences": { + "task": "kmmlu_agricultural_sciences", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Agricultural-Sciences", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_aviation_engineering_and_maintenance": { + "task": "kmmlu_aviation_engineering_and_maintenance", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Aviation-Engineering-and-Maintenance", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_biology": { + "task": "kmmlu_biology", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Biology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_chemical_engineering": { + "task": "kmmlu_chemical_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Chemical-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_chemistry": { + "task": "kmmlu_chemistry", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Chemistry", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_civil_engineering": { + "task": "kmmlu_civil_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Civil-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_computer_science": { + "task": "kmmlu_computer_science", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Computer-Science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_construction": { + "task": "kmmlu_construction", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Construction", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_criminal_law": { + "task": "kmmlu_criminal_law", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Criminal-Law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_ecology": { + "task": "kmmlu_ecology", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Ecology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_economics": { + "task": "kmmlu_economics", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Economics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_education": { + "task": "kmmlu_education", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Education", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. 
{{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_electrical_engineering": { + "task": "kmmlu_electrical_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Electrical-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_electronics_engineering": { + "task": "kmmlu_electronics_engineering", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Electronics-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_energy_management": { + "task": "kmmlu_energy_management", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Energy-Management", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_environmental_science": { + "task": "kmmlu_environmental_science", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Environmental-Science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_fashion": { + "task": "kmmlu_fashion", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Fashion", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_food_processing": { + "task": "kmmlu_food_processing", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Food-Processing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_gas_technology_and_engineering": { + "task": "kmmlu_gas_technology_and_engineering", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Gas-Technology-and-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_geomatics": { + "task": "kmmlu_geomatics", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Geomatics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_health": { + "task": "kmmlu_health", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Health", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_industrial_engineer": { + "task": "kmmlu_industrial_engineer", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Industrial-Engineer", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_information_technology": { + "task": "kmmlu_information_technology", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Information-Technology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_interior_architecture_and_design": { + "task": "kmmlu_interior_architecture_and_design", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Interior-Architecture-and-Design", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_korean_history": { + "task": "kmmlu_korean_history", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Korean-History", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_law": { + "task": "kmmlu_law", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_machine_design_and_manufacturing": { + "task": "kmmlu_machine_design_and_manufacturing", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Machine-Design-and-Manufacturing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_management": { + "task": "kmmlu_management", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Management", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_maritime_engineering": { + "task": "kmmlu_maritime_engineering", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Maritime-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_marketing": { + "task": "kmmlu_marketing", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Marketing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_materials_engineering": { + "task": "kmmlu_materials_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Materials-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_math": { + "task": "kmmlu_math", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Math", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_mechanical_engineering": { + "task": "kmmlu_mechanical_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Mechanical-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_nondestructive_testing": { + "task": "kmmlu_nondestructive_testing", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Nondestructive-Testing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_patent": { + "task": "kmmlu_patent", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Patent", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_political_science_and_sociology": { + "task": "kmmlu_political_science_and_sociology", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Political-Science-and-Sociology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_psychology": { + "task": "kmmlu_psychology", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Psychology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_public_safety": { + "task": "kmmlu_public_safety", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Public-Safety", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_railway_and_automotive_engineering": { + "task": "kmmlu_railway_and_automotive_engineering", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Railway-and-Automotive-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_real_estate": { + "task": "kmmlu_real_estate", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Real-Estate", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_refrigerating_machinery": { + "task": "kmmlu_refrigerating_machinery", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Refrigerating-Machinery", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_social_welfare": { + "task": "kmmlu_social_welfare", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Social-Welfare", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_taxation": { + "task": "kmmlu_taxation", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Taxation", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kmmlu_telecommunications_and_wireless_technology": { + "task": "kmmlu_telecommunications_and_wireless_technology", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Telecommunications-and-Wireless-Technology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kobest_boolq": { + "task": "kobest_boolq", + "dataset_path": "skt/kobest_v1", + "dataset_name": "boolq", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "doc_to_text": "{{paragraph}} 질문: {{question}} 답변: ", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": [ + "아니오", + "예" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{paragraph}} 질문: {{question}} 답변: ", + "doc_to_choice": [ + "아니오", + "예" + ], + "doc_to_target": "{{label}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "f1", + "aggregation": "def macro_f1_score(items):\n from sklearn.metrics import f1_score\n\n unzipped_list = list(zip(*items))\n golds = unzipped_list[0]\n preds = unzipped_list[1]\n fscore = f1_score(golds, preds, average=\"macro\")\n return fscore\n", + "average": "macro", + "hf_evaluate": true, + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kobest_copa": { + "task": "kobest_copa", + "dataset_path": "skt/kobest_v1", + "dataset_name": "copa", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "doc_to_text": "def copa_doc_to_text(doc: dict) -> str:\n connector = {\"원인\": \" 왜냐하면\", \"결과\": \" 그래서\"}[doc[\"question\"].strip()]\n return f\"\"\"{doc[\"premise\"]} {connector}\"\"\"\n", + "doc_to_target": "def copa_doc_to_target(doc: dict) -> str:\n correct_choice = doc[\"alternative_1\"] if doc[\"label\"] == 0 else doc[\"alternative_2\"]\n return f\"\"\"{correct_choice}\"\"\"\n", + "unsafe_code": false, + "doc_to_choice": "def copa_doc_to_choice(doc: dict) -> list:\n return [f\"\"\"{doc[\"alternative_1\"]}\"\"\", f\"\"\"{doc[\"alternative_2\"]}\"\"\"]\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "", + "doc_to_choice": "", + "doc_to_target": "", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "f1", + "aggregation": "def macro_f1_score(items):\n from sklearn.metrics import f1_score\n\n unzipped_list = list(zip(*items))\n golds = unzipped_list[0]\n preds = unzipped_list[1]\n fscore = f1_score(golds, preds, average=\"macro\")\n return fscore\n", + "average": "macro", + "hf_evaluate": true, + "higher_is_better": true + } + ], + 
"output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "kobest_hellaswag": { + "task": "kobest_hellaswag", + "dataset_path": "skt/kobest_v1", + "dataset_name": "hellaswag", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "process_docs": "def hellaswag_process_doc(doc: Dataset) -> Dataset:\n def preprocessor(dataset):\n return {\n \"query\": f\"\"\"문장: {dataset[\"context\"]}\"\"\",\n \"choices\": [\n dataset[\"ending_1\"],\n dataset[\"ending_2\"],\n dataset[\"ending_3\"],\n dataset[\"ending_4\"],\n ],\n \"gold\": int(dataset[\"label\"]),\n }\n\n return doc.map(preprocessor)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": "", + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{query}}", + "doc_to_choice": "choices", + "doc_to_target": "{{label}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "f1", + "aggregation": "def macro_f1_score(items):\n from sklearn.metrics import f1_score\n\n unzipped_list = list(zip(*items))\n golds = unzipped_list[0]\n preds = unzipped_list[1]\n fscore = f1_score(golds, preds, average=\"macro\")\n return fscore\n", + "average": "macro", + "hf_evaluate": true, + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. 
{{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. 
{{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. 
{{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + }, + "winogrande": { + "task": "winogrande", + "dataset_path": "allenai/winogrande", + "dataset_name": "winogrande_xl", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "def doc_to_text(doc):\n answer_to_num = {\"1\": 0, \"2\": 1}\n return answer_to_num[doc[\"answer\"]]\n", + "doc_to_target": "def doc_to_target(doc):\n idx = doc[\"sentence\"].index(\"_\") + 1\n return doc[\"sentence\"][idx:].strip()\n", + "unsafe_code": false, + "doc_to_choice": "def doc_to_choice(doc):\n idx = doc[\"sentence\"].index(\"_\")\n options = [doc[\"option1\"], doc[\"option2\"]]\n return [doc[\"sentence\"][:idx] + opt for opt in options]\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "", + "doc_to_choice": "", + "doc_to_target": "", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "sentence", + "metadata": { + "version": 1.0, + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + } + } + }, + "versions": { + "arc_challenge": 1.0, + "arc_easy": 1.0, + "hellaswag": 1.0, + "kmmlu": 2.0, + "kmmlu_accounting": 2.0, + "kmmlu_agricultural_sciences": 2.0, + "kmmlu_applied_science": 2.0, + "kmmlu_aviation_engineering_and_maintenance": 2.0, + 
"kmmlu_biology": 2.0, + "kmmlu_chemical_engineering": 2.0, + "kmmlu_chemistry": 2.0, + "kmmlu_civil_engineering": 2.0, + "kmmlu_computer_science": 2.0, + "kmmlu_construction": 2.0, + "kmmlu_criminal_law": 2.0, + "kmmlu_ecology": 2.0, + "kmmlu_economics": 2.0, + "kmmlu_education": 2.0, + "kmmlu_electrical_engineering": 2.0, + "kmmlu_electronics_engineering": 2.0, + "kmmlu_energy_management": 2.0, + "kmmlu_environmental_science": 2.0, + "kmmlu_fashion": 2.0, + "kmmlu_food_processing": 2.0, + "kmmlu_gas_technology_and_engineering": 2.0, + "kmmlu_geomatics": 2.0, + "kmmlu_health": 2.0, + "kmmlu_humss": 2.0, + "kmmlu_industrial_engineer": 2.0, + "kmmlu_information_technology": 2.0, + "kmmlu_interior_architecture_and_design": 2.0, + "kmmlu_korean_history": 2.0, + "kmmlu_law": 2.0, + "kmmlu_machine_design_and_manufacturing": 2.0, + "kmmlu_management": 2.0, + "kmmlu_maritime_engineering": 2.0, + "kmmlu_marketing": 2.0, + "kmmlu_materials_engineering": 2.0, + "kmmlu_math": 2.0, + "kmmlu_mechanical_engineering": 2.0, + "kmmlu_nondestructive_testing": 2.0, + "kmmlu_other": 2.0, + "kmmlu_patent": 2.0, + "kmmlu_political_science_and_sociology": 2.0, + "kmmlu_psychology": 2.0, + "kmmlu_public_safety": 2.0, + "kmmlu_railway_and_automotive_engineering": 2.0, + "kmmlu_real_estate": 2.0, + "kmmlu_refrigerating_machinery": 2.0, + "kmmlu_social_welfare": 2.0, + "kmmlu_stem": 2.0, + "kmmlu_taxation": 2.0, + "kmmlu_telecommunications_and_wireless_technology": 2.0, + "kobest_boolq": 1.0, + "kobest_copa": 1.0, + "kobest_hellaswag": 1.0, + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0, + "winogrande": 
1.0 + }, + "n-shot": { + "arc_challenge": 0, + "arc_easy": 0, + "hellaswag": 0, + "kmmlu_accounting": 0, + "kmmlu_agricultural_sciences": 0, + "kmmlu_aviation_engineering_and_maintenance": 0, + "kmmlu_biology": 0, + "kmmlu_chemical_engineering": 0, + "kmmlu_chemistry": 0, + "kmmlu_civil_engineering": 0, + "kmmlu_computer_science": 0, + "kmmlu_construction": 0, + "kmmlu_criminal_law": 0, + "kmmlu_ecology": 0, + "kmmlu_economics": 0, + "kmmlu_education": 0, + "kmmlu_electrical_engineering": 0, + "kmmlu_electronics_engineering": 0, + "kmmlu_energy_management": 0, + "kmmlu_environmental_science": 0, + "kmmlu_fashion": 0, + "kmmlu_food_processing": 0, + "kmmlu_gas_technology_and_engineering": 0, + "kmmlu_geomatics": 0, + "kmmlu_health": 0, + "kmmlu_industrial_engineer": 0, + "kmmlu_information_technology": 0, + "kmmlu_interior_architecture_and_design": 0, + "kmmlu_korean_history": 0, + "kmmlu_law": 0, + "kmmlu_machine_design_and_manufacturing": 0, + "kmmlu_management": 0, + "kmmlu_maritime_engineering": 0, + "kmmlu_marketing": 0, + "kmmlu_materials_engineering": 0, + "kmmlu_math": 0, + "kmmlu_mechanical_engineering": 0, + "kmmlu_nondestructive_testing": 0, + "kmmlu_patent": 0, + "kmmlu_political_science_and_sociology": 0, + "kmmlu_psychology": 0, + "kmmlu_public_safety": 0, + "kmmlu_railway_and_automotive_engineering": 0, + "kmmlu_real_estate": 0, + "kmmlu_refrigerating_machinery": 0, + "kmmlu_social_welfare": 0, + "kmmlu_taxation": 0, + "kmmlu_telecommunications_and_wireless_technology": 0, + "kobest_boolq": 0, + "kobest_copa": 0, + "kobest_hellaswag": 0, + "mmlu_abstract_algebra": 0, + "mmlu_anatomy": 0, + "mmlu_astronomy": 0, + "mmlu_business_ethics": 0, + "mmlu_clinical_knowledge": 0, + "mmlu_college_biology": 0, + "mmlu_college_chemistry": 0, + "mmlu_college_computer_science": 0, + "mmlu_college_mathematics": 0, + "mmlu_college_medicine": 0, + "mmlu_college_physics": 0, + "mmlu_computer_security": 0, + "mmlu_conceptual_physics": 0, + "mmlu_econometrics": 0, + "mmlu_electrical_engineering": 0, + "mmlu_elementary_mathematics": 0, + "mmlu_formal_logic": 0, + "mmlu_global_facts": 0, + "mmlu_high_school_biology": 0, + "mmlu_high_school_chemistry": 0, + "mmlu_high_school_computer_science": 0, + "mmlu_high_school_european_history": 0, + "mmlu_high_school_geography": 0, + "mmlu_high_school_government_and_politics": 0, + "mmlu_high_school_macroeconomics": 0, + "mmlu_high_school_mathematics": 0, + "mmlu_high_school_microeconomics": 0, + "mmlu_high_school_physics": 0, + "mmlu_high_school_psychology": 0, + "mmlu_high_school_statistics": 0, + "mmlu_high_school_us_history": 0, + "mmlu_high_school_world_history": 0, + "mmlu_human_aging": 0, + "mmlu_human_sexuality": 0, + "mmlu_international_law": 0, + "mmlu_jurisprudence": 0, + "mmlu_logical_fallacies": 0, + "mmlu_machine_learning": 0, + "mmlu_management": 0, + "mmlu_marketing": 0, + "mmlu_medical_genetics": 0, + "mmlu_miscellaneous": 0, + "mmlu_moral_disputes": 0, + "mmlu_moral_scenarios": 0, + "mmlu_nutrition": 0, + "mmlu_philosophy": 0, + "mmlu_prehistory": 0, + "mmlu_professional_accounting": 0, + "mmlu_professional_law": 0, + "mmlu_professional_medicine": 0, + "mmlu_professional_psychology": 0, + "mmlu_public_relations": 0, + "mmlu_security_studies": 0, + "mmlu_sociology": 0, + "mmlu_us_foreign_policy": 0, + "mmlu_virology": 0, + "mmlu_world_religions": 0, + "winogrande": 0 + }, + "higher_is_better": { + "arc_challenge": { + "acc": true, + "acc_norm": true + }, + "arc_easy": { + "acc": true, + "acc_norm": true + }, + "hellaswag": { + "acc": true, + 
"acc_norm": true + }, + "kmmlu": { + "acc": true + }, + "kmmlu_accounting": { + "acc": true + }, + "kmmlu_agricultural_sciences": { + "acc": true + }, + "kmmlu_applied_science": { + "acc": true + }, + "kmmlu_aviation_engineering_and_maintenance": { + "acc": true + }, + "kmmlu_biology": { + "acc": true + }, + "kmmlu_chemical_engineering": { + "acc": true + }, + "kmmlu_chemistry": { + "acc": true + }, + "kmmlu_civil_engineering": { + "acc": true + }, + "kmmlu_computer_science": { + "acc": true + }, + "kmmlu_construction": { + "acc": true + }, + "kmmlu_criminal_law": { + "acc": true + }, + "kmmlu_ecology": { + "acc": true + }, + "kmmlu_economics": { + "acc": true + }, + "kmmlu_education": { + "acc": true + }, + "kmmlu_electrical_engineering": { + "acc": true + }, + "kmmlu_electronics_engineering": { + "acc": true + }, + "kmmlu_energy_management": { + "acc": true + }, + "kmmlu_environmental_science": { + "acc": true + }, + "kmmlu_fashion": { + "acc": true + }, + "kmmlu_food_processing": { + "acc": true + }, + "kmmlu_gas_technology_and_engineering": { + "acc": true + }, + "kmmlu_geomatics": { + "acc": true + }, + "kmmlu_health": { + "acc": true + }, + "kmmlu_humss": { + "acc": true + }, + "kmmlu_industrial_engineer": { + "acc": true + }, + "kmmlu_information_technology": { + "acc": true + }, + "kmmlu_interior_architecture_and_design": { + "acc": true + }, + "kmmlu_korean_history": { + "acc": true + }, + "kmmlu_law": { + "acc": true + }, + "kmmlu_machine_design_and_manufacturing": { + "acc": true + }, + "kmmlu_management": { + "acc": true + }, + "kmmlu_maritime_engineering": { + "acc": true + }, + "kmmlu_marketing": { + "acc": true + }, + "kmmlu_materials_engineering": { + "acc": true + }, + "kmmlu_math": { + "acc": true + }, + "kmmlu_mechanical_engineering": { + "acc": true + }, + "kmmlu_nondestructive_testing": { + "acc": true + }, + "kmmlu_other": { + "acc": true + }, + "kmmlu_patent": { + "acc": true + }, + "kmmlu_political_science_and_sociology": { + "acc": true + }, + "kmmlu_psychology": { + "acc": true + }, + "kmmlu_public_safety": { + "acc": true + }, + "kmmlu_railway_and_automotive_engineering": { + "acc": true + }, + "kmmlu_real_estate": { + "acc": true + }, + "kmmlu_refrigerating_machinery": { + "acc": true + }, + "kmmlu_social_welfare": { + "acc": true + }, + "kmmlu_stem": { + "acc": true + }, + "kmmlu_taxation": { + "acc": true + }, + "kmmlu_telecommunications_and_wireless_technology": { + "acc": true + }, + "kobest_boolq": { + "acc": true, + "f1": true + }, + "kobest_copa": { + "acc": true, + "f1": true + }, + "kobest_hellaswag": { + "acc": true, + "acc_norm": true, + "f1": true + }, + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + 
"mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + }, + "winogrande": { + "acc": true + } + }, + "n-samples": { + "kobest_hellaswag": { + "original": 500, + "effective": 400 + }, + "kobest_copa": { + "original": 1000, + "effective": 400 + }, + "kobest_boolq": { + "original": 1404, + "effective": 400 + }, + "kmmlu_biology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_chemical_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_chemistry": { + "original": 600, + "effective": 400 + }, + "kmmlu_civil_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_computer_science": { + "original": 1000, + "effective": 400 + }, + "kmmlu_ecology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_electrical_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_information_technology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_materials_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_math": { + "original": 300, + "effective": 300 + }, + "kmmlu_mechanical_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_agricultural_sciences": { + "original": 1000, + "effective": 400 + }, + "kmmlu_construction": { + "original": 1000, + "effective": 400 + }, + "kmmlu_fashion": { + "original": 1000, + "effective": 400 + }, + "kmmlu_food_processing": { + "original": 1000, + "effective": 400 + }, + "kmmlu_health": { + "original": 100, + "effective": 100 + }, + "kmmlu_interior_architecture_and_design": { 
+ "original": 1000, + "effective": 400 + }, + "kmmlu_marketing": { + "original": 1000, + "effective": 400 + }, + "kmmlu_patent": { + "original": 100, + "effective": 100 + }, + "kmmlu_public_safety": { + "original": 1000, + "effective": 400 + }, + "kmmlu_real_estate": { + "original": 200, + "effective": 200 + }, + "kmmlu_refrigerating_machinery": { + "original": 1000, + "effective": 400 + }, + "kmmlu_aviation_engineering_and_maintenance": { + "original": 1000, + "effective": 400 + }, + "kmmlu_electronics_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_energy_management": { + "original": 1000, + "effective": 400 + }, + "kmmlu_environmental_science": { + "original": 1000, + "effective": 400 + }, + "kmmlu_gas_technology_and_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_geomatics": { + "original": 1000, + "effective": 400 + }, + "kmmlu_industrial_engineer": { + "original": 1000, + "effective": 400 + }, + "kmmlu_machine_design_and_manufacturing": { + "original": 1000, + "effective": 400 + }, + "kmmlu_maritime_engineering": { + "original": 600, + "effective": 400 + }, + "kmmlu_nondestructive_testing": { + "original": 1000, + "effective": 400 + }, + "kmmlu_railway_and_automotive_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_telecommunications_and_wireless_technology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_accounting": { + "original": 100, + "effective": 100 + }, + "kmmlu_criminal_law": { + "original": 200, + "effective": 200 + }, + "kmmlu_economics": { + "original": 130, + "effective": 130 + }, + "kmmlu_education": { + "original": 100, + "effective": 100 + }, + "kmmlu_korean_history": { + "original": 100, + "effective": 100 + }, + "kmmlu_law": { + "original": 1000, + "effective": 400 + }, + "kmmlu_management": { + "original": 1000, + "effective": 400 + }, + "kmmlu_political_science_and_sociology": { + "original": 300, + "effective": 300 + }, + "kmmlu_psychology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_social_welfare": { + "original": 1000, + "effective": 400 + }, + "kmmlu_taxation": { + "original": 200, + "effective": 200 + }, + "winogrande": { + "original": 1267, + "effective": 400 + }, + "arc_challenge": { + "original": 1172, + "effective": 400 + }, + "arc_easy": { + "original": 2376, + "effective": 400 + }, + "hellaswag": { + "original": 10042, + "effective": 400 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_mathematics": { + 
"original": 270, + "effective": 270 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 400 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 400 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 400 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 400 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 400 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + } + }, + "config": { + "model": "hf", + "model_args": { + "pretrained": "unsloth/Qwen3-4B-Base", + "trust_remote_code": true + }, + "model_num_parameters": 4022468096, + "model_dtype": "torch.bfloat16", + "model_revision": "main", + "model_sha": "0573b584bc6b32adc84bb9c91bf9b71bea71fc40", + "batch_size": "12", + "batch_sizes": [], + "device": "cuda:0", + "use_cache": null, + "limit": 400.0, + "bootstrap_iters": 100000, + "gen_kwargs": {}, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 
+ }, + "git_hash": "0ce43af", + "date": 1775962695.520946, + "pretty_env_info": "PyTorch version: 2.9.0+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 4.1.0\nLibc version: glibc-2.35\n\nPython version: 3.11.14 | packaged by conda-forge | (main, Oct 13 2025, 14:09:32) [GCC 14.3.0] (64-bit runtime)\nPython platform: Linux-6.8.0-90-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: GPU 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition\nNvidia driver version: 590.48.01\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_precompiled.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_runtime_compiled.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_graph.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_heuristic.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops.so.9.8.0\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 192\nOn-line CPU(s) list: 0-191\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7642 48-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 2\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2300.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4600.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sev sev_es ibpb_exit_to_user\nVirtualization: AMD-V\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47,96-143\nNUMA node1 CPU(s): 48-95,144-191\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT enabled with STIBP protection\nVulnerability Spec rstack overflow: Mitigation; Safe RET\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: 
Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nVulnerability Vmscape: Mitigation; IBPB before exit to userspace\n\nVersions of relevant libraries:\n[pip3] executorch==1.0.1\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.17.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] optree==0.17.0\n[pip3] pytorch_tokenizers==1.0.1\n[pip3] torch==2.9.0+cu128\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torch-stoi==0.2.3\n[pip3] torchao==0.14.0\n[pip3] torchaudio==2.9.0+cu128\n[pip3] torchcodec==0.9.1\n[pip3] torchelastic==0.2.2\n[pip3] torchvision==0.24.0+cu128\n[pip3] triton==3.5.0\n[pip3] triton_kernels==1.0.0\n[conda] No relevant packages", + "transformers_version": "5.5.3", + "lm_eval_version": "0.4.11", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|vision_pad|>", + "151654" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "unsloth/Qwen3-4B-Base", + "model_name_sanitized": "unsloth__Qwen3-4B-Base", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": null, + "chat_template": null, + "chat_template_sha": null, + "total_evaluation_time_seconds": "573.7631184216589" +} \ No newline at end of file diff --git a/eval/lm_eval/checkpoints/cpt/__home__unsloth__scp_stage1_cpt__artifacts__cpt_full_96gb_qwen3_4b__checkpoints/results_2026-04-12T02-57-53.684526.json b/eval/lm_eval/checkpoints/cpt/__home__unsloth__scp_stage1_cpt__artifacts__cpt_full_96gb_qwen3_4b__checkpoints/results_2026-04-12T02-57-53.684526.json new file mode 100644 index 0000000..2b94719 --- /dev/null +++ b/eval/lm_eval/checkpoints/cpt/__home__unsloth__scp_stage1_cpt__artifacts__cpt_full_96gb_qwen3_4b__checkpoints/results_2026-04-12T02-57-53.684526.json @@ -0,0 +1,7825 @@ +{ + "results": { + "arc_challenge": { + "alias": "arc_challenge", + "acc,none": 0.4825, + "acc_stderr,none": 0.025015972341295333, + "acc_norm,none": 0.5325, + "acc_norm_stderr,none": 0.024978374105060028 + }, + "arc_easy": { + "alias": "arc_easy", + "acc,none": 0.78, + "acc_stderr,none": 0.020738254217024313, + "acc_norm,none": 0.795, + "acc_norm_stderr,none": 0.020210359883399975 + }, + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.4975, + "acc_stderr,none": 0.025030995822773405, + "acc_norm,none": 0.63, + "acc_norm_stderr,none": 0.024170447375168467 + }, + "kmmlu": { + "acc,none": 0.4692806221646144, + "acc_stderr,none": 0.0039182515413587, + "alias": "kmmlu" + }, + "kmmlu_applied_science": { + "acc,none": 0.45375, + "acc_stderr,none": 0.007111885914543827, + "alias": " - kmmlu_applied_science" + }, + "kmmlu_aviation_engineering_and_maintenance": { + "alias": " - kmmlu_aviation_engineering_and_maintenance", + "acc,none": 0.46, + "acc_stderr,none": 
0.024951079956135092 + }, + "kmmlu_electronics_engineering": { + "alias": " - kmmlu_electronics_engineering", + "acc,none": 0.6275, + "acc_stderr,none": 0.0242038000082031 + }, + "kmmlu_energy_management": { + "alias": " - kmmlu_energy_management", + "acc,none": 0.395, + "acc_stderr,none": 0.0244731452227279 + }, + "kmmlu_environmental_science": { + "alias": " - kmmlu_environmental_science", + "acc,none": 0.37, + "acc_stderr,none": 0.024170447375168453 + }, + "kmmlu_gas_technology_and_engineering": { + "alias": " - kmmlu_gas_technology_and_engineering", + "acc,none": 0.405, + "acc_stderr,none": 0.024575340657273674 + }, + "kmmlu_geomatics": { + "alias": " - kmmlu_geomatics", + "acc,none": 0.425, + "acc_stderr,none": 0.024748104405776187 + }, + "kmmlu_industrial_engineer": { + "alias": " - kmmlu_industrial_engineer", + "acc,none": 0.4275, + "acc_stderr,none": 0.024766769210836766 + }, + "kmmlu_machine_design_and_manufacturing": { + "alias": " - kmmlu_machine_design_and_manufacturing", + "acc,none": 0.4975, + "acc_stderr,none": 0.025030995822773395 + }, + "kmmlu_maritime_engineering": { + "alias": " - kmmlu_maritime_engineering", + "acc,none": 0.4075, + "acc_stderr,none": 0.02459923129797198 + }, + "kmmlu_nondestructive_testing": { + "alias": " - kmmlu_nondestructive_testing", + "acc,none": 0.475, + "acc_stderr,none": 0.024999999999999994 + }, + "kmmlu_railway_and_automotive_engineering": { + "alias": " - kmmlu_railway_and_automotive_engineering", + "acc,none": 0.3825, + "acc_stderr,none": 0.024330316186072946 + }, + "kmmlu_telecommunications_and_wireless_technology": { + "alias": " - kmmlu_telecommunications_and_wireless_technology", + "acc,none": 0.5725, + "acc_stderr,none": 0.02476676921083677 + }, + "kmmlu_humss": { + "acc,none": 0.4776556776556777, + "acc_stderr,none": 0.00943997794327789, + "alias": " - kmmlu_humss" + }, + "kmmlu_accounting": { + "alias": " - kmmlu_accounting", + "acc,none": 0.5, + "acc_stderr,none": 0.050251890762960605 + }, + "kmmlu_criminal_law": { + "alias": " - kmmlu_criminal_law", + "acc,none": 0.39, + "acc_stderr,none": 0.03457567623250012 + }, + "kmmlu_economics": { + "alias": " - kmmlu_economics", + "acc,none": 0.5461538461538461, + "acc_stderr,none": 0.04383459241436368 + }, + "kmmlu_education": { + "alias": " - kmmlu_education", + "acc,none": 0.64, + "acc_stderr,none": 0.048241815132442176 + }, + "kmmlu_korean_history": { + "alias": " - kmmlu_korean_history", + "acc,none": 0.3, + "acc_stderr,none": 0.046056618647183814 + }, + "kmmlu_law": { + "alias": " - kmmlu_law", + "acc,none": 0.375, + "acc_stderr,none": 0.02423646044779629 + }, + "kmmlu_management": { + "alias": " - kmmlu_management", + "acc,none": 0.5225, + "acc_stderr,none": 0.02500595167250431 + }, + "kmmlu_political_science_and_sociology": { + "alias": " - kmmlu_political_science_and_sociology", + "acc,none": 0.55, + "acc_stderr,none": 0.02877080459987894 + }, + "kmmlu_psychology": { + "alias": " - kmmlu_psychology", + "acc,none": 0.45, + "acc_stderr,none": 0.024905837706844923 + }, + "kmmlu_social_welfare": { + "alias": " - kmmlu_social_welfare", + "acc,none": 0.57, + "acc_stderr,none": 0.02478478796128207 + }, + "kmmlu_taxation": { + "alias": " - kmmlu_taxation", + "acc,none": 0.395, + "acc_stderr,none": 0.03465370682892271 + }, + "kmmlu_other": { + "acc,none": 0.4697222222222222, + "acc_stderr,none": 0.008043980393376315, + "alias": " - kmmlu_other" + }, + "kmmlu_agricultural_sciences": { + "alias": " - kmmlu_agricultural_sciences", + "acc,none": 0.3625, + "acc_stderr,none": 0.024066207238097735 
+ }, + "kmmlu_construction": { + "alias": " - kmmlu_construction", + "acc,none": 0.4, + "acc_stderr,none": 0.024525573579398552 + }, + "kmmlu_fashion": { + "alias": " - kmmlu_fashion", + "acc,none": 0.45, + "acc_stderr,none": 0.024905837706844923 + }, + "kmmlu_food_processing": { + "alias": " - kmmlu_food_processing", + "acc,none": 0.3675, + "acc_stderr,none": 0.024136399679191744 + }, + "kmmlu_health": { + "alias": " - kmmlu_health", + "acc,none": 0.58, + "acc_stderr,none": 0.049604496374885836 + }, + "kmmlu_interior_architecture_and_design": { + "alias": " - kmmlu_interior_architecture_and_design", + "acc,none": 0.6175, + "acc_stderr,none": 0.024330316186072936 + }, + "kmmlu_marketing": { + "alias": " - kmmlu_marketing", + "acc,none": 0.765, + "acc_stderr,none": 0.021226490755055 + }, + "kmmlu_patent": { + "alias": " - kmmlu_patent", + "acc,none": 0.42, + "acc_stderr,none": 0.049604496374885836 + }, + "kmmlu_public_safety": { + "alias": " - kmmlu_public_safety", + "acc,none": 0.38, + "acc_stderr,none": 0.024299715851758236 + }, + "kmmlu_real_estate": { + "alias": " - kmmlu_real_estate", + "acc,none": 0.45, + "acc_stderr,none": 0.03526639466921485 + }, + "kmmlu_refrigerating_machinery": { + "alias": " - kmmlu_refrigerating_machinery", + "acc,none": 0.41, + "acc_stderr,none": 0.02462246259333947 + }, + "kmmlu_stem": { + "acc,none": 0.48093023255813955, + "acc_stderr,none": 0.007306868046626305, + "alias": " - kmmlu_stem" + }, + "kmmlu_biology": { + "alias": " - kmmlu_biology", + "acc,none": 0.3125, + "acc_stderr,none": 0.023204644228784484 + }, + "kmmlu_chemical_engineering": { + "alias": " - kmmlu_chemical_engineering", + "acc,none": 0.4875, + "acc_stderr,none": 0.025023485209500245 + }, + "kmmlu_chemistry": { + "alias": " - kmmlu_chemistry", + "acc,none": 0.5175, + "acc_stderr,none": 0.025015972341295323 + }, + "kmmlu_civil_engineering": { + "alias": " - kmmlu_civil_engineering", + "acc,none": 0.3925, + "acc_stderr,none": 0.024445927747963322 + }, + "kmmlu_computer_science": { + "alias": " - kmmlu_computer_science", + "acc,none": 0.74, + "acc_stderr,none": 0.021959178349484305 + }, + "kmmlu_ecology": { + "alias": " - kmmlu_ecology", + "acc,none": 0.505, + "acc_stderr,none": 0.02503005711936146 + }, + "kmmlu_electrical_engineering": { + "alias": " - kmmlu_electrical_engineering", + "acc,none": 0.3425, + "acc_stderr,none": 0.02375700661717548 + }, + "kmmlu_information_technology": { + "alias": " - kmmlu_information_technology", + "acc,none": 0.7525, + "acc_stderr,none": 0.021605006729678956 + }, + "kmmlu_materials_engineering": { + "alias": " - kmmlu_materials_engineering", + "acc,none": 0.475, + "acc_stderr,none": 0.025 + }, + "kmmlu_math": { + "alias": " - kmmlu_math", + "acc,none": 0.3333333333333333, + "acc_stderr,none": 0.027262027336984393 + }, + "kmmlu_mechanical_engineering": { + "alias": " - kmmlu_mechanical_engineering", + "acc,none": 0.395, + "acc_stderr,none": 0.0244731452227279 + }, + "kobest_boolq": { + "alias": "kobest_boolq", + "acc,none": 0.755, + "acc_stderr,none": 0.02153129097913247, + "f1,none": 0.7379609080456697, + "f1_stderr,none": "N/A" + }, + "kobest_copa": { + "alias": "kobest_copa", + "acc,none": 0.6525, + "acc_stderr,none": 0.023838625698390636, + "f1,none": 0.6523935455233165, + "f1_stderr,none": "N/A" + }, + "kobest_hellaswag": { + "alias": "kobest_hellaswag", + "acc,none": 0.4325, + "acc_stderr,none": 0.024802162065186355, + "f1,none": 0.4264529493583016, + "f1_stderr,none": "N/A", + "acc_norm,none": 0.565, + "acc_norm_stderr,none": 0.024818892876375884 + }, 
+ "mmlu": { + "acc,none": 0.7352865587252634, + "acc_stderr,none": 0.003887849176172822, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6862808842652796, + "acc_stderr,none": 0.0077616777391173045, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.5873015873015873, + "acc_stderr,none": 0.04403438954768177 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.7818181818181819, + "acc_stderr,none": 0.03225078108306289 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.8186274509803921, + "acc_stderr,none": 0.02704462171947408 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.8481012658227848, + "acc_stderr,none": 0.023363878096632453 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.8264462809917356, + "acc_stderr,none": 0.0345727283691767 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.8148148148148148, + "acc_stderr,none": 0.03755265865037183 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.8466257668711656, + "acc_stderr,none": 0.02831160144143859 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.7543352601156069, + "acc_stderr,none": 0.023176298203992005 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.3225, + "acc_stderr,none": 0.023400926978618716 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.7331189710610932, + "acc_stderr,none": 0.025122637608816636 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.7870370370370371, + "acc_stderr,none": 0.02277971908873339 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.5075, + "acc_stderr,none": 0.02502849253543831 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.8070175438596491, + "acc_stderr,none": 0.030267457554898458 + }, + "mmlu_other": { + "acc,none": 0.7415565345080763, + "acc_stderr,none": 0.008104267812218218, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.76, + "acc_stderr,none": 0.04292346959909282 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.769811320754717, + "acc_stderr,none": 0.025907897122408173 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.7456647398843931, + "acc_stderr,none": 0.0332055644308557 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.44, + "acc_stderr,none": 0.0498887651569859 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.7399103139013453, + "acc_stderr,none": 0.029442495585857473 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.8640776699029126, + "acc_stderr,none": 0.0339329572976101 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.8931623931623932, + "acc_stderr,none": 0.020237149008990932 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.8, + "acc_stderr,none": 0.04020151261036846 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.8225, + "acc_stderr,none": 0.019128489820344343 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.7777777777777778, + "acc_stderr,none": 0.02380518652488816 + }, + 
"mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.574468085106383, + "acc_stderr,none": 0.029494827600144366 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.7757352941176471, + "acc_stderr,none": 0.02533684856333236 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.5060240963855421, + "acc_stderr,none": 0.038922121953330446 + }, + "mmlu_social_sciences": { + "acc,none": 0.8158088235294118, + "acc_stderr,none": 0.007306038192044323, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.6578947368421053, + "acc_stderr,none": 0.04462917535336937 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.8585858585858586, + "acc_stderr,none": 0.02482590979334335 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.8704663212435233, + "acc_stderr,none": 0.024233532297758716 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.8076923076923077, + "acc_stderr,none": 0.019982347208637296 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.8991596638655462, + "acc_stderr,none": 0.019559663430480802 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.905, + "acc_stderr,none": 0.014679107277903242 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.7786259541984732, + "acc_stderr,none": 0.03641297081313729 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.74, + "acc_stderr,none": 0.02195917834948431 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.6727272727272727, + "acc_stderr,none": 0.0449429086625209 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.7428571428571429, + "acc_stderr,none": 0.027979823538744546 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.8557213930348259, + "acc_stderr,none": 0.02484575321230605 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.89, + "acc_stderr,none": 0.03144660377352203 + }, + "mmlu_stem": { + "acc,none": 0.7082143989850935, + "acc_stderr,none": 0.007816574368205405, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.46, + "acc_stderr,none": 0.05009082659620333 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.7111111111111111, + "acc_stderr,none": 0.0391545063041425 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.8486842105263158, + "acc_stderr,none": 0.029162631596843975 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.8263888888888888, + "acc_stderr,none": 0.03167473383795717 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.52, + "acc_stderr,none": 0.050211673156867795 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.68, + "acc_stderr,none": 0.04688261722621504 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.53, + "acc_stderr,none": 0.05016135580465919 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.5784313725490197, + "acc_stderr,none": 0.049135952012745045 + }, + 
"mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.83, + "acc_stderr,none": 0.03775251680686371 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.8, + "acc_stderr,none": 0.026148818018424506 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.7586206896551724, + "acc_stderr,none": 0.03565998174135302 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.6746031746031746, + "acc_stderr,none": 0.024130158299762613 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.9, + "acc_stderr,none": 0.017066403719657258 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.729064039408867, + "acc_stderr,none": 0.03127090713297698 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.85, + "acc_stderr,none": 0.0358870281282637 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.5296296296296297, + "acc_stderr,none": 0.030431963547936584 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.6754966887417219, + "acc_stderr,none": 0.03822746937658752 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.7037037037037037, + "acc_stderr,none": 0.031141447823536044 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.5892857142857143, + "acc_stderr,none": 0.04669510663875191 + }, + "winogrande": { + "alias": "winogrande", + "acc,none": 0.7225, + "acc_stderr,none": 0.022416302137144652 + } + }, + "groups": { + "kmmlu": { + "acc,none": 0.4692806221646144, + "acc_stderr,none": 0.0039182515413587, + "alias": "kmmlu" + }, + "kmmlu_applied_science": { + "acc,none": 0.45375, + "acc_stderr,none": 0.007111885914543827, + "alias": " - kmmlu_applied_science" + }, + "kmmlu_humss": { + "acc,none": 0.4776556776556777, + "acc_stderr,none": 0.00943997794327789, + "alias": " - kmmlu_humss" + }, + "kmmlu_other": { + "acc,none": 0.4697222222222222, + "acc_stderr,none": 0.008043980393376315, + "alias": " - kmmlu_other" + }, + "kmmlu_stem": { + "acc,none": 0.48093023255813955, + "acc_stderr,none": 0.007306868046626305, + "alias": " - kmmlu_stem" + }, + "mmlu": { + "acc,none": 0.7352865587252634, + "acc_stderr,none": 0.003887849176172822, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6862808842652796, + "acc_stderr,none": 0.0077616777391173045, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.7415565345080763, + "acc_stderr,none": 0.008104267812218218, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.8158088235294118, + "acc_stderr,none": 0.007306038192044323, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.7082143989850935, + "acc_stderr,none": 0.007816574368205405, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_formal_logic", + "mmlu_high_school_european_history", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_law", + "mmlu_world_religions" + ], + "mmlu_social_sciences": [ + "mmlu_econometrics", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + 
"mmlu_high_school_macroeconomics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_psychology", + "mmlu_human_sexuality", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy" + ], + "mmlu_other": [ + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_medicine", + "mmlu_global_facts", + "mmlu_human_aging", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_nutrition", + "mmlu_professional_accounting", + "mmlu_professional_medicine", + "mmlu_virology" + ], + "mmlu_stem": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_mathematics", + "mmlu_high_school_physics", + "mmlu_high_school_statistics", + "mmlu_machine_learning" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ], + "hellaswag": [], + "arc_easy": [], + "arc_challenge": [], + "winogrande": [], + "kmmlu_humss": [ + "kmmlu_accounting", + "kmmlu_criminal_law", + "kmmlu_economics", + "kmmlu_education", + "kmmlu_korean_history", + "kmmlu_law", + "kmmlu_management", + "kmmlu_political_science_and_sociology", + "kmmlu_psychology", + "kmmlu_social_welfare", + "kmmlu_taxation" + ], + "kmmlu_applied_science": [ + "kmmlu_aviation_engineering_and_maintenance", + "kmmlu_electronics_engineering", + "kmmlu_energy_management", + "kmmlu_environmental_science", + "kmmlu_gas_technology_and_engineering", + "kmmlu_geomatics", + "kmmlu_industrial_engineer", + "kmmlu_machine_design_and_manufacturing", + "kmmlu_maritime_engineering", + "kmmlu_nondestructive_testing", + "kmmlu_railway_and_automotive_engineering", + "kmmlu_telecommunications_and_wireless_technology" + ], + "kmmlu_other": [ + "kmmlu_agricultural_sciences", + "kmmlu_construction", + "kmmlu_fashion", + "kmmlu_food_processing", + "kmmlu_health", + "kmmlu_interior_architecture_and_design", + "kmmlu_marketing", + "kmmlu_patent", + "kmmlu_public_safety", + "kmmlu_real_estate", + "kmmlu_refrigerating_machinery" + ], + "kmmlu_stem": [ + "kmmlu_biology", + "kmmlu_chemical_engineering", + "kmmlu_chemistry", + "kmmlu_civil_engineering", + "kmmlu_computer_science", + "kmmlu_ecology", + "kmmlu_electrical_engineering", + "kmmlu_information_technology", + "kmmlu_materials_engineering", + "kmmlu_math", + "kmmlu_mechanical_engineering" + ], + "kmmlu": [ + "kmmlu_stem", + "kmmlu_other", + "kmmlu_applied_science", + "kmmlu_humss" + ], + "kobest_boolq": [], + "kobest_copa": [], + "kobest_hellaswag": [] + }, + "configs": { + "arc_challenge": { + "task": "arc_challenge", + "tag": [ + "ai2_arc" + ], + "dataset_path": "allenai/ai2_arc", + "dataset_name": "ARC-Challenge", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{choices.label.index(answerKey)}}", + "unsafe_code": false, + "doc_to_choice": "{{choices.text}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": null, + "fewshot_indices": null, + 
"samples": null, + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_choice": "{{choices.text}}", + "doc_to_target": "{{choices.label.index(answerKey)}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "arc_easy": { + "task": "arc_easy", + "tag": [ + "ai2_arc" + ], + "dataset_path": "allenai/ai2_arc", + "dataset_name": "ARC-Easy", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{choices.label.index(answerKey)}}", + "unsafe_code": false, + "doc_to_choice": "{{choices.text}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_choice": "{{choices.text}}", + "doc_to_target": "{{choices.label.index(answerKey)}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "Rowan/hellaswag", + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": "", + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{query}}", + "doc_to_choice": "choices", + "doc_to_target": "{{label}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 
1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_accounting": { + "task": "kmmlu_accounting", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Accounting", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_agricultural_sciences": { + "task": "kmmlu_agricultural_sciences", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Agricultural-Sciences", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_aviation_engineering_and_maintenance": { + "task": "kmmlu_aviation_engineering_and_maintenance", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Aviation-Engineering-and-Maintenance", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_biology": { + "task": "kmmlu_biology", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Biology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_chemical_engineering": { + "task": "kmmlu_chemical_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Chemical-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_chemistry": { + "task": "kmmlu_chemistry", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Chemistry", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_civil_engineering": { + "task": "kmmlu_civil_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Civil-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_computer_science": { + "task": "kmmlu_computer_science", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Computer-Science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_construction": { + "task": "kmmlu_construction", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Construction", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_criminal_law": { + "task": "kmmlu_criminal_law", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Criminal-Law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_ecology": { + "task": "kmmlu_ecology", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Ecology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_economics": { + "task": "kmmlu_economics", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Economics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_education": { + "task": "kmmlu_education", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Education", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_electrical_engineering": { + "task": "kmmlu_electrical_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Electrical-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_electronics_engineering": { + "task": "kmmlu_electronics_engineering", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Electronics-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_energy_management": { + "task": "kmmlu_energy_management", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Energy-Management", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_environmental_science": { + "task": "kmmlu_environmental_science", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Environmental-Science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_fashion": { + "task": "kmmlu_fashion", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Fashion", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_food_processing": { + "task": "kmmlu_food_processing", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Food-Processing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_gas_technology_and_engineering": { + "task": "kmmlu_gas_technology_and_engineering", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Gas-Technology-and-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_geomatics": { + "task": "kmmlu_geomatics", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Geomatics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_health": { + "task": "kmmlu_health", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Health", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_industrial_engineer": { + "task": "kmmlu_industrial_engineer", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Industrial-Engineer", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_information_technology": { + "task": "kmmlu_information_technology", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Information-Technology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_interior_architecture_and_design": { + "task": "kmmlu_interior_architecture_and_design", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Interior-Architecture-and-Design", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_korean_history": { + "task": "kmmlu_korean_history", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Korean-History", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_law": { + "task": "kmmlu_law", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_machine_design_and_manufacturing": { + "task": "kmmlu_machine_design_and_manufacturing", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Machine-Design-and-Manufacturing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_management": { + "task": "kmmlu_management", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Management", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_maritime_engineering": { + "task": "kmmlu_maritime_engineering", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Maritime-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_marketing": { + "task": "kmmlu_marketing", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Marketing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_materials_engineering": { + "task": "kmmlu_materials_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Materials-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_math": { + "task": "kmmlu_math", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Math", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_mechanical_engineering": { + "task": "kmmlu_mechanical_engineering", + "tag": "kmmlu_stem_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Mechanical-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_nondestructive_testing": { + "task": "kmmlu_nondestructive_testing", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Nondestructive-Testing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_patent": { + "task": "kmmlu_patent", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Patent", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_political_science_and_sociology": { + "task": "kmmlu_political_science_and_sociology", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Political-Science-and-Sociology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_psychology": { + "task": "kmmlu_psychology", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Psychology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_public_safety": { + "task": "kmmlu_public_safety", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Public-Safety", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_railway_and_automotive_engineering": { + "task": "kmmlu_railway_and_automotive_engineering", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Railway-and-Automotive-Engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_real_estate": { + "task": "kmmlu_real_estate", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Real-Estate", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_refrigerating_machinery": { + "task": "kmmlu_refrigerating_machinery", + "tag": "kmmlu_other_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Refrigerating-Machinery", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_social_welfare": { + "task": "kmmlu_social_welfare", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Social-Welfare", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_taxation": { + "task": "kmmlu_taxation", + "tag": "kmmlu_humss_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Taxation", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kmmlu_telecommunications_and_wireless_technology": { + "task": "kmmlu_telecommunications_and_wireless_technology", + "tag": "kmmlu_applied_science_tasks", + "dataset_path": "HAERAE-HUB/KMMLU", + "dataset_name": "Telecommunications-and-Wireless-Technology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:", + "doc_to_target": "{{answer-1}}", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n정답:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "{{answer-1}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 2.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kobest_boolq": { + "task": "kobest_boolq", + "dataset_path": "skt/kobest_v1", + "dataset_name": "boolq", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "doc_to_text": "{{paragraph}} 질문: {{question}} 답변: ", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": [ + "아니오", + "예" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{paragraph}} 질문: {{question}} 답변: ", + "doc_to_choice": [ + "아니오", + "예" + ], + "doc_to_target": "{{label}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "f1", + "aggregation": "def macro_f1_score(items):\n from sklearn.metrics import f1_score\n\n unzipped_list = list(zip(*items))\n golds = unzipped_list[0]\n preds = unzipped_list[1]\n fscore = f1_score(golds, preds, average=\"macro\")\n return fscore\n", + "average": "macro", + "hf_evaluate": true, + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kobest_copa": { + "task": "kobest_copa", + "dataset_path": "skt/kobest_v1", + "dataset_name": "copa", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "doc_to_text": "def copa_doc_to_text(doc: dict) -> str:\n connector = {\"원인\": \" 왜냐하면\", \"결과\": \" 그래서\"}[doc[\"question\"].strip()]\n return f\"\"\"{doc[\"premise\"]} {connector}\"\"\"\n", + "doc_to_target": "def copa_doc_to_target(doc: dict) -> str:\n correct_choice = doc[\"alternative_1\"] if doc[\"label\"] == 0 else doc[\"alternative_2\"]\n return f\"\"\"{correct_choice}\"\"\"\n", + "unsafe_code": false, + "doc_to_choice": "def copa_doc_to_choice(doc: dict) -> list:\n return [f\"\"\"{doc[\"alternative_1\"]}\"\"\", f\"\"\"{doc[\"alternative_2\"]}\"\"\"]\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "", + "doc_to_choice": "", + "doc_to_target": "", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "f1", + "aggregation": "def macro_f1_score(items):\n from sklearn.metrics import f1_score\n\n unzipped_list = list(zip(*items))\n golds = unzipped_list[0]\n preds = unzipped_list[1]\n fscore = f1_score(golds, preds, average=\"macro\")\n 
return fscore\n", + "average": "macro", + "hf_evaluate": true, + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "kobest_hellaswag": { + "task": "kobest_hellaswag", + "dataset_path": "skt/kobest_v1", + "dataset_name": "hellaswag", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "process_docs": "def hellaswag_process_doc(doc: Dataset) -> Dataset:\n def preprocessor(dataset):\n return {\n \"query\": f\"\"\"문장: {dataset[\"context\"]}\"\"\",\n \"choices\": [\n dataset[\"ending_1\"],\n dataset[\"ending_2\"],\n dataset[\"ending_3\"],\n dataset[\"ending_4\"],\n ],\n \"gold\": int(dataset[\"label\"]),\n }\n\n return doc.map(preprocessor)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": "", + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{query}}", + "doc_to_choice": "choices", + "doc_to_target": "{{label}}", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "f1", + "aggregation": "def macro_f1_score(items):\n from sklearn.metrics import f1_score\n\n unzipped_list = list(zip(*items))\n golds = unzipped_list[0]\n preds = unzipped_list[1]\n fscore = f1_score(golds, preds, average=\"macro\")\n return fscore\n", + "average": "macro", + "hf_evaluate": true, + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n", + "split": "dev", + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "doc_to_target": "answer", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + }, + "winogrande": { + "task": "winogrande", + "dataset_path": "allenai/winogrande", + "dataset_name": "winogrande_xl", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "def doc_to_text(doc):\n answer_to_num = {\"1\": 0, \"2\": 1}\n return answer_to_num[doc[\"answer\"]]\n", + "doc_to_target": "def doc_to_target(doc):\n idx = doc[\"sentence\"].index(\"_\") + 1\n return doc[\"sentence\"][idx:].strip()\n", + "unsafe_code": false, + "doc_to_choice": "def doc_to_choice(doc):\n idx = doc[\"sentence\"].index(\"_\")\n options = [doc[\"option1\"], doc[\"option2\"]]\n return [doc[\"sentence\"][:idx] + opt for opt in options]\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "default", + "split": null, + "process_docs": null, + "fewshot_indices": null, + "samples": null, + "doc_to_text": "", + "doc_to_choice": "", + "doc_to_target": "", + "gen_prefix": null, + "fewshot_delimiter": "\n\n", + "target_delimiter": " " + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "sentence", + "metadata": { + "version": 1.0, + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + } + } + }, + "versions": { + "arc_challenge": 1.0, + "arc_easy": 1.0, + "hellaswag": 1.0, + "kmmlu": 2.0, + "kmmlu_accounting": 2.0, + "kmmlu_agricultural_sciences": 2.0, + "kmmlu_applied_science": 2.0, + "kmmlu_aviation_engineering_and_maintenance": 2.0, + "kmmlu_biology": 2.0, + "kmmlu_chemical_engineering": 2.0, + "kmmlu_chemistry": 2.0, + "kmmlu_civil_engineering": 2.0, + "kmmlu_computer_science": 2.0, + "kmmlu_construction": 2.0, + "kmmlu_criminal_law": 2.0, + "kmmlu_ecology": 2.0, + "kmmlu_economics": 2.0, + "kmmlu_education": 2.0, + "kmmlu_electrical_engineering": 2.0, + "kmmlu_electronics_engineering": 2.0, + "kmmlu_energy_management": 2.0, + "kmmlu_environmental_science": 2.0, + "kmmlu_fashion": 2.0, + "kmmlu_food_processing": 2.0, + "kmmlu_gas_technology_and_engineering": 2.0, + "kmmlu_geomatics": 2.0, + "kmmlu_health": 2.0, + "kmmlu_humss": 2.0, + "kmmlu_industrial_engineer": 2.0, + "kmmlu_information_technology": 2.0, + "kmmlu_interior_architecture_and_design": 2.0, + "kmmlu_korean_history": 2.0, + "kmmlu_law": 2.0, + "kmmlu_machine_design_and_manufacturing": 2.0, + "kmmlu_management": 2.0, + "kmmlu_maritime_engineering": 2.0, + "kmmlu_marketing": 2.0, + "kmmlu_materials_engineering": 2.0, + "kmmlu_math": 2.0, + "kmmlu_mechanical_engineering": 2.0, + "kmmlu_nondestructive_testing": 2.0, + "kmmlu_other": 2.0, + "kmmlu_patent": 2.0, + "kmmlu_political_science_and_sociology": 2.0, + "kmmlu_psychology": 2.0, + "kmmlu_public_safety": 2.0, + "kmmlu_railway_and_automotive_engineering": 2.0, + "kmmlu_real_estate": 2.0, + 
"kmmlu_refrigerating_machinery": 2.0, + "kmmlu_social_welfare": 2.0, + "kmmlu_stem": 2.0, + "kmmlu_taxation": 2.0, + "kmmlu_telecommunications_and_wireless_technology": 2.0, + "kobest_boolq": 1.0, + "kobest_copa": 1.0, + "kobest_hellaswag": 1.0, + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0, + "winogrande": 1.0 + }, + "n-shot": { + "arc_challenge": 0, + "arc_easy": 0, + "hellaswag": 0, + "kmmlu_accounting": 0, + "kmmlu_agricultural_sciences": 0, + "kmmlu_aviation_engineering_and_maintenance": 0, + "kmmlu_biology": 0, + "kmmlu_chemical_engineering": 0, + "kmmlu_chemistry": 0, + "kmmlu_civil_engineering": 0, + "kmmlu_computer_science": 0, + "kmmlu_construction": 0, + "kmmlu_criminal_law": 0, + "kmmlu_ecology": 0, + "kmmlu_economics": 0, + "kmmlu_education": 0, + "kmmlu_electrical_engineering": 0, + "kmmlu_electronics_engineering": 0, + "kmmlu_energy_management": 0, + "kmmlu_environmental_science": 0, + "kmmlu_fashion": 0, + "kmmlu_food_processing": 0, + "kmmlu_gas_technology_and_engineering": 0, + "kmmlu_geomatics": 0, + "kmmlu_health": 0, + "kmmlu_industrial_engineer": 0, + "kmmlu_information_technology": 0, + "kmmlu_interior_architecture_and_design": 0, + "kmmlu_korean_history": 0, + "kmmlu_law": 0, + "kmmlu_machine_design_and_manufacturing": 0, + "kmmlu_management": 0, + "kmmlu_maritime_engineering": 0, + "kmmlu_marketing": 0, + "kmmlu_materials_engineering": 0, + "kmmlu_math": 0, + "kmmlu_mechanical_engineering": 0, + "kmmlu_nondestructive_testing": 0, + "kmmlu_patent": 0, + "kmmlu_political_science_and_sociology": 0, + "kmmlu_psychology": 0, + "kmmlu_public_safety": 0, + 
"kmmlu_railway_and_automotive_engineering": 0, + "kmmlu_real_estate": 0, + "kmmlu_refrigerating_machinery": 0, + "kmmlu_social_welfare": 0, + "kmmlu_taxation": 0, + "kmmlu_telecommunications_and_wireless_technology": 0, + "kobest_boolq": 0, + "kobest_copa": 0, + "kobest_hellaswag": 0, + "mmlu_abstract_algebra": 0, + "mmlu_anatomy": 0, + "mmlu_astronomy": 0, + "mmlu_business_ethics": 0, + "mmlu_clinical_knowledge": 0, + "mmlu_college_biology": 0, + "mmlu_college_chemistry": 0, + "mmlu_college_computer_science": 0, + "mmlu_college_mathematics": 0, + "mmlu_college_medicine": 0, + "mmlu_college_physics": 0, + "mmlu_computer_security": 0, + "mmlu_conceptual_physics": 0, + "mmlu_econometrics": 0, + "mmlu_electrical_engineering": 0, + "mmlu_elementary_mathematics": 0, + "mmlu_formal_logic": 0, + "mmlu_global_facts": 0, + "mmlu_high_school_biology": 0, + "mmlu_high_school_chemistry": 0, + "mmlu_high_school_computer_science": 0, + "mmlu_high_school_european_history": 0, + "mmlu_high_school_geography": 0, + "mmlu_high_school_government_and_politics": 0, + "mmlu_high_school_macroeconomics": 0, + "mmlu_high_school_mathematics": 0, + "mmlu_high_school_microeconomics": 0, + "mmlu_high_school_physics": 0, + "mmlu_high_school_psychology": 0, + "mmlu_high_school_statistics": 0, + "mmlu_high_school_us_history": 0, + "mmlu_high_school_world_history": 0, + "mmlu_human_aging": 0, + "mmlu_human_sexuality": 0, + "mmlu_international_law": 0, + "mmlu_jurisprudence": 0, + "mmlu_logical_fallacies": 0, + "mmlu_machine_learning": 0, + "mmlu_management": 0, + "mmlu_marketing": 0, + "mmlu_medical_genetics": 0, + "mmlu_miscellaneous": 0, + "mmlu_moral_disputes": 0, + "mmlu_moral_scenarios": 0, + "mmlu_nutrition": 0, + "mmlu_philosophy": 0, + "mmlu_prehistory": 0, + "mmlu_professional_accounting": 0, + "mmlu_professional_law": 0, + "mmlu_professional_medicine": 0, + "mmlu_professional_psychology": 0, + "mmlu_public_relations": 0, + "mmlu_security_studies": 0, + "mmlu_sociology": 0, + "mmlu_us_foreign_policy": 0, + "mmlu_virology": 0, + "mmlu_world_religions": 0, + "winogrande": 0 + }, + "higher_is_better": { + "arc_challenge": { + "acc": true, + "acc_norm": true + }, + "arc_easy": { + "acc": true, + "acc_norm": true + }, + "hellaswag": { + "acc": true, + "acc_norm": true + }, + "kmmlu": { + "acc": true + }, + "kmmlu_accounting": { + "acc": true + }, + "kmmlu_agricultural_sciences": { + "acc": true + }, + "kmmlu_applied_science": { + "acc": true + }, + "kmmlu_aviation_engineering_and_maintenance": { + "acc": true + }, + "kmmlu_biology": { + "acc": true + }, + "kmmlu_chemical_engineering": { + "acc": true + }, + "kmmlu_chemistry": { + "acc": true + }, + "kmmlu_civil_engineering": { + "acc": true + }, + "kmmlu_computer_science": { + "acc": true + }, + "kmmlu_construction": { + "acc": true + }, + "kmmlu_criminal_law": { + "acc": true + }, + "kmmlu_ecology": { + "acc": true + }, + "kmmlu_economics": { + "acc": true + }, + "kmmlu_education": { + "acc": true + }, + "kmmlu_electrical_engineering": { + "acc": true + }, + "kmmlu_electronics_engineering": { + "acc": true + }, + "kmmlu_energy_management": { + "acc": true + }, + "kmmlu_environmental_science": { + "acc": true + }, + "kmmlu_fashion": { + "acc": true + }, + "kmmlu_food_processing": { + "acc": true + }, + "kmmlu_gas_technology_and_engineering": { + "acc": true + }, + "kmmlu_geomatics": { + "acc": true + }, + "kmmlu_health": { + "acc": true + }, + "kmmlu_humss": { + "acc": true + }, + "kmmlu_industrial_engineer": { + "acc": true + }, + "kmmlu_information_technology": { + 
"acc": true + }, + "kmmlu_interior_architecture_and_design": { + "acc": true + }, + "kmmlu_korean_history": { + "acc": true + }, + "kmmlu_law": { + "acc": true + }, + "kmmlu_machine_design_and_manufacturing": { + "acc": true + }, + "kmmlu_management": { + "acc": true + }, + "kmmlu_maritime_engineering": { + "acc": true + }, + "kmmlu_marketing": { + "acc": true + }, + "kmmlu_materials_engineering": { + "acc": true + }, + "kmmlu_math": { + "acc": true + }, + "kmmlu_mechanical_engineering": { + "acc": true + }, + "kmmlu_nondestructive_testing": { + "acc": true + }, + "kmmlu_other": { + "acc": true + }, + "kmmlu_patent": { + "acc": true + }, + "kmmlu_political_science_and_sociology": { + "acc": true + }, + "kmmlu_psychology": { + "acc": true + }, + "kmmlu_public_safety": { + "acc": true + }, + "kmmlu_railway_and_automotive_engineering": { + "acc": true + }, + "kmmlu_real_estate": { + "acc": true + }, + "kmmlu_refrigerating_machinery": { + "acc": true + }, + "kmmlu_social_welfare": { + "acc": true + }, + "kmmlu_stem": { + "acc": true + }, + "kmmlu_taxation": { + "acc": true + }, + "kmmlu_telecommunications_and_wireless_technology": { + "acc": true + }, + "kobest_boolq": { + "acc": true, + "f1": true + }, + "kobest_copa": { + "acc": true, + "f1": true + }, + "kobest_hellaswag": { + "acc": true, + "acc_norm": true, + "f1": true + }, + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + 
"acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + }, + "winogrande": { + "acc": true + } + }, + "n-samples": { + "kobest_hellaswag": { + "original": 500, + "effective": 400 + }, + "kobest_copa": { + "original": 1000, + "effective": 400 + }, + "kobest_boolq": { + "original": 1404, + "effective": 400 + }, + "kmmlu_biology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_chemical_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_chemistry": { + "original": 600, + "effective": 400 + }, + "kmmlu_civil_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_computer_science": { + "original": 1000, + "effective": 400 + }, + "kmmlu_ecology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_electrical_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_information_technology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_materials_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_math": { + "original": 300, + "effective": 300 + }, + "kmmlu_mechanical_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_agricultural_sciences": { + "original": 1000, + "effective": 400 + }, + "kmmlu_construction": { + "original": 1000, + "effective": 400 + }, + "kmmlu_fashion": { + "original": 1000, + "effective": 400 + }, + "kmmlu_food_processing": { + "original": 1000, + "effective": 400 + }, + "kmmlu_health": { + "original": 100, + "effective": 100 + }, + "kmmlu_interior_architecture_and_design": { + "original": 1000, + "effective": 400 + }, + "kmmlu_marketing": { + "original": 1000, + "effective": 400 + }, + "kmmlu_patent": { + "original": 100, + "effective": 100 + }, + "kmmlu_public_safety": { + "original": 1000, + "effective": 400 + }, + "kmmlu_real_estate": { + "original": 200, + "effective": 200 + }, + "kmmlu_refrigerating_machinery": { + "original": 1000, + "effective": 400 + }, + "kmmlu_aviation_engineering_and_maintenance": { + "original": 1000, + "effective": 400 + }, + "kmmlu_electronics_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_energy_management": { + "original": 1000, + "effective": 400 + }, + "kmmlu_environmental_science": { + "original": 1000, + "effective": 400 + }, + "kmmlu_gas_technology_and_engineering": { + "original": 1000, + "effective": 400 + }, + "kmmlu_geomatics": { + "original": 1000, + "effective": 400 + }, + "kmmlu_industrial_engineer": { + "original": 1000, + "effective": 400 + }, + "kmmlu_machine_design_and_manufacturing": { + "original": 1000, + "effective": 400 + }, + "kmmlu_maritime_engineering": { + "original": 600, + "effective": 400 + }, + "kmmlu_nondestructive_testing": { + "original": 1000, + "effective": 400 + }, + "kmmlu_railway_and_automotive_engineering": { + "original": 1000, + "effective": 400 + }, + 
"kmmlu_telecommunications_and_wireless_technology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_accounting": { + "original": 100, + "effective": 100 + }, + "kmmlu_criminal_law": { + "original": 200, + "effective": 200 + }, + "kmmlu_economics": { + "original": 130, + "effective": 130 + }, + "kmmlu_education": { + "original": 100, + "effective": 100 + }, + "kmmlu_korean_history": { + "original": 100, + "effective": 100 + }, + "kmmlu_law": { + "original": 1000, + "effective": 400 + }, + "kmmlu_management": { + "original": 1000, + "effective": 400 + }, + "kmmlu_political_science_and_sociology": { + "original": 300, + "effective": 300 + }, + "kmmlu_psychology": { + "original": 1000, + "effective": 400 + }, + "kmmlu_social_welfare": { + "original": 1000, + "effective": 400 + }, + "kmmlu_taxation": { + "original": 200, + "effective": 200 + }, + "winogrande": { + "original": 1267, + "effective": 400 + }, + "arc_challenge": { + "original": 1172, + "effective": 400 + }, + "arc_easy": { + "original": 2376, + "effective": 400 + }, + "hellaswag": { + "original": 10042, + "effective": 400 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 400 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_high_school_geography": { + "original": 198, + 
"effective": 198 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 400 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 400 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 400 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 400 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + } + }, + "config": { + "model": "hf", + "model_args": { + "pretrained": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "trust_remote_code": true + }, + "model_num_parameters": 4022468096, + "model_dtype": "torch.bfloat16", + "model_revision": "main", + "model_sha": "", + "batch_size": "12", + "batch_sizes": [], + "device": "cuda:0", + "use_cache": null, + "limit": 400.0, + "bootstrap_iters": 100000, + "gen_kwargs": {}, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "0ce43af", + "date": 1775962096.959724, + "pretty_env_info": "PyTorch version: 2.9.0+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 4.1.0\nLibc version: glibc-2.35\n\nPython version: 3.11.14 | packaged by conda-forge | (main, Oct 13 2025, 14:09:32) [GCC 14.3.0] (64-bit runtime)\nPython platform: Linux-6.8.0-90-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: GPU 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition\nNvidia driver version: 590.48.01\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_precompiled.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_runtime_compiled.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_graph.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_heuristic.so.9.8.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops.so.9.8.0\nIs XPU available: False\nHIP runtime 
version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 192\nOn-line CPU(s) list: 0-191\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7642 48-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 2\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2300.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4600.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sev sev_es ibpb_exit_to_user\nVirtualization: AMD-V\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47,96-143\nNUMA node1 CPU(s): 48-95,144-191\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT enabled with STIBP protection\nVulnerability Spec rstack overflow: Mitigation; Safe RET\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nVulnerability Vmscape: Mitigation; IBPB before exit to userspace\n\nVersions of relevant libraries:\n[pip3] executorch==1.0.1\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.17.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] optree==0.17.0\n[pip3] pytorch_tokenizers==1.0.1\n[pip3] torch==2.9.0+cu128\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torch-stoi==0.2.3\n[pip3] torchao==0.14.0\n[pip3] torchaudio==2.9.0+cu128\n[pip3] torchcodec==0.9.1\n[pip3] torchelastic==0.2.2\n[pip3] torchvision==0.24.0+cu128\n[pip3] 
triton==3.5.0\n[pip3] triton_kernels==1.0.0\n[conda] No relevant packages", + "transformers_version": "5.5.3", + "lm_eval_version": "0.4.11", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|PAD_TOKEN|>", + "151669" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "/home/unsloth/scp_stage1_cpt/artifacts/cpt_full_96gb_qwen3_4b/checkpoints", + "model_name_sanitized": "__home__unsloth__scp_stage1_cpt__artifacts__cpt_full_96gb_qwen3_4b__checkpoints", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": null, + "chat_template": null, + "chat_template_sha": null, + "total_evaluation_time_seconds": "580.1511918641627" +} \ No newline at end of file diff --git a/eval/lm_eval/checkpoints/cpt/stdout.txt b/eval/lm_eval/checkpoints/cpt/stdout.txt new file mode 100644 index 0000000..b6d158b --- /dev/null +++ b/eval/lm_eval/checkpoints/cpt/stdout.txt @@ -0,0 +1,2748 @@ +2026-04-12:02:48:13 WARNING [config.evaluate_config:281] --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT. +2026-04-12:02:48:16 INFO [_cli.run:376] Selected Tasks: ['mmlu', 'hellaswag', 'arc_easy', 'arc_challenge', 'winogrande', 'kmmlu', 'kobest_boolq', 'kobest_copa', 'kobest_hellaswag'] +🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning. +Unsloth: Your Flash Attention 2 installation seems to be broken. Using Xformers instead. No performance changes will be seen. +🦥 Unsloth Zoo will now patch everything to make training faster! + +Loading weights: 0%| | 0/398 [00:00", + "errors": "replace", + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|PAD_TOKEN|>", + "padding_side": "left", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..1f1e6cf --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,15 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|PAD_TOKEN|>", + "padding_side": "left", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..d037da7 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 1.0, + "total_flos": 2.103177196962902e+18, + "train_loss": 1.7256613558986822, + "train_runtime": 29239.084, + "train_samples_per_second": 1.616, + "train_steps_per_second": 0.051 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..87bdcbc --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,1096 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1477, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006771626883358727, + "grad_norm": 1.5234375, + "learning_rate": 6.081081081081082e-07, + "loss": 1.8358331680297852, + "step": 10 + }, + { + "epoch": 0.013543253766717453, + "grad_norm": 1.5078125, + "learning_rate": 1.2837837837837838e-06, 
+ "loss": 1.840726089477539, + "step": 20 + }, + { + "epoch": 0.02031488065007618, + "grad_norm": 1.0859375, + "learning_rate": 1.9594594594594595e-06, + "loss": 1.8267410278320313, + "step": 30 + }, + { + "epoch": 0.027086507533434907, + "grad_norm": 1.1640625, + "learning_rate": 2.6351351351351353e-06, + "loss": 1.8383310317993165, + "step": 40 + }, + { + "epoch": 0.03385813441679363, + "grad_norm": 1.0859375, + "learning_rate": 3.310810810810811e-06, + "loss": 1.8384885787963867, + "step": 50 + }, + { + "epoch": 0.04062976130015236, + "grad_norm": 1.03125, + "learning_rate": 3.986486486486487e-06, + "loss": 1.8087802886962892, + "step": 60 + }, + { + "epoch": 0.04740138818351109, + "grad_norm": 1.015625, + "learning_rate": 4.6621621621621625e-06, + "loss": 1.8259227752685547, + "step": 70 + }, + { + "epoch": 0.05417301506686981, + "grad_norm": 1.046875, + "learning_rate": 5.337837837837838e-06, + "loss": 1.8241001129150392, + "step": 80 + }, + { + "epoch": 0.06094464195022854, + "grad_norm": 0.96484375, + "learning_rate": 6.013513513513514e-06, + "loss": 1.82220516204834, + "step": 90 + }, + { + "epoch": 0.06771626883358726, + "grad_norm": 0.953125, + "learning_rate": 6.689189189189191e-06, + "loss": 1.7921783447265625, + "step": 100 + }, + { + "epoch": 0.074487895716946, + "grad_norm": 0.9296875, + "learning_rate": 7.3648648648648655e-06, + "loss": 1.797548484802246, + "step": 110 + }, + { + "epoch": 0.08125952260030472, + "grad_norm": 0.89453125, + "learning_rate": 8.040540540540541e-06, + "loss": 1.7889528274536133, + "step": 120 + }, + { + "epoch": 0.08803114948366345, + "grad_norm": 0.90234375, + "learning_rate": 8.716216216216217e-06, + "loss": 1.7663179397583009, + "step": 130 + }, + { + "epoch": 0.09480277636702218, + "grad_norm": 0.89453125, + "learning_rate": 9.391891891891893e-06, + "loss": 1.7635225296020507, + "step": 140 + }, + { + "epoch": 0.1015744032503809, + "grad_norm": 0.91015625, + "learning_rate": 9.999986030219255e-06, + "loss": 1.7774492263793946, + "step": 150 + }, + { + "epoch": 0.10834603013373963, + "grad_norm": 0.91796875, + "learning_rate": 9.998309750982693e-06, + "loss": 1.7622718811035156, + "step": 160 + }, + { + "epoch": 0.11511765701709836, + "grad_norm": 0.890625, + "learning_rate": 9.993840588849743e-06, + "loss": 1.7750001907348634, + "step": 170 + }, + { + "epoch": 0.12188928390045708, + "grad_norm": 0.890625, + "learning_rate": 9.986581041033881e-06, + "loss": 1.767216110229492, + "step": 180 + }, + { + "epoch": 0.1286609107838158, + "grad_norm": 0.921875, + "learning_rate": 9.976535163919757e-06, + "loss": 1.7609657287597655, + "step": 190 + }, + { + "epoch": 0.13543253766717453, + "grad_norm": 0.87109375, + "learning_rate": 9.96370857079661e-06, + "loss": 1.7535722732543946, + "step": 200 + }, + { + "epoch": 0.14220416455053325, + "grad_norm": 0.86328125, + "learning_rate": 9.948108428721782e-06, + "loss": 1.7395360946655274, + "step": 210 + }, + { + "epoch": 0.148975791433892, + "grad_norm": 0.88671875, + "learning_rate": 9.92974345451598e-06, + "loss": 1.7465991973876953, + "step": 220 + }, + { + "epoch": 0.15574741831725072, + "grad_norm": 0.87890625, + "learning_rate": 9.908623909892651e-06, + "loss": 1.7506902694702149, + "step": 230 + }, + { + "epoch": 0.16251904520060945, + "grad_norm": 0.8984375, + "learning_rate": 9.884761595724068e-06, + "loss": 1.7368896484375, + "step": 240 + }, + { + "epoch": 0.16929067208396817, + "grad_norm": 0.8671875, + "learning_rate": 9.858169845447417e-06, + "loss": 1.7515613555908203, + "step": 250 + }, + { 
+ "epoch": 0.1760622989673269, + "grad_norm": 0.85546875, + "learning_rate": 9.828863517614533e-06, + "loss": 1.7509956359863281, + "step": 260 + }, + { + "epoch": 0.1828339258506856, + "grad_norm": 0.9140625, + "learning_rate": 9.796858987589462e-06, + "loss": 1.753628921508789, + "step": 270 + }, + { + "epoch": 0.18960555273404436, + "grad_norm": 0.85546875, + "learning_rate": 9.762174138398456e-06, + "loss": 1.7379936218261718, + "step": 280 + }, + { + "epoch": 0.19637717961740309, + "grad_norm": 0.88671875, + "learning_rate": 9.724828350737574e-06, + "loss": 1.7442964553833007, + "step": 290 + }, + { + "epoch": 0.2031488065007618, + "grad_norm": 0.87109375, + "learning_rate": 9.684842492143399e-06, + "loss": 1.7366142272949219, + "step": 300 + }, + { + "epoch": 0.20992043338412053, + "grad_norm": 0.84765625, + "learning_rate": 9.642238905333e-06, + "loss": 1.7396051406860351, + "step": 310 + }, + { + "epoch": 0.21669206026747925, + "grad_norm": 0.87109375, + "learning_rate": 9.597041395719573e-06, + "loss": 1.732611083984375, + "step": 320 + }, + { + "epoch": 0.22346368715083798, + "grad_norm": 0.8828125, + "learning_rate": 9.549275218110818e-06, + "loss": 1.7453182220458985, + "step": 330 + }, + { + "epoch": 0.23023531403419673, + "grad_norm": 0.875, + "learning_rate": 9.498967062597403e-06, + "loss": 1.7297761917114258, + "step": 340 + }, + { + "epoch": 0.23700694091755545, + "grad_norm": 0.875, + "learning_rate": 9.446145039639486e-06, + "loss": 1.728118324279785, + "step": 350 + }, + { + "epoch": 0.24377856780091417, + "grad_norm": 0.890625, + "learning_rate": 9.390838664359539e-06, + "loss": 1.7387624740600587, + "step": 360 + }, + { + "epoch": 0.2505501946842729, + "grad_norm": 0.85546875, + "learning_rate": 9.333078840050331e-06, + "loss": 1.7364713668823242, + "step": 370 + }, + { + "epoch": 0.2573218215676316, + "grad_norm": 0.8828125, + "learning_rate": 9.27289784090723e-06, + "loss": 1.7236080169677734, + "step": 380 + }, + { + "epoch": 0.26409344845099036, + "grad_norm": 0.890625, + "learning_rate": 9.210329293994495e-06, + "loss": 1.7224924087524414, + "step": 390 + }, + { + "epoch": 0.27086507533434906, + "grad_norm": 0.8671875, + "learning_rate": 9.145408160455642e-06, + "loss": 1.7099193572998046, + "step": 400 + }, + { + "epoch": 0.2776367022177078, + "grad_norm": 0.8515625, + "learning_rate": 9.078170715978353e-06, + "loss": 1.737176513671875, + "step": 410 + }, + { + "epoch": 0.2844083291010665, + "grad_norm": 0.9140625, + "learning_rate": 9.008654530524883e-06, + "loss": 1.73763427734375, + "step": 420 + }, + { + "epoch": 0.29117995598442525, + "grad_norm": 0.85546875, + "learning_rate": 8.936898447339257e-06, + "loss": 1.7290821075439453, + "step": 430 + }, + { + "epoch": 0.297951582867784, + "grad_norm": 0.8984375, + "learning_rate": 8.86294256124301e-06, + "loss": 1.7403568267822265, + "step": 440 + }, + { + "epoch": 0.3047232097511427, + "grad_norm": 0.859375, + "learning_rate": 8.786828196231584e-06, + "loss": 1.7217792510986327, + "step": 450 + }, + { + "epoch": 0.31149483663450145, + "grad_norm": 0.87109375, + "learning_rate": 8.708597882383908e-06, + "loss": 1.7103708267211915, + "step": 460 + }, + { + "epoch": 0.31826646351786014, + "grad_norm": 0.91796875, + "learning_rate": 8.62829533209805e-06, + "loss": 1.7208784103393555, + "step": 470 + }, + { + "epoch": 0.3250380904012189, + "grad_norm": 0.859375, + "learning_rate": 8.545965415666254e-06, + "loss": 1.7223230361938477, + "step": 480 + }, + { + "epoch": 0.33180971728457764, + "grad_norm": 0.8671875, + 
"learning_rate": 8.46165413620295e-06, + "loss": 1.719701385498047, + "step": 490 + }, + { + "epoch": 0.33858134416793634, + "grad_norm": 0.85546875, + "learning_rate": 8.375408603939827e-06, + "loss": 1.721092987060547, + "step": 500 + }, + { + "epoch": 0.33858134416793634, + "eval_loss": 1.7143864631652832, + "eval_runtime": 177.179, + "eval_samples_per_second": 5.401, + "eval_steps_per_second": 0.677, + "step": 500 + }, + { + "epoch": 0.3453529710512951, + "grad_norm": 0.859375, + "learning_rate": 8.287277009902237e-06, + "loss": 1.7325265884399415, + "step": 510 + }, + { + "epoch": 0.3521245979346538, + "grad_norm": 0.83984375, + "learning_rate": 8.197308598981731e-06, + "loss": 1.7298921585083007, + "step": 520 + }, + { + "epoch": 0.35889622481801253, + "grad_norm": 0.8828125, + "learning_rate": 8.105553642419708e-06, + "loss": 1.6982412338256836, + "step": 530 + }, + { + "epoch": 0.3656678517013712, + "grad_norm": 0.91015625, + "learning_rate": 8.012063409717578e-06, + "loss": 1.7173789978027343, + "step": 540 + }, + { + "epoch": 0.37243947858473, + "grad_norm": 0.875, + "learning_rate": 7.916890139989147e-06, + "loss": 1.724541473388672, + "step": 550 + }, + { + "epoch": 0.3792111054680887, + "grad_norm": 0.859375, + "learning_rate": 7.820087012771184e-06, + "loss": 1.701674461364746, + "step": 560 + }, + { + "epoch": 0.3859827323514474, + "grad_norm": 0.85546875, + "learning_rate": 7.721708118308556e-06, + "loss": 1.7177881240844726, + "step": 570 + }, + { + "epoch": 0.39275435923480617, + "grad_norm": 0.87890625, + "learning_rate": 7.621808427330447e-06, + "loss": 1.6985021591186524, + "step": 580 + }, + { + "epoch": 0.39952598611816487, + "grad_norm": 0.87109375, + "learning_rate": 7.5204437603346224e-06, + "loss": 1.709127426147461, + "step": 590 + }, + { + "epoch": 0.4062976130015236, + "grad_norm": 0.88671875, + "learning_rate": 7.417670756396863e-06, + "loss": 1.7201419830322267, + "step": 600 + }, + { + "epoch": 0.41306923988488237, + "grad_norm": 0.8984375, + "learning_rate": 7.313546841522998e-06, + "loss": 1.7153247833251952, + "step": 610 + }, + { + "epoch": 0.41984086676824106, + "grad_norm": 0.875, + "learning_rate": 7.2081301965612435e-06, + "loss": 1.707881546020508, + "step": 620 + }, + { + "epoch": 0.4266124936515998, + "grad_norm": 0.87109375, + "learning_rate": 7.10147972469275e-06, + "loss": 1.7271339416503906, + "step": 630 + }, + { + "epoch": 0.4333841205349585, + "grad_norm": 1.3515625, + "learning_rate": 6.993655018518541e-06, + "loss": 1.7222976684570312, + "step": 640 + }, + { + "epoch": 0.44015574741831726, + "grad_norm": 0.85546875, + "learning_rate": 6.884716326761218e-06, + "loss": 1.7006675720214843, + "step": 650 + }, + { + "epoch": 0.44692737430167595, + "grad_norm": 0.87109375, + "learning_rate": 6.774724520600069e-06, + "loss": 1.6978439331054687, + "step": 660 + }, + { + "epoch": 0.4536990011850347, + "grad_norm": 0.87890625, + "learning_rate": 6.663741059658337e-06, + "loss": 1.7124168395996093, + "step": 670 + }, + { + "epoch": 0.46047062806839345, + "grad_norm": 0.87890625, + "learning_rate": 6.551827957661722e-06, + "loss": 1.7023361206054688, + "step": 680 + }, + { + "epoch": 0.46724225495175215, + "grad_norm": 0.86328125, + "learning_rate": 6.439047747787242e-06, + "loss": 1.700748825073242, + "step": 690 + }, + { + "epoch": 0.4740138818351109, + "grad_norm": 0.85546875, + "learning_rate": 6.325463447721852e-06, + "loss": 1.6977190017700194, + "step": 700 + }, + { + "epoch": 0.4807855087184696, + "grad_norm": 0.8984375, + "learning_rate": 
6.211138524450347e-06, + "loss": 1.7250362396240235, + "step": 710 + }, + { + "epoch": 0.48755713560182834, + "grad_norm": 0.90234375, + "learning_rate": 6.096136858792193e-06, + "loss": 1.7249008178710938, + "step": 720 + }, + { + "epoch": 0.4943287624851871, + "grad_norm": 0.8671875, + "learning_rate": 5.980522709707132e-06, + "loss": 1.7153186798095703, + "step": 730 + }, + { + "epoch": 0.5011003893685458, + "grad_norm": 0.8828125, + "learning_rate": 5.864360678389497e-06, + "loss": 1.6841873168945312, + "step": 740 + }, + { + "epoch": 0.5078720162519045, + "grad_norm": 0.8515625, + "learning_rate": 5.747715672171295e-06, + "loss": 1.7151117324829102, + "step": 750 + }, + { + "epoch": 0.5146436431352632, + "grad_norm": 0.95703125, + "learning_rate": 5.630652868254229e-06, + "loss": 1.704267692565918, + "step": 760 + }, + { + "epoch": 0.521415270018622, + "grad_norm": 0.88671875, + "learning_rate": 5.51323767729093e-06, + "loss": 1.7240329742431642, + "step": 770 + }, + { + "epoch": 0.5281868969019807, + "grad_norm": 0.87890625, + "learning_rate": 5.395535706835744e-06, + "loss": 1.7058921813964845, + "step": 780 + }, + { + "epoch": 0.5349585237853395, + "grad_norm": 0.8828125, + "learning_rate": 5.27761272468549e-06, + "loss": 1.6999113082885742, + "step": 790 + }, + { + "epoch": 0.5417301506686981, + "grad_norm": 0.9140625, + "learning_rate": 5.159534622130695e-06, + "loss": 1.7173538208007812, + "step": 800 + }, + { + "epoch": 0.5485017775520569, + "grad_norm": 0.85546875, + "learning_rate": 5.04136737713781e-06, + "loss": 1.706464958190918, + "step": 810 + }, + { + "epoch": 0.5552734044354156, + "grad_norm": 0.84765625, + "learning_rate": 4.923177017483002e-06, + "loss": 1.7123580932617188, + "step": 820 + }, + { + "epoch": 0.5620450313187744, + "grad_norm": 0.84765625, + "learning_rate": 4.805029583858115e-06, + "loss": 1.7076505661010741, + "step": 830 + }, + { + "epoch": 0.568816658202133, + "grad_norm": 0.87109375, + "learning_rate": 4.686991092969408e-06, + "loss": 1.7007432937622071, + "step": 840 + }, + { + "epoch": 0.5755882850854918, + "grad_norm": 0.83984375, + "learning_rate": 4.569127500649701e-06, + "loss": 1.7156892776489259, + "step": 850 + }, + { + "epoch": 0.5823599119688505, + "grad_norm": 0.85546875, + "learning_rate": 4.4515046650045316e-06, + "loss": 1.6989547729492187, + "step": 860 + }, + { + "epoch": 0.5891315388522093, + "grad_norm": 0.859375, + "learning_rate": 4.334188309612923e-06, + "loss": 1.701683235168457, + "step": 870 + }, + { + "epoch": 0.595903165735568, + "grad_norm": 0.875, + "learning_rate": 4.217243986803315e-06, + "loss": 1.7004409790039063, + "step": 880 + }, + { + "epoch": 0.6026747926189266, + "grad_norm": 0.88671875, + "learning_rate": 4.100737041025188e-06, + "loss": 1.727794075012207, + "step": 890 + }, + { + "epoch": 0.6094464195022854, + "grad_norm": 0.89453125, + "learning_rate": 3.984732572336837e-06, + "loss": 1.6976716995239258, + "step": 900 + }, + { + "epoch": 0.6162180463856441, + "grad_norm": 0.89453125, + "learning_rate": 3.869295400029714e-06, + "loss": 1.6927717208862305, + "step": 910 + }, + { + "epoch": 0.6229896732690029, + "grad_norm": 0.84375, + "learning_rate": 3.754490026409637e-06, + "loss": 1.6997186660766601, + "step": 920 + }, + { + "epoch": 0.6297613001523616, + "grad_norm": 0.93359375, + "learning_rate": 3.6403806007551373e-06, + "loss": 1.7196897506713866, + "step": 930 + }, + { + "epoch": 0.6365329270357203, + "grad_norm": 0.83203125, + "learning_rate": 3.527030883473055e-06, + "loss": 1.7054462432861328, + 
"step": 940 + }, + { + "epoch": 0.643304553919079, + "grad_norm": 0.890625, + "learning_rate": 3.414504210471421e-06, + "loss": 1.7200759887695312, + "step": 950 + }, + { + "epoch": 0.6500761808024378, + "grad_norm": 0.890625, + "learning_rate": 3.302863457769544e-06, + "loss": 1.6951274871826172, + "step": 960 + }, + { + "epoch": 0.6568478076857965, + "grad_norm": 0.90625, + "learning_rate": 3.192171006365061e-06, + "loss": 1.7151849746704102, + "step": 970 + }, + { + "epoch": 0.6636194345691553, + "grad_norm": 0.8984375, + "learning_rate": 3.0824887073775877e-06, + "loss": 1.713322067260742, + "step": 980 + }, + { + "epoch": 0.6703910614525139, + "grad_norm": 0.83984375, + "learning_rate": 2.973877847488451e-06, + "loss": 1.7172536849975586, + "step": 990 + }, + { + "epoch": 0.6771626883358727, + "grad_norm": 0.859375, + "learning_rate": 2.8663991146958064e-06, + "loss": 1.7149576187133788, + "step": 1000 + }, + { + "epoch": 0.6771626883358727, + "eval_loss": 1.7007688283920288, + "eval_runtime": 165.432, + "eval_samples_per_second": 5.785, + "eval_steps_per_second": 0.725, + "step": 1000 + }, + { + "epoch": 0.6839343152192314, + "grad_norm": 0.90625, + "learning_rate": 2.7601125644042777e-06, + "loss": 1.714142417907715, + "step": 1010 + }, + { + "epoch": 0.6907059421025902, + "grad_norm": 0.859375, + "learning_rate": 2.6550775858680793e-06, + "loss": 1.7104360580444335, + "step": 1020 + }, + { + "epoch": 0.6974775689859489, + "grad_norm": 0.90234375, + "learning_rate": 2.551352869006338e-06, + "loss": 1.7032684326171874, + "step": 1030 + }, + { + "epoch": 0.7042491958693076, + "grad_norm": 0.86328125, + "learning_rate": 2.4489963716092096e-06, + "loss": 1.701323890686035, + "step": 1040 + }, + { + "epoch": 0.7110208227526663, + "grad_norm": 0.890625, + "learning_rate": 2.348065286953048e-06, + "loss": 1.7169862747192384, + "step": 1050 + }, + { + "epoch": 0.7177924496360251, + "grad_norm": 0.87890625, + "learning_rate": 2.2486160118427958e-06, + "loss": 1.701096534729004, + "step": 1060 + }, + { + "epoch": 0.7245640765193838, + "grad_norm": 0.88671875, + "learning_rate": 2.1507041150993813e-06, + "loss": 1.700172233581543, + "step": 1070 + }, + { + "epoch": 0.7313357034027425, + "grad_norm": 0.859375, + "learning_rate": 2.054384306509794e-06, + "loss": 1.7045093536376954, + "step": 1080 + }, + { + "epoch": 0.7381073302861012, + "grad_norm": 0.859375, + "learning_rate": 1.9597104062571337e-06, + "loss": 1.7091920852661133, + "step": 1090 + }, + { + "epoch": 0.74487895716946, + "grad_norm": 0.86328125, + "learning_rate": 1.8667353148477547e-06, + "loss": 1.7001871109008788, + "step": 1100 + }, + { + "epoch": 0.7516505840528187, + "grad_norm": 0.85546875, + "learning_rate": 1.7755109835522938e-06, + "loss": 1.7016315460205078, + "step": 1110 + }, + { + "epoch": 0.7584222109361775, + "grad_norm": 0.87890625, + "learning_rate": 1.6860883853770848e-06, + "loss": 1.7196449279785155, + "step": 1120 + }, + { + "epoch": 0.7651938378195361, + "grad_norm": 0.89453125, + "learning_rate": 1.5985174865822146e-06, + "loss": 1.701955223083496, + "step": 1130 + }, + { + "epoch": 0.7719654647028948, + "grad_norm": 0.85546875, + "learning_rate": 1.5128472187620886e-06, + "loss": 1.703407096862793, + "step": 1140 + }, + { + "epoch": 0.7787370915862536, + "grad_norm": 0.875, + "learning_rate": 1.4291254515041592e-06, + "loss": 1.7057323455810547, + "step": 1150 + }, + { + "epoch": 0.7855087184696123, + "grad_norm": 0.8828125, + "learning_rate": 1.3473989656410413e-06, + "loss": 1.6963571548461913, + "step": 
1160 + }, + { + "epoch": 0.7922803453529711, + "grad_norm": 0.8671875, + "learning_rate": 1.2677134271110082e-06, + "loss": 1.7136796951293944, + "step": 1170 + }, + { + "epoch": 0.7990519722363297, + "grad_norm": 0.89453125, + "learning_rate": 1.1901133614414352e-06, + "loss": 1.7095062255859375, + "step": 1180 + }, + { + "epoch": 0.8058235991196885, + "grad_norm": 0.875, + "learning_rate": 1.114642128869473e-06, + "loss": 1.7052017211914063, + "step": 1190 + }, + { + "epoch": 0.8125952260030472, + "grad_norm": 0.8984375, + "learning_rate": 1.0413419001138525e-06, + "loss": 1.7166055679321288, + "step": 1200 + }, + { + "epoch": 0.819366852886406, + "grad_norm": 0.87890625, + "learning_rate": 9.702536328113305e-07, + "loss": 1.7042055130004883, + "step": 1210 + }, + { + "epoch": 0.8261384797697647, + "grad_norm": 0.8671875, + "learning_rate": 9.014170486309875e-07, + "loss": 1.6885286331176759, + "step": 1220 + }, + { + "epoch": 0.8329101066531234, + "grad_norm": 0.84375, + "learning_rate": 8.348706110791238e-07, + "loss": 1.7065910339355468, + "step": 1230 + }, + { + "epoch": 0.8396817335364821, + "grad_norm": 0.87109375, + "learning_rate": 7.706515040071854e-07, + "loss": 1.6999498367309571, + "step": 1240 + }, + { + "epoch": 0.8464533604198409, + "grad_norm": 0.8828125, + "learning_rate": 7.08795610834706e-07, + "loss": 1.7021600723266601, + "step": 1250 + }, + { + "epoch": 0.8532249873031996, + "grad_norm": 0.87890625, + "learning_rate": 6.493374944988984e-07, + "loss": 1.722920799255371, + "step": 1260 + }, + { + "epoch": 0.8599966141865584, + "grad_norm": 0.8671875, + "learning_rate": 5.923103781420708e-07, + "loss": 1.7148597717285157, + "step": 1270 + }, + { + "epoch": 0.866768241069917, + "grad_norm": 0.890625, + "learning_rate": 5.377461265476868e-07, + "loss": 1.7151250839233398, + "step": 1280 + }, + { + "epoch": 0.8735398679532758, + "grad_norm": 0.8671875, + "learning_rate": 4.856752283354277e-07, + "loss": 1.7023918151855468, + "step": 1290 + }, + { + "epoch": 0.8803114948366345, + "grad_norm": 0.8671875, + "learning_rate": 4.3612677892519496e-07, + "loss": 1.7045417785644532, + "step": 1300 + }, + { + "epoch": 0.8870831217199933, + "grad_norm": 0.86328125, + "learning_rate": 3.891284642796045e-07, + "loss": 1.7008039474487304, + "step": 1310 + }, + { + "epoch": 0.8938547486033519, + "grad_norm": 0.8671875, + "learning_rate": 3.447065454340198e-07, + "loss": 1.7126380920410156, + "step": 1320 + }, + { + "epoch": 0.9006263754867107, + "grad_norm": 0.88671875, + "learning_rate": 3.028858438227966e-07, + "loss": 1.7127569198608399, + "step": 1330 + }, + { + "epoch": 0.9073980023700694, + "grad_norm": 0.86328125, + "learning_rate": 2.636897274099187e-07, + "loss": 1.7151193618774414, + "step": 1340 + }, + { + "epoch": 0.9141696292534282, + "grad_norm": 0.8515625, + "learning_rate": 2.2714009763178945e-07, + "loss": 1.704157829284668, + "step": 1350 + }, + { + "epoch": 0.9209412561367869, + "grad_norm": 0.87890625, + "learning_rate": 1.932573771594648e-07, + "loss": 1.7036989212036133, + "step": 1360 + }, + { + "epoch": 0.9277128830201455, + "grad_norm": 0.8671875, + "learning_rate": 1.6206049848716765e-07, + "loss": 1.7044996261596679, + "step": 1370 + }, + { + "epoch": 0.9344845099035043, + "grad_norm": 1.109375, + "learning_rate": 1.3356689335346728e-07, + "loss": 1.7029462814331056, + "step": 1380 + }, + { + "epoch": 0.941256136786863, + "grad_norm": 0.91015625, + "learning_rate": 1.0779248300102352e-07, + "loss": 1.7133670806884767, + "step": 1390 + }, + { + "epoch": 
0.9480277636702218, + "grad_norm": 0.859375, + "learning_rate": 8.475166928034684e-08, + "loss": 1.6992549896240234, + "step": 1400 + }, + { + "epoch": 0.9547993905535805, + "grad_norm": 0.85546875, + "learning_rate": 6.445732660254056e-08, + "loss": 1.7066579818725587, + "step": 1410 + }, + { + "epoch": 0.9615710174369392, + "grad_norm": 0.9140625, + "learning_rate": 4.692079474552691e-08, + "loss": 1.6963106155395509, + "step": 1420 + }, + { + "epoch": 0.9683426443202979, + "grad_norm": 0.8515625, + "learning_rate": 3.2151872517767194e-08, + "loss": 1.7118385314941407, + "step": 1430 + }, + { + "epoch": 0.9751142712036567, + "grad_norm": 0.84375, + "learning_rate": 2.0158812283030403e-08, + "loss": 1.6870197296142577, + "step": 1440 + }, + { + "epoch": 0.9818858980870154, + "grad_norm": 0.87109375, + "learning_rate": 1.094831534925289e-08, + "loss": 1.7051671981811523, + "step": 1450 + }, + { + "epoch": 0.9886575249703742, + "grad_norm": 0.86328125, + "learning_rate": 4.5255282240802554e-09, + "loss": 1.7082006454467773, + "step": 1460 + }, + { + "epoch": 0.9954291518537328, + "grad_norm": 0.8828125, + "learning_rate": 8.940397391787869e-10, + "loss": 1.707107162475586, + "step": 1470 + }, + { + "epoch": 1.0, + "eval_loss": 1.7002202272415161, + "eval_runtime": 169.1979, + "eval_samples_per_second": 5.656, + "eval_steps_per_second": 0.709, + "step": 1477 + }, + { + "epoch": 1.0, + "step": 1477, + "total_flos": 2.103177196962902e+18, + "train_loss": 1.7256613558986822, + "train_runtime": 29239.084, + "train_samples_per_second": 1.616, + "train_steps_per_second": 0.051 + } + ], + "logging_steps": 10, + "max_steps": 1477, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.103177196962902e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..70b9526 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:021e20fabb8f12442e13effbcc63f0a47b25ed87f82c678b87ee5792f87ef9bc +size 5777
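
For reference, the trainer state diffed above can be read back programmatically. This is a minimal sketch, not part of the commit, assuming the repository has been checked out locally and that the file lives at `checkpoint-1477/trainer_state.json` (the path is inferred from the checkpoint directory in this commit, not confirmed by the diff itself):

```python
import json

# Load the trainer state written by the HF Trainer during this run.
with open("checkpoint-1477/trainer_state.json") as f:
    state = json.load(f)

# "log_history" mixes three record shapes: training steps (keyed by "loss",
# "grad_norm", "learning_rate"), evaluation steps (keyed by "eval_loss"),
# and a final run summary (keyed by "train_loss"). Split on the keys present.
train_steps = [e for e in state["log_history"] if "grad_norm" in e]
eval_steps = [e for e in state["log_history"] if "eval_loss" in e]

for e in eval_steps:
    print(f"step {e['step']:>4}  eval_loss={e['eval_loss']:.4f}")
# Per the log above, this should print eval_loss 1.7144, 1.7008, and 1.7002
# at steps 500, 1000, and 1477 respectively.
```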