Files
qwen2.5-7B-rlvr_g8_b512/trainer_state.json

583 lines
20 KiB
JSON
Raw Normal View History

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984,
"eval_steps": 15,
"global_step": 78,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03046875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 992.8,
"completions/mean_length": 148.38466796875,
"completions/mean_terminated_length": 120.88012237548828,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.064,
"grad_norm": 0.016000226140022278,
"learning_rate": 1e-06,
"loss": 0.008,
"num_tokens": 13353659.0,
"reward": 0.43779296875,
"reward_std": 0.30512999892234804,
"rewards/accuracy_reward": 0.18955078125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.68603515625,
"rewards/mean_confidence_reward": 0.0,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01142578125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 888.2,
"completions/mean_length": 113.01220703125,
"completions/mean_terminated_length": 102.50659942626953,
"completions/min_length": 4.4,
"completions/min_terminated_length": 4.4,
"epoch": 0.128,
"grad_norm": 0.005569960456341505,
"learning_rate": 1e-06,
"loss": 0.0183,
"num_tokens": 26368200.0,
"reward": 0.5927734375,
"reward_std": 0.18429518938064576,
"rewards/accuracy_reward": 0.26328125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.922265625,
"rewards/mean_confidence_reward": 0.0,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.004296875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 834.0,
"completions/mean_length": 84.0111328125,
"completions/mean_terminated_length": 79.96037445068359,
"completions/min_length": 9.2,
"completions/min_terminated_length": 9.2,
"epoch": 0.192,
"grad_norm": 0.002098287222906947,
"learning_rate": 1e-06,
"loss": 0.0082,
"num_tokens": 39090618.0,
"reward": 0.672998046875,
"reward_std": 0.1340289294719696,
"rewards/accuracy_reward": 0.36337890625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9826171875,
"rewards/mean_confidence_reward": 0.0,
"step": 15
},
{
"epoch": 0.192,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 247.5,
"eval_completions/max_terminated_length": 247.5,
"eval_completions/mean_length": 74.62883949279785,
"eval_completions/mean_terminated_length": 74.62883949279785,
"eval_completions/min_length": 22.5,
"eval_completions/min_terminated_length": 22.5,
"eval_loss": 0.0,
"eval_num_tokens": 39090618.0,
"eval_reward": 0.65625,
"eval_reward_std": 0.21594678610563278,
"eval_rewards/accuracy_reward": 0.314453125,
"eval_rewards/brier_reward": 0.0,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 0.998046875,
"eval_rewards/mean_confidence_reward": 0.0,
"eval_runtime": 24.5821,
"eval_samples_per_second": 20.34,
"eval_steps_per_second": 0.163,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0013671875,
"completions/max_length": 885.6,
"completions/max_terminated_length": 481.4,
"completions/mean_length": 75.769140625,
"completions/mean_terminated_length": 74.47151794433594,
"completions/min_length": 12.8,
"completions/min_terminated_length": 12.8,
"epoch": 0.256,
"grad_norm": 0.0015658332267776132,
"learning_rate": 1e-06,
"loss": 0.0012,
"num_tokens": 51845790.0,
"reward": 0.708984375,
"reward_std": 0.10655935555696487,
"rewards/accuracy_reward": 0.42177734375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99619140625,
"rewards/mean_confidence_reward": 0.0,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 709.8,
"completions/max_terminated_length": 506.6,
"completions/mean_length": 73.475,
"completions/mean_terminated_length": 73.0110580444336,
"completions/min_length": 19.2,
"completions/min_terminated_length": 19.2,
"epoch": 0.32,
"grad_norm": 0.0015233514131978154,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 64440126.0,
"reward": 0.705615234375,
"reward_std": 0.10088382810354232,
"rewards/accuracy_reward": 0.4138671875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99736328125,
"rewards/mean_confidence_reward": 0.0,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.000390625,
"completions/max_length": 952.0,
"completions/max_terminated_length": 627.2,
"completions/mean_length": 76.364453125,
"completions/mean_terminated_length": 75.99405670166016,
"completions/min_length": 18.6,
"completions/min_terminated_length": 18.6,
"epoch": 0.384,
"grad_norm": 0.002182575175538659,
"learning_rate": 1e-06,
"loss": 0.0003,
"num_tokens": 77054626.0,
"reward": 0.708837890625,
"reward_std": 0.0894511729478836,
"rewards/accuracy_reward": 0.4197265625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99794921875,
"rewards/mean_confidence_reward": 0.0,
"step": 30
},
{
"epoch": 0.384,
"eval_completions/clipped_ratio": 0.002155172413793094,
"eval_completions/max_length": 414.5,
"eval_completions/max_terminated_length": 204.75,
"eval_completions/mean_length": 79.66776657104492,
"eval_completions/mean_terminated_length": 77.63948631286621,
"eval_completions/min_length": 31.25,
"eval_completions/min_terminated_length": 31.25,
"eval_loss": 0.0,
"eval_num_tokens": 77054626.0,
"eval_reward": 0.6640625,
"eval_reward_std": 0.22485817223787308,
"eval_rewards/accuracy_reward": 0.33203125,
"eval_rewards/brier_reward": 0.0,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 0.99609375,
"eval_rewards/mean_confidence_reward": 0.0,
"eval_runtime": 33.0856,
"eval_samples_per_second": 15.112,
"eval_steps_per_second": 0.121,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 932.6,
"completions/max_terminated_length": 520.6,
"completions/mean_length": 80.10986328125,
"completions/mean_terminated_length": 79.64854736328125,
"completions/min_length": 18.6,
"completions/min_terminated_length": 18.6,
"epoch": 0.448,
"grad_norm": 0.0010561308590695262,
"learning_rate": 1e-06,
"loss": 0.0004,
"num_tokens": 89702463.0,
"reward": 0.72060546875,
"reward_std": 0.08864349871873856,
"rewards/accuracy_reward": 0.4431640625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.998046875,
"rewards/mean_confidence_reward": 0.0,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.000390625,
"completions/max_length": 800.6,
"completions/max_terminated_length": 461.4,
"completions/mean_length": 86.64755859375,
"completions/mean_terminated_length": 86.28123168945312,
"completions/min_length": 24.6,
"completions/min_terminated_length": 24.6,
"epoch": 0.512,
"grad_norm": 0.0010048149852082133,
"learning_rate": 1e-06,
"loss": 0.0003,
"num_tokens": 102614142.0,
"reward": 0.721337890625,
"reward_std": 0.08699959516525269,
"rewards/accuracy_reward": 0.44345703125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99921875,
"rewards/mean_confidence_reward": 0.0,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0009765625,
"completions/max_length": 923.6,
"completions/max_terminated_length": 530.4,
"completions/mean_length": 89.90283203125,
"completions/mean_terminated_length": 88.98892669677734,
"completions/min_length": 18.8,
"completions/min_terminated_length": 18.8,
"epoch": 0.576,
"grad_norm": 0.0009294641204178333,
"learning_rate": 1e-06,
"loss": 0.0014,
"num_tokens": 115441355.0,
"reward": 0.732666015625,
"reward_std": 0.08389391750097275,
"rewards/accuracy_reward": 0.4677734375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99755859375,
"rewards/mean_confidence_reward": 0.0,
"step": 45
},
{
"epoch": 0.576,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 269.5,
"eval_completions/max_terminated_length": 269.5,
"eval_completions/mean_length": 92.64102935791016,
"eval_completions/mean_terminated_length": 92.64102935791016,
"eval_completions/min_length": 35.5,
"eval_completions/min_terminated_length": 35.5,
"eval_loss": 0.0,
"eval_num_tokens": 115441355.0,
"eval_reward": 0.6865234375,
"eval_reward_std": 0.23618583008646965,
"eval_rewards/accuracy_reward": 0.373046875,
"eval_rewards/brier_reward": 0.0,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 1.0,
"eval_rewards/mean_confidence_reward": 0.0,
"eval_runtime": 26.3838,
"eval_samples_per_second": 18.951,
"eval_steps_per_second": 0.152,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00087890625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 568.2,
"completions/mean_length": 94.26240234375,
"completions/mean_terminated_length": 93.44497528076172,
"completions/min_length": 24.4,
"completions/min_terminated_length": 24.4,
"epoch": 0.64,
"grad_norm": 0.0007046961691230536,
"learning_rate": 1e-06,
"loss": 0.0018,
"num_tokens": 128476370.0,
"reward": 0.735302734375,
"reward_std": 0.0788412094116211,
"rewards/accuracy_reward": 0.47177734375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.998828125,
"rewards/mean_confidence_reward": 0.0,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0009765625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 514.8,
"completions/mean_length": 95.73056640625,
"completions/mean_terminated_length": 94.82296295166016,
"completions/min_length": 28.2,
"completions/min_terminated_length": 28.2,
"epoch": 0.704,
"grad_norm": 0.0007479747291654348,
"learning_rate": 1e-06,
"loss": 0.0014,
"num_tokens": 141210483.0,
"reward": 0.728857421875,
"reward_std": 0.07646729648113251,
"rewards/accuracy_reward": 0.45927734375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9984375,
"rewards/mean_confidence_reward": 0.0,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00068359375,
"completions/max_length": 896.2,
"completions/max_terminated_length": 469.8,
"completions/mean_length": 95.0162109375,
"completions/mean_terminated_length": 94.38102111816406,
"completions/min_length": 24.4,
"completions/min_terminated_length": 24.4,
"epoch": 0.768,
"grad_norm": 0.0006520415190607309,
"learning_rate": 1e-06,
"loss": 0.0017,
"num_tokens": 154067105.0,
"reward": 0.741455078125,
"reward_std": 0.07960962057113648,
"rewards/accuracy_reward": 0.48369140625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99921875,
"rewards/mean_confidence_reward": 0.0,
"step": 60
},
{
"epoch": 0.768,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 263.75,
"eval_completions/max_terminated_length": 263.75,
"eval_completions/mean_length": 96.76313209533691,
"eval_completions/mean_terminated_length": 96.76313209533691,
"eval_completions/min_length": 38.25,
"eval_completions/min_terminated_length": 38.25,
"eval_loss": 0.0,
"eval_num_tokens": 154067105.0,
"eval_reward": 0.69140625,
"eval_reward_std": 0.23931611329317093,
"eval_rewards/accuracy_reward": 0.384765625,
"eval_rewards/brier_reward": 0.0,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 0.998046875,
"eval_rewards/mean_confidence_reward": 0.0,
"eval_runtime": 25.814,
"eval_samples_per_second": 19.369,
"eval_steps_per_second": 0.155,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.000390625,
"completions/max_length": 774.0,
"completions/max_terminated_length": 411.2,
"completions/mean_length": 94.5296875,
"completions/mean_terminated_length": 94.16671752929688,
"completions/min_length": 32.4,
"completions/min_terminated_length": 32.4,
"epoch": 0.832,
"grad_norm": 0.0007356010028161108,
"learning_rate": 1e-06,
"loss": 0.0007,
"num_tokens": 166964521.0,
"reward": 0.751513671875,
"reward_std": 0.07600467056035995,
"rewards/accuracy_reward": 0.50341796875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.999609375,
"rewards/mean_confidence_reward": 0.0,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0013671875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 443.2,
"completions/mean_length": 97.67490234375,
"completions/mean_terminated_length": 96.40738067626953,
"completions/min_length": 30.8,
"completions/min_terminated_length": 30.8,
"epoch": 0.896,
"grad_norm": 0.000633634568657726,
"learning_rate": 1e-06,
"loss": 0.0024,
"num_tokens": 179885808.0,
"reward": 0.743115234375,
"reward_std": 0.07322432547807693,
"rewards/accuracy_reward": 0.48759765625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9986328125,
"rewards/mean_confidence_reward": 0.0,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 403.2,
"completions/mean_length": 98.701171875,
"completions/mean_terminated_length": 97.97780151367188,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.96,
"grad_norm": 0.0007835639989934862,
"learning_rate": 1e-06,
"loss": 0.0012,
"num_tokens": 192751292.0,
"reward": 0.739453125,
"reward_std": 0.07245174199342727,
"rewards/accuracy_reward": 0.4796875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99921875,
"rewards/mean_confidence_reward": 0.0,
"step": 75
},
{
"epoch": 0.96,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 253.25,
"eval_completions/max_terminated_length": 253.25,
"eval_completions/mean_length": 96.76939582824707,
"eval_completions/mean_terminated_length": 96.76939582824707,
"eval_completions/min_length": 46.0,
"eval_completions/min_terminated_length": 46.0,
"eval_loss": 0.0,
"eval_num_tokens": 192751292.0,
"eval_reward": 0.7021484375,
"eval_reward_std": 0.23983138427138329,
"eval_rewards/accuracy_reward": 0.404296875,
"eval_rewards/brier_reward": 0.0,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 1.0,
"eval_rewards/mean_confidence_reward": 0.0,
"eval_runtime": 25.6674,
"eval_samples_per_second": 19.48,
"eval_steps_per_second": 0.156,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 576.6666666666666,
"completions/max_terminated_length": 576.6666666666666,
"completions/mean_length": 97.17679595947266,
"completions/mean_terminated_length": 97.17679595947266,
"completions/min_length": 32.333333333333336,
"completions/min_terminated_length": 32.333333333333336,
"epoch": 0.9984,
"num_tokens": 200463724.0,
"reward": 0.7422688802083334,
"reward_std": 0.07581798732280731,
"rewards/accuracy_reward": 0.4845377604166667,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 78,
"total_flos": 0.0,
"train_loss": 0.0031132084311535344,
"train_runtime": 28707.0758,
"train_samples_per_second": 0.697,
"train_steps_per_second": 0.003
}
],
"logging_steps": 5,
"max_steps": 78,
"num_input_tokens_seen": 200463724,
"num_train_epochs": 1,
"save_steps": 60,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}