{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 212, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 305.40234375, "epoch": 0.009512485136741973, "grad_norm": 9.479197648905142, "kl": 1.3113021850585938e-05, "learning_rate": 0.0, "loss": 0.0, "reward": 2.353515625, "reward_std": 0.5666993586346507, "rewards/accuracy_reward": 0.46484375, "rewards/format_reward": 0.9375, "rewards/influence_reward": 0.3828125, "rewards/len_reward": 0.568359375, "step": 1 }, { "completion_length": 303.59375, "epoch": 0.04756242568370987, "grad_norm": 5.559375419637804, "kl": 0.00265657901763916, "learning_rate": 1.818181818181818e-07, "loss": 0.0001, "reward": 2.5146484375, "reward_std": 0.530997786205262, "rewards/accuracy_reward": 0.54736328125, "rewards/format_reward": 0.9443359375, "rewards/influence_reward": 0.42724609375, "rewards/len_reward": 0.595703125, "step": 5 }, { "completion_length": 301.755859375, "epoch": 0.09512485136741974, "grad_norm": 7.509497472122595, "kl": 0.010486793518066407, "learning_rate": 4.090909090909091e-07, "loss": 0.0004, "reward": 2.534765625, "reward_std": 0.5271333329379558, "rewards/accuracy_reward": 0.542578125, "rewards/format_reward": 0.95234375, "rewards/influence_reward": 0.441015625, "rewards/len_reward": 0.598828125, "step": 10 }, { "completion_length": 308.83046875, "epoch": 0.1426872770511296, "grad_norm": 9.5559930417524, "kl": 0.3596527099609375, "learning_rate": 6.363636363636363e-07, "loss": 0.0144, "reward": 2.4265625, "reward_std": 0.4884017549455166, "rewards/accuracy_reward": 0.512109375, "rewards/format_reward": 0.95625, "rewards/influence_reward": 0.419140625, "rewards/len_reward": 0.5390625, "step": 15 }, { "completion_length": 309.508984375, "epoch": 0.1902497027348395, "grad_norm": 6.575100009019049, "kl": 1.8339599609375, "learning_rate": 8.636363636363636e-07, "loss": 0.0734, "reward": 2.483984375, "reward_std": 0.4793000495061278, "rewards/accuracy_reward": 0.534765625, "rewards/format_reward": 0.971875, "rewards/influence_reward": 0.444921875, "rewards/len_reward": 0.532421875, "step": 20 }, { "completion_length": 302.12109375, "epoch": 0.23781212841854935, "grad_norm": 3.8082898220241357, "kl": 1.6882080078125, "learning_rate": 9.99726628670463e-07, "loss": 0.0675, "reward": 2.571484375, "reward_std": 0.46187512911856177, "rewards/accuracy_reward": 0.56015625, "rewards/format_reward": 0.97578125, "rewards/influence_reward": 0.478125, "rewards/len_reward": 0.557421875, "step": 25 }, { "completion_length": 285.939453125, "epoch": 0.2853745541022592, "grad_norm": 5.417820371147177, "kl": 1.641796875, "learning_rate": 9.966546331768192e-07, "loss": 0.0657, "reward": 2.583984375, "reward_std": 0.48238000813871623, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.985546875, "rewards/influence_reward": 0.465625, "rewards/len_reward": 0.5703125, "step": 30 }, { "completion_length": 295.471875, "epoch": 0.3329369797859691, "grad_norm": 3.3934355489448493, "kl": 1.362841796875, "learning_rate": 9.901899829374047e-07, "loss": 0.0545, "reward": 2.55234375, "reward_std": 0.48458676002919676, "rewards/accuracy_reward": 0.516796875, "rewards/format_reward": 0.9796875, "rewards/influence_reward": 0.429296875, "rewards/len_reward": 0.6265625, "step": 35 }, { "completion_length": 295.90390625, "epoch": 0.380499405469679, "grad_norm": 4.1900410232582335, "kl": 1.5516845703125, "learning_rate": 9.803768380684242e-07, "loss": 0.0621, "reward": 2.50546875, "reward_std": 0.46826295778155325, "rewards/accuracy_reward": 0.50703125, "rewards/format_reward": 0.97578125, "rewards/influence_reward": 0.428125, "rewards/len_reward": 0.59453125, "step": 40 }, { "completion_length": 292.028515625, "epoch": 0.4280618311533888, "grad_norm": 7.946193406732366, "kl": 1.88642578125, "learning_rate": 9.672822322997304e-07, "loss": 0.0754, "reward": 2.47421875, "reward_std": 0.45962891932576894, "rewards/accuracy_reward": 0.49296875, "rewards/format_reward": 0.982421875, "rewards/influence_reward": 0.411328125, "rewards/len_reward": 0.5875, "step": 45 }, { "completion_length": 292.873046875, "epoch": 0.4756242568370987, "grad_norm": 7.492938961448867, "kl": 1.96015625, "learning_rate": 9.509956150664795e-07, "loss": 0.0784, "reward": 2.596484375, "reward_std": 0.460917086713016, "rewards/accuracy_reward": 0.52578125, "rewards/format_reward": 0.982421875, "rewards/influence_reward": 0.4453125, "rewards/len_reward": 0.64296875, "step": 50 }, { "completion_length": 286.85859375, "epoch": 0.5231866825208086, "grad_norm": 3.5519428103835224, "kl": 1.998779296875, "learning_rate": 9.316282404787869e-07, "loss": 0.0799, "reward": 2.530078125, "reward_std": 0.43998180609196424, "rewards/accuracy_reward": 0.506640625, "rewards/format_reward": 0.973828125, "rewards/influence_reward": 0.43046875, "rewards/len_reward": 0.619140625, "step": 55 }, { "completion_length": 284.312890625, "epoch": 0.5707491082045184, "grad_norm": 3.7968869284274014, "kl": 2.133349609375, "learning_rate": 9.093124073433462e-07, "loss": 0.0854, "reward": 2.521875, "reward_std": 0.5203634534031153, "rewards/accuracy_reward": 0.506640625, "rewards/format_reward": 0.979296875, "rewards/influence_reward": 0.419140625, "rewards/len_reward": 0.616796875, "step": 60 }, { "completion_length": 275.38203125, "epoch": 0.6183115338882283, "grad_norm": 4.3717453671021165, "kl": 2.27578125, "learning_rate": 8.842005554284295e-07, "loss": 0.091, "reward": 2.609765625, "reward_std": 0.4997987896203995, "rewards/accuracy_reward": 0.539453125, "rewards/format_reward": 0.983203125, "rewards/influence_reward": 0.4578125, "rewards/len_reward": 0.629296875, "step": 65 }, { "completion_length": 271.71171875, "epoch": 0.6658739595719382, "grad_norm": 5.274292147592828, "kl": 2.325927734375, "learning_rate": 8.564642241456986e-07, "loss": 0.093, "reward": 2.628515625, "reward_std": 0.47792479060590265, "rewards/accuracy_reward": 0.543359375, "rewards/format_reward": 0.9765625, "rewards/influence_reward": 0.4640625, "rewards/len_reward": 0.64453125, "step": 70 }, { "completion_length": 272.53828125, "epoch": 0.713436385255648, "grad_norm": 4.422045761319486, "kl": 2.280224609375, "learning_rate": 8.262928807620843e-07, "loss": 0.0912, "reward": 2.548046875, "reward_std": 0.504490308649838, "rewards/accuracy_reward": 0.511328125, "rewards/format_reward": 0.98046875, "rewards/influence_reward": 0.433203125, "rewards/len_reward": 0.623046875, "step": 75 }, { "completion_length": 274.9234375, "epoch": 0.760998810939358, "grad_norm": 3.788180136247375, "kl": 2.3419921875, "learning_rate": 7.938926261462365e-07, "loss": 0.0937, "reward": 2.491796875, "reward_std": 0.47606104165315627, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.980859375, "rewards/influence_reward": 0.40546875, "rewards/len_reward": 0.62109375, "step": 80 }, { "completion_length": 284.03671875, "epoch": 0.8085612366230678, "grad_norm": 2.828523061717255, "kl": 2.372216796875, "learning_rate": 7.594847868906076e-07, "loss": 0.0949, "reward": 2.5171875, "reward_std": 0.4772877097129822, "rewards/accuracy_reward": 0.4734375, "rewards/format_reward": 0.98046875, "rewards/influence_reward": 0.40390625, "rewards/len_reward": 0.659375, "step": 85 }, { "completion_length": 280.916796875, "epoch": 0.8561236623067776, "grad_norm": 607.3521812300567, "kl": 2.50087890625, "learning_rate": 7.233044034264033e-07, "loss": 0.1001, "reward": 2.506640625, "reward_std": 0.46796532850712536, "rewards/accuracy_reward": 0.495703125, "rewards/format_reward": 0.9734375, "rewards/influence_reward": 0.42109375, "rewards/len_reward": 0.61640625, "step": 90 }, { "completion_length": 289.612890625, "epoch": 0.9036860879904876, "grad_norm": 2.4261586696137574, "kl": 2.483837890625, "learning_rate": 6.855986244591103e-07, "loss": 0.0994, "reward": 2.565625, "reward_std": 0.4893400952219963, "rewards/accuracy_reward": 0.525, "rewards/format_reward": 0.977734375, "rewards/influence_reward": 0.4375, "rewards/len_reward": 0.625390625, "step": 95 }, { "completion_length": 286.48984375, "epoch": 0.9512485136741974, "grad_norm": 4.005939428029324, "kl": 2.36318359375, "learning_rate": 6.466250186922324e-07, "loss": 0.0945, "reward": 2.48828125, "reward_std": 0.45276672914624216, "rewards/accuracy_reward": 0.488671875, "rewards/format_reward": 0.978515625, "rewards/influence_reward": 0.403515625, "rewards/len_reward": 0.617578125, "step": 100 }, { "completion_length": 288.877734375, "epoch": 0.9988109393579072, "grad_norm": 3.7990751409015475, "kl": 2.427783203125, "learning_rate": 6.066498153718734e-07, "loss": 0.0971, "reward": 2.457421875, "reward_std": 0.5042316474020481, "rewards/accuracy_reward": 0.47734375, "rewards/format_reward": 0.9734375, "rewards/influence_reward": 0.380859375, "rewards/len_reward": 0.62578125, "step": 105 }, { "completion_length": 274.04876893939394, "epoch": 1.0380499405469679, "grad_norm": 22.164672338850448, "kl": 2.5658735795454546, "learning_rate": 5.659460856710345e-07, "loss": 0.1004, "reward": 2.5634469696969697, "reward_std": 0.5097437008763804, "rewards/accuracy_reward": 0.5350378787878788, "rewards/format_reward": 0.9772727272727273, "rewards/influence_reward": 0.4308712121212121, "rewards/len_reward": 0.6202651515151515, "step": 110 }, { "completion_length": 275.455078125, "epoch": 1.0856123662306778, "grad_norm": 3.2333169625918923, "kl": 2.7962890625, "learning_rate": 5.247918773366111e-07, "loss": 0.1119, "reward": 2.575390625, "reward_std": 0.4654921619221568, "rewards/accuracy_reward": 0.5125, "rewards/format_reward": 0.984375, "rewards/influence_reward": 0.426953125, "rewards/len_reward": 0.6515625, "step": 115 }, { "completion_length": 289.22265625, "epoch": 1.1331747919143877, "grad_norm": 6.3823521689302085, "kl": 2.52705078125, "learning_rate": 4.834683153413459e-07, "loss": 0.1011, "reward": 2.573046875, "reward_std": 0.4989847050979733, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.971875, "rewards/influence_reward": 0.41875, "rewards/len_reward": 0.666796875, "step": 120 }, { "completion_length": 269.36796875, "epoch": 1.1807372175980975, "grad_norm": 9.314483672694436, "kl": 2.718115234375, "learning_rate": 4.4225768151520694e-07, "loss": 0.1087, "reward": 2.595703125, "reward_std": 0.4847894934937358, "rewards/accuracy_reward": 0.530078125, "rewards/format_reward": 0.975, "rewards/influence_reward": 0.440625, "rewards/len_reward": 0.65, "step": 125 }, { "completion_length": 272.870703125, "epoch": 1.2282996432818074, "grad_norm": 9.964810062893706, "kl": 2.692333984375, "learning_rate": 4.0144148627425986e-07, "loss": 0.1077, "reward": 2.548828125, "reward_std": 0.4715102421119809, "rewards/accuracy_reward": 0.50546875, "rewards/format_reward": 0.979296875, "rewards/influence_reward": 0.421875, "rewards/len_reward": 0.6421875, "step": 130 }, { "completion_length": 271.344921875, "epoch": 1.2758620689655173, "grad_norm": 4.882054754352994, "kl": 2.486865234375, "learning_rate": 3.612985456190778e-07, "loss": 0.0995, "reward": 2.56328125, "reward_std": 0.47647856548428535, "rewards/accuracy_reward": 0.51328125, "rewards/format_reward": 0.978515625, "rewards/influence_reward": 0.422265625, "rewards/len_reward": 0.64921875, "step": 135 }, { "completion_length": 274.757421875, "epoch": 1.323424494649227, "grad_norm": 4.091037506852225, "kl": 2.571435546875, "learning_rate": 3.221030765387417e-07, "loss": 0.1029, "reward": 2.5890625, "reward_std": 0.47138190008699893, "rewards/accuracy_reward": 0.512109375, "rewards/format_reward": 0.978515625, "rewards/influence_reward": 0.434765625, "rewards/len_reward": 0.663671875, "step": 140 }, { "completion_length": 273.21875, "epoch": 1.370986920332937, "grad_norm": 3.923917161371445, "kl": 2.58369140625, "learning_rate": 2.841228238307536e-07, "loss": 0.1033, "reward": 2.52421875, "reward_std": 0.48247870467603204, "rewards/accuracy_reward": 0.496875, "rewards/format_reward": 0.973046875, "rewards/influence_reward": 0.407421875, "rewards/len_reward": 0.646875, "step": 145 }, { "completion_length": 270.95703125, "epoch": 1.418549346016647, "grad_norm": 3.288471428585341, "kl": 2.6228515625, "learning_rate": 2.476172311325783e-07, "loss": 0.1049, "reward": 2.5703125, "reward_std": 0.530765401944518, "rewards/accuracy_reward": 0.52265625, "rewards/format_reward": 0.975390625, "rewards/influence_reward": 0.416796875, "rewards/len_reward": 0.65546875, "step": 150 }, { "completion_length": 276.080859375, "epoch": 1.4661117717003567, "grad_norm": 5.406976549541725, "kl": 3.496875, "learning_rate": 2.128356686585282e-07, "loss": 0.1399, "reward": 2.603515625, "reward_std": 0.5087036734446884, "rewards/accuracy_reward": 0.539453125, "rewards/format_reward": 0.976171875, "rewards/influence_reward": 0.444140625, "rewards/len_reward": 0.64375, "step": 155 }, { "completion_length": 283.583203125, "epoch": 1.5136741973840666, "grad_norm": 2.4719135321159986, "kl": 2.696875, "learning_rate": 1.8001572974834168e-07, "loss": 0.1079, "reward": 2.537109375, "reward_std": 0.4965396413579583, "rewards/accuracy_reward": 0.495703125, "rewards/format_reward": 0.97890625, "rewards/influence_reward": 0.4078125, "rewards/len_reward": 0.6546875, "step": 160 }, { "completion_length": 281.8515625, "epoch": 1.5612366230677766, "grad_norm": 6.845144411654133, "kl": 2.631396484375, "learning_rate": 1.493816078637557e-07, "loss": 0.1052, "reward": 2.54296875, "reward_std": 0.48724669627845285, "rewards/accuracy_reward": 0.501171875, "rewards/format_reward": 0.97421875, "rewards/influence_reward": 0.41015625, "rewards/len_reward": 0.657421875, "step": 165 }, { "completion_length": 276.32578125, "epoch": 1.6087990487514863, "grad_norm": 2.820753870764231, "kl": 2.634814453125, "learning_rate": 1.2114256511983274e-07, "loss": 0.1054, "reward": 2.5515625, "reward_std": 0.492490841075778, "rewards/accuracy_reward": 0.501171875, "rewards/format_reward": 0.978125, "rewards/influence_reward": 0.408203125, "rewards/len_reward": 0.6640625, "step": 170 }, { "completion_length": 282.89296875, "epoch": 1.6563614744351962, "grad_norm": 3.065824553875067, "kl": 2.65234375, "learning_rate": 9.549150281252632e-08, "loss": 0.1061, "reward": 2.530078125, "reward_std": 0.4955192942172289, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.973046875, "rewards/influence_reward": 0.4046875, "rewards/len_reward": 0.65234375, "step": 175 }, { "completion_length": 274.28359375, "epoch": 1.7039239001189062, "grad_norm": 3.281579185573704, "kl": 2.66640625, "learning_rate": 7.260364370723043e-08, "loss": 0.1066, "reward": 2.540625, "reward_std": 0.45585995763540266, "rewards/accuracy_reward": 0.504296875, "rewards/format_reward": 0.98203125, "rewards/influence_reward": 0.420703125, "rewards/len_reward": 0.63359375, "step": 180 }, { "completion_length": 282.63828125, "epoch": 1.7514863258026159, "grad_norm": 7.835917126431856, "kl": 2.7076171875, "learning_rate": 5.263533508961826e-08, "loss": 0.1083, "reward": 2.524609375, "reward_std": 0.49459295999258757, "rewards/accuracy_reward": 0.486328125, "rewards/format_reward": 0.97578125, "rewards/influence_reward": 0.405859375, "rewards/len_reward": 0.656640625, "step": 185 }, { "completion_length": 279.878515625, "epoch": 1.7990487514863258, "grad_norm": 3.407510264782874, "kl": 2.6724609375, "learning_rate": 3.572298075514652e-08, "loss": 0.1069, "reward": 2.572265625, "reward_std": 0.48079199306666853, "rewards/accuracy_reward": 0.5140625, "rewards/format_reward": 0.980859375, "rewards/influence_reward": 0.436328125, "rewards/len_reward": 0.641015625, "step": 190 }, { "completion_length": 280.059375, "epoch": 1.8466111771700358, "grad_norm": 20.679733097388006, "kl": 2.6583984375, "learning_rate": 2.1982109232821176e-08, "loss": 0.1063, "reward": 2.49453125, "reward_std": 0.4927243089303374, "rewards/accuracy_reward": 0.498828125, "rewards/format_reward": 0.97734375, "rewards/influence_reward": 0.403125, "rewards/len_reward": 0.615234375, "step": 195 }, { "completion_length": 278.7625, "epoch": 1.8941736028537455, "grad_norm": 2.48795349080197, "kl": 2.6826171875, "learning_rate": 1.1506584608200364e-08, "loss": 0.1073, "reward": 2.60546875, "reward_std": 0.4754039028659463, "rewards/accuracy_reward": 0.536328125, "rewards/format_reward": 0.981640625, "rewards/influence_reward": 0.446875, "rewards/len_reward": 0.640625, "step": 200 }, { "completion_length": 275.9921875, "epoch": 1.9417360285374554, "grad_norm": 2.346124453455794, "kl": 2.770361328125, "learning_rate": 4.367965336512403e-09, "loss": 0.1108, "reward": 2.6203125, "reward_std": 0.5024769959971309, "rewards/accuracy_reward": 0.5328125, "rewards/format_reward": 0.9796875, "rewards/influence_reward": 0.45, "rewards/len_reward": 0.6578125, "step": 205 }, { "completion_length": 280.690234375, "epoch": 1.9892984542211654, "grad_norm": 2.0191058844209615, "kl": 2.69677734375, "learning_rate": 6.150154258476314e-10, "loss": 0.1079, "reward": 2.544140625, "reward_std": 0.5300922216847539, "rewards/accuracy_reward": 0.4984375, "rewards/format_reward": 0.980078125, "rewards/influence_reward": 0.40703125, "rewards/len_reward": 0.65859375, "step": 210 }, { "completion_length": 299.63368055555554, "epoch": 2.0, "kl": 2.6888020833333335, "reward": 2.5711805555555554, "reward_std": 0.5023611783981323, "rewards/accuracy_reward": 0.5399305555555556, "rewards/format_reward": 0.9652777777777778, "rewards/influence_reward": 0.4131944444444444, "rewards/len_reward": 0.6527777777777778, "step": 212, "total_flos": 0.0, "train_loss": 0.0899008925090421, "train_runtime": 180643.6048, "train_samples_per_second": 0.149, "train_steps_per_second": 0.001 } ], "logging_steps": 5, "max_steps": 212, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }