{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0273972602739727, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 2687.75, "epoch": 0.003424657534246575, "grad_norm": 0.18900136649608612, "kl": 0.0, "learning_rate": 3.3333333333333334e-08, "loss": 0.0, "reward": 0.3042634315788746, "reward_std": 0.4194334000349045, "rewards/cosine_scaled_reward": -0.14712543413043022, "rewards/format_reward": 0.4513889104127884, "step": 1 }, { "completion_length": 2708.71533203125, "epoch": 0.00684931506849315, "grad_norm": 0.18016651272773743, "kl": 0.0, "learning_rate": 6.666666666666667e-08, "loss": 0.0, "reward": 0.3438135087490082, "reward_std": 0.4610961228609085, "rewards/cosine_scaled_reward": -0.1353531926870346, "rewards/format_reward": 0.4791666716337204, "step": 2 }, { "completion_length": 2606.2569580078125, "epoch": 0.010273972602739725, "grad_norm": 0.20038573443889618, "kl": 9.512901306152344e-05, "learning_rate": 1e-07, "loss": 0.0, "reward": 0.4459744915366173, "reward_std": 0.4223913550376892, "rewards/cosine_scaled_reward": -0.033192168921232224, "rewards/format_reward": 0.4791666567325592, "step": 3 }, { "completion_length": 2475.6805419921875, "epoch": 0.0136986301369863, "grad_norm": 0.288876473903656, "kl": 0.00010967254638671875, "learning_rate": 1.3333333333333334e-07, "loss": 0.0, "reward": 0.5504279434680939, "reward_std": 0.4054145812988281, "rewards/cosine_scaled_reward": -0.07457206398248672, "rewards/format_reward": 0.625, "step": 4 }, { "completion_length": 2784.541748046875, "epoch": 0.017123287671232876, "grad_norm": 0.14747899770736694, "kl": 0.00010347366333007812, "learning_rate": 1.6666666666666665e-07, "loss": 0.0, "reward": 0.3733392059803009, "reward_std": 0.4893380403518677, "rewards/cosine_scaled_reward": -0.057216365821659565, "rewards/format_reward": 0.4305555522441864, "step": 5 }, { "completion_length": 3103.0902099609375, "epoch": 0.02054794520547945, "grad_norm": 0.1839321404695511, "kl": 0.00011086463928222656, "learning_rate": 2e-07, "loss": 0.0, "reward": 0.02957332320511341, "reward_std": 0.3297805115580559, "rewards/cosine_scaled_reward": -0.22042667865753174, "rewards/format_reward": 0.2499999925494194, "step": 6 }, { "completion_length": 2373.9375, "epoch": 0.023972602739726026, "grad_norm": 0.22951197624206543, "kl": 0.00012159347534179688, "learning_rate": 2.3333333333333333e-07, "loss": 0.0, "reward": 0.4939586818218231, "reward_std": 0.3855544626712799, "rewards/cosine_scaled_reward": -0.04770803824067116, "rewards/format_reward": 0.5416666716337204, "step": 7 }, { "completion_length": 2863.65283203125, "epoch": 0.0273972602739726, "grad_norm": 0.1952303647994995, "kl": 0.00010585784912109375, "learning_rate": 2.6666666666666667e-07, "loss": 0.0, "reward": 0.17456580698490143, "reward_std": 0.4745761901140213, "rewards/cosine_scaled_reward": -0.15182308107614517, "rewards/format_reward": 0.3263888880610466, "step": 8 }, { "completion_length": 3127.5833740234375, "epoch": 0.030821917808219176, "grad_norm": 0.12419066578149796, "kl": 0.00010228157043457031, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.10362935438752174, "reward_std": 0.3372735381126404, "rewards/cosine_scaled_reward": -0.15331509709358215, "rewards/format_reward": 0.2569444552063942, "step": 9 }, { "completion_length": 3021.9722900390625, "epoch": 0.03424657534246575, "grad_norm": 0.1988951563835144, "kl": 0.00011277198791503906, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "reward": 0.13170428201556206, "reward_std": 0.41356146335601807, "rewards/cosine_scaled_reward": -0.16690683364868164, "rewards/format_reward": 0.298611119389534, "step": 10 }, { "completion_length": 2931.1319580078125, "epoch": 0.03767123287671233, "grad_norm": 0.18511006236076355, "kl": 0.0001304149627685547, "learning_rate": 3.666666666666666e-07, "loss": 0.0, "reward": 0.1742808436974883, "reward_std": 0.32126323878765106, "rewards/cosine_scaled_reward": -0.17294137924909592, "rewards/format_reward": 0.3472222238779068, "step": 11 }, { "completion_length": 2932.5555419921875, "epoch": 0.0410958904109589, "grad_norm": 0.16578702628612518, "kl": 0.00010228157043457031, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.19121046364307404, "reward_std": 0.3839987516403198, "rewards/cosine_scaled_reward": -0.16295619308948517, "rewards/format_reward": 0.3541666567325592, "step": 12 }, { "completion_length": 2798.6180419921875, "epoch": 0.04452054794520548, "grad_norm": 0.15695109963417053, "kl": 9.846687316894531e-05, "learning_rate": 4.3333333333333335e-07, "loss": 0.0, "reward": 0.40019115805625916, "reward_std": 0.49739648401737213, "rewards/cosine_scaled_reward": -0.05814217543229461, "rewards/format_reward": 0.4583333432674408, "step": 13 }, { "completion_length": 2636.548583984375, "epoch": 0.04794520547945205, "grad_norm": 0.19918033480644226, "kl": 0.00012230873107910156, "learning_rate": 4.6666666666666666e-07, "loss": 0.0, "reward": 0.5720034390687943, "reward_std": 0.46172526478767395, "rewards/cosine_scaled_reward": 0.009503423236310482, "rewards/format_reward": 0.5625, "step": 14 }, { "completion_length": 2823.763916015625, "epoch": 0.05136986301369863, "grad_norm": 0.13800376653671265, "kl": 0.00010466575622558594, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.46923990547657013, "reward_std": 0.5656772404909134, "rewards/cosine_scaled_reward": -0.037704543210566044, "rewards/format_reward": 0.5069444477558136, "step": 15 }, { "completion_length": 2824.2777099609375, "epoch": 0.0547945205479452, "grad_norm": 0.19206716120243073, "kl": 0.00011682510375976562, "learning_rate": 5.333333333333333e-07, "loss": 0.0, "reward": 0.23321796208620071, "reward_std": 0.41186313331127167, "rewards/cosine_scaled_reward": -0.10011539235711098, "rewards/format_reward": 0.3333333432674408, "step": 16 }, { "completion_length": 2954.1112060546875, "epoch": 0.05821917808219178, "grad_norm": 0.19295988976955414, "kl": 0.00010442733764648438, "learning_rate": 5.666666666666666e-07, "loss": 0.0, "reward": 0.28434962406754494, "reward_std": 0.5011897683143616, "rewards/cosine_scaled_reward": -0.07676149532198906, "rewards/format_reward": 0.361111119389534, "step": 17 }, { "completion_length": 2896.78466796875, "epoch": 0.06164383561643835, "grad_norm": 0.16591955721378326, "kl": 0.00012493133544921875, "learning_rate": 6e-07, "loss": 0.0, "reward": 0.1477920152246952, "reward_std": 0.3473651483654976, "rewards/cosine_scaled_reward": -0.1855413243174553, "rewards/format_reward": 0.3333333432674408, "step": 18 }, { "completion_length": 2883.125, "epoch": 0.06506849315068493, "grad_norm": 0.24647116661071777, "kl": 0.0001239776611328125, "learning_rate": 6.333333333333332e-07, "loss": 0.0, "reward": 0.23157373815774918, "reward_std": 0.3507264107465744, "rewards/cosine_scaled_reward": -0.10870405659079552, "rewards/format_reward": 0.3402777761220932, "step": 19 }, { "completion_length": 2961.7362060546875, "epoch": 0.0684931506849315, "grad_norm": 0.18728116154670715, "kl": 0.000133514404296875, "learning_rate": 6.666666666666666e-07, "loss": 0.0, "reward": 0.17506830394268036, "reward_std": 0.30466167628765106, "rewards/cosine_scaled_reward": -0.13743170350790024, "rewards/format_reward": 0.3125, "step": 20 }, { "completion_length": 2186.2501220703125, "epoch": 0.07191780821917808, "grad_norm": 0.21891450881958008, "kl": 0.00015282630920410156, "learning_rate": 7e-07, "loss": 0.0, "reward": 0.8545757234096527, "reward_std": 0.4611281454563141, "rewards/cosine_scaled_reward": 0.1392979435622692, "rewards/format_reward": 0.715277761220932, "step": 21 }, { "completion_length": 2804.2291259765625, "epoch": 0.07534246575342465, "grad_norm": 0.17504653334617615, "kl": 0.000141143798828125, "learning_rate": 7.333333333333332e-07, "loss": 0.0, "reward": 0.26354077458381653, "reward_std": 0.30501972138881683, "rewards/cosine_scaled_reward": -0.16007035970687866, "rewards/format_reward": 0.423611119389534, "step": 22 }, { "completion_length": 2849.8958740234375, "epoch": 0.07876712328767123, "grad_norm": 0.16187402606010437, "kl": 0.00014853477478027344, "learning_rate": 7.666666666666667e-07, "loss": 0.0, "reward": 0.2565463110804558, "reward_std": 0.4471036493778229, "rewards/cosine_scaled_reward": -0.11150925606489182, "rewards/format_reward": 0.3680555522441864, "step": 23 }, { "completion_length": 2847.166748046875, "epoch": 0.0821917808219178, "grad_norm": 0.1545354276895523, "kl": 0.0001323223114013672, "learning_rate": 8e-07, "loss": 0.0, "reward": 0.4566986709833145, "reward_std": 0.5350392460823059, "rewards/cosine_scaled_reward": -0.015523582696914673, "rewards/format_reward": 0.4722222238779068, "step": 24 }, { "completion_length": 2807.2708740234375, "epoch": 0.08561643835616438, "grad_norm": 0.18177877366542816, "kl": 0.00020647048950195312, "learning_rate": 8.333333333333333e-07, "loss": 0.0, "reward": 0.21164313331246376, "reward_std": 0.3955174386501312, "rewards/cosine_scaled_reward": -0.15641241893172264, "rewards/format_reward": 0.3680555522441864, "step": 25 }, { "completion_length": 2927.194580078125, "epoch": 0.08904109589041095, "grad_norm": 0.17393368482589722, "kl": 0.00020933151245117188, "learning_rate": 8.666666666666667e-07, "loss": 0.0, "reward": 0.370952308177948, "reward_std": 0.4646989554166794, "rewards/cosine_scaled_reward": -0.017936568707227707, "rewards/format_reward": 0.388888880610466, "step": 26 }, { "completion_length": 2989.7291259765625, "epoch": 0.09246575342465753, "grad_norm": 0.19562838971614838, "kl": 0.00015878677368164062, "learning_rate": 9e-07, "loss": 0.0, "reward": 0.10655032098293304, "reward_std": 0.4004325717687607, "rewards/cosine_scaled_reward": -0.20594968646764755, "rewards/format_reward": 0.3125, "step": 27 }, { "completion_length": 3012.5555419921875, "epoch": 0.0958904109589041, "grad_norm": 0.15751628577709198, "kl": 0.00021696090698242188, "learning_rate": 9.333333333333333e-07, "loss": 0.0, "reward": 0.3497147411108017, "reward_std": 0.6247645616531372, "rewards/cosine_scaled_reward": -0.0738963820040226, "rewards/format_reward": 0.423611119389534, "step": 28 }, { "completion_length": 3110.6181640625, "epoch": 0.09931506849315068, "grad_norm": 0.13668763637542725, "kl": 0.00023889541625976562, "learning_rate": 9.666666666666666e-07, "loss": 0.0, "reward": 0.10539204813539982, "reward_std": 0.30441631376743317, "rewards/cosine_scaled_reward": -0.17933017387986183, "rewards/format_reward": 0.2847222238779068, "step": 29 }, { "completion_length": 2887.2083740234375, "epoch": 0.10273972602739725, "grad_norm": 0.17280973494052887, "kl": 0.00022649765014648438, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.4436980187892914, "reward_std": 0.5404154658317566, "rewards/cosine_scaled_reward": 0.006197985261678696, "rewards/format_reward": 0.4375, "step": 30 }, { "completion_length": 2932.757080078125, "epoch": 0.10616438356164383, "grad_norm": 0.15570040047168732, "kl": 0.00044155120849609375, "learning_rate": 9.99969538601693e-07, "loss": 0.0, "reward": 0.16736368834972382, "reward_std": 0.4045102745294571, "rewards/cosine_scaled_reward": -0.15208075568079948, "rewards/format_reward": 0.3194444477558136, "step": 31 }, { "completion_length": 3094.4444580078125, "epoch": 0.1095890410958904, "grad_norm": 0.14696453511714935, "kl": 0.0003809928894042969, "learning_rate": 9.998781585307575e-07, "loss": 0.0, "reward": 0.1443577939644456, "reward_std": 0.5129190236330032, "rewards/cosine_scaled_reward": -0.16119776666164398, "rewards/format_reward": 0.3055555522441864, "step": 32 }, { "completion_length": 2546.0556640625, "epoch": 0.11301369863013698, "grad_norm": 0.1807711124420166, "kl": 0.00067138671875, "learning_rate": 9.997258721585931e-07, "loss": 0.0, "reward": 0.5040995180606842, "reward_std": 0.43710532784461975, "rewards/cosine_scaled_reward": 0.011043965816497803, "rewards/format_reward": 0.4930555671453476, "step": 33 }, { "completion_length": 2832.5902099609375, "epoch": 0.11643835616438356, "grad_norm": 0.16727592051029205, "kl": 0.000640869140625, "learning_rate": 9.99512700102336e-07, "loss": 0.0, "reward": 0.24528168514370918, "reward_std": 0.4029017984867096, "rewards/cosine_scaled_reward": -0.15749610401690006, "rewards/format_reward": 0.4027777910232544, "step": 34 }, { "completion_length": 2865.923583984375, "epoch": 0.11986301369863013, "grad_norm": 0.1456187218427658, "kl": 0.0006847381591796875, "learning_rate": 9.992386712220707e-07, "loss": 0.0, "reward": 0.27862629294395447, "reward_std": 0.37985116243362427, "rewards/cosine_scaled_reward": -0.11026258394122124, "rewards/format_reward": 0.3888888955116272, "step": 35 }, { "completion_length": 2459.1527709960938, "epoch": 0.1232876712328767, "grad_norm": 0.16684532165527344, "kl": 0.0010623931884765625, "learning_rate": 9.989038226169207e-07, "loss": 0.0, "reward": 0.5571352988481522, "reward_std": 0.4538237154483795, "rewards/cosine_scaled_reward": 0.015468628145754337, "rewards/format_reward": 0.5416666865348816, "step": 36 }, { "completion_length": 2674.763916015625, "epoch": 0.1267123287671233, "grad_norm": 0.15782181918621063, "kl": 0.001285552978515625, "learning_rate": 9.985081996200277e-07, "loss": 0.0001, "reward": 0.3798440098762512, "reward_std": 0.48301415145397186, "rewards/cosine_scaled_reward": -0.08543377462774515, "rewards/format_reward": 0.4652777910232544, "step": 37 }, { "completion_length": 2692.986083984375, "epoch": 0.13013698630136986, "grad_norm": 0.16332699358463287, "kl": 0.0014743804931640625, "learning_rate": 9.98051855792412e-07, "loss": 0.0001, "reward": 0.38910074532032013, "reward_std": 0.3475951850414276, "rewards/cosine_scaled_reward": -0.03451040526852012, "rewards/format_reward": 0.423611119389534, "step": 38 }, { "completion_length": 3102.4930419921875, "epoch": 0.13356164383561644, "grad_norm": 0.1224745586514473, "kl": 0.00080108642578125, "learning_rate": 9.975348529157229e-07, "loss": 0.0, "reward": 0.22906357236206532, "reward_std": 0.5051266252994537, "rewards/cosine_scaled_reward": -0.10426976904273033, "rewards/format_reward": 0.3333333358168602, "step": 39 }, { "completion_length": 2613.0347900390625, "epoch": 0.136986301369863, "grad_norm": 0.2610801160335541, "kl": 0.00220489501953125, "learning_rate": 9.969572609838744e-07, "loss": 0.0001, "reward": 0.39406873285770416, "reward_std": 0.5201945602893829, "rewards/cosine_scaled_reward": -0.13370903208851814, "rewards/format_reward": 0.5277777910232544, "step": 40 }, { "completion_length": 2810.84033203125, "epoch": 0.1404109589041096, "grad_norm": 0.1520536243915558, "kl": 0.00159454345703125, "learning_rate": 9.963191581935677e-07, "loss": 0.0001, "reward": 0.26414267159998417, "reward_std": 0.39119474589824677, "rewards/cosine_scaled_reward": -0.12474621459841728, "rewards/format_reward": 0.3888888955116272, "step": 41 }, { "completion_length": 2832.3958740234375, "epoch": 0.14383561643835616, "grad_norm": 0.16354550421237946, "kl": 0.001583099365234375, "learning_rate": 9.956206309337066e-07, "loss": 0.0001, "reward": 0.1603957675397396, "reward_std": 0.2545732408761978, "rewards/cosine_scaled_reward": -0.19377091526985168, "rewards/format_reward": 0.3541666716337204, "step": 42 }, { "completion_length": 2594.4166259765625, "epoch": 0.14726027397260275, "grad_norm": 0.1575089693069458, "kl": 0.002162933349609375, "learning_rate": 9.948617737737001e-07, "loss": 0.0001, "reward": 0.4924927353858948, "reward_std": 0.4764625281095505, "rewards/cosine_scaled_reward": -0.05611838772892952, "rewards/format_reward": 0.5486111044883728, "step": 43 }, { "completion_length": 3142.638916015625, "epoch": 0.1506849315068493, "grad_norm": 0.13056397438049316, "kl": 0.0015392303466796875, "learning_rate": 9.940426894506606e-07, "loss": 0.0001, "reward": 0.18407823704183102, "reward_std": 0.40585757791996, "rewards/cosine_scaled_reward": -0.0867550881812349, "rewards/format_reward": 0.2708333432674408, "step": 44 }, { "completion_length": 2841.7501220703125, "epoch": 0.1541095890410959, "grad_norm": 0.1490578055381775, "kl": 0.0020904541015625, "learning_rate": 9.931634888554935e-07, "loss": 0.0001, "reward": 0.24261729046702385, "reward_std": 0.4036310315132141, "rewards/cosine_scaled_reward": -0.11849382892251015, "rewards/format_reward": 0.3611111119389534, "step": 45 }, { "completion_length": 2715.6944580078125, "epoch": 0.15753424657534246, "grad_norm": 0.18108582496643066, "kl": 0.00254058837890625, "learning_rate": 9.922242910178859e-07, "loss": 0.0001, "reward": 0.40730662643909454, "reward_std": 0.5260264724493027, "rewards/cosine_scaled_reward": -0.05797116830945015, "rewards/format_reward": 0.4652777761220932, "step": 46 }, { "completion_length": 2995.9862060546875, "epoch": 0.16095890410958905, "grad_norm": 0.23772801458835602, "kl": 0.0029754638671875, "learning_rate": 9.912252230901906e-07, "loss": 0.0001, "reward": 0.28154853731393814, "reward_std": 0.44985754787921906, "rewards/cosine_scaled_reward": -0.0726181073114276, "rewards/format_reward": 0.3541666641831398, "step": 47 }, { "completion_length": 2810.7222900390625, "epoch": 0.1643835616438356, "grad_norm": 0.1889384686946869, "kl": 0.00370025634765625, "learning_rate": 9.901664203302124e-07, "loss": 0.0001, "reward": 0.3217255771160126, "reward_std": 0.5001529008150101, "rewards/cosine_scaled_reward": -0.08105220319703221, "rewards/format_reward": 0.4027777761220932, "step": 48 }, { "completion_length": 2650.09033203125, "epoch": 0.1678082191780822, "grad_norm": 0.13828006386756897, "kl": 0.00315093994140625, "learning_rate": 9.890480260828965e-07, "loss": 0.0001, "reward": 0.37053926289081573, "reward_std": 0.40743280947208405, "rewards/cosine_scaled_reward": -0.12251630332320929, "rewards/format_reward": 0.4930555522441864, "step": 49 }, { "completion_length": 2785.84716796875, "epoch": 0.17123287671232876, "grad_norm": 0.16287721693515778, "kl": 0.00333404541015625, "learning_rate": 9.878701917609207e-07, "loss": 0.0001, "reward": 0.3723617196083069, "reward_std": 0.4880683571100235, "rewards/cosine_scaled_reward": -0.09291603974997997, "rewards/format_reward": 0.4652777910232544, "step": 50 }, { "completion_length": 2878.1181640625, "epoch": 0.17465753424657535, "grad_norm": 0.13556380569934845, "kl": 0.0032501220703125, "learning_rate": 9.866330768241983e-07, "loss": 0.0001, "reward": 0.2918383926153183, "reward_std": 0.3286859840154648, "rewards/cosine_scaled_reward": -0.1109393835067749, "rewards/format_reward": 0.4027777910232544, "step": 51 }, { "completion_length": 2817.638916015625, "epoch": 0.1780821917808219, "grad_norm": 0.16504673659801483, "kl": 0.0037384033203125, "learning_rate": 9.853368487582886e-07, "loss": 0.0001, "reward": 0.23075967282056808, "reward_std": 0.3352076858282089, "rewards/cosine_scaled_reward": -0.08868478238582611, "rewards/format_reward": 0.3194444477558136, "step": 52 }, { "completion_length": 2916.1112060546875, "epoch": 0.1815068493150685, "grad_norm": 0.13884864747524261, "kl": 0.00319671630859375, "learning_rate": 9.839816830517225e-07, "loss": 0.0001, "reward": 0.28766703605651855, "reward_std": 0.39385148882865906, "rewards/cosine_scaled_reward": -0.09427741169929504, "rewards/format_reward": 0.3819444477558136, "step": 53 }, { "completion_length": 2991.6041259765625, "epoch": 0.18493150684931506, "grad_norm": 0.1436394602060318, "kl": 0.0031585693359375, "learning_rate": 9.825677631722435e-07, "loss": 0.0001, "reward": 0.35537297278642654, "reward_std": 0.39448249340057373, "rewards/cosine_scaled_reward": -0.04740479774773121, "rewards/format_reward": 0.4027777910232544, "step": 54 }, { "completion_length": 2785.173583984375, "epoch": 0.18835616438356165, "grad_norm": 0.18955738842487335, "kl": 0.0048980712890625, "learning_rate": 9.8109528054197e-07, "loss": 0.0002, "reward": 0.2277716025710106, "reward_std": 0.4378567188978195, "rewards/cosine_scaled_reward": -0.1263950616121292, "rewards/format_reward": 0.3541666716337204, "step": 55 }, { "completion_length": 2579.388916015625, "epoch": 0.1917808219178082, "grad_norm": 0.1459513008594513, "kl": 0.003570556640625, "learning_rate": 9.795644345114794e-07, "loss": 0.0001, "reward": 0.35765571892261505, "reward_std": 0.39826689660549164, "rewards/cosine_scaled_reward": -0.11456651613116264, "rewards/format_reward": 0.4722222238779068, "step": 56 }, { "completion_length": 2586.9305419921875, "epoch": 0.1952054794520548, "grad_norm": 0.20002588629722595, "kl": 0.00665283203125, "learning_rate": 9.779754323328192e-07, "loss": 0.0003, "reward": 0.3011641800403595, "reward_std": 0.397590771317482, "rewards/cosine_scaled_reward": -0.1641135960817337, "rewards/format_reward": 0.4652777761220932, "step": 57 }, { "completion_length": 2786.3333740234375, "epoch": 0.19863013698630136, "grad_norm": 0.15498584508895874, "kl": 0.00461578369140625, "learning_rate": 9.76328489131448e-07, "loss": 0.0002, "reward": 0.3455113209784031, "reward_std": 0.38071802258491516, "rewards/cosine_scaled_reward": -0.043377578258514404, "rewards/format_reward": 0.3888889029622078, "step": 58 }, { "completion_length": 2466.763916015625, "epoch": 0.20205479452054795, "grad_norm": 0.14243252575397491, "kl": 0.0069580078125, "learning_rate": 9.746238278771125e-07, "loss": 0.0003, "reward": 0.5345294326543808, "reward_std": 0.5026020854711533, "rewards/cosine_scaled_reward": -0.021026152186095715, "rewards/format_reward": 0.5555555522441864, "step": 59 }, { "completion_length": 2819.2083740234375, "epoch": 0.2054794520547945, "grad_norm": 0.13467197120189667, "kl": 0.005340576171875, "learning_rate": 9.728616793536587e-07, "loss": 0.0002, "reward": 0.32512621581554413, "reward_std": 0.3411554992198944, "rewards/cosine_scaled_reward": -0.07070711255073547, "rewards/format_reward": 0.3958333432674408, "step": 60 }, { "completion_length": 2741.6944580078125, "epoch": 0.2089041095890411, "grad_norm": 0.1468200534582138, "kl": 0.0046539306640625, "learning_rate": 9.71042282127789e-07, "loss": 0.0002, "reward": 0.4192769527435303, "reward_std": 0.4629937559366226, "rewards/cosine_scaled_reward": -0.10850081220269203, "rewards/format_reward": 0.5277777910232544, "step": 61 }, { "completion_length": 2945.27783203125, "epoch": 0.21232876712328766, "grad_norm": 0.14704585075378418, "kl": 0.00518798828125, "learning_rate": 9.69165882516764e-07, "loss": 0.0002, "reward": 0.35519421100616455, "reward_std": 0.49601832032203674, "rewards/cosine_scaled_reward": -0.019805820658802986, "rewards/format_reward": 0.375, "step": 62 }, { "completion_length": 2946.166748046875, "epoch": 0.21575342465753425, "grad_norm": 0.14856047928333282, "kl": 0.004547119140625, "learning_rate": 9.672327345550543e-07, "loss": 0.0002, "reward": 0.14313821494579315, "reward_std": 0.30480653047561646, "rewards/cosine_scaled_reward": -0.16241733357310295, "rewards/format_reward": 0.305555559694767, "step": 63 }, { "completion_length": 2631.2708740234375, "epoch": 0.2191780821917808, "grad_norm": 0.1544739007949829, "kl": 0.0066070556640625, "learning_rate": 9.65243099959949e-07, "loss": 0.0003, "reward": 0.3399986997246742, "reward_std": 0.41406454145908356, "rewards/cosine_scaled_reward": -0.0905568664893508, "rewards/format_reward": 0.4305555522441864, "step": 64 }, { "completion_length": 2305.416748046875, "epoch": 0.2226027397260274, "grad_norm": 0.18752287328243256, "kl": 0.011260986328125, "learning_rate": 9.631972480961233e-07, "loss": 0.0005, "reward": 0.43726037442684174, "reward_std": 0.25909677147865295, "rewards/cosine_scaled_reward": -0.06273962743580341, "rewards/format_reward": 0.5000000149011612, "step": 65 }, { "completion_length": 2983.9444580078125, "epoch": 0.22602739726027396, "grad_norm": 0.11286786943674088, "kl": 0.00555419921875, "learning_rate": 9.610954559391704e-07, "loss": 0.0002, "reward": 0.2669789642095566, "reward_std": 0.39550328254699707, "rewards/cosine_scaled_reward": -0.12190994247794151, "rewards/format_reward": 0.3888888955116272, "step": 66 }, { "completion_length": 2817.4652099609375, "epoch": 0.22945205479452055, "grad_norm": 0.12778045237064362, "kl": 0.006805419921875, "learning_rate": 9.589380080381038e-07, "loss": 0.0003, "reward": 0.37828030437231064, "reward_std": 0.372231587767601, "rewards/cosine_scaled_reward": -0.05921970750205219, "rewards/format_reward": 0.4375000149011612, "step": 67 }, { "completion_length": 2652.541748046875, "epoch": 0.2328767123287671, "grad_norm": 0.21021802723407745, "kl": 0.009674072265625, "learning_rate": 9.567251964768342e-07, "loss": 0.0004, "reward": 0.36411239206790924, "reward_std": 0.4231058210134506, "rewards/cosine_scaled_reward": -0.04560980945825577, "rewards/format_reward": 0.4097222238779068, "step": 68 }, { "completion_length": 2930.09716796875, "epoch": 0.2363013698630137, "grad_norm": 0.13323834538459778, "kl": 0.0063323974609375, "learning_rate": 9.54457320834625e-07, "loss": 0.0003, "reward": 0.2518671154975891, "reward_std": 0.44373343884944916, "rewards/cosine_scaled_reward": -0.10924399271607399, "rewards/format_reward": 0.361111119389534, "step": 69 }, { "completion_length": 2892.1041259765625, "epoch": 0.23972602739726026, "grad_norm": 0.1291056126356125, "kl": 0.0074005126953125, "learning_rate": 9.521346881455354e-07, "loss": 0.0003, "reward": 0.32329247891902924, "reward_std": 0.2955201715230942, "rewards/cosine_scaled_reward": -0.04476307414006442, "rewards/format_reward": 0.3680555522441864, "step": 70 }, { "completion_length": 3238.2847900390625, "epoch": 0.24315068493150685, "grad_norm": 0.12446684390306473, "kl": 0.0060577392578125, "learning_rate": 9.497576128568518e-07, "loss": 0.0002, "reward": 0.127724077552557, "reward_std": 0.38546572625637054, "rewards/cosine_scaled_reward": -0.129220362752676, "rewards/format_reward": 0.256944440305233, "step": 71 }, { "completion_length": 2898.52783203125, "epoch": 0.2465753424657534, "grad_norm": 0.12787607312202454, "kl": 0.007537841796875, "learning_rate": 9.473264167865171e-07, "loss": 0.0003, "reward": 0.21395007893443108, "reward_std": 0.3605284094810486, "rewards/cosine_scaled_reward": -0.14716103300452232, "rewards/format_reward": 0.361111119389534, "step": 72 }, { "completion_length": 2566.1666259765625, "epoch": 0.25, "grad_norm": 0.16552697122097015, "kl": 0.010162353515625, "learning_rate": 9.448414290795618e-07, "loss": 0.0004, "reward": 0.5014020800590515, "reward_std": 0.3501627743244171, "rewards/cosine_scaled_reward": 0.015290968120098114, "rewards/format_reward": 0.486111119389534, "step": 73 }, { "completion_length": 2927.8125, "epoch": 0.2534246575342466, "grad_norm": 0.1325031816959381, "kl": 0.007843017578125, "learning_rate": 9.42302986163543e-07, "loss": 0.0003, "reward": 0.20612417813390493, "reward_std": 0.38437697291374207, "rewards/cosine_scaled_reward": -0.1758202537894249, "rewards/format_reward": 0.3819444477558136, "step": 74 }, { "completion_length": 2712.4583740234375, "epoch": 0.2568493150684932, "grad_norm": 0.1481809765100479, "kl": 0.0088043212890625, "learning_rate": 9.397114317029974e-07, "loss": 0.0004, "reward": 0.46940816938877106, "reward_std": 0.4842826575040817, "rewards/cosine_scaled_reward": -0.002814057283103466, "rewards/format_reward": 0.472222238779068, "step": 75 }, { "completion_length": 2792.3055419921875, "epoch": 0.2602739726027397, "grad_norm": 0.22191229462623596, "kl": 0.0104827880859375, "learning_rate": 9.370671165529144e-07, "loss": 0.0004, "reward": 0.3087327107787132, "reward_std": 0.3974086493253708, "rewards/cosine_scaled_reward": -0.07321174256503582, "rewards/format_reward": 0.3819444477558136, "step": 76 }, { "completion_length": 2848.0625, "epoch": 0.2636986301369863, "grad_norm": 0.14868301153182983, "kl": 0.009918212890625, "learning_rate": 9.343703987112365e-07, "loss": 0.0004, "reward": 0.361818864941597, "reward_std": 0.47596603631973267, "rewards/cosine_scaled_reward": -0.09651443734765053, "rewards/format_reward": 0.4583333432674408, "step": 77 }, { "completion_length": 3004.96533203125, "epoch": 0.2671232876712329, "grad_norm": 0.1331622451543808, "kl": 0.007659912109375, "learning_rate": 9.316216432703916e-07, "loss": 0.0003, "reward": 0.4262048453092575, "reward_std": 0.38969628512859344, "rewards/cosine_scaled_reward": 0.030371490865945816, "rewards/format_reward": 0.3958333283662796, "step": 78 }, { "completion_length": 2514.673583984375, "epoch": 0.2705479452054795, "grad_norm": 0.15792632102966309, "kl": 0.010833740234375, "learning_rate": 9.288212223678658e-07, "loss": 0.0004, "reward": 0.517068013548851, "reward_std": 0.4114367365837097, "rewards/cosine_scaled_reward": -0.003765310626477003, "rewards/format_reward": 0.5208333283662796, "step": 79 }, { "completion_length": 2929.9652099609375, "epoch": 0.273972602739726, "grad_norm": 0.16112865507602692, "kl": 0.009185791015625, "learning_rate": 9.259695151358214e-07, "loss": 0.0004, "reward": 0.22491584718227386, "reward_std": 0.3951188027858734, "rewards/cosine_scaled_reward": -0.11536196433007717, "rewards/format_reward": 0.3402777910232544, "step": 80 }, { "completion_length": 2740.84716796875, "epoch": 0.2773972602739726, "grad_norm": 0.16842201352119446, "kl": 0.012237548828125, "learning_rate": 9.230669076497687e-07, "loss": 0.0005, "reward": 0.4160406142473221, "reward_std": 0.47858820855617523, "rewards/cosine_scaled_reward": -0.007570529356598854, "rewards/format_reward": 0.423611119389534, "step": 81 }, { "completion_length": 2845.5069580078125, "epoch": 0.2808219178082192, "grad_norm": 0.20275162160396576, "kl": 0.009857177734375, "learning_rate": 9.20113792876298e-07, "loss": 0.0004, "reward": 0.3008575513958931, "reward_std": 0.46689480543136597, "rewards/cosine_scaled_reward": -0.088031355291605, "rewards/format_reward": 0.3888888955116272, "step": 82 }, { "completion_length": 2746.25, "epoch": 0.2842465753424658, "grad_norm": 0.186171293258667, "kl": 0.013397216796875, "learning_rate": 9.171105706198774e-07, "loss": 0.0005, "reward": 0.2251388169825077, "reward_std": 0.49258220195770264, "rewards/cosine_scaled_reward": -0.1568056344985962, "rewards/format_reward": 0.3819444477558136, "step": 83 }, { "completion_length": 2873.1805419921875, "epoch": 0.2876712328767123, "grad_norm": 0.1654582917690277, "kl": 0.00982666015625, "learning_rate": 9.140576474687263e-07, "loss": 0.0004, "reward": 0.33728019893169403, "reward_std": 0.4015253335237503, "rewards/cosine_scaled_reward": -0.05855315364897251, "rewards/format_reward": 0.3958333432674408, "step": 84 }, { "completion_length": 2642.9376220703125, "epoch": 0.2910958904109589, "grad_norm": 0.1457146555185318, "kl": 0.01275634765625, "learning_rate": 9.109554367397697e-07, "loss": 0.0005, "reward": 0.5559843331575394, "reward_std": 0.39349929988384247, "rewards/cosine_scaled_reward": 0.0004287753254175186, "rewards/format_reward": 0.5555555522441864, "step": 85 }, { "completion_length": 2919.9027099609375, "epoch": 0.2945205479452055, "grad_norm": 0.12657979130744934, "kl": 0.0092926025390625, "learning_rate": 9.078043584226815e-07, "loss": 0.0004, "reward": 0.4835028350353241, "reward_std": 0.4722355157136917, "rewards/cosine_scaled_reward": -0.009552719071507454, "rewards/format_reward": 0.4930555671453476, "step": 86 }, { "completion_length": 2724.104248046875, "epoch": 0.2979452054794521, "grad_norm": 0.16625893115997314, "kl": 0.0166015625, "learning_rate": 9.046048391230247e-07, "loss": 0.0007, "reward": 0.23857301846146584, "reward_std": 0.30345526337623596, "rewards/cosine_scaled_reward": -0.09476033598184586, "rewards/format_reward": 0.3333333283662796, "step": 87 }, { "completion_length": 3015.486083984375, "epoch": 0.3013698630136986, "grad_norm": 0.15486519038677216, "kl": 0.01129150390625, "learning_rate": 9.013573120044966e-07, "loss": 0.0005, "reward": 0.197230139747262, "reward_std": 0.4105876684188843, "rewards/cosine_scaled_reward": -0.15693652629852295, "rewards/format_reward": 0.3541666716337204, "step": 88 }, { "completion_length": 3045.7708740234375, "epoch": 0.3047945205479452, "grad_norm": 0.11665515601634979, "kl": 0.00848388671875, "learning_rate": 8.980622167302837e-07, "loss": 0.0003, "reward": 0.2519669234752655, "reward_std": 0.37049752473831177, "rewards/cosine_scaled_reward": -0.07442197389900684, "rewards/format_reward": 0.3263888955116272, "step": 89 }, { "completion_length": 2415.263916015625, "epoch": 0.3082191780821918, "grad_norm": 0.14136534929275513, "kl": 0.01422119140625, "learning_rate": 8.9471999940354e-07, "loss": 0.0006, "reward": 0.4783553332090378, "reward_std": 0.37421566247940063, "rewards/cosine_scaled_reward": -0.05636689253151417, "rewards/format_reward": 0.5347222238779068, "step": 90 }, { "completion_length": 3079.4166259765625, "epoch": 0.3116438356164384, "grad_norm": 0.1431264877319336, "kl": 0.011749267578125, "learning_rate": 8.91331112506991e-07, "loss": 0.0005, "reward": 0.1510103940963745, "reward_std": 0.4773586541414261, "rewards/cosine_scaled_reward": -0.1684340313076973, "rewards/format_reward": 0.3194444477558136, "step": 91 }, { "completion_length": 2879.6458740234375, "epoch": 0.3150684931506849, "grad_norm": 0.16839326918125153, "kl": 0.013824462890625, "learning_rate": 8.878960148416747e-07, "loss": 0.0006, "reward": 0.36638200283050537, "reward_std": 0.4755648225545883, "rewards/cosine_scaled_reward": -0.07111799996346235, "rewards/format_reward": 0.4375, "step": 92 }, { "completion_length": 2882.326416015625, "epoch": 0.3184931506849315, "grad_norm": 0.14426778256893158, "kl": 0.01336669921875, "learning_rate": 8.844151714648274e-07, "loss": 0.0005, "reward": 0.41798925399780273, "reward_std": 0.3813701719045639, "rewards/cosine_scaled_reward": 0.015211460180580616, "rewards/format_reward": 0.4027777910232544, "step": 93 }, { "completion_length": 2881.97216796875, "epoch": 0.3219178082191781, "grad_norm": 0.13846907019615173, "kl": 0.012542724609375, "learning_rate": 8.808890536269229e-07, "loss": 0.0005, "reward": 0.3407672867178917, "reward_std": 0.5038859099149704, "rewards/cosine_scaled_reward": -0.05506602302193642, "rewards/format_reward": 0.3958333432674408, "step": 94 }, { "completion_length": 3092.4375, "epoch": 0.3253424657534247, "grad_norm": 0.1459702104330063, "kl": 0.011962890625, "learning_rate": 8.773181387078719e-07, "loss": 0.0005, "reward": 0.0870569609105587, "reward_std": 0.36285020411014557, "rewards/cosine_scaled_reward": -0.1837763711810112, "rewards/format_reward": 0.2708333283662796, "step": 95 }, { "completion_length": 2850.486083984375, "epoch": 0.3287671232876712, "grad_norm": 0.16794843971729279, "kl": 0.016632080078125, "learning_rate": 8.737029101523929e-07, "loss": 0.0007, "reward": 0.2914813682436943, "reward_std": 0.40039965510368347, "rewards/cosine_scaled_reward": -0.12518527917563915, "rewards/format_reward": 0.4166666716337204, "step": 96 }, { "completion_length": 2382.861083984375, "epoch": 0.3321917808219178, "grad_norm": 0.17770880460739136, "kl": 0.01910400390625, "learning_rate": 8.700438574045617e-07, "loss": 0.0008, "reward": 0.5590852797031403, "reward_std": 0.3997005224227905, "rewards/cosine_scaled_reward": -0.010359160602092743, "rewards/format_reward": 0.5694444477558136, "step": 97 }, { "completion_length": 2623.6181640625, "epoch": 0.3356164383561644, "grad_norm": 0.14203575253486633, "kl": 0.0145263671875, "learning_rate": 8.663414758415478e-07, "loss": 0.0006, "reward": 0.4303872585296631, "reward_std": 0.41416965425014496, "rewards/cosine_scaled_reward": -0.055723853409290314, "rewards/format_reward": 0.486111119389534, "step": 98 }, { "completion_length": 2726.8056640625, "epoch": 0.339041095890411, "grad_norm": 0.14974938333034515, "kl": 0.014617919921875, "learning_rate": 8.625962667065487e-07, "loss": 0.0006, "reward": 0.5150108188390732, "reward_std": 0.42872530221939087, "rewards/cosine_scaled_reward": 0.02889970690011978, "rewards/format_reward": 0.486111119389534, "step": 99 }, { "completion_length": 2724.8472900390625, "epoch": 0.3424657534246575, "grad_norm": 0.15371288359165192, "kl": 0.015380859375, "learning_rate": 8.588087370409302e-07, "loss": 0.0006, "reward": 0.31959572434425354, "reward_std": 0.4421093314886093, "rewards/cosine_scaled_reward": -0.06234869919717312, "rewards/format_reward": 0.3819444477558136, "step": 100 }, { "completion_length": 2685.0069580078125, "epoch": 0.3458904109589041, "grad_norm": 0.15011949837207794, "kl": 0.017578125, "learning_rate": 8.549793996155795e-07, "loss": 0.0007, "reward": 0.28529858589172363, "reward_std": 0.3051328808069229, "rewards/cosine_scaled_reward": -0.14525696635246277, "rewards/format_reward": 0.4305555522441864, "step": 101 }, { "completion_length": 2744.3056640625, "epoch": 0.3493150684931507, "grad_norm": 0.16087760031223297, "kl": 0.015289306640625, "learning_rate": 8.511087728614862e-07, "loss": 0.0006, "reward": 0.4498697370290756, "reward_std": 0.459157794713974, "rewards/cosine_scaled_reward": 0.01931417128071189, "rewards/format_reward": 0.4305555522441864, "step": 102 }, { "completion_length": 2870.5208740234375, "epoch": 0.3527397260273973, "grad_norm": 0.1468544751405716, "kl": 0.01470947265625, "learning_rate": 8.471973807995534e-07, "loss": 0.0006, "reward": 0.3780040740966797, "reward_std": 0.5862710475921631, "rewards/cosine_scaled_reward": -0.08727369178086519, "rewards/format_reward": 0.4652777761220932, "step": 103 }, { "completion_length": 2846.5069580078125, "epoch": 0.3561643835616438, "grad_norm": 0.1548270583152771, "kl": 0.018310546875, "learning_rate": 8.432457529696548e-07, "loss": 0.0007, "reward": 0.35155677795410156, "reward_std": 0.4553772062063217, "rewards/cosine_scaled_reward": -0.0720543134957552, "rewards/format_reward": 0.423611119389534, "step": 104 }, { "completion_length": 2724.076416015625, "epoch": 0.3595890410958904, "grad_norm": 0.1372641921043396, "kl": 0.01708984375, "learning_rate": 8.392544243589427e-07, "loss": 0.0007, "reward": 0.23305706679821014, "reward_std": 0.36483894288539886, "rewards/cosine_scaled_reward": -0.16277626901865005, "rewards/format_reward": 0.3958333283662796, "step": 105 }, { "completion_length": 2911.5069580078125, "epoch": 0.363013698630137, "grad_norm": 0.173682302236557, "kl": 0.0185546875, "learning_rate": 8.352239353294194e-07, "loss": 0.0007, "reward": 0.3065890637226403, "reward_std": 0.30855099856853485, "rewards/cosine_scaled_reward": -0.03368869423866272, "rewards/format_reward": 0.3402777761220932, "step": 106 }, { "completion_length": 3053.5555419921875, "epoch": 0.3664383561643836, "grad_norm": 0.1345294862985611, "kl": 0.01544189453125, "learning_rate": 8.31154831544782e-07, "loss": 0.0006, "reward": 0.28517407923936844, "reward_std": 0.4504627585411072, "rewards/cosine_scaled_reward": -0.11065925285220146, "rewards/format_reward": 0.3958333283662796, "step": 107 }, { "completion_length": 3053.4583740234375, "epoch": 0.3698630136986301, "grad_norm": 0.12478786706924438, "kl": 0.015869140625, "learning_rate": 8.270476638965461e-07, "loss": 0.0006, "reward": 0.18328540516085923, "reward_std": 0.31476570665836334, "rewards/cosine_scaled_reward": -0.14310350455343723, "rewards/format_reward": 0.3263888880610466, "step": 108 }, { "completion_length": 2813.4444580078125, "epoch": 0.3732876712328767, "grad_norm": 0.13302823901176453, "kl": 0.013427734375, "learning_rate": 8.229029884294662e-07, "loss": 0.0005, "reward": 0.4454272836446762, "reward_std": 0.4187168627977371, "rewards/cosine_scaled_reward": 0.0009828601032495499, "rewards/format_reward": 0.4444444477558136, "step": 109 }, { "completion_length": 3167.1875, "epoch": 0.3767123287671233, "grad_norm": 0.13403479754924774, "kl": 0.013946533203125, "learning_rate": 8.187213662662538e-07, "loss": 0.0006, "reward": 0.12785961106419563, "reward_std": 0.4516500234603882, "rewards/cosine_scaled_reward": -0.19158484041690826, "rewards/format_reward": 0.3194444477558136, "step": 110 }, { "completion_length": 3124.513916015625, "epoch": 0.3801369863013699, "grad_norm": 0.13928255438804626, "kl": 0.0164794921875, "learning_rate": 8.145033635316128e-07, "loss": 0.0007, "reward": 0.1088075079023838, "reward_std": 0.40094128996133804, "rewards/cosine_scaled_reward": -0.1828591525554657, "rewards/format_reward": 0.291666679084301, "step": 111 }, { "completion_length": 3192.8194580078125, "epoch": 0.3835616438356164, "grad_norm": 0.12524190545082092, "kl": 0.015838623046875, "learning_rate": 8.102495512755938e-07, "loss": 0.0006, "reward": 0.16609879583120346, "reward_std": 0.4136479049921036, "rewards/cosine_scaled_reward": -0.18806789070367813, "rewards/format_reward": 0.3541666716337204, "step": 112 }, { "completion_length": 3000.625, "epoch": 0.386986301369863, "grad_norm": 0.15665502846240997, "kl": 0.01593017578125, "learning_rate": 8.059605053962833e-07, "loss": 0.0006, "reward": 0.34966103732585907, "reward_std": 0.5462008714675903, "rewards/cosine_scaled_reward": -0.025338975712656975, "rewards/format_reward": 0.375, "step": 113 }, { "completion_length": 3111.944580078125, "epoch": 0.3904109589041096, "grad_norm": 0.12206412851810455, "kl": 0.0172119140625, "learning_rate": 8.01636806561836e-07, "loss": 0.0007, "reward": 0.3253851607441902, "reward_std": 0.46508027613162994, "rewards/cosine_scaled_reward": -0.021837057545781136, "rewards/format_reward": 0.3472222238779068, "step": 114 }, { "completion_length": 2817.8333740234375, "epoch": 0.3938356164383562, "grad_norm": 0.15154601633548737, "kl": 0.01849365234375, "learning_rate": 7.972790401318627e-07, "loss": 0.0007, "reward": 0.19523771665990353, "reward_std": 0.2818225920200348, "rewards/cosine_scaled_reward": -0.17281781509518623, "rewards/format_reward": 0.368055559694767, "step": 115 }, { "completion_length": 3147.47216796875, "epoch": 0.3972602739726027, "grad_norm": 0.11532028764486313, "kl": 0.013916015625, "learning_rate": 7.928877960781808e-07, "loss": 0.0006, "reward": 0.21391746401786804, "reward_std": 0.4905927777290344, "rewards/cosine_scaled_reward": -0.0777492057532072, "rewards/format_reward": 0.2916666716337204, "step": 116 }, { "completion_length": 2670.5625, "epoch": 0.4006849315068493, "grad_norm": 0.16773977875709534, "kl": 0.0164794921875, "learning_rate": 7.884636689049422e-07, "loss": 0.0007, "reward": 0.4450993984937668, "reward_std": 0.4119822680950165, "rewards/cosine_scaled_reward": -0.08962283562868834, "rewards/format_reward": 0.5347222238779068, "step": 117 }, { "completion_length": 3084.826416015625, "epoch": 0.4041095890410959, "grad_norm": 0.13086926937103271, "kl": 0.0169677734375, "learning_rate": 7.840072575681468e-07, "loss": 0.0007, "reward": 0.32114290446043015, "reward_std": 0.3112673908472061, "rewards/cosine_scaled_reward": -0.033023773692548275, "rewards/format_reward": 0.3541666716337204, "step": 118 }, { "completion_length": 3206.5625, "epoch": 0.4075342465753425, "grad_norm": 0.1529916226863861, "kl": 0.02081298828125, "learning_rate": 7.795191653945538e-07, "loss": 0.0008, "reward": 0.08010175824165344, "reward_std": 0.2550045773386955, "rewards/cosine_scaled_reward": -0.14906491339206696, "rewards/format_reward": 0.2291666641831398, "step": 119 }, { "completion_length": 2953.4097900390625, "epoch": 0.410958904109589, "grad_norm": 0.1457367241382599, "kl": 0.01806640625, "learning_rate": 7.75e-07, "loss": 0.0007, "reward": 0.4063476175069809, "reward_std": 0.4592422544956207, "rewards/cosine_scaled_reward": -0.024207940325140953, "rewards/format_reward": 0.4305555671453476, "step": 120 }, { "completion_length": 2701.7569580078125, "epoch": 0.4143835616438356, "grad_norm": 0.16445693373680115, "kl": 0.0205078125, "learning_rate": 7.704503732071391e-07, "loss": 0.0008, "reward": 0.5053235739469528, "reward_std": 0.5655853599309921, "rewards/cosine_scaled_reward": -0.015509757213294506, "rewards/format_reward": 0.5208333432674408, "step": 121 }, { "completion_length": 3087.791748046875, "epoch": 0.4178082191780822, "grad_norm": 0.11365609616041183, "kl": 0.015625, "learning_rate": 7.658709009626109e-07, "loss": 0.0006, "reward": 0.6354653835296631, "reward_std": 0.5989937484264374, "rewards/cosine_scaled_reward": 0.16324318200349808, "rewards/format_reward": 0.472222238779068, "step": 122 }, { "completion_length": 2501.3958740234375, "epoch": 0.4212328767123288, "grad_norm": 0.14529232680797577, "kl": 0.0194091796875, "learning_rate": 7.612622032536507e-07, "loss": 0.0008, "reward": 0.4810824617743492, "reward_std": 0.4910588413476944, "rewards/cosine_scaled_reward": -0.06058423314243555, "rewards/format_reward": 0.5416666567325592, "step": 123 }, { "completion_length": 3121.854248046875, "epoch": 0.4246575342465753, "grad_norm": 0.11533054709434509, "kl": 0.0169677734375, "learning_rate": 7.566249040241553e-07, "loss": 0.0007, "reward": 0.26003750413656235, "reward_std": 0.44601309299468994, "rewards/cosine_scaled_reward": -0.0732958409935236, "rewards/format_reward": 0.3333333283662796, "step": 124 }, { "completion_length": 2981.65283203125, "epoch": 0.4280821917808219, "grad_norm": 0.14013110101222992, "kl": 0.0194091796875, "learning_rate": 7.51959631090208e-07, "loss": 0.0008, "reward": 0.17028066888451576, "reward_std": 0.35746677219867706, "rewards/cosine_scaled_reward": -0.16305266320705414, "rewards/format_reward": 0.3333333358168602, "step": 125 }, { "completion_length": 3021.8055419921875, "epoch": 0.4315068493150685, "grad_norm": 0.12227875739336014, "kl": 0.01800537109375, "learning_rate": 7.472670160550848e-07, "loss": 0.0007, "reward": 0.35344888269901276, "reward_std": 0.3790488839149475, "rewards/cosine_scaled_reward": -0.0007177963852882385, "rewards/format_reward": 0.3541666716337204, "step": 126 }, { "completion_length": 3308.8055419921875, "epoch": 0.4349315068493151, "grad_norm": 0.13628166913986206, "kl": 0.01708984375, "learning_rate": 7.425476942237444e-07, "loss": 0.0007, "reward": 0.2540893331170082, "reward_std": 0.47945114970207214, "rewards/cosine_scaled_reward": -0.07229956053197384, "rewards/format_reward": 0.326388880610466, "step": 127 }, { "completion_length": 2813.6666259765625, "epoch": 0.4383561643835616, "grad_norm": 0.15076382458209991, "kl": 0.019775390625, "learning_rate": 7.37802304516818e-07, "loss": 0.0008, "reward": 0.47455114126205444, "reward_std": 0.4427139610052109, "rewards/cosine_scaled_reward": -0.03933773934841156, "rewards/format_reward": 0.5138888955116272, "step": 128 }, { "completion_length": 2989.52783203125, "epoch": 0.4417808219178082, "grad_norm": 0.14209668338298798, "kl": 0.01824951171875, "learning_rate": 7.330314893841101e-07, "loss": 0.0007, "reward": 0.3041275106370449, "reward_std": 0.4487529695034027, "rewards/cosine_scaled_reward": -0.09865027293562889, "rewards/format_reward": 0.4027777910232544, "step": 129 }, { "completion_length": 2724.0556640625, "epoch": 0.4452054794520548, "grad_norm": 0.13135673105716705, "kl": 0.02044677734375, "learning_rate": 7.282358947176205e-07, "loss": 0.0008, "reward": 0.4611601382493973, "reward_std": 0.44737473130226135, "rewards/cosine_scaled_reward": -0.059673219453543425, "rewards/format_reward": 0.520833358168602, "step": 130 }, { "completion_length": 2713.166748046875, "epoch": 0.4486301369863014, "grad_norm": 0.14880216121673584, "kl": 0.0208740234375, "learning_rate": 7.234161697641017e-07, "loss": 0.0008, "reward": 0.5555524080991745, "reward_std": 0.3877111077308655, "rewards/cosine_scaled_reward": 0.05555241275578737, "rewards/format_reward": 0.5000000149011612, "step": 131 }, { "completion_length": 3061.0625, "epoch": 0.4520547945205479, "grad_norm": 0.12646687030792236, "kl": 0.02032470703125, "learning_rate": 7.185729670371604e-07, "loss": 0.0008, "reward": 0.15745187550783157, "reward_std": 0.23714770376682281, "rewards/cosine_scaled_reward": -0.15504813194274902, "rewards/format_reward": 0.3125, "step": 132 }, { "completion_length": 3232.3472900390625, "epoch": 0.4554794520547945, "grad_norm": 0.10323884338140488, "kl": 0.0181884765625, "learning_rate": 7.137069422289181e-07, "loss": 0.0007, "reward": 0.2314557433128357, "reward_std": 0.39915989339351654, "rewards/cosine_scaled_reward": -0.08104425063356757, "rewards/format_reward": 0.3125000149011612, "step": 133 }, { "completion_length": 3193.8612060546875, "epoch": 0.4589041095890411, "grad_norm": 0.1457975059747696, "kl": 0.02178955078125, "learning_rate": 7.08818754121241e-07, "loss": 0.0009, "reward": 0.03552616201341152, "reward_std": 0.36521604657173157, "rewards/cosine_scaled_reward": -0.2630849555134773, "rewards/format_reward": 0.298611119389534, "step": 134 }, { "completion_length": 3036.7291259765625, "epoch": 0.4623287671232877, "grad_norm": 0.12449899315834045, "kl": 0.01971435546875, "learning_rate": 7.039090644965509e-07, "loss": 0.0008, "reward": 0.6097220778465271, "reward_std": 0.6045728027820587, "rewards/cosine_scaled_reward": 0.10972205176949501, "rewards/format_reward": 0.5000000149011612, "step": 135 }, { "completion_length": 2624.75, "epoch": 0.4657534246575342, "grad_norm": 0.15769356489181519, "kl": 0.02471923828125, "learning_rate": 6.989785380482312e-07, "loss": 0.001, "reward": 0.4681246876716614, "reward_std": 0.3646779954433441, "rewards/cosine_scaled_reward": -0.052708632312715054, "rewards/format_reward": 0.520833358168602, "step": 136 }, { "completion_length": 2946.8333740234375, "epoch": 0.4691780821917808, "grad_norm": 0.13552969694137573, "kl": 0.02203369140625, "learning_rate": 6.940278422906372e-07, "loss": 0.0009, "reward": 0.4625835418701172, "reward_std": 0.4559210389852524, "rewards/cosine_scaled_reward": 0.004250235855579376, "rewards/format_reward": 0.4583333283662796, "step": 137 }, { "completion_length": 3224.576416015625, "epoch": 0.4726027397260274, "grad_norm": 0.1137109324336052, "kl": 0.01885986328125, "learning_rate": 6.890576474687263e-07, "loss": 0.0008, "reward": 0.27407026663422585, "reward_std": 0.42793411016464233, "rewards/cosine_scaled_reward": -0.03148530051112175, "rewards/format_reward": 0.305555559694767, "step": 138 }, { "completion_length": 3156.84033203125, "epoch": 0.476027397260274, "grad_norm": 0.13348160684108734, "kl": 0.01849365234375, "learning_rate": 6.840686264673168e-07, "loss": 0.0007, "reward": 0.2564939334988594, "reward_std": 0.4258010536432266, "rewards/cosine_scaled_reward": -0.06295053288340569, "rewards/format_reward": 0.3194444477558136, "step": 139 }, { "completion_length": 3052.3125, "epoch": 0.4794520547945205, "grad_norm": 0.1428801566362381, "kl": 0.021240234375, "learning_rate": 6.790614547199906e-07, "loss": 0.0009, "reward": 0.3512444347143173, "reward_std": 0.404025673866272, "rewards/cosine_scaled_reward": -0.016811135224997997, "rewards/format_reward": 0.3680555671453476, "step": 140 }, { "completion_length": 2792.486083984375, "epoch": 0.4828767123287671, "grad_norm": 0.13267238438129425, "kl": 0.02069091796875, "learning_rate": 6.740368101176495e-07, "loss": 0.0008, "reward": 0.6453568935394287, "reward_std": 0.5250414609909058, "rewards/cosine_scaled_reward": 0.11063468037173152, "rewards/format_reward": 0.5347222238779068, "step": 141 }, { "completion_length": 2989.0972900390625, "epoch": 0.4863013698630137, "grad_norm": 0.12677060067653656, "kl": 0.021484375, "learning_rate": 6.68995372916741e-07, "loss": 0.0009, "reward": 0.4786522537469864, "reward_std": 0.5605998337268829, "rewards/cosine_scaled_reward": 0.013374458998441696, "rewards/format_reward": 0.4652777761220932, "step": 142 }, { "completion_length": 3084.6180419921875, "epoch": 0.4897260273972603, "grad_norm": 0.12558940052986145, "kl": 0.0198974609375, "learning_rate": 6.639378256471608e-07, "loss": 0.0008, "reward": 0.3289758712053299, "reward_std": 0.45005667209625244, "rewards/cosine_scaled_reward": -0.018246358260512352, "rewards/format_reward": 0.3472222238779068, "step": 143 }, { "completion_length": 3120.8541259765625, "epoch": 0.4931506849315068, "grad_norm": 0.13771818578243256, "kl": 0.02197265625, "learning_rate": 6.588648530198504e-07, "loss": 0.0009, "reward": 0.1804770578892203, "reward_std": 0.40222403407096863, "rewards/cosine_scaled_reward": -0.11813406273722649, "rewards/format_reward": 0.2986111119389534, "step": 144 }, { "completion_length": 3007.013916015625, "epoch": 0.4965753424657534, "grad_norm": 0.19792306423187256, "kl": 0.031005859375, "learning_rate": 6.537771418340981e-07, "loss": 0.0012, "reward": 0.19204921275377274, "reward_std": 0.30794692039489746, "rewards/cosine_scaled_reward": -0.09961747378110886, "rewards/format_reward": 0.2916666567325592, "step": 145 }, { "completion_length": 3020.9306640625, "epoch": 0.5, "grad_norm": 0.13727493584156036, "kl": 0.0220947265625, "learning_rate": 6.486753808845564e-07, "loss": 0.0009, "reward": 0.16832835972309113, "reward_std": 0.5160266309976578, "rewards/cosine_scaled_reward": -0.15806053578853607, "rewards/format_reward": 0.3263888955116272, "step": 146 }, { "completion_length": 2718.4097900390625, "epoch": 0.5034246575342466, "grad_norm": 0.11808720976114273, "kl": 0.021484375, "learning_rate": 6.435602608679916e-07, "loss": 0.0009, "reward": 0.5802240371704102, "reward_std": 0.44783809781074524, "rewards/cosine_scaled_reward": 0.017724037170410156, "rewards/format_reward": 0.5625000149011612, "step": 147 }, { "completion_length": 2859.6458740234375, "epoch": 0.5068493150684932, "grad_norm": 0.21733467280864716, "kl": 0.03009033203125, "learning_rate": 6.384324742897735e-07, "loss": 0.0012, "reward": 0.4155062139034271, "reward_std": 0.4282771795988083, "rewards/cosine_scaled_reward": 0.005784010514616966, "rewards/format_reward": 0.4097222238779068, "step": 148 }, { "completion_length": 2950.701416015625, "epoch": 0.5102739726027398, "grad_norm": 0.10564743727445602, "kl": 0.02130126953125, "learning_rate": 6.332927153701215e-07, "loss": 0.0009, "reward": 0.6034330129623413, "reward_std": 0.47583654522895813, "rewards/cosine_scaled_reward": 0.1103774681687355, "rewards/format_reward": 0.4930555671453476, "step": 149 }, { "completion_length": 3038.013916015625, "epoch": 0.5136986301369864, "grad_norm": 0.12356794625520706, "kl": 0.02337646484375, "learning_rate": 6.281416799501187e-07, "loss": 0.0009, "reward": 0.577631801366806, "reward_std": 0.47365154325962067, "rewards/cosine_scaled_reward": 0.13318736106157303, "rewards/format_reward": 0.4444444477558136, "step": 150 }, { "completion_length": 3258.90966796875, "epoch": 0.5171232876712328, "grad_norm": 0.13418567180633545, "kl": 0.0211181640625, "learning_rate": 6.229800653975054e-07, "loss": 0.0008, "reward": 0.3129318729043007, "reward_std": 0.5764759629964828, "rewards/cosine_scaled_reward": -0.0551237054169178, "rewards/format_reward": 0.3680555671453476, "step": 151 }, { "completion_length": 2671.361083984375, "epoch": 0.5205479452054794, "grad_norm": 0.1966993510723114, "kl": 0.0272216796875, "learning_rate": 6.178085705122674e-07, "loss": 0.0011, "reward": 0.5614952445030212, "reward_std": 0.43348051607608795, "rewards/cosine_scaled_reward": 0.040661935694515705, "rewards/format_reward": 0.5208333432674408, "step": 152 }, { "completion_length": 3149.1666259765625, "epoch": 0.523972602739726, "grad_norm": 0.12545958161354065, "kl": 0.0216064453125, "learning_rate": 6.126278954320294e-07, "loss": 0.0009, "reward": 0.15634628385305405, "reward_std": 0.38860486447811127, "rewards/cosine_scaled_reward": -0.17698704451322556, "rewards/format_reward": 0.3333333432674408, "step": 153 }, { "completion_length": 3038.9930419921875, "epoch": 0.5273972602739726, "grad_norm": 0.15269039571285248, "kl": 0.0208740234375, "learning_rate": 6.074387415372676e-07, "loss": 0.0008, "reward": 0.38375401496887207, "reward_std": 0.48632751405239105, "rewards/cosine_scaled_reward": -0.019023781642317772, "rewards/format_reward": 0.4027777761220932, "step": 154 }, { "completion_length": 3070.8680419921875, "epoch": 0.5308219178082192, "grad_norm": 0.11029084771871567, "kl": 0.02392578125, "learning_rate": 6.022418113563535e-07, "loss": 0.001, "reward": 0.26838141679763794, "reward_std": 0.3721802681684494, "rewards/cosine_scaled_reward": -0.07189634721726179, "rewards/format_reward": 0.3402777910232544, "step": 155 }, { "completion_length": 3024.270751953125, "epoch": 0.5342465753424658, "grad_norm": 0.1171901524066925, "kl": 0.02239990234375, "learning_rate": 5.97037808470444e-07, "loss": 0.0009, "reward": 0.36915363371372223, "reward_std": 0.40745308995246887, "rewards/cosine_scaled_reward": 0.021931427530944347, "rewards/format_reward": 0.3472222238779068, "step": 156 }, { "completion_length": 2957.673583984375, "epoch": 0.5376712328767124, "grad_norm": 0.14073632657527924, "kl": 0.0245361328125, "learning_rate": 5.918274374182266e-07, "loss": 0.001, "reward": 0.18879153579473495, "reward_std": 0.31644490361213684, "rewards/cosine_scaled_reward": -0.14454180747270584, "rewards/format_reward": 0.3333333432674408, "step": 157 }, { "completion_length": 2745.4306640625, "epoch": 0.541095890410959, "grad_norm": 0.15367640554904938, "kl": 0.02471923828125, "learning_rate": 5.866114036005362e-07, "loss": 0.001, "reward": 0.647605836391449, "reward_std": 0.45094963908195496, "rewards/cosine_scaled_reward": 0.12677249684929848, "rewards/format_reward": 0.5208333432674408, "step": 158 }, { "completion_length": 2848.479248046875, "epoch": 0.5445205479452054, "grad_norm": 0.14879484474658966, "kl": 0.026123046875, "learning_rate": 5.813904131848564e-07, "loss": 0.001, "reward": 0.31949036195874214, "reward_std": 0.43667902052402496, "rewards/cosine_scaled_reward": -0.10412076953798532, "rewards/format_reward": 0.423611119389534, "step": 159 }, { "completion_length": 2885.5416259765625, "epoch": 0.547945205479452, "grad_norm": 0.12330173701047897, "kl": 0.02471923828125, "learning_rate": 5.761651730097142e-07, "loss": 0.001, "reward": 0.47639837861061096, "reward_std": 0.5159202069044113, "rewards/cosine_scaled_reward": -0.00276830792427063, "rewards/format_reward": 0.4791666716337204, "step": 160 }, { "completion_length": 3260.416748046875, "epoch": 0.5513698630136986, "grad_norm": 0.1258607804775238, "kl": 0.02301025390625, "learning_rate": 5.709363904889861e-07, "loss": 0.0009, "reward": 0.3024430572986603, "reward_std": 0.6272812485694885, "rewards/cosine_scaled_reward": -0.01700139231979847, "rewards/format_reward": 0.3194444477558136, "step": 161 }, { "completion_length": 3207.2083740234375, "epoch": 0.5547945205479452, "grad_norm": 0.12467379122972488, "kl": 0.02398681640625, "learning_rate": 5.657047735161255e-07, "loss": 0.001, "reward": 0.20557049103081226, "reward_std": 0.3906491547822952, "rewards/cosine_scaled_reward": -0.07220727764070034, "rewards/format_reward": 0.2777777761220932, "step": 162 }, { "completion_length": 2876.319580078125, "epoch": 0.5582191780821918, "grad_norm": 0.13523170351982117, "kl": 0.02081298828125, "learning_rate": 5.604710303683253e-07, "loss": 0.0008, "reward": 0.5364240109920502, "reward_std": 0.48902808129787445, "rewards/cosine_scaled_reward": 0.06420179456472397, "rewards/format_reward": 0.4722222238779068, "step": 163 }, { "completion_length": 2883.8958740234375, "epoch": 0.5616438356164384, "grad_norm": 0.17181335389614105, "kl": 0.028564453125, "learning_rate": 5.552358696106288e-07, "loss": 0.0011, "reward": 0.27160534262657166, "reward_std": 0.360342800617218, "rewards/cosine_scaled_reward": -0.10339467972517014, "rewards/format_reward": 0.375, "step": 164 }, { "completion_length": 3007.28466796875, "epoch": 0.565068493150685, "grad_norm": 0.12402219325304031, "kl": 0.02569580078125, "learning_rate": 5.5e-07, "loss": 0.001, "reward": 0.4214746206998825, "reward_std": 0.4252837300300598, "rewards/cosine_scaled_reward": 0.004807952791452408, "rewards/format_reward": 0.4166666716337204, "step": 165 }, { "completion_length": 2990.2291259765625, "epoch": 0.5684931506849316, "grad_norm": 0.1333877593278885, "kl": 0.02728271484375, "learning_rate": 5.447641303893714e-07, "loss": 0.0011, "reward": 0.3566634953022003, "reward_std": 0.49885208904743195, "rewards/cosine_scaled_reward": -0.05305874161422253, "rewards/format_reward": 0.4097222238779068, "step": 166 }, { "completion_length": 3205.513916015625, "epoch": 0.571917808219178, "grad_norm": 0.14048750698566437, "kl": 0.02264404296875, "learning_rate": 5.395289696316747e-07, "loss": 0.0009, "reward": 0.29908060282468796, "reward_std": 0.5275179445743561, "rewards/cosine_scaled_reward": -0.07591940555721521, "rewards/format_reward": 0.3750000149011612, "step": 167 }, { "completion_length": 2930.826416015625, "epoch": 0.5753424657534246, "grad_norm": 0.13344134390354156, "kl": 0.0302734375, "learning_rate": 5.342952264838747e-07, "loss": 0.0012, "reward": 0.41841088235378265, "reward_std": 0.40377357602119446, "rewards/cosine_scaled_reward": -0.05381134897470474, "rewards/format_reward": 0.4722222238779068, "step": 168 }, { "completion_length": 2789.326416015625, "epoch": 0.5787671232876712, "grad_norm": 0.1511034071445465, "kl": 0.02587890625, "learning_rate": 5.29063609511014e-07, "loss": 0.001, "reward": 0.45637938380241394, "reward_std": 0.4706819951534271, "rewards/cosine_scaled_reward": -0.00889836996793747, "rewards/format_reward": 0.4652777761220932, "step": 169 }, { "completion_length": 3059.9930419921875, "epoch": 0.5821917808219178, "grad_norm": 0.14896270632743835, "kl": 0.02703857421875, "learning_rate": 5.238348269902859e-07, "loss": 0.0011, "reward": 0.18200979381799698, "reward_std": 0.4613404721021652, "rewards/cosine_scaled_reward": -0.19299022108316422, "rewards/format_reward": 0.375, "step": 170 }, { "completion_length": 3044.4930419921875, "epoch": 0.5856164383561644, "grad_norm": 0.12685967981815338, "kl": 0.02545166015625, "learning_rate": 5.186095868151436e-07, "loss": 0.001, "reward": 0.24055355973541737, "reward_std": 0.4408658444881439, "rewards/cosine_scaled_reward": -0.14139089360833168, "rewards/format_reward": 0.3819444477558136, "step": 171 }, { "completion_length": 2786.451416015625, "epoch": 0.589041095890411, "grad_norm": 0.14915940165519714, "kl": 0.02362060546875, "learning_rate": 5.133885963994639e-07, "loss": 0.0009, "reward": 0.23196261376142502, "reward_std": 0.30038829147815704, "rewards/cosine_scaled_reward": -0.14303740486502647, "rewards/format_reward": 0.375, "step": 172 }, { "completion_length": 3067.9306640625, "epoch": 0.5924657534246576, "grad_norm": 0.10505218058824539, "kl": 0.0228271484375, "learning_rate": 5.081725625817735e-07, "loss": 0.0009, "reward": 0.5079791992902756, "reward_std": 0.5481714606285095, "rewards/cosine_scaled_reward": 0.05659032240509987, "rewards/format_reward": 0.451388880610466, "step": 173 }, { "completion_length": 2666.3541259765625, "epoch": 0.5958904109589042, "grad_norm": 0.135061576962471, "kl": 0.02471923828125, "learning_rate": 5.02962191529556e-07, "loss": 0.001, "reward": 0.4175218790769577, "reward_std": 0.37634842097759247, "rewards/cosine_scaled_reward": -0.1033114567399025, "rewards/format_reward": 0.5208333283662796, "step": 174 }, { "completion_length": 3136.0833740234375, "epoch": 0.5993150684931506, "grad_norm": 0.11110376566648483, "kl": 0.026123046875, "learning_rate": 4.977581886436462e-07, "loss": 0.001, "reward": 0.4329180419445038, "reward_std": 0.5168599039316177, "rewards/cosine_scaled_reward": -0.025415293872356415, "rewards/format_reward": 0.4583333283662796, "step": 175 }, { "completion_length": 3090.9583740234375, "epoch": 0.6027397260273972, "grad_norm": 0.12515892088413239, "kl": 0.02777099609375, "learning_rate": 4.925612584627324e-07, "loss": 0.0011, "reward": 0.3180784285068512, "reward_std": 0.4640498459339142, "rewards/cosine_scaled_reward": -0.0985882543027401, "rewards/format_reward": 0.4166666716337204, "step": 176 }, { "completion_length": 3099.416748046875, "epoch": 0.6061643835616438, "grad_norm": 0.10670111328363419, "kl": 0.027587890625, "learning_rate": 4.873721045679706e-07, "loss": 0.0011, "reward": 0.37505099177360535, "reward_std": 0.31386855244636536, "rewards/cosine_scaled_reward": -0.013837885111570358, "rewards/format_reward": 0.3888889029622078, "step": 177 }, { "completion_length": 2943.9306640625, "epoch": 0.6095890410958904, "grad_norm": 0.13467499613761902, "kl": 0.0272216796875, "learning_rate": 4.821914294877326e-07, "loss": 0.0011, "reward": 0.2806200385093689, "reward_std": 0.48631517589092255, "rewards/cosine_scaled_reward": -0.1152133010327816, "rewards/format_reward": 0.3958333283662796, "step": 178 }, { "completion_length": 3079.7222900390625, "epoch": 0.613013698630137, "grad_norm": 0.1212821900844574, "kl": 0.02349853515625, "learning_rate": 4.770199346024947e-07, "loss": 0.0009, "reward": 0.34058963507413864, "reward_std": 0.5203238129615784, "rewards/cosine_scaled_reward": -0.03441034443676472, "rewards/format_reward": 0.375, "step": 179 }, { "completion_length": 2798.0555419921875, "epoch": 0.6164383561643836, "grad_norm": 0.14347128570079803, "kl": 0.02813720703125, "learning_rate": 4.7185832004988133e-07, "loss": 0.0011, "reward": 0.33986398577690125, "reward_std": 0.4621936082839966, "rewards/cosine_scaled_reward": -0.12541379779577255, "rewards/format_reward": 0.4652777761220932, "step": 180 }, { "completion_length": 3015.9791259765625, "epoch": 0.6198630136986302, "grad_norm": 0.10863691568374634, "kl": 0.0252685546875, "learning_rate": 4.667072846298785e-07, "loss": 0.001, "reward": 0.32850363850593567, "reward_std": 0.4628835916519165, "rewards/cosine_scaled_reward": -0.10205190535634756, "rewards/format_reward": 0.4305555671453476, "step": 181 }, { "completion_length": 3229.4375, "epoch": 0.6232876712328768, "grad_norm": 0.13130250573158264, "kl": 0.02691650390625, "learning_rate": 4.6156752571022637e-07, "loss": 0.0011, "reward": 0.1762874647974968, "reward_std": 0.45904412865638733, "rewards/cosine_scaled_reward": -0.1084347516298294, "rewards/format_reward": 0.2847222238779068, "step": 182 }, { "completion_length": 3043.9722900390625, "epoch": 0.6267123287671232, "grad_norm": 0.13968031108379364, "kl": 0.02679443359375, "learning_rate": 4.5643973913200837e-07, "loss": 0.0011, "reward": 0.3731088787317276, "reward_std": 0.5312269479036331, "rewards/cosine_scaled_reward": -0.029668924515135586, "rewards/format_reward": 0.4027777910232544, "step": 183 }, { "completion_length": 3144.5625, "epoch": 0.6301369863013698, "grad_norm": 0.15973497927188873, "kl": 0.03131103515625, "learning_rate": 4.513246191154434e-07, "loss": 0.0013, "reward": 0.2171999216079712, "reward_std": 0.43705086410045624, "rewards/cosine_scaled_reward": -0.12307787034660578, "rewards/format_reward": 0.3402777910232544, "step": 184 }, { "completion_length": 2747.4652099609375, "epoch": 0.6335616438356164, "grad_norm": 0.13849014043807983, "kl": 0.02557373046875, "learning_rate": 4.4622285816590186e-07, "loss": 0.001, "reward": 0.5670523345470428, "reward_std": 0.41843467950820923, "rewards/cosine_scaled_reward": 0.07399673759937286, "rewards/format_reward": 0.4930555671453476, "step": 185 }, { "completion_length": 2985.71533203125, "epoch": 0.636986301369863, "grad_norm": 0.15879811346530914, "kl": 0.0255126953125, "learning_rate": 4.4113514698014953e-07, "loss": 0.001, "reward": 0.4425032064318657, "reward_std": 0.6330223977565765, "rewards/cosine_scaled_reward": -0.036663462640717626, "rewards/format_reward": 0.4791666716337204, "step": 186 }, { "completion_length": 2936.0416259765625, "epoch": 0.6404109589041096, "grad_norm": 0.12994582951068878, "kl": 0.0272216796875, "learning_rate": 4.360621743528392e-07, "loss": 0.0011, "reward": 0.34703654050827026, "reward_std": 0.4127475470304489, "rewards/cosine_scaled_reward": -0.11824123747646809, "rewards/format_reward": 0.4652777761220932, "step": 187 }, { "completion_length": 2758.3126220703125, "epoch": 0.6438356164383562, "grad_norm": 0.17351487278938293, "kl": 0.03253173828125, "learning_rate": 4.3100462708325914e-07, "loss": 0.0013, "reward": 0.3991873562335968, "reward_std": 0.5110540986061096, "rewards/cosine_scaled_reward": -0.07303486485034227, "rewards/format_reward": 0.4722222238779068, "step": 188 }, { "completion_length": 3067.34033203125, "epoch": 0.6472602739726028, "grad_norm": 0.17369891703128815, "kl": 0.02911376953125, "learning_rate": 4.2596318988235037e-07, "loss": 0.0012, "reward": 0.3477390259504318, "reward_std": 0.5584754794836044, "rewards/cosine_scaled_reward": -0.05503876507282257, "rewards/format_reward": 0.4027777910232544, "step": 189 }, { "completion_length": 2643.451416015625, "epoch": 0.6506849315068494, "grad_norm": 0.15890643000602722, "kl": 0.0362548828125, "learning_rate": 4.209385452800095e-07, "loss": 0.0015, "reward": 0.3941483050584793, "reward_std": 0.4240891933441162, "rewards/cosine_scaled_reward": -0.05724058859050274, "rewards/format_reward": 0.4513888955116272, "step": 190 }, { "completion_length": 3008.076416015625, "epoch": 0.6541095890410958, "grad_norm": 0.14325085282325745, "kl": 0.02728271484375, "learning_rate": 4.1593137353268303e-07, "loss": 0.0011, "reward": 0.4985590726137161, "reward_std": 0.573657751083374, "rewards/cosine_scaled_reward": 0.04717019200325012, "rewards/format_reward": 0.4513888955116272, "step": 191 }, { "completion_length": 2820.451416015625, "epoch": 0.6575342465753424, "grad_norm": 0.14183150231838226, "kl": 0.0311279296875, "learning_rate": 4.1094235253127374e-07, "loss": 0.0012, "reward": 0.2823975309729576, "reward_std": 0.4726349860429764, "rewards/cosine_scaled_reward": -0.15510250255465508, "rewards/format_reward": 0.4375, "step": 192 }, { "completion_length": 3112.888916015625, "epoch": 0.660958904109589, "grad_norm": 0.1434364914894104, "kl": 0.0272216796875, "learning_rate": 4.059721577093628e-07, "loss": 0.0011, "reward": 0.2338184304535389, "reward_std": 0.513207420706749, "rewards/cosine_scaled_reward": -0.0925704650580883, "rewards/format_reward": 0.3263888955116272, "step": 193 }, { "completion_length": 2760.3958740234375, "epoch": 0.6643835616438356, "grad_norm": 0.15593542158603668, "kl": 0.02716064453125, "learning_rate": 4.0102146195176887e-07, "loss": 0.0011, "reward": 0.39227430522441864, "reward_std": 0.431293249130249, "rewards/cosine_scaled_reward": -0.05911456607282162, "rewards/format_reward": 0.4513888955116272, "step": 194 }, { "completion_length": 3120.576416015625, "epoch": 0.6678082191780822, "grad_norm": 0.12067221850156784, "kl": 0.02813720703125, "learning_rate": 3.9609093550344907e-07, "loss": 0.0011, "reward": 0.33260975778102875, "reward_std": 0.3859928399324417, "rewards/cosine_scaled_reward": -0.014612471219152212, "rewards/format_reward": 0.3472222238779068, "step": 195 }, { "completion_length": 2929.52783203125, "epoch": 0.6712328767123288, "grad_norm": 0.15069261193275452, "kl": 0.0296630859375, "learning_rate": 3.911812458787591e-07, "loss": 0.0012, "reward": 0.3924526572227478, "reward_std": 0.48095013201236725, "rewards/cosine_scaled_reward": -0.051991806365549564, "rewards/format_reward": 0.4444444477558136, "step": 196 }, { "completion_length": 2846.9375, "epoch": 0.6746575342465754, "grad_norm": 0.1341117024421692, "kl": 0.0289306640625, "learning_rate": 3.86293057771082e-07, "loss": 0.0012, "reward": 0.5837523490190506, "reward_std": 0.5863839089870453, "rewards/cosine_scaled_reward": 0.07680792175233364, "rewards/format_reward": 0.5069444328546524, "step": 197 }, { "completion_length": 2936.75, "epoch": 0.678082191780822, "grad_norm": 0.12375517934560776, "kl": 0.02490234375, "learning_rate": 3.8142703296283953e-07, "loss": 0.001, "reward": 0.421320840716362, "reward_std": 0.48637837171554565, "rewards/cosine_scaled_reward": 0.018543066456913948, "rewards/format_reward": 0.4027777761220932, "step": 198 }, { "completion_length": 3141.076416015625, "epoch": 0.6815068493150684, "grad_norm": 0.14611783623695374, "kl": 0.02886962890625, "learning_rate": 3.7658383023589833e-07, "loss": 0.0012, "reward": 0.31543052941560745, "reward_std": 0.484183669090271, "rewards/cosine_scaled_reward": -0.059569500386714935, "rewards/format_reward": 0.375, "step": 199 }, { "completion_length": 2385.8680419921875, "epoch": 0.684931506849315, "grad_norm": 0.1563843935728073, "kl": 0.02777099609375, "learning_rate": 3.7176410528237945e-07, "loss": 0.0011, "reward": 0.546364039182663, "reward_std": 0.4009109437465668, "rewards/cosine_scaled_reward": -0.023080429062247276, "rewards/format_reward": 0.5694444626569748, "step": 200 }, { "completion_length": 3089.6458740234375, "epoch": 0.6883561643835616, "grad_norm": 0.12665298581123352, "kl": 0.03118896484375, "learning_rate": 3.6696851061588994e-07, "loss": 0.0012, "reward": 0.17153404070995748, "reward_std": 0.39160603284835815, "rewards/cosine_scaled_reward": -0.18263262510299683, "rewards/format_reward": 0.354166679084301, "step": 201 }, { "completion_length": 2912.826416015625, "epoch": 0.6917808219178082, "grad_norm": 0.1533837616443634, "kl": 0.0333251953125, "learning_rate": 3.62197695483182e-07, "loss": 0.0013, "reward": 0.4708049148321152, "reward_std": 0.5301374197006226, "rewards/cosine_scaled_reward": -0.01530623622238636, "rewards/format_reward": 0.486111119389534, "step": 202 }, { "completion_length": 2903.9652099609375, "epoch": 0.6952054794520548, "grad_norm": 0.1623891443014145, "kl": 0.03265380859375, "learning_rate": 3.5745230577625573e-07, "loss": 0.0013, "reward": 0.3767779842019081, "reward_std": 0.37230053544044495, "rewards/cosine_scaled_reward": -0.025999773293733597, "rewards/format_reward": 0.4027777761220932, "step": 203 }, { "completion_length": 2757.076416015625, "epoch": 0.6986301369863014, "grad_norm": 0.9816704988479614, "kl": 0.03271484375, "learning_rate": 3.5273298394491515e-07, "loss": 0.0013, "reward": 0.44859715551137924, "reward_std": 0.3788439631462097, "rewards/cosine_scaled_reward": 0.004152711480855942, "rewards/format_reward": 0.4444444626569748, "step": 204 }, { "completion_length": 3004.104248046875, "epoch": 0.702054794520548, "grad_norm": 0.14980177581310272, "kl": 0.0347900390625, "learning_rate": 3.4804036890979205e-07, "loss": 0.0014, "reward": 0.26160044223070145, "reward_std": 0.4303555190563202, "rewards/cosine_scaled_reward": -0.0647884514182806, "rewards/format_reward": 0.3263888955116272, "step": 205 }, { "completion_length": 2953.40283203125, "epoch": 0.7054794520547946, "grad_norm": 0.13833436369895935, "kl": 0.031982421875, "learning_rate": 3.433750959758446e-07, "loss": 0.0013, "reward": 0.3072604089975357, "reward_std": 0.5101044028997421, "rewards/cosine_scaled_reward": -0.12329516559839249, "rewards/format_reward": 0.4305555522441864, "step": 206 }, { "completion_length": 2859.96533203125, "epoch": 0.708904109589041, "grad_norm": 0.13901107013225555, "kl": 0.03009033203125, "learning_rate": 3.387377967463493e-07, "loss": 0.0012, "reward": 0.5190957188606262, "reward_std": 0.4833519160747528, "rewards/cosine_scaled_reward": 0.07465130463242531, "rewards/format_reward": 0.4444444626569748, "step": 207 }, { "completion_length": 2990.9583740234375, "epoch": 0.7123287671232876, "grad_norm": 0.16973043978214264, "kl": 0.0328369140625, "learning_rate": 3.3412909903738936e-07, "loss": 0.0013, "reward": 0.1571333408355713, "reward_std": 0.33002787828445435, "rewards/cosine_scaled_reward": -0.13453333638608456, "rewards/format_reward": 0.2916666716337204, "step": 208 }, { "completion_length": 2839.3126220703125, "epoch": 0.7157534246575342, "grad_norm": 0.13653664290905, "kl": 0.03125, "learning_rate": 3.295496267928609e-07, "loss": 0.0013, "reward": 0.3505253791809082, "reward_std": 0.47255241870880127, "rewards/cosine_scaled_reward": -0.1008635088801384, "rewards/format_reward": 0.451388880610466, "step": 209 }, { "completion_length": 2634.1737060546875, "epoch": 0.7191780821917808, "grad_norm": 0.16086305677890778, "kl": 0.02947998046875, "learning_rate": 3.250000000000001e-07, "loss": 0.0012, "reward": 0.4823562502861023, "reward_std": 0.4473782926797867, "rewards/cosine_scaled_reward": -0.04542151384521276, "rewards/format_reward": 0.5277777910232544, "step": 210 }, { "completion_length": 2892.3958740234375, "epoch": 0.7226027397260274, "grad_norm": 0.13830755650997162, "kl": 0.03363037109375, "learning_rate": 3.204808346054461e-07, "loss": 0.0013, "reward": 0.4033074826002121, "reward_std": 0.41210463643074036, "rewards/cosine_scaled_reward": -0.10363698564469814, "rewards/format_reward": 0.5069444477558136, "step": 211 }, { "completion_length": 2885.541748046875, "epoch": 0.726027397260274, "grad_norm": 0.1686064600944519, "kl": 0.03106689453125, "learning_rate": 3.159927424318531e-07, "loss": 0.0012, "reward": 0.3951975554227829, "reward_std": 0.5239757895469666, "rewards/cosine_scaled_reward": -0.04924688953906298, "rewards/format_reward": 0.4444444477558136, "step": 212 }, { "completion_length": 2877.71533203125, "epoch": 0.7294520547945206, "grad_norm": 0.15673589706420898, "kl": 0.034423828125, "learning_rate": 3.115363310950578e-07, "loss": 0.0014, "reward": 0.2623257301747799, "reward_std": 0.3177921772003174, "rewards/cosine_scaled_reward": -0.08489650301635265, "rewards/format_reward": 0.3472222164273262, "step": 213 }, { "completion_length": 3026.298583984375, "epoch": 0.7328767123287672, "grad_norm": 0.1316094845533371, "kl": 0.0333251953125, "learning_rate": 3.0711220392181934e-07, "loss": 0.0013, "reward": 0.4284323900938034, "reward_std": 0.5533152520656586, "rewards/cosine_scaled_reward": -0.009067630395293236, "rewards/format_reward": 0.4375, "step": 214 }, { "completion_length": 2914.27783203125, "epoch": 0.7363013698630136, "grad_norm": 0.1542029231786728, "kl": 0.02874755859375, "learning_rate": 3.027209598681373e-07, "loss": 0.0011, "reward": 0.3265261799097061, "reward_std": 0.4032795578241348, "rewards/cosine_scaled_reward": -0.06930714938789606, "rewards/format_reward": 0.3958333432674408, "step": 215 }, { "completion_length": 2932.25, "epoch": 0.7397260273972602, "grad_norm": 0.142373189330101, "kl": 0.03369140625, "learning_rate": 2.9836319343816397e-07, "loss": 0.0013, "reward": 0.5414205491542816, "reward_std": 0.5791518688201904, "rewards/cosine_scaled_reward": 0.013642808422446251, "rewards/format_reward": 0.5277777761220932, "step": 216 }, { "completion_length": 2438.7987060546875, "epoch": 0.7431506849315068, "grad_norm": 0.15347974002361298, "kl": 0.03460693359375, "learning_rate": 2.9403949460371677e-07, "loss": 0.0014, "reward": 0.6215761005878448, "reward_std": 0.360453262925148, "rewards/cosine_scaled_reward": 0.0312983263283968, "rewards/format_reward": 0.5902777910232544, "step": 217 }, { "completion_length": 2892.2916259765625, "epoch": 0.7465753424657534, "grad_norm": 0.14735041558742523, "kl": 0.0301513671875, "learning_rate": 2.897504487244061e-07, "loss": 0.0012, "reward": 0.46117305755615234, "reward_std": 0.4907253533601761, "rewards/cosine_scaled_reward": 0.030617523938417435, "rewards/format_reward": 0.4305555522441864, "step": 218 }, { "completion_length": 2625.59033203125, "epoch": 0.75, "grad_norm": 0.30190229415893555, "kl": 0.0400390625, "learning_rate": 2.854966364683872e-07, "loss": 0.0016, "reward": 0.4576456546783447, "reward_std": 0.3912748843431473, "rewards/cosine_scaled_reward": -0.014576543122529984, "rewards/format_reward": 0.472222238779068, "step": 219 }, { "completion_length": 2769.8333740234375, "epoch": 0.7534246575342466, "grad_norm": 0.13973061740398407, "kl": 0.035400390625, "learning_rate": 2.812786337337463e-07, "loss": 0.0014, "reward": 0.5715092867612839, "reward_std": 0.45521561801433563, "rewards/cosine_scaled_reward": 0.022898193448781967, "rewards/format_reward": 0.5486111342906952, "step": 220 }, { "completion_length": 2313.9097900390625, "epoch": 0.7568493150684932, "grad_norm": 0.1722535640001297, "kl": 0.0362548828125, "learning_rate": 2.770970115705341e-07, "loss": 0.0014, "reward": 0.6356571912765503, "reward_std": 0.31826692819595337, "rewards/cosine_scaled_reward": -0.017120573669672012, "rewards/format_reward": 0.6527777910232544, "step": 221 }, { "completion_length": 2617.8333740234375, "epoch": 0.7602739726027398, "grad_norm": 0.1793624311685562, "kl": 0.0341796875, "learning_rate": 2.729523361034538e-07, "loss": 0.0014, "reward": 0.5496674627065659, "reward_std": 0.45733560621738434, "rewards/cosine_scaled_reward": 0.01494525047019124, "rewards/format_reward": 0.5347222089767456, "step": 222 }, { "completion_length": 2766.7291259765625, "epoch": 0.7636986301369864, "grad_norm": 0.1420270800590515, "kl": 0.02960205078125, "learning_rate": 2.68845168455218e-07, "loss": 0.0012, "reward": 0.6161434650421143, "reward_std": 0.4993426203727722, "rewards/cosine_scaled_reward": 0.07447678688913584, "rewards/format_reward": 0.5416666865348816, "step": 223 }, { "completion_length": 2775.375, "epoch": 0.7671232876712328, "grad_norm": 0.15780124068260193, "kl": 0.0301513671875, "learning_rate": 2.6477606467058035e-07, "loss": 0.0012, "reward": 0.5157144665718079, "reward_std": 0.3755457401275635, "rewards/cosine_scaled_reward": 0.03654780611395836, "rewards/format_reward": 0.4791666716337204, "step": 224 }, { "completion_length": 2975.041748046875, "epoch": 0.7705479452054794, "grad_norm": 0.13926586508750916, "kl": 0.03179931640625, "learning_rate": 2.6074557564105724e-07, "loss": 0.0013, "reward": 0.19908806309103966, "reward_std": 0.42198337614536285, "rewards/cosine_scaled_reward": -0.14813418313860893, "rewards/format_reward": 0.3472222238779068, "step": 225 }, { "completion_length": 3097.52783203125, "epoch": 0.773972602739726, "grad_norm": 0.1462773233652115, "kl": 0.028076171875, "learning_rate": 2.567542470303452e-07, "loss": 0.0011, "reward": 0.4122817665338516, "reward_std": 0.5014311969280243, "rewards/cosine_scaled_reward": -0.0321626765653491, "rewards/format_reward": 0.4444444477558136, "step": 226 }, { "completion_length": 2771.5556640625, "epoch": 0.7773972602739726, "grad_norm": 0.1374952346086502, "kl": 0.0338134765625, "learning_rate": 2.528026192004466e-07, "loss": 0.0014, "reward": 0.4719505310058594, "reward_std": 0.49139489233493805, "rewards/cosine_scaled_reward": -0.014160582795739174, "rewards/format_reward": 0.486111119389534, "step": 227 }, { "completion_length": 2911.5833740234375, "epoch": 0.7808219178082192, "grad_norm": 0.1661740243434906, "kl": 0.0330810546875, "learning_rate": 2.488912271385139e-07, "loss": 0.0013, "reward": 0.3701959401369095, "reward_std": 0.479979932308197, "rewards/cosine_scaled_reward": -0.06730403914116323, "rewards/format_reward": 0.4375, "step": 228 }, { "completion_length": 2854.826416015625, "epoch": 0.7842465753424658, "grad_norm": 0.1530180722475052, "kl": 0.03662109375, "learning_rate": 2.450206003844205e-07, "loss": 0.0015, "reward": 0.47989606857299805, "reward_std": 0.40031544864177704, "rewards/cosine_scaled_reward": -0.027048394083976746, "rewards/format_reward": 0.5069444626569748, "step": 229 }, { "completion_length": 3026.8055419921875, "epoch": 0.7876712328767124, "grad_norm": 0.12480524182319641, "kl": 0.0284423828125, "learning_rate": 2.411912629590699e-07, "loss": 0.0011, "reward": 0.38817086815834045, "reward_std": 0.41600513458251953, "rewards/cosine_scaled_reward": -0.014606935903429985, "rewards/format_reward": 0.4027777761220932, "step": 230 }, { "completion_length": 2905.3333740234375, "epoch": 0.791095890410959, "grad_norm": 0.14893729984760284, "kl": 0.033203125, "learning_rate": 2.374037332934512e-07, "loss": 0.0013, "reward": 0.4625111222267151, "reward_std": 0.45194628834724426, "rewards/cosine_scaled_reward": -0.023599994368851185, "rewards/format_reward": 0.4861111342906952, "step": 231 }, { "completion_length": 2417.1319580078125, "epoch": 0.7945205479452054, "grad_norm": 0.1770123839378357, "kl": 0.0350341796875, "learning_rate": 2.336585241584522e-07, "loss": 0.0014, "reward": 0.7192187607288361, "reward_std": 0.39792077243328094, "rewards/cosine_scaled_reward": 0.059496549889445305, "rewards/format_reward": 0.659722238779068, "step": 232 }, { "completion_length": 3020.0833740234375, "epoch": 0.797945205479452, "grad_norm": 0.1579238921403885, "kl": 0.0362548828125, "learning_rate": 2.299561425954383e-07, "loss": 0.0015, "reward": 0.3186444193124771, "reward_std": 0.5212821513414383, "rewards/cosine_scaled_reward": -0.11885556951165199, "rewards/format_reward": 0.4375, "step": 233 }, { "completion_length": 2965.21533203125, "epoch": 0.8013698630136986, "grad_norm": 0.13462784886360168, "kl": 0.03240966796875, "learning_rate": 2.2629708984760706e-07, "loss": 0.0013, "reward": 0.4752238541841507, "reward_std": 0.40988166630268097, "rewards/cosine_scaled_reward": 0.003001643344759941, "rewards/format_reward": 0.4722222238779068, "step": 234 }, { "completion_length": 2902.916748046875, "epoch": 0.8047945205479452, "grad_norm": 0.13331294059753418, "kl": 0.02911376953125, "learning_rate": 2.2268186129212807e-07, "loss": 0.0012, "reward": 0.3500567376613617, "reward_std": 0.4639824479818344, "rewards/cosine_scaled_reward": -0.0735543726477772, "rewards/format_reward": 0.4236111044883728, "step": 235 }, { "completion_length": 2946.5, "epoch": 0.8082191780821918, "grad_norm": 0.14252088963985443, "kl": 0.03155517578125, "learning_rate": 2.1911094637307714e-07, "loss": 0.0013, "reward": 0.3601393699645996, "reward_std": 0.40262600779533386, "rewards/cosine_scaled_reward": -0.05652729608118534, "rewards/format_reward": 0.4166666716337204, "step": 236 }, { "completion_length": 3064.47216796875, "epoch": 0.8116438356164384, "grad_norm": 0.16358421742916107, "kl": 0.03594970703125, "learning_rate": 2.1558482853517253e-07, "loss": 0.0014, "reward": 0.3410459593869746, "reward_std": 0.3948727697134018, "rewards/cosine_scaled_reward": -0.020065151154994965, "rewards/format_reward": 0.3611111119389534, "step": 237 }, { "completion_length": 2980.416748046875, "epoch": 0.815068493150685, "grad_norm": 0.36835718154907227, "kl": 0.039306640625, "learning_rate": 2.1210398515832536e-07, "loss": 0.0016, "reward": 0.2538940832018852, "reward_std": 0.3796728700399399, "rewards/cosine_scaled_reward": -0.10721703246235847, "rewards/format_reward": 0.3611111119389534, "step": 238 }, { "completion_length": 2511.3333740234375, "epoch": 0.8184931506849316, "grad_norm": 0.15058235824108124, "kl": 0.03302001953125, "learning_rate": 2.08668887493009e-07, "loss": 0.0013, "reward": 0.42302054166793823, "reward_std": 0.35738538205623627, "rewards/cosine_scaled_reward": -0.11864613555371761, "rewards/format_reward": 0.5416666865348816, "step": 239 }, { "completion_length": 2788.2430419921875, "epoch": 0.821917808219178, "grad_norm": 0.15094441175460815, "kl": 0.03497314453125, "learning_rate": 2.0528000059645995e-07, "loss": 0.0014, "reward": 0.4362386465072632, "reward_std": 0.37140533328056335, "rewards/cosine_scaled_reward": -0.04987248365068808, "rewards/format_reward": 0.486111119389534, "step": 240 }, { "completion_length": 2689.46533203125, "epoch": 0.8253424657534246, "grad_norm": 0.15874545276165009, "kl": 0.03240966796875, "learning_rate": 2.0193778326971628e-07, "loss": 0.0013, "reward": 0.44388699531555176, "reward_std": 0.3412891924381256, "rewards/cosine_scaled_reward": -0.08389079011976719, "rewards/format_reward": 0.5277777910232544, "step": 241 }, { "completion_length": 3123.8680419921875, "epoch": 0.8287671232876712, "grad_norm": 0.1301407366991043, "kl": 0.0311279296875, "learning_rate": 1.986426879955034e-07, "loss": 0.0012, "reward": 0.21631913632154465, "reward_std": 0.39288755506277084, "rewards/cosine_scaled_reward": -0.15173641964793205, "rewards/format_reward": 0.3680555522441864, "step": 242 }, { "completion_length": 2796.3056640625, "epoch": 0.8321917808219178, "grad_norm": 0.1545010507106781, "kl": 0.038818359375, "learning_rate": 1.9539516087697517e-07, "loss": 0.0016, "reward": 0.3822927325963974, "reward_std": 0.5006706863641739, "rewards/cosine_scaled_reward": -0.08298505656421185, "rewards/format_reward": 0.4652777761220932, "step": 243 }, { "completion_length": 2943.77783203125, "epoch": 0.8356164383561644, "grad_norm": 0.16031137108802795, "kl": 0.0350341796875, "learning_rate": 1.9219564157731844e-07, "loss": 0.0014, "reward": 0.3758165240287781, "reward_std": 0.3894062936306, "rewards/cosine_scaled_reward": -0.0339057189412415, "rewards/format_reward": 0.409722238779068, "step": 244 }, { "completion_length": 2864.8819580078125, "epoch": 0.839041095890411, "grad_norm": 0.14703238010406494, "kl": 0.0367431640625, "learning_rate": 1.8904456326023027e-07, "loss": 0.0015, "reward": 0.3677906394004822, "reward_std": 0.2914382070302963, "rewards/cosine_scaled_reward": -0.06276494171470404, "rewards/format_reward": 0.4305555671453476, "step": 245 }, { "completion_length": 2885.84033203125, "epoch": 0.8424657534246576, "grad_norm": 0.13430039584636688, "kl": 0.03369140625, "learning_rate": 1.8594235253127372e-07, "loss": 0.0013, "reward": 0.39504893124103546, "reward_std": 0.447158083319664, "rewards/cosine_scaled_reward": -0.07717327354475856, "rewards/format_reward": 0.4722222238779068, "step": 246 }, { "completion_length": 2797.9583740234375, "epoch": 0.8458904109589042, "grad_norm": 0.15996281802654266, "kl": 0.03466796875, "learning_rate": 1.8288942938012267e-07, "loss": 0.0014, "reward": 0.4512728601694107, "reward_std": 0.49667105078697205, "rewards/cosine_scaled_reward": -0.03483825922012329, "rewards/format_reward": 0.4861111044883728, "step": 247 }, { "completion_length": 2761.7362060546875, "epoch": 0.8493150684931506, "grad_norm": 0.16140304505825043, "kl": 0.0394287109375, "learning_rate": 1.7988620712370195e-07, "loss": 0.0016, "reward": 0.5905152261257172, "reward_std": 0.4745737165212631, "rewards/cosine_scaled_reward": 0.03495965828187764, "rewards/format_reward": 0.5555555522441864, "step": 248 }, { "completion_length": 2945.3055419921875, "epoch": 0.8527397260273972, "grad_norm": 0.14055365324020386, "kl": 0.0325927734375, "learning_rate": 1.7693309235023127e-07, "loss": 0.0013, "reward": 0.5377485156059265, "reward_std": 0.5574042201042175, "rewards/cosine_scaled_reward": 0.023859622422605753, "rewards/format_reward": 0.5138888955116272, "step": 249 }, { "completion_length": 2663.6458740234375, "epoch": 0.8561643835616438, "grad_norm": 0.17629733681678772, "kl": 0.03955078125, "learning_rate": 1.7403048486417868e-07, "loss": 0.0016, "reward": 0.5798373818397522, "reward_std": 0.5406672060489655, "rewards/cosine_scaled_reward": 0.06594848074018955, "rewards/format_reward": 0.5138888955116272, "step": 250 }, { "completion_length": 2895.1944580078125, "epoch": 0.8595890410958904, "grad_norm": 0.18740905821323395, "kl": 0.03955078125, "learning_rate": 1.711787776321341e-07, "loss": 0.0016, "reward": 0.3543848991394043, "reward_std": 0.5366209447383881, "rewards/cosine_scaled_reward": -0.09700398705899715, "rewards/format_reward": 0.451388880610466, "step": 251 }, { "completion_length": 2812.7222900390625, "epoch": 0.863013698630137, "grad_norm": 0.13896240293979645, "kl": 0.0306396484375, "learning_rate": 1.6837835672960831e-07, "loss": 0.0012, "reward": 0.49542590975761414, "reward_std": 0.42443887889385223, "rewards/cosine_scaled_reward": -0.018462970852851868, "rewards/format_reward": 0.513888880610466, "step": 252 }, { "completion_length": 2959.5069580078125, "epoch": 0.8664383561643836, "grad_norm": 0.1251368522644043, "kl": 0.03564453125, "learning_rate": 1.6562960128876353e-07, "loss": 0.0014, "reward": 0.2707902789115906, "reward_std": 0.5160115361213684, "rewards/cosine_scaled_reward": -0.14587640017271042, "rewards/format_reward": 0.4166666567325592, "step": 253 }, { "completion_length": 2675.423583984375, "epoch": 0.8698630136986302, "grad_norm": 0.15856920182704926, "kl": 0.0364990234375, "learning_rate": 1.6293288344708566e-07, "loss": 0.0015, "reward": 0.6221481561660767, "reward_std": 0.3958921879529953, "rewards/cosine_scaled_reward": 0.03881483152508736, "rewards/format_reward": 0.5833333432674408, "step": 254 }, { "completion_length": 3005.2362060546875, "epoch": 0.8732876712328768, "grad_norm": 0.1423628181219101, "kl": 0.0340576171875, "learning_rate": 1.6028856829700258e-07, "loss": 0.0014, "reward": 0.20419325679540634, "reward_std": 0.3235137313604355, "rewards/cosine_scaled_reward": -0.17080675438046455, "rewards/format_reward": 0.375, "step": 255 }, { "completion_length": 2853.2222900390625, "epoch": 0.8767123287671232, "grad_norm": 0.18754243850708008, "kl": 0.0416259765625, "learning_rate": 1.5769701383645698e-07, "loss": 0.0017, "reward": 0.5054954886436462, "reward_std": 0.41478313505649567, "rewards/cosine_scaled_reward": 0.047162143513560295, "rewards/format_reward": 0.4583333283662796, "step": 256 }, { "completion_length": 3062.604248046875, "epoch": 0.8801369863013698, "grad_norm": 0.14577165246009827, "kl": 0.035888671875, "learning_rate": 1.551585709204381e-07, "loss": 0.0014, "reward": 0.5648697018623352, "reward_std": 0.6114741563796997, "rewards/cosine_scaled_reward": 0.0787586160004139, "rewards/format_reward": 0.4861111044883728, "step": 257 }, { "completion_length": 2579.1944580078125, "epoch": 0.8835616438356164, "grad_norm": 0.14230208098888397, "kl": 0.03363037109375, "learning_rate": 1.5267358321348285e-07, "loss": 0.0013, "reward": 0.5974612534046173, "reward_std": 0.35620053112506866, "rewards/cosine_scaled_reward": 0.0557946152985096, "rewards/format_reward": 0.5416666865348816, "step": 258 }, { "completion_length": 2702.3194580078125, "epoch": 0.886986301369863, "grad_norm": 0.1390962451696396, "kl": 0.033935546875, "learning_rate": 1.5024238714314825e-07, "loss": 0.0014, "reward": 0.6885968148708344, "reward_std": 0.4262382835149765, "rewards/cosine_scaled_reward": 0.09137461334466934, "rewards/format_reward": 0.597222238779068, "step": 259 }, { "completion_length": 2968.2362060546875, "epoch": 0.8904109589041096, "grad_norm": 0.1457592248916626, "kl": 0.03399658203125, "learning_rate": 1.4786531185446452e-07, "loss": 0.0014, "reward": 0.33401037007570267, "reward_std": 0.33265479654073715, "rewards/cosine_scaled_reward": -0.00626740138977766, "rewards/format_reward": 0.3402777761220932, "step": 260 }, { "completion_length": 3024.8680419921875, "epoch": 0.8938356164383562, "grad_norm": 0.14392928779125214, "kl": 0.0400390625, "learning_rate": 1.4554267916537495e-07, "loss": 0.0016, "reward": 0.30607690662145615, "reward_std": 0.3898402601480484, "rewards/cosine_scaled_reward": -0.027256430126726627, "rewards/format_reward": 0.3333333432674408, "step": 261 }, { "completion_length": 2864.013916015625, "epoch": 0.8972602739726028, "grad_norm": 0.12024541944265366, "kl": 0.0279541015625, "learning_rate": 1.432748035231658e-07, "loss": 0.0011, "reward": 0.5254741907119751, "reward_std": 0.46267665922641754, "rewards/cosine_scaled_reward": -0.0231369249522686, "rewards/format_reward": 0.548611119389534, "step": 262 }, { "completion_length": 2763.9583740234375, "epoch": 0.9006849315068494, "grad_norm": 0.24614769220352173, "kl": 0.040771484375, "learning_rate": 1.4106199196189608e-07, "loss": 0.0016, "reward": 0.35452011227607727, "reward_std": 0.47811339795589447, "rewards/cosine_scaled_reward": -0.09686877019703388, "rewards/format_reward": 0.4513889104127884, "step": 263 }, { "completion_length": 2571.0, "epoch": 0.9041095890410958, "grad_norm": 0.14297829568386078, "kl": 0.03631591796875, "learning_rate": 1.3890454406082956e-07, "loss": 0.0015, "reward": 0.49256862699985504, "reward_std": 0.3805427849292755, "rewards/cosine_scaled_reward": -0.0074313730001449585, "rewards/format_reward": 0.5, "step": 264 }, { "completion_length": 2234.326416015625, "epoch": 0.9075342465753424, "grad_norm": 0.15764550864696503, "kl": 0.032470703125, "learning_rate": 1.3680275190387675e-07, "loss": 0.0013, "reward": 0.7705516219139099, "reward_std": 0.4592142701148987, "rewards/cosine_scaled_reward": 0.05527384765446186, "rewards/format_reward": 0.7152777910232544, "step": 265 }, { "completion_length": 2849.0347900390625, "epoch": 0.910958904109589, "grad_norm": 0.13773973286151886, "kl": 0.03289794921875, "learning_rate": 1.3475690004005097e-07, "loss": 0.0013, "reward": 0.5077018439769745, "reward_std": 0.3785110265016556, "rewards/cosine_scaled_reward": -0.013131474610418081, "rewards/format_reward": 0.5208333432674408, "step": 266 }, { "completion_length": 3086.2431640625, "epoch": 0.9143835616438356, "grad_norm": 0.1191553846001625, "kl": 0.0340576171875, "learning_rate": 1.3276726544494571e-07, "loss": 0.0014, "reward": 0.38693176954984665, "reward_std": 0.45597271621227264, "rewards/cosine_scaled_reward": -0.1269571604207158, "rewards/format_reward": 0.513888880610466, "step": 267 }, { "completion_length": 2707.8681640625, "epoch": 0.9178082191780822, "grad_norm": 0.1543823927640915, "kl": 0.0347900390625, "learning_rate": 1.308341174832359e-07, "loss": 0.0014, "reward": 0.6485514044761658, "reward_std": 0.4456641525030136, "rewards/cosine_scaled_reward": 0.03744027949869633, "rewards/format_reward": 0.6111111044883728, "step": 268 }, { "completion_length": 2430.388916015625, "epoch": 0.9212328767123288, "grad_norm": 0.20193035900592804, "kl": 0.040283203125, "learning_rate": 1.2895771787221088e-07, "loss": 0.0016, "reward": 0.5673209726810455, "reward_std": 0.47004686295986176, "rewards/cosine_scaled_reward": -0.009067919105291367, "rewards/format_reward": 0.5763888955116272, "step": 269 }, { "completion_length": 2542.5208740234375, "epoch": 0.9246575342465754, "grad_norm": 0.14499835669994354, "kl": 0.036376953125, "learning_rate": 1.2713832064634125e-07, "loss": 0.0015, "reward": 0.6222769916057587, "reward_std": 0.46079638600349426, "rewards/cosine_scaled_reward": 0.05283256620168686, "rewards/format_reward": 0.5694444477558136, "step": 270 }, { "completion_length": 2701.8056640625, "epoch": 0.928082191780822, "grad_norm": 0.16896043717861176, "kl": 0.0341796875, "learning_rate": 1.2537617212288742e-07, "loss": 0.0014, "reward": 0.36604734510183334, "reward_std": 0.361560583114624, "rewards/cosine_scaled_reward": -0.0922860149294138, "rewards/format_reward": 0.4583333432674408, "step": 271 }, { "completion_length": 2764.4583740234375, "epoch": 0.9315068493150684, "grad_norm": 0.16842736303806305, "kl": 0.0345458984375, "learning_rate": 1.2367151086855187e-07, "loss": 0.0014, "reward": 0.5705253630876541, "reward_std": 0.5290426909923553, "rewards/cosine_scaled_reward": 0.07746978849172592, "rewards/format_reward": 0.4930555522441864, "step": 272 }, { "completion_length": 2728.6597900390625, "epoch": 0.934931506849315, "grad_norm": 0.1411057859659195, "kl": 0.0361328125, "learning_rate": 1.220245676671809e-07, "loss": 0.0014, "reward": 0.5213780552148819, "reward_std": 0.5097576379776001, "rewards/cosine_scaled_reward": -0.02723303623497486, "rewards/format_reward": 0.5486111044883728, "step": 273 }, { "completion_length": 3240.3055419921875, "epoch": 0.9383561643835616, "grad_norm": 0.1661859005689621, "kl": 0.03228759765625, "learning_rate": 1.2043556548852063e-07, "loss": 0.0013, "reward": 0.25158608704805374, "reward_std": 0.4693699926137924, "rewards/cosine_scaled_reward": -0.09563614055514336, "rewards/format_reward": 0.3472222238779068, "step": 274 }, { "completion_length": 2608.8472900390625, "epoch": 0.9417808219178082, "grad_norm": 0.14709579944610596, "kl": 0.0390625, "learning_rate": 1.1890471945802999e-07, "loss": 0.0016, "reward": 0.5507668703794479, "reward_std": 0.32081814110279083, "rewards/cosine_scaled_reward": 0.0021557584404945374, "rewards/format_reward": 0.548611119389534, "step": 275 }, { "completion_length": 2953.8055419921875, "epoch": 0.9452054794520548, "grad_norm": 0.13219843804836273, "kl": 0.035888671875, "learning_rate": 1.1743223682775649e-07, "loss": 0.0014, "reward": 0.4363091439008713, "reward_std": 0.5200669467449188, "rewards/cosine_scaled_reward": -0.0428575212135911, "rewards/format_reward": 0.4791666865348816, "step": 276 }, { "completion_length": 2877.375, "epoch": 0.9486301369863014, "grad_norm": 0.1316324770450592, "kl": 0.038818359375, "learning_rate": 1.160183169482775e-07, "loss": 0.0016, "reward": 0.5365364849567413, "reward_std": 0.5149263441562653, "rewards/cosine_scaled_reward": 0.05042533949017525, "rewards/format_reward": 0.4861111342906952, "step": 277 }, { "completion_length": 2528.6944580078125, "epoch": 0.952054794520548, "grad_norm": 0.21519577503204346, "kl": 0.03662109375, "learning_rate": 1.1466315124171128e-07, "loss": 0.0015, "reward": 0.4645202308893204, "reward_std": 0.4477800279855728, "rewards/cosine_scaled_reward": -0.07714640907943249, "rewards/format_reward": 0.5416666716337204, "step": 278 }, { "completion_length": 2701.9862060546875, "epoch": 0.9554794520547946, "grad_norm": 0.17575909197330475, "kl": 0.0399169921875, "learning_rate": 1.1336692317580158e-07, "loss": 0.0016, "reward": 0.40750324726104736, "reward_std": 0.4330911487340927, "rewards/cosine_scaled_reward": -0.07166343554854393, "rewards/format_reward": 0.4791666716337204, "step": 279 }, { "completion_length": 2913.5, "epoch": 0.958904109589041, "grad_norm": 0.1264602094888687, "kl": 0.03594970703125, "learning_rate": 1.1212980823907929e-07, "loss": 0.0014, "reward": 0.5513550490140915, "reward_std": 0.5005469620227814, "rewards/cosine_scaled_reward": 0.04441063478589058, "rewards/format_reward": 0.5069444626569748, "step": 280 }, { "completion_length": 2647.451416015625, "epoch": 0.9623287671232876, "grad_norm": 0.2159995436668396, "kl": 0.0328369140625, "learning_rate": 1.1095197391710362e-07, "loss": 0.0013, "reward": 0.5046227127313614, "reward_std": 0.4968739449977875, "rewards/cosine_scaled_reward": -0.01621063333004713, "rewards/format_reward": 0.5208333283662796, "step": 281 }, { "completion_length": 2636.2708740234375, "epoch": 0.9657534246575342, "grad_norm": 0.16840696334838867, "kl": 0.0361328125, "learning_rate": 1.0983357966978745e-07, "loss": 0.0014, "reward": 0.4477865546941757, "reward_std": 0.47679105401039124, "rewards/cosine_scaled_reward": -0.0730467566754669, "rewards/format_reward": 0.5208333283662796, "step": 282 }, { "completion_length": 2957.4930419921875, "epoch": 0.9691780821917808, "grad_norm": 0.14714959263801575, "kl": 0.03460693359375, "learning_rate": 1.0877477690980931e-07, "loss": 0.0014, "reward": 0.35540203750133514, "reward_std": 0.47071878612041473, "rewards/cosine_scaled_reward": -0.09598685055971146, "rewards/format_reward": 0.4513888955116272, "step": 283 }, { "completion_length": 2786.5208740234375, "epoch": 0.9726027397260274, "grad_norm": 0.1734078973531723, "kl": 0.044677734375, "learning_rate": 1.0777570898211405e-07, "loss": 0.0018, "reward": 0.3655647486448288, "reward_std": 0.4497402310371399, "rewards/cosine_scaled_reward": -0.05110191088169813, "rewards/format_reward": 0.4166666716337204, "step": 284 }, { "completion_length": 2854.5069580078125, "epoch": 0.976027397260274, "grad_norm": 0.140737384557724, "kl": 0.0367431640625, "learning_rate": 1.068365111445064e-07, "loss": 0.0015, "reward": 0.3777775317430496, "reward_std": 0.4247249662876129, "rewards/cosine_scaled_reward": -0.09444468468427658, "rewards/format_reward": 0.4722222238779068, "step": 285 }, { "completion_length": 2879.7501220703125, "epoch": 0.9794520547945206, "grad_norm": 0.15024779736995697, "kl": 0.0333251953125, "learning_rate": 1.0595731054933934e-07, "loss": 0.0013, "reward": 0.24627424031496048, "reward_std": 0.3542858809232712, "rewards/cosine_scaled_reward": -0.12178133055567741, "rewards/format_reward": 0.3680555671453476, "step": 286 }, { "completion_length": 2901.6944580078125, "epoch": 0.9828767123287672, "grad_norm": 0.12313710153102875, "kl": 0.03375244140625, "learning_rate": 1.0513822622629978e-07, "loss": 0.0013, "reward": 0.5626899600028992, "reward_std": 0.47839629650115967, "rewards/cosine_scaled_reward": 0.06268996931612492, "rewards/format_reward": 0.5000000149011612, "step": 287 }, { "completion_length": 2720.4097900390625, "epoch": 0.9863013698630136, "grad_norm": 0.13372260332107544, "kl": 0.03326416015625, "learning_rate": 1.0437936906629334e-07, "loss": 0.0013, "reward": 0.42101193219423294, "reward_std": 0.44517213106155396, "rewards/cosine_scaled_reward": -0.07898806827142835, "rewards/format_reward": 0.5, "step": 288 }, { "completion_length": 2722.4305419921875, "epoch": 0.9897260273972602, "grad_norm": 0.18578775227069855, "kl": 0.03375244140625, "learning_rate": 1.0368084180643224e-07, "loss": 0.0013, "reward": 0.574064090847969, "reward_std": 0.4914132207632065, "rewards/cosine_scaled_reward": 0.018508493900299072, "rewards/format_reward": 0.5555555522441864, "step": 289 }, { "completion_length": 2860.0208740234375, "epoch": 0.9931506849315068, "grad_norm": 0.16790378093719482, "kl": 0.037841796875, "learning_rate": 1.0304273901612565e-07, "loss": 0.0015, "reward": 0.4475916475057602, "reward_std": 0.35639651119709015, "rewards/cosine_scaled_reward": -0.05240832082927227, "rewards/format_reward": 0.5000000149011612, "step": 290 }, { "completion_length": 2500.354248046875, "epoch": 0.9965753424657534, "grad_norm": 0.19861853122711182, "kl": 0.0384521484375, "learning_rate": 1.0246514708427701e-07, "loss": 0.0015, "reward": 0.5283551514148712, "reward_std": 0.4327033758163452, "rewards/cosine_scaled_reward": -0.03414486348628998, "rewards/format_reward": 0.5625, "step": 291 }, { "completion_length": 3228.75, "epoch": 1.0, "grad_norm": 0.14693370461463928, "kl": 0.03466796875, "learning_rate": 1.0194814420758804e-07, "loss": 0.0014, "reward": 0.8688084781169891, "reward_std": 0.7189086824655533, "rewards/cosine_scaled_reward": -0.00619150698184967, "rewards/format_reward": 0.875, "step": 292 }, { "completion_length": 2950.875, "epoch": 1.0034246575342465, "grad_norm": 0.1779322773218155, "kl": 0.0443115234375, "learning_rate": 1.0149180037997228e-07, "loss": 0.0018, "reward": 0.3629312068223953, "reward_std": 0.47756847739219666, "rewards/cosine_scaled_reward": -0.06067990604788065, "rewards/format_reward": 0.423611119389534, "step": 293 }, { "completion_length": 2664.9375, "epoch": 1.0068493150684932, "grad_norm": 0.140543133020401, "kl": 0.03173828125, "learning_rate": 1.0109617738307911e-07, "loss": 0.0013, "reward": 0.42808833718299866, "reward_std": 0.38089750707149506, "rewards/cosine_scaled_reward": -0.08580057881772518, "rewards/format_reward": 0.513888880610466, "step": 294 }, { "completion_length": 2873.826416015625, "epoch": 1.0102739726027397, "grad_norm": 0.14133252203464508, "kl": 0.032958984375, "learning_rate": 1.0076132877792932e-07, "loss": 0.0013, "reward": 0.463130921125412, "reward_std": 0.6191278696060181, "rewards/cosine_scaled_reward": -0.03686907887458801, "rewards/format_reward": 0.5, "step": 295 }, { "completion_length": 3035.78466796875, "epoch": 1.0136986301369864, "grad_norm": 0.12591783702373505, "kl": 0.03314208984375, "learning_rate": 1.0048729989766394e-07, "loss": 0.0013, "reward": 0.5283119380474091, "reward_std": 0.4726581275463104, "rewards/cosine_scaled_reward": 0.06997863575816154, "rewards/format_reward": 0.4583333432674408, "step": 296 }, { "completion_length": 2424.90283203125, "epoch": 1.0171232876712328, "grad_norm": 0.14170390367507935, "kl": 0.0355224609375, "learning_rate": 1.002741278414069e-07, "loss": 0.0014, "reward": 0.8962737321853638, "reward_std": 0.46189258992671967, "rewards/cosine_scaled_reward": 0.20877373218536377, "rewards/format_reward": 0.6875000298023224, "step": 297 }, { "completion_length": 2943.513916015625, "epoch": 1.0205479452054795, "grad_norm": 0.1521555632352829, "kl": 0.0386962890625, "learning_rate": 1.0012184146924223e-07, "loss": 0.0015, "reward": 0.26369042694568634, "reward_std": 0.40857334434986115, "rewards/cosine_scaled_reward": -0.15297627449035645, "rewards/format_reward": 0.4166666716337204, "step": 298 }, { "completion_length": 2802.791748046875, "epoch": 1.023972602739726, "grad_norm": 0.13754691183567047, "kl": 0.0372314453125, "learning_rate": 1.0003046139830701e-07, "loss": 0.0015, "reward": 0.6113243997097015, "reward_std": 0.4505104422569275, "rewards/cosine_scaled_reward": 0.055768875405192375, "rewards/format_reward": 0.5555555820465088, "step": 299 }, { "completion_length": 2618.9722900390625, "epoch": 1.0273972602739727, "grad_norm": 0.15953682363033295, "kl": 0.0386962890625, "learning_rate": 1e-07, "loss": 0.0015, "reward": 0.6579746007919312, "reward_std": 0.4734661132097244, "rewards/cosine_scaled_reward": 0.08158569037914276, "rewards/format_reward": 0.5763888955116272, "step": 300 }, { "epoch": 1.0273972602739727, "step": 300, "total_flos": 0.0, "train_loss": 0.0008326698157195504, "train_runtime": 30165.0377, "train_samples_per_second": 0.239, "train_steps_per_second": 0.01 } ], "logging_steps": 1, "max_steps": 300, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }