{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2, "eval_steps": 500, "global_step": 350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 3134.95849609375, "epoch": 0.0005714285714285715, "grad_norm": 0.7448968291282654, "kl": 0.0, "learning_rate": 2.857142857142857e-08, "loss": -0.9387, "reward": 0.1666666716337204, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.1666666716337204, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 2868.9583740234375, "epoch": 0.001142857142857143, "grad_norm": 0.39778050780296326, "kl": 0.0, "learning_rate": 5.714285714285714e-08, "loss": -0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 2831.75, "epoch": 0.0017142857142857142, "grad_norm": 1.477035641670227, "kl": 2.5451183319091797e-05, "learning_rate": 8.571428571428572e-08, "loss": -1.9087, "reward": 0.2916666865348816, "reward_std": 0.26603007316589355, "rewards/accuracy_reward": 0.2916666865348816, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 2640.375, "epoch": 0.002285714285714286, "grad_norm": 1.3637254238128662, "kl": 2.60770320892334e-05, "learning_rate": 1.1428571428571427e-07, "loss": -0.9903, "reward": 0.125, "reward_std": 0.1369306445121765, "rewards/accuracy_reward": 0.125, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 2990.416748046875, "epoch": 0.002857142857142857, "grad_norm": 0.9073159694671631, "kl": 4.756450653076172e-05, "learning_rate": 1.4285714285714285e-07, "loss": -0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 2824.9583740234375, "epoch": 0.0034285714285714284, "grad_norm": 0.2965989410877228, "kl": 3.439188003540039e-05, "learning_rate": 1.7142857142857143e-07, "loss": -0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 3328.125, "epoch": 0.004, "grad_norm": 0.6783003807067871, "kl": 3.141164779663086e-05, "learning_rate": 2e-07, "loss": -0.735, "reward": 0.0416666679084301, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.0416666679084301, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 2861.7501220703125, "epoch": 0.004571428571428572, "grad_norm": 0.6755802035331726, "kl": 2.0116567611694336e-05, "learning_rate": 2.2857142857142855e-07, "loss": -0.733, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 2332.33349609375, "epoch": 0.005142857142857143, "grad_norm": 1.6076233386993408, "kl": 4.553794860839844e-05, "learning_rate": 2.571428571428571e-07, "loss": -1.4705, "reward": 0.2500000111758709, "reward_std": 0.20412415266036987, "rewards/accuracy_reward": 0.2500000111758709, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 2922.7083740234375, "epoch": 0.005714285714285714, "grad_norm": 1.9102457761764526, "kl": 2.950429916381836e-05, "learning_rate": 2.857142857142857e-07, "loss": -2.88, "reward": 0.2916666716337204, "reward_std": 0.395129531621933, "rewards/accuracy_reward": 0.2916666716337204, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 2510.08349609375, "epoch": 0.006285714285714286, "grad_norm": 2.036496639251709, "kl": 5.0187110900878906e-05, "learning_rate": 3.142857142857143e-07, "loss": -1.6748, "reward": 0.2083333395421505, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.2083333395421505, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 2951.5001220703125, "epoch": 0.006857142857142857, "grad_norm": 3.159651279449463, "kl": 3.3855438232421875e-05, "learning_rate": 3.4285714285714286e-07, "loss": -1.9348, "reward": 0.2083333358168602, "reward_std": 0.26603008806705475, "rewards/accuracy_reward": 0.2083333358168602, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 2773.166748046875, "epoch": 0.0074285714285714285, "grad_norm": 0.7064540386199951, "kl": 1.9043684005737305e-05, "learning_rate": 3.7142857142857145e-07, "loss": -1.4774, "reward": 0.25, "reward_std": 0.20412413775920868, "rewards/accuracy_reward": 0.25, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 2341.0833740234375, "epoch": 0.008, "grad_norm": 0.7222815155982971, "kl": 4.0650367736816406e-05, "learning_rate": 4e-07, "loss": -0.7442, "reward": 0.4583333432674408, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.4583333432674408, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 2472.791748046875, "epoch": 0.008571428571428572, "grad_norm": 0.6996123790740967, "kl": 2.658367156982422e-05, "learning_rate": 4.285714285714285e-07, "loss": -0.7392, "reward": 0.0416666679084301, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.0416666679084301, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 3173.7501220703125, "epoch": 0.009142857142857144, "grad_norm": 1.0215026140213013, "kl": 4.792213439941406e-05, "learning_rate": 4.571428571428571e-07, "loss": -0.9988, "reward": 0.125, "reward_std": 0.1369306445121765, "rewards/accuracy_reward": 0.125, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 3044.916748046875, "epoch": 0.009714285714285713, "grad_norm": 0.6144117116928101, "kl": 4.780292510986328e-05, "learning_rate": 4.857142857142857e-07, "loss": -0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 3092.2083740234375, "epoch": 0.010285714285714285, "grad_norm": 1.0112086534500122, "kl": 4.398822784423828e-05, "learning_rate": 5.142857142857142e-07, "loss": -1.7343, "reward": 0.1666666716337204, "reward_std": 0.23899272084236145, "rewards/accuracy_reward": 0.1666666716337204, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 2726.7501220703125, "epoch": 0.010857142857142857, "grad_norm": 0.9500206112861633, "kl": 5.0067901611328125e-05, "learning_rate": 5.428571428571428e-07, "loss": -0.9414, "reward": 0.1666666716337204, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.1666666716337204, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 1488.6250915527344, "epoch": 0.011428571428571429, "grad_norm": 1.102255940437317, "kl": 2.771615982055664e-05, "learning_rate": 5.714285714285714e-07, "loss": -1.685, "reward": 0.375, "reward_std": 0.23116150498390198, "rewards/accuracy_reward": 0.375, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 1883.8334350585938, "epoch": 0.012, "grad_norm": 2.169161558151245, "kl": 4.553794860839844e-05, "learning_rate": 6e-07, "loss": -2.3845, "reward": 0.1666666716337204, "reward_std": 0.3332235962152481, "rewards/accuracy_reward": 0.1666666716337204, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 2890.416748046875, "epoch": 0.012571428571428572, "grad_norm": 3.423088550567627, "kl": 3.8743019104003906e-05, "learning_rate": 6.285714285714286e-07, "loss": -3.6207, "reward": 0.5000000298023224, "reward_std": 0.49719157814979553, "rewards/accuracy_reward": 0.5000000298023224, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 2703.9583740234375, "epoch": 0.013142857142857144, "grad_norm": 1.8604422807693481, "kl": 5.328655242919922e-05, "learning_rate": 6.571428571428571e-07, "loss": -1.8681, "reward": 0.1666666716337204, "reward_std": 0.25819888710975647, "rewards/accuracy_reward": 0.1666666716337204, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 2797.5, "epoch": 0.013714285714285714, "grad_norm": 1.5852028131484985, "kl": 6.42538070678711e-05, "learning_rate": 6.857142857142857e-07, "loss": -1.6552, "reward": 0.291666679084301, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.291666679084301, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 1578.5417175292969, "epoch": 0.014285714285714285, "grad_norm": 1.9514633417129517, "kl": 5.4836273193359375e-05, "learning_rate": 7.142857142857143e-07, "loss": -2.4821, "reward": 0.375, "reward_std": 0.3410547971725464, "rewards/accuracy_reward": 0.375, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 2654.666748046875, "epoch": 0.014857142857142857, "grad_norm": 1.7990684509277344, "kl": 7.772445678710938e-05, "learning_rate": 7.428571428571429e-07, "loss": -1.8798, "reward": 0.1666666716337204, "reward_std": 0.25819888710975647, "rewards/accuracy_reward": 0.1666666716337204, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 3421.9583740234375, "epoch": 0.015428571428571429, "grad_norm": 0.19747313857078552, "kl": 9.441375732421875e-05, "learning_rate": 7.714285714285714e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 2514.3751220703125, "epoch": 0.016, "grad_norm": 1.2808500528335571, "kl": 8.916854858398438e-05, "learning_rate": 8e-07, "loss": -1.7419, "reward": 0.1666666679084301, "reward_std": 0.23899271339178085, "rewards/accuracy_reward": 0.1666666679084301, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 2835.9583740234375, "epoch": 0.01657142857142857, "grad_norm": 2.1129915714263916, "kl": 9.989738464355469e-05, "learning_rate": 8.285714285714285e-07, "loss": -1.9361, "reward": 0.2083333432674408, "reward_std": 0.26603007316589355, "rewards/accuracy_reward": 0.2083333432674408, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 3564.416748046875, "epoch": 0.017142857142857144, "grad_norm": 1.2208162546157837, "kl": 0.0001220703125, "learning_rate": 8.57142857142857e-07, "loss": -1.8741, "reward": 0.1666666716337204, "reward_std": 0.25819888710975647, "rewards/accuracy_reward": 0.1666666716337204, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 2305.2916870117188, "epoch": 0.017714285714285714, "grad_norm": 1.7113858461380005, "kl": 0.00023293495178222656, "learning_rate": 8.857142857142856e-07, "loss": -1.6837, "reward": 0.2083333395421505, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.2083333395421505, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 2269.6666870117188, "epoch": 0.018285714285714287, "grad_norm": 2.3853516578674316, "kl": 0.00031375885009765625, "learning_rate": 9.142857142857142e-07, "loss": -1.8832, "reward": 0.3333333432674408, "reward_std": 0.25819888710975647, "rewards/accuracy_reward": 0.3333333432674408, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.018857142857142857, "grad_norm": 0.5030116438865662, "kl": 0.00035381317138671875, "learning_rate": 9.428571428571428e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 2739.6250610351562, "epoch": 0.019428571428571427, "grad_norm": 1.6508129835128784, "kl": 0.00036144256591796875, "learning_rate": 9.714285714285715e-07, "loss": -1.8729, "reward": 0.1666666716337204, "reward_std": 0.25819888710975647, "rewards/accuracy_reward": 0.1666666716337204, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 3151.291748046875, "epoch": 0.02, "grad_norm": 0.6652354001998901, "kl": 0.0004940032958984375, "learning_rate": 1e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 2798.291748046875, "epoch": 0.02057142857142857, "grad_norm": 1.781134843826294, "kl": 0.0005702972412109375, "learning_rate": 9.999776201301742e-07, "loss": -1.6796, "reward": 0.1250000037252903, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.1250000037252903, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 3469.8751220703125, "epoch": 0.021142857142857144, "grad_norm": 1.4302971363067627, "kl": 0.000423431396484375, "learning_rate": 9.999104827467354e-07, "loss": -1.7207, "reward": 0.1666666679084301, "reward_std": 0.23899271339178085, "rewards/accuracy_reward": 0.1666666679084301, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 3070.75, "epoch": 0.021714285714285714, "grad_norm": 1.0685802698135376, "kl": 0.0008945465087890625, "learning_rate": 9.997985945275765e-07, "loss": -1.7185, "reward": 0.1666666679084301, "reward_std": 0.23899271339178085, "rewards/accuracy_reward": 0.1666666679084301, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 1992.791748046875, "epoch": 0.022285714285714287, "grad_norm": 1.3023139238357544, "kl": 0.00078582763671875, "learning_rate": 9.996419666017806e-07, "loss": -0.9405, "reward": 0.4166666865348816, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.4166666865348816, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 3451.2083740234375, "epoch": 0.022857142857142857, "grad_norm": 0.3098903298377991, "kl": 0.000820159912109375, "learning_rate": 9.994406145485149e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 2855.041748046875, "epoch": 0.023428571428571427, "grad_norm": 0.9272823333740234, "kl": 0.001934051513671875, "learning_rate": 9.991945583954808e-07, "loss": -0.7224, "reward": 0.2916666865348816, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.2916666865348816, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 2162.3334350585938, "epoch": 0.024, "grad_norm": 1.3229894638061523, "kl": 0.00215911865234375, "learning_rate": 9.989038226169207e-07, "loss": -1.6771, "reward": 0.1250000037252903, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.1250000037252903, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 3419.291748046875, "epoch": 0.02457142857142857, "grad_norm": 1.2003979682922363, "kl": 0.00185394287109375, "learning_rate": 9.985684361311858e-07, "loss": -1.4837, "reward": 0.0833333358168602, "reward_std": 0.20412414520978928, "rewards/accuracy_reward": 0.0833333358168602, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 2956.9583740234375, "epoch": 0.025142857142857144, "grad_norm": 1.1588594913482666, "kl": 0.00295257568359375, "learning_rate": 9.981884322978574e-07, "loss": -1.8687, "reward": 0.1666666716337204, "reward_std": 0.25819888710975647, "rewards/accuracy_reward": 0.1666666716337204, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 3275.166748046875, "epoch": 0.025714285714285714, "grad_norm": 0.5378715991973877, "kl": 0.00176239013671875, "learning_rate": 9.977638489144307e-07, "loss": -0.731, "reward": 0.0416666679084301, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.0416666679084301, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 2521.20849609375, "epoch": 0.026285714285714287, "grad_norm": 1.5500916242599487, "kl": 0.00409698486328125, "learning_rate": 9.972947282125533e-07, "loss": -2.6114, "reward": 0.2916666716337204, "reward_std": 0.3602609634399414, "rewards/accuracy_reward": 0.2916666716337204, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 2619.5416870117188, "epoch": 0.026857142857142857, "grad_norm": 0.6222373247146606, "kl": 0.00214385986328125, "learning_rate": 9.967811168538266e-07, "loss": -0.7438, "reward": 0.4583333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.4583333432674408, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 1774.7500610351562, "epoch": 0.027428571428571427, "grad_norm": 2.3108158111572266, "kl": 0.00504302978515625, "learning_rate": 9.962230659251635e-07, "loss": -2.8747, "reward": 0.375, "reward_std": 0.395129531621933, "rewards/accuracy_reward": 0.375, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 2151.1666870117188, "epoch": 0.028, "grad_norm": 1.405102252960205, "kl": 0.00498199462890625, "learning_rate": 9.956206309337066e-07, "loss": -1.7383, "reward": 0.1666666716337204, "reward_std": 0.23899272084236145, "rewards/accuracy_reward": 0.1666666716337204, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 2918.75, "epoch": 0.02857142857142857, "grad_norm": 0.5462047457695007, "kl": 0.0054779052734375, "learning_rate": 9.949738718013078e-07, "loss": -0.7418, "reward": 0.2916666865348816, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.2916666865348816, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 2755.2084350585938, "epoch": 0.029142857142857144, "grad_norm": 1.4510318040847778, "kl": 0.003520965576171875, "learning_rate": 9.94282852858568e-07, "loss": -1.4865, "reward": 0.2500000111758709, "reward_std": 0.20412414520978928, "rewards/accuracy_reward": 0.2500000111758709, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 3465.5, "epoch": 0.029714285714285714, "grad_norm": 0.6955327987670898, "kl": 0.003753662109375, "learning_rate": 9.935476428384382e-07, "loss": -0.9347, "reward": 0.0833333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.0833333358168602, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 3004.291748046875, "epoch": 0.030285714285714287, "grad_norm": 1.4090065956115723, "kl": 0.0052032470703125, "learning_rate": 9.927683148693833e-07, "loss": -1.6757, "reward": 0.1250000037252903, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.1250000037252903, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 3041.541748046875, "epoch": 0.030857142857142857, "grad_norm": 0.2869970500469208, "kl": 0.0038604736328125, "learning_rate": 9.919449464681074e-07, "loss": 0.0006, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 3156.916748046875, "epoch": 0.03142857142857143, "grad_norm": 0.863756537437439, "kl": 0.0051727294921875, "learning_rate": 9.910776195318447e-07, "loss": -1.6736, "reward": 0.2083333432674408, "reward_std": 0.23116151988506317, "rewards/accuracy_reward": 0.2083333432674408, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 2776.041748046875, "epoch": 0.032, "grad_norm": 0.8829509615898132, "kl": 0.00750732421875, "learning_rate": 9.901664203302124e-07, "loss": -0.7267, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 2872.666748046875, "epoch": 0.03257142857142857, "grad_norm": 1.085909366607666, "kl": 0.0074005126953125, "learning_rate": 9.89211439496631e-07, "loss": -2.6113, "reward": 0.2083333395421505, "reward_std": 0.3602609634399414, "rewards/accuracy_reward": 0.2083333395421505, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 3217.875, "epoch": 0.03314285714285714, "grad_norm": 0.8099249005317688, "kl": 0.0081024169921875, "learning_rate": 9.882127720193078e-07, "loss": -1.7368, "reward": 0.1666666679084301, "reward_std": 0.23899271339178085, "rewards/accuracy_reward": 0.1666666679084301, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 2047.6250610351562, "epoch": 0.03371428571428572, "grad_norm": 0.9201177358627319, "kl": 0.01129150390625, "learning_rate": 9.871705172317903e-07, "loss": -1.4821, "reward": 0.0833333358168602, "reward_std": 0.20412413775920868, "rewards/accuracy_reward": 0.0833333358168602, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 3198.166748046875, "epoch": 0.03428571428571429, "grad_norm": 0.7448273301124573, "kl": 0.009307861328125, "learning_rate": 9.86084778803085e-07, "loss": -1.7306, "reward": 0.1666666716337204, "reward_std": 0.23899272084236145, "rewards/accuracy_reward": 0.1666666716337204, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 2209.9583740234375, "epoch": 0.03485714285714286, "grad_norm": 2.11501145362854, "kl": 0.006805419921875, "learning_rate": 9.849556647273461e-07, "loss": -3.6049, "reward": 0.4166666716337204, "reward_std": 0.4971916079521179, "rewards/accuracy_reward": 0.4166666716337204, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 2689.3333740234375, "epoch": 0.03542857142857143, "grad_norm": 0.3600199520587921, "kl": 0.00649261474609375, "learning_rate": 9.83783287313134e-07, "loss": 0.001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 3274.6251220703125, "epoch": 0.036, "grad_norm": 0.7515490055084229, "kl": 0.009490966796875, "learning_rate": 9.825677631722435e-07, "loss": -1.4768, "reward": 0.0833333358168602, "reward_std": 0.20412415266036987, "rewards/accuracy_reward": 0.0833333358168602, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 3410.041748046875, "epoch": 0.036571428571428574, "grad_norm": 0.2419460266828537, "kl": 0.006988525390625, "learning_rate": 9.81309213208106e-07, "loss": 0.0011, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 3360.58349609375, "epoch": 0.037142857142857144, "grad_norm": 1.0176174640655518, "kl": 0.0145263671875, "learning_rate": 9.800077626037633e-07, "loss": -0.9358, "reward": 0.0833333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.0833333358168602, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 1987.7084350585938, "epoch": 0.037714285714285714, "grad_norm": 1.1186784505844116, "kl": 0.008026123046875, "learning_rate": 9.786635408094157e-07, "loss": -1.7376, "reward": 0.1666666716337204, "reward_std": 0.23899272084236145, "rewards/accuracy_reward": 0.1666666716337204, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 1821.4166870117188, "epoch": 0.038285714285714284, "grad_norm": 2.2777140140533447, "kl": 0.013885498046875, "learning_rate": 9.772766815295467e-07, "loss": -2.2118, "reward": 0.1250000037252903, "reward_std": 0.3061862140893936, "rewards/accuracy_reward": 0.1250000037252903, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 3485.416748046875, "epoch": 0.038857142857142854, "grad_norm": 1.044973373413086, "kl": 0.008209228515625, "learning_rate": 9.758473227096238e-07, "loss": -2.2271, "reward": 0.1250000037252903, "reward_std": 0.306186206638813, "rewards/accuracy_reward": 0.1250000037252903, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 2774.791748046875, "epoch": 0.03942857142857143, "grad_norm": 0.6572657823562622, "kl": 0.00588226318359375, "learning_rate": 9.743756065223773e-07, "loss": -0.9946, "reward": 0.125, "reward_std": 0.1369306445121765, "rewards/accuracy_reward": 0.125, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 2589.7083740234375, "epoch": 0.04, "grad_norm": 1.0476188659667969, "kl": 0.011474609375, "learning_rate": 9.728616793536587e-07, "loss": -0.9965, "reward": 0.125, "reward_std": 0.1369306445121765, "rewards/accuracy_reward": 0.125, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 1670.9166870117188, "epoch": 0.04057142857142857, "grad_norm": 0.6323314309120178, "kl": 0.0037994384765625, "learning_rate": 9.713056917878816e-07, "loss": 0.0008, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 2806.416748046875, "epoch": 0.04114285714285714, "grad_norm": 0.5533509850502014, "kl": 0.01007080078125, "learning_rate": 9.697077985930424e-07, "loss": -0.7427, "reward": 0.2083333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.2083333432674408, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 1994.25, "epoch": 0.04171428571428572, "grad_norm": 1.0187442302703857, "kl": 0.0106658935546875, "learning_rate": 9.68068158705326e-07, "loss": -0.997, "reward": 0.125, "reward_std": 0.1369306445121765, "rewards/accuracy_reward": 0.125, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 2693.125, "epoch": 0.04228571428571429, "grad_norm": 0.834760844707489, "kl": 0.00494384765625, "learning_rate": 9.663869352132985e-07, "loss": -1.6737, "reward": 0.2083333395421505, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.2083333395421505, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 2046.6666870117188, "epoch": 0.04285714285714286, "grad_norm": 0.8847272992134094, "kl": 0.01031494140625, "learning_rate": 9.646642953416834e-07, "loss": -0.7426, "reward": 0.2083333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.2083333432674408, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 3069.7083740234375, "epoch": 0.04342857142857143, "grad_norm": 0.8105300664901733, "kl": 0.0060882568359375, "learning_rate": 9.6290041043473e-07, "loss": -1.67, "reward": 0.1250000037252903, "reward_std": 0.23116151988506317, "rewards/accuracy_reward": 0.1250000037252903, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 2794.1251220703125, "epoch": 0.044, "grad_norm": 0.8459436893463135, "kl": 0.011474609375, "learning_rate": 9.610954559391704e-07, "loss": -0.9395, "reward": 0.0833333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.0833333358168602, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 2131.7083740234375, "epoch": 0.044571428571428574, "grad_norm": 0.9749023914337158, "kl": 0.00823974609375, "learning_rate": 9.592496113867668e-07, "loss": -1.9386, "reward": 0.5416666716337204, "reward_std": 0.26603008806705475, "rewards/accuracy_reward": 0.5416666716337204, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 2000.5833740234375, "epoch": 0.045142857142857144, "grad_norm": 0.6731491684913635, "kl": 0.0103607177734375, "learning_rate": 9.573630603764566e-07, "loss": -0.7392, "reward": 0.2916666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.2916666679084301, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 3062.0833740234375, "epoch": 0.045714285714285714, "grad_norm": 0.9580934643745422, "kl": 0.0061187744140625, "learning_rate": 9.554359905560885e-07, "loss": -0.988, "reward": 0.125, "reward_std": 0.1369306445121765, "rewards/accuracy_reward": 0.125, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 1195.0000610351562, "epoch": 0.046285714285714284, "grad_norm": 1.316493034362793, "kl": 0.011627197265625, "learning_rate": 9.534685936037593e-07, "loss": -1.9331, "reward": 0.2916666716337204, "reward_std": 0.26603008806705475, "rewards/accuracy_reward": 0.2916666716337204, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 2899.75, "epoch": 0.046857142857142854, "grad_norm": 0.8076637387275696, "kl": 0.01312255859375, "learning_rate": 9.514610652087475e-07, "loss": -1.6822, "reward": 0.2083333395421505, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.2083333395421505, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 869.2917175292969, "epoch": 0.04742857142857143, "grad_norm": 1.114304542541504, "kl": 0.0088043212890625, "learning_rate": 9.494136050520494e-07, "loss": -0.7427, "reward": 0.4583333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.4583333432674408, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 1685.3334350585938, "epoch": 0.048, "grad_norm": 1.8916584253311157, "kl": 0.0077972412109375, "learning_rate": 9.473264167865171e-07, "loss": -2.8129, "reward": 0.2500000074505806, "reward_std": 0.3872983306646347, "rewards/accuracy_reward": 0.2500000074505806, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 1882.3750610351562, "epoch": 0.04857142857142857, "grad_norm": 0.999433696269989, "kl": 0.023345947265625, "learning_rate": 9.451997080166028e-07, "loss": -0.7365, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 2046.541748046875, "epoch": 0.04914285714285714, "grad_norm": 0.9735854268074036, "kl": 0.0104827880859375, "learning_rate": 9.430336902777083e-07, "loss": -2.4278, "reward": 0.5000000223517418, "reward_std": 0.3332235813140869, "rewards/accuracy_reward": 0.5000000223517418, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 2605.2501220703125, "epoch": 0.04971428571428571, "grad_norm": 1.1373125314712524, "kl": 0.019866943359375, "learning_rate": 9.40828579015145e-07, "loss": -1.481, "reward": 0.2500000111758709, "reward_std": 0.20412413775920868, "rewards/accuracy_reward": 0.2500000111758709, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 2085.6251220703125, "epoch": 0.05028571428571429, "grad_norm": 1.1215746402740479, "kl": 0.01055908203125, "learning_rate": 9.385845935627039e-07, "loss": -1.6782, "reward": 0.1250000037252903, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.1250000037252903, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 1928.4166870117188, "epoch": 0.05085714285714286, "grad_norm": 1.2779592275619507, "kl": 0.022705078125, "learning_rate": 9.363019571208397e-07, "loss": -1.722, "reward": 0.1666666679084301, "reward_std": 0.23899271339178085, "rewards/accuracy_reward": 0.1666666679084301, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 1391.0833740234375, "epoch": 0.05142857142857143, "grad_norm": 1.28791081905365, "kl": 0.011474609375, "learning_rate": 9.3398089673447e-07, "loss": -2.3681, "reward": 0.4166666865348816, "reward_std": 0.3332235962152481, "rewards/accuracy_reward": 0.4166666865348816, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 2919.33349609375, "epoch": 0.052, "grad_norm": 0.5967366695404053, "kl": 0.0152587890625, "learning_rate": 9.316216432703916e-07, "loss": 0.0027, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 2806.8333740234375, "epoch": 0.052571428571428575, "grad_norm": 0.5494978427886963, "kl": 0.01953125, "learning_rate": 9.292244313943176e-07, "loss": 0.0038, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 1886.0833435058594, "epoch": 0.053142857142857144, "grad_norm": 0.9120233654975891, "kl": 0.014862060546875, "learning_rate": 9.267894995475355e-07, "loss": -0.7398, "reward": 0.2916666865348816, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.2916666865348816, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 2132.8333740234375, "epoch": 0.053714285714285714, "grad_norm": 0.9982142448425293, "kl": 0.01995849609375, "learning_rate": 9.24317089923191e-07, "loss": -1.682, "reward": 0.291666679084301, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.291666679084301, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 1412.0000305175781, "epoch": 0.054285714285714284, "grad_norm": 1.1398898363113403, "kl": 0.009185791015625, "learning_rate": 9.218074484421977e-07, "loss": -2.6066, "reward": 0.2083333358168602, "reward_std": 0.3602609485387802, "rewards/accuracy_reward": 0.2083333358168602, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 3135.33349609375, "epoch": 0.054857142857142854, "grad_norm": 0.46777933835983276, "kl": 0.013153076171875, "learning_rate": 9.192608247287761e-07, "loss": -0.7412, "reward": 0.0416666679084301, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.0416666679084301, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 2038.7083435058594, "epoch": 0.05542857142857143, "grad_norm": 1.239867925643921, "kl": 0.014312744140625, "learning_rate": 9.166774720856253e-07, "loss": -0.938, "reward": 0.0833333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.0833333358168602, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 1562.3333740234375, "epoch": 0.056, "grad_norm": 1.027034044265747, "kl": 0.0085601806640625, "learning_rate": 9.140576474687263e-07, "loss": -1.9377, "reward": 0.5416666716337204, "reward_std": 0.26603008806705475, "rewards/accuracy_reward": 0.5416666716337204, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 2020.0001220703125, "epoch": 0.05657142857142857, "grad_norm": 1.366735577583313, "kl": 0.0066680908203125, "learning_rate": 9.114016114617857e-07, "loss": -1.8751, "reward": 0.5000000149011612, "reward_std": 0.25819888710975647, "rewards/accuracy_reward": 0.5000000149011612, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 1672.7500610351562, "epoch": 0.05714285714285714, "grad_norm": 1.17022705078125, "kl": 0.00823974609375, "learning_rate": 9.08709628250315e-07, "loss": -2.4106, "reward": 0.4166666865348816, "reward_std": 0.3332235738635063, "rewards/accuracy_reward": 0.4166666865348816, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 2906.58349609375, "epoch": 0.05771428571428571, "grad_norm": 0.629345715045929, "kl": 0.009490966796875, "learning_rate": 9.059819655953535e-07, "loss": -1.6843, "reward": 0.291666679084301, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.291666679084301, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 1907.1666870117188, "epoch": 0.05828571428571429, "grad_norm": 0.8361666202545166, "kl": 0.01873779296875, "learning_rate": 9.03218894806835e-07, "loss": -1.4741, "reward": 0.2500000111758709, "reward_std": 0.20412414520978928, "rewards/accuracy_reward": 0.2500000111758709, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 2738.041748046875, "epoch": 0.05885714285714286, "grad_norm": 0.5930432081222534, "kl": 0.015869140625, "learning_rate": 9.004206907166023e-07, "loss": -0.9389, "reward": 0.0833333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.0833333358168602, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 3003.791748046875, "epoch": 0.05942857142857143, "grad_norm": 0.7205336093902588, "kl": 0.01336669921875, "learning_rate": 8.975876316510698e-07, "loss": -1.8796, "reward": 0.2500000074505806, "reward_std": 0.25819888710975647, "rewards/accuracy_reward": 0.2500000074505806, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 2454.041748046875, "epoch": 0.06, "grad_norm": 0.5209367275238037, "kl": 0.01300048828125, "learning_rate": 8.9471999940354e-07, "loss": -0.9357, "reward": 0.0833333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.0833333358168602, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 2740.2500610351562, "epoch": 0.060571428571428575, "grad_norm": 1.4679958820343018, "kl": 0.011962890625, "learning_rate": 8.918180792061751e-07, "loss": -1.6703, "reward": 0.2083333432674408, "reward_std": 0.23116151988506317, "rewards/accuracy_reward": 0.2083333432674408, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 2093.916748046875, "epoch": 0.061142857142857145, "grad_norm": 0.57196444272995, "kl": 0.0166015625, "learning_rate": 8.88882159701625e-07, "loss": -0.9368, "reward": 0.0833333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.0833333358168602, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 2134.0833740234375, "epoch": 0.061714285714285715, "grad_norm": 0.393916517496109, "kl": 0.010894775390625, "learning_rate": 8.859125329143175e-07, "loss": 0.0017, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 2911.541748046875, "epoch": 0.062285714285714285, "grad_norm": 0.3331953287124634, "kl": 0.013275146484375, "learning_rate": 8.829094942214127e-07, "loss": -0.738, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 2564.5000610351562, "epoch": 0.06285714285714286, "grad_norm": 0.29394540190696716, "kl": 0.014984130859375, "learning_rate": 8.798733423234219e-07, "loss": 0.0024, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 2804.0, "epoch": 0.06342857142857143, "grad_norm": 0.7025728225708008, "kl": 0.017059326171875, "learning_rate": 8.768043792144968e-07, "loss": 0.0027, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 2215.0416870117188, "epoch": 0.064, "grad_norm": 0.9984528422355652, "kl": 0.01275634765625, "learning_rate": 8.737029101523929e-07, "loss": -1.7326, "reward": 0.1666666716337204, "reward_std": 0.23899272084236145, "rewards/accuracy_reward": 0.1666666716337204, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 2483.2083740234375, "epoch": 0.06457142857142857, "grad_norm": 0.7270219326019287, "kl": 0.010986328125, "learning_rate": 8.705692436281051e-07, "loss": -1.8786, "reward": 0.1666666716337204, "reward_std": 0.25819888710975647, "rewards/accuracy_reward": 0.1666666716337204, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 3233.20849609375, "epoch": 0.06514285714285714, "grad_norm": 0.5366232395172119, "kl": 0.016143798828125, "learning_rate": 8.674036913351838e-07, "loss": -0.9371, "reward": 0.0833333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.0833333358168602, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 2328.7501220703125, "epoch": 0.06571428571428571, "grad_norm": 0.6814904808998108, "kl": 0.012298583984375, "learning_rate": 8.642065681387327e-07, "loss": -0.9397, "reward": 0.1666666716337204, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.1666666716337204, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 2270.916748046875, "epoch": 0.06628571428571428, "grad_norm": 0.7077198624610901, "kl": 0.0084075927734375, "learning_rate": 8.609781920440891e-07, "loss": -1.4859, "reward": 0.0833333358168602, "reward_std": 0.20412413775920868, "rewards/accuracy_reward": 0.0833333358168602, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 2786.1251220703125, "epoch": 0.06685714285714285, "grad_norm": 0.7300746440887451, "kl": 0.0172119140625, "learning_rate": 8.57718884165194e-07, "loss": -1.6803, "reward": 0.375, "reward_std": 0.23116150498390198, "rewards/accuracy_reward": 0.375, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 1789.4583740234375, "epoch": 0.06742857142857143, "grad_norm": 0.5591477751731873, "kl": 0.01336669921875, "learning_rate": 8.544289686926524e-07, "loss": -0.9944, "reward": 0.375, "reward_std": 0.1369306445121765, "rewards/accuracy_reward": 0.375, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 2085.916748046875, "epoch": 0.068, "grad_norm": 0.9207446575164795, "kl": 0.01312255859375, "learning_rate": 8.511087728614862e-07, "loss": -0.7421, "reward": 0.4583333432674408, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.4583333432674408, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 2560.9583740234375, "epoch": 0.06857142857142857, "grad_norm": 0.5242665410041809, "kl": 0.012298583984375, "learning_rate": 8.477586269185867e-07, "loss": 0.0019, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 2857.3333740234375, "epoch": 0.06914285714285714, "grad_norm": 0.8910159468650818, "kl": 0.015045166015625, "learning_rate": 8.443788640898654e-07, "loss": -0.9389, "reward": 0.1666666716337204, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.1666666716337204, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 1371.8750610351562, "epoch": 0.06971428571428571, "grad_norm": 1.075481653213501, "kl": 0.010223388671875, "learning_rate": 8.409698205471098e-07, "loss": -1.9938, "reward": 0.5, "reward_std": 0.273861289024353, "rewards/accuracy_reward": 0.5, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 2519.2500610351562, "epoch": 0.07028571428571428, "grad_norm": 1.0758017301559448, "kl": 0.01434326171875, "learning_rate": 8.37531835374545e-07, "loss": -2.2018, "reward": 0.291666679084301, "reward_std": 0.3061862215399742, "rewards/accuracy_reward": 0.291666679084301, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 2478.70849609375, "epoch": 0.07085714285714285, "grad_norm": 0.8420854210853577, "kl": 0.017333984375, "learning_rate": 8.340652505351075e-07, "loss": -1.6825, "reward": 0.3750000149011612, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.3750000149011612, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 1974.3333740234375, "epoch": 0.07142857142857142, "grad_norm": 0.8553494811058044, "kl": 0.016143798828125, "learning_rate": 8.305704108364301e-07, "loss": -1.9371, "reward": 0.2083333358168602, "reward_std": 0.26603008806705475, "rewards/accuracy_reward": 0.2083333358168602, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 1352.0000305175781, "epoch": 0.072, "grad_norm": 0.7503184080123901, "kl": 0.0130615234375, "learning_rate": 8.270476638965461e-07, "loss": -0.7278, "reward": 0.2916666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.2916666679084301, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 2711.20849609375, "epoch": 0.07257142857142856, "grad_norm": 0.6893806457519531, "kl": 0.0093841552734375, "learning_rate": 8.234973601093135e-07, "loss": -1.7343, "reward": 0.4166666716337204, "reward_std": 0.23899272084236145, "rewards/accuracy_reward": 0.4166666716337204, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 2195.4583740234375, "epoch": 0.07314285714285715, "grad_norm": 0.5396758317947388, "kl": 0.013031005859375, "learning_rate": 8.199198526095611e-07, "loss": -0.9382, "reward": 0.3333333432674408, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.3333333432674408, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 1736.2916870117188, "epoch": 0.07371428571428572, "grad_norm": 1.1960101127624512, "kl": 0.026214599609375, "learning_rate": 8.163154972379655e-07, "loss": -1.7324, "reward": 0.1666666716337204, "reward_std": 0.23899272084236145, "rewards/accuracy_reward": 0.1666666716337204, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 1648.625, "epoch": 0.07428571428571429, "grad_norm": 0.7403463125228882, "kl": 0.009918212890625, "learning_rate": 8.126846525056555e-07, "loss": -1.4736, "reward": 0.3333333544433117, "reward_std": 0.20412413775920868, "rewards/accuracy_reward": 0.3333333544433117, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 2801.8333740234375, "epoch": 0.07485714285714286, "grad_norm": 0.7958399057388306, "kl": 0.0125732421875, "learning_rate": 8.090276795585531e-07, "loss": -0.7389, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 2148.2084350585938, "epoch": 0.07542857142857143, "grad_norm": 0.6350630521774292, "kl": 0.0111846923828125, "learning_rate": 8.053449421414518e-07, "loss": -1.6738, "reward": 0.1250000037252903, "reward_std": 0.23116151988506317, "rewards/accuracy_reward": 0.1250000037252903, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 2469.25, "epoch": 0.076, "grad_norm": 0.8013907670974731, "kl": 0.014984130859375, "learning_rate": 8.01636806561836e-07, "loss": -1.4861, "reward": 0.4166666865348816, "reward_std": 0.20412415266036987, "rewards/accuracy_reward": 0.4166666865348816, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 3052.5, "epoch": 0.07657142857142857, "grad_norm": 0.9900679588317871, "kl": 0.020294189453125, "learning_rate": 7.979036416534461e-07, "loss": -1.8765, "reward": 0.2500000074505806, "reward_std": 0.25819888710975647, "rewards/accuracy_reward": 0.2500000074505806, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 2824.541748046875, "epoch": 0.07714285714285714, "grad_norm": 0.5742546319961548, "kl": 0.01898193359375, "learning_rate": 7.941458187395917e-07, "loss": 0.003, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 1178.8750305175781, "epoch": 0.07771428571428571, "grad_norm": 1.0653953552246094, "kl": 0.01934814453125, "learning_rate": 7.903637115962179e-07, "loss": -0.9936, "reward": 0.125, "reward_std": 0.1369306445121765, "rewards/accuracy_reward": 0.125, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 2287.8333740234375, "epoch": 0.07828571428571429, "grad_norm": 0.7256926894187927, "kl": 0.01116943359375, "learning_rate": 7.86557696414727e-07, "loss": -1.7341, "reward": 0.1666666716337204, "reward_std": 0.23899272084236145, "rewards/accuracy_reward": 0.1666666716337204, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 2149.666748046875, "epoch": 0.07885714285714286, "grad_norm": 0.48477721214294434, "kl": 0.0179443359375, "learning_rate": 7.827281517645606e-07, "loss": 0.003, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 2431.5833740234375, "epoch": 0.07942857142857143, "grad_norm": 0.7151182293891907, "kl": 0.01629638671875, "learning_rate": 7.788754585555441e-07, "loss": -0.7297, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 1989.9166870117188, "epoch": 0.08, "grad_norm": 0.5723420977592468, "kl": 0.0123291015625, "learning_rate": 7.75e-07, "loss": -0.9392, "reward": 0.4166666716337204, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.4166666716337204, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 1466.0834350585938, "epoch": 0.08057142857142857, "grad_norm": 0.8384633660316467, "kl": 0.012237548828125, "learning_rate": 7.7110216157463e-07, "loss": -0.9396, "reward": 0.1666666716337204, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.1666666716337204, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 1722.1251220703125, "epoch": 0.08114285714285714, "grad_norm": 1.3027905225753784, "kl": 0.0177001953125, "learning_rate": 7.671823309821749e-07, "loss": -0.9354, "reward": 0.3333333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.3333333358168602, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 2374.5834350585938, "epoch": 0.08171428571428571, "grad_norm": 0.8892642259597778, "kl": 0.02044677734375, "learning_rate": 7.632408981128493e-07, "loss": -1.924, "reward": 0.4583333432674408, "reward_std": 0.26603008806705475, "rewards/accuracy_reward": 0.4583333432674408, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 1527.0833740234375, "epoch": 0.08228571428571428, "grad_norm": 1.809125542640686, "kl": 0.0185546875, "learning_rate": 7.592782550055628e-07, "loss": -2.7263, "reward": 0.2916666716337204, "reward_std": 0.37592336535453796, "rewards/accuracy_reward": 0.2916666716337204, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 886.1666870117188, "epoch": 0.08285714285714285, "grad_norm": 1.7466280460357666, "kl": 0.01373291015625, "learning_rate": 7.552947958089233e-07, "loss": -2.7183, "reward": 0.2916666679084301, "reward_std": 0.37592335790395737, "rewards/accuracy_reward": 0.2916666679084301, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 1453.666748046875, "epoch": 0.08342857142857144, "grad_norm": 1.4468218088150024, "kl": 0.013519287109375, "learning_rate": 7.512909167420346e-07, "loss": -2.4829, "reward": 0.375, "reward_std": 0.3410547822713852, "rewards/accuracy_reward": 0.375, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 1603.791748046875, "epoch": 0.084, "grad_norm": 1.1846656799316406, "kl": 0.01263427734375, "learning_rate": 7.472670160550848e-07, "loss": -2.6207, "reward": 0.3750000149011612, "reward_std": 0.3602609485387802, "rewards/accuracy_reward": 0.3750000149011612, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 1206.4583740234375, "epoch": 0.08457142857142858, "grad_norm": 0.9129040837287903, "kl": 0.01019287109375, "learning_rate": 7.432234939897342e-07, "loss": -1.6655, "reward": 0.1250000037252903, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.1250000037252903, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 1288.4583740234375, "epoch": 0.08514285714285715, "grad_norm": 0.93757164478302, "kl": 0.01715087890625, "learning_rate": 7.391607527393044e-07, "loss": -0.9368, "reward": 0.0833333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.0833333358168602, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 1977.041748046875, "epoch": 0.08571428571428572, "grad_norm": 1.4448423385620117, "kl": 0.017486572265625, "learning_rate": 7.350791964087752e-07, "loss": -1.6729, "reward": 0.2083333395421505, "reward_std": 0.23116151988506317, "rewards/accuracy_reward": 0.2083333395421505, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 1958.7917175292969, "epoch": 0.08628571428571429, "grad_norm": 2.432656764984131, "kl": 0.032012939453125, "learning_rate": 7.309792309745878e-07, "loss": -0.7388, "reward": 0.4583333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.4583333432674408, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 2387.1251220703125, "epoch": 0.08685714285714285, "grad_norm": 0.5419829487800598, "kl": 0.016754150390625, "learning_rate": 7.268612642442656e-07, "loss": 0.0024, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 2126.3333740234375, "epoch": 0.08742857142857142, "grad_norm": 0.5498960018157959, "kl": 0.01190185546875, "learning_rate": 7.227257058158502e-07, "loss": -0.9965, "reward": 0.375, "reward_std": 0.1369306445121765, "rewards/accuracy_reward": 0.375, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 2022.7501220703125, "epoch": 0.088, "grad_norm": 0.857801616191864, "kl": 0.011138916015625, "learning_rate": 7.185729670371604e-07, "loss": -1.6669, "reward": 0.4583333432674408, "reward_std": 0.23116150498390198, "rewards/accuracy_reward": 0.4583333432674408, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 3400.95849609375, "epoch": 0.08857142857142856, "grad_norm": 0.3610871732234955, "kl": 0.0118408203125, "learning_rate": 7.144034609648778e-07, "loss": -0.7363, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 1649.166748046875, "epoch": 0.08914285714285715, "grad_norm": 1.4170763492584229, "kl": 0.012237548828125, "learning_rate": 7.102176023234605e-07, "loss": -2.4233, "reward": 0.4166666716337204, "reward_std": 0.3332235962152481, "rewards/accuracy_reward": 0.4166666716337204, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 2151.5001220703125, "epoch": 0.08971428571428572, "grad_norm": 0.8594135046005249, "kl": 0.01202392578125, "learning_rate": 7.060158074638932e-07, "loss": -0.7423, "reward": 0.2083333432674408, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.2083333432674408, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 1745.2083740234375, "epoch": 0.09028571428571429, "grad_norm": 0.9574673771858215, "kl": 0.01275634765625, "learning_rate": 7.017984943222735e-07, "loss": -2.6742, "reward": 0.25, "reward_std": 0.3680921494960785, "rewards/accuracy_reward": 0.25, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 3289.6251220703125, "epoch": 0.09085714285714286, "grad_norm": 0.5987799763679504, "kl": 0.013763427734375, "learning_rate": 6.97566082378242e-07, "loss": -1.4697, "reward": 0.0833333358168602, "reward_std": 0.20412414520978928, "rewards/accuracy_reward": 0.0833333358168602, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 3266.5833740234375, "epoch": 0.09142857142857143, "grad_norm": 0.5092036724090576, "kl": 0.012542724609375, "learning_rate": 6.93318992613258e-07, "loss": -0.9381, "reward": 0.1666666716337204, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.1666666716337204, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 1581.166748046875, "epoch": 0.092, "grad_norm": 0.6672082543373108, "kl": 0.01666259765625, "learning_rate": 6.890576474687263e-07, "loss": -0.7416, "reward": 0.2083333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.2083333432674408, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 2034.916748046875, "epoch": 0.09257142857142857, "grad_norm": 1.0506476163864136, "kl": 0.0162353515625, "learning_rate": 6.847824708039786e-07, "loss": -1.9362, "reward": 0.2916666716337204, "reward_std": 0.26603008806705475, "rewards/accuracy_reward": 0.2916666716337204, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 2280.166748046875, "epoch": 0.09314285714285714, "grad_norm": 0.7479243278503418, "kl": 0.019195556640625, "learning_rate": 6.804938878541138e-07, "loss": -0.9382, "reward": 0.1666666716337204, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.1666666716337204, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 2798.8751220703125, "epoch": 0.09371428571428571, "grad_norm": 0.7925019264221191, "kl": 0.016387939453125, "learning_rate": 6.761923251877012e-07, "loss": -1.9367, "reward": 0.2083333358168602, "reward_std": 0.26603008806705475, "rewards/accuracy_reward": 0.2083333358168602, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 3475.291748046875, "epoch": 0.09428571428571429, "grad_norm": 0.44603925943374634, "kl": 0.01324462890625, "learning_rate": 6.718782106643523e-07, "loss": -0.7406, "reward": 0.0416666679084301, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.0416666679084301, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.09485714285714286, "grad_norm": 0.43778079748153687, "kl": 0.014923095703125, "learning_rate": 6.675519733921623e-07, "loss": 0.0024, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 1407.8333740234375, "epoch": 0.09542857142857143, "grad_norm": 1.5409038066864014, "kl": 0.01873779296875, "learning_rate": 6.632140436850289e-07, "loss": -2.9278, "reward": 0.5833333432674408, "reward_std": 0.40296071767807007, "rewards/accuracy_reward": 0.5833333432674408, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 2116.8751220703125, "epoch": 0.096, "grad_norm": 1.3273327350616455, "kl": 0.0230712890625, "learning_rate": 6.588648530198504e-07, "loss": -2.4069, "reward": 0.2500000111758709, "reward_std": 0.3332235813140869, "rewards/accuracy_reward": 0.2500000111758709, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 1039.9166870117188, "epoch": 0.09657142857142857, "grad_norm": 1.1365933418273926, "kl": 0.015655517578125, "learning_rate": 6.545048339936091e-07, "loss": -2.4803, "reward": 0.375, "reward_std": 0.3410547971725464, "rewards/accuracy_reward": 0.375, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 2980.3333740234375, "epoch": 0.09714285714285714, "grad_norm": 0.33093100786209106, "kl": 0.01751708984375, "learning_rate": 6.501344202803414e-07, "loss": -0.7351, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 2490.08349609375, "epoch": 0.09771428571428571, "grad_norm": 0.4821685254573822, "kl": 0.011688232421875, "learning_rate": 6.45754046588003e-07, "loss": -0.9394, "reward": 0.0833333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.0833333358168602, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 2099.2083740234375, "epoch": 0.09828571428571428, "grad_norm": 0.5119174718856812, "kl": 0.01446533203125, "learning_rate": 6.413641486152292e-07, "loss": -0.7392, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 1332.7917175292969, "epoch": 0.09885714285714285, "grad_norm": 1.1657651662826538, "kl": 0.017425537109375, "learning_rate": 6.36965163007999e-07, "loss": -2.2233, "reward": 0.1250000037252903, "reward_std": 0.3061862215399742, "rewards/accuracy_reward": 0.1250000037252903, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 2518.7083740234375, "epoch": 0.09942857142857142, "grad_norm": 0.38400009274482727, "kl": 0.02191162109375, "learning_rate": 6.32557527316202e-07, "loss": 0.0035, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 1589.9166870117188, "epoch": 0.1, "grad_norm": 1.2936434745788574, "kl": 0.027374267578125, "learning_rate": 6.281416799501187e-07, "loss": -1.7382, "reward": 0.5833333432674408, "reward_std": 0.23899272084236145, "rewards/accuracy_reward": 0.5833333432674408, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 1653.9167175292969, "epoch": 0.10057142857142858, "grad_norm": 1.3641895055770874, "kl": 0.02020263671875, "learning_rate": 6.23718060136812e-07, "loss": -2.6149, "reward": 0.291666679084301, "reward_std": 0.3602609634399414, "rewards/accuracy_reward": 0.291666679084301, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 2173.9583435058594, "epoch": 0.10114285714285715, "grad_norm": 0.7888267636299133, "kl": 0.016754150390625, "learning_rate": 6.1928710787644e-07, "loss": -0.9389, "reward": 0.4166666865348816, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.4166666865348816, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 3312.25, "epoch": 0.10171428571428572, "grad_norm": 0.9001780152320862, "kl": 0.01397705078125, "learning_rate": 6.14849263898491e-07, "loss": -1.876, "reward": 0.1666666716337204, "reward_std": 0.25819888710975647, "rewards/accuracy_reward": 0.1666666716337204, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 2765.416748046875, "epoch": 0.10228571428571429, "grad_norm": 0.5644757747650146, "kl": 0.015350341796875, "learning_rate": 6.10404969617945e-07, "loss": 0.0024, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 2159.08349609375, "epoch": 0.10285714285714286, "grad_norm": 1.2660763263702393, "kl": 0.0147705078125, "learning_rate": 6.059546670913684e-07, "loss": -2.7191, "reward": 0.2916666716337204, "reward_std": 0.37592336535453796, "rewards/accuracy_reward": 0.2916666716337204, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 2710.625, "epoch": 0.10342857142857143, "grad_norm": 0.7003759145736694, "kl": 0.02130126953125, "learning_rate": 6.014987989729444e-07, "loss": -1.8783, "reward": 0.2500000074505806, "reward_std": 0.25819888710975647, "rewards/accuracy_reward": 0.2500000074505806, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 2344.625, "epoch": 0.104, "grad_norm": 0.6448298096656799, "kl": 0.0184326171875, "learning_rate": 5.97037808470444e-07, "loss": -1.4745, "reward": 0.0833333358168602, "reward_std": 0.20412415266036987, "rewards/accuracy_reward": 0.0833333358168602, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 2700.1251220703125, "epoch": 0.10457142857142857, "grad_norm": 0.40715157985687256, "kl": 0.01531982421875, "learning_rate": 5.925721393011417e-07, "loss": -0.7211, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 1704.2501220703125, "epoch": 0.10514285714285715, "grad_norm": 1.0119861364364624, "kl": 0.02203369140625, "learning_rate": 5.881022356476804e-07, "loss": -0.7407, "reward": 0.4583333432674408, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.4583333432674408, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 1675.4166870117188, "epoch": 0.10571428571428572, "grad_norm": 2.0721378326416016, "kl": 0.025970458984375, "learning_rate": 5.836285421138909e-07, "loss": -1.4489, "reward": 0.0833333358168602, "reward_std": 0.20412414520978928, "rewards/accuracy_reward": 0.0833333358168602, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 2593.291748046875, "epoch": 0.10628571428571429, "grad_norm": 0.6480510234832764, "kl": 0.0172119140625, "learning_rate": 5.791515036805684e-07, "loss": 0.003, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 2512.4583740234375, "epoch": 0.10685714285714286, "grad_norm": 0.7385961413383484, "kl": 0.017333984375, "learning_rate": 5.74671565661212e-07, "loss": 0.0028, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 2527.8333740234375, "epoch": 0.10742857142857143, "grad_norm": 1.0105268955230713, "kl": 0.018035888671875, "learning_rate": 5.701891736577317e-07, "loss": -2.7295, "reward": 0.4583333432674408, "reward_std": 0.37592336535453796, "rewards/accuracy_reward": 0.4583333432674408, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 3513.25, "epoch": 0.108, "grad_norm": 0.6996456384658813, "kl": 0.01776123046875, "learning_rate": 5.657047735161255e-07, "loss": -1.6768, "reward": 0.2083333432674408, "reward_std": 0.23116150498390198, "rewards/accuracy_reward": 0.2083333432674408, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 2915.916748046875, "epoch": 0.10857142857142857, "grad_norm": 0.6052663922309875, "kl": 0.012054443359375, "learning_rate": 5.612188112821328e-07, "loss": -0.7361, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 2475.5001220703125, "epoch": 0.10914285714285714, "grad_norm": 0.3194851279258728, "kl": 0.0223388671875, "learning_rate": 5.567317331568686e-07, "loss": 0.0036, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 2002.1250610351562, "epoch": 0.10971428571428571, "grad_norm": 45.269439697265625, "kl": 0.251220703125, "learning_rate": 5.522439854524411e-07, "loss": -2.4264, "reward": 0.3750000149011612, "reward_std": 0.3410547897219658, "rewards/accuracy_reward": 0.3750000149011612, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 2904.2083740234375, "epoch": 0.11028571428571429, "grad_norm": 0.3201312720775604, "kl": 0.01861572265625, "learning_rate": 5.477560145475589e-07, "loss": 0.003, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 1847.7500610351562, "epoch": 0.11085714285714286, "grad_norm": 0.9246636629104614, "kl": 0.0289306640625, "learning_rate": 5.432682668431314e-07, "loss": -0.917, "reward": 0.3333333432674408, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.3333333432674408, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 2270.6666870117188, "epoch": 0.11142857142857143, "grad_norm": 0.4476500153541565, "kl": 0.017730712890625, "learning_rate": 5.387811887178673e-07, "loss": 0.0029, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 2440.8333740234375, "epoch": 0.112, "grad_norm": 0.5211483836174011, "kl": 0.011688232421875, "learning_rate": 5.342952264838747e-07, "loss": 0.0022, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 2394.9583740234375, "epoch": 0.11257142857142857, "grad_norm": 0.533424973487854, "kl": 0.02685546875, "learning_rate": 5.298108263422685e-07, "loss": 0.0048, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 2006.25, "epoch": 0.11314285714285714, "grad_norm": 1.4151604175567627, "kl": 0.019134521484375, "learning_rate": 5.25328434338788e-07, "loss": -2.6747, "reward": 0.3333333432674408, "reward_std": 0.3680921494960785, "rewards/accuracy_reward": 0.3333333432674408, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 2482.041748046875, "epoch": 0.11371428571428571, "grad_norm": 1.1144918203353882, "kl": 0.019287109375, "learning_rate": 5.208484963194316e-07, "loss": -1.4696, "reward": 0.2500000111758709, "reward_std": 0.20412414520978928, "rewards/accuracy_reward": 0.2500000111758709, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 2661.916748046875, "epoch": 0.11428571428571428, "grad_norm": 0.9477536082267761, "kl": 0.023193359375, "learning_rate": 5.163714578861091e-07, "loss": -0.9854, "reward": 0.125, "reward_std": 0.1369306445121765, "rewards/accuracy_reward": 0.125, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 2519.8333740234375, "epoch": 0.11485714285714285, "grad_norm": 1.073439359664917, "kl": 0.016082763671875, "learning_rate": 5.118977643523196e-07, "loss": -0.7384, "reward": 0.0416666679084301, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.0416666679084301, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 2459.916748046875, "epoch": 0.11542857142857142, "grad_norm": 1.0262060165405273, "kl": 0.014678955078125, "learning_rate": 5.074278606988584e-07, "loss": -1.6796, "reward": 0.2083333395421505, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.2083333395421505, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 3012.3751220703125, "epoch": 0.116, "grad_norm": 0.8095118403434753, "kl": 0.01739501953125, "learning_rate": 5.02962191529556e-07, "loss": -2.6648, "reward": 0.25, "reward_std": 0.3680921643972397, "rewards/accuracy_reward": 0.25, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 2506.3751220703125, "epoch": 0.11657142857142858, "grad_norm": 0.5008082985877991, "kl": 0.013458251953125, "learning_rate": 4.985012010270557e-07, "loss": 0.0021, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 2347.7916870117188, "epoch": 0.11714285714285715, "grad_norm": 0.6901289820671082, "kl": 0.01611328125, "learning_rate": 4.940453329086318e-07, "loss": -1.4828, "reward": 0.0833333358168602, "reward_std": 0.20412415266036987, "rewards/accuracy_reward": 0.0833333358168602, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 2616.666748046875, "epoch": 0.11771428571428572, "grad_norm": 0.631709098815918, "kl": 0.02020263671875, "learning_rate": 4.895950303820552e-07, "loss": -0.9945, "reward": 0.125, "reward_std": 0.1369306445121765, "rewards/accuracy_reward": 0.125, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 2875.8333740234375, "epoch": 0.11828571428571429, "grad_norm": 0.7697901129722595, "kl": 0.014129638671875, "learning_rate": 4.85150736101509e-07, "loss": -0.9966, "reward": 0.125, "reward_std": 0.1369306445121765, "rewards/accuracy_reward": 0.125, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 2224.3751220703125, "epoch": 0.11885714285714286, "grad_norm": 0.7268201112747192, "kl": 0.011077880859375, "learning_rate": 4.807128921235598e-07, "loss": -1.4794, "reward": 0.5, "reward_std": 0.20412413775920868, "rewards/accuracy_reward": 0.5, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 2420.7083740234375, "epoch": 0.11942857142857143, "grad_norm": 0.5704336166381836, "kl": 0.009765625, "learning_rate": 4.76281939863188e-07, "loss": 0.0016, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 1993.3333740234375, "epoch": 0.12, "grad_norm": 0.507115364074707, "kl": 0.017822265625, "learning_rate": 4.7185832004988133e-07, "loss": 0.0029, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 1624.125, "epoch": 0.12057142857142857, "grad_norm": 1.2795109748840332, "kl": 0.009735107421875, "learning_rate": 4.67442472683798e-07, "loss": -2.6766, "reward": 0.3333333432674408, "reward_std": 0.3680921494960785, "rewards/accuracy_reward": 0.3333333432674408, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 3554.3333740234375, "epoch": 0.12114285714285715, "grad_norm": 0.6074011325836182, "kl": 0.021728515625, "learning_rate": 4.6303483699200105e-07, "loss": -0.9938, "reward": 0.125, "reward_std": 0.1369306445121765, "rewards/accuracy_reward": 0.125, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 1680.666748046875, "epoch": 0.12171428571428572, "grad_norm": 1.1823277473449707, "kl": 0.014251708984375, "learning_rate": 4.5863585138477077e-07, "loss": -1.7337, "reward": 0.1666666716337204, "reward_std": 0.23899272084236145, "rewards/accuracy_reward": 0.1666666716337204, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 2227.375, "epoch": 0.12228571428571429, "grad_norm": 0.7161904573440552, "kl": 0.01544189453125, "learning_rate": 4.542459534119971e-07, "loss": -0.7411, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 2544.7501220703125, "epoch": 0.12285714285714286, "grad_norm": 0.9380499720573425, "kl": 0.01983642578125, "learning_rate": 4.4986557971965856e-07, "loss": -0.9942, "reward": 0.125, "reward_std": 0.1369306445121765, "rewards/accuracy_reward": 0.125, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 1188.8333740234375, "epoch": 0.12342857142857143, "grad_norm": 1.2638156414031982, "kl": 0.015777587890625, "learning_rate": 4.454951660063909e-07, "loss": -1.6826, "reward": 0.7916666865348816, "reward_std": 0.23116150498390198, "rewards/accuracy_reward": 0.7916666865348816, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 3525.25, "epoch": 0.124, "grad_norm": 0.28554829955101013, "kl": 0.01068115234375, "learning_rate": 4.4113514698014953e-07, "loss": 0.0017, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 2003.75, "epoch": 0.12457142857142857, "grad_norm": 0.7368922233581543, "kl": 0.02069091796875, "learning_rate": 4.367859563149712e-07, "loss": -0.741, "reward": 0.2083333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.2083333432674408, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 884.8333435058594, "epoch": 0.12514285714285714, "grad_norm": 0.8822182416915894, "kl": 0.01611328125, "learning_rate": 4.3244802660783775e-07, "loss": -0.9384, "reward": 0.4166666716337204, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.4166666716337204, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 2650.75, "epoch": 0.12571428571428572, "grad_norm": 0.6846665143966675, "kl": 0.014801025390625, "learning_rate": 4.281217893356478e-07, "loss": -1.6798, "reward": 0.3750000111758709, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.3750000111758709, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 1293.7916870117188, "epoch": 0.12628571428571428, "grad_norm": 0.869556188583374, "kl": 0.016845703125, "learning_rate": 4.2380767481229884e-07, "loss": -0.7405, "reward": 0.2916666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.2916666679084301, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 1300.9583740234375, "epoch": 0.12685714285714286, "grad_norm": 0.937000572681427, "kl": 0.01593017578125, "learning_rate": 4.195061121458862e-07, "loss": -0.939, "reward": 0.0833333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.0833333358168602, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 2665.791748046875, "epoch": 0.12742857142857142, "grad_norm": 0.794379472732544, "kl": 0.01953125, "learning_rate": 4.152175291960214e-07, "loss": -0.9379, "reward": 0.0833333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.0833333358168602, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 2896.08349609375, "epoch": 0.128, "grad_norm": 0.8766622543334961, "kl": 0.0145263671875, "learning_rate": 4.1094235253127374e-07, "loss": -2.6164, "reward": 0.2916666716337204, "reward_std": 0.3602609485387802, "rewards/accuracy_reward": 0.2916666716337204, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 2229.8750610351562, "epoch": 0.12857142857142856, "grad_norm": 1.1731126308441162, "kl": 0.016143798828125, "learning_rate": 4.0668100738674205e-07, "loss": -2.4688, "reward": 0.2083333395421505, "reward_std": 0.3410547971725464, "rewards/accuracy_reward": 0.2083333395421505, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 1741.8750915527344, "epoch": 0.12914285714285714, "grad_norm": 0.7152168154716492, "kl": 0.01202392578125, "learning_rate": 4.0243391762175803e-07, "loss": -0.9392, "reward": 0.4166666865348816, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.4166666865348816, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 2109.291748046875, "epoch": 0.12971428571428573, "grad_norm": 0.4053463339805603, "kl": 0.0142822265625, "learning_rate": 3.982015056777265e-07, "loss": -0.7372, "reward": 0.2916666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.2916666679084301, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 981.5833740234375, "epoch": 0.13028571428571428, "grad_norm": 1.3437594175338745, "kl": 0.017852783203125, "learning_rate": 3.939841925361067e-07, "loss": -1.7393, "reward": 0.4166666716337204, "reward_std": 0.23899272084236145, "rewards/accuracy_reward": 0.4166666716337204, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 2041.2916870117188, "epoch": 0.13085714285714287, "grad_norm": 0.44564294815063477, "kl": 0.01416015625, "learning_rate": 3.897823976765394e-07, "loss": -0.7404, "reward": 0.2916666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.2916666679084301, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 2914.416748046875, "epoch": 0.13142857142857142, "grad_norm": 5.425497531890869, "kl": 0.02838134765625, "learning_rate": 3.855965390351222e-07, "loss": -1.8737, "reward": 0.1666666716337204, "reward_std": 0.25819888710975647, "rewards/accuracy_reward": 0.1666666716337204, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 2574.375, "epoch": 0.132, "grad_norm": 1.1059547662734985, "kl": 0.02020263671875, "learning_rate": 3.8142703296283953e-07, "loss": -1.6797, "reward": 0.2083333432674408, "reward_std": 0.23116150498390198, "rewards/accuracy_reward": 0.2083333432674408, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 2597.041748046875, "epoch": 0.13257142857142856, "grad_norm": 0.80033940076828, "kl": 0.01983642578125, "learning_rate": 3.772742941841499e-07, "loss": -1.4814, "reward": 0.2500000111758709, "reward_std": 0.20412413775920868, "rewards/accuracy_reward": 0.2500000111758709, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 3533.5, "epoch": 0.13314285714285715, "grad_norm": 0.4203540086746216, "kl": 0.015869140625, "learning_rate": 3.731387357557344e-07, "loss": 0.0025, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 3225.6251220703125, "epoch": 0.1337142857142857, "grad_norm": 0.7065865397453308, "kl": 0.017547607421875, "learning_rate": 3.6902076902541214e-07, "loss": -1.6826, "reward": 0.2083333432674408, "reward_std": 0.23116150498390198, "rewards/accuracy_reward": 0.2083333432674408, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 1722.6250610351562, "epoch": 0.13428571428571429, "grad_norm": 0.5265027284622192, "kl": 0.013519287109375, "learning_rate": 3.649208035912249e-07, "loss": -0.7374, "reward": 0.2916666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.2916666679084301, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 1470.5000610351562, "epoch": 0.13485714285714287, "grad_norm": 0.5969668030738831, "kl": 0.01806640625, "learning_rate": 3.608392472606956e-07, "loss": 0.0026, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 1409.5000610351562, "epoch": 0.13542857142857143, "grad_norm": 1.4041366577148438, "kl": 0.0216064453125, "learning_rate": 3.5677650601026585e-07, "loss": -1.4828, "reward": 0.0833333358168602, "reward_std": 0.20412415266036987, "rewards/accuracy_reward": 0.0833333358168602, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 2312.5001220703125, "epoch": 0.136, "grad_norm": 1.0515042543411255, "kl": 0.0201416015625, "learning_rate": 3.5273298394491515e-07, "loss": -2.6223, "reward": 0.291666679084301, "reward_std": 0.3602609485387802, "rewards/accuracy_reward": 0.291666679084301, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 2486.9166870117188, "epoch": 0.13657142857142857, "grad_norm": 0.8044987916946411, "kl": 0.02252197265625, "learning_rate": 3.4870908325796527e-07, "loss": -2.2171, "reward": 0.3750000223517418, "reward_std": 0.306186206638813, "rewards/accuracy_reward": 0.3750000223517418, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 1590.8751220703125, "epoch": 0.13714285714285715, "grad_norm": 0.6538607478141785, "kl": 0.015533447265625, "learning_rate": 3.4470520419107664e-07, "loss": -0.9312, "reward": 0.0833333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.0833333358168602, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 2145.8750610351562, "epoch": 0.1377142857142857, "grad_norm": 0.8519302010536194, "kl": 0.02294921875, "learning_rate": 3.407217449944373e-07, "loss": -0.9377, "reward": 0.4166666865348816, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.4166666865348816, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 3469.291748046875, "epoch": 0.1382857142857143, "grad_norm": 0.5576246380805969, "kl": 0.016998291015625, "learning_rate": 3.367591018871506e-07, "loss": -0.9348, "reward": 0.0833333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.0833333358168602, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 2839.541748046875, "epoch": 0.13885714285714285, "grad_norm": 0.41596850752830505, "kl": 0.01690673828125, "learning_rate": 3.3281766901782517e-07, "loss": 0.0027, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 2468.5416870117188, "epoch": 0.13942857142857143, "grad_norm": 0.8380830883979797, "kl": 0.01788330078125, "learning_rate": 3.2889783842536987e-07, "loss": -0.9381, "reward": 0.1666666716337204, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.1666666716337204, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 2339.041748046875, "epoch": 0.14, "grad_norm": 1.0777580738067627, "kl": 0.01507568359375, "learning_rate": 3.250000000000001e-07, "loss": -2.5829, "reward": 0.2083333358168602, "reward_std": 0.3602609634399414, "rewards/accuracy_reward": 0.2083333358168602, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 2758.166748046875, "epoch": 0.14057142857142857, "grad_norm": 0.8583000302314758, "kl": 0.015411376953125, "learning_rate": 3.211245414444559e-07, "loss": -1.9945, "reward": 0.25, "reward_std": 0.273861289024353, "rewards/accuracy_reward": 0.25, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 2797.291748046875, "epoch": 0.14114285714285715, "grad_norm": 1.0498836040496826, "kl": 0.027252197265625, "learning_rate": 3.172718482354393e-07, "loss": -2.6177, "reward": 0.458333358168602, "reward_std": 0.3602609485387802, "rewards/accuracy_reward": 0.458333358168602, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 1560.8750610351562, "epoch": 0.1417142857142857, "grad_norm": 0.2750161290168762, "kl": 0.01739501953125, "learning_rate": 3.1344230358527284e-07, "loss": 0.0025, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 2254.375, "epoch": 0.1422857142857143, "grad_norm": 0.7589177489280701, "kl": 0.02679443359375, "learning_rate": 3.096362884037821e-07, "loss": -0.7333, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 2698.6251220703125, "epoch": 0.14285714285714285, "grad_norm": 0.5587661266326904, "kl": 0.015472412109375, "learning_rate": 3.058541812604083e-07, "loss": -0.7418, "reward": 0.2083333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.2083333432674408, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 1584.3750610351562, "epoch": 0.14342857142857143, "grad_norm": 0.8878784775733948, "kl": 0.0150146484375, "learning_rate": 3.020963583465539e-07, "loss": -1.6822, "reward": 0.3750000149011612, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.3750000149011612, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 1354.0833435058594, "epoch": 0.144, "grad_norm": 1.3565187454223633, "kl": 0.0235595703125, "learning_rate": 2.9836319343816397e-07, "loss": -1.6732, "reward": 0.125, "reward_std": 0.23116151988506317, "rewards/accuracy_reward": 0.125, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 2829.625, "epoch": 0.14457142857142857, "grad_norm": 0.8856632113456726, "kl": 0.015411376953125, "learning_rate": 2.946550578585483e-07, "loss": -0.7394, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 2557.0, "epoch": 0.14514285714285713, "grad_norm": 0.5032978653907776, "kl": 0.01812744140625, "learning_rate": 2.9097232044144696e-07, "loss": -1.724, "reward": 0.1666666679084301, "reward_std": 0.23899271339178085, "rewards/accuracy_reward": 0.1666666679084301, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 2025.5001220703125, "epoch": 0.1457142857142857, "grad_norm": 0.6513389945030212, "kl": 0.014007568359375, "learning_rate": 2.8731534749434464e-07, "loss": -0.9391, "reward": 0.4166666716337204, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.4166666716337204, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 3475.541748046875, "epoch": 0.1462857142857143, "grad_norm": 0.5068712830543518, "kl": 0.01947021484375, "learning_rate": 2.836845027620346e-07, "loss": 0.0031, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 1864.4584350585938, "epoch": 0.14685714285714285, "grad_norm": 11.749297142028809, "kl": 0.2183837890625, "learning_rate": 2.8008014739043884e-07, "loss": -1.8304, "reward": 0.1666666716337204, "reward_std": 0.25819888710975647, "rewards/accuracy_reward": 0.1666666716337204, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 1771.0833740234375, "epoch": 0.14742857142857144, "grad_norm": 0.6075973510742188, "kl": 0.009613037109375, "learning_rate": 2.765026398906865e-07, "loss": -1.6771, "reward": 0.2083333395421505, "reward_std": 0.23116151988506317, "rewards/accuracy_reward": 0.2083333395421505, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 2659.1666870117188, "epoch": 0.148, "grad_norm": 0.9356407523155212, "kl": 0.014862060546875, "learning_rate": 2.729523361034538e-07, "loss": -1.6647, "reward": 0.125, "reward_std": 0.23116150498390198, "rewards/accuracy_reward": 0.125, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 2985.7083740234375, "epoch": 0.14857142857142858, "grad_norm": 0.3902454674243927, "kl": 0.015960693359375, "learning_rate": 2.6942958916356994e-07, "loss": -0.9388, "reward": 0.1666666716337204, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.1666666716337204, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 3379.5833740234375, "epoch": 0.14914285714285713, "grad_norm": 0.5188402533531189, "kl": 0.02239990234375, "learning_rate": 2.659347494648925e-07, "loss": -0.7375, "reward": 0.0416666679084301, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.0416666679084301, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 2401.70849609375, "epoch": 0.14971428571428572, "grad_norm": 0.5534473657608032, "kl": 0.014617919921875, "learning_rate": 2.6246816462545496e-07, "loss": -0.9393, "reward": 0.1666666716337204, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.1666666716337204, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 2452.0833740234375, "epoch": 0.15028571428571427, "grad_norm": 0.7840031981468201, "kl": 0.01806640625, "learning_rate": 2.5903017945289017e-07, "loss": -0.9387, "reward": 0.0833333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.0833333358168602, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 3485.625, "epoch": 0.15085714285714286, "grad_norm": 0.45268696546554565, "kl": 0.013824462890625, "learning_rate": 2.5562113591013457e-07, "loss": 0.0022, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 3424.5001220703125, "epoch": 0.15142857142857144, "grad_norm": 0.4016251564025879, "kl": 0.01983642578125, "learning_rate": 2.5224137308141336e-07, "loss": -0.7367, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 2272.375, "epoch": 0.152, "grad_norm": 0.6088007092475891, "kl": 0.0252685546875, "learning_rate": 2.488912271385139e-07, "loss": 0.004, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 1409.5833740234375, "epoch": 0.15257142857142858, "grad_norm": 0.706413209438324, "kl": 0.016357421875, "learning_rate": 2.4557103130734763e-07, "loss": -0.7323, "reward": 0.0416666679084301, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.0416666679084301, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 2088.666748046875, "epoch": 0.15314285714285714, "grad_norm": 2.1446173191070557, "kl": 0.019317626953125, "learning_rate": 2.4228111583480596e-07, "loss": -1.7378, "reward": 0.1666666716337204, "reward_std": 0.23899272084236145, "rewards/accuracy_reward": 0.1666666716337204, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 2598.20849609375, "epoch": 0.15371428571428572, "grad_norm": 0.18762169778347015, "kl": 0.012420654296875, "learning_rate": 2.390218079559109e-07, "loss": 0.002, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 1651.0833740234375, "epoch": 0.15428571428571428, "grad_norm": 0.6639404892921448, "kl": 0.01995849609375, "learning_rate": 2.3579343186126726e-07, "loss": -0.7353, "reward": 0.2916666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.2916666679084301, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 991.2500610351562, "epoch": 0.15485714285714286, "grad_norm": 0.9474217891693115, "kl": 0.0179443359375, "learning_rate": 2.3259630866481605e-07, "loss": -1.8744, "reward": 0.1666666716337204, "reward_std": 0.25819888710975647, "rewards/accuracy_reward": 0.1666666716337204, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 2655.5834350585938, "epoch": 0.15542857142857142, "grad_norm": 0.628149688243866, "kl": 0.017578125, "learning_rate": 2.294307563718949e-07, "loss": -0.7414, "reward": 0.2083333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.2083333432674408, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 2282.3333740234375, "epoch": 0.156, "grad_norm": 1.2667988538742065, "kl": 0.0169677734375, "learning_rate": 2.2629708984760706e-07, "loss": -1.6717, "reward": 0.1250000037252903, "reward_std": 0.23116151988506317, "rewards/accuracy_reward": 0.1250000037252903, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 2919.916748046875, "epoch": 0.15657142857142858, "grad_norm": 0.7472139596939087, "kl": 0.01800537109375, "learning_rate": 2.2319562078550318e-07, "loss": -0.9381, "reward": 0.1666666716337204, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.1666666716337204, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 1767.5000610351562, "epoch": 0.15714285714285714, "grad_norm": 1.192237377166748, "kl": 0.01751708984375, "learning_rate": 2.2012665767657823e-07, "loss": -0.7415, "reward": 0.2083333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.2083333432674408, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 1048.2917175292969, "epoch": 0.15771428571428572, "grad_norm": 1.2694878578186035, "kl": 0.018829345703125, "learning_rate": 2.1709050577858728e-07, "loss": -1.8771, "reward": 0.583333358168602, "reward_std": 0.25819888710975647, "rewards/accuracy_reward": 0.583333358168602, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 2410.416748046875, "epoch": 0.15828571428571428, "grad_norm": 1.6138927936553955, "kl": 0.030670166015625, "learning_rate": 2.1408746708568242e-07, "loss": -1.93, "reward": 0.2083333432674408, "reward_std": 0.26603007316589355, "rewards/accuracy_reward": 0.2083333432674408, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 2286.125, "epoch": 0.15885714285714286, "grad_norm": 0.7141113877296448, "kl": 0.014129638671875, "learning_rate": 2.1111784029837509e-07, "loss": -0.742, "reward": 0.2083333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.2083333432674408, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 1842.8750610351562, "epoch": 0.15942857142857142, "grad_norm": 0.6313254833221436, "kl": 0.01605224609375, "learning_rate": 2.081819207938249e-07, "loss": -0.939, "reward": 0.1666666716337204, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.1666666716337204, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 2166.541748046875, "epoch": 0.16, "grad_norm": 0.8989630341529846, "kl": 0.015899658203125, "learning_rate": 2.0528000059645995e-07, "loss": -0.9391, "reward": 0.1666666716337204, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.1666666716337204, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 1435.3750915527344, "epoch": 0.16057142857142856, "grad_norm": 0.4465429186820984, "kl": 0.01531982421875, "learning_rate": 2.0241236834893028e-07, "loss": 0.0025, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 2514.5833740234375, "epoch": 0.16114285714285714, "grad_norm": 1.1687437295913696, "kl": 0.02166748046875, "learning_rate": 1.9957930928339772e-07, "loss": -1.8717, "reward": 0.1666666716337204, "reward_std": 0.25819888710975647, "rewards/accuracy_reward": 0.1666666716337204, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 2165.3333740234375, "epoch": 0.16171428571428573, "grad_norm": 0.9040661454200745, "kl": 0.016571044921875, "learning_rate": 1.96781105193165e-07, "loss": -0.9278, "reward": 0.0833333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.0833333358168602, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 2579.9583740234375, "epoch": 0.16228571428571428, "grad_norm": 0.7441158890724182, "kl": 0.01458740234375, "learning_rate": 1.9401803440464654e-07, "loss": -1.6716, "reward": 0.375, "reward_std": 0.23116150498390198, "rewards/accuracy_reward": 0.375, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 3037.75, "epoch": 0.16285714285714287, "grad_norm": 0.6573392152786255, "kl": 0.015625, "learning_rate": 1.9129037174968505e-07, "loss": -0.9305, "reward": 0.0833333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.0833333358168602, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 2087.666748046875, "epoch": 0.16342857142857142, "grad_norm": 0.9549260139465332, "kl": 0.017333984375, "learning_rate": 1.8859838853821435e-07, "loss": -2.6746, "reward": 0.2500000074505806, "reward_std": 0.3680921643972397, "rewards/accuracy_reward": 0.2500000074505806, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 1758.9166870117188, "epoch": 0.164, "grad_norm": 1.3493952751159668, "kl": 0.0152587890625, "learning_rate": 1.8594235253127372e-07, "loss": -2.8729, "reward": 0.291666679084301, "reward_std": 0.3951295167207718, "rewards/accuracy_reward": 0.291666679084301, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 2302.3751220703125, "epoch": 0.16457142857142856, "grad_norm": 0.5830644965171814, "kl": 0.0174560546875, "learning_rate": 1.8332252791437486e-07, "loss": -0.7415, "reward": 0.2083333432674408, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.2083333432674408, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 2104.8333740234375, "epoch": 0.16514285714285715, "grad_norm": 0.9054921269416809, "kl": 0.0169677734375, "learning_rate": 1.8073917527122385e-07, "loss": -1.7252, "reward": 0.1666666716337204, "reward_std": 0.23899272084236145, "rewards/accuracy_reward": 0.1666666716337204, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 1549.2500915527344, "epoch": 0.1657142857142857, "grad_norm": 1.4962562322616577, "kl": 0.02374267578125, "learning_rate": 1.7819255155780238e-07, "loss": -0.9334, "reward": 0.0833333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.0833333358168602, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 2643.3751220703125, "epoch": 0.1662857142857143, "grad_norm": 0.8332542777061462, "kl": 0.020294189453125, "learning_rate": 1.7568291007680907e-07, "loss": -1.4634, "reward": 0.0833333358168602, "reward_std": 0.20412413775920868, "rewards/accuracy_reward": 0.0833333358168602, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 2123.791748046875, "epoch": 0.16685714285714287, "grad_norm": 0.7176339030265808, "kl": 0.0137939453125, "learning_rate": 1.7321050045246455e-07, "loss": -1.4564, "reward": 0.0833333358168602, "reward_std": 0.20412413775920868, "rewards/accuracy_reward": 0.0833333358168602, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 1340.9167175292969, "epoch": 0.16742857142857143, "grad_norm": 1.2635085582733154, "kl": 0.011871337890625, "learning_rate": 1.7077556860568238e-07, "loss": -2.4176, "reward": 0.3333333544433117, "reward_std": 0.3332235738635063, "rewards/accuracy_reward": 0.3333333544433117, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 2578.8333740234375, "epoch": 0.168, "grad_norm": 0.5623243451118469, "kl": 0.0145263671875, "learning_rate": 1.6837835672960831e-07, "loss": -0.7386, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 2309.2084350585938, "epoch": 0.16857142857142857, "grad_norm": 1.792176365852356, "kl": 0.014251708984375, "learning_rate": 1.6601910326552998e-07, "loss": -1.6771, "reward": 0.1250000037252903, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.1250000037252903, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 2761.2083740234375, "epoch": 0.16914285714285715, "grad_norm": 0.5179618000984192, "kl": 0.0174560546875, "learning_rate": 1.6369804287916025e-07, "loss": -0.74, "reward": 0.0416666679084301, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.0416666679084301, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 2832.916748046875, "epoch": 0.1697142857142857, "grad_norm": 1.062676191329956, "kl": 0.0169677734375, "learning_rate": 1.6141540643729612e-07, "loss": -2.6165, "reward": 0.2916666679084301, "reward_std": 0.3602609559893608, "rewards/accuracy_reward": 0.2916666679084301, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 2174.5, "epoch": 0.1702857142857143, "grad_norm": 1.0952427387237549, "kl": 0.015716552734375, "learning_rate": 1.5917142098485503e-07, "loss": -1.4785, "reward": 0.25, "reward_std": 0.20412413775920868, "rewards/accuracy_reward": 0.25, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 1648.291748046875, "epoch": 0.17085714285714285, "grad_norm": 0.9061567187309265, "kl": 0.016937255859375, "learning_rate": 1.5696630972229166e-07, "loss": -1.6821, "reward": 0.2083333395421505, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.2083333395421505, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 2321.9583740234375, "epoch": 0.17142857142857143, "grad_norm": 0.47703251242637634, "kl": 0.01666259765625, "learning_rate": 1.548002919833971e-07, "loss": -0.7289, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 2111.7501220703125, "epoch": 0.172, "grad_norm": 0.47726908326148987, "kl": 0.01434326171875, "learning_rate": 1.5267358321348285e-07, "loss": 0.0023, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 1759.4166870117188, "epoch": 0.17257142857142857, "grad_norm": 0.6928355693817139, "kl": 0.016265869140625, "learning_rate": 1.5058639494795067e-07, "loss": -0.9378, "reward": 0.4166666716337204, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.4166666716337204, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 2077.5, "epoch": 0.17314285714285715, "grad_norm": 0.6859722137451172, "kl": 0.0133056640625, "learning_rate": 1.485389347912525e-07, "loss": -1.4857, "reward": 0.2500000111758709, "reward_std": 0.20412413775920868, "rewards/accuracy_reward": 0.2500000111758709, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 2916.2501220703125, "epoch": 0.1737142857142857, "grad_norm": 0.9551180601119995, "kl": 0.01416015625, "learning_rate": 1.4653140639624066e-07, "loss": -1.8796, "reward": 0.1666666716337204, "reward_std": 0.25819888710975647, "rewards/accuracy_reward": 0.1666666716337204, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 2326.8750610351562, "epoch": 0.1742857142857143, "grad_norm": 0.44118639826774597, "kl": 0.0216064453125, "learning_rate": 1.4456400944391144e-07, "loss": -0.7264, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 1633.6666870117188, "epoch": 0.17485714285714285, "grad_norm": 0.9067421555519104, "kl": 0.02105712890625, "learning_rate": 1.4263693962354336e-07, "loss": -1.4626, "reward": 0.3333333544433117, "reward_std": 0.20412413775920868, "rewards/accuracy_reward": 0.3333333544433117, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 2906.0833740234375, "epoch": 0.17542857142857143, "grad_norm": 0.49431222677230835, "kl": 0.011474609375, "learning_rate": 1.4075038861323302e-07, "loss": -0.7356, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 1410.7500610351562, "epoch": 0.176, "grad_norm": 0.8035596609115601, "kl": 0.01629638671875, "learning_rate": 1.3890454406082956e-07, "loss": -0.9359, "reward": 0.5833333432674408, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.5833333432674408, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 3014.25, "epoch": 0.17657142857142857, "grad_norm": 0.2768433392047882, "kl": 0.01849365234375, "learning_rate": 1.3709958956526974e-07, "loss": 0.0029, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 3245.7083740234375, "epoch": 0.17714285714285713, "grad_norm": 0.7835389971733093, "kl": 0.01654052734375, "learning_rate": 1.353357046583165e-07, "loss": -2.2183, "reward": 0.1250000037252903, "reward_std": 0.3061862140893936, "rewards/accuracy_reward": 0.1250000037252903, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 2727.3333740234375, "epoch": 0.1777142857142857, "grad_norm": 0.48111966252326965, "kl": 0.009368896484375, "learning_rate": 1.3361306478670148e-07, "loss": -0.9302, "reward": 0.0833333358168602, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.0833333358168602, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 1927.541748046875, "epoch": 0.1782857142857143, "grad_norm": 0.5695589184761047, "kl": 0.02117919921875, "learning_rate": 1.3193184129467384e-07, "loss": 0.0034, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 2037.0000610351562, "epoch": 0.17885714285714285, "grad_norm": 1.5638245344161987, "kl": 0.017608642578125, "learning_rate": 1.3029220140695756e-07, "loss": -1.6713, "reward": 0.125, "reward_std": 0.23116151988506317, "rewards/accuracy_reward": 0.125, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 2119.4584350585938, "epoch": 0.17942857142857144, "grad_norm": 0.8178718686103821, "kl": 0.0159912109375, "learning_rate": 1.2869430821211826e-07, "loss": -1.6774, "reward": 0.458333358168602, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.458333358168602, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 2671.666748046875, "epoch": 0.18, "grad_norm": 0.7374725341796875, "kl": 0.012298583984375, "learning_rate": 1.2713832064634125e-07, "loss": -1.6682, "reward": 0.1250000037252903, "reward_std": 0.23116151988506317, "rewards/accuracy_reward": 0.1250000037252903, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 1793.666748046875, "epoch": 0.18057142857142858, "grad_norm": 0.8555610775947571, "kl": 0.0196533203125, "learning_rate": 1.2562439347762275e-07, "loss": -0.7343, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 2572.6250610351562, "epoch": 0.18114285714285713, "grad_norm": 0.7458478808403015, "kl": 0.012847900390625, "learning_rate": 1.2415267729037608e-07, "loss": -1.6769, "reward": 0.1250000037252903, "reward_std": 0.23116151988506317, "rewards/accuracy_reward": 0.1250000037252903, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 1808.0000610351562, "epoch": 0.18171428571428572, "grad_norm": 1.242701530456543, "kl": 0.017181396484375, "learning_rate": 1.2272331847045313e-07, "loss": -2.6198, "reward": 0.3750000223517418, "reward_std": 0.3602609485387802, "rewards/accuracy_reward": 0.3750000223517418, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 2072.875, "epoch": 0.18228571428571427, "grad_norm": 0.43984729051589966, "kl": 0.01568603515625, "learning_rate": 1.2133645919058418e-07, "loss": -0.939, "reward": 0.4166666716337204, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.4166666716337204, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 2183.375, "epoch": 0.18285714285714286, "grad_norm": 0.7200397849082947, "kl": 0.017120361328125, "learning_rate": 1.1999223739623666e-07, "loss": -1.6787, "reward": 0.125, "reward_std": 0.23116150498390198, "rewards/accuracy_reward": 0.125, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 3046.916748046875, "epoch": 0.18342857142857144, "grad_norm": 0.5004404187202454, "kl": 0.016571044921875, "learning_rate": 1.1869078679189393e-07, "loss": -0.7381, "reward": 0.0416666679084301, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.0416666679084301, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 1882.75, "epoch": 0.184, "grad_norm": 1.3577691316604614, "kl": 0.0198974609375, "learning_rate": 1.1743223682775649e-07, "loss": -3.1577, "reward": 0.2083333358168602, "reward_std": 0.43528564274311066, "rewards/accuracy_reward": 0.2083333358168602, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 2563.2083740234375, "epoch": 0.18457142857142858, "grad_norm": 1.3319201469421387, "kl": 0.023193359375, "learning_rate": 1.1621671268686605e-07, "loss": -3.1597, "reward": 0.291666679084301, "reward_std": 0.43528565764427185, "rewards/accuracy_reward": 0.291666679084301, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 2025.666748046875, "epoch": 0.18514285714285714, "grad_norm": 0.5463172793388367, "kl": 0.02020263671875, "learning_rate": 1.1504433527265378e-07, "loss": -1.4631, "reward": 0.0833333358168602, "reward_std": 0.20412413775920868, "rewards/accuracy_reward": 0.0833333358168602, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 2767.6251220703125, "epoch": 0.18571428571428572, "grad_norm": 0.6291061639785767, "kl": 0.0208740234375, "learning_rate": 1.1391522119691496e-07, "loss": -0.7406, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 1606.8750610351562, "epoch": 0.18628571428571428, "grad_norm": 1.8553162813186646, "kl": 0.02337646484375, "learning_rate": 1.1282948276820962e-07, "loss": -1.9341, "reward": 0.2916666865348816, "reward_std": 0.26603007316589355, "rewards/accuracy_reward": 0.2916666865348816, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 2260.25, "epoch": 0.18685714285714286, "grad_norm": 0.6469095945358276, "kl": 0.014312744140625, "learning_rate": 1.1178722798069215e-07, "loss": 0.0023, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 2050.666748046875, "epoch": 0.18742857142857142, "grad_norm": 0.7297951579093933, "kl": 0.017822265625, "learning_rate": 1.10788560503369e-07, "loss": -0.9389, "reward": 0.1666666716337204, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.1666666716337204, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 2868.33349609375, "epoch": 0.188, "grad_norm": 1.0545990467071533, "kl": 0.01544189453125, "learning_rate": 1.0983357966978745e-07, "loss": -2.73, "reward": 0.2916666679084301, "reward_std": 0.37592335790395737, "rewards/accuracy_reward": 0.2916666679084301, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 2244.7501220703125, "epoch": 0.18857142857142858, "grad_norm": 1.0005416870117188, "kl": 0.017578125, "learning_rate": 1.0892238046815527e-07, "loss": -2.2275, "reward": 0.291666679084301, "reward_std": 0.3061862140893936, "rewards/accuracy_reward": 0.291666679084301, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 1698.3333435058594, "epoch": 0.18914285714285714, "grad_norm": 0.24079620838165283, "kl": 0.01593017578125, "learning_rate": 1.0805505353189254e-07, "loss": 0.0029, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 1608.1667175292969, "epoch": 0.18971428571428572, "grad_norm": 0.5765926837921143, "kl": 0.013397216796875, "learning_rate": 1.0723168513061665e-07, "loss": -0.7406, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 2000.416748046875, "epoch": 0.19028571428571428, "grad_norm": 0.4461122751235962, "kl": 0.01611328125, "learning_rate": 1.0645235716156168e-07, "loss": 0.0026, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 2835.75, "epoch": 0.19085714285714286, "grad_norm": 0.9358536005020142, "kl": 0.01904296875, "learning_rate": 1.0571714714143197e-07, "loss": -1.6761, "reward": 0.2083333432674408, "reward_std": 0.23116150498390198, "rewards/accuracy_reward": 0.2083333432674408, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 1533.125, "epoch": 0.19142857142857142, "grad_norm": 0.58393394947052, "kl": 0.0191650390625, "learning_rate": 1.0502612819869216e-07, "loss": 0.003, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 1860.0833740234375, "epoch": 0.192, "grad_norm": 0.9539235234260559, "kl": 0.0213623046875, "learning_rate": 1.0437936906629334e-07, "loss": -2.6119, "reward": 0.2083333358168602, "reward_std": 0.3602609485387802, "rewards/accuracy_reward": 0.2083333358168602, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 2359.9584350585938, "epoch": 0.19257142857142856, "grad_norm": 0.591833233833313, "kl": 0.020263671875, "learning_rate": 1.0377693407483638e-07, "loss": -0.9325, "reward": 0.4166666865348816, "reward_std": 0.12909944355487823, "rewards/accuracy_reward": 0.4166666865348816, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 2456.5834350585938, "epoch": 0.19314285714285714, "grad_norm": 1.7795339822769165, "kl": 0.01800537109375, "learning_rate": 1.032188831461732e-07, "loss": -3.5502, "reward": 0.375, "reward_std": 0.48936039209365845, "rewards/accuracy_reward": 0.375, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 2746.6251220703125, "epoch": 0.19371428571428573, "grad_norm": 0.5191667675971985, "kl": 0.0203857421875, "learning_rate": 1.0270527178744664e-07, "loss": -0.7381, "reward": 0.0416666679084301, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.0416666679084301, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 1982.0000610351562, "epoch": 0.19428571428571428, "grad_norm": 0.5445987582206726, "kl": 0.01641845703125, "learning_rate": 1.0223615108556937e-07, "loss": 0.0026, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 1073.125, "epoch": 0.19485714285714287, "grad_norm": 1.0652012825012207, "kl": 0.01904296875, "learning_rate": 1.0181156770214242e-07, "loss": -1.9294, "reward": 0.2083333358168602, "reward_std": 0.26603008806705475, "rewards/accuracy_reward": 0.2083333358168602, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 1901.2917175292969, "epoch": 0.19542857142857142, "grad_norm": 1.262035846710205, "kl": 0.016754150390625, "learning_rate": 1.0143156386881408e-07, "loss": -1.8793, "reward": 0.2500000074505806, "reward_std": 0.25819888710975647, "rewards/accuracy_reward": 0.2500000074505806, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 1721.8750610351562, "epoch": 0.196, "grad_norm": 0.8769494295120239, "kl": 0.02349853515625, "learning_rate": 1.0109617738307911e-07, "loss": -0.7405, "reward": 0.2083333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.2083333432674408, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 2183.2084350585938, "epoch": 0.19657142857142856, "grad_norm": 2.0577425956726074, "kl": 0.01995849609375, "learning_rate": 1.0080544160451918e-07, "loss": -2.7138, "reward": 0.2916666679084301, "reward_std": 0.37592336535453796, "rewards/accuracy_reward": 0.2916666679084301, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 1418.0833435058594, "epoch": 0.19714285714285715, "grad_norm": 0.6003240942955017, "kl": 0.017822265625, "learning_rate": 1.0055938545148495e-07, "loss": 0.0033, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 2902.291748046875, "epoch": 0.1977142857142857, "grad_norm": 0.4958335757255554, "kl": 0.01824951171875, "learning_rate": 1.0035803339821934e-07, "loss": -0.7393, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 2797.2083740234375, "epoch": 0.1982857142857143, "grad_norm": 0.5800639390945435, "kl": 0.012176513671875, "learning_rate": 1.002014054724235e-07, "loss": -1.6757, "reward": 0.1250000037252903, "reward_std": 0.23116151243448257, "rewards/accuracy_reward": 0.1250000037252903, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 2299.666748046875, "epoch": 0.19885714285714284, "grad_norm": 0.43301284313201904, "kl": 0.02374267578125, "learning_rate": 1.0008951725326441e-07, "loss": 0.0038, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 3268.9583740234375, "epoch": 0.19942857142857143, "grad_norm": 0.4114702641963959, "kl": 0.0225830078125, "learning_rate": 1.0002237986982564e-07, "loss": -0.7369, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 1340.791748046875, "epoch": 0.2, "grad_norm": 0.5905561447143555, "kl": 0.017578125, "learning_rate": 1e-07, "loss": 0.0026, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "step": 350 }, { "epoch": 0.2, "step": 350, "total_flos": 0.0, "train_loss": -1.2733016510141806, "train_runtime": 18317.2078, "train_samples_per_second": 0.459, "train_steps_per_second": 0.019 } ], "logging_steps": 1, "max_steps": 350, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }