{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.374772190851782, "eval_steps": 100, "global_step": 1566, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 562.7477951049805, "epoch": 0.0019121030145499089, "grad_norm": 0.022798627614974976, "kl": 0.0, "learning_rate": 1.5923566878980894e-08, "loss": 0.002, "num_tokens": 3752220.0, "reward": 0.011439732770668343, "reward_std": 0.019404857070185244, "rewards/pure_accuracy_reward_math": 0.011439732537837699, "step": 1 }, { "clip_ratio": 0.0, "epoch": 0.0038242060290998177, "grad_norm": 0.02280641719698906, "kl": 0.0, "learning_rate": 3.184713375796179e-08, "loss": 0.002, "step": 2 }, { "clip_ratio": 7.760171627069212e-05, "epoch": 0.005736309043649726, "grad_norm": 0.02249608002603054, "kl": 0.00034177303314208984, "learning_rate": 4.777070063694268e-08, "loss": 0.002, "step": 3 }, { "clip_ratio": 7.010291557207893e-05, "epoch": 0.0076484120581996355, "grad_norm": 0.022546162828803062, "kl": 0.0003476440906524658, "learning_rate": 6.369426751592358e-08, "loss": 0.002, "step": 4 }, { "clip_ratio": 6.121935876990392e-05, "epoch": 0.009560515072749545, "grad_norm": 0.022293007001280785, "kl": 0.00034675002098083496, "learning_rate": 7.961783439490447e-08, "loss": 0.002, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 569.8786544799805, "epoch": 0.011472618087299453, "grad_norm": 0.04666038230061531, "kl": 0.000458449125289917, "learning_rate": 9.554140127388536e-08, "loss": 0.0036, "num_tokens": 7526881.0, "reward": 0.010323661233996972, "reward_std": 0.01923220051685348, "rewards/pure_accuracy_reward_math": 0.01032366111758165, "step": 6 }, { "clip_ratio": 9.284320668712098e-05, "epoch": 0.013384721101849363, "grad_norm": 0.03701707720756531, "kl": 0.0004444718360900879, "learning_rate": 1.1146496815286625e-07, "loss": 0.0037, "step": 7 }, { "clip_ratio": 0.00010049525423028172, "epoch": 0.015296824116399271, "grad_norm": 0.05443934351205826, "kl": 0.0004649162292480469, "learning_rate": 1.2738853503184715e-07, "loss": 0.0037, "step": 8 }, { "clip_ratio": 9.395023369052069e-05, "epoch": 0.01720892713094918, "grad_norm": 0.0357414111495018, "kl": 0.0004501640796661377, "learning_rate": 1.4331210191082803e-07, "loss": 0.0037, "step": 9 }, { "clip_ratio": 0.00010371651984542041, "epoch": 0.01912103014549909, "grad_norm": 0.05199029669165611, "kl": 0.0004614591598510742, "learning_rate": 1.5923566878980893e-07, "loss": 0.0037, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 570.0694994926453, "epoch": 0.021033133160048997, "grad_norm": 0.022845298051834106, "kl": 0.00035685300827026367, "learning_rate": 1.751592356687898e-07, "loss": 0.0025, "num_tokens": 11302358.0, "reward": 0.00948660756694153, "reward_std": 0.017558093182742596, "rewards/pure_accuracy_reward_math": 0.00948660756694153, "step": 11 }, { "clip_ratio": 7.08361723127382e-05, "epoch": 0.022945236174598906, "grad_norm": 0.02234972082078457, "kl": 0.0003580451011657715, "learning_rate": 1.9108280254777072e-07, "loss": 0.0025, "step": 12 }, { "clip_ratio": 6.80922717606336e-05, "epoch": 0.024857339189148814, "grad_norm": 0.021554963663220406, "kl": 0.00035765767097473145, "learning_rate": 2.070063694267516e-07, "loss": 0.0024, "step": 13 }, { "clip_ratio": 7.82350492158912e-05, "epoch": 0.026769442203698725, "grad_norm": 0.02103673666715622, "kl": 0.000364154577255249, "learning_rate": 2.229299363057325e-07, "loss": 0.0025, "step": 14 }, { "clip_ratio": 7.339451894949889e-05, "epoch": 0.028681545218248634, "grad_norm": 0.023219415917992592, "kl": 0.00036078691482543945, "learning_rate": 2.3885350318471343e-07, "loss": 0.0025, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 560.8797726631165, "epoch": 0.030593648232798542, "grad_norm": 0.024746257811784744, "kl": 0.0003574490547180176, "learning_rate": 2.547770700636943e-07, "loss": 0.0041, "num_tokens": 15044695.0, "reward": 0.011160714813740924, "reward_std": 0.0194911856087856, "rewards/pure_accuracy_reward_math": 0.011160714755533263, "step": 16 }, { "clip_ratio": 9.0199953319825e-05, "epoch": 0.032505751247348454, "grad_norm": 0.02409624680876732, "kl": 0.0003629624843597412, "learning_rate": 2.707006369426752e-07, "loss": 0.0042, "step": 17 }, { "clip_ratio": 8.157364351291108e-05, "epoch": 0.03441785426189836, "grad_norm": 0.023118698969483376, "kl": 0.0003673136234283447, "learning_rate": 2.8662420382165606e-07, "loss": 0.0041, "step": 18 }, { "clip_ratio": 9.048881202033954e-05, "epoch": 0.03632995727644827, "grad_norm": 0.02316245064139366, "kl": 0.00036725401878356934, "learning_rate": 3.02547770700637e-07, "loss": 0.0041, "step": 19 }, { "clip_ratio": 8.188984941170929e-05, "epoch": 0.03824206029099818, "grad_norm": 0.021714523434638977, "kl": 0.0003698766231536865, "learning_rate": 3.1847133757961787e-07, "loss": 0.0041, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 554.6908774375916, "epoch": 0.040154163305548086, "grad_norm": 0.021168457344174385, "kl": 0.000368267297744751, "learning_rate": 3.3439490445859875e-07, "loss": 0.0026, "num_tokens": 18758275.0, "reward": 0.010044643335277215, "reward_std": 0.018202457285951823, "rewards/pure_accuracy_reward_math": 0.010044643277069554, "step": 21 }, { "clip_ratio": 7.562077911416054e-05, "epoch": 0.042066266320097995, "grad_norm": 0.020001132041215897, "kl": 0.00037425756454467773, "learning_rate": 3.503184713375796e-07, "loss": 0.0026, "step": 22 }, { "clip_ratio": 7.507880479806772e-05, "epoch": 0.0439783693346479, "grad_norm": 0.019386926665902138, "kl": 0.0003781616687774658, "learning_rate": 3.6624203821656055e-07, "loss": 0.0026, "step": 23 }, { "clip_ratio": 7.805726602327923e-05, "epoch": 0.04589047234919781, "grad_norm": 0.018619129434227943, "kl": 0.0003878176212310791, "learning_rate": 3.8216560509554143e-07, "loss": 0.0026, "step": 24 }, { "clip_ratio": 6.671031508176384e-05, "epoch": 0.04780257536374772, "grad_norm": 0.01833859272301197, "kl": 0.00040024518966674805, "learning_rate": 3.980891719745223e-07, "loss": 0.0026, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 552.0828938484192, "epoch": 0.04971467837829763, "grad_norm": 0.02587960660457611, "kl": 0.00041344761848449707, "learning_rate": 4.140127388535032e-07, "loss": 0.0024, "num_tokens": 22468764.0, "reward": 0.012276786321308464, "reward_std": 0.022195036057382822, "rewards/pure_accuracy_reward_math": 0.012276786204893142, "step": 26 }, { "clip_ratio": 9.613389988771814e-05, "epoch": 0.05162678139284754, "grad_norm": 0.02422533929347992, "kl": 0.00043016672134399414, "learning_rate": 4.2993630573248406e-07, "loss": 0.0024, "step": 27 }, { "clip_ratio": 8.45099556840978e-05, "epoch": 0.05353888440739745, "grad_norm": 0.023998353630304337, "kl": 0.0004411041736602783, "learning_rate": 4.45859872611465e-07, "loss": 0.0024, "step": 28 }, { "clip_ratio": 9.715859295056362e-05, "epoch": 0.05545098742194736, "grad_norm": 0.023024486377835274, "kl": 0.0004749894142150879, "learning_rate": 4.6178343949044587e-07, "loss": 0.0024, "step": 29 }, { "clip_ratio": 9.816014483021718e-05, "epoch": 0.05736309043649727, "grad_norm": 0.022171439602971077, "kl": 0.0005015134811401367, "learning_rate": 4.777070063694269e-07, "loss": 0.0024, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 549.791042804718, "epoch": 0.059275193451047176, "grad_norm": 0.027614159509539604, "kl": 0.0005223453044891357, "learning_rate": 4.936305732484077e-07, "loss": 0.0029, "num_tokens": 26170579.0, "reward": 0.017299108090810478, "reward_std": 0.03018019301816821, "rewards/pure_accuracy_reward_math": 0.017299107741564512, "step": 31 }, { "clip_ratio": 0.00012569415866892086, "epoch": 0.061187296465597084, "grad_norm": 0.02653171494603157, "kl": 0.0005522072315216064, "learning_rate": 5.095541401273886e-07, "loss": 0.0029, "step": 32 }, { "clip_ratio": 0.00012863677034147258, "epoch": 0.06309939948014699, "grad_norm": 0.0255680400878191, "kl": 0.0005916953086853027, "learning_rate": 5.254777070063695e-07, "loss": 0.0029, "step": 33 }, { "clip_ratio": 0.00012797017114962728, "epoch": 0.06501150249469691, "grad_norm": 0.02455417811870575, "kl": 0.0006306171417236328, "learning_rate": 5.414012738853504e-07, "loss": 0.0029, "step": 34 }, { "clip_ratio": 0.00012855784757448419, "epoch": 0.06692360550924681, "grad_norm": 0.024154040962457657, "kl": 0.0006751418113708496, "learning_rate": 5.573248407643312e-07, "loss": 0.0029, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 552.728542804718, "epoch": 0.06883570852379672, "grad_norm": 0.023450903594493866, "kl": 0.000738978385925293, "learning_rate": 5.732484076433121e-07, "loss": 0.0034, "num_tokens": 29883398.0, "reward": 0.018415179511066526, "reward_std": 0.030997214023955166, "rewards/pure_accuracy_reward_math": 0.01841517922002822, "step": 36 }, { "clip_ratio": 0.00012425195308196635, "epoch": 0.07074781153834662, "grad_norm": 0.023070134222507477, "kl": 0.0007783770561218262, "learning_rate": 5.89171974522293e-07, "loss": 0.0034, "step": 37 }, { "clip_ratio": 0.00012334759713894528, "epoch": 0.07265991455289654, "grad_norm": 0.023447532206773758, "kl": 0.0008447170257568359, "learning_rate": 6.05095541401274e-07, "loss": 0.0034, "step": 38 }, { "clip_ratio": 0.00012615493608336692, "epoch": 0.07457201756744644, "grad_norm": 0.024682210758328438, "kl": 0.0009213089942932129, "learning_rate": 6.210191082802549e-07, "loss": 0.0034, "step": 39 }, { "clip_ratio": 0.00012461718182521508, "epoch": 0.07648412058199636, "grad_norm": 0.02555885910987854, "kl": 0.000977635383605957, "learning_rate": 6.369426751592357e-07, "loss": 0.0033, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 527.030158996582, "epoch": 0.07839622359654626, "grad_norm": 0.059237755835056305, "kl": 0.001125633716583252, "learning_rate": 6.528662420382166e-07, "loss": 0.0031, "num_tokens": 33502406.0, "reward": 0.024832590454025194, "reward_std": 0.04194520629243925, "rewards/pure_accuracy_reward_math": 0.02483259010477923, "step": 41 }, { "clip_ratio": 0.00016323180295785278, "epoch": 0.08030832661109617, "grad_norm": 0.029172642156481743, "kl": 0.0011183619499206543, "learning_rate": 6.687898089171975e-07, "loss": 0.0031, "step": 42 }, { "clip_ratio": 0.0001751068371618203, "epoch": 0.08222042962564609, "grad_norm": 0.030453085899353027, "kl": 0.0011813640594482422, "learning_rate": 6.847133757961784e-07, "loss": 0.0031, "step": 43 }, { "clip_ratio": 0.00018521026674989116, "epoch": 0.08413253264019599, "grad_norm": 0.03091653250157833, "kl": 0.0012224912643432617, "learning_rate": 7.006369426751592e-07, "loss": 0.0031, "step": 44 }, { "clip_ratio": 0.00017049979595640252, "epoch": 0.0860446356547459, "grad_norm": 0.030593233183026314, "kl": 0.0012733936309814453, "learning_rate": 7.165605095541401e-07, "loss": 0.0031, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 529.7360739707947, "epoch": 0.0879567386692958, "grad_norm": 0.03371572494506836, "kl": 0.0012704133987426758, "learning_rate": 7.324840764331211e-07, "loss": 0.0039, "num_tokens": 37133676.0, "reward": 0.029017858760198578, "reward_std": 0.04838265001308173, "rewards/pure_accuracy_reward_math": 0.029017858061706647, "step": 46 }, { "clip_ratio": 0.000227557278265067, "epoch": 0.08986884168384572, "grad_norm": 0.033185359090566635, "kl": 0.0012688040733337402, "learning_rate": 7.48407643312102e-07, "loss": 0.0039, "step": 47 }, { "clip_ratio": 0.0002238695693677073, "epoch": 0.09178094469839562, "grad_norm": 0.03329231217503548, "kl": 0.0013200044631958008, "learning_rate": 7.643312101910829e-07, "loss": 0.0039, "step": 48 }, { "clip_ratio": 0.00021458888153347289, "epoch": 0.09369304771294554, "grad_norm": 0.03329336270689964, "kl": 0.0013244152069091797, "learning_rate": 7.802547770700637e-07, "loss": 0.0039, "step": 49 }, { "clip_ratio": 0.0002193794426830209, "epoch": 0.09560515072749544, "grad_norm": 0.0323607362806797, "kl": 0.0013269782066345215, "learning_rate": 7.961783439490446e-07, "loss": 0.0039, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 535.9352931976318, "epoch": 0.09751725374204535, "grad_norm": 0.030199358239769936, "kl": 0.0013506412506103516, "learning_rate": 8.121019108280255e-07, "loss": 0.0042, "num_tokens": 40789896.0, "reward": 0.030970983498264104, "reward_std": 0.0486878992523998, "rewards/pure_accuracy_reward_math": 0.030970983090810478, "step": 51 }, { "clip_ratio": 0.00019589511845197194, "epoch": 0.09942935675659526, "grad_norm": 0.029786745086312294, "kl": 0.001370549201965332, "learning_rate": 8.280254777070064e-07, "loss": 0.0042, "step": 52 }, { "clip_ratio": 0.00021279048064570816, "epoch": 0.10134145977114517, "grad_norm": 0.029834378510713577, "kl": 0.0013399124145507812, "learning_rate": 8.439490445859872e-07, "loss": 0.0042, "step": 53 }, { "clip_ratio": 0.000190277668878025, "epoch": 0.10325356278569509, "grad_norm": 0.029410598799586296, "kl": 0.00139617919921875, "learning_rate": 8.598726114649681e-07, "loss": 0.0042, "step": 54 }, { "clip_ratio": 0.00019459096591845082, "epoch": 0.10516566580024499, "grad_norm": 0.02935440093278885, "kl": 0.0014204978942871094, "learning_rate": 8.757961783439491e-07, "loss": 0.0042, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 536.9548239707947, "epoch": 0.1070777688147949, "grad_norm": 0.02805081568658352, "kl": 0.0014168024063110352, "learning_rate": 8.9171974522293e-07, "loss": 0.0048, "num_tokens": 44444894.0, "reward": 0.027901787223527208, "reward_std": 0.04121451411629096, "rewards/pure_accuracy_reward_math": 0.02790178669965826, "step": 56 }, { "clip_ratio": 0.00016821016617996065, "epoch": 0.1089898718293448, "grad_norm": 0.02779608964920044, "kl": 0.0014551877975463867, "learning_rate": 9.076433121019109e-07, "loss": 0.0048, "step": 57 }, { "clip_ratio": 0.00018197509814399382, "epoch": 0.11090197484389472, "grad_norm": 0.02721741609275341, "kl": 0.0014206171035766602, "learning_rate": 9.235668789808917e-07, "loss": 0.0048, "step": 58 }, { "clip_ratio": 0.00016919344039934003, "epoch": 0.11281407785844462, "grad_norm": 0.02676265314221382, "kl": 0.0014575719833374023, "learning_rate": 9.394904458598727e-07, "loss": 0.0048, "step": 59 }, { "clip_ratio": 0.00017069062050723005, "epoch": 0.11472618087299453, "grad_norm": 0.027010478079319, "kl": 0.0014843940734863281, "learning_rate": 9.554140127388537e-07, "loss": 0.0048, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 522.1998043060303, "epoch": 0.11663828388754444, "grad_norm": 0.030231643468141556, "kl": 0.0015106201171875, "learning_rate": 9.713375796178345e-07, "loss": 0.0029, "num_tokens": 48046694.0, "reward": 0.02762276935391128, "reward_std": 0.04623683576937765, "rewards/pure_accuracy_reward_math": 0.02762276877183467, "step": 61 }, { "clip_ratio": 0.0001882643781527804, "epoch": 0.11855038690209435, "grad_norm": 0.030413959175348282, "kl": 0.0015065670013427734, "learning_rate": 9.872611464968155e-07, "loss": 0.0029, "step": 62 }, { "clip_ratio": 0.00019050979824442038, "epoch": 0.12046248991664425, "grad_norm": 0.029997214674949646, "kl": 0.0014984607696533203, "learning_rate": 1.0031847133757962e-06, "loss": 0.0029, "step": 63 }, { "clip_ratio": 0.0001963579389325787, "epoch": 0.12237459293119417, "grad_norm": 0.02927768975496292, "kl": 0.0014634132385253906, "learning_rate": 1.0191082802547772e-06, "loss": 0.0029, "step": 64 }, { "clip_ratio": 0.0002130206620449826, "epoch": 0.12428669594574408, "grad_norm": 0.028719380497932434, "kl": 0.0014470815658569336, "learning_rate": 1.035031847133758e-06, "loss": 0.0029, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 516.4283137321472, "epoch": 0.12619879896029398, "grad_norm": 0.031215306371450424, "kl": 0.0014127492904663086, "learning_rate": 1.050955414012739e-06, "loss": 0.0038, "num_tokens": 51628501.0, "reward": 0.03487723405123688, "reward_std": 0.05173706269124523, "rewards/pure_accuracy_reward_math": 0.03487723323632963, "step": 66 }, { "clip_ratio": 0.00019433782460964721, "epoch": 0.1281109019748439, "grad_norm": 0.03108724020421505, "kl": 0.0014324188232421875, "learning_rate": 1.06687898089172e-06, "loss": 0.0038, "step": 67 }, { "clip_ratio": 0.00020085336353758976, "epoch": 0.13002300498939381, "grad_norm": 0.030220478773117065, "kl": 0.0014306306838989258, "learning_rate": 1.0828025477707007e-06, "loss": 0.0038, "step": 68 }, { "clip_ratio": 0.00021161197844321578, "epoch": 0.1319351080039437, "grad_norm": 0.030320733785629272, "kl": 0.001450181007385254, "learning_rate": 1.0987261146496817e-06, "loss": 0.0038, "step": 69 }, { "clip_ratio": 0.00019352555551677142, "epoch": 0.13384721101849362, "grad_norm": 0.02980073168873787, "kl": 0.0014796257019042969, "learning_rate": 1.1146496815286625e-06, "loss": 0.0038, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 525.2854599952698, "epoch": 0.13575931403304353, "grad_norm": 0.0338500440120697, "kl": 0.0015006065368652344, "learning_rate": 1.1305732484076435e-06, "loss": 0.006, "num_tokens": 55247180.0, "reward": 0.03710937674622983, "reward_std": 0.05426825548056513, "rewards/pure_accuracy_reward_math": 0.037109375989530236, "step": 71 }, { "clip_ratio": 0.0002256608086668166, "epoch": 0.13767141704759345, "grad_norm": 0.03328324109315872, "kl": 0.0015664100646972656, "learning_rate": 1.1464968152866242e-06, "loss": 0.006, "step": 72 }, { "clip_ratio": 0.0002166868289350532, "epoch": 0.13958352006214333, "grad_norm": 0.03267475962638855, "kl": 0.0016113519668579102, "learning_rate": 1.1624203821656052e-06, "loss": 0.006, "step": 73 }, { "clip_ratio": 0.00024709346627105333, "epoch": 0.14149562307669325, "grad_norm": 0.032320376485586166, "kl": 0.0017037391662597656, "learning_rate": 1.178343949044586e-06, "loss": 0.006, "step": 74 }, { "clip_ratio": 0.00021453456992048814, "epoch": 0.14340772609124317, "grad_norm": 0.0322573184967041, "kl": 0.0017703771591186523, "learning_rate": 1.194267515923567e-06, "loss": 0.006, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 539.314199924469, "epoch": 0.14531982910579308, "grad_norm": 0.03833702206611633, "kl": 0.0018039941787719727, "learning_rate": 1.210191082802548e-06, "loss": 0.0055, "num_tokens": 58912938.0, "reward": 0.04045759144355543, "reward_std": 0.060838291130494326, "rewards/pure_accuracy_reward_math": 0.040457590454025194, "step": 76 }, { "clip_ratio": 0.0002450900424548763, "epoch": 0.147231932120343, "grad_norm": 0.03705858439207077, "kl": 0.0018303394317626953, "learning_rate": 1.2261146496815287e-06, "loss": 0.0055, "step": 77 }, { "clip_ratio": 0.0002520209266094753, "epoch": 0.14914403513489288, "grad_norm": 0.03624257072806358, "kl": 0.0019118785858154297, "learning_rate": 1.2420382165605097e-06, "loss": 0.0055, "step": 78 }, { "clip_ratio": 0.00023157394139161624, "epoch": 0.1510561381494428, "grad_norm": 0.03626013919711113, "kl": 0.001949906349182129, "learning_rate": 1.2579617834394905e-06, "loss": 0.0055, "step": 79 }, { "clip_ratio": 0.0002889583781211513, "epoch": 0.1529682411639927, "grad_norm": 0.03634464740753174, "kl": 0.001984238624572754, "learning_rate": 1.2738853503184715e-06, "loss": 0.0055, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 531.2025918960571, "epoch": 0.15488034417854263, "grad_norm": 0.032439347356557846, "kl": 0.0019190311431884766, "learning_rate": 1.2898089171974522e-06, "loss": 0.0067, "num_tokens": 62551992.0, "reward": 0.03766741280560382, "reward_std": 0.0572711571585387, "rewards/pure_accuracy_reward_math": 0.0376674119324889, "step": 81 }, { "clip_ratio": 0.00025730342139240747, "epoch": 0.15679244719309252, "grad_norm": 0.03198026493191719, "kl": 0.001917123794555664, "learning_rate": 1.3057324840764332e-06, "loss": 0.0067, "step": 82 }, { "clip_ratio": 0.0002504205738205201, "epoch": 0.15870455020764243, "grad_norm": 0.02998184598982334, "kl": 0.0019073486328125, "learning_rate": 1.3216560509554142e-06, "loss": 0.0067, "step": 83 }, { "clip_ratio": 0.00025362581419585695, "epoch": 0.16061665322219235, "grad_norm": 0.029601849615573883, "kl": 0.0019354820251464844, "learning_rate": 1.337579617834395e-06, "loss": 0.0067, "step": 84 }, { "clip_ratio": 0.0003167184295307379, "epoch": 0.16252875623674226, "grad_norm": 0.030052170157432556, "kl": 0.0019598007202148438, "learning_rate": 1.353503184713376e-06, "loss": 0.0067, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 527.9562168121338, "epoch": 0.16444085925129218, "grad_norm": 0.03331635147333145, "kl": 0.002047300338745117, "learning_rate": 1.3694267515923567e-06, "loss": 0.0076, "num_tokens": 66182275.0, "reward": 0.04045759132714011, "reward_std": 0.06074576545506716, "rewards/pure_accuracy_reward_math": 0.04045759068685584, "step": 86 }, { "clip_ratio": 0.0002471263709367122, "epoch": 0.16635296226584206, "grad_norm": 0.03298444300889969, "kl": 0.0020711421966552734, "learning_rate": 1.3853503184713377e-06, "loss": 0.0076, "step": 87 }, { "clip_ratio": 0.00024866302578629984, "epoch": 0.16826506528039198, "grad_norm": 0.03206898272037506, "kl": 0.0020384788513183594, "learning_rate": 1.4012738853503185e-06, "loss": 0.0076, "step": 88 }, { "clip_ratio": 0.00026278120321876486, "epoch": 0.1701771682949419, "grad_norm": 0.03115510568022728, "kl": 0.002008795738220215, "learning_rate": 1.4171974522292995e-06, "loss": 0.0076, "step": 89 }, { "clip_ratio": 0.000245522400405207, "epoch": 0.1720892713094918, "grad_norm": 0.030577220022678375, "kl": 0.0019922256469726562, "learning_rate": 1.4331210191082802e-06, "loss": 0.0076, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 523.3948340415955, "epoch": 0.1740013743240417, "grad_norm": 0.0348118431866169, "kl": 0.0019588470458984375, "learning_rate": 1.4490445859872612e-06, "loss": 0.0046, "num_tokens": 69793534.0, "reward": 0.04436384132714011, "reward_std": 0.059376906079705805, "rewards/pure_accuracy_reward_math": 0.044363840570440516, "step": 91 }, { "clip_ratio": 0.00021377558331892033, "epoch": 0.1759134773385916, "grad_norm": 0.03493114933371544, "kl": 0.0019345283508300781, "learning_rate": 1.4649681528662422e-06, "loss": 0.0046, "step": 92 }, { "clip_ratio": 0.00023636125789039397, "epoch": 0.17782558035314153, "grad_norm": 0.03362264111638069, "kl": 0.0019860267639160156, "learning_rate": 1.480891719745223e-06, "loss": 0.0046, "step": 93 }, { "clip_ratio": 0.00022836430440520417, "epoch": 0.17973768336769144, "grad_norm": 0.03336656093597412, "kl": 0.002032160758972168, "learning_rate": 1.496815286624204e-06, "loss": 0.0045, "step": 94 }, { "clip_ratio": 0.00024139108904819295, "epoch": 0.18164978638224133, "grad_norm": 0.03235051408410072, "kl": 0.0021082162857055664, "learning_rate": 1.5127388535031847e-06, "loss": 0.0045, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 530.8058252334595, "epoch": 0.18356188939679124, "grad_norm": 0.03482769802212715, "kl": 0.0021872520446777344, "learning_rate": 1.5286624203821657e-06, "loss": 0.0075, "num_tokens": 73427974.0, "reward": 0.04101562709547579, "reward_std": 0.06101094774203375, "rewards/pure_accuracy_reward_math": 0.04101562616415322, "step": 96 }, { "clip_ratio": 0.00024072786442275174, "epoch": 0.18547399241134116, "grad_norm": 0.03345990553498268, "kl": 0.002261519432067871, "learning_rate": 1.5445859872611465e-06, "loss": 0.0075, "step": 97 }, { "clip_ratio": 0.00024480573915752757, "epoch": 0.18738609542589107, "grad_norm": 0.03318383917212486, "kl": 0.0022890567779541016, "learning_rate": 1.5605095541401275e-06, "loss": 0.0075, "step": 98 }, { "clip_ratio": 0.00027489714915418517, "epoch": 0.189298198440441, "grad_norm": 0.03230712562799454, "kl": 0.0023267269134521484, "learning_rate": 1.5764331210191083e-06, "loss": 0.0074, "step": 99 }, { "clip_ratio": 0.00029621877195040724, "epoch": 0.19121030145499088, "grad_norm": 0.03260359168052673, "kl": 0.002334117889404297, "learning_rate": 1.5923566878980892e-06, "loss": 0.0074, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 526.358283996582, "epoch": 0.0019121030145499089, "grad_norm": 0.30763256549835205, "kl": 0.0024461746215820312, "learning_rate": 1.6082802547770702e-06, "loss": 0.0053, "num_tokens": 3621800.0, "reward": 0.0546875023865141, "reward_std": 0.06997958500869572, "rewards/pure_accuracy_reward_math": 0.054687501629814506, "step": 101 }, { "clip_ratio": 0.00028505406811518696, "epoch": 0.0038242060290998177, "grad_norm": 0.7424792647361755, "kl": 0.005189061164855957, "learning_rate": 1.624203821656051e-06, "loss": 0.0054, "step": 102 }, { "clip_ratio": 0.000307778484739174, "epoch": 0.005736309043649726, "grad_norm": 0.5747273564338684, "kl": 0.005206584930419922, "learning_rate": 1.640127388535032e-06, "loss": 0.0054, "step": 103 }, { "clip_ratio": 0.0003712488735345687, "epoch": 0.0076484120581996355, "grad_norm": 0.15304483473300934, "kl": 0.0026189088821411133, "learning_rate": 1.6560509554140127e-06, "loss": 0.0053, "step": 104 }, { "clip_ratio": 0.00037476027159755176, "epoch": 0.009560515072749545, "grad_norm": 0.2118157148361206, "kl": 0.00246584415435791, "learning_rate": 1.6719745222929937e-06, "loss": 0.0053, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 527.46431016922, "epoch": 0.011472618087299453, "grad_norm": 0.2036779820919037, "kl": 0.0037467479705810547, "learning_rate": 1.6878980891719745e-06, "loss": 0.0067, "num_tokens": 7244448.0, "reward": 0.05161830596625805, "reward_std": 0.06822534638922662, "rewards/pure_accuracy_reward_math": 0.05161830474389717, "step": 106 }, { "clip_ratio": 0.0002751678786125922, "epoch": 0.013384721101849363, "grad_norm": 0.1858554631471634, "kl": 0.0035070180892944336, "learning_rate": 1.7038216560509555e-06, "loss": 0.0067, "step": 107 }, { "clip_ratio": 0.0002901391828800115, "epoch": 0.015296824116399271, "grad_norm": 0.06319136172533035, "kl": 0.0033702850341796875, "learning_rate": 1.7197452229299363e-06, "loss": 0.0067, "step": 108 }, { "clip_ratio": 0.00029408001091724145, "epoch": 0.01720892713094918, "grad_norm": 0.061827220022678375, "kl": 0.00351715087890625, "learning_rate": 1.7356687898089172e-06, "loss": 0.0067, "step": 109 }, { "clip_ratio": 0.0002710100695253459, "epoch": 0.01912103014549909, "grad_norm": 0.13167870044708252, "kl": 0.0036835670471191406, "learning_rate": 1.7515923566878982e-06, "loss": 0.0067, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 515.4391984939575, "epoch": 0.021033133160048997, "grad_norm": 0.034568388015031815, "kl": 0.0024957656860351562, "learning_rate": 1.767515923566879e-06, "loss": 0.0068, "num_tokens": 10824130.0, "reward": 0.0468750023865141, "reward_std": 0.06221334758447483, "rewards/pure_accuracy_reward_math": 0.04687500116415322, "step": 111 }, { "clip_ratio": 0.00025272632768746917, "epoch": 0.022945236174598906, "grad_norm": 0.03421744704246521, "kl": 0.002499222755432129, "learning_rate": 1.78343949044586e-06, "loss": 0.0068, "step": 112 }, { "clip_ratio": 0.00025192988658773174, "epoch": 0.024857339189148814, "grad_norm": 0.03444651514291763, "kl": 0.002528548240661621, "learning_rate": 1.7993630573248407e-06, "loss": 0.0068, "step": 113 }, { "clip_ratio": 0.0002639102876287325, "epoch": 0.026769442203698725, "grad_norm": 0.033966146409511566, "kl": 0.0025298595428466797, "learning_rate": 1.8152866242038217e-06, "loss": 0.0067, "step": 114 }, { "clip_ratio": 0.0002613060296994263, "epoch": 0.028681545218248634, "grad_norm": 0.03252725675702095, "kl": 0.0025829076766967773, "learning_rate": 1.8312101910828025e-06, "loss": 0.0067, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 511.59126138687134, "epoch": 0.030593648232798542, "grad_norm": 0.04161737114191055, "kl": 0.002777099609375, "learning_rate": 1.8471337579617835e-06, "loss": 0.0084, "num_tokens": 14389817.0, "reward": 0.04464285934227519, "reward_std": 0.0631567623349838, "rewards/pure_accuracy_reward_math": 0.044642858469160274, "step": 116 }, { "clip_ratio": 0.0002685248994680478, "epoch": 0.032505751247348454, "grad_norm": 0.03920653462409973, "kl": 0.002690911293029785, "learning_rate": 1.8630573248407643e-06, "loss": 0.0084, "step": 117 }, { "clip_ratio": 0.00028247613772691693, "epoch": 0.03441785426189836, "grad_norm": 0.037915512919425964, "kl": 0.0026444196701049805, "learning_rate": 1.8789808917197455e-06, "loss": 0.0084, "step": 118 }, { "clip_ratio": 0.00028578577973803476, "epoch": 0.03632995727644827, "grad_norm": 0.03727024793624878, "kl": 0.002573251724243164, "learning_rate": 1.8949044585987264e-06, "loss": 0.0083, "step": 119 }, { "clip_ratio": 0.0003107314861381383, "epoch": 0.03824206029099818, "grad_norm": 0.03734543174505234, "kl": 0.002534151077270508, "learning_rate": 1.9108280254777074e-06, "loss": 0.0083, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 511.96654319763184, "epoch": 0.040154163305548086, "grad_norm": 18.524425506591797, "kl": 0.05213046073913574, "learning_rate": 1.926751592356688e-06, "loss": 0.0067, "num_tokens": 17950273.0, "reward": 0.044642859254963696, "reward_std": 0.0572310917195864, "rewards/pure_accuracy_reward_math": 0.04464285838184878, "step": 121 }, { "clip_ratio": 0.00024330438452579983, "epoch": 0.042066266320097995, "grad_norm": 0.06961806118488312, "kl": 0.0025354623794555664, "learning_rate": 1.942675159235669e-06, "loss": 0.0047, "step": 122 }, { "clip_ratio": 0.00023799908234423128, "epoch": 0.0439783693346479, "grad_norm": 0.038592379540205, "kl": 0.0024437904357910156, "learning_rate": 1.95859872611465e-06, "loss": 0.0047, "step": 123 }, { "clip_ratio": 0.00023513944393016573, "epoch": 0.04589047234919781, "grad_norm": 0.036785636097192764, "kl": 0.002588033676147461, "learning_rate": 1.974522292993631e-06, "loss": 0.0047, "step": 124 }, { "clip_ratio": 0.0002449645085107477, "epoch": 0.04780257536374772, "grad_norm": 0.03537231311202049, "kl": 0.002721548080444336, "learning_rate": 1.9904458598726117e-06, "loss": 0.0047, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 513.618884563446, "epoch": 0.04971467837829763, "grad_norm": 0.03746291249990463, "kl": 0.0026862621307373047, "learning_rate": 2.0063694267515925e-06, "loss": 0.0063, "num_tokens": 21522907.0, "reward": 0.04492187732830644, "reward_std": 0.061436392075847834, "rewards/pure_accuracy_reward_math": 0.04492187616415322, "step": 126 }, { "clip_ratio": 0.0002821582585283977, "epoch": 0.05162678139284754, "grad_norm": 0.036032602190971375, "kl": 0.0027321577072143555, "learning_rate": 2.0222929936305737e-06, "loss": 0.0063, "step": 127 }, { "clip_ratio": 0.0002675421079629814, "epoch": 0.05353888440739745, "grad_norm": 0.03723033517599106, "kl": 0.002848386764526367, "learning_rate": 2.0382165605095544e-06, "loss": 0.0062, "step": 128 }, { "clip_ratio": 0.00030748845301786787, "epoch": 0.05545098742194736, "grad_norm": 0.03697400540113449, "kl": 0.002881765365600586, "learning_rate": 2.054140127388535e-06, "loss": 0.0062, "step": 129 }, { "clip_ratio": 0.0003087153630758621, "epoch": 0.05736309043649727, "grad_norm": 0.03756724298000336, "kl": 0.002836942672729492, "learning_rate": 2.070063694267516e-06, "loss": 0.0062, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 518.1615762710571, "epoch": 0.059275193451047176, "grad_norm": 0.039371710270643234, "kl": 0.00270843505859375, "learning_rate": 2.085987261146497e-06, "loss": 0.0064, "num_tokens": 25111362.0, "reward": 0.05106027016881853, "reward_std": 0.06736206263303757, "rewards/pure_accuracy_reward_math": 0.051060269062872976, "step": 131 }, { "clip_ratio": 0.0002896036380661826, "epoch": 0.061187296465597084, "grad_norm": 0.03780793026089668, "kl": 0.0027250051498413086, "learning_rate": 2.101910828025478e-06, "loss": 0.0064, "step": 132 }, { "clip_ratio": 0.0002853632216783808, "epoch": 0.06309939948014699, "grad_norm": 0.03720535710453987, "kl": 0.0027070045471191406, "learning_rate": 2.1178343949044587e-06, "loss": 0.0064, "step": 133 }, { "clip_ratio": 0.0002896762144928289, "epoch": 0.06501150249469691, "grad_norm": 0.036468133330345154, "kl": 0.0027469396591186523, "learning_rate": 2.13375796178344e-06, "loss": 0.0064, "step": 134 }, { "clip_ratio": 0.0003120482754184195, "epoch": 0.06692360550924681, "grad_norm": 0.03586801886558533, "kl": 0.002748727798461914, "learning_rate": 2.1496815286624207e-06, "loss": 0.0063, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 528.0351796150208, "epoch": 0.06883570852379672, "grad_norm": 0.03092282824218273, "kl": 0.002766728401184082, "learning_rate": 2.1656050955414015e-06, "loss": 0.0056, "num_tokens": 28735680.0, "reward": 0.04017857348662801, "reward_std": 0.05289319949224591, "rewards/pure_accuracy_reward_math": 0.040178572438890114, "step": 136 }, { "clip_ratio": 0.00020221989311153266, "epoch": 0.07074781153834662, "grad_norm": 0.030703941360116005, "kl": 0.0028089284896850586, "learning_rate": 2.1815286624203822e-06, "loss": 0.0056, "step": 137 }, { "clip_ratio": 0.00019867721590571819, "epoch": 0.07265991455289654, "grad_norm": 0.030248478055000305, "kl": 0.0027884244918823242, "learning_rate": 2.1974522292993634e-06, "loss": 0.0056, "step": 138 }, { "clip_ratio": 0.00021304549886735913, "epoch": 0.07457201756744644, "grad_norm": 0.029539138078689575, "kl": 0.002767205238342285, "learning_rate": 2.213375796178344e-06, "loss": 0.0056, "step": 139 }, { "clip_ratio": 0.00021535260020755231, "epoch": 0.07648412058199636, "grad_norm": 0.02955791726708412, "kl": 0.002725839614868164, "learning_rate": 2.229299363057325e-06, "loss": 0.0055, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 525.7212834358215, "epoch": 0.07839622359654626, "grad_norm": 0.060058850795030594, "kl": 0.0032591819763183594, "learning_rate": 2.245222929936306e-06, "loss": 0.0071, "num_tokens": 32349997.0, "reward": 0.048828127211891115, "reward_std": 0.056028691527899355, "rewards/pure_accuracy_reward_math": 0.04882812628056854, "step": 141 }, { "clip_ratio": 0.00022036872547914754, "epoch": 0.08030832661109617, "grad_norm": 0.03533012047410011, "kl": 0.002978205680847168, "learning_rate": 2.261146496815287e-06, "loss": 0.0071, "step": 142 }, { "clip_ratio": 0.0002158615123448726, "epoch": 0.08222042962564609, "grad_norm": 0.029908612370491028, "kl": 0.002841353416442871, "learning_rate": 2.2770700636942677e-06, "loss": 0.0071, "step": 143 }, { "clip_ratio": 0.0002112481060976279, "epoch": 0.08413253264019599, "grad_norm": 0.028638474643230438, "kl": 0.002796173095703125, "learning_rate": 2.2929936305732485e-06, "loss": 0.0071, "step": 144 }, { "clip_ratio": 0.00022246911356660348, "epoch": 0.0860446356547459, "grad_norm": 0.02828238159418106, "kl": 0.0027240514755249023, "learning_rate": 2.3089171974522297e-06, "loss": 0.007, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 534.0318322181702, "epoch": 0.0879567386692958, "grad_norm": 3.060509443283081, "kl": 0.022321224212646484, "learning_rate": 2.3248407643312104e-06, "loss": 0.0062, "num_tokens": 35996663.0, "reward": 0.04436384144355543, "reward_std": 0.06130999844754115, "rewards/pure_accuracy_reward_math": 0.04436384039581753, "step": 146 }, { "clip_ratio": 0.00023404289771633557, "epoch": 0.08986884168384572, "grad_norm": 0.28904739022254944, "kl": 0.004893064498901367, "learning_rate": 2.3407643312101912e-06, "loss": 0.0055, "step": 147 }, { "clip_ratio": 0.00024259101735424338, "epoch": 0.09178094469839562, "grad_norm": 0.03826431185007095, "kl": 0.0027625560760498047, "learning_rate": 2.356687898089172e-06, "loss": 0.0054, "step": 148 }, { "clip_ratio": 0.0002517821457672653, "epoch": 0.09369304771294554, "grad_norm": 0.03572425991296768, "kl": 0.002875208854675293, "learning_rate": 2.372611464968153e-06, "loss": 0.0054, "step": 149 }, { "clip_ratio": 0.00024034848578935453, "epoch": 0.09560515072749544, "grad_norm": 0.036431849002838135, "kl": 0.0031164884567260742, "learning_rate": 2.388535031847134e-06, "loss": 0.0054, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 538.2168254852295, "epoch": 0.09751725374204535, "grad_norm": 0.03362743556499481, "kl": 0.002684354782104492, "learning_rate": 2.4044585987261147e-06, "loss": 0.0027, "num_tokens": 39661060.0, "reward": 0.05133928792201914, "reward_std": 0.06672389659797773, "rewards/pure_accuracy_reward_math": 0.05133928681607358, "step": 151 }, { "clip_ratio": 0.0002668876670099962, "epoch": 0.09942935675659526, "grad_norm": 0.033922772854566574, "kl": 0.002791762351989746, "learning_rate": 2.420382165605096e-06, "loss": 0.0027, "step": 152 }, { "clip_ratio": 0.0002435101382616267, "epoch": 0.10134145977114517, "grad_norm": 0.03526493161916733, "kl": 0.002907991409301758, "learning_rate": 2.4363057324840767e-06, "loss": 0.0027, "step": 153 }, { "clip_ratio": 0.00025345294346834635, "epoch": 0.10325356278569509, "grad_norm": 0.034125424921512604, "kl": 0.0029108524322509766, "learning_rate": 2.4522292993630575e-06, "loss": 0.0027, "step": 154 }, { "clip_ratio": 0.0002378649581942227, "epoch": 0.10516566580024499, "grad_norm": 0.033436987549066544, "kl": 0.002874612808227539, "learning_rate": 2.4681528662420382e-06, "loss": 0.0027, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 541.2424907684326, "epoch": 0.1070777688147949, "grad_norm": 0.031592246145009995, "kl": 0.002785801887512207, "learning_rate": 2.4840764331210194e-06, "loss": 0.005, "num_tokens": 43331425.0, "reward": 0.044363841181620955, "reward_std": 0.05607495462754741, "rewards/pure_accuracy_reward_math": 0.044363840599544346, "step": 156 }, { "clip_ratio": 0.00019312051063025137, "epoch": 0.1089898718293448, "grad_norm": 0.030642936006188393, "kl": 0.0027495622634887695, "learning_rate": 2.5e-06, "loss": 0.005, "step": 157 }, { "clip_ratio": 0.0002267159566713417, "epoch": 0.11090197484389472, "grad_norm": 0.03025418519973755, "kl": 0.002672433853149414, "learning_rate": 2.515923566878981e-06, "loss": 0.0049, "step": 158 }, { "clip_ratio": 0.00023296605036193796, "epoch": 0.11281407785844462, "grad_norm": 0.03024701401591301, "kl": 0.0026074647903442383, "learning_rate": 2.531847133757962e-06, "loss": 0.0049, "step": 159 }, { "clip_ratio": 0.00024551542321660236, "epoch": 0.11472618087299453, "grad_norm": 0.03065372072160244, "kl": 0.0025725364685058594, "learning_rate": 2.547770700636943e-06, "loss": 0.0049, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 531.590705871582, "epoch": 0.11663828388754444, "grad_norm": 0.03286377340555191, "kl": 0.002618551254272461, "learning_rate": 2.5636942675159237e-06, "loss": 0.0032, "num_tokens": 46966882.0, "reward": 0.0401785735739395, "reward_std": 0.05864621384534985, "rewards/pure_accuracy_reward_math": 0.04017857258440927, "step": 161 }, { "clip_ratio": 0.000249601399104904, "epoch": 0.11855038690209435, "grad_norm": 0.03168044239282608, "kl": 0.0025817155838012695, "learning_rate": 2.5796178343949045e-06, "loss": 0.0032, "step": 162 }, { "clip_ratio": 0.0002426054838338132, "epoch": 0.12046248991664425, "grad_norm": 0.03161012753844261, "kl": 0.0025763511657714844, "learning_rate": 2.5955414012738857e-06, "loss": 0.0032, "step": 163 }, { "clip_ratio": 0.0002400714004124893, "epoch": 0.12237459293119417, "grad_norm": 0.031408168375492096, "kl": 0.002588987350463867, "learning_rate": 2.6114649681528665e-06, "loss": 0.0032, "step": 164 }, { "clip_ratio": 0.00024877328468164706, "epoch": 0.12428669594574408, "grad_norm": 0.030564049258828163, "kl": 0.0026369094848632812, "learning_rate": 2.6273885350318472e-06, "loss": 0.0031, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 525.5862393379211, "epoch": 0.12619879896029398, "grad_norm": 0.03767310827970505, "kl": 0.0026383399963378906, "learning_rate": 2.6433121019108284e-06, "loss": 0.0062, "num_tokens": 50581511.0, "reward": 0.04966518055880442, "reward_std": 0.06985319027444348, "rewards/pure_accuracy_reward_math": 0.04966517968568951, "step": 166 }, { "clip_ratio": 0.0002872111982696879, "epoch": 0.1281109019748439, "grad_norm": 0.03578091412782669, "kl": 0.0027115345001220703, "learning_rate": 2.659235668789809e-06, "loss": 0.0062, "step": 167 }, { "clip_ratio": 0.0002957127134664006, "epoch": 0.13002300498939381, "grad_norm": 0.03471493721008301, "kl": 0.0028066635131835938, "learning_rate": 2.67515923566879e-06, "loss": 0.0062, "step": 168 }, { "clip_ratio": 0.0003112256898702981, "epoch": 0.1319351080039437, "grad_norm": 0.035491716116666794, "kl": 0.0028966665267944336, "learning_rate": 2.6910828025477707e-06, "loss": 0.0062, "step": 169 }, { "clip_ratio": 0.0003354581235726073, "epoch": 0.13384721101849362, "grad_norm": 0.03574714809656143, "kl": 0.0029289722442626953, "learning_rate": 2.707006369426752e-06, "loss": 0.0061, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 527.8571667671204, "epoch": 0.13575931403304353, "grad_norm": 0.03648287057876587, "kl": 0.0030307769775390625, "learning_rate": 2.7229299363057327e-06, "loss": 0.0061, "num_tokens": 54209407.0, "reward": 0.05161830587894656, "reward_std": 0.06465821276651695, "rewards/pure_accuracy_reward_math": 0.05161830494762398, "step": 171 }, { "clip_ratio": 0.0002587431810354701, "epoch": 0.13767141704759345, "grad_norm": 0.03615426644682884, "kl": 0.0030341148376464844, "learning_rate": 2.7388535031847135e-06, "loss": 0.0061, "step": 172 }, { "clip_ratio": 0.0002548517101104153, "epoch": 0.13958352006214333, "grad_norm": 0.03565597161650658, "kl": 0.002932310104370117, "learning_rate": 2.7547770700636942e-06, "loss": 0.0061, "step": 173 }, { "clip_ratio": 0.00027394448250106507, "epoch": 0.14149562307669325, "grad_norm": 0.035612594336271286, "kl": 0.0029175281524658203, "learning_rate": 2.7707006369426754e-06, "loss": 0.0061, "step": 174 }, { "clip_ratio": 0.00027776476230201297, "epoch": 0.14340772609124317, "grad_norm": 0.036588992923498154, "kl": 0.002942800521850586, "learning_rate": 2.786624203821656e-06, "loss": 0.006, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 529.2430500984192, "epoch": 0.14531982910579308, "grad_norm": 0.03312006592750549, "kl": 0.0028772354125976562, "learning_rate": 2.802547770700637e-06, "loss": 0.0056, "num_tokens": 57839070.0, "reward": 0.04854910931317136, "reward_std": 0.05881887051509693, "rewards/pure_accuracy_reward_math": 0.048549108614679426, "step": 176 }, { "clip_ratio": 0.00022063881021949783, "epoch": 0.147231932120343, "grad_norm": 0.0327099934220314, "kl": 0.002942681312561035, "learning_rate": 2.818471337579618e-06, "loss": 0.0056, "step": 177 }, { "clip_ratio": 0.00021944492368675128, "epoch": 0.14914403513489288, "grad_norm": 0.03261202201247215, "kl": 0.002986431121826172, "learning_rate": 2.834394904458599e-06, "loss": 0.0056, "step": 178 }, { "clip_ratio": 0.0002127133307396889, "epoch": 0.1510561381494428, "grad_norm": 0.03220335766673088, "kl": 0.002970457077026367, "learning_rate": 2.8503184713375797e-06, "loss": 0.0056, "step": 179 }, { "clip_ratio": 0.0001991192841614975, "epoch": 0.1529682411639927, "grad_norm": 0.03179548308253288, "kl": 0.0029560327529907227, "learning_rate": 2.8662420382165605e-06, "loss": 0.0056, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 522.1275358200073, "epoch": 0.15488034417854263, "grad_norm": 0.030966561287641525, "kl": 0.0029218196868896484, "learning_rate": 2.8821656050955417e-06, "loss": 0.0048, "num_tokens": 61445599.0, "reward": 0.04352678795112297, "reward_std": 0.05598862626357004, "rewards/pure_accuracy_reward_math": 0.043526787078008056, "step": 181 }, { "clip_ratio": 0.00021554413663693595, "epoch": 0.15679244719309252, "grad_norm": 0.030419446527957916, "kl": 0.0029065608978271484, "learning_rate": 2.8980891719745225e-06, "loss": 0.0048, "step": 182 }, { "clip_ratio": 0.0002025423377176594, "epoch": 0.15870455020764243, "grad_norm": 0.030062729492783546, "kl": 0.0028995275497436523, "learning_rate": 2.9140127388535032e-06, "loss": 0.0048, "step": 183 }, { "clip_ratio": 0.00023064417456453157, "epoch": 0.16061665322219235, "grad_norm": 0.029301613569259644, "kl": 0.002888321876525879, "learning_rate": 2.9299363057324844e-06, "loss": 0.0048, "step": 184 }, { "clip_ratio": 0.0002338091023261768, "epoch": 0.16252875623674226, "grad_norm": 0.029127391055226326, "kl": 0.0028772354125976562, "learning_rate": 2.945859872611465e-06, "loss": 0.0047, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 533.0510892868042, "epoch": 0.16444085925129218, "grad_norm": 0.036479271948337555, "kl": 0.002923727035522461, "learning_rate": 2.961783439490446e-06, "loss": 0.0063, "num_tokens": 65094142.0, "reward": 0.05022321717115119, "reward_std": 0.06538890500087291, "rewards/pure_accuracy_reward_math": 0.050223215424921364, "step": 186 }, { "clip_ratio": 0.00026048495129771254, "epoch": 0.16635296226584206, "grad_norm": 0.036232370883226395, "kl": 0.0029561519622802734, "learning_rate": 2.9777070063694267e-06, "loss": 0.0063, "step": 187 }, { "clip_ratio": 0.0002226464382033555, "epoch": 0.16826506528039198, "grad_norm": 0.03523917496204376, "kl": 0.003048419952392578, "learning_rate": 2.993630573248408e-06, "loss": 0.0063, "step": 188 }, { "clip_ratio": 0.0002362887615845466, "epoch": 0.1701771682949419, "grad_norm": 0.03477315977215767, "kl": 0.003025054931640625, "learning_rate": 3.0095541401273887e-06, "loss": 0.0062, "step": 189 }, { "clip_ratio": 0.00023160997727700305, "epoch": 0.1720892713094918, "grad_norm": 0.03342609107494354, "kl": 0.0030221939086914062, "learning_rate": 3.0254777070063695e-06, "loss": 0.0062, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 529.7776494026184, "epoch": 0.1740013743240417, "grad_norm": 0.03668810427188873, "kl": 0.0029752254486083984, "learning_rate": 3.0414012738853503e-06, "loss": 0.0066, "num_tokens": 68728277.0, "reward": 0.04994419863214716, "reward_std": 0.06135626137256622, "rewards/pure_accuracy_reward_math": 0.04994419787544757, "step": 191 }, { "clip_ratio": 0.0002391185845453947, "epoch": 0.1759134773385916, "grad_norm": 0.035618141293525696, "kl": 0.0029642581939697266, "learning_rate": 3.0573248407643314e-06, "loss": 0.0066, "step": 192 }, { "clip_ratio": 0.00024402707180115613, "epoch": 0.17782558035314153, "grad_norm": 0.032588809728622437, "kl": 0.002981424331665039, "learning_rate": 3.0732484076433122e-06, "loss": 0.0066, "step": 193 }, { "clip_ratio": 0.0002546731577126593, "epoch": 0.17973768336769144, "grad_norm": 0.0323190875351429, "kl": 0.0030133724212646484, "learning_rate": 3.089171974522293e-06, "loss": 0.0066, "step": 194 }, { "clip_ratio": 0.0002784079450179888, "epoch": 0.18164978638224133, "grad_norm": 0.03181909769773483, "kl": 0.002997159957885742, "learning_rate": 3.105095541401274e-06, "loss": 0.0065, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 536.7226796150208, "epoch": 0.18356188939679124, "grad_norm": 0.034835390746593475, "kl": 0.003053426742553711, "learning_rate": 3.121019108280255e-06, "loss": 0.0053, "num_tokens": 72383923.0, "reward": 0.04352678789291531, "reward_std": 0.06164911447558552, "rewards/pure_accuracy_reward_math": 0.043526787078008056, "step": 196 }, { "clip_ratio": 0.00022759345233680506, "epoch": 0.18547399241134116, "grad_norm": 0.03316686674952507, "kl": 0.003064870834350586, "learning_rate": 3.1369426751592357e-06, "loss": 0.0053, "step": 197 }, { "clip_ratio": 0.00024183520912401946, "epoch": 0.18738609542589107, "grad_norm": 0.0329214446246624, "kl": 0.003040313720703125, "learning_rate": 3.1528662420382165e-06, "loss": 0.0053, "step": 198 }, { "clip_ratio": 0.0002539973459079192, "epoch": 0.189298198440441, "grad_norm": 0.031231405213475227, "kl": 0.0030624866485595703, "learning_rate": 3.1687898089171977e-06, "loss": 0.0052, "step": 199 }, { "clip_ratio": 0.0002776768195076329, "epoch": 0.19121030145499088, "grad_norm": 0.031124714761972427, "kl": 0.0030813217163085938, "learning_rate": 3.1847133757961785e-06, "loss": 0.0052, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 522.4048767089844, "epoch": 0.1931224044695408, "grad_norm": 0.03386811539530754, "kl": 0.003122568130493164, "learning_rate": 3.2006369426751592e-06, "loss": 0.0052, "num_tokens": 75984438.0, "reward": 0.04771205616998486, "reward_std": 0.06319682823959738, "rewards/pure_accuracy_reward_math": 0.04771205471479334, "step": 201 }, { "clip_ratio": 0.00024403837670661233, "epoch": 0.1950345074840907, "grad_norm": 0.03252818062901497, "kl": 0.003181934356689453, "learning_rate": 3.2165605095541404e-06, "loss": 0.0052, "step": 202 }, { "clip_ratio": 0.0002548924753114079, "epoch": 0.19694661049864062, "grad_norm": 0.03233063966035843, "kl": 0.0032570362091064453, "learning_rate": 3.232484076433121e-06, "loss": 0.0052, "step": 203 }, { "clip_ratio": 0.0003048134046252926, "epoch": 0.1988587135131905, "grad_norm": 0.032457806169986725, "kl": 0.0032837390899658203, "learning_rate": 3.248407643312102e-06, "loss": 0.0051, "step": 204 }, { "clip_ratio": 0.0003034327668842707, "epoch": 0.20077081652774043, "grad_norm": 0.03239855542778969, "kl": 0.0032906532287597656, "learning_rate": 3.2643312101910827e-06, "loss": 0.0051, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 509.6657609939575, "epoch": 0.20268291954229034, "grad_norm": 0.0370325967669487, "kl": 0.0033235549926757812, "learning_rate": 3.280254777070064e-06, "loss": 0.0075, "num_tokens": 79548556.0, "reward": 0.052176341792801395, "reward_std": 0.06135626166360453, "rewards/pure_accuracy_reward_math": 0.0521763407450635, "step": 206 }, { "clip_ratio": 0.00026798775621728055, "epoch": 0.20459502255684026, "grad_norm": 0.03616202250123024, "kl": 0.0032608509063720703, "learning_rate": 3.2961783439490447e-06, "loss": 0.0075, "step": 207 }, { "clip_ratio": 0.0002652346859690624, "epoch": 0.20650712557139017, "grad_norm": 0.03537038713693619, "kl": 0.0032129287719726562, "learning_rate": 3.3121019108280255e-06, "loss": 0.0074, "step": 208 }, { "clip_ratio": 0.00026950107780976396, "epoch": 0.20841922858594006, "grad_norm": 0.03502323478460312, "kl": 0.0031485557556152344, "learning_rate": 3.3280254777070063e-06, "loss": 0.0074, "step": 209 }, { "clip_ratio": 0.00025725525091502277, "epoch": 0.21033133160048997, "grad_norm": 0.03380832076072693, "kl": 0.0031027793884277344, "learning_rate": 3.3439490445859875e-06, "loss": 0.0074, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 511.36637449264526, "epoch": 0.2122434346150399, "grad_norm": 1.5961617231369019, "kl": 0.007061004638671875, "learning_rate": 3.3598726114649682e-06, "loss": 0.0062, "num_tokens": 83116585.0, "reward": 0.05078125247382559, "reward_std": 0.06568795558996499, "rewards/pure_accuracy_reward_math": 0.05078125119325705, "step": 211 }, { "clip_ratio": 0.0002800602194383828, "epoch": 0.2141555376295898, "grad_norm": 0.04389820247888565, "kl": 0.004379749298095703, "learning_rate": 3.375796178343949e-06, "loss": 0.0061, "step": 212 }, { "clip_ratio": 0.0002803218378630845, "epoch": 0.2160676406441397, "grad_norm": 0.04022788628935814, "kl": 0.0043125152587890625, "learning_rate": 3.39171974522293e-06, "loss": 0.0061, "step": 213 }, { "clip_ratio": 0.0002704095267631601, "epoch": 0.2179797436586896, "grad_norm": 0.041697319597005844, "kl": 0.004408597946166992, "learning_rate": 3.407643312101911e-06, "loss": 0.0061, "step": 214 }, { "clip_ratio": 0.0003097587871820906, "epoch": 0.21989184667323952, "grad_norm": 0.04933662340044975, "kl": 0.004500150680541992, "learning_rate": 3.4235668789808917e-06, "loss": 0.006, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 520.8122463226318, "epoch": 0.22180394968778944, "grad_norm": 0.03384365886449814, "kl": 0.0032858848571777344, "learning_rate": 3.4394904458598725e-06, "loss": 0.0069, "num_tokens": 86710660.0, "reward": 0.041015627270098776, "reward_std": 0.05345123494043946, "rewards/pure_accuracy_reward_math": 0.041015626047737896, "step": 216 }, { "clip_ratio": 0.00022953049290208583, "epoch": 0.22371605270233935, "grad_norm": 0.03259577602148056, "kl": 0.003277301788330078, "learning_rate": 3.4554140127388537e-06, "loss": 0.0069, "step": 217 }, { "clip_ratio": 0.00024143920052210888, "epoch": 0.22562815571688924, "grad_norm": 0.031054330989718437, "kl": 0.0031991004943847656, "learning_rate": 3.4713375796178345e-06, "loss": 0.0069, "step": 218 }, { "clip_ratio": 0.0002552373456978785, "epoch": 0.22754025873143915, "grad_norm": 0.031755171716213226, "kl": 0.003099679946899414, "learning_rate": 3.4872611464968152e-06, "loss": 0.0069, "step": 219 }, { "clip_ratio": 0.0002681780064790473, "epoch": 0.22945236174598907, "grad_norm": 0.031188273802399635, "kl": 0.003045320510864258, "learning_rate": 3.5031847133757964e-06, "loss": 0.0068, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 516.6825013160706, "epoch": 0.23136446476053898, "grad_norm": 0.03775335103273392, "kl": 0.003011941909790039, "learning_rate": 3.5191082802547772e-06, "loss": 0.0063, "num_tokens": 90291858.0, "reward": 0.04715401996509172, "reward_std": 0.06113734241807833, "rewards/pure_accuracy_reward_math": 0.04715401915018447, "step": 221 }, { "clip_ratio": 0.0002582234144483664, "epoch": 0.23327656777508887, "grad_norm": 0.03602875769138336, "kl": 0.002973794937133789, "learning_rate": 3.535031847133758e-06, "loss": 0.0063, "step": 222 }, { "clip_ratio": 0.0002264754746761355, "epoch": 0.2351886707896388, "grad_norm": 0.03449266403913498, "kl": 0.002980470657348633, "learning_rate": 3.5509554140127388e-06, "loss": 0.0063, "step": 223 }, { "clip_ratio": 0.00025999376231311544, "epoch": 0.2371007738041887, "grad_norm": 0.0329199843108654, "kl": 0.002971053123474121, "learning_rate": 3.56687898089172e-06, "loss": 0.0062, "step": 224 }, { "clip_ratio": 0.000296181439978227, "epoch": 0.23901287681873862, "grad_norm": 0.033409375697374344, "kl": 0.0030214786529541016, "learning_rate": 3.5828025477707007e-06, "loss": 0.0062, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 530.4132494926453, "epoch": 0.2409249798332885, "grad_norm": 0.03549947962164879, "kl": 0.004068970680236816, "learning_rate": 3.5987261146496815e-06, "loss": 0.0083, "num_tokens": 93927655.0, "reward": 0.039899555675219744, "reward_std": 0.05890519870445132, "rewards/pure_accuracy_reward_math": 0.03989955480210483, "step": 226 }, { "clip_ratio": 0.00024125495940552355, "epoch": 0.24283708284783842, "grad_norm": 0.033262889832258224, "kl": 0.0040683746337890625, "learning_rate": 3.6146496815286623e-06, "loss": 0.0083, "step": 227 }, { "clip_ratio": 0.00024547909194438944, "epoch": 0.24474918586238834, "grad_norm": 0.03303634375333786, "kl": 0.004040956497192383, "learning_rate": 3.6305732484076435e-06, "loss": 0.0083, "step": 228 }, { "clip_ratio": 0.0002773670349256463, "epoch": 0.24666128887693825, "grad_norm": 0.03389015421271324, "kl": 0.00404667854309082, "learning_rate": 3.6464968152866242e-06, "loss": 0.0083, "step": 229 }, { "clip_ratio": 0.000270649900215858, "epoch": 0.24857339189148817, "grad_norm": 0.035877879709005356, "kl": 0.0038802623748779297, "learning_rate": 3.662420382165605e-06, "loss": 0.0082, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 527.9338984489441, "epoch": 0.25048549490603805, "grad_norm": 0.032850634306669235, "kl": 0.0030744075775146484, "learning_rate": 3.678343949044586e-06, "loss": 0.0064, "num_tokens": 97554714.0, "reward": 0.04743303795112297, "reward_std": 0.061522720265202224, "rewards/pure_accuracy_reward_math": 0.047433037019800395, "step": 231 }, { "clip_ratio": 0.00024459305313939694, "epoch": 0.25239759792058797, "grad_norm": 0.03185749799013138, "kl": 0.00302886962890625, "learning_rate": 3.694267515923567e-06, "loss": 0.0064, "step": 232 }, { "clip_ratio": 0.00025332184179660544, "epoch": 0.2543097009351379, "grad_norm": 0.03135737404227257, "kl": 0.002967357635498047, "learning_rate": 3.7101910828025477e-06, "loss": 0.0064, "step": 233 }, { "clip_ratio": 0.0002861271710798974, "epoch": 0.2562218039496878, "grad_norm": 0.030725885182619095, "kl": 0.0029573440551757812, "learning_rate": 3.7261146496815285e-06, "loss": 0.0064, "step": 234 }, { "clip_ratio": 0.0002841630366674508, "epoch": 0.2581339069642377, "grad_norm": 0.030670415610074997, "kl": 0.002954721450805664, "learning_rate": 3.7420382165605097e-06, "loss": 0.0063, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 531.4467325210571, "epoch": 0.26004600997878763, "grad_norm": 0.03534790128469467, "kl": 0.003011465072631836, "learning_rate": 3.757961783439491e-06, "loss": 0.0041, "num_tokens": 101193143.0, "reward": 0.04631696638534777, "reward_std": 0.0601075982558541, "rewards/pure_accuracy_reward_math": 0.046316965454025194, "step": 236 }, { "clip_ratio": 0.00022260297603793333, "epoch": 0.2619581129933375, "grad_norm": 0.03438499942421913, "kl": 0.0030508041381835938, "learning_rate": 3.773885350318472e-06, "loss": 0.0041, "step": 237 }, { "clip_ratio": 0.00024397839513312647, "epoch": 0.2638702160078874, "grad_norm": 0.032804593443870544, "kl": 0.0030994415283203125, "learning_rate": 3.789808917197453e-06, "loss": 0.0041, "step": 238 }, { "clip_ratio": 0.0002508007286223801, "epoch": 0.2657823190224373, "grad_norm": 0.03402625024318695, "kl": 0.0031244754791259766, "learning_rate": 3.8057324840764336e-06, "loss": 0.004, "step": 239 }, { "clip_ratio": 0.00025242620182552855, "epoch": 0.26769442203698723, "grad_norm": 0.03291900083422661, "kl": 0.003187417984008789, "learning_rate": 3.821656050955415e-06, "loss": 0.004, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 536.9913763999939, "epoch": 0.26960652505153715, "grad_norm": 0.033690325915813446, "kl": 0.003125429153442383, "learning_rate": 3.837579617834396e-06, "loss": 0.0089, "num_tokens": 104860392.0, "reward": 0.05496652069268748, "reward_std": 0.07028483308386058, "rewards/pure_accuracy_reward_math": 0.05496651929570362, "step": 241 }, { "clip_ratio": 0.0002661047830088137, "epoch": 0.27151862806608706, "grad_norm": 0.03227640688419342, "kl": 0.0031244754791259766, "learning_rate": 3.853503184713376e-06, "loss": 0.009, "step": 242 }, { "clip_ratio": 0.00027503777869242185, "epoch": 0.273430731080637, "grad_norm": 0.03168897703289986, "kl": 0.003157377243041992, "learning_rate": 3.869426751592357e-06, "loss": 0.0089, "step": 243 }, { "clip_ratio": 0.00029653536631712996, "epoch": 0.2753428340951869, "grad_norm": 0.03222280368208885, "kl": 0.0031862258911132812, "learning_rate": 3.885350318471338e-06, "loss": 0.0089, "step": 244 }, { "clip_ratio": 0.0003081631187455969, "epoch": 0.2772549371097368, "grad_norm": 0.03176514804363251, "kl": 0.0032341480255126953, "learning_rate": 3.901273885350319e-06, "loss": 0.0088, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 513.5616898536682, "epoch": 0.27916704012428667, "grad_norm": 0.037929438054561615, "kl": 0.0035233497619628906, "learning_rate": 3.9171974522293e-06, "loss": 0.0075, "num_tokens": 108427949.0, "reward": 0.0544084852153901, "reward_std": 0.0659469406818971, "rewards/pure_accuracy_reward_math": 0.054408483527367935, "step": 246 }, { "clip_ratio": 0.0002633177949178389, "epoch": 0.2810791431388366, "grad_norm": 0.03561301901936531, "kl": 0.0035467147827148438, "learning_rate": 3.933121019108281e-06, "loss": 0.0075, "step": 247 }, { "clip_ratio": 0.0003005996498472996, "epoch": 0.2829912461533865, "grad_norm": 0.035342708230018616, "kl": 0.003578662872314453, "learning_rate": 3.949044585987262e-06, "loss": 0.0075, "step": 248 }, { "clip_ratio": 0.0003206986277177748, "epoch": 0.2849033491679364, "grad_norm": 0.03841444477438927, "kl": 0.0036001205444335938, "learning_rate": 3.964968152866243e-06, "loss": 0.0075, "step": 249 }, { "clip_ratio": 0.00030761192169848073, "epoch": 0.28681545218248633, "grad_norm": 0.03515273705124855, "kl": 0.003624439239501953, "learning_rate": 3.980891719745223e-06, "loss": 0.0074, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 504.73858308792114, "epoch": 0.28872755519703625, "grad_norm": 0.04030496999621391, "kl": 0.003686189651489258, "learning_rate": 3.996815286624204e-06, "loss": 0.0081, "num_tokens": 111975532.0, "reward": 0.0647321458964143, "reward_std": 0.07547981187235564, "rewards/pure_accuracy_reward_math": 0.06473214420839213, "step": 251 }, { "clip_ratio": 0.00031485489739679906, "epoch": 0.29063965821158616, "grad_norm": 0.04058763012290001, "kl": 0.003683328628540039, "learning_rate": 4.012738853503185e-06, "loss": 0.0081, "step": 252 }, { "clip_ratio": 0.0003329372994471669, "epoch": 0.2925517612261361, "grad_norm": 0.039948880672454834, "kl": 0.003644227981567383, "learning_rate": 4.0286624203821666e-06, "loss": 0.0081, "step": 253 }, { "clip_ratio": 0.00031999613804600813, "epoch": 0.294463864240686, "grad_norm": 0.038771189749240875, "kl": 0.003670930862426758, "learning_rate": 4.044585987261147e-06, "loss": 0.008, "step": 254 }, { "clip_ratio": 0.0003391868065136805, "epoch": 0.29637596725523585, "grad_norm": 0.03820183873176575, "kl": 0.0036439895629882812, "learning_rate": 4.060509554140128e-06, "loss": 0.0079, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 507.980770111084, "epoch": 0.29828807026978577, "grad_norm": 0.0373733825981617, "kl": 0.003556966781616211, "learning_rate": 4.076433121019109e-06, "loss": 0.0047, "num_tokens": 115530899.0, "reward": 0.05217634199652821, "reward_std": 0.06624599173665047, "rewards/pure_accuracy_reward_math": 0.05217634030850604, "step": 256 }, { "clip_ratio": 0.0002444871162765594, "epoch": 0.3002001732843357, "grad_norm": 0.03655192255973816, "kl": 0.003623485565185547, "learning_rate": 4.09235668789809e-06, "loss": 0.0047, "step": 257 }, { "clip_ratio": 0.0002544127338524049, "epoch": 0.3021122762988856, "grad_norm": 0.035692181438207626, "kl": 0.003640890121459961, "learning_rate": 4.10828025477707e-06, "loss": 0.0046, "step": 258 }, { "clip_ratio": 0.0002950017506577751, "epoch": 0.3040243793134355, "grad_norm": 0.03550735488533974, "kl": 0.0036733150482177734, "learning_rate": 4.124203821656051e-06, "loss": 0.0046, "step": 259 }, { "clip_ratio": 0.0002894491571510116, "epoch": 0.3059364823279854, "grad_norm": 0.03471330925822258, "kl": 0.00366973876953125, "learning_rate": 4.140127388535032e-06, "loss": 0.0045, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 517.2921543121338, "epoch": 0.30784858534253534, "grad_norm": 0.6632264852523804, "kl": 0.007929325103759766, "learning_rate": 4.156050955414014e-06, "loss": 0.0041, "num_tokens": 119123970.0, "reward": 0.046875002153683454, "reward_std": 0.06358220643596724, "rewards/pure_accuracy_reward_math": 0.04687500122236088, "step": 261 }, { "clip_ratio": 0.00027907352409783925, "epoch": 0.30976068835708526, "grad_norm": 0.03735750913619995, "kl": 0.0038709640502929688, "learning_rate": 4.171974522292994e-06, "loss": 0.004, "step": 262 }, { "clip_ratio": 0.000277261100677606, "epoch": 0.31167279137163517, "grad_norm": 0.03806532546877861, "kl": 0.004002094268798828, "learning_rate": 4.187898089171975e-06, "loss": 0.004, "step": 263 }, { "clip_ratio": 0.00026404397090118437, "epoch": 0.31358489438618503, "grad_norm": 0.03587675094604492, "kl": 0.00407719612121582, "learning_rate": 4.203821656050956e-06, "loss": 0.0039, "step": 264 }, { "clip_ratio": 0.0003132741497324787, "epoch": 0.31549699740073495, "grad_norm": 0.03516336902976036, "kl": 0.004099607467651367, "learning_rate": 4.219745222929937e-06, "loss": 0.0039, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 528.596004486084, "epoch": 0.31740910041528486, "grad_norm": 0.038204919546842575, "kl": 0.0035691261291503906, "learning_rate": 4.2356687898089174e-06, "loss": 0.006, "num_tokens": 122758966.0, "reward": 0.054966520925518125, "reward_std": 0.06770737608894706, "rewards/pure_accuracy_reward_math": 0.05496651912108064, "step": 266 }, { "clip_ratio": 0.00026713599251593223, "epoch": 0.3193212034298348, "grad_norm": 0.03804617002606392, "kl": 0.003623485565185547, "learning_rate": 4.251592356687898e-06, "loss": 0.006, "step": 267 }, { "clip_ratio": 0.00027288361513910786, "epoch": 0.3212333064443847, "grad_norm": 0.03765474632382393, "kl": 0.003659486770629883, "learning_rate": 4.26751592356688e-06, "loss": 0.006, "step": 268 }, { "clip_ratio": 0.0002754389876429286, "epoch": 0.3231454094589346, "grad_norm": 0.037356842309236526, "kl": 0.0036840438842773438, "learning_rate": 4.283439490445861e-06, "loss": 0.0059, "step": 269 }, { "clip_ratio": 0.0002686067065269526, "epoch": 0.3250575124734845, "grad_norm": 0.03656876087188721, "kl": 0.003694295883178711, "learning_rate": 4.299363057324841e-06, "loss": 0.0059, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 543.1746897697449, "epoch": 0.32696961548803444, "grad_norm": 0.03417838364839554, "kl": 0.0035529136657714844, "learning_rate": 4.315286624203822e-06, "loss": 0.0076, "num_tokens": 126443800.0, "reward": 0.04882812697906047, "reward_std": 0.05766273388871923, "rewards/pure_accuracy_reward_math": 0.04882812616415322, "step": 271 }, { "clip_ratio": 0.0002270729566475893, "epoch": 0.32888171850258435, "grad_norm": 0.03328363224864006, "kl": 0.0035278797149658203, "learning_rate": 4.331210191082803e-06, "loss": 0.0076, "step": 272 }, { "clip_ratio": 0.0002132950650661769, "epoch": 0.3307938215171342, "grad_norm": 0.03230879083275795, "kl": 0.0034902095794677734, "learning_rate": 4.347133757961784e-06, "loss": 0.0076, "step": 273 }, { "clip_ratio": 0.0002096330554195447, "epoch": 0.3327059245316841, "grad_norm": 0.031601596623659134, "kl": 0.003440380096435547, "learning_rate": 4.3630573248407645e-06, "loss": 0.0076, "step": 274 }, { "clip_ratio": 0.00027223577194490645, "epoch": 0.33461802754623404, "grad_norm": 0.033090248703956604, "kl": 0.003412485122680664, "learning_rate": 4.378980891719746e-06, "loss": 0.0075, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 535.8942775726318, "epoch": 0.33653013056078396, "grad_norm": 0.03229549527168274, "kl": 0.003350973129272461, "learning_rate": 4.394904458598727e-06, "loss": 0.0057, "num_tokens": 130099677.0, "reward": 0.04966518087894656, "reward_std": 0.060705700190737844, "rewards/pure_accuracy_reward_math": 0.049665179773001, "step": 276 }, { "clip_ratio": 0.00025271691475836633, "epoch": 0.3384422335753339, "grad_norm": 0.03214692696928978, "kl": 0.0033435821533203125, "learning_rate": 4.410828025477708e-06, "loss": 0.0057, "step": 277 }, { "clip_ratio": 0.00023837689644778948, "epoch": 0.3403543365898838, "grad_norm": 0.03055053949356079, "kl": 0.003403902053833008, "learning_rate": 4.426751592356688e-06, "loss": 0.0057, "step": 278 }, { "clip_ratio": 0.0002586998209039848, "epoch": 0.3422664396044337, "grad_norm": 0.030119990929961205, "kl": 0.003477334976196289, "learning_rate": 4.442675159235669e-06, "loss": 0.0057, "step": 279 }, { "clip_ratio": 0.00026621688834893575, "epoch": 0.3441785426189836, "grad_norm": 0.030735207721590996, "kl": 0.0035724639892578125, "learning_rate": 4.45859872611465e-06, "loss": 0.0056, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 542.7466769218445, "epoch": 0.34609064563353353, "grad_norm": 0.033374350517988205, "kl": 0.003545999526977539, "learning_rate": 4.474522292993631e-06, "loss": 0.0036, "num_tokens": 133773381.0, "reward": 0.051339288300368935, "reward_std": 0.06345581240020692, "rewards/pure_accuracy_reward_math": 0.05133928690338507, "step": 281 }, { "clip_ratio": 0.0002734534241994879, "epoch": 0.3480027486480834, "grad_norm": 0.03312847390770912, "kl": 0.0035567283630371094, "learning_rate": 4.490445859872612e-06, "loss": 0.0036, "step": 282 }, { "clip_ratio": 0.00022532319422907676, "epoch": 0.3499148516626333, "grad_norm": 0.03281605243682861, "kl": 0.0035707950592041016, "learning_rate": 4.506369426751593e-06, "loss": 0.0035, "step": 283 }, { "clip_ratio": 0.0002544033526419298, "epoch": 0.3518269546771832, "grad_norm": 0.032299675047397614, "kl": 0.003595113754272461, "learning_rate": 4.522292993630574e-06, "loss": 0.0035, "step": 284 }, { "clip_ratio": 0.00024219880805276262, "epoch": 0.35373905769173314, "grad_norm": 0.031959276646375656, "kl": 0.0035622119903564453, "learning_rate": 4.538216560509555e-06, "loss": 0.0035, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 536.3122510910034, "epoch": 0.35565116070628305, "grad_norm": 0.035966720432043076, "kl": 0.003755331039428711, "learning_rate": 4.554140127388535e-06, "loss": 0.0076, "num_tokens": 137425032.0, "reward": 0.05524553809664212, "reward_std": 0.07191267621237785, "rewards/pure_accuracy_reward_math": 0.055245536990696564, "step": 286 }, { "clip_ratio": 0.00029696975889237365, "epoch": 0.35756326372083297, "grad_norm": 0.03485076501965523, "kl": 0.0036923885345458984, "learning_rate": 4.570063694267516e-06, "loss": 0.0076, "step": 287 }, { "clip_ratio": 0.0003252405772968814, "epoch": 0.3594753667353829, "grad_norm": 0.03465472534298897, "kl": 0.003720998764038086, "learning_rate": 4.585987261146497e-06, "loss": 0.0076, "step": 288 }, { "clip_ratio": 0.0003269365803362234, "epoch": 0.3613874697499328, "grad_norm": 0.033384956419467926, "kl": 0.003762483596801758, "learning_rate": 4.601910828025479e-06, "loss": 0.0075, "step": 289 }, { "clip_ratio": 0.0003269619904813226, "epoch": 0.36329957276448266, "grad_norm": 0.03343256562948227, "kl": 0.0037889480590820312, "learning_rate": 4.617834394904459e-06, "loss": 0.0075, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 533.6155371665955, "epoch": 0.3652116757790326, "grad_norm": 0.035127099603414536, "kl": 0.0037310123443603516, "learning_rate": 4.63375796178344e-06, "loss": 0.0084, "num_tokens": 141070278.0, "reward": 0.05580357421422377, "reward_std": 0.06861072615720332, "rewards/pure_accuracy_reward_math": 0.05580357281723991, "step": 291 }, { "clip_ratio": 0.00026876470258230256, "epoch": 0.3671237787935825, "grad_norm": 0.034193847328424454, "kl": 0.0037539005279541016, "learning_rate": 4.649681528662421e-06, "loss": 0.0084, "step": 292 }, { "clip_ratio": 0.00024497293054537295, "epoch": 0.3690358818081324, "grad_norm": 0.033800724893808365, "kl": 0.0037734508514404297, "learning_rate": 4.665605095541402e-06, "loss": 0.0084, "step": 293 }, { "clip_ratio": 0.0002538224067620831, "epoch": 0.3709479848226823, "grad_norm": 0.03376767784357071, "kl": 0.003782033920288086, "learning_rate": 4.6815286624203824e-06, "loss": 0.0083, "step": 294 }, { "clip_ratio": 0.00027697558522277177, "epoch": 0.37286008783723223, "grad_norm": 0.03229675441980362, "kl": 0.003787994384765625, "learning_rate": 4.697452229299363e-06, "loss": 0.0083, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 532.5739660263062, "epoch": 0.37477219085178215, "grad_norm": 0.035769619047641754, "kl": 0.0037794113159179688, "learning_rate": 4.713375796178344e-06, "loss": 0.0057, "num_tokens": 144715023.0, "reward": 0.05915178812574595, "reward_std": 0.07096926274243742, "rewards/pure_accuracy_reward_math": 0.059151787019800395, "step": 296 }, { "clip_ratio": 0.00030428163654505624, "epoch": 0.37668429386633207, "grad_norm": 0.035648081451654434, "kl": 0.003717660903930664, "learning_rate": 4.729299363057326e-06, "loss": 0.0057, "step": 297 }, { "clip_ratio": 0.00029741515106707084, "epoch": 0.378596396880882, "grad_norm": 0.03551783785223961, "kl": 0.0036716461181640625, "learning_rate": 4.745222929936306e-06, "loss": 0.0057, "step": 298 }, { "clip_ratio": 0.0003008591765478741, "epoch": 0.38050849989543184, "grad_norm": 0.03452136367559433, "kl": 0.0036542415618896484, "learning_rate": 4.761146496815287e-06, "loss": 0.0056, "step": 299 }, { "clip_ratio": 0.00032588979291858777, "epoch": 0.38242060290998175, "grad_norm": 0.03325437009334564, "kl": 0.003694295883178711, "learning_rate": 4.777070063694268e-06, "loss": 0.0056, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 519.2416524887085, "epoch": 0.38433270592453167, "grad_norm": 0.04327908158302307, "kl": 0.004815816879272461, "learning_rate": 4.792993630573249e-06, "loss": 0.0041, "num_tokens": 148307505.0, "reward": 0.05329241341678426, "reward_std": 0.061954362492542714, "rewards/pure_accuracy_reward_math": 0.0532924123108387, "step": 301 }, { "clip_ratio": 0.0002521659018839273, "epoch": 0.3862448089390816, "grad_norm": 0.041329506784677505, "kl": 0.004758596420288086, "learning_rate": 4.8089171974522295e-06, "loss": 0.0041, "step": 302 }, { "clip_ratio": 0.0002661041191913682, "epoch": 0.3881569119536315, "grad_norm": 0.03914090245962143, "kl": 0.0045318603515625, "learning_rate": 4.82484076433121e-06, "loss": 0.0041, "step": 303 }, { "clip_ratio": 0.0002647961523507547, "epoch": 0.3900690149681814, "grad_norm": 0.0363956093788147, "kl": 0.0043642520904541016, "learning_rate": 4.840764331210192e-06, "loss": 0.004, "step": 304 }, { "clip_ratio": 0.00030025097066754824, "epoch": 0.39198111798273133, "grad_norm": 0.05623022839426994, "kl": 0.00441288948059082, "learning_rate": 4.856687898089173e-06, "loss": 0.004, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 529.0496897697449, "epoch": 0.39389322099728125, "grad_norm": 0.03662995249032974, "kl": 0.0038270950317382812, "learning_rate": 4.872611464968153e-06, "loss": 0.0077, "num_tokens": 151936939.0, "reward": 0.0560825914144516, "reward_std": 0.061781705473549664, "rewards/pure_accuracy_reward_math": 0.05608259071595967, "step": 306 }, { "clip_ratio": 0.00025576306325092446, "epoch": 0.39580532401183116, "grad_norm": 0.03553188219666481, "kl": 0.00376129150390625, "learning_rate": 4.888535031847134e-06, "loss": 0.0076, "step": 307 }, { "clip_ratio": 0.00027371336784653977, "epoch": 0.397717427026381, "grad_norm": 0.035399794578552246, "kl": 0.0036725997924804688, "learning_rate": 4.904458598726115e-06, "loss": 0.0076, "step": 308 }, { "clip_ratio": 0.0002955471370569285, "epoch": 0.39962953004093094, "grad_norm": 0.03487352281808853, "kl": 0.003664731979370117, "learning_rate": 4.920382165605096e-06, "loss": 0.0076, "step": 309 }, { "clip_ratio": 0.00030850259520320833, "epoch": 0.40154163305548085, "grad_norm": 0.03433185815811157, "kl": 0.003676176071166992, "learning_rate": 4.9363057324840765e-06, "loss": 0.0075, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 524.8312191963196, "epoch": 0.40345373607003077, "grad_norm": 0.03824182599782944, "kl": 0.003762483596801758, "learning_rate": 4.952229299363058e-06, "loss": 0.0062, "num_tokens": 155550782.0, "reward": 0.05496652075089514, "reward_std": 0.0689961050520651, "rewards/pure_accuracy_reward_math": 0.0549665194703266, "step": 311 }, { "clip_ratio": 0.0002548059320588436, "epoch": 0.4053658390845807, "grad_norm": 0.036028265953063965, "kl": 0.003760099411010742, "learning_rate": 4.968152866242039e-06, "loss": 0.0062, "step": 312 }, { "clip_ratio": 0.00029642158040132927, "epoch": 0.4072779420991306, "grad_norm": 0.03537724167108536, "kl": 0.0038378238677978516, "learning_rate": 4.98407643312102e-06, "loss": 0.0062, "step": 313 }, { "clip_ratio": 0.00030970463706125884, "epoch": 0.4091900451136805, "grad_norm": 0.03521754965186119, "kl": 0.003871440887451172, "learning_rate": 5e-06, "loss": 0.0062, "step": 314 }, { "clip_ratio": 0.000315766970174991, "epoch": 0.4111021481282304, "grad_norm": 0.034070126712322235, "kl": 0.0037851333618164062, "learning_rate": 4.999992129526286e-06, "loss": 0.0061, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 528.3727917671204, "epoch": 0.41301425114278034, "grad_norm": 0.12440560013055801, "kl": 0.005699872970581055, "learning_rate": 4.999968518154701e-06, "loss": 0.0041, "num_tokens": 159174918.0, "reward": 0.05050223457510583, "reward_std": 0.06435916194459423, "rewards/pure_accuracy_reward_math": 0.050502233527367935, "step": 316 }, { "clip_ratio": 0.0002532021657657424, "epoch": 0.4149263541573302, "grad_norm": 0.05440036952495575, "kl": 0.005144357681274414, "learning_rate": 4.99992916603391e-06, "loss": 0.004, "step": 317 }, { "clip_ratio": 0.00025051761485883617, "epoch": 0.4168384571718801, "grad_norm": 0.051424141973257065, "kl": 0.005103111267089844, "learning_rate": 4.999874073411688e-06, "loss": 0.004, "step": 318 }, { "clip_ratio": 0.0002561948363677402, "epoch": 0.41875056018643003, "grad_norm": 0.06930891424417496, "kl": 0.004969120025634766, "learning_rate": 4.9998032406349205e-06, "loss": 0.0039, "step": 319 }, { "clip_ratio": 0.0002573228107394243, "epoch": 0.42066266320097995, "grad_norm": 0.06900722533464432, "kl": 0.004853248596191406, "learning_rate": 4.9997166681495975e-06, "loss": 0.0039, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 517.6638069152832, "epoch": 0.42257476621552986, "grad_norm": 0.03829098492860794, "kl": 0.0038361549377441406, "learning_rate": 4.999614356500811e-06, "loss": 0.0072, "num_tokens": 162764497.0, "reward": 0.06110491356230341, "reward_std": 0.07393209857400507, "rewards/pure_accuracy_reward_math": 0.06110491222352721, "step": 321 }, { "clip_ratio": 0.0002886460991931017, "epoch": 0.4244868692300798, "grad_norm": 0.03761793673038483, "kl": 0.0038406848907470703, "learning_rate": 4.999496306332755e-06, "loss": 0.0072, "step": 322 }, { "clip_ratio": 0.00029219654425105546, "epoch": 0.4263989722446297, "grad_norm": 0.03714153915643692, "kl": 0.003914356231689453, "learning_rate": 4.999362518388718e-06, "loss": 0.0071, "step": 323 }, { "clip_ratio": 0.0003099845329757045, "epoch": 0.4283110752591796, "grad_norm": 0.03610815480351448, "kl": 0.0039288997650146484, "learning_rate": 4.99921299351108e-06, "loss": 0.0071, "step": 324 }, { "clip_ratio": 0.0003404705674370234, "epoch": 0.4302231782737295, "grad_norm": 0.03599926084280014, "kl": 0.003935813903808594, "learning_rate": 4.999047732641305e-06, "loss": 0.007, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 510.4832811355591, "epoch": 0.4321352812882794, "grad_norm": 0.04078551381826401, "kl": 0.003900766372680664, "learning_rate": 4.998866736819938e-06, "loss": 0.0063, "num_tokens": 166324161.0, "reward": 0.059151788242161274, "reward_std": 0.07354671962093562, "rewards/pure_accuracy_reward_math": 0.05915178725263104, "step": 326 }, { "clip_ratio": 0.00026936357801332633, "epoch": 0.4340473843028293, "grad_norm": 0.03855260834097862, "kl": 0.003957986831665039, "learning_rate": 4.998670007186599e-06, "loss": 0.0063, "step": 327 }, { "clip_ratio": 0.0002843770836875592, "epoch": 0.4359594873173792, "grad_norm": 0.03724536672234535, "kl": 0.0039751529693603516, "learning_rate": 4.998457544979971e-06, "loss": 0.0062, "step": 328 }, { "clip_ratio": 0.0003156123698886404, "epoch": 0.43787159033192913, "grad_norm": 0.03662634268403053, "kl": 0.0040798187255859375, "learning_rate": 4.998229351537797e-06, "loss": 0.0062, "step": 329 }, { "clip_ratio": 0.0003457550078564964, "epoch": 0.43978369334647904, "grad_norm": 0.03598077967762947, "kl": 0.004061460494995117, "learning_rate": 4.997985428296869e-06, "loss": 0.0061, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 528.4207811355591, "epoch": 0.44169579636102896, "grad_norm": 0.08678283542394638, "kl": 0.008905410766601562, "learning_rate": 4.997725776793021e-06, "loss": 0.0058, "num_tokens": 169950285.0, "reward": 0.05636160948779434, "reward_std": 0.07148723275167868, "rewards/pure_accuracy_reward_math": 0.05636160867288709, "step": 331 }, { "clip_ratio": 0.00029096677934603576, "epoch": 0.4436078993755789, "grad_norm": 0.09512893110513687, "kl": 0.007820606231689453, "learning_rate": 4.997450398661117e-06, "loss": 0.0058, "step": 332 }, { "clip_ratio": 0.00029938158724007735, "epoch": 0.4455200023901288, "grad_norm": 0.24316293001174927, "kl": 0.007544517517089844, "learning_rate": 4.9971592956350405e-06, "loss": 0.0057, "step": 333 }, { "clip_ratio": 0.00032061134919558754, "epoch": 0.4474321054046787, "grad_norm": 0.07169396430253983, "kl": 0.006528377532958984, "learning_rate": 4.996852469547688e-06, "loss": 0.0057, "step": 334 }, { "clip_ratio": 0.00034978831735088534, "epoch": 0.44934420841922856, "grad_norm": 0.06073050945997238, "kl": 0.0060198307037353516, "learning_rate": 4.996529922330954e-06, "loss": 0.0056, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 535.8259167671204, "epoch": 0.4512563114337785, "grad_norm": 0.034031759947538376, "kl": 0.0037636756896972656, "learning_rate": 4.996191656015715e-06, "loss": 0.0063, "num_tokens": 173606605.0, "reward": 0.05273437770665623, "reward_std": 0.061655311612412333, "rewards/pure_accuracy_reward_math": 0.05273437625146471, "step": 336 }, { "clip_ratio": 0.0002175188884052659, "epoch": 0.4531684144483284, "grad_norm": 0.03333257883787155, "kl": 0.0038194656372070312, "learning_rate": 4.995837672731827e-06, "loss": 0.0063, "step": 337 }, { "clip_ratio": 0.00022021491247414815, "epoch": 0.4550805174628783, "grad_norm": 0.032678041607141495, "kl": 0.0038101673126220703, "learning_rate": 4.9954679747081e-06, "loss": 0.0063, "step": 338 }, { "clip_ratio": 0.000264580338352971, "epoch": 0.4569926204774282, "grad_norm": 0.032030362635850906, "kl": 0.0037910938262939453, "learning_rate": 4.995082564272295e-06, "loss": 0.0062, "step": 339 }, { "clip_ratio": 0.00027159255438391483, "epoch": 0.45890472349197814, "grad_norm": 0.031298909336328506, "kl": 0.0038001537322998047, "learning_rate": 4.994681443851102e-06, "loss": 0.0062, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 527.6174931526184, "epoch": 0.46081682650652805, "grad_norm": 0.04015278443694115, "kl": 0.004010200500488281, "learning_rate": 4.994264615970126e-06, "loss": 0.0062, "num_tokens": 177226454.0, "reward": 0.056361609895247966, "reward_std": 0.06633232033345848, "rewards/pure_accuracy_reward_math": 0.05636160867288709, "step": 341 }, { "clip_ratio": 0.00026669438159387937, "epoch": 0.46272892952107797, "grad_norm": 0.03813392296433449, "kl": 0.0039997100830078125, "learning_rate": 4.993832083253874e-06, "loss": 0.0062, "step": 342 }, { "clip_ratio": 0.0003048689098363866, "epoch": 0.46464103253562783, "grad_norm": 0.03776548057794571, "kl": 0.004065752029418945, "learning_rate": 4.993383848425736e-06, "loss": 0.0061, "step": 343 }, { "clip_ratio": 0.0003051352168768062, "epoch": 0.46655313555017774, "grad_norm": 0.03955227509140968, "kl": 0.0041925907135009766, "learning_rate": 4.992919914307969e-06, "loss": 0.0061, "step": 344 }, { "clip_ratio": 0.00030118576887616655, "epoch": 0.46846523856472766, "grad_norm": 0.036648593842983246, "kl": 0.00420832633972168, "learning_rate": 4.992440283821676e-06, "loss": 0.006, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 527.5131411552429, "epoch": 0.4703773415792776, "grad_norm": 13.381791114807129, "kl": 0.1310877799987793, "learning_rate": 4.991944959986793e-06, "loss": 0.018, "num_tokens": 180852413.0, "reward": 0.06138393163564615, "reward_std": 0.07144096971023828, "rewards/pure_accuracy_reward_math": 0.061383930064039305, "step": 346 }, { "clip_ratio": 0.00030088673440786806, "epoch": 0.4722894445938275, "grad_norm": 1.359532356262207, "kl": 0.01866316795349121, "learning_rate": 4.991433945922068e-06, "loss": 0.0135, "step": 347 }, { "clip_ratio": 0.0003527746957843192, "epoch": 0.4742015476083774, "grad_norm": 0.050763800740242004, "kl": 0.005962371826171875, "learning_rate": 4.9909072448450386e-06, "loss": 0.013, "step": 348 }, { "clip_ratio": 0.0003426602560239189, "epoch": 0.4761136506229273, "grad_norm": 0.0476795993745327, "kl": 0.006250858306884766, "learning_rate": 4.990364860072014e-06, "loss": 0.013, "step": 349 }, { "clip_ratio": 0.00033057811066328213, "epoch": 0.47802575363747724, "grad_norm": 0.04783082380890846, "kl": 0.0066144466400146484, "learning_rate": 4.989806795018054e-06, "loss": 0.013, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 522.409900188446, "epoch": 0.47993785665202715, "grad_norm": 0.036505699157714844, "kl": 0.0040128231048583984, "learning_rate": 4.989233053196948e-06, "loss": 0.0024, "num_tokens": 184454394.0, "reward": 0.04771205602446571, "reward_std": 0.05920424917712808, "rewards/pure_accuracy_reward_math": 0.047712054976727813, "step": 351 }, { "clip_ratio": 0.00023261837060317703, "epoch": 0.481849959666577, "grad_norm": 0.037214819341897964, "kl": 0.004108428955078125, "learning_rate": 4.988643638221193e-06, "loss": 0.0024, "step": 352 }, { "clip_ratio": 0.0002573013600795093, "epoch": 0.4837620626811269, "grad_norm": 0.03702811896800995, "kl": 0.004202127456665039, "learning_rate": 4.9880385538019665e-06, "loss": 0.0024, "step": 353 }, { "clip_ratio": 0.0002758479482167786, "epoch": 0.48567416569567684, "grad_norm": 0.03838437795639038, "kl": 0.004250764846801758, "learning_rate": 4.987417803749112e-06, "loss": 0.0023, "step": 354 }, { "clip_ratio": 0.00024451872050690326, "epoch": 0.48758626871022676, "grad_norm": 0.035314518958330154, "kl": 0.00424647331237793, "learning_rate": 4.986781391971105e-06, "loss": 0.0023, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 522.8789310455322, "epoch": 0.48949837172477667, "grad_norm": 0.038822874426841736, "kl": 0.004703998565673828, "learning_rate": 4.986129322475037e-06, "loss": 0.006, "num_tokens": 188061244.0, "reward": 0.05887277075089514, "reward_std": 0.0715272988891229, "rewards/pure_accuracy_reward_math": 0.058872769062872976, "step": 356 }, { "clip_ratio": 0.0003040988601696881, "epoch": 0.4914104747393266, "grad_norm": 0.03750370442867279, "kl": 0.004604816436767578, "learning_rate": 4.985461599366583e-06, "loss": 0.006, "step": 357 }, { "clip_ratio": 0.0003311016299676339, "epoch": 0.4933225777538765, "grad_norm": 0.03735021874308586, "kl": 0.004613637924194336, "learning_rate": 4.984778226849983e-06, "loss": 0.0059, "step": 358 }, { "clip_ratio": 0.00031427563314423423, "epoch": 0.4952346807684264, "grad_norm": 0.037090424448251724, "kl": 0.00463104248046875, "learning_rate": 4.984079209228007e-06, "loss": 0.0059, "step": 359 }, { "clip_ratio": 0.0003153682554284387, "epoch": 0.49714678378297633, "grad_norm": 0.03496375307440758, "kl": 0.004604816436767578, "learning_rate": 4.983364550901936e-06, "loss": 0.0058, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 523.5016980171204, "epoch": 0.4990588867975262, "grad_norm": 1978.1619873046875, "kl": 5.663617134094238, "learning_rate": 4.982634256371529e-06, "loss": 0.2313, "num_tokens": 191670522.0, "reward": 0.05943080599536188, "reward_std": 0.06242607004242018, "rewards/pure_accuracy_reward_math": 0.059430805064039305, "step": 361 }, { "clip_ratio": 0.0003008291907349303, "epoch": 0.5009709898120761, "grad_norm": 6.705481052398682, "kl": 0.07292413711547852, "learning_rate": 4.981888330234998e-06, "loss": 0.0076, "step": 362 }, { "clip_ratio": 0.00038137949604788446, "epoch": 0.502883092826626, "grad_norm": 0.4056338369846344, "kl": 0.013193130493164062, "learning_rate": 4.981126777188976e-06, "loss": 0.0053, "step": 363 }, { "clip_ratio": 0.00039371675529764616, "epoch": 0.5047951958411759, "grad_norm": 0.40032151341438293, "kl": 0.009969472885131836, "learning_rate": 4.980349602028489e-06, "loss": 0.0052, "step": 364 }, { "clip_ratio": 0.0003270253398568457, "epoch": 0.5067072988557259, "grad_norm": 0.08224909007549286, "kl": 0.010345458984375, "learning_rate": 4.979556809646928e-06, "loss": 0.0051, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 534.6082878112793, "epoch": 0.5086194018702758, "grad_norm": 0.036373648792505264, "kl": 0.003941535949707031, "learning_rate": 4.978748405036014e-06, "loss": 0.0071, "num_tokens": 195317270.0, "reward": 0.05552455584984273, "reward_std": 0.06775363947963342, "rewards/pure_accuracy_reward_math": 0.05552455486031249, "step": 366 }, { "clip_ratio": 0.00027453447256675645, "epoch": 0.5105315048848257, "grad_norm": 0.03525104746222496, "kl": 0.0039365291595458984, "learning_rate": 4.977924393285767e-06, "loss": 0.0072, "step": 367 }, { "clip_ratio": 0.0003015769660521528, "epoch": 0.5124436078993756, "grad_norm": 0.03737647458910942, "kl": 0.0039522647857666016, "learning_rate": 4.977084779584479e-06, "loss": 0.0071, "step": 368 }, { "clip_ratio": 0.0002889172319555655, "epoch": 0.5143557109139255, "grad_norm": 0.03506501764059067, "kl": 0.0039052963256835938, "learning_rate": 4.976229569218676e-06, "loss": 0.0071, "step": 369 }, { "clip_ratio": 0.0002910121094146234, "epoch": 0.5162678139284754, "grad_norm": 0.03558839485049248, "kl": 0.003898143768310547, "learning_rate": 4.975358767573085e-06, "loss": 0.007, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 523.1417660713196, "epoch": 0.5181799169430253, "grad_norm": 9.403284072875977, "kl": 0.0705575942993164, "learning_rate": 4.974472380130605e-06, "loss": 0.0078, "num_tokens": 198926094.0, "reward": 0.06305803885334171, "reward_std": 0.0737193762906827, "rewards/pure_accuracy_reward_math": 0.06305803733994253, "step": 371 }, { "clip_ratio": 0.00028168898450076085, "epoch": 0.5200920199575753, "grad_norm": 0.10174906253814697, "kl": 0.005540609359741211, "learning_rate": 4.9735704124722665e-06, "loss": 0.0053, "step": 372 }, { "clip_ratio": 0.00026055807722968893, "epoch": 0.5220041229721252, "grad_norm": 0.036394841969013214, "kl": 0.004784584045410156, "learning_rate": 4.9726528702771985e-06, "loss": 0.0052, "step": 373 }, { "clip_ratio": 0.0003154287535949152, "epoch": 0.523916225986675, "grad_norm": 0.03702308237552643, "kl": 0.004788875579833984, "learning_rate": 4.971719759322596e-06, "loss": 0.0052, "step": 374 }, { "clip_ratio": 0.000301387064496339, "epoch": 0.5258283290012249, "grad_norm": 0.03516030311584473, "kl": 0.004770994186401367, "learning_rate": 4.97077108548368e-06, "loss": 0.0051, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 520.7994132041931, "epoch": 0.5277404320157748, "grad_norm": 0.04183080792427063, "kl": 0.006031513214111328, "learning_rate": 4.969806854733658e-06, "loss": 0.0091, "num_tokens": 202522419.0, "reward": 0.0638950924621895, "reward_std": 0.07990403153235093, "rewards/pure_accuracy_reward_math": 0.0638950903667137, "step": 376 }, { "clip_ratio": 0.00032519385399609746, "epoch": 0.5296525350303247, "grad_norm": 0.0407201424241066, "kl": 0.005979061126708984, "learning_rate": 4.968827073143694e-06, "loss": 0.0091, "step": 377 }, { "clip_ratio": 0.00031682528469900717, "epoch": 0.5315646380448746, "grad_norm": 0.040043942630290985, "kl": 0.005922555923461914, "learning_rate": 4.967831746882863e-06, "loss": 0.0091, "step": 378 }, { "clip_ratio": 0.00033513708405052967, "epoch": 0.5334767410594246, "grad_norm": 0.03983679041266441, "kl": 0.005841970443725586, "learning_rate": 4.966820882218118e-06, "loss": 0.009, "step": 379 }, { "clip_ratio": 0.00034104771594911654, "epoch": 0.5353888440739745, "grad_norm": 0.03983955457806587, "kl": 0.005755186080932617, "learning_rate": 4.965794485514245e-06, "loss": 0.0089, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 520.5067186355591, "epoch": 0.5373009470885244, "grad_norm": 0.034092146903276443, "kl": 0.0043926239013671875, "learning_rate": 4.964752563233826e-06, "loss": 0.008, "num_tokens": 206122403.0, "reward": 0.055803573748562485, "reward_std": 0.05980854749213904, "rewards/pure_accuracy_reward_math": 0.05580357275903225, "step": 381 }, { "clip_ratio": 0.00025422318708478997, "epoch": 0.5392130501030743, "grad_norm": 0.03263320028781891, "kl": 0.0043218135833740234, "learning_rate": 4.9636951219372e-06, "loss": 0.008, "step": 382 }, { "clip_ratio": 0.00025885856206286917, "epoch": 0.5411251531176242, "grad_norm": 0.032487623393535614, "kl": 0.004242420196533203, "learning_rate": 4.962622168282416e-06, "loss": 0.008, "step": 383 }, { "clip_ratio": 0.0002850476581102157, "epoch": 0.5430372561321741, "grad_norm": 0.032427769154310226, "kl": 0.004185199737548828, "learning_rate": 4.961533709025199e-06, "loss": 0.0079, "step": 384 }, { "clip_ratio": 0.00029774147623129466, "epoch": 0.544949359146724, "grad_norm": 0.031092027202248573, "kl": 0.004144430160522461, "learning_rate": 4.960429751018901e-06, "loss": 0.0079, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 522.9258050918579, "epoch": 0.546861462161274, "grad_norm": 0.6398438811302185, "kl": 0.013398170471191406, "learning_rate": 4.959310301214458e-06, "loss": 0.0048, "num_tokens": 209727833.0, "reward": 0.06668527127476409, "reward_std": 0.07586519059259444, "rewards/pure_accuracy_reward_math": 0.06668526941211894, "step": 386 }, { "clip_ratio": 0.0002956847454242961, "epoch": 0.5487735651758239, "grad_norm": 0.09603609144687653, "kl": 0.006535530090332031, "learning_rate": 4.958175366660352e-06, "loss": 0.0045, "step": 387 }, { "clip_ratio": 0.00032585520455086225, "epoch": 0.5506856681903738, "grad_norm": 0.042251698672771454, "kl": 0.004881858825683594, "learning_rate": 4.95702495450256e-06, "loss": 0.0045, "step": 388 }, { "clip_ratio": 0.00030688931195754776, "epoch": 0.5525977712049237, "grad_norm": 0.03725959733128548, "kl": 0.00462651252746582, "learning_rate": 4.955859071984512e-06, "loss": 0.0044, "step": 389 }, { "clip_ratio": 0.0002833517196449975, "epoch": 0.5545098742194736, "grad_norm": 0.03557269275188446, "kl": 0.004591941833496094, "learning_rate": 4.954677726447049e-06, "loss": 0.0044, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 529.50141954422, "epoch": 0.5564219772340235, "grad_norm": 0.03767434135079384, "kl": 0.0041730403900146484, "learning_rate": 4.953480925328369e-06, "loss": 0.0053, "num_tokens": 213359594.0, "reward": 0.05636160998255946, "reward_std": 0.06873711966909468, "rewards/pure_accuracy_reward_math": 0.05636160829453729, "step": 391 }, { "clip_ratio": 0.0002943199858691514, "epoch": 0.5583340802485733, "grad_norm": 0.03691519424319267, "kl": 0.004199981689453125, "learning_rate": 4.952268676163984e-06, "loss": 0.0053, "step": 392 }, { "clip_ratio": 0.00028674039270981666, "epoch": 0.5602461832631233, "grad_norm": 0.036044176667928696, "kl": 0.004216432571411133, "learning_rate": 4.951040986586676e-06, "loss": 0.0053, "step": 393 }, { "clip_ratio": 0.0003071572371595721, "epoch": 0.5621582862776732, "grad_norm": 0.0358373187482357, "kl": 0.004226207733154297, "learning_rate": 4.949797864326442e-06, "loss": 0.0053, "step": 394 }, { "clip_ratio": 0.000308680556543095, "epoch": 0.5640703892922231, "grad_norm": 0.0356404110789299, "kl": 0.004263877868652344, "learning_rate": 4.9485393172104525e-06, "loss": 0.0052, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 528.1506924629211, "epoch": 0.565982492306773, "grad_norm": 0.03425108641386032, "kl": 0.004232645034790039, "learning_rate": 4.947265353162997e-06, "loss": 0.0047, "num_tokens": 216984490.0, "reward": 0.05831473466241732, "reward_std": 0.06912249873857945, "rewards/pure_accuracy_reward_math": 0.058314733556471765, "step": 396 }, { "clip_ratio": 0.0002443079777663115, "epoch": 0.5678945953213229, "grad_norm": 0.03406741842627525, "kl": 0.004246950149536133, "learning_rate": 4.945975980205435e-06, "loss": 0.0046, "step": 397 }, { "clip_ratio": 0.00025582832455484095, "epoch": 0.5698066983358728, "grad_norm": 0.033892109990119934, "kl": 0.004239320755004883, "learning_rate": 4.944671206456148e-06, "loss": 0.0046, "step": 398 }, { "clip_ratio": 0.0002801110364885062, "epoch": 0.5717188013504227, "grad_norm": 0.03294463828206062, "kl": 0.0042018890380859375, "learning_rate": 4.943351040130485e-06, "loss": 0.0046, "step": 399 }, { "clip_ratio": 0.00030015600407296006, "epoch": 0.5736309043649727, "grad_norm": 0.03228214010596275, "kl": 0.004125118255615234, "learning_rate": 4.942015489540715e-06, "loss": 0.0045, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 527.8225684165955, "epoch": 0.5755430073795226, "grad_norm": 0.037567272782325745, "kl": 0.005152702331542969, "learning_rate": 4.94066456309597e-06, "loss": 0.0071, "num_tokens": 220604938.0, "reward": 0.06166294886497781, "reward_std": 0.07311507751001045, "rewards/pure_accuracy_reward_math": 0.06166294764261693, "step": 401 }, { "clip_ratio": 0.0002694410874823916, "epoch": 0.5774551103940725, "grad_norm": 0.036373041570186615, "kl": 0.005210161209106445, "learning_rate": 4.939298269302194e-06, "loss": 0.0071, "step": 402 }, { "clip_ratio": 0.0002891406058438406, "epoch": 0.5793672134086224, "grad_norm": 0.03582580015063286, "kl": 0.0052187442779541016, "learning_rate": 4.9379166167620915e-06, "loss": 0.007, "step": 403 }, { "clip_ratio": 0.00030127688086167836, "epoch": 0.5812793164231723, "grad_norm": 0.035248763859272, "kl": 0.005229949951171875, "learning_rate": 4.93651961417507e-06, "loss": 0.007, "step": 404 }, { "clip_ratio": 0.00031262176707969047, "epoch": 0.5831914194377222, "grad_norm": 0.03461577743291855, "kl": 0.00519251823425293, "learning_rate": 4.9351072703371885e-06, "loss": 0.0069, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 534.0067219734192, "epoch": 0.5851035224522722, "grad_norm": 0.0363302007317543, "kl": 0.004278659820556641, "learning_rate": 4.933679594141096e-06, "loss": 0.0041, "num_tokens": 224253906.0, "reward": 0.06222098533180542, "reward_std": 0.07462272536940873, "rewards/pure_accuracy_reward_math": 0.06222098329453729, "step": 406 }, { "clip_ratio": 0.0002887690876320903, "epoch": 0.5870156254668221, "grad_norm": 0.03538454696536064, "kl": 0.004297971725463867, "learning_rate": 4.932236594575986e-06, "loss": 0.0041, "step": 407 }, { "clip_ratio": 0.00029836769689950415, "epoch": 0.588927728481372, "grad_norm": 0.03521309420466423, "kl": 0.004305362701416016, "learning_rate": 4.9307782807275304e-06, "loss": 0.0041, "step": 408 }, { "clip_ratio": 0.0003077857980144927, "epoch": 0.5908398314959219, "grad_norm": 0.03468110039830208, "kl": 0.004298210144042969, "learning_rate": 4.929304661777823e-06, "loss": 0.0041, "step": 409 }, { "clip_ratio": 0.00030735837987094783, "epoch": 0.5927519345104717, "grad_norm": 0.03504593297839165, "kl": 0.004282474517822266, "learning_rate": 4.9278157470053305e-06, "loss": 0.004, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 531.0987973213196, "epoch": 0.5946640375250216, "grad_norm": 0.03893313929438591, "kl": 0.004411935806274414, "learning_rate": 4.926311545784823e-06, "loss": 0.0081, "num_tokens": 227887088.0, "reward": 0.06138393160654232, "reward_std": 0.07560620526783168, "rewards/pure_accuracy_reward_math": 0.061383930034935474, "step": 411 }, { "clip_ratio": 0.0003015478255292692, "epoch": 0.5965761405395715, "grad_norm": 0.03745520859956741, "kl": 0.004415750503540039, "learning_rate": 4.924792067587321e-06, "loss": 0.0081, "step": 412 }, { "clip_ratio": 0.00033068407248038056, "epoch": 0.5984882435541214, "grad_norm": 0.037219781428575516, "kl": 0.004396915435791016, "learning_rate": 4.923257321980036e-06, "loss": 0.0081, "step": 413 }, { "clip_ratio": 0.00037280973344877566, "epoch": 0.6004003465686714, "grad_norm": 0.03754372149705887, "kl": 0.0044384002685546875, "learning_rate": 4.9217073186263075e-06, "loss": 0.0081, "step": 414 }, { "clip_ratio": 0.0003646712993372603, "epoch": 0.6023124495832213, "grad_norm": 0.03602118790149689, "kl": 0.004477262496948242, "learning_rate": 4.920142067285544e-06, "loss": 0.008, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 508.44282722473145, "epoch": 0.6042245525977712, "grad_norm": 0.039943527430295944, "kl": 0.004469871520996094, "learning_rate": 4.9185615778131614e-06, "loss": 0.0078, "num_tokens": 231443183.0, "reward": 0.0705915211874526, "reward_std": 0.07968511217040941, "rewards/pure_accuracy_reward_math": 0.07059151926659979, "step": 416 }, { "clip_ratio": 0.00031770144798315414, "epoch": 0.6061366556123211, "grad_norm": 0.039055656641721725, "kl": 0.004549264907836914, "learning_rate": 4.916965860160521e-06, "loss": 0.0078, "step": 417 }, { "clip_ratio": 0.00030108455553090607, "epoch": 0.608048758626871, "grad_norm": 0.03719799593091011, "kl": 0.004551410675048828, "learning_rate": 4.915354924374864e-06, "loss": 0.0078, "step": 418 }, { "clip_ratio": 0.0003208976940527464, "epoch": 0.6099608616414209, "grad_norm": 0.03626833111047745, "kl": 0.004576444625854492, "learning_rate": 4.913728780599254e-06, "loss": 0.0077, "step": 419 }, { "clip_ratio": 0.00030395733068644404, "epoch": 0.6118729646559709, "grad_norm": 0.035672470927238464, "kl": 0.004616498947143555, "learning_rate": 4.912087439072508e-06, "loss": 0.0077, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 519.3401436805725, "epoch": 0.6137850676705208, "grad_norm": 0.035979609936475754, "kl": 0.004936695098876953, "learning_rate": 4.9104309101291345e-06, "loss": 0.008, "num_tokens": 235040570.0, "reward": 0.0558035739522893, "reward_std": 0.06414644059259444, "rewards/pure_accuracy_reward_math": 0.05580357278813608, "step": 421 }, { "clip_ratio": 0.0002606460908509689, "epoch": 0.6156971706850707, "grad_norm": 0.034824173897504807, "kl": 0.004873991012573242, "learning_rate": 4.908759204199268e-06, "loss": 0.008, "step": 422 }, { "clip_ratio": 0.0002711625579081556, "epoch": 0.6176092736996206, "grad_norm": 0.034011878073215485, "kl": 0.00480341911315918, "learning_rate": 4.907072331808602e-06, "loss": 0.008, "step": 423 }, { "clip_ratio": 0.0002719364555332504, "epoch": 0.6195213767141705, "grad_norm": 0.0330798402428627, "kl": 0.00470733642578125, "learning_rate": 4.905370303578324e-06, "loss": 0.0079, "step": 424 }, { "clip_ratio": 0.0003164075427548596, "epoch": 0.6214334797287204, "grad_norm": 0.03356935828924179, "kl": 0.004645586013793945, "learning_rate": 4.903653130225049e-06, "loss": 0.0079, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 524.4051547050476, "epoch": 0.6233455827432703, "grad_norm": 0.037987031042575836, "kl": 0.004395723342895508, "learning_rate": 4.901920822560753e-06, "loss": 0.004, "num_tokens": 238650146.0, "reward": 0.056082592491293326, "reward_std": 0.06946781190345064, "rewards/pure_accuracy_reward_math": 0.05608259033760987, "step": 426 }, { "clip_ratio": 0.0002752577877913609, "epoch": 0.6252576857578201, "grad_norm": 0.03711739555001259, "kl": 0.0043413639068603516, "learning_rate": 4.900173391492698e-06, "loss": 0.004, "step": 427 }, { "clip_ratio": 0.0002780464546390249, "epoch": 0.6271697887723701, "grad_norm": 0.03583519160747528, "kl": 0.004349231719970703, "learning_rate": 4.898410848023374e-06, "loss": 0.004, "step": 428 }, { "clip_ratio": 0.0002759867400072835, "epoch": 0.62908189178692, "grad_norm": 0.035115331411361694, "kl": 0.0043909549713134766, "learning_rate": 4.896633203250424e-06, "loss": 0.0039, "step": 429 }, { "clip_ratio": 0.0002873923492074937, "epoch": 0.6309939948014699, "grad_norm": 0.03465187922120094, "kl": 0.004460573196411133, "learning_rate": 4.89484046836657e-06, "loss": 0.0039, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 527.1116304397583, "epoch": 0.6329060978160198, "grad_norm": 0.03591939061880112, "kl": 0.004395723342895508, "learning_rate": 4.893032654659554e-06, "loss": 0.0068, "num_tokens": 242275198.0, "reward": 0.05859375320142135, "reward_std": 0.06461814750218764, "rewards/pure_accuracy_reward_math": 0.05859375110594556, "step": 431 }, { "clip_ratio": 0.00021255032419276176, "epoch": 0.6348182008305697, "grad_norm": 0.03488593176007271, "kl": 0.0043849945068359375, "learning_rate": 4.891209773512054e-06, "loss": 0.0068, "step": 432 }, { "clip_ratio": 0.00023523596212271514, "epoch": 0.6367303038451196, "grad_norm": 0.03410722687840462, "kl": 0.004419565200805664, "learning_rate": 4.889371836401621e-06, "loss": 0.0067, "step": 433 }, { "clip_ratio": 0.00024576090385153293, "epoch": 0.6386424068596696, "grad_norm": 0.03335421159863472, "kl": 0.004421710968017578, "learning_rate": 4.887518854900603e-06, "loss": 0.0067, "step": 434 }, { "clip_ratio": 0.0002828803910119859, "epoch": 0.6405545098742195, "grad_norm": 0.03240649402141571, "kl": 0.004340171813964844, "learning_rate": 4.885650840676074e-06, "loss": 0.0066, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 532.2051043510437, "epoch": 0.6424666128887694, "grad_norm": 0.03588009625673294, "kl": 0.0044574737548828125, "learning_rate": 4.88376780548976e-06, "loss": 0.0041, "num_tokens": 245917009.0, "reward": 0.05775669912691228, "reward_std": 0.06611959752626717, "rewards/pure_accuracy_reward_math": 0.05775669778813608, "step": 436 }, { "clip_ratio": 0.0002524082638899472, "epoch": 0.6443787159033193, "grad_norm": 0.03471923619508743, "kl": 0.0044062137603759766, "learning_rate": 4.881869761197963e-06, "loss": 0.0041, "step": 437 }, { "clip_ratio": 0.0002889056303843063, "epoch": 0.6462908189178692, "grad_norm": 0.03379988297820091, "kl": 0.004372119903564453, "learning_rate": 4.879956719751491e-06, "loss": 0.004, "step": 438 }, { "clip_ratio": 0.0003009145272017122, "epoch": 0.6482029219324191, "grad_norm": 0.03446533530950546, "kl": 0.004400730133056641, "learning_rate": 4.878028693195577e-06, "loss": 0.004, "step": 439 }, { "clip_ratio": 0.00030466545126728306, "epoch": 0.650115024946969, "grad_norm": 0.03484022617340088, "kl": 0.004462242126464844, "learning_rate": 4.876085693669806e-06, "loss": 0.0039, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 517.0904240608215, "epoch": 0.652027127961519, "grad_norm": 0.0366295725107193, "kl": 0.004509925842285156, "learning_rate": 4.8741277334080405e-06, "loss": 0.0066, "num_tokens": 249502673.0, "reward": 0.05719866382423788, "reward_std": 0.06594694149680436, "rewards/pure_accuracy_reward_math": 0.057198662078008056, "step": 441 }, { "clip_ratio": 0.00023539985437537325, "epoch": 0.6539392309760689, "grad_norm": 0.03590084984898567, "kl": 0.0045740604400634766, "learning_rate": 4.87215482473834e-06, "loss": 0.0066, "step": 442 }, { "clip_ratio": 0.00022167488214108744, "epoch": 0.6558513339906188, "grad_norm": 0.03433714434504509, "kl": 0.004676342010498047, "learning_rate": 4.870166980082885e-06, "loss": 0.0066, "step": 443 }, { "clip_ratio": 0.0002476425726172238, "epoch": 0.6577634370051687, "grad_norm": 0.03389691188931465, "kl": 0.004789113998413086, "learning_rate": 4.868164211957899e-06, "loss": 0.0065, "step": 444 }, { "clip_ratio": 0.00025810993128061455, "epoch": 0.6596755400197185, "grad_norm": 0.03417885676026344, "kl": 0.004879474639892578, "learning_rate": 4.866146532973569e-06, "loss": 0.0064, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 520.3697214126587, "epoch": 0.6615876430342684, "grad_norm": 0.03560737892985344, "kl": 0.00455927848815918, "learning_rate": 4.864113955833967e-06, "loss": 0.0056, "num_tokens": 253104314.0, "reward": 0.06584821722935885, "reward_std": 0.07672227645525709, "rewards/pure_accuracy_reward_math": 0.06584821565775201, "step": 446 }, { "clip_ratio": 0.00029780695723502504, "epoch": 0.6634997460488183, "grad_norm": 0.034836821258068085, "kl": 0.0045278072357177734, "learning_rate": 4.862066493336967e-06, "loss": 0.0056, "step": 447 }, { "clip_ratio": 0.00030120932990485016, "epoch": 0.6654118490633683, "grad_norm": 0.03460467606782913, "kl": 0.0045435428619384766, "learning_rate": 4.860004158374172e-06, "loss": 0.0055, "step": 448 }, { "clip_ratio": 0.000313081463019671, "epoch": 0.6673239520779182, "grad_norm": 0.03467562422156334, "kl": 0.004552364349365234, "learning_rate": 4.857926963930822e-06, "loss": 0.0055, "step": 449 }, { "clip_ratio": 0.00031086072692687594, "epoch": 0.6692360550924681, "grad_norm": 0.03409102186560631, "kl": 0.004626035690307617, "learning_rate": 4.855834923085721e-06, "loss": 0.0054, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 514.4771447181702, "epoch": 0.671148158107018, "grad_norm": 0.03815117105841637, "kl": 0.005002737045288086, "learning_rate": 4.853728049011151e-06, "loss": 0.0091, "num_tokens": 256687388.0, "reward": 0.06556919938884676, "reward_std": 0.07874169782735407, "rewards/pure_accuracy_reward_math": 0.06556919787544757, "step": 451 }, { "clip_ratio": 0.0003133106871473501, "epoch": 0.6730602611215679, "grad_norm": 0.03761136531829834, "kl": 0.005041837692260742, "learning_rate": 4.851606354972791e-06, "loss": 0.0091, "step": 452 }, { "clip_ratio": 0.00034106033973557714, "epoch": 0.6749723641361178, "grad_norm": 0.0372379869222641, "kl": 0.0050508975982666016, "learning_rate": 4.849469854329629e-06, "loss": 0.0091, "step": 453 }, { "clip_ratio": 0.00033749614277667206, "epoch": 0.6768844671506677, "grad_norm": 0.03686762601137161, "kl": 0.005095005035400391, "learning_rate": 4.847318560533882e-06, "loss": 0.009, "step": 454 }, { "clip_ratio": 0.00035140375177888927, "epoch": 0.6787965701652177, "grad_norm": 0.036469750106334686, "kl": 0.005120754241943359, "learning_rate": 4.845152487130914e-06, "loss": 0.009, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 512.4866299629211, "epoch": 0.6807086731797676, "grad_norm": 0.037901297211647034, "kl": 0.004809379577636719, "learning_rate": 4.842971647759142e-06, "loss": 0.0063, "num_tokens": 260253700.0, "reward": 0.05775669912691228, "reward_std": 0.06710927549283952, "rewards/pure_accuracy_reward_math": 0.05775669767172076, "step": 456 }, { "clip_ratio": 0.00026634283756266086, "epoch": 0.6826207761943175, "grad_norm": 0.03568252548575401, "kl": 0.0047724246978759766, "learning_rate": 4.840776056149957e-06, "loss": 0.0063, "step": 457 }, { "clip_ratio": 0.00027518686636085476, "epoch": 0.6845328792088674, "grad_norm": 0.0351024754345417, "kl": 0.004754543304443359, "learning_rate": 4.838565726127636e-06, "loss": 0.0063, "step": 458 }, { "clip_ratio": 0.0003387172891393675, "epoch": 0.6864449822234173, "grad_norm": 0.03477272391319275, "kl": 0.004698753356933594, "learning_rate": 4.836340671609255e-06, "loss": 0.0062, "step": 459 }, { "clip_ratio": 0.0003592506114102889, "epoch": 0.6883570852379672, "grad_norm": 0.035812895745038986, "kl": 0.004735708236694336, "learning_rate": 4.834100906604601e-06, "loss": 0.0062, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 536.1403703689575, "epoch": 0.6902691882525172, "grad_norm": 0.03566034138202667, "kl": 0.004418611526489258, "learning_rate": 4.831846445216082e-06, "loss": 0.0056, "num_tokens": 263902651.0, "reward": 0.05161830614088103, "reward_std": 0.06899610540131107, "rewards/pure_accuracy_reward_math": 0.051618304976727813, "step": 461 }, { "clip_ratio": 0.00028340513017610647, "epoch": 0.6921812912670671, "grad_norm": 0.03495897352695465, "kl": 0.004414081573486328, "learning_rate": 4.829577301638642e-06, "loss": 0.0056, "step": 462 }, { "clip_ratio": 0.0002825141077664739, "epoch": 0.6940933942816169, "grad_norm": 0.034486111253499985, "kl": 0.004411220550537109, "learning_rate": 4.827293490159668e-06, "loss": 0.0056, "step": 463 }, { "clip_ratio": 0.00031019614829119746, "epoch": 0.6960054972961668, "grad_norm": 0.035884980112314224, "kl": 0.004367351531982422, "learning_rate": 4.824995025158903e-06, "loss": 0.0055, "step": 464 }, { "clip_ratio": 0.0003045983889933268, "epoch": 0.6979176003107167, "grad_norm": 0.03378836810588837, "kl": 0.004292488098144531, "learning_rate": 4.822681921108355e-06, "loss": 0.0055, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 525.3783731460571, "epoch": 0.6998297033252666, "grad_norm": 0.03726997971534729, "kl": 0.0065157413482666016, "learning_rate": 4.8203541925722016e-06, "loss": 0.0017, "num_tokens": 267508687.0, "reward": 0.06724330646102317, "reward_std": 0.07591145433252677, "rewards/pure_accuracy_reward_math": 0.06724330500583164, "step": 466 }, { "clip_ratio": 0.00026273680936128585, "epoch": 0.7017418063398165, "grad_norm": 0.03638988733291626, "kl": 0.0064983367919921875, "learning_rate": 4.818011854206706e-06, "loss": 0.0017, "step": 467 }, { "clip_ratio": 0.0002903113285128711, "epoch": 0.7036539093543664, "grad_norm": 0.0360158272087574, "kl": 0.006509542465209961, "learning_rate": 4.815654920760117e-06, "loss": 0.0016, "step": 468 }, { "clip_ratio": 0.0002849762186087901, "epoch": 0.7055660123689164, "grad_norm": 0.03577370196580887, "kl": 0.006470680236816406, "learning_rate": 4.81328340707258e-06, "loss": 0.0016, "step": 469 }, { "clip_ratio": 0.00031370155647891806, "epoch": 0.7074781153834663, "grad_norm": 0.03484919294714928, "kl": 0.006468772888183594, "learning_rate": 4.810897328076045e-06, "loss": 0.0015, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 530.1677136421204, "epoch": 0.7093902183980162, "grad_norm": 0.04198005422949791, "kl": 0.004724264144897461, "learning_rate": 4.808496698794171e-06, "loss": 0.0046, "num_tokens": 271138708.0, "reward": 0.07310268204309978, "reward_std": 0.07646948879119009, "rewards/pure_accuracy_reward_math": 0.07310267994762398, "step": 471 }, { "clip_ratio": 0.00028702764876697984, "epoch": 0.7113023214125661, "grad_norm": 0.04015243798494339, "kl": 0.004670619964599609, "learning_rate": 4.8060815343422265e-06, "loss": 0.0045, "step": 472 }, { "clip_ratio": 0.0002947892680822406, "epoch": 0.713214424427116, "grad_norm": 0.0385352224111557, "kl": 0.0046727657318115234, "learning_rate": 4.803651849927004e-06, "loss": 0.0045, "step": 473 }, { "clip_ratio": 0.00036661511779811917, "epoch": 0.7151265274416659, "grad_norm": 0.03803607076406479, "kl": 0.00463414192199707, "learning_rate": 4.801207660846717e-06, "loss": 0.0044, "step": 474 }, { "clip_ratio": 0.00040073674449558894, "epoch": 0.7170386304562159, "grad_norm": 0.03870271518826485, "kl": 0.00464320182800293, "learning_rate": 4.798748982490908e-06, "loss": 0.0044, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 539.262857913971, "epoch": 0.7189507334707658, "grad_norm": 0.0374424010515213, "kl": 0.0045392513275146484, "learning_rate": 4.796275830340344e-06, "loss": 0.0081, "num_tokens": 274802094.0, "reward": 0.061941967433085665, "reward_std": 0.07401842664694414, "rewards/pure_accuracy_reward_math": 0.06194196522119455, "step": 476 }, { "clip_ratio": 0.00026828293908920386, "epoch": 0.7208628364853157, "grad_norm": 0.03758076950907707, "kl": 0.004576683044433594, "learning_rate": 4.793788219966931e-06, "loss": 0.0081, "step": 477 }, { "clip_ratio": 0.0002991793934654652, "epoch": 0.7227749394998656, "grad_norm": 0.03570091351866722, "kl": 0.0045130252838134766, "learning_rate": 4.7912861670336065e-06, "loss": 0.008, "step": 478 }, { "clip_ratio": 0.00031140293214093617, "epoch": 0.7246870425144155, "grad_norm": 0.034991368651390076, "kl": 0.0044956207275390625, "learning_rate": 4.788769687294243e-06, "loss": 0.008, "step": 479 }, { "clip_ratio": 0.00034215352269484356, "epoch": 0.7265991455289653, "grad_norm": 0.03517301753163338, "kl": 0.00450587272644043, "learning_rate": 4.7862387965935504e-06, "loss": 0.0079, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 535.2455615997314, "epoch": 0.7285112485435152, "grad_norm": 0.03517255187034607, "kl": 0.004718780517578125, "learning_rate": 4.783693510866977e-06, "loss": 0.0066, "num_tokens": 278455030.0, "reward": 0.06222098530270159, "reward_std": 0.069766862958204, "rewards/pure_accuracy_reward_math": 0.062220983498264104, "step": 481 }, { "clip_ratio": 0.00026954136529866446, "epoch": 0.7304233515580651, "grad_norm": 0.03456445038318634, "kl": 0.004766225814819336, "learning_rate": 4.781133846140606e-06, "loss": 0.0066, "step": 482 }, { "clip_ratio": 0.000250861422671278, "epoch": 0.7323354545726151, "grad_norm": 0.033632129430770874, "kl": 0.004829883575439453, "learning_rate": 4.778559818531055e-06, "loss": 0.0066, "step": 483 }, { "clip_ratio": 0.0002590245896385568, "epoch": 0.734247557587165, "grad_norm": 0.03314875811338425, "kl": 0.00486445426940918, "learning_rate": 4.775971444245379e-06, "loss": 0.0065, "step": 484 }, { "clip_ratio": 0.0002899982684425595, "epoch": 0.7361596606017149, "grad_norm": 0.03288432955741882, "kl": 0.004921674728393555, "learning_rate": 4.773368739580963e-06, "loss": 0.0065, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 524.4258046150208, "epoch": 0.7380717636162648, "grad_norm": 0.08309170603752136, "kl": 0.006993293762207031, "learning_rate": 4.770751720925422e-06, "loss": 0.0023, "num_tokens": 282068152.0, "reward": 0.06222098495345563, "reward_std": 0.0712282478925772, "rewards/pure_accuracy_reward_math": 0.06222098338184878, "step": 486 }, { "clip_ratio": 0.0002442373284452515, "epoch": 0.7399838666308147, "grad_norm": 0.042120546102523804, "kl": 0.006081581115722656, "learning_rate": 4.768120404756497e-06, "loss": 0.0023, "step": 487 }, { "clip_ratio": 0.0002956131474434187, "epoch": 0.7418959696453646, "grad_norm": 0.036061204969882965, "kl": 0.0057599544525146484, "learning_rate": 4.765474807641951e-06, "loss": 0.0022, "step": 488 }, { "clip_ratio": 0.00030389728723889675, "epoch": 0.7438080726599146, "grad_norm": 0.03613469749689102, "kl": 0.005738019943237305, "learning_rate": 4.762814946239468e-06, "loss": 0.0022, "step": 489 }, { "clip_ratio": 0.00033159017920070255, "epoch": 0.7457201756744645, "grad_norm": 0.0360892117023468, "kl": 0.00572967529296875, "learning_rate": 4.760140837296542e-06, "loss": 0.0021, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 550.3144750595093, "epoch": 0.7476322786890144, "grad_norm": 0.03636733815073967, "kl": 0.004332542419433594, "learning_rate": 4.757452497650377e-06, "loss": 0.0072, "num_tokens": 285770403.0, "reward": 0.055803573777666315, "reward_std": 0.07161362667102367, "rewards/pure_accuracy_reward_math": 0.05580357278813608, "step": 491 }, { "clip_ratio": 0.00027637260956225873, "epoch": 0.7495443817035643, "grad_norm": 0.035727791488170624, "kl": 0.004361629486083984, "learning_rate": 4.754749944227777e-06, "loss": 0.0072, "step": 492 }, { "clip_ratio": 0.0002587454115428045, "epoch": 0.7514564847181142, "grad_norm": 0.03512200713157654, "kl": 0.0043697357177734375, "learning_rate": 4.752033194045044e-06, "loss": 0.0072, "step": 493 }, { "clip_ratio": 0.00025780797875540884, "epoch": 0.7533685877326641, "grad_norm": 0.033817108720541, "kl": 0.0043947696685791016, "learning_rate": 4.7493022642078654e-06, "loss": 0.0071, "step": 494 }, { "clip_ratio": 0.00029674232627030506, "epoch": 0.755280690747214, "grad_norm": 0.03317062556743622, "kl": 0.004454851150512695, "learning_rate": 4.746557171911211e-06, "loss": 0.0071, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 543.0692186355591, "epoch": 0.757192793761764, "grad_norm": 0.05020016431808472, "kl": 0.0062062740325927734, "learning_rate": 4.7437979344392236e-06, "loss": 0.0059, "num_tokens": 289451319.0, "reward": 0.0616629492433276, "reward_std": 0.07071027776692063, "rewards/pure_accuracy_reward_math": 0.06166294778813608, "step": 496 }, { "clip_ratio": 0.00028460744590574905, "epoch": 0.7591048967763139, "grad_norm": 0.03948064520955086, "kl": 0.0061266422271728516, "learning_rate": 4.741024569165105e-06, "loss": 0.0059, "step": 497 }, { "clip_ratio": 0.0002803450769306437, "epoch": 0.7610169997908637, "grad_norm": 0.03621263429522514, "kl": 0.00614476203918457, "learning_rate": 4.7382370935510165e-06, "loss": 0.0059, "step": 498 }, { "clip_ratio": 0.0003022695020717947, "epoch": 0.7629291028054136, "grad_norm": 0.037622902542352676, "kl": 0.006256580352783203, "learning_rate": 4.73543552514796e-06, "loss": 0.0058, "step": 499 }, { "clip_ratio": 0.00030265802058693225, "epoch": 0.7648412058199635, "grad_norm": 0.03813454508781433, "kl": 0.006264209747314453, "learning_rate": 4.732619881595672e-06, "loss": 0.0057, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 543.3538174629211, "epoch": 0.7667533088345134, "grad_norm": 0.07500133663415909, "kl": 0.005916118621826172, "learning_rate": 4.729790180622512e-06, "loss": 0.0072, "num_tokens": 293127839.0, "reward": 0.0513392879802268, "reward_std": 0.06792009877972305, "rewards/pure_accuracy_reward_math": 0.051339287048904225, "step": 501 }, { "clip_ratio": 0.0002826226679530919, "epoch": 0.7686654118490633, "grad_norm": 0.03498294949531555, "kl": 0.0057086944580078125, "learning_rate": 4.726946440045348e-06, "loss": 0.0072, "step": 502 }, { "clip_ratio": 0.000292762170943206, "epoch": 0.7705775148636133, "grad_norm": 0.0338723324239254, "kl": 0.0054700374603271484, "learning_rate": 4.7240886777694495e-06, "loss": 0.0071, "step": 503 }, { "clip_ratio": 0.00031638332251304746, "epoch": 0.7724896178781632, "grad_norm": 0.03360189124941826, "kl": 0.00526118278503418, "learning_rate": 4.721216911788371e-06, "loss": 0.0071, "step": 504 }, { "clip_ratio": 0.0003445502737804418, "epoch": 0.7744017208927131, "grad_norm": 0.03321666270494461, "kl": 0.005108356475830078, "learning_rate": 4.71833116018384e-06, "loss": 0.007, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 525.3041553497314, "epoch": 0.776313823907263, "grad_norm": 0.039082907140254974, "kl": 0.0048329830169677734, "learning_rate": 4.715431441125639e-06, "loss": 0.0072, "num_tokens": 296745449.0, "reward": 0.056640627823071554, "reward_std": 0.066464910923969, "rewards/pure_accuracy_reward_math": 0.05664062636788003, "step": 506 }, { "clip_ratio": 0.0002697859709428485, "epoch": 0.7782259269218129, "grad_norm": 0.036139652132987976, "kl": 0.0048868656158447266, "learning_rate": 4.712517772871503e-06, "loss": 0.0072, "step": 507 }, { "clip_ratio": 0.0002602223319172481, "epoch": 0.7801380299363628, "grad_norm": 0.03708622604608536, "kl": 0.004920244216918945, "learning_rate": 4.709590173766988e-06, "loss": 0.0072, "step": 508 }, { "clip_ratio": 0.00030563702995323183, "epoch": 0.7820501329509127, "grad_norm": 0.03873802721500397, "kl": 0.004922151565551758, "learning_rate": 4.706648662245368e-06, "loss": 0.0071, "step": 509 }, { "clip_ratio": 0.00027421732914945096, "epoch": 0.7839622359654627, "grad_norm": 0.0337008535861969, "kl": 0.004686117172241211, "learning_rate": 4.703693256827515e-06, "loss": 0.0071, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 544.4595675468445, "epoch": 0.7858743389800126, "grad_norm": 0.032148003578186035, "kl": 0.004284381866455078, "learning_rate": 4.700723976121782e-06, "loss": 0.0079, "num_tokens": 300427724.0, "reward": 0.05998884211294353, "reward_std": 0.06822534691309556, "rewards/pure_accuracy_reward_math": 0.059988840483129025, "step": 511 }, { "clip_ratio": 0.00023266997004611767, "epoch": 0.7877864419945625, "grad_norm": 0.03213036060333252, "kl": 0.004235267639160156, "learning_rate": 4.697740838823884e-06, "loss": 0.0079, "step": 512 }, { "clip_ratio": 0.00023210655439243055, "epoch": 0.7896985450091124, "grad_norm": 0.03171762451529503, "kl": 0.004268169403076172, "learning_rate": 4.694743863716784e-06, "loss": 0.0078, "step": 513 }, { "clip_ratio": 0.0002433597992990144, "epoch": 0.7916106480236623, "grad_norm": 0.030378276482224464, "kl": 0.004282712936401367, "learning_rate": 4.691733069670575e-06, "loss": 0.0078, "step": 514 }, { "clip_ratio": 0.00024098603546462982, "epoch": 0.7935227510382122, "grad_norm": 0.030135801061987877, "kl": 0.004299640655517578, "learning_rate": 4.688708475642356e-06, "loss": 0.0078, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 520.5064425468445, "epoch": 0.795434854052762, "grad_norm": 0.03758488968014717, "kl": 0.004748344421386719, "learning_rate": 4.685670100676117e-06, "loss": 0.0056, "num_tokens": 304030899.0, "reward": 0.059151788300368935, "reward_std": 0.06615966308163479, "rewards/pure_accuracy_reward_math": 0.05915178684517741, "step": 516 }, { "clip_ratio": 0.00024922658519699326, "epoch": 0.797346957067312, "grad_norm": 0.03667794167995453, "kl": 0.004762172698974609, "learning_rate": 4.6826179639026185e-06, "loss": 0.0056, "step": 517 }, { "clip_ratio": 0.00024439046995894387, "epoch": 0.7992590600818619, "grad_norm": 0.03566230833530426, "kl": 0.004770755767822266, "learning_rate": 4.679552084539271e-06, "loss": 0.0055, "step": 518 }, { "clip_ratio": 0.00025443012202686077, "epoch": 0.8011711630964118, "grad_norm": 0.03555983304977417, "kl": 0.004889011383056641, "learning_rate": 4.676472481890012e-06, "loss": 0.0055, "step": 519 }, { "clip_ratio": 0.0002555244412860702, "epoch": 0.8030832661109617, "grad_norm": 0.03477266803383827, "kl": 0.004910707473754883, "learning_rate": 4.673379175345187e-06, "loss": 0.0054, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 530.2039861679077, "epoch": 0.8049953691255116, "grad_norm": 0.03352927044034004, "kl": 0.004728078842163086, "learning_rate": 4.670272184381426e-06, "loss": 0.0064, "num_tokens": 307666714.0, "reward": 0.05106027063447982, "reward_std": 0.061781705473549664, "rewards/pure_accuracy_reward_math": 0.05106026888824999, "step": 521 }, { "clip_ratio": 0.00022480493561261028, "epoch": 0.8069074721400615, "grad_norm": 0.0328591950237751, "kl": 0.004677772521972656, "learning_rate": 4.667151528561522e-06, "loss": 0.0064, "step": 522 }, { "clip_ratio": 0.0002208993353463029, "epoch": 0.8088195751546114, "grad_norm": 0.0323566235601902, "kl": 0.004681825637817383, "learning_rate": 4.664017227534308e-06, "loss": 0.0064, "step": 523 }, { "clip_ratio": 0.0002261604544742113, "epoch": 0.8107316781691614, "grad_norm": 0.03178941085934639, "kl": 0.004633665084838867, "learning_rate": 4.6608693010345285e-06, "loss": 0.0063, "step": 524 }, { "clip_ratio": 0.0002347389614101303, "epoch": 0.8126437811837113, "grad_norm": 0.03144075721502304, "kl": 0.004633426666259766, "learning_rate": 4.657707768882723e-06, "loss": 0.0063, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 534.2078919410706, "epoch": 0.8145558841982612, "grad_norm": 36658.046875, "kl": 696.0046517848969, "learning_rate": 4.6545326509850965e-06, "loss": 27.8583, "num_tokens": 311314491.0, "reward": 0.05747768114088103, "reward_std": 0.06521624798187986, "rewards/pure_accuracy_reward_math": 0.057477680093143135, "step": 526 }, { "clip_ratio": 0.0006453408203128674, "epoch": 0.8164679872128111, "grad_norm": 3234.42724609375, "kl": 42.254658937454224, "learning_rate": 4.651343967333394e-06, "loss": 1.7021, "step": 527 }, { "clip_ratio": 0.0006781478184620937, "epoch": 0.818380090227361, "grad_norm": 430.01318359375, "kl": 0.21270966529846191, "learning_rate": 4.648141738004776e-06, "loss": 0.256, "step": 528 }, { "clip_ratio": 0.0006916913723671314, "epoch": 0.8202921932419109, "grad_norm": 457.1385803222656, "kl": 0.1541590690612793, "learning_rate": 4.644925983161691e-06, "loss": 0.3118, "step": 529 }, { "clip_ratio": 0.0007114471513887111, "epoch": 0.8222042962564609, "grad_norm": 61.02793884277344, "kl": 1.6688117980957031, "learning_rate": 4.641696723051753e-06, "loss": 0.1081, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 544.7664904594421, "epoch": 0.8241163992710108, "grad_norm": 0.03665775805711746, "kl": 0.0046710968017578125, "learning_rate": 4.638453978007606e-06, "loss": 0.0033, "num_tokens": 315000186.0, "reward": 0.05691964577999897, "reward_std": 0.06766731111565605, "rewards/pure_accuracy_reward_math": 0.056919643975561485, "step": 531 }, { "clip_ratio": 0.000247030089042255, "epoch": 0.8260285022855607, "grad_norm": 0.03543345257639885, "kl": 0.004717826843261719, "learning_rate": 4.635197768446799e-06, "loss": 0.0033, "step": 532 }, { "clip_ratio": 0.00024415442914005325, "epoch": 0.8279406053001105, "grad_norm": 0.034531209617853165, "kl": 0.004744768142700195, "learning_rate": 4.631928114871667e-06, "loss": 0.0032, "step": 533 }, { "clip_ratio": 0.0002580326566032909, "epoch": 0.8298527083146604, "grad_norm": 0.03323632851243019, "kl": 0.004830360412597656, "learning_rate": 4.628645037869183e-06, "loss": 0.0032, "step": 534 }, { "clip_ratio": 0.00029695888167680096, "epoch": 0.8317648113292103, "grad_norm": 0.03470376506447792, "kl": 0.0048847198486328125, "learning_rate": 4.625348558110846e-06, "loss": 0.0031, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 543.506443977356, "epoch": 0.8336769143437602, "grad_norm": 33.48581314086914, "kl": 0.7041072845458984, "learning_rate": 4.6220386963525425e-06, "loss": 0.0349, "num_tokens": 318683697.0, "reward": 0.06333705675206147, "reward_std": 0.0759915838134475, "rewards/pure_accuracy_reward_math": 0.063337054773001, "step": 536 }, { "clip_ratio": 0.00030500417074108555, "epoch": 0.8355890173583101, "grad_norm": 5.391356468200684, "kl": 0.12163639068603516, "learning_rate": 4.6187154734344144e-06, "loss": 0.0115, "step": 537 }, { "clip_ratio": 0.0003094891900445873, "epoch": 0.8375011203728601, "grad_norm": 0.24674992263317108, "kl": 0.011260032653808594, "learning_rate": 4.615378910280735e-06, "loss": 0.007, "step": 538 }, { "clip_ratio": 0.0003443351265559613, "epoch": 0.83941322338741, "grad_norm": 0.040490083396434784, "kl": 0.0068547725677490234, "learning_rate": 4.61202902789977e-06, "loss": 0.0068, "step": 539 }, { "clip_ratio": 0.0003249310258297555, "epoch": 0.8413253264019599, "grad_norm": 0.037383101880550385, "kl": 0.006977081298828125, "learning_rate": 4.608665847383646e-06, "loss": 0.0068, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 528.8432207107544, "epoch": 0.8432374294165098, "grad_norm": 0.0408562608063221, "kl": 0.005080223083496094, "learning_rate": 4.6052893899082244e-06, "loss": 0.0092, "num_tokens": 322311955.0, "reward": 0.07505580695578828, "reward_std": 0.08672685426427051, "rewards/pure_accuracy_reward_math": 0.07505580462748185, "step": 541 }, { "clip_ratio": 0.0003254984287082152, "epoch": 0.8451495324310597, "grad_norm": 0.03888032212853432, "kl": 0.005081653594970703, "learning_rate": 4.60189967673296e-06, "loss": 0.0091, "step": 542 }, { "clip_ratio": 0.00032150591908930437, "epoch": 0.8470616354456096, "grad_norm": 0.03769301995635033, "kl": 0.005054950714111328, "learning_rate": 4.598496729200772e-06, "loss": 0.0091, "step": 543 }, { "clip_ratio": 0.0003807161001532222, "epoch": 0.8489737384601596, "grad_norm": 0.03671475872397423, "kl": 0.005011320114135742, "learning_rate": 4.595080568737907e-06, "loss": 0.009, "step": 544 }, { "clip_ratio": 0.00040073374452731514, "epoch": 0.8508858414747095, "grad_norm": 0.03656642884016037, "kl": 0.004985332489013672, "learning_rate": 4.591651216853808e-06, "loss": 0.009, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 521.1850123405457, "epoch": 0.8527979444892594, "grad_norm": 0.04072614386677742, "kl": 0.005250692367553711, "learning_rate": 4.588208695140972e-06, "loss": 0.008, "num_tokens": 325915646.0, "reward": 0.06891741379513405, "reward_std": 0.07457646180409938, "rewards/pure_accuracy_reward_math": 0.0689174119324889, "step": 546 }, { "clip_ratio": 0.0002774237623270892, "epoch": 0.8547100475038093, "grad_norm": 0.03891909867525101, "kl": 0.005267620086669922, "learning_rate": 4.5847530252748206e-06, "loss": 0.008, "step": 547 }, { "clip_ratio": 0.0003099276901821213, "epoch": 0.8566221505183592, "grad_norm": 0.03776893764734268, "kl": 0.005312681198120117, "learning_rate": 4.581284229013561e-06, "loss": 0.008, "step": 548 }, { "clip_ratio": 0.0003329096458060121, "epoch": 0.8585342535329091, "grad_norm": 0.03786613792181015, "kl": 0.0053446292877197266, "learning_rate": 4.57780232819805e-06, "loss": 0.0079, "step": 549 }, { "clip_ratio": 0.0003465502328481307, "epoch": 0.860446356547459, "grad_norm": 0.03782954812049866, "kl": 0.00535893440246582, "learning_rate": 4.574307344751654e-06, "loss": 0.0079, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 512.2042679786682, "epoch": 0.8623584595620088, "grad_norm": 0.04082540422677994, "kl": 0.005150318145751953, "learning_rate": 4.570799300680112e-06, "loss": 0.0061, "num_tokens": 329486142.0, "reward": 0.06696428914438002, "reward_std": 0.07865536911413074, "rewards/pure_accuracy_reward_math": 0.06696428681607358, "step": 551 }, { "clip_ratio": 0.0002784457984148503, "epoch": 0.8642705625765588, "grad_norm": 0.039590511471033096, "kl": 0.005137205123901367, "learning_rate": 4.5672782180714005e-06, "loss": 0.0061, "step": 552 }, { "clip_ratio": 0.0003210699376268167, "epoch": 0.8661826655911087, "grad_norm": 0.03983275964856148, "kl": 0.005161285400390625, "learning_rate": 4.56374411909559e-06, "loss": 0.0061, "step": 553 }, { "clip_ratio": 0.00032905748116718314, "epoch": 0.8680947686056586, "grad_norm": 0.03924131765961647, "kl": 0.0051097869873046875, "learning_rate": 4.560197026004706e-06, "loss": 0.006, "step": 554 }, { "clip_ratio": 0.00036174511694753164, "epoch": 0.8700068716202085, "grad_norm": 0.03864859789609909, "kl": 0.0051233768463134766, "learning_rate": 4.556636961132591e-06, "loss": 0.0059, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 524.8490724563599, "epoch": 0.8719189746347584, "grad_norm": 0.03831901028752327, "kl": 0.005173921585083008, "learning_rate": 4.553063946894765e-06, "loss": 0.0089, "num_tokens": 333101169.0, "reward": 0.05970982427243143, "reward_std": 0.06925509037682787, "rewards/pure_accuracy_reward_math": 0.05970982293365523, "step": 556 }, { "clip_ratio": 0.00024058804717697058, "epoch": 0.8738310776493083, "grad_norm": 0.03815346583724022, "kl": 0.005152463912963867, "learning_rate": 4.549478005788276e-06, "loss": 0.0088, "step": 557 }, { "clip_ratio": 0.0002689754076072859, "epoch": 0.8757431806638583, "grad_norm": 0.03663227707147598, "kl": 0.00511932373046875, "learning_rate": 4.5458791603915695e-06, "loss": 0.0088, "step": 558 }, { "clip_ratio": 0.0002769273295371022, "epoch": 0.8776552836784082, "grad_norm": 0.03534897044301033, "kl": 0.005173921585083008, "learning_rate": 4.5422674333643415e-06, "loss": 0.0087, "step": 559 }, { "clip_ratio": 0.0003186316080245888, "epoch": 0.8795673866929581, "grad_norm": 0.03454131633043289, "kl": 0.005182981491088867, "learning_rate": 4.538642847447393e-06, "loss": 0.0087, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 499.49025869369507, "epoch": 0.881479489707508, "grad_norm": 0.03870520368218422, "kl": 0.005303621292114258, "learning_rate": 4.53500542546249e-06, "loss": 0.0063, "num_tokens": 336621146.0, "reward": 0.06724330663564615, "reward_std": 0.07539348350837827, "rewards/pure_accuracy_reward_math": 0.0672433050640393, "step": 561 }, { "clip_ratio": 0.0002930208739826412, "epoch": 0.8833915927220579, "grad_norm": 0.03670111671090126, "kl": 0.005410432815551758, "learning_rate": 4.5313551903122195e-06, "loss": 0.0063, "step": 562 }, { "clip_ratio": 0.00033625421181682214, "epoch": 0.8853036957366078, "grad_norm": 0.03873737156391144, "kl": 0.0054399967193603516, "learning_rate": 4.5276921649798475e-06, "loss": 0.0063, "step": 563 }, { "clip_ratio": 0.0003349392310383337, "epoch": 0.8872157987511577, "grad_norm": 0.038494061678647995, "kl": 0.0053806304931640625, "learning_rate": 4.524016372529168e-06, "loss": 0.0062, "step": 564 }, { "clip_ratio": 0.00031196477385719845, "epoch": 0.8891279017657077, "grad_norm": 0.03559175133705139, "kl": 0.005260467529296875, "learning_rate": 4.520327836104363e-06, "loss": 0.0061, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 521.2452793121338, "epoch": 0.8910400047802576, "grad_norm": 0.033526018261909485, "kl": 0.0050280094146728516, "learning_rate": 4.516626578929857e-06, "loss": 0.0083, "num_tokens": 340217537.0, "reward": 0.05970982470898889, "reward_std": 0.06920882686972618, "rewards/pure_accuracy_reward_math": 0.059709822555305436, "step": 566 }, { "clip_ratio": 0.0002854210310374583, "epoch": 0.8929521077948075, "grad_norm": 0.03320698440074921, "kl": 0.00494694709777832, "learning_rate": 4.512912624310166e-06, "loss": 0.0083, "step": 567 }, { "clip_ratio": 0.00028784406134718665, "epoch": 0.8948642108093574, "grad_norm": 0.0334990993142128, "kl": 0.004927158355712891, "learning_rate": 4.509185995629758e-06, "loss": 0.0083, "step": 568 }, { "clip_ratio": 0.00028731861192454744, "epoch": 0.8967763138239072, "grad_norm": 0.032721105962991714, "kl": 0.004916667938232422, "learning_rate": 4.505446716352898e-06, "loss": 0.0083, "step": 569 }, { "clip_ratio": 0.0003211342911981774, "epoch": 0.8986884168384571, "grad_norm": 0.031691305339336395, "kl": 0.0050427913665771484, "learning_rate": 4.501694810023506e-06, "loss": 0.0082, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 513.3175444602966, "epoch": 0.900600519853007, "grad_norm": 0.039067283272743225, "kl": 0.0051767826080322266, "learning_rate": 4.497930300265005e-06, "loss": 0.0062, "num_tokens": 343792675.0, "reward": 0.07254464668221772, "reward_std": 0.07260330504504964, "rewards/pure_accuracy_reward_math": 0.07254464394645765, "step": 571 }, { "clip_ratio": 0.000284439854624452, "epoch": 0.902512622867557, "grad_norm": 0.03746037185192108, "kl": 0.0051670074462890625, "learning_rate": 4.494153210780177e-06, "loss": 0.0062, "step": 572 }, { "clip_ratio": 0.0002894837679718876, "epoch": 0.9044247258821069, "grad_norm": 0.0363248772919178, "kl": 0.0051119327545166016, "learning_rate": 4.490363565351007e-06, "loss": 0.0061, "step": 573 }, { "clip_ratio": 0.00029392389137683494, "epoch": 0.9063368288966568, "grad_norm": 0.03513769805431366, "kl": 0.005059242248535156, "learning_rate": 4.486561387838539e-06, "loss": 0.0061, "step": 574 }, { "clip_ratio": 0.0003296555175325011, "epoch": 0.9082489319112067, "grad_norm": 0.03513012453913689, "kl": 0.005059242248535156, "learning_rate": 4.482746702182725e-06, "loss": 0.006, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 520.8926033973694, "epoch": 0.9101610349257566, "grad_norm": 0.049145400524139404, "kl": 0.011604547500610352, "learning_rate": 4.478919532402271e-06, "loss": 0.0046, "num_tokens": 347395370.0, "reward": 0.07170759254950099, "reward_std": 0.0817445982247591, "rewards/pure_accuracy_reward_math": 0.07170759091968648, "step": 576 }, { "clip_ratio": 0.00030760892423131736, "epoch": 0.9120731379403065, "grad_norm": 0.04954507574439049, "kl": 0.011447906494140625, "learning_rate": 4.4750799025944866e-06, "loss": 0.0045, "step": 577 }, { "clip_ratio": 0.0003202956161487691, "epoch": 0.9139852409548564, "grad_norm": 0.04883984476327896, "kl": 0.010998249053955078, "learning_rate": 4.471227836935139e-06, "loss": 0.0045, "step": 578 }, { "clip_ratio": 0.0003312723312660637, "epoch": 0.9158973439694064, "grad_norm": 0.049066606909036636, "kl": 0.010381698608398438, "learning_rate": 4.467363359678291e-06, "loss": 0.0044, "step": 579 }, { "clip_ratio": 0.00041312941800697445, "epoch": 0.9178094469839563, "grad_norm": 0.053418997675180435, "kl": 0.009602546691894531, "learning_rate": 4.463486495156157e-06, "loss": 0.0043, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 539.5678267478943, "epoch": 0.9197215499985062, "grad_norm": 0.03747523948550224, "kl": 0.004802227020263672, "learning_rate": 4.459597267778945e-06, "loss": 0.0041, "num_tokens": 351065793.0, "reward": 0.062220984895247966, "reward_std": 0.07298868335783482, "rewards/pure_accuracy_reward_math": 0.0622209832072258, "step": 581 }, { "clip_ratio": 0.0002890200073579763, "epoch": 0.9216336530130561, "grad_norm": 0.03557584062218666, "kl": 0.004851579666137695, "learning_rate": 4.455695702034705e-06, "loss": 0.0041, "step": 582 }, { "clip_ratio": 0.00031045296407228307, "epoch": 0.923545756027606, "grad_norm": 0.034734807908535004, "kl": 0.004895925521850586, "learning_rate": 4.451781822489173e-06, "loss": 0.0041, "step": 583 }, { "clip_ratio": 0.00032734786560695284, "epoch": 0.9254578590421559, "grad_norm": 0.03634972497820854, "kl": 0.004976511001586914, "learning_rate": 4.447855653785617e-06, "loss": 0.004, "step": 584 }, { "clip_ratio": 0.00036698238614008005, "epoch": 0.9273699620567059, "grad_norm": 0.036671172827482224, "kl": 0.004954338073730469, "learning_rate": 4.4439172206446845e-06, "loss": 0.0039, "step": 585 }, { "clip_ratio": 0.0, "completion_length": 538.6261405944824, "epoch": 0.9292820650712557, "grad_norm": 0.03805253654718399, "kl": 0.005060434341430664, "learning_rate": 4.439966547864243e-06, "loss": 0.0061, "num_tokens": 354732057.0, "reward": 0.06194196725846268, "reward_std": 0.07766569184605032, "rewards/pure_accuracy_reward_math": 0.06194196580327116, "step": 586 }, { "clip_ratio": 0.0002944122598478316, "epoch": 0.9311941680858056, "grad_norm": 0.03603314608335495, "kl": 0.005051136016845703, "learning_rate": 4.436003660319224e-06, "loss": 0.0061, "step": 587 }, { "clip_ratio": 0.0003042620955966413, "epoch": 0.9331062711003555, "grad_norm": 0.035505130887031555, "kl": 0.005032539367675781, "learning_rate": 4.432028582961472e-06, "loss": 0.006, "step": 588 }, { "clip_ratio": 0.00032173160303727855, "epoch": 0.9350183741149054, "grad_norm": 0.03633759915828705, "kl": 0.00509190559387207, "learning_rate": 4.428041340819579e-06, "loss": 0.006, "step": 589 }, { "clip_ratio": 0.00038377046530513326, "epoch": 0.9369304771294553, "grad_norm": 0.03761395812034607, "kl": 0.005148649215698242, "learning_rate": 4.424041958998732e-06, "loss": 0.0059, "step": 590 }, { "clip_ratio": 0.0, "completion_length": 540.8948354721069, "epoch": 0.9388425801440052, "grad_norm": 0.04139011353254318, "kl": 0.005031585693359375, "learning_rate": 4.420030462680554e-06, "loss": 0.007, "num_tokens": 358409840.0, "reward": 0.0714285749127157, "reward_std": 0.07565246830927208, "rewards/pure_accuracy_reward_math": 0.07142857275903225, "step": 591 }, { "clip_ratio": 0.0002982392526291733, "epoch": 0.9407546831585551, "grad_norm": 0.03948375955224037, "kl": 0.005082845687866211, "learning_rate": 4.416006877122948e-06, "loss": 0.007, "step": 592 }, { "clip_ratio": 0.00033647330587882607, "epoch": 0.9426667861731051, "grad_norm": 0.041717879474163055, "kl": 0.005113363265991211, "learning_rate": 4.411971227659933e-06, "loss": 0.0069, "step": 593 }, { "clip_ratio": 0.00036752876485479646, "epoch": 0.944578889187655, "grad_norm": 0.04109462723135948, "kl": 0.005068063735961914, "learning_rate": 4.407923539701486e-06, "loss": 0.0069, "step": 594 }, { "clip_ratio": 0.0003528254699176614, "epoch": 0.9464909922022049, "grad_norm": 0.03620041161775589, "kl": 0.0049245357513427734, "learning_rate": 4.403863838733386e-06, "loss": 0.0068, "step": 595 }, { "clip_ratio": 0.0, "completion_length": 545.2444491386414, "epoch": 0.9484030952167548, "grad_norm": 42.05046463012695, "kl": 0.3311493396759033, "learning_rate": 4.399792150317048e-06, "loss": 0.0203, "num_tokens": 362096328.0, "reward": 0.06026786071015522, "reward_std": 0.07324766798410565, "rewards/pure_accuracy_reward_math": 0.06026785832364112, "step": 596 }, { "clip_ratio": 0.0003009684866128737, "epoch": 0.9503151982313047, "grad_norm": 0.575372040271759, "kl": 0.01551508903503418, "learning_rate": 4.395708500089366e-06, "loss": 0.0076, "step": 597 }, { "clip_ratio": 0.0003299758830053179, "epoch": 0.9522273012458546, "grad_norm": 0.052088066935539246, "kl": 0.01082468032836914, "learning_rate": 4.391612913762549e-06, "loss": 0.0074, "step": 598 }, { "clip_ratio": 0.00032988658261956516, "epoch": 0.9541394042604046, "grad_norm": 0.046673182398080826, "kl": 0.011472225189208984, "learning_rate": 4.38750541712396e-06, "loss": 0.0074, "step": 599 }, { "clip_ratio": 0.00031585949000145774, "epoch": 0.9560515072749545, "grad_norm": 0.04350757598876953, "kl": 0.011662006378173828, "learning_rate": 4.383386036035956e-06, "loss": 0.0074, "step": 600 }, { "clip_ratio": 0.0, "completion_length": 539.0309958457947, "epoch": 0.9579636102895044, "grad_norm": 0.04193362593650818, "kl": 0.005011081695556641, "learning_rate": 4.379254796435719e-06, "loss": 0.0085, "num_tokens": 365761119.0, "reward": 0.06696428923169151, "reward_std": 0.08311965479515493, "rewards/pure_accuracy_reward_math": 0.06696428667055443, "step": 601 }, { "clip_ratio": 0.0003076634293392999, "epoch": 0.9598757133040543, "grad_norm": 0.04204736277461052, "kl": 0.005095720291137695, "learning_rate": 4.375111724335102e-06, "loss": 0.0085, "step": 602 }, { "clip_ratio": 0.0002991189727481469, "epoch": 0.9617878163186042, "grad_norm": 0.041649866849184036, "kl": 0.00509333610534668, "learning_rate": 4.370956845820455e-06, "loss": 0.0085, "step": 603 }, { "clip_ratio": 0.0003053998929090085, "epoch": 0.963699919333154, "grad_norm": 0.03969484567642212, "kl": 0.005100727081298828, "learning_rate": 4.366790187052468e-06, "loss": 0.0084, "step": 604 }, { "clip_ratio": 0.0003063883330014505, "epoch": 0.9656120223477039, "grad_norm": 0.03833401948213577, "kl": 0.005064487457275391, "learning_rate": 4.362611774266005e-06, "loss": 0.0083, "step": 605 }, { "clip_ratio": 0.0, "completion_length": 534.4046006202698, "epoch": 0.9675241253622539, "grad_norm": 0.038279399275779724, "kl": 0.005177021026611328, "learning_rate": 4.358421633769934e-06, "loss": 0.0061, "num_tokens": 369412689.0, "reward": 0.07087053885334171, "reward_std": 0.08299326128326356, "rewards/pure_accuracy_reward_math": 0.0708705369324889, "step": 606 }, { "clip_ratio": 0.00030927538728064974, "epoch": 0.9694362283768038, "grad_norm": 0.037665851414203644, "kl": 0.005164146423339844, "learning_rate": 4.35421979194697e-06, "loss": 0.0061, "step": 607 }, { "clip_ratio": 0.0003293242310178357, "epoch": 0.9713483313913537, "grad_norm": 0.036888375878334045, "kl": 0.005212306976318359, "learning_rate": 4.3500062752535e-06, "loss": 0.006, "step": 608 }, { "clip_ratio": 0.0003369250752029984, "epoch": 0.9732604344059036, "grad_norm": 0.03607965633273125, "kl": 0.005278587341308594, "learning_rate": 4.3457811102194225e-06, "loss": 0.006, "step": 609 }, { "clip_ratio": 0.00034393194414406025, "epoch": 0.9751725374204535, "grad_norm": 0.036863330751657486, "kl": 0.005379676818847656, "learning_rate": 4.341544323447978e-06, "loss": 0.0059, "step": 610 }, { "clip_ratio": 0.0, "completion_length": 527.9905385971069, "epoch": 0.9770846404350034, "grad_norm": 0.03825363516807556, "kl": 0.005227804183959961, "learning_rate": 4.33729594161558e-06, "loss": 0.0103, "num_tokens": 373041503.0, "reward": 0.07254464607103728, "reward_std": 0.07848271250259131, "rewards/pure_accuracy_reward_math": 0.07254464444122277, "step": 611 }, { "clip_ratio": 0.0002938344064205012, "epoch": 0.9789967434495533, "grad_norm": 0.037028077989816666, "kl": 0.005240917205810547, "learning_rate": 4.333035991471653e-06, "loss": 0.0102, "step": 612 }, { "clip_ratio": 0.00029232190240691125, "epoch": 0.9809088464641033, "grad_norm": 0.03623189404606819, "kl": 0.005187034606933594, "learning_rate": 4.328764499838456e-06, "loss": 0.0102, "step": 613 }, { "clip_ratio": 0.000318144969014611, "epoch": 0.9828209494786532, "grad_norm": 0.036878351122140884, "kl": 0.005211830139160156, "learning_rate": 4.324481493610919e-06, "loss": 0.0101, "step": 614 }, { "clip_ratio": 0.0003371401809317831, "epoch": 0.9847330524932031, "grad_norm": 0.036278340965509415, "kl": 0.0051462650299072266, "learning_rate": 4.320186999756473e-06, "loss": 0.0101, "step": 615 }, { "clip_ratio": 0.0, "completion_length": 513.4927659034729, "epoch": 0.986645155507753, "grad_norm": 0.037584077566862106, "kl": 0.005333662033081055, "learning_rate": 4.315881045314878e-06, "loss": 0.007, "num_tokens": 376615645.0, "reward": 0.07087053899886087, "reward_std": 0.07342032523592934, "rewards/pure_accuracy_reward_math": 0.0708705370198004, "step": 616 }, { "clip_ratio": 0.0002886684330292155, "epoch": 0.9885572585223029, "grad_norm": 0.035872798413038254, "kl": 0.005288362503051758, "learning_rate": 4.311563657398056e-06, "loss": 0.007, "step": 617 }, { "clip_ratio": 0.0002961605097766551, "epoch": 0.9904693615368528, "grad_norm": 0.034989748150110245, "kl": 0.0052263736724853516, "learning_rate": 4.307234863189917e-06, "loss": 0.007, "step": 618 }, { "clip_ratio": 0.0003532402791392997, "epoch": 0.9923814645514027, "grad_norm": 0.0338488332927227, "kl": 0.005165576934814453, "learning_rate": 4.302894689946189e-06, "loss": 0.0069, "step": 619 }, { "clip_ratio": 0.00035387994120128496, "epoch": 0.9942935675659527, "grad_norm": 0.03370453417301178, "kl": 0.005126953125, "learning_rate": 4.298543164994249e-06, "loss": 0.0069, "step": 620 }, { "clip_ratio": 0.0, "completion_length": 526.433337688446, "epoch": 1.00191210301455, "grad_norm": 0.0355641208589077, "kl": 0.004958152770996094, "learning_rate": 4.294180315732946e-06, "loss": 0.0063, "num_tokens": 380233970.0, "reward": 0.05412946696742438, "reward_std": 0.06637858302565292, "rewards/pure_accuracy_reward_math": 0.0541294657450635, "step": 621 }, { "clip_ratio": 0.0002793830541349962, "epoch": 1.0038242060290998, "grad_norm": 0.034697938710451126, "kl": 0.004967689514160156, "learning_rate": 4.289806169632434e-06, "loss": 0.0063, "step": 622 }, { "clip_ratio": 0.00026950584020823953, "epoch": 1.0057363090436497, "grad_norm": 0.034267228096723557, "kl": 0.005029439926147461, "learning_rate": 4.285420754233992e-06, "loss": 0.0062, "step": 623 }, { "clip_ratio": 0.0002694177366606709, "epoch": 1.0076484120581997, "grad_norm": 0.03245500102639198, "kl": 0.005047798156738281, "learning_rate": 4.2810240971498594e-06, "loss": 0.0062, "step": 624 }, { "clip_ratio": 0.0002762260926942872, "epoch": 1.0095605150727496, "grad_norm": 0.03143523633480072, "kl": 0.005035400390625, "learning_rate": 4.276616226063055e-06, "loss": 0.0061, "step": 625 }, { "clip_ratio": 0.0, "completion_length": 528.094889163971, "epoch": 1.0114726180872995, "grad_norm": 0.03780335932970047, "kl": 0.005240440368652344, "learning_rate": 4.272197168727204e-06, "loss": 0.0082, "num_tokens": 383858818.0, "reward": 0.06891741388244554, "reward_std": 0.07891435397323221, "rewards/pure_accuracy_reward_math": 0.06891741207800806, "step": 626 }, { "clip_ratio": 0.0002971897219481434, "epoch": 1.0133847211018494, "grad_norm": 0.03676832467317581, "kl": 0.005240440368652344, "learning_rate": 4.267766952966369e-06, "loss": 0.0082, "step": 627 }, { "clip_ratio": 0.00032256075144232454, "epoch": 1.0152968241163993, "grad_norm": 0.03722486272454262, "kl": 0.005322933197021484, "learning_rate": 4.263325606674865e-06, "loss": 0.0082, "step": 628 }, { "clip_ratio": 0.00031109488622860226, "epoch": 1.0172089271309492, "grad_norm": 0.036808740347623825, "kl": 0.0054111480712890625, "learning_rate": 4.258873157817093e-06, "loss": 0.0081, "step": 629 }, { "clip_ratio": 0.00032292150183366175, "epoch": 1.0191210301454992, "grad_norm": 0.03518703579902649, "kl": 0.005442619323730469, "learning_rate": 4.254409634427356e-06, "loss": 0.008, "step": 630 }, { "clip_ratio": 0.0, "completion_length": 515.6958961486816, "epoch": 1.021033133160049, "grad_norm": 0.03399791195988655, "kl": 0.005387306213378906, "learning_rate": 4.249935064609692e-06, "loss": 0.0031, "num_tokens": 387438928.0, "reward": 0.06250000285217538, "reward_std": 0.06757478544022888, "rewards/pure_accuracy_reward_math": 0.06250000145519152, "step": 631 }, { "clip_ratio": 0.0002553542814212051, "epoch": 1.022945236174599, "grad_norm": 0.03381386399269104, "kl": 0.005375385284423828, "learning_rate": 4.245449476537685e-06, "loss": 0.0031, "step": 632 }, { "clip_ratio": 0.00023506408626872144, "epoch": 1.024857339189149, "grad_norm": 0.03337083011865616, "kl": 0.00537109375, "learning_rate": 4.2409528984543e-06, "loss": 0.003, "step": 633 }, { "clip_ratio": 0.0002632986112871549, "epoch": 1.0267694422036988, "grad_norm": 0.03213095664978027, "kl": 0.005321979522705078, "learning_rate": 4.236445358671696e-06, "loss": 0.003, "step": 634 }, { "clip_ratio": 0.00025607587781451, "epoch": 1.0286815452182487, "grad_norm": 0.03154142573475838, "kl": 0.005255699157714844, "learning_rate": 4.23192688557105e-06, "loss": 0.0029, "step": 635 }, { "clip_ratio": 0.0, "completion_length": 524.272346496582, "epoch": 1.0305936482327986, "grad_norm": 0.039318569004535675, "kl": 0.005155801773071289, "learning_rate": 4.2273975076023835e-06, "loss": 0.0075, "num_tokens": 391053556.0, "reward": 0.06473214598372579, "reward_std": 0.07401842583203688, "rewards/pure_accuracy_reward_math": 0.06473214412108064, "step": 636 }, { "clip_ratio": 0.0003024499371804268, "epoch": 1.0325057512473486, "grad_norm": 0.03726111724972725, "kl": 0.0050776004791259766, "learning_rate": 4.222857253284376e-06, "loss": 0.0075, "step": 637 }, { "clip_ratio": 0.0003151753968495541, "epoch": 1.0344178542618985, "grad_norm": 0.03595959022641182, "kl": 0.005060434341430664, "learning_rate": 4.218306151204188e-06, "loss": 0.0074, "step": 638 }, { "clip_ratio": 0.0003387899199083222, "epoch": 1.0363299572764482, "grad_norm": 0.03628028184175491, "kl": 0.005034923553466797, "learning_rate": 4.213744230017283e-06, "loss": 0.0074, "step": 639 }, { "clip_ratio": 0.00037899152403042535, "epoch": 1.038242060290998, "grad_norm": 0.03670131787657738, "kl": 0.005095720291137695, "learning_rate": 4.209171518447248e-06, "loss": 0.0073, "step": 640 }, { "clip_ratio": 0.0, "completion_length": 536.5907049179077, "epoch": 1.040154163305548, "grad_norm": 0.03938442841172218, "kl": 0.0051763057708740234, "learning_rate": 4.204588045285607e-06, "loss": 0.0022, "num_tokens": 394708581.0, "reward": 0.06333705710130744, "reward_std": 0.07792467664694414, "rewards/pure_accuracy_reward_math": 0.06333705500583164, "step": 641 }, { "clip_ratio": 0.0002767174905216052, "epoch": 1.042066266320098, "grad_norm": 0.037835828959941864, "kl": 0.005267143249511719, "learning_rate": 4.1999938393916424e-06, "loss": 0.0022, "step": 642 }, { "clip_ratio": 0.0003277845591469486, "epoch": 1.0439783693346478, "grad_norm": 0.03832162916660309, "kl": 0.005464792251586914, "learning_rate": 4.195388929692217e-06, "loss": 0.0022, "step": 643 }, { "clip_ratio": 0.00035426640954483446, "epoch": 1.0458904723491977, "grad_norm": 0.03823033347725868, "kl": 0.005482673645019531, "learning_rate": 4.190773345181587e-06, "loss": 0.0021, "step": 644 }, { "clip_ratio": 0.0003763593267649412, "epoch": 1.0478025753637477, "grad_norm": 0.036984797567129135, "kl": 0.005467653274536133, "learning_rate": 4.186147114921221e-06, "loss": 0.002, "step": 645 }, { "clip_ratio": 0.0, "completion_length": 528.9266424179077, "epoch": 1.0497146783782976, "grad_norm": 0.0355878509581089, "kl": 0.005333423614501953, "learning_rate": 4.18151026803962e-06, "loss": 0.0056, "num_tokens": 398334618.0, "reward": 0.06305803850409575, "reward_std": 0.06942774693015963, "rewards/pure_accuracy_reward_math": 0.06305803699069656, "step": 646 }, { "clip_ratio": 0.00024814905674475085, "epoch": 1.0516267813928475, "grad_norm": 0.034741513431072235, "kl": 0.005269289016723633, "learning_rate": 4.176862833732127e-06, "loss": 0.0056, "step": 647 }, { "clip_ratio": 0.00027503305113896204, "epoch": 1.0535388844073974, "grad_norm": 0.03375249356031418, "kl": 0.005173683166503906, "learning_rate": 4.1722048412607495e-06, "loss": 0.0055, "step": 648 }, { "clip_ratio": 0.0002895867207826086, "epoch": 1.0554509874219473, "grad_norm": 0.0341072678565979, "kl": 0.005132198333740234, "learning_rate": 4.167536319953976e-06, "loss": 0.0055, "step": 649 }, { "clip_ratio": 0.0003005371929134526, "epoch": 1.0573630904364972, "grad_norm": 0.033096957951784134, "kl": 0.005170345306396484, "learning_rate": 4.162857299206584e-06, "loss": 0.0054, "step": 650 }, { "clip_ratio": 0.0, "completion_length": 538.7528138160706, "epoch": 1.0592751934510471, "grad_norm": 0.03696604445576668, "kl": 0.0052814483642578125, "learning_rate": 4.158167808479461e-06, "loss": 0.0097, "num_tokens": 401997276.0, "reward": 0.05943080657743849, "reward_std": 0.07388583471765742, "rewards/pure_accuracy_reward_math": 0.05943080494762398, "step": 651 }, { "clip_ratio": 0.00029416859939601636, "epoch": 1.061187296465597, "grad_norm": 0.03565770015120506, "kl": 0.005290031433105469, "learning_rate": 4.153467877299419e-06, "loss": 0.0097, "step": 652 }, { "clip_ratio": 0.00029473524284640007, "epoch": 1.063099399480147, "grad_norm": 0.03546367585659027, "kl": 0.005368709564208984, "learning_rate": 4.148757535259004e-06, "loss": 0.0096, "step": 653 }, { "clip_ratio": 0.00032781071104182047, "epoch": 1.065011502494697, "grad_norm": 0.03601039946079254, "kl": 0.005382061004638672, "learning_rate": 4.144036812016317e-06, "loss": 0.0096, "step": 654 }, { "clip_ratio": 0.0003433626044397897, "epoch": 1.0669236055092468, "grad_norm": 0.035073794424533844, "kl": 0.0053446292877197266, "learning_rate": 4.139305737294818e-06, "loss": 0.0095, "step": 655 }, { "clip_ratio": 0.0, "completion_length": 520.1163725852966, "epoch": 1.0688357085237967, "grad_norm": 0.03852629289031029, "kl": 0.005383491516113281, "learning_rate": 4.134564340883148e-06, "loss": 0.0083, "num_tokens": 405593985.0, "reward": 0.06445312793948688, "reward_std": 0.07135464163729921, "rewards/pure_accuracy_reward_math": 0.06445312654250301, "step": 656 }, { "clip_ratio": 0.0002591365355897324, "epoch": 1.0707478115383466, "grad_norm": 0.03745557367801666, "kl": 0.0053327083587646484, "learning_rate": 4.129812652634936e-06, "loss": 0.0083, "step": 657 }, { "clip_ratio": 0.0003071958567772981, "epoch": 1.0726599145528966, "grad_norm": 0.037043727934360504, "kl": 0.00532078742980957, "learning_rate": 4.1250507024686115e-06, "loss": 0.0083, "step": 658 }, { "clip_ratio": 0.00029935286954696494, "epoch": 1.0745720175674465, "grad_norm": 0.03582773730158806, "kl": 0.005355358123779297, "learning_rate": 4.120278520367217e-06, "loss": 0.0082, "step": 659 }, { "clip_ratio": 0.0003111159166451216, "epoch": 1.0764841205819964, "grad_norm": 0.035313159227371216, "kl": 0.005402326583862305, "learning_rate": 4.115496136378219e-06, "loss": 0.0081, "step": 660 }, { "clip_ratio": 0.0, "completion_length": 509.2994108200073, "epoch": 1.0783962235965463, "grad_norm": 0.041104141622781754, "kl": 0.005465507507324219, "learning_rate": 4.110703580613321e-06, "loss": 0.0074, "num_tokens": 409156330.0, "reward": 0.0641741098370403, "reward_std": 0.08329231233801693, "rewards/pure_accuracy_reward_math": 0.06417410826543346, "step": 661 }, { "clip_ratio": 0.0003218170786567498, "epoch": 1.0803083266110962, "grad_norm": 0.03970121592283249, "kl": 0.005608558654785156, "learning_rate": 4.105900883248269e-06, "loss": 0.0074, "step": 662 }, { "clip_ratio": 0.00032362689415776913, "epoch": 1.0822204296256461, "grad_norm": 0.039676353335380554, "kl": 0.005734920501708984, "learning_rate": 4.101088074522667e-06, "loss": 0.0074, "step": 663 }, { "clip_ratio": 0.000323468098201829, "epoch": 1.084132532640196, "grad_norm": 0.03883183002471924, "kl": 0.005713939666748047, "learning_rate": 4.096265184739781e-06, "loss": 0.0073, "step": 664 }, { "clip_ratio": 0.00033196881122421473, "epoch": 1.086044635654746, "grad_norm": 0.037281692028045654, "kl": 0.0056934356689453125, "learning_rate": 4.091432244266354e-06, "loss": 0.0072, "step": 665 }, { "clip_ratio": 0.0, "completion_length": 522.48774766922, "epoch": 1.0879567386692959, "grad_norm": 0.037982553243637085, "kl": 0.005854606628417969, "learning_rate": 4.08658928353241e-06, "loss": 0.0086, "num_tokens": 412758914.0, "reward": 0.06835937799769454, "reward_std": 0.07526708859950304, "rewards/pure_accuracy_reward_math": 0.06835937630967237, "step": 666 }, { "clip_ratio": 0.0002976899445457093, "epoch": 1.0898688416838458, "grad_norm": 0.03663322329521179, "kl": 0.005788326263427734, "learning_rate": 4.081736333031066e-06, "loss": 0.0086, "step": 667 }, { "clip_ratio": 0.0002965517393818118, "epoch": 1.0917809446983957, "grad_norm": 0.03593512997031212, "kl": 0.005764484405517578, "learning_rate": 4.0768734233183376e-06, "loss": 0.0085, "step": 668 }, { "clip_ratio": 0.0003466513953753747, "epoch": 1.0936930477129456, "grad_norm": 0.03643948212265968, "kl": 0.005777835845947266, "learning_rate": 4.072000585012947e-06, "loss": 0.0085, "step": 669 }, { "clip_ratio": 0.00037185640462666925, "epoch": 1.0956051507274955, "grad_norm": 0.03601692244410515, "kl": 0.0058193206787109375, "learning_rate": 4.06711784879613e-06, "loss": 0.0084, "step": 670 }, { "clip_ratio": 0.0, "completion_length": 526.0530390739441, "epoch": 1.0975172537420455, "grad_norm": 0.03892623260617256, "kl": 0.005596637725830078, "learning_rate": 4.062225245411444e-06, "loss": 0.007, "num_tokens": 416383588.0, "reward": 0.061104913387680426, "reward_std": 0.07539348275167868, "rewards/pure_accuracy_reward_math": 0.06110491187428124, "step": 671 }, { "clip_ratio": 0.0003017952032280391, "epoch": 1.0994293567565951, "grad_norm": 0.0375184491276741, "kl": 0.0056912899017333984, "learning_rate": 4.057322805664576e-06, "loss": 0.007, "step": 672 }, { "clip_ratio": 0.0002928147856096075, "epoch": 1.1013414597711453, "grad_norm": 0.03731007128953934, "kl": 0.0057830810546875, "learning_rate": 4.0524105604231435e-06, "loss": 0.0069, "step": 673 }, { "clip_ratio": 0.000317500726794151, "epoch": 1.103253562785695, "grad_norm": 0.03885798528790474, "kl": 0.005819559097290039, "learning_rate": 4.047488540616503e-06, "loss": 0.0069, "step": 674 }, { "clip_ratio": 0.0003141532706649741, "epoch": 1.105165665800245, "grad_norm": 0.03583172708749771, "kl": 0.005753278732299805, "learning_rate": 4.042556777235558e-06, "loss": 0.0068, "step": 675 }, { "clip_ratio": 0.0, "completion_length": 523.9950060844421, "epoch": 1.1070777688147948, "grad_norm": 0.03652811422944069, "kl": 0.005724668502807617, "learning_rate": 4.037615301332559e-06, "loss": 0.0088, "num_tokens": 419993906.0, "reward": 0.061383931315504014, "reward_std": 0.07067021139664575, "rewards/pure_accuracy_reward_math": 0.06138392974389717, "step": 676 }, { "clip_ratio": 0.00028260578790195723, "epoch": 1.1089898718293447, "grad_norm": 0.035632383078336716, "kl": 0.0056421756744384766, "learning_rate": 4.0326641440209114e-06, "loss": 0.0088, "step": 677 }, { "clip_ratio": 0.0002882395116614589, "epoch": 1.1109019748438946, "grad_norm": 0.03453977406024933, "kl": 0.005593061447143555, "learning_rate": 4.027703336474979e-06, "loss": 0.0087, "step": 678 }, { "clip_ratio": 0.000319835560901538, "epoch": 1.1128140778584446, "grad_norm": 0.03415689244866371, "kl": 0.005594968795776367, "learning_rate": 4.022732909929883e-06, "loss": 0.0087, "step": 679 }, { "clip_ratio": 0.00033849146848297096, "epoch": 1.1147261808729945, "grad_norm": 0.03406994044780731, "kl": 0.005631208419799805, "learning_rate": 4.017752895681315e-06, "loss": 0.0086, "step": 680 }, { "clip_ratio": 0.0, "completion_length": 521.6057720184326, "epoch": 1.1166382838875444, "grad_norm": 0.06026715040206909, "kl": 0.005751848220825195, "learning_rate": 4.012763325085332e-06, "loss": 0.0067, "num_tokens": 423598941.0, "reward": 0.07198661082657054, "reward_std": 0.08763020345941186, "rewards/pure_accuracy_reward_math": 0.07198660844005644, "step": 681 }, { "clip_ratio": 0.00031779767027728667, "epoch": 1.1185503869020943, "grad_norm": 2.6160011291503906, "kl": 0.005651235580444336, "learning_rate": 4.0077642295581605e-06, "loss": 0.007, "step": 682 }, { "clip_ratio": 0.00035409004277653366, "epoch": 1.1204624899166442, "grad_norm": 6.490725994110107, "kl": 0.04636049270629883, "learning_rate": 4.002755640576002e-06, "loss": 0.0083, "step": 683 }, { "clip_ratio": 0.000386831109835839, "epoch": 1.1223745929311941, "grad_norm": 0.13183599710464478, "kl": 0.0063648223876953125, "learning_rate": 3.997737589674828e-06, "loss": 0.0067, "step": 684 }, { "clip_ratio": 0.00042002629169246575, "epoch": 1.124286695945744, "grad_norm": 61.113468170166016, "kl": 0.00571751594543457, "learning_rate": 3.992710108450192e-06, "loss": 0.0205, "step": 685 }, { "clip_ratio": 0.0, "completion_length": 534.679431438446, "epoch": 1.126198798960294, "grad_norm": 0.0341753326356411, "kl": 0.006865501403808594, "learning_rate": 3.987673228557017e-06, "loss": 0.0032, "num_tokens": 427249916.0, "reward": 0.056919645285233855, "reward_std": 0.06538890511728823, "rewards/pure_accuracy_reward_math": 0.05691964429570362, "step": 686 }, { "clip_ratio": 0.00022898520234093667, "epoch": 1.1281109019748439, "grad_norm": 0.03356679156422615, "kl": 0.006783246994018555, "learning_rate": 3.982626981709412e-06, "loss": 0.0032, "step": 687 }, { "clip_ratio": 0.00023695471924156664, "epoch": 1.1300230049893938, "grad_norm": 0.03283276781439781, "kl": 0.006662845611572266, "learning_rate": 3.977571399680457e-06, "loss": 0.0031, "step": 688 }, { "clip_ratio": 0.000234549945901108, "epoch": 1.1319351080039437, "grad_norm": 0.032041046768426895, "kl": 0.00657343864440918, "learning_rate": 3.972506514302013e-06, "loss": 0.0031, "step": 689 }, { "clip_ratio": 0.00026119674055280484, "epoch": 1.1338472110184936, "grad_norm": 0.03098335862159729, "kl": 0.006501674652099609, "learning_rate": 3.967432357464518e-06, "loss": 0.003, "step": 690 }, { "clip_ratio": 0.0, "completion_length": 533.4330596923828, "epoch": 1.1357593140330435, "grad_norm": 0.03648236393928528, "kl": 0.005389690399169922, "learning_rate": 3.962348961116786e-06, "loss": 0.0075, "num_tokens": 430894100.0, "reward": 0.059151788300368935, "reward_std": 0.06680402747588232, "rewards/pure_accuracy_reward_math": 0.059151787078008056, "step": 691 }, { "clip_ratio": 0.00024069582485708452, "epoch": 1.1376714170475934, "grad_norm": 0.03502041473984718, "kl": 0.005405902862548828, "learning_rate": 3.957256357265806e-06, "loss": 0.0075, "step": 692 }, { "clip_ratio": 0.00026108162376203836, "epoch": 1.1395835200621434, "grad_norm": 0.03438780456781387, "kl": 0.0054416656494140625, "learning_rate": 3.952154577976543e-06, "loss": 0.0075, "step": 693 }, { "clip_ratio": 0.0002536772994972125, "epoch": 1.1414956230766933, "grad_norm": 0.03388332575559616, "kl": 0.005480289459228516, "learning_rate": 3.947043655371734e-06, "loss": 0.0075, "step": 694 }, { "clip_ratio": 0.00027197748300977764, "epoch": 1.1434077260912432, "grad_norm": 0.03378571942448616, "kl": 0.005473136901855469, "learning_rate": 3.941923621631683e-06, "loss": 0.0074, "step": 695 }, { "clip_ratio": 0.0, "completion_length": 523.0050506591797, "epoch": 1.145319829105793, "grad_norm": 0.040138646960258484, "kl": 0.005397796630859375, "learning_rate": 3.936794508994062e-06, "loss": 0.0033, "num_tokens": 434502306.0, "reward": 0.07142857456346974, "reward_std": 0.08093377470504493, "rewards/pure_accuracy_reward_math": 0.07142857316648588, "step": 696 }, { "clip_ratio": 0.00026038982610998573, "epoch": 1.147231932120343, "grad_norm": 0.03855022042989731, "kl": 0.005437135696411133, "learning_rate": 3.931656349753709e-06, "loss": 0.0033, "step": 697 }, { "clip_ratio": 0.0002577857798655714, "epoch": 1.149144035134893, "grad_norm": 0.03805391117930412, "kl": 0.005386829376220703, "learning_rate": 3.9265091762624225e-06, "loss": 0.0032, "step": 698 }, { "clip_ratio": 0.0002938498616913421, "epoch": 1.1510561381494429, "grad_norm": 0.03830750659108162, "kl": 0.005461931228637695, "learning_rate": 3.921353020928756e-06, "loss": 0.0032, "step": 699 }, { "clip_ratio": 0.00026367085320089245, "epoch": 1.1529682411639928, "grad_norm": 0.03759397566318512, "kl": 0.0055010318756103516, "learning_rate": 3.916187916217818e-06, "loss": 0.0031, "step": 700 }, { "clip_ratio": 0.0, "completion_length": 532.7466740608215, "epoch": 1.1548803441785427, "grad_norm": 0.03618447855114937, "kl": 0.0054166316986083984, "learning_rate": 3.911013894651067e-06, "loss": 0.0066, "num_tokens": 438144462.0, "reward": 0.06501116344588809, "reward_std": 0.07457646209513769, "rewards/pure_accuracy_reward_math": 0.06501116175786592, "step": 701 }, { "clip_ratio": 0.00028753443712048465, "epoch": 1.1567924471930926, "grad_norm": 0.035918354988098145, "kl": 0.005413532257080078, "learning_rate": 3.905830988806101e-06, "loss": 0.0066, "step": 702 }, { "clip_ratio": 0.0002842856440565811, "epoch": 1.1587045502076425, "grad_norm": 0.03422370180487633, "kl": 0.005442619323730469, "learning_rate": 3.90063923131646e-06, "loss": 0.0066, "step": 703 }, { "clip_ratio": 0.0002819241568090547, "epoch": 1.1606166532221924, "grad_norm": 0.03359530121088028, "kl": 0.00537109375, "learning_rate": 3.895438654871416e-06, "loss": 0.0065, "step": 704 }, { "clip_ratio": 0.0003241457142166837, "epoch": 1.1625287562367423, "grad_norm": 0.033465541899204254, "kl": 0.0053484439849853516, "learning_rate": 3.890229292215773e-06, "loss": 0.0065, "step": 705 }, { "clip_ratio": 0.0, "completion_length": 526.7639741897583, "epoch": 1.1644408592512923, "grad_norm": 0.03731166943907738, "kl": 0.00535893440246582, "learning_rate": 3.885011176149647e-06, "loss": 0.0071, "num_tokens": 441760876.0, "reward": 0.06612723506987095, "reward_std": 0.06822534691309556, "rewards/pure_accuracy_reward_math": 0.06612723367288709, "step": 706 }, { "clip_ratio": 0.00025104734473302415, "epoch": 1.166352962265842, "grad_norm": 0.03429851680994034, "kl": 0.005263566970825195, "learning_rate": 3.879784339528277e-06, "loss": 0.0071, "step": 707 }, { "clip_ratio": 0.0002501190919019791, "epoch": 1.168265065280392, "grad_norm": 0.034958597272634506, "kl": 0.0052831172943115234, "learning_rate": 3.874548815261809e-06, "loss": 0.0071, "step": 708 }, { "clip_ratio": 0.0002633173795629773, "epoch": 1.1701771682949418, "grad_norm": 0.032111622393131256, "kl": 0.005318403244018555, "learning_rate": 3.869304636315085e-06, "loss": 0.007, "step": 709 }, { "clip_ratio": 0.00028521847832507774, "epoch": 1.172089271309492, "grad_norm": 0.03191748261451721, "kl": 0.005407810211181641, "learning_rate": 3.864051835707444e-06, "loss": 0.007, "step": 710 }, { "clip_ratio": 0.0, "completion_length": 522.3457269668579, "epoch": 1.1740013743240416, "grad_norm": 0.05126773193478584, "kl": 0.01187896728515625, "learning_rate": 3.85879044651251e-06, "loss": 0.0066, "num_tokens": 445370959.0, "reward": 0.06863839653669856, "reward_std": 0.07951865292852744, "rewards/pure_accuracy_reward_math": 0.06863839438301511, "step": 711 }, { "clip_ratio": 0.00028669004558423694, "epoch": 1.1759134773385915, "grad_norm": 0.051731474697589874, "kl": 0.011458396911621094, "learning_rate": 3.853520501857981e-06, "loss": 0.0066, "step": 712 }, { "clip_ratio": 0.0003143258599038745, "epoch": 1.1778255803531414, "grad_norm": 0.051190439611673355, "kl": 0.010621786117553711, "learning_rate": 3.848242034925429e-06, "loss": 0.0065, "step": 713 }, { "clip_ratio": 0.00033165596249773444, "epoch": 1.1797376833676914, "grad_norm": 0.04840007424354553, "kl": 0.009693622589111328, "learning_rate": 3.842955078950079e-06, "loss": 0.0064, "step": 714 }, { "clip_ratio": 0.00035113433239075675, "epoch": 1.1816497863822413, "grad_norm": 0.048264067620038986, "kl": 0.008889198303222656, "learning_rate": 3.837659667220612e-06, "loss": 0.0063, "step": 715 }, { "clip_ratio": 0.0, "completion_length": 547.5633645057678, "epoch": 1.1835618893967912, "grad_norm": 0.03458649665117264, "kl": 0.005284786224365234, "learning_rate": 3.832355833078945e-06, "loss": 0.0047, "num_tokens": 449069046.0, "reward": 0.05691964572179131, "reward_std": 0.06861072586616501, "rewards/pure_accuracy_reward_math": 0.05691964415018447, "step": 716 }, { "clip_ratio": 0.0002876185501463624, "epoch": 1.185473992411341, "grad_norm": 0.033646877855062485, "kl": 0.005215167999267578, "learning_rate": 3.82704360992003e-06, "loss": 0.0047, "step": 717 }, { "clip_ratio": 0.0003252235952686533, "epoch": 1.187386095425891, "grad_norm": 0.03455204889178276, "kl": 0.0051419734954833984, "learning_rate": 3.8217230311916365e-06, "loss": 0.0046, "step": 718 }, { "clip_ratio": 0.0003351885409870192, "epoch": 1.189298198440441, "grad_norm": 0.033362697809934616, "kl": 0.0050907135009765625, "learning_rate": 3.816394130394142e-06, "loss": 0.0046, "step": 719 }, { "clip_ratio": 0.00032723310141591355, "epoch": 1.1912103014549908, "grad_norm": 0.03211547061800957, "kl": 0.0051004886627197266, "learning_rate": 3.811056941080329e-06, "loss": 0.0045, "step": 720 }, { "clip_ratio": 0.0, "completion_length": 537.3167090415955, "epoch": 1.1931224044695408, "grad_norm": 0.03566175699234009, "kl": 0.0053424835205078125, "learning_rate": 3.805711496855161e-06, "loss": 0.009, "num_tokens": 452726381.0, "reward": 0.06054687776486389, "reward_std": 0.07264336961088702, "rewards/pure_accuracy_reward_math": 0.06054687677533366, "step": 721 }, { "clip_ratio": 0.00029346574888222676, "epoch": 1.1950345074840907, "grad_norm": 0.03476826474070549, "kl": 0.005379438400268555, "learning_rate": 3.800357831375583e-06, "loss": 0.009, "step": 722 }, { "clip_ratio": 0.00027920183202923, "epoch": 1.1969466104986406, "grad_norm": 0.03446114435791969, "kl": 0.005425691604614258, "learning_rate": 3.794995978350301e-06, "loss": 0.009, "step": 723 }, { "clip_ratio": 0.00031396149876172785, "epoch": 1.1988587135131905, "grad_norm": 0.0340140238404274, "kl": 0.005489826202392578, "learning_rate": 3.7896259715395727e-06, "loss": 0.0089, "step": 724 }, { "clip_ratio": 0.0002986833567888425, "epoch": 1.2007708165277404, "grad_norm": 0.03497212752699852, "kl": 0.005522489547729492, "learning_rate": 3.784247844754997e-06, "loss": 0.0088, "step": 725 }, { "clip_ratio": 0.0, "completion_length": 548.8044338226318, "epoch": 1.2026829195422903, "grad_norm": 0.04050953686237335, "kl": 0.005362510681152344, "learning_rate": 3.778861631859298e-06, "loss": 0.0112, "num_tokens": 456433388.0, "reward": 0.06696428879513405, "reward_std": 0.08140548242954537, "rewards/pure_accuracy_reward_math": 0.06696428728173487, "step": 726 }, { "clip_ratio": 0.0003468562302373357, "epoch": 1.2045950225568403, "grad_norm": 0.03805195167660713, "kl": 0.005377531051635742, "learning_rate": 3.7734673667661133e-06, "loss": 0.0112, "step": 727 }, { "clip_ratio": 0.00037477223943938043, "epoch": 1.2065071255713902, "grad_norm": 0.03666882589459419, "kl": 0.005417585372924805, "learning_rate": 3.7680650834397804e-06, "loss": 0.0112, "step": 728 }, { "clip_ratio": 0.0003945930936311015, "epoch": 1.20841922858594, "grad_norm": 0.03651399165391922, "kl": 0.005425453186035156, "learning_rate": 3.762654815895122e-06, "loss": 0.0111, "step": 729 }, { "clip_ratio": 0.0004650242010484362, "epoch": 1.21033133160049, "grad_norm": 0.03792130947113037, "kl": 0.005422115325927734, "learning_rate": 3.7572365981972335e-06, "loss": 0.0111, "step": 730 }, { "clip_ratio": 0.0, "completion_length": 528.6861305236816, "epoch": 1.21224343461504, "grad_norm": 0.0365571565926075, "kl": 0.005487203598022461, "learning_rate": 3.7518104644612663e-06, "loss": 0.0098, "num_tokens": 460061367.0, "reward": 0.06417411062284373, "reward_std": 0.07478918455308303, "rewards/pure_accuracy_reward_math": 0.06417410852736793, "step": 731 }, { "clip_ratio": 0.0002798708824229834, "epoch": 1.2141555376295898, "grad_norm": 0.036456115543842316, "kl": 0.005484342575073242, "learning_rate": 3.746376448852216e-06, "loss": 0.0098, "step": 732 }, { "clip_ratio": 0.0003001830394850913, "epoch": 1.2160676406441397, "grad_norm": 0.036120470613241196, "kl": 0.005544900894165039, "learning_rate": 3.740934585584702e-06, "loss": 0.0098, "step": 733 }, { "clip_ratio": 0.00028155883609315424, "epoch": 1.2179797436586897, "grad_norm": 0.03475060313940048, "kl": 0.005614042282104492, "learning_rate": 3.735484908922759e-06, "loss": 0.0097, "step": 734 }, { "clip_ratio": 0.00027523975251142474, "epoch": 1.2198918466732396, "grad_norm": 0.03388671204447746, "kl": 0.005706310272216797, "learning_rate": 3.730027453179617e-06, "loss": 0.0096, "step": 735 }, { "clip_ratio": 0.0, "completion_length": 518.6091203689575, "epoch": 1.2218039496877895, "grad_norm": 0.039098870009183884, "kl": 0.005930900573730469, "learning_rate": 3.7245622527174858e-06, "loss": 0.0072, "num_tokens": 463651718.0, "reward": 0.06277902098372579, "reward_std": 0.06552149687195197, "rewards/pure_accuracy_reward_math": 0.06277901912108064, "step": 736 }, { "clip_ratio": 0.000267848483247235, "epoch": 1.2237160527023394, "grad_norm": 0.03896670043468475, "kl": 0.005952358245849609, "learning_rate": 3.719089341947337e-06, "loss": 0.0072, "step": 737 }, { "clip_ratio": 0.00026333254504606884, "epoch": 1.2256281557168893, "grad_norm": 0.03838280960917473, "kl": 0.005873680114746094, "learning_rate": 3.7136087553286916e-06, "loss": 0.0072, "step": 738 }, { "clip_ratio": 0.0002850479507969794, "epoch": 1.2275402587314392, "grad_norm": 0.03708336502313614, "kl": 0.005741596221923828, "learning_rate": 3.7081205273694005e-06, "loss": 0.0071, "step": 739 }, { "clip_ratio": 0.00030947004142944934, "epoch": 1.2294523617459892, "grad_norm": 0.03616032376885414, "kl": 0.005689144134521484, "learning_rate": 3.702624692625427e-06, "loss": 0.007, "step": 740 }, { "clip_ratio": 0.0, "completion_length": 515.3027577400208, "epoch": 1.231364464760539, "grad_norm": 473.16009521484375, "kl": 7.4117608070373535, "learning_rate": 3.6971212857006277e-06, "loss": 0.3027, "num_tokens": 467231411.0, "reward": 0.07003348527359776, "reward_std": 0.07058388437144458, "rewards/pure_accuracy_reward_math": 0.07003348364378326, "step": 741 }, { "clip_ratio": 0.00048789031319529386, "epoch": 1.2332765677750888, "grad_norm": 15.009349822998047, "kl": 0.3277552127838135, "learning_rate": 3.6916103412465405e-06, "loss": 0.0207, "step": 742 }, { "clip_ratio": 0.0005436847095552366, "epoch": 1.235188670789639, "grad_norm": 34.010345458984375, "kl": 0.01839423179626465, "learning_rate": 3.6860918939621586e-06, "loss": 0.0299, "step": 743 }, { "clip_ratio": 0.000597593801558105, "epoch": 1.2371007738041886, "grad_norm": 13.507566452026367, "kl": 0.02814960479736328, "learning_rate": 3.6805659785937176e-06, "loss": 0.0188, "step": 744 }, { "clip_ratio": 0.0005609532486232638, "epoch": 1.2390128768187387, "grad_norm": 6.263442516326904, "kl": 0.20073914527893066, "learning_rate": 3.675032629934475e-06, "loss": 0.0163, "step": 745 }, { "clip_ratio": 0.0, "completion_length": 530.3340101242065, "epoch": 1.2409249798332884, "grad_norm": 0.051358480006456375, "kl": 0.0063626766204833984, "learning_rate": 3.6694918828244923e-06, "loss": 0.0095, "num_tokens": 470866344.0, "reward": 0.06333705666474998, "reward_std": 0.07530095760012046, "rewards/pure_accuracy_reward_math": 0.06333705509314314, "step": 746 }, { "clip_ratio": 0.00029982604212364095, "epoch": 1.2428370828478383, "grad_norm": 0.03713027015328407, "kl": 0.006081342697143555, "learning_rate": 3.6639437721504108e-06, "loss": 0.0095, "step": 747 }, { "clip_ratio": 0.0002941023938660692, "epoch": 1.2447491858623883, "grad_norm": 0.03500093147158623, "kl": 0.006156444549560547, "learning_rate": 3.65838833284524e-06, "loss": 0.0095, "step": 748 }, { "clip_ratio": 0.0002858027814340858, "epoch": 1.2466612888769382, "grad_norm": 0.03525420278310776, "kl": 0.006234169006347656, "learning_rate": 3.652825599888129e-06, "loss": 0.0094, "step": 749 }, { "clip_ratio": 0.0002950350276478275, "epoch": 1.248573391891488, "grad_norm": 0.03545543923974037, "kl": 0.006281852722167969, "learning_rate": 3.647255608304154e-06, "loss": 0.0093, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 530.79438829422, "epoch": 1.250485494906038, "grad_norm": 0.03711007162928581, "kl": 0.005670070648193359, "learning_rate": 3.641678393164092e-06, "loss": 0.0131, "num_tokens": 474505191.0, "reward": 0.07170759318978526, "reward_std": 0.07251697574974969, "rewards/pure_accuracy_reward_math": 0.0717075907450635, "step": 751 }, { "clip_ratio": 0.00029345202176500607, "epoch": 1.252397597920588, "grad_norm": 0.036423034965991974, "kl": 0.005608320236206055, "learning_rate": 3.636093989584204e-06, "loss": 0.0131, "step": 752 }, { "clip_ratio": 0.00030187425932126644, "epoch": 1.2543097009351378, "grad_norm": 0.03613322973251343, "kl": 0.005610466003417969, "learning_rate": 3.630502432726012e-06, "loss": 0.013, "step": 753 }, { "clip_ratio": 0.0003275847485610939, "epoch": 1.2562218039496877, "grad_norm": 0.03452349826693535, "kl": 0.0057184696197509766, "learning_rate": 3.6249037577960744e-06, "loss": 0.013, "step": 754 }, { "clip_ratio": 0.00034663524741063156, "epoch": 1.2581339069642377, "grad_norm": 0.034864939749240875, "kl": 0.005825996398925781, "learning_rate": 3.619298000045773e-06, "loss": 0.0129, "step": 755 }, { "clip_ratio": 0.0, "completion_length": 495.8814425468445, "epoch": 1.2600460099787876, "grad_norm": 528.279052734375, "kl": 9.193241596221924, "learning_rate": 3.6136851947710804e-06, "loss": 0.3749, "num_tokens": 478011678.0, "reward": 0.07979911071015522, "reward_std": 0.07470905361697078, "rewards/pure_accuracy_reward_math": 0.0797991082072258, "step": 756 }, { "clip_ratio": 0.00028275052295612113, "epoch": 1.2619581129933375, "grad_norm": 44.662696838378906, "kl": 1.2635960578918457, "learning_rate": 3.608065377312348e-06, "loss": 0.057, "step": 757 }, { "clip_ratio": 0.00029553008619132015, "epoch": 1.2638702160078874, "grad_norm": 4.775911808013916, "kl": 0.1474595069885254, "learning_rate": 3.6024385830540758e-06, "loss": 0.0123, "step": 758 }, { "clip_ratio": 0.00033371773997714627, "epoch": 1.2657823190224373, "grad_norm": 0.30982905626296997, "kl": 0.01830148696899414, "learning_rate": 3.5968048474246925e-06, "loss": 0.0071, "step": 759 }, { "clip_ratio": 0.0003257711730952906, "epoch": 1.2676944220369872, "grad_norm": 0.05356259644031525, "kl": 0.011959552764892578, "learning_rate": 3.591164205896332e-06, "loss": 0.0068, "step": 760 }, { "clip_ratio": 0.0, "completion_length": 519.9149203300476, "epoch": 1.2696065250515371, "grad_norm": 0.04138460382819176, "kl": 0.00600886344909668, "learning_rate": 3.585516693984612e-06, "loss": 0.0061, "num_tokens": 481610981.0, "reward": 0.07059152136207558, "reward_std": 0.07616424100706354, "rewards/pure_accuracy_reward_math": 0.07059151938301511, "step": 761 }, { "clip_ratio": 0.00029173931721970803, "epoch": 1.271518628066087, "grad_norm": 0.04057340323925018, "kl": 0.0059850215911865234, "learning_rate": 3.5798623472484074e-06, "loss": 0.006, "step": 762 }, { "clip_ratio": 0.00031361054851686276, "epoch": 1.273430731080637, "grad_norm": 0.0383637472987175, "kl": 0.005931377410888672, "learning_rate": 3.5742012012896273e-06, "loss": 0.006, "step": 763 }, { "clip_ratio": 0.000302841177983737, "epoch": 1.275342834095187, "grad_norm": 0.037009891122579575, "kl": 0.005960226058959961, "learning_rate": 3.5685332917529936e-06, "loss": 0.0059, "step": 764 }, { "clip_ratio": 0.00032496250122449055, "epoch": 1.2772549371097368, "grad_norm": 0.036052413284778595, "kl": 0.0060160160064697266, "learning_rate": 3.5628586543258116e-06, "loss": 0.0058, "step": 765 }, { "clip_ratio": 0.0, "completion_length": 505.19645166397095, "epoch": 1.2791670401242867, "grad_norm": 0.039108723402023315, "kl": 0.0060214996337890625, "learning_rate": 3.5571773247377495e-06, "loss": 0.0077, "num_tokens": 485155493.0, "reward": 0.06473214537254535, "reward_std": 0.07595151849091053, "rewards/pure_accuracy_reward_math": 0.06473214438301511, "step": 766 }, { "clip_ratio": 0.00031215936860462534, "epoch": 1.2810791431388366, "grad_norm": 0.03890209272503853, "kl": 0.0060939788818359375, "learning_rate": 3.5514893387606113e-06, "loss": 0.0078, "step": 767 }, { "clip_ratio": 0.00029648321913100517, "epoch": 1.2829912461533866, "grad_norm": 0.038266174495220184, "kl": 0.0061397552490234375, "learning_rate": 3.5457947322081126e-06, "loss": 0.0077, "step": 768 }, { "clip_ratio": 0.0002988063008615427, "epoch": 1.2849033491679365, "grad_norm": 0.03760776296257973, "kl": 0.006152629852294922, "learning_rate": 3.5400935409356534e-06, "loss": 0.0076, "step": 769 }, { "clip_ratio": 0.00032748817852734646, "epoch": 1.2868154521824864, "grad_norm": 0.037058234214782715, "kl": 0.006194591522216797, "learning_rate": 3.5343858008400955e-06, "loss": 0.0076, "step": 770 }, { "clip_ratio": 0.0, "completion_length": 513.085681438446, "epoch": 1.2887275551970363, "grad_norm": 0.04272163286805153, "kl": 0.006904125213623047, "learning_rate": 3.5286715478595335e-06, "loss": 0.0066, "num_tokens": 488731916.0, "reward": 0.06668527112924494, "reward_std": 0.07779828266939148, "rewards/pure_accuracy_reward_math": 0.0666852695576381, "step": 771 }, { "clip_ratio": 0.0002989328136209224, "epoch": 1.2906396582115862, "grad_norm": 0.039898019284009933, "kl": 0.006760597229003906, "learning_rate": 3.52295081797307e-06, "loss": 0.0066, "step": 772 }, { "clip_ratio": 0.0003237332452385999, "epoch": 1.2925517612261361, "grad_norm": 0.0380416214466095, "kl": 0.006653547286987305, "learning_rate": 3.5172236472005866e-06, "loss": 0.0065, "step": 773 }, { "clip_ratio": 0.0004160679777100995, "epoch": 1.294463864240686, "grad_norm": 0.03860335052013397, "kl": 0.006639003753662109, "learning_rate": 3.511490071602523e-06, "loss": 0.0065, "step": 774 }, { "clip_ratio": 0.0004345110206713798, "epoch": 1.2963759672552357, "grad_norm": 0.0405069962143898, "kl": 0.006697654724121094, "learning_rate": 3.505750127279643e-06, "loss": 0.0064, "step": 775 }, { "clip_ratio": 0.0, "completion_length": 529.7695565223694, "epoch": 1.2982880702697859, "grad_norm": 0.040585048496723175, "kl": 0.006101369857788086, "learning_rate": 3.500003850372811e-06, "loss": 0.0043, "num_tokens": 492363370.0, "reward": 0.07477678926079534, "reward_std": 0.08466117118950933, "rewards/pure_accuracy_reward_math": 0.07477678704890423, "step": 776 }, { "clip_ratio": 0.0003347315081327906, "epoch": 1.3002001732843356, "grad_norm": 0.039613205939531326, "kl": 0.0060977935791015625, "learning_rate": 3.4942512770627655e-06, "loss": 0.0043, "step": 777 }, { "clip_ratio": 0.0003803396672310555, "epoch": 1.3021122762988857, "grad_norm": 0.03965132310986519, "kl": 0.006110668182373047, "learning_rate": 3.4884924435698875e-06, "loss": 0.0042, "step": 778 }, { "clip_ratio": 0.00035469116983222193, "epoch": 1.3040243793134354, "grad_norm": 0.038701362907886505, "kl": 0.005974292755126953, "learning_rate": 3.482727386153974e-06, "loss": 0.0041, "step": 779 }, { "clip_ratio": 0.00038596760680320585, "epoch": 1.3059364823279855, "grad_norm": 0.03767050802707672, "kl": 0.0059070587158203125, "learning_rate": 3.4769561411140123e-06, "loss": 0.0041, "step": 780 }, { "clip_ratio": 0.0, "completion_length": 528.3593993186951, "epoch": 1.3078485853425352, "grad_norm": 0.04520969092845917, "kl": 0.015022039413452148, "learning_rate": 3.471178744787948e-06, "loss": 0.0107, "num_tokens": 495988466.0, "reward": 0.07449777098372579, "reward_std": 0.08161820413079113, "rewards/pure_accuracy_reward_math": 0.07449777016881853, "step": 781 }, { "clip_ratio": 0.00032587463357458546, "epoch": 1.3097606883570854, "grad_norm": 0.04337235167622566, "kl": 0.01485586166381836, "learning_rate": 3.465395233552458e-06, "loss": 0.0107, "step": 782 }, { "clip_ratio": 0.00031156001216459117, "epoch": 1.311672791371635, "grad_norm": 0.04306100681424141, "kl": 0.014668941497802734, "learning_rate": 3.459605643822721e-06, "loss": 0.0106, "step": 783 }, { "clip_ratio": 0.00031179932597069637, "epoch": 1.313584894386185, "grad_norm": 0.04292943701148033, "kl": 0.014333724975585938, "learning_rate": 3.4538100120521884e-06, "loss": 0.0106, "step": 784 }, { "clip_ratio": 0.00034586368491318353, "epoch": 1.315496997400735, "grad_norm": 0.04207218065857887, "kl": 0.013885498046875, "learning_rate": 3.4480083747323527e-06, "loss": 0.0105, "step": 785 }, { "clip_ratio": 0.0, "completion_length": 521.3471217155457, "epoch": 1.3174091004152848, "grad_norm": 0.04057139530777931, "kl": 0.006026268005371094, "learning_rate": 3.4422007683925224e-06, "loss": 0.0119, "num_tokens": 499590878.0, "reward": 0.08091518239234574, "reward_std": 0.08763020328478888, "rewards/pure_accuracy_reward_math": 0.08091518023866229, "step": 786 }, { "clip_ratio": 0.00030802900647586284, "epoch": 1.3193212034298347, "grad_norm": 0.039306215941905975, "kl": 0.00603485107421875, "learning_rate": 3.436387229599587e-06, "loss": 0.0119, "step": 787 }, { "clip_ratio": 0.00034579116845634417, "epoch": 1.3212333064443846, "grad_norm": 0.03839893266558647, "kl": 0.006104469299316406, "learning_rate": 3.4305677949577915e-06, "loss": 0.0118, "step": 788 }, { "clip_ratio": 0.00036078316020393686, "epoch": 1.3231454094589346, "grad_norm": 0.03700988367199898, "kl": 0.006115436553955078, "learning_rate": 3.4247425011084993e-06, "loss": 0.0118, "step": 789 }, { "clip_ratio": 0.0003916456239494437, "epoch": 1.3250575124734845, "grad_norm": 0.03749685734510422, "kl": 0.006115436553955078, "learning_rate": 3.418911384729971e-06, "loss": 0.0117, "step": 790 }, { "clip_ratio": 0.0, "completion_length": 502.7112407684326, "epoch": 1.3269696154880344, "grad_norm": 0.03917763754725456, "kl": 0.009302139282226562, "learning_rate": 3.413074482537123e-06, "loss": 0.0077, "num_tokens": 503128079.0, "reward": 0.07059152112924494, "reward_std": 0.07702752505429089, "rewards/pure_accuracy_reward_math": 0.07059151944122277, "step": 791 }, { "clip_ratio": 0.0002787132019079763, "epoch": 1.3288817185025843, "grad_norm": 0.03894754871726036, "kl": 0.009203910827636719, "learning_rate": 3.4072318312813044e-06, "loss": 0.0077, "step": 792 }, { "clip_ratio": 0.00031091465683630304, "epoch": 1.3307938215171342, "grad_norm": 0.03774462640285492, "kl": 0.008921146392822266, "learning_rate": 3.4013834677500612e-06, "loss": 0.0077, "step": 793 }, { "clip_ratio": 0.00030987418773520403, "epoch": 1.3327059245316841, "grad_norm": 0.03737964481115341, "kl": 0.008791923522949219, "learning_rate": 3.395529428766907e-06, "loss": 0.0076, "step": 794 }, { "clip_ratio": 0.0003597256319380904, "epoch": 1.334618027546234, "grad_norm": 0.03793202340602875, "kl": 0.008593559265136719, "learning_rate": 3.3896697511910898e-06, "loss": 0.0075, "step": 795 }, { "clip_ratio": 0.0, "completion_length": 516.8552160263062, "epoch": 1.336530130560784, "grad_norm": 0.03877223655581474, "kl": 0.005873441696166992, "learning_rate": 3.3838044719173603e-06, "loss": 0.0086, "num_tokens": 506711636.0, "reward": 0.06529018195578828, "reward_std": 0.06942774722119793, "rewards/pure_accuracy_reward_math": 0.06529017997672781, "step": 796 }, { "clip_ratio": 0.0002862633294853367, "epoch": 1.3384422335753339, "grad_norm": 0.0376199446618557, "kl": 0.005820274353027344, "learning_rate": 3.377933627875739e-06, "loss": 0.0086, "step": 797 }, { "clip_ratio": 0.0002861461452994263, "epoch": 1.3403543365898838, "grad_norm": 0.036890070885419846, "kl": 0.005822658538818359, "learning_rate": 3.3720572560312854e-06, "loss": 0.0086, "step": 798 }, { "clip_ratio": 0.0003201163677317709, "epoch": 1.3422664396044337, "grad_norm": 0.03669756278395653, "kl": 0.005821704864501953, "learning_rate": 3.366175393383863e-06, "loss": 0.0085, "step": 799 }, { "clip_ratio": 0.0003494162402830625, "epoch": 1.3441785426189836, "grad_norm": 0.03721420839428902, "kl": 0.005818843841552734, "learning_rate": 3.360288076967909e-06, "loss": 0.0084, "step": 800 }, { "clip_ratio": 0.0, "completion_length": 505.6105146408081, "epoch": 1.3460906456335335, "grad_norm": 0.040034398436546326, "kl": 0.006266117095947266, "learning_rate": 3.3543953438521983e-06, "loss": 0.0091, "num_tokens": 510255728.0, "reward": 0.0675223250000272, "reward_std": 0.07577886182116345, "rewards/pure_accuracy_reward_math": 0.06752232249709778, "step": 801 }, { "clip_ratio": 0.00027677676553139463, "epoch": 1.3480027486480834, "grad_norm": 0.038657769560813904, "kl": 0.006215572357177734, "learning_rate": 3.3484972311396114e-06, "loss": 0.0091, "step": 802 }, { "clip_ratio": 0.0002909586188479807, "epoch": 1.3499148516626334, "grad_norm": 0.036970507353544235, "kl": 0.006129741668701172, "learning_rate": 3.342593775966901e-06, "loss": 0.009, "step": 803 }, { "clip_ratio": 0.0003427068459700422, "epoch": 1.3518269546771833, "grad_norm": 0.03707785904407501, "kl": 0.006056785583496094, "learning_rate": 3.3366850155044595e-06, "loss": 0.009, "step": 804 }, { "clip_ratio": 0.00038909467849634893, "epoch": 1.3537390576917332, "grad_norm": 0.03700149059295654, "kl": 0.005985736846923828, "learning_rate": 3.33077098695608e-06, "loss": 0.0089, "step": 805 }, { "clip_ratio": 0.0, "completion_length": 527.0212287902832, "epoch": 1.355651160706283, "grad_norm": 0.04373861476778984, "kl": 0.005824565887451172, "learning_rate": 3.3248517275587292e-06, "loss": 0.0094, "num_tokens": 513879112.0, "reward": 0.0703125029685907, "reward_std": 0.08085364429280162, "rewards/pure_accuracy_reward_math": 0.07031250145519152, "step": 806 }, { "clip_ratio": 0.00031092700191948097, "epoch": 1.357563263720833, "grad_norm": 0.04273909702897072, "kl": 0.0058460235595703125, "learning_rate": 3.318927274582307e-06, "loss": 0.0094, "step": 807 }, { "clip_ratio": 0.0003359753473546334, "epoch": 1.359475366735383, "grad_norm": 0.04217194393277168, "kl": 0.005980014801025391, "learning_rate": 3.312997665329414e-06, "loss": 0.0093, "step": 808 }, { "clip_ratio": 0.0003392697701940506, "epoch": 1.3613874697499329, "grad_norm": 0.04189891368150711, "kl": 0.0061492919921875, "learning_rate": 3.3070629371351176e-06, "loss": 0.0093, "step": 809 }, { "clip_ratio": 0.0003985974152556082, "epoch": 1.3632995727644825, "grad_norm": 0.04113880172371864, "kl": 0.0062618255615234375, "learning_rate": 3.3011231273667155e-06, "loss": 0.0092, "step": 810 }, { "clip_ratio": 0.0, "completion_length": 523.8002490997314, "epoch": 1.3652116757790327, "grad_norm": 0.039511535316705704, "kl": 0.007502555847167969, "learning_rate": 3.295178273423501e-06, "loss": 0.0065, "num_tokens": 517489928.0, "reward": 0.06835937840514816, "reward_std": 0.0761642413563095, "rewards/pure_accuracy_reward_math": 0.06835937636788003, "step": 811 }, { "clip_ratio": 0.00033993283830113796, "epoch": 1.3671237787935824, "grad_norm": 0.03911852091550827, "kl": 0.0074634552001953125, "learning_rate": 3.2892284127365277e-06, "loss": 0.0065, "step": 812 }, { "clip_ratio": 0.00029188678922764666, "epoch": 1.3690358818081325, "grad_norm": 0.038789719343185425, "kl": 0.007461071014404297, "learning_rate": 3.2832735827683733e-06, "loss": 0.0064, "step": 813 }, { "clip_ratio": 0.00031692377649505943, "epoch": 1.3709479848226822, "grad_norm": 0.03795900195837021, "kl": 0.007411956787109375, "learning_rate": 3.2773138210129037e-06, "loss": 0.0063, "step": 814 }, { "clip_ratio": 0.0003394908647464945, "epoch": 1.3728600878372323, "grad_norm": 0.03683575242757797, "kl": 0.0073795318603515625, "learning_rate": 3.2713491649950375e-06, "loss": 0.0063, "step": 815 }, { "clip_ratio": 0.0, "completion_length": 527.1018648147583, "epoch": 1.374772190851782, "grad_norm": 0.036948177963495255, "kl": 0.0058441162109375, "learning_rate": 3.26537965227051e-06, "loss": 0.0062, "num_tokens": 521113961.0, "reward": 0.06333705675206147, "reward_std": 0.07041122711962089, "rewards/pure_accuracy_reward_math": 0.06333705494762398, "step": 816 }, { "clip_ratio": 0.0002517415915690435, "epoch": 1.3766842938663322, "grad_norm": 0.03634682297706604, "kl": 0.005847454071044922, "learning_rate": 3.2594053204256344e-06, "loss": 0.0062, "step": 817 }, { "clip_ratio": 0.00027403954436522326, "epoch": 1.3785963968808819, "grad_norm": 0.034690070897340775, "kl": 0.005870342254638672, "learning_rate": 3.253426207077069e-06, "loss": 0.0062, "step": 818 }, { "clip_ratio": 0.0002389855896467452, "epoch": 1.3805084998954318, "grad_norm": 0.034505974501371384, "kl": 0.005900382995605469, "learning_rate": 3.2474423498715772e-06, "loss": 0.0061, "step": 819 }, { "clip_ratio": 0.000287152882663122, "epoch": 1.3824206029099817, "grad_norm": 0.03524321690201759, "kl": 0.005913734436035156, "learning_rate": 3.241453786485792e-06, "loss": 0.0061, "step": 820 }, { "clip_ratio": 0.0, "completion_length": 509.66520071029663, "epoch": 1.3843327059245316, "grad_norm": 0.039214182645082474, "kl": 0.006892681121826172, "learning_rate": 3.2354605546259777e-06, "loss": 0.0032, "num_tokens": 524677265.0, "reward": 0.07979911041911691, "reward_std": 0.07959878293331712, "rewards/pure_accuracy_reward_math": 0.07979910867288709, "step": 821 }, { "clip_ratio": 0.0002965318878409562, "epoch": 1.3862448089390815, "grad_norm": 0.037640273571014404, "kl": 0.0067348480224609375, "learning_rate": 3.2294626920277928e-06, "loss": 0.0031, "step": 822 }, { "clip_ratio": 0.00035153192868619954, "epoch": 1.3881569119536314, "grad_norm": 0.038182858377695084, "kl": 0.006665706634521484, "learning_rate": 3.2234602364560543e-06, "loss": 0.0031, "step": 823 }, { "clip_ratio": 0.0003338070732752385, "epoch": 1.3900690149681814, "grad_norm": 0.038163840770721436, "kl": 0.00667572021484375, "learning_rate": 3.2174532257044957e-06, "loss": 0.003, "step": 824 }, { "clip_ratio": 0.0003418834434683049, "epoch": 1.3919811179827313, "grad_norm": 0.03628409281373024, "kl": 0.0067596435546875, "learning_rate": 3.2114416975955347e-06, "loss": 0.003, "step": 825 }, { "clip_ratio": 0.0, "completion_length": 519.1027045249939, "epoch": 1.3938932209972812, "grad_norm": 0.037393856793642044, "kl": 0.005987644195556641, "learning_rate": 3.20542568998003e-06, "loss": 0.0097, "num_tokens": 528270425.0, "reward": 0.07784598556463607, "reward_std": 0.0774529695045203, "rewards/pure_accuracy_reward_math": 0.07784598329453729, "step": 826 }, { "clip_ratio": 0.0002753000243274073, "epoch": 1.395805324011831, "grad_norm": 0.03632253408432007, "kl": 0.00603485107421875, "learning_rate": 3.199405240737045e-06, "loss": 0.0097, "step": 827 }, { "clip_ratio": 0.00028145005671831314, "epoch": 1.397717427026381, "grad_norm": 0.035320475697517395, "kl": 0.0060482025146484375, "learning_rate": 3.1933803877736103e-06, "loss": 0.0097, "step": 828 }, { "clip_ratio": 0.00029773840276448027, "epoch": 1.399629530040931, "grad_norm": 0.03532904013991356, "kl": 0.006001472473144531, "learning_rate": 3.187351169024483e-06, "loss": 0.0096, "step": 829 }, { "clip_ratio": 0.0003131672060590063, "epoch": 1.4015416330554809, "grad_norm": 0.03497399017214775, "kl": 0.0059299468994140625, "learning_rate": 3.181317622451909e-06, "loss": 0.0095, "step": 830 }, { "clip_ratio": 0.0, "completion_length": 519.5547099113464, "epoch": 1.4034537360700308, "grad_norm": 0.03596203401684761, "kl": 0.005957126617431641, "learning_rate": 3.1752797860453854e-06, "loss": 0.0099, "num_tokens": 531863545.0, "reward": 0.06584821754950099, "reward_std": 0.07359298237133771, "rewards/pure_accuracy_reward_math": 0.06584821580327116, "step": 831 }, { "clip_ratio": 0.0002871401754873659, "epoch": 1.4053658390845807, "grad_norm": 0.03569914028048515, "kl": 0.005918025970458984, "learning_rate": 3.169237697821417e-06, "loss": 0.0099, "step": 832 }, { "clip_ratio": 0.0002649255456503852, "epoch": 1.4072779420991306, "grad_norm": 0.035189539194107056, "kl": 0.005944252014160156, "learning_rate": 3.163191395823281e-06, "loss": 0.0098, "step": 833 }, { "clip_ratio": 0.0002522150609252094, "epoch": 1.4091900451136805, "grad_norm": 0.03371162712574005, "kl": 0.006028652191162109, "learning_rate": 3.1571409181207867e-06, "loss": 0.0098, "step": 834 }, { "clip_ratio": 0.00028182740913962334, "epoch": 1.4111021481282304, "grad_norm": 0.03411802276968956, "kl": 0.006129264831542969, "learning_rate": 3.151086302810035e-06, "loss": 0.0097, "step": 835 }, { "clip_ratio": 0.0, "completion_length": 509.0455017089844, "epoch": 1.4130142511427803, "grad_norm": 0.042647283524274826, "kl": 0.006505012512207031, "learning_rate": 3.1450275880131782e-06, "loss": 0.0051, "num_tokens": 535420068.0, "reward": 0.06919643201399595, "reward_std": 0.06989945442182943, "rewards/pure_accuracy_reward_math": 0.06919642980210483, "step": 836 }, { "clip_ratio": 0.0002792542761653749, "epoch": 1.4149263541573303, "grad_norm": 0.03879564628005028, "kl": 0.006262302398681641, "learning_rate": 3.1389648118781795e-06, "loss": 0.0051, "step": 837 }, { "clip_ratio": 0.00032867032479089175, "epoch": 1.4168384571718802, "grad_norm": 0.03632555902004242, "kl": 0.006078004837036133, "learning_rate": 3.132898012578577e-06, "loss": 0.005, "step": 838 }, { "clip_ratio": 0.0003705890379706034, "epoch": 1.41875056018643, "grad_norm": 0.03687159717082977, "kl": 0.0058705806732177734, "learning_rate": 3.1268272283132374e-06, "loss": 0.005, "step": 839 }, { "clip_ratio": 0.00039090512018447043, "epoch": 1.42066266320098, "grad_norm": 0.03681857883930206, "kl": 0.005755186080932617, "learning_rate": 3.1207524973061183e-06, "loss": 0.0049, "step": 840 }, { "clip_ratio": 0.0, "completion_length": 528.0865178108215, "epoch": 1.42257476621553, "grad_norm": 0.077212393283844, "kl": 0.006708621978759766, "learning_rate": 3.1146738578060293e-06, "loss": 0.0034, "num_tokens": 539042994.0, "reward": 0.05468750235741027, "reward_std": 0.06221334764268249, "rewards/pure_accuracy_reward_math": 0.05468750130967237, "step": 841 }, { "clip_ratio": 0.00023407521496210393, "epoch": 1.4244868692300798, "grad_norm": 0.03766750544309616, "kl": 0.005887508392333984, "learning_rate": 3.108591348086388e-06, "loss": 0.0034, "step": 842 }, { "clip_ratio": 0.00021864835269980176, "epoch": 1.4263989722446297, "grad_norm": 0.03435171768069267, "kl": 0.0057353973388671875, "learning_rate": 3.102505006444981e-06, "loss": 0.0033, "step": 843 }, { "clip_ratio": 0.0002327330819866802, "epoch": 1.4283110752591797, "grad_norm": 0.03385370597243309, "kl": 0.005730628967285156, "learning_rate": 3.096414871203721e-06, "loss": 0.0033, "step": 844 }, { "clip_ratio": 0.00025595308994752486, "epoch": 1.4302231782737296, "grad_norm": 0.0320701077580452, "kl": 0.005660533905029297, "learning_rate": 3.0903209807084085e-06, "loss": 0.0032, "step": 845 }, { "clip_ratio": 0.0, "completion_length": 532.2009177207947, "epoch": 1.4321352812882795, "grad_norm": 0.035687774419784546, "kl": 0.006323099136352539, "learning_rate": 3.0842233733284866e-06, "loss": 0.0055, "num_tokens": 542686090.0, "reward": 0.06389509252039716, "reward_std": 0.06839800346642733, "rewards/pure_accuracy_reward_math": 0.06389509059954435, "step": 846 }, { "clip_ratio": 0.0002455309293054597, "epoch": 1.4340473843028292, "grad_norm": 0.03433489799499512, "kl": 0.006294965744018555, "learning_rate": 3.078122087456802e-06, "loss": 0.0055, "step": 847 }, { "clip_ratio": 0.0003179283777399178, "epoch": 1.4359594873173793, "grad_norm": 0.03377856686711311, "kl": 0.00630497932434082, "learning_rate": 3.072017161509364e-06, "loss": 0.0054, "step": 848 }, { "clip_ratio": 0.00030606188772708265, "epoch": 1.437871590331929, "grad_norm": 0.03379327058792114, "kl": 0.006325483322143555, "learning_rate": 3.065908633925099e-06, "loss": 0.0054, "step": 849 }, { "clip_ratio": 0.00029904921905199444, "epoch": 1.4397836933464792, "grad_norm": 0.03319833427667618, "kl": 0.006340742111206055, "learning_rate": 3.0597965431656125e-06, "loss": 0.0053, "step": 850 }, { "clip_ratio": 0.0, "completion_length": 520.9991841316223, "epoch": 1.00191210301455, "grad_norm": 0.03730909898877144, "kl": 0.005851268768310547, "learning_rate": 3.0536809277149433e-06, "loss": 0.0058, "num_tokens": 3602593.0, "reward": 0.061662948777666315, "reward_std": 0.0712745109340176, "rewards/pure_accuracy_reward_math": 0.06166294767172076, "step": 851 }, { "clip_ratio": 0.0002445870232463676, "epoch": 1.0038242060290998, "grad_norm": 0.036420926451683044, "kl": 0.005807399749755859, "learning_rate": 3.047561826079324e-06, "loss": 0.0057, "step": 852 }, { "clip_ratio": 0.0002342841784184202, "epoch": 1.0057363090436497, "grad_norm": 0.03534744307398796, "kl": 0.005809783935546875, "learning_rate": 3.041439276786937e-06, "loss": 0.0057, "step": 853 }, { "clip_ratio": 0.0003130897791834286, "epoch": 1.0076484120581997, "grad_norm": 0.03456578403711319, "kl": 0.005836963653564453, "learning_rate": 3.0353133183876745e-06, "loss": 0.0056, "step": 854 }, { "clip_ratio": 0.0003235736477336104, "epoch": 1.0095605150727496, "grad_norm": 0.03683493658900261, "kl": 0.00588226318359375, "learning_rate": 3.0291839894528907e-06, "loss": 0.0056, "step": 855 }, { "clip_ratio": 0.0, "completion_length": 529.2422127723694, "epoch": 1.0114726180872995, "grad_norm": 3.6328346729278564, "kl": 0.07409882545471191, "learning_rate": 3.023051328575164e-06, "loss": 0.0092, "num_tokens": 7231613.0, "reward": 0.06696428847499192, "reward_std": 0.07320140569936484, "rewards/pure_accuracy_reward_math": 0.06696428725263104, "step": 856 }, { "clip_ratio": 0.0002944787788692338, "epoch": 1.0133847211018494, "grad_norm": 0.23805810511112213, "kl": 0.01258087158203125, "learning_rate": 3.016915374368052e-06, "loss": 0.0068, "step": 857 }, { "clip_ratio": 0.000328014534943577, "epoch": 1.0152968241163993, "grad_norm": 0.038860052824020386, "kl": 0.008163928985595703, "learning_rate": 3.0107761654658464e-06, "loss": 0.0066, "step": 858 }, { "clip_ratio": 0.00033978425187797257, "epoch": 1.0172089271309492, "grad_norm": 0.037539608776569366, "kl": 0.008237600326538086, "learning_rate": 3.0046337405233334e-06, "loss": 0.0065, "step": 859 }, { "clip_ratio": 0.0003289994185706746, "epoch": 1.0191210301454992, "grad_norm": 0.03649570420384407, "kl": 0.008342981338500977, "learning_rate": 2.9984881382155484e-06, "loss": 0.0065, "step": 860 }, { "clip_ratio": 0.0, "completion_length": 539.7709541320801, "epoch": 1.021033133160049, "grad_norm": 0.03506062552332878, "kl": 0.0056056976318359375, "learning_rate": 2.9923393972375337e-06, "loss": 0.0075, "num_tokens": 10898500.0, "reward": 0.06389509155997075, "reward_std": 0.07427741104038432, "rewards/pure_accuracy_reward_math": 0.06389509086147882, "step": 861 }, { "clip_ratio": 0.00025894983372154456, "epoch": 1.022945236174599, "grad_norm": 0.03387964144349098, "kl": 0.005673408508300781, "learning_rate": 2.986187556304091e-06, "loss": 0.0075, "step": 862 }, { "clip_ratio": 0.00026048227840647087, "epoch": 1.024857339189149, "grad_norm": 0.0339200459420681, "kl": 0.005715370178222656, "learning_rate": 2.9800326541495427e-06, "loss": 0.0074, "step": 863 }, { "clip_ratio": 0.000286817725225319, "epoch": 1.0267694422036988, "grad_norm": 0.033578090369701385, "kl": 0.0057220458984375, "learning_rate": 2.973874729527486e-06, "loss": 0.0074, "step": 864 }, { "clip_ratio": 0.00031288620994018856, "epoch": 1.0286815452182487, "grad_norm": 0.03253786265850067, "kl": 0.005726814270019531, "learning_rate": 2.967713821210547e-06, "loss": 0.0073, "step": 865 }, { "clip_ratio": 0.0, "completion_length": 532.484959602356, "epoch": 1.0305936482327986, "grad_norm": 0.040393006056547165, "kl": 0.005712032318115234, "learning_rate": 2.961549967990139e-06, "loss": 0.0094, "num_tokens": 14539070.0, "reward": 0.0700334852153901, "reward_std": 0.07968511193757877, "rewards/pure_accuracy_reward_math": 0.07003348364378326, "step": 866 }, { "clip_ratio": 0.00034418605622477116, "epoch": 1.0325057512473486, "grad_norm": 0.03829828277230263, "kl": 0.00571441650390625, "learning_rate": 2.95538320867622e-06, "loss": 0.0094, "step": 867 }, { "clip_ratio": 0.0003270462358386794, "epoch": 1.0344178542618985, "grad_norm": 0.03763904795050621, "kl": 0.005820035934448242, "learning_rate": 2.949213582097042e-06, "loss": 0.0094, "step": 868 }, { "clip_ratio": 0.00039861036464117205, "epoch": 1.0363299572764482, "grad_norm": 0.03893045708537102, "kl": 0.005897045135498047, "learning_rate": 2.9430411270989112e-06, "loss": 0.0093, "step": 869 }, { "clip_ratio": 0.0004073582798014286, "epoch": 1.038242060290998, "grad_norm": 0.03808417171239853, "kl": 0.0059051513671875, "learning_rate": 2.9368658825459452e-06, "loss": 0.0092, "step": 870 }, { "clip_ratio": 0.0, "completion_length": 518.7159852981567, "epoch": 1.040154163305548, "grad_norm": 0.03680076450109482, "kl": 0.006183147430419922, "learning_rate": 2.9306878873198227e-06, "loss": 0.0073, "num_tokens": 18123716.0, "reward": 0.06975446810247377, "reward_std": 0.07255704078124836, "rewards/pure_accuracy_reward_math": 0.06975446600699797, "step": 871 }, { "clip_ratio": 0.00025267474336487794, "epoch": 1.042066266320098, "grad_norm": 0.036574870347976685, "kl": 0.006196498870849609, "learning_rate": 2.9245071803195435e-06, "loss": 0.0072, "step": 872 }, { "clip_ratio": 0.0002888958638322947, "epoch": 1.0439783693346478, "grad_norm": 0.03539302200078964, "kl": 0.006276130676269531, "learning_rate": 2.9183238004611815e-06, "loss": 0.0072, "step": 873 }, { "clip_ratio": 0.00027933804358326597, "epoch": 1.0458904723491977, "grad_norm": 0.03457676246762276, "kl": 0.00629425048828125, "learning_rate": 2.912137786677639e-06, "loss": 0.0071, "step": 874 }, { "clip_ratio": 0.00026495220328115465, "epoch": 1.0478025753637477, "grad_norm": 0.034882258623838425, "kl": 0.006371974945068359, "learning_rate": 2.905949177918403e-06, "loss": 0.0071, "step": 875 }, { "clip_ratio": 0.0, "completion_length": 516.4989104270935, "epoch": 1.0497146783782976, "grad_norm": 0.04403652995824814, "kl": 0.0064754486083984375, "learning_rate": 2.8997580131493004e-06, "loss": 0.0104, "num_tokens": 21706672.0, "reward": 0.07421875311410986, "reward_std": 0.08282060426427051, "rewards/pure_accuracy_reward_math": 0.07421875130967237, "step": 876 }, { "clip_ratio": 0.00034863107299543117, "epoch": 1.0516267813928475, "grad_norm": 0.040730468928813934, "kl": 0.006359100341796875, "learning_rate": 2.89356433135225e-06, "loss": 0.0104, "step": 877 }, { "clip_ratio": 0.0003696895219036378, "epoch": 1.0535388844073974, "grad_norm": 0.040028344839811325, "kl": 0.006321430206298828, "learning_rate": 2.8873681715250197e-06, "loss": 0.0104, "step": 878 }, { "clip_ratio": 0.00041197048278718285, "epoch": 1.0554509874219473, "grad_norm": 0.04009086638689041, "kl": 0.0062351226806640625, "learning_rate": 2.881169572680981e-06, "loss": 0.0103, "step": 879 }, { "clip_ratio": 0.0004460485272943515, "epoch": 1.0573630904364972, "grad_norm": 0.03965138643980026, "kl": 0.006242275238037109, "learning_rate": 2.87496857384886e-06, "loss": 0.0102, "step": 880 }, { "clip_ratio": 0.0, "completion_length": 524.4285945892334, "epoch": 1.0592751934510471, "grad_norm": 0.03920762613415718, "kl": 0.005979061126708984, "learning_rate": 2.868765214072495e-06, "loss": 0.0082, "num_tokens": 25317588.0, "reward": 0.07338170023285784, "reward_std": 0.0805021328269504, "rewards/pure_accuracy_reward_math": 0.07338169755530544, "step": 881 }, { "clip_ratio": 0.0003169273815046836, "epoch": 1.061187296465597, "grad_norm": 0.03858224302530289, "kl": 0.006028175354003906, "learning_rate": 2.8625595324105925e-06, "loss": 0.0082, "step": 882 }, { "clip_ratio": 0.0003076135093351695, "epoch": 1.063099399480147, "grad_norm": 0.03754101321101189, "kl": 0.006089687347412109, "learning_rate": 2.8563515679364733e-06, "loss": 0.0081, "step": 883 }, { "clip_ratio": 0.0003307215861809709, "epoch": 1.065011502494697, "grad_norm": 0.03692120686173439, "kl": 0.006084442138671875, "learning_rate": 2.850141359737836e-06, "loss": 0.008, "step": 884 }, { "clip_ratio": 0.0003362660154380137, "epoch": 1.0669236055092468, "grad_norm": 0.03691774606704712, "kl": 0.006087303161621094, "learning_rate": 2.843928946916504e-06, "loss": 0.008, "step": 885 }, { "clip_ratio": 0.0, "completion_length": 541.91938829422, "epoch": 1.0688357085237967, "grad_norm": 0.03421162813901901, "kl": 0.005934238433837891, "learning_rate": 2.8377143685881835e-06, "loss": 0.0048, "num_tokens": 28991667.0, "reward": 0.06138393090805039, "reward_std": 0.05770279868738726, "rewards/pure_accuracy_reward_math": 0.06138392991852015, "step": 886 }, { "clip_ratio": 0.00021627708133564738, "epoch": 1.0707478115383466, "grad_norm": 0.0331665463745594, "kl": 0.005833148956298828, "learning_rate": 2.8314976638822145e-06, "loss": 0.0048, "step": 887 }, { "clip_ratio": 0.00023772416773226723, "epoch": 1.0726599145528966, "grad_norm": 0.03265010192990303, "kl": 0.00572967529296875, "learning_rate": 2.825278871941325e-06, "loss": 0.0048, "step": 888 }, { "clip_ratio": 0.000255867875353033, "epoch": 1.0745720175674465, "grad_norm": 0.031934551894664764, "kl": 0.0056514739990234375, "learning_rate": 2.819058031921387e-06, "loss": 0.0047, "step": 889 }, { "clip_ratio": 0.0002752940895334177, "epoch": 1.0764841205819964, "grad_norm": 0.03180062025785446, "kl": 0.005589008331298828, "learning_rate": 2.812835182991166e-06, "loss": 0.0047, "step": 890 }, { "clip_ratio": 0.0, "completion_length": 541.6253051757812, "epoch": 1.0783962235965463, "grad_norm": 0.0352044515311718, "kl": 0.006504535675048828, "learning_rate": 2.8066103643320774e-06, "loss": 0.005, "num_tokens": 32662984.0, "reward": 0.07003348544822074, "reward_std": 0.07148103549843654, "rewards/pure_accuracy_reward_math": 0.07003348341095261, "step": 891 }, { "clip_ratio": 0.0002908879878305015, "epoch": 1.0803083266110962, "grad_norm": 0.03477974981069565, "kl": 0.006473064422607422, "learning_rate": 2.800383615137939e-06, "loss": 0.0049, "step": 892 }, { "clip_ratio": 0.00027559091887496834, "epoch": 1.0822204296256461, "grad_norm": 0.03371204808354378, "kl": 0.006519317626953125, "learning_rate": 2.7941549746147234e-06, "loss": 0.0049, "step": 893 }, { "clip_ratio": 0.00026331023877901316, "epoch": 1.084132532640196, "grad_norm": 0.03233867511153221, "kl": 0.00655364990234375, "learning_rate": 2.7879244819803104e-06, "loss": 0.0048, "step": 894 }, { "clip_ratio": 0.0003059378379361988, "epoch": 1.086044635654746, "grad_norm": 0.032591916620731354, "kl": 0.006562709808349609, "learning_rate": 2.781692176464244e-06, "loss": 0.0048, "step": 895 }, { "clip_ratio": 0.0, "completion_length": 538.9467296600342, "epoch": 1.0879567386692959, "grad_norm": 0.0399605967104435, "kl": 0.007935047149658203, "learning_rate": 2.7754580973074817e-06, "loss": 0.0078, "num_tokens": 36327265.0, "reward": 0.06640625328873284, "reward_std": 0.07582512497901917, "rewards/pure_accuracy_reward_math": 0.06640625142608769, "step": 896 }, { "clip_ratio": 0.00029080147635340836, "epoch": 1.0898688416838458, "grad_norm": 0.036669787019491196, "kl": 0.007892131805419922, "learning_rate": 2.769222283762148e-06, "loss": 0.0077, "step": 897 }, { "clip_ratio": 0.0003202801690349588, "epoch": 1.0917809446983957, "grad_norm": 0.036093369126319885, "kl": 0.007870197296142578, "learning_rate": 2.7629847750912885e-06, "loss": 0.0077, "step": 898 }, { "clip_ratio": 0.00034906711715620986, "epoch": 1.0936930477129456, "grad_norm": 0.036899976432323456, "kl": 0.007824897766113281, "learning_rate": 2.756745610568622e-06, "loss": 0.0076, "step": 899 }, { "clip_ratio": 0.0003909627172333785, "epoch": 1.0956051507274955, "grad_norm": 0.03607386723160744, "kl": 0.00782632827758789, "learning_rate": 2.7505048294782914e-06, "loss": 0.0076, "step": 900 }, { "clip_ratio": 0.0, "completion_length": 519.9687776565552, "epoch": 1.0975172537420455, "grad_norm": 0.04138408601284027, "kl": 0.006854534149169922, "learning_rate": 2.7442624711146206e-06, "loss": 0.0105, "num_tokens": 39926261.0, "reward": 0.07561384263681248, "reward_std": 0.08660046180011705, "rewards/pure_accuracy_reward_math": 0.07561384089058265, "step": 901 }, { "clip_ratio": 0.0003407098130878694, "epoch": 1.0994293567565951, "grad_norm": 0.04008745029568672, "kl": 0.006922245025634766, "learning_rate": 2.7380185747818628e-06, "loss": 0.0105, "step": 902 }, { "clip_ratio": 0.0003345158028196238, "epoch": 1.1013414597711453, "grad_norm": 0.039206936955451965, "kl": 0.006981372833251953, "learning_rate": 2.7317731797939566e-06, "loss": 0.0104, "step": 903 }, { "clip_ratio": 0.0003512224284918375, "epoch": 1.103253562785695, "grad_norm": 0.03816502168774605, "kl": 0.006984233856201172, "learning_rate": 2.7255263254742746e-06, "loss": 0.0103, "step": 904 }, { "clip_ratio": 0.00038539456500075175, "epoch": 1.105165665800245, "grad_norm": 0.03802499175071716, "kl": 0.006890773773193359, "learning_rate": 2.71927805115538e-06, "loss": 0.0103, "step": 905 }, { "clip_ratio": 0.0, "completion_length": 522.6635279655457, "epoch": 1.1070777688147948, "grad_norm": 0.03780652955174446, "kl": 0.005947589874267578, "learning_rate": 2.713028396178776e-06, "loss": 0.0044, "num_tokens": 43530039.0, "reward": 0.0691964318684768, "reward_std": 0.0774129043566063, "rewards/pure_accuracy_reward_math": 0.06919642988941632, "step": 906 }, { "clip_ratio": 0.0002883933650537074, "epoch": 1.1089898718293447, "grad_norm": 0.03706151619553566, "kl": 0.005948543548583984, "learning_rate": 2.706777399894656e-06, "loss": 0.0044, "step": 907 }, { "clip_ratio": 0.0003032470573316459, "epoch": 1.1109019748438946, "grad_norm": 0.03684515878558159, "kl": 0.005936622619628906, "learning_rate": 2.700525101661665e-06, "loss": 0.0044, "step": 908 }, { "clip_ratio": 0.0003385747261290817, "epoch": 1.1128140778584446, "grad_norm": 0.03632361814379692, "kl": 0.005986690521240234, "learning_rate": 2.6942715408466406e-06, "loss": 0.0043, "step": 909 }, { "clip_ratio": 0.00035084231319615355, "epoch": 1.1147261808729945, "grad_norm": 0.0364714041352272, "kl": 0.005983829498291016, "learning_rate": 2.6880167568243716e-06, "loss": 0.0042, "step": 910 }, { "clip_ratio": 0.0, "completion_length": 524.6629705429077, "epoch": 1.1166382838875444, "grad_norm": 0.037073228508234024, "kl": 0.006183624267578125, "learning_rate": 2.681760788977349e-06, "loss": 0.0075, "num_tokens": 47140667.0, "reward": 0.06166294956346974, "reward_std": 0.07140090485336259, "rewards/pure_accuracy_reward_math": 0.061662947526201606, "step": 911 }, { "clip_ratio": 0.00026335007953548484, "epoch": 1.1185503869020943, "grad_norm": 0.03628791868686676, "kl": 0.006221771240234375, "learning_rate": 2.6755036766955172e-06, "loss": 0.0075, "step": 912 }, { "clip_ratio": 0.00029098790395210017, "epoch": 1.1204624899166442, "grad_norm": 0.03659017011523247, "kl": 0.006258964538574219, "learning_rate": 2.6692454593760255e-06, "loss": 0.0075, "step": 913 }, { "clip_ratio": 0.00033703100632465066, "epoch": 1.1223745929311941, "grad_norm": 0.0357106551527977, "kl": 0.006211757659912109, "learning_rate": 2.6629861764229824e-06, "loss": 0.0074, "step": 914 }, { "clip_ratio": 0.0003104925490902133, "epoch": 1.124286695945744, "grad_norm": 0.03461490571498871, "kl": 0.006183624267578125, "learning_rate": 2.6567258672472064e-06, "loss": 0.0073, "step": 915 }, { "clip_ratio": 0.0, "completion_length": 519.3962297439575, "epoch": 1.126198798960294, "grad_norm": 0.038919847458601, "kl": 0.0060977935791015625, "learning_rate": 2.650464571265975e-06, "loss": 0.0062, "num_tokens": 50733111.0, "reward": 0.06584821734577417, "reward_std": 0.07367311330744997, "rewards/pure_accuracy_reward_math": 0.06584821583237499, "step": 916 }, { "clip_ratio": 0.0002951280029606096, "epoch": 1.1281109019748439, "grad_norm": 0.038201622664928436, "kl": 0.0060329437255859375, "learning_rate": 2.6442023279027805e-06, "loss": 0.0061, "step": 917 }, { "clip_ratio": 0.00029004437487856194, "epoch": 1.1300230049893938, "grad_norm": 0.03696547448635101, "kl": 0.006039619445800781, "learning_rate": 2.6379391765870828e-06, "loss": 0.0061, "step": 918 }, { "clip_ratio": 0.0003163389113183257, "epoch": 1.1319351080039437, "grad_norm": 0.03571280464529991, "kl": 0.006005764007568359, "learning_rate": 2.6316751567540527e-06, "loss": 0.006, "step": 919 }, { "clip_ratio": 0.0003592208154259424, "epoch": 1.1338472110184936, "grad_norm": 0.03568287193775177, "kl": 0.005993366241455078, "learning_rate": 2.625410307844335e-06, "loss": 0.006, "step": 920 }, { "clip_ratio": 0.0, "completion_length": 538.2659268379211, "epoch": 1.1357593140330435, "grad_norm": 0.03899242356419563, "kl": 0.005813121795654297, "learning_rate": 2.6191446693037924e-06, "loss": 0.0071, "num_tokens": 54398312.0, "reward": 0.07226562857977115, "reward_std": 0.07861530320951715, "rewards/pure_accuracy_reward_math": 0.07226562648429535, "step": 921 }, { "clip_ratio": 0.00029711308371815903, "epoch": 1.1376714170475934, "grad_norm": 0.038164544850587845, "kl": 0.0058841705322265625, "learning_rate": 2.6128782805832605e-06, "loss": 0.0071, "step": 922 }, { "clip_ratio": 0.0003027216810664868, "epoch": 1.1395835200621434, "grad_norm": 0.03706645965576172, "kl": 0.005882740020751953, "learning_rate": 2.606611181138295e-06, "loss": 0.007, "step": 923 }, { "clip_ratio": 0.00032618250162386175, "epoch": 1.1414956230766933, "grad_norm": 0.036637816578149796, "kl": 0.005909442901611328, "learning_rate": 2.600343410428931e-06, "loss": 0.007, "step": 924 }, { "clip_ratio": 0.00032713054685018506, "epoch": 1.1434077260912432, "grad_norm": 0.036758605390787125, "kl": 0.005947589874267578, "learning_rate": 2.5940750079194275e-06, "loss": 0.0069, "step": 925 }, { "clip_ratio": 0.0, "completion_length": 542.0072803497314, "epoch": 1.145319829105793, "grad_norm": 0.03791532665491104, "kl": 0.0061702728271484375, "learning_rate": 2.5878060130780225e-06, "loss": 0.0074, "num_tokens": 58073722.0, "reward": 0.06835937863797881, "reward_std": 0.07715391897363588, "rewards/pure_accuracy_reward_math": 0.06835937636788003, "step": 926 }, { "clip_ratio": 0.00030884258325158953, "epoch": 1.147231932120343, "grad_norm": 0.03749171644449234, "kl": 0.006160736083984375, "learning_rate": 2.581536465376684e-06, "loss": 0.0074, "step": 927 }, { "clip_ratio": 0.000279198229350186, "epoch": 1.149144035134893, "grad_norm": 0.03681938722729683, "kl": 0.006136417388916016, "learning_rate": 2.575266404290859e-06, "loss": 0.0073, "step": 928 }, { "clip_ratio": 0.0002930849948370451, "epoch": 1.1510561381494429, "grad_norm": 0.035750068724155426, "kl": 0.006227970123291016, "learning_rate": 2.5689958692992284e-06, "loss": 0.0072, "step": 929 }, { "clip_ratio": 0.00028936977611238035, "epoch": 1.1529682411639928, "grad_norm": 0.03503425419330597, "kl": 0.006281375885009766, "learning_rate": 2.562724899883458e-06, "loss": 0.0072, "step": 930 }, { "clip_ratio": 0.0, "completion_length": 531.6188879013062, "epoch": 1.1548803441785427, "grad_norm": 0.05187267065048218, "kl": 0.007277965545654297, "learning_rate": 2.5564535355279464e-06, "loss": 0.0072, "num_tokens": 61714268.0, "reward": 0.07505580713041127, "reward_std": 0.08531173289520666, "rewards/pure_accuracy_reward_math": 0.07505580491852015, "step": 931 }, { "clip_ratio": 0.00033635866333270314, "epoch": 1.1567924471930926, "grad_norm": 0.039655230939388275, "kl": 0.0072231292724609375, "learning_rate": 2.550181815719581e-06, "loss": 0.0072, "step": 932 }, { "clip_ratio": 0.00035109808851530033, "epoch": 1.1587045502076425, "grad_norm": 0.038757406175136566, "kl": 0.007157802581787109, "learning_rate": 2.5439097799474867e-06, "loss": 0.0072, "step": 933 }, { "clip_ratio": 0.00037538493586453114, "epoch": 1.1606166532221924, "grad_norm": 0.03841486573219299, "kl": 0.007115840911865234, "learning_rate": 2.537637467702777e-06, "loss": 0.0071, "step": 934 }, { "clip_ratio": 0.0003936579208243529, "epoch": 1.1625287562367423, "grad_norm": 0.038453541696071625, "kl": 0.0070896148681640625, "learning_rate": 2.531364918478308e-06, "loss": 0.007, "step": 935 }, { "clip_ratio": 0.0, "completion_length": 547.6250252723694, "epoch": 1.1644408592512923, "grad_norm": 0.03738933801651001, "kl": 0.00615692138671875, "learning_rate": 2.5250921717684247e-06, "loss": 0.0061, "num_tokens": 65415044.0, "reward": 0.07561384260770865, "reward_std": 0.07745296956272796, "rewards/pure_accuracy_reward_math": 0.07561384062864818, "step": 936 }, { "clip_ratio": 0.0002929231292227996, "epoch": 1.166352962265842, "grad_norm": 0.03690778836607933, "kl": 0.006189823150634766, "learning_rate": 2.5188192670687186e-06, "loss": 0.0061, "step": 937 }, { "clip_ratio": 0.000294325235870474, "epoch": 1.168265065280392, "grad_norm": 0.03613179549574852, "kl": 0.006130695343017578, "learning_rate": 2.512546243875776e-06, "loss": 0.0061, "step": 938 }, { "clip_ratio": 0.00031920797795237377, "epoch": 1.1701771682949418, "grad_norm": 0.03461304306983948, "kl": 0.006014347076416016, "learning_rate": 2.5062731416869267e-06, "loss": 0.006, "step": 939 }, { "clip_ratio": 0.00037188214912475814, "epoch": 1.172089271309492, "grad_norm": 0.03454398363828659, "kl": 0.005980968475341797, "learning_rate": 2.5e-06, "loss": 0.0059, "step": 940 }, { "clip_ratio": 0.0, "completion_length": 532.1423244476318, "epoch": 1.1740013743240416, "grad_norm": 0.03934042155742645, "kl": 0.006266117095947266, "learning_rate": 2.493726858313074e-06, "loss": 0.0078, "num_tokens": 69057654.0, "reward": 0.07477678928989917, "reward_std": 0.08299326134147123, "rewards/pure_accuracy_reward_math": 0.07477678690338507, "step": 941 }, { "clip_ratio": 0.00031629414758072016, "epoch": 1.1759134773385915, "grad_norm": 0.03872406855225563, "kl": 0.0062713623046875, "learning_rate": 2.4874537561242253e-06, "loss": 0.0078, "step": 942 }, { "clip_ratio": 0.0003434862284166229, "epoch": 1.1778255803531414, "grad_norm": 0.03723340108990669, "kl": 0.00623321533203125, "learning_rate": 2.481180732931282e-06, "loss": 0.0077, "step": 943 }, { "clip_ratio": 0.00034986940886483353, "epoch": 1.1797376833676914, "grad_norm": 0.03732794523239136, "kl": 0.006276607513427734, "learning_rate": 2.4749078282315757e-06, "loss": 0.0076, "step": 944 }, { "clip_ratio": 0.0003579597876637308, "epoch": 1.1816497863822413, "grad_norm": 0.03668594732880592, "kl": 0.006198883056640625, "learning_rate": 2.468635081521693e-06, "loss": 0.0076, "step": 945 }, { "clip_ratio": 0.0, "completion_length": 528.1718993186951, "epoch": 1.1835618893967912, "grad_norm": 0.03715552017092705, "kl": 0.006759166717529297, "learning_rate": 2.462362532297224e-06, "loss": 0.0079, "num_tokens": 72682654.0, "reward": 0.06891741449362598, "reward_std": 0.08248148870188743, "rewards/pure_accuracy_reward_math": 0.06891741199069656, "step": 946 }, { "clip_ratio": 0.0003075862115053951, "epoch": 1.185473992411341, "grad_norm": 0.03616279736161232, "kl": 0.006741523742675781, "learning_rate": 2.456090220052514e-06, "loss": 0.0079, "step": 947 }, { "clip_ratio": 0.00027696539024191225, "epoch": 1.187386095425891, "grad_norm": 0.03556762635707855, "kl": 0.006789684295654297, "learning_rate": 2.44981818428042e-06, "loss": 0.0079, "step": 948 }, { "clip_ratio": 0.0002739789470638243, "epoch": 1.189298198440441, "grad_norm": 0.03486724570393562, "kl": 0.006869316101074219, "learning_rate": 2.4435464644720544e-06, "loss": 0.0078, "step": 949 }, { "clip_ratio": 0.00031816330425726846, "epoch": 1.1912103014549908, "grad_norm": 0.03446395695209503, "kl": 0.006869316101074219, "learning_rate": 2.4372751001165427e-06, "loss": 0.0077, "step": 950 }, { "clip_ratio": 0.0, "completion_length": 528.6573901176453, "epoch": 1.1931224044695408, "grad_norm": 0.03734345734119415, "kl": 0.006131649017333984, "learning_rate": 2.4310041307007716e-06, "loss": 0.0062, "num_tokens": 76305578.0, "reward": 0.07114955657743849, "reward_std": 0.07526708883233368, "rewards/pure_accuracy_reward_math": 0.07114955488941632, "step": 951 }, { "clip_ratio": 0.00029005661951941875, "epoch": 1.1950345074840907, "grad_norm": 0.036443449556827545, "kl": 0.006079196929931641, "learning_rate": 2.4247335957091418e-06, "loss": 0.0062, "step": 952 }, { "clip_ratio": 0.0002579906781647878, "epoch": 1.1969466104986406, "grad_norm": 0.034940823912620544, "kl": 0.006037235260009766, "learning_rate": 2.4184635346233166e-06, "loss": 0.0061, "step": 953 }, { "clip_ratio": 0.00032199256943954424, "epoch": 1.1988587135131905, "grad_norm": 0.03445851802825928, "kl": 0.006024360656738281, "learning_rate": 2.4121939869219784e-06, "loss": 0.0061, "step": 954 }, { "clip_ratio": 0.0003193520489048751, "epoch": 1.2007708165277404, "grad_norm": 0.03448885306715965, "kl": 0.005992889404296875, "learning_rate": 2.405924992080573e-06, "loss": 0.006, "step": 955 }, { "clip_ratio": 0.0, "completion_length": 519.4358487129211, "epoch": 1.2026829195422903, "grad_norm": 0.11665105819702148, "kl": 0.008374214172363281, "learning_rate": 2.3996565895710692e-06, "loss": 0.0065, "num_tokens": 79904712.0, "reward": 0.07366071760770865, "reward_std": 0.08458104060264304, "rewards/pure_accuracy_reward_math": 0.07366071591968648, "step": 956 }, { "clip_ratio": 0.00031160829769305565, "epoch": 1.2045950225568403, "grad_norm": 0.04096413403749466, "kl": 0.006944179534912109, "learning_rate": 2.3933888188617054e-06, "loss": 0.0064, "step": 957 }, { "clip_ratio": 0.00032232171946589006, "epoch": 1.2065071255713902, "grad_norm": 0.04049144312739372, "kl": 0.006976127624511719, "learning_rate": 2.3871217194167407e-06, "loss": 0.0063, "step": 958 }, { "clip_ratio": 0.0003416440970340773, "epoch": 1.20841922858594, "grad_norm": 0.039766065776348114, "kl": 0.007042884826660156, "learning_rate": 2.380855330696208e-06, "loss": 0.0063, "step": 959 }, { "clip_ratio": 0.0003523347779150754, "epoch": 1.21033133160049, "grad_norm": 0.03884311020374298, "kl": 0.007153987884521484, "learning_rate": 2.3745896921556656e-06, "loss": 0.0062, "step": 960 }, { "clip_ratio": 0.0, "completion_length": 530.392322063446, "epoch": 1.21224343461504, "grad_norm": 0.04043371230363846, "kl": 0.008221149444580078, "learning_rate": 2.368324843245948e-06, "loss": 0.0086, "num_tokens": 83540930.0, "reward": 0.07952009316068143, "reward_std": 0.08836089639225975, "rewards/pure_accuracy_reward_math": 0.0795200911234133, "step": 961 }, { "clip_ratio": 0.0003234188988017195, "epoch": 1.2141555376295898, "grad_norm": 0.039239391684532166, "kl": 0.008275985717773438, "learning_rate": 2.362060823412919e-06, "loss": 0.0086, "step": 962 }, { "clip_ratio": 0.00033211900500873526, "epoch": 1.2160676406441397, "grad_norm": 0.03923904523253441, "kl": 0.008409500122070312, "learning_rate": 2.355797672097219e-06, "loss": 0.0086, "step": 963 }, { "clip_ratio": 0.00036667373893806143, "epoch": 1.2179797436586897, "grad_norm": 0.038865529000759125, "kl": 0.008434295654296875, "learning_rate": 2.349535428734026e-06, "loss": 0.0085, "step": 964 }, { "clip_ratio": 0.0003816600048480723, "epoch": 1.2198918466732396, "grad_norm": 0.037728771567344666, "kl": 0.00834512710571289, "learning_rate": 2.343274132752795e-06, "loss": 0.0084, "step": 965 }, { "clip_ratio": 0.0, "completion_length": 535.4799346923828, "epoch": 1.2218039496877895, "grad_norm": 0.03813539817929268, "kl": 0.005985260009765625, "learning_rate": 2.3370138235770184e-06, "loss": 0.0088, "num_tokens": 87187574.0, "reward": 0.060267860419116914, "reward_std": 0.07384576939512044, "rewards/pure_accuracy_reward_math": 0.060267858498264104, "step": 966 }, { "clip_ratio": 0.0002719826344446119, "epoch": 1.2237160527023394, "grad_norm": 0.03676025941967964, "kl": 0.006021976470947266, "learning_rate": 2.330754540623975e-06, "loss": 0.0088, "step": 967 }, { "clip_ratio": 0.0002730399019696961, "epoch": 1.2256281557168893, "grad_norm": 0.03579593822360039, "kl": 0.006060123443603516, "learning_rate": 2.324496323304484e-06, "loss": 0.0088, "step": 968 }, { "clip_ratio": 0.0002800920712502375, "epoch": 1.2275402587314392, "grad_norm": 0.0353357158601284, "kl": 0.0061092376708984375, "learning_rate": 2.318239211022651e-06, "loss": 0.0087, "step": 969 }, { "clip_ratio": 0.0003294056899108, "epoch": 1.2294523617459892, "grad_norm": 0.03521355986595154, "kl": 0.006182193756103516, "learning_rate": 2.3119832431756284e-06, "loss": 0.0086, "step": 970 }, { "clip_ratio": 0.0, "completion_length": 513.8870182037354, "epoch": 1.231364464760539, "grad_norm": 0.03882085531949997, "kl": 0.006420135498046875, "learning_rate": 2.3057284591533598e-06, "loss": 0.0093, "num_tokens": 90758753.0, "reward": 0.07505580718861893, "reward_std": 0.07715391827514395, "rewards/pure_accuracy_reward_math": 0.0750558051513508, "step": 971 }, { "clip_ratio": 0.0003045887907546785, "epoch": 1.2332765677750888, "grad_norm": 0.03775356709957123, "kl": 0.006350040435791016, "learning_rate": 2.299474898338336e-06, "loss": 0.0093, "step": 972 }, { "clip_ratio": 0.0003195773986703898, "epoch": 1.235188670789639, "grad_norm": 0.03639310225844383, "kl": 0.006343841552734375, "learning_rate": 2.2932226001053444e-06, "loss": 0.0092, "step": 973 }, { "clip_ratio": 0.0003582680616318612, "epoch": 1.2371007738041886, "grad_norm": 0.036272380501031876, "kl": 0.006300926208496094, "learning_rate": 2.286971603821226e-06, "loss": 0.0092, "step": 974 }, { "clip_ratio": 0.0003946863821511215, "epoch": 1.2390128768187387, "grad_norm": 0.03584066033363342, "kl": 0.006391048431396484, "learning_rate": 2.280721948844621e-06, "loss": 0.0091, "step": 975 }, { "clip_ratio": 0.0, "completion_length": 522.3044323921204, "epoch": 1.2409249798332884, "grad_norm": 0.038236722350120544, "kl": 0.006694316864013672, "learning_rate": 2.274473674525726e-06, "loss": 0.0094, "num_tokens": 94365488.0, "reward": 0.06556919953436591, "reward_std": 0.07405849196948111, "rewards/pure_accuracy_reward_math": 0.06556919802096672, "step": 976 }, { "clip_ratio": 0.00029697347130763774, "epoch": 1.2428370828478383, "grad_norm": 0.0369977168738842, "kl": 0.006660938262939453, "learning_rate": 2.268226820206044e-06, "loss": 0.0094, "step": 977 }, { "clip_ratio": 0.000319464833580696, "epoch": 1.2447491858623883, "grad_norm": 0.03550850227475166, "kl": 0.006519794464111328, "learning_rate": 2.261981425218138e-06, "loss": 0.0094, "step": 978 }, { "clip_ratio": 0.0003469139706453461, "epoch": 1.2466612888769382, "grad_norm": 0.03525082767009735, "kl": 0.006406307220458984, "learning_rate": 2.2557375288853803e-06, "loss": 0.0093, "step": 979 }, { "clip_ratio": 0.0003654695393606744, "epoch": 1.248573391891488, "grad_norm": 0.0355265848338604, "kl": 0.006331443786621094, "learning_rate": 2.2494951705217095e-06, "loss": 0.0092, "step": 980 }, { "clip_ratio": 0.0, "completion_length": 516.76704454422, "epoch": 1.250485494906038, "grad_norm": 0.03745350241661072, "kl": 0.0065135955810546875, "learning_rate": 2.2432543894313797e-06, "loss": 0.0042, "num_tokens": 97952525.0, "reward": 0.06501116385334171, "reward_std": 0.07316133996937424, "rewards/pure_accuracy_reward_math": 0.06501116222352721, "step": 981 }, { "clip_ratio": 0.00029299165072416145, "epoch": 1.252397597920588, "grad_norm": 0.03690091893076897, "kl": 0.006426095962524414, "learning_rate": 2.2370152249087114e-06, "loss": 0.0042, "step": 982 }, { "clip_ratio": 0.0003187885846500649, "epoch": 1.2543097009351378, "grad_norm": 0.03645962476730347, "kl": 0.006396055221557617, "learning_rate": 2.2307777162378523e-06, "loss": 0.0042, "step": 983 }, { "clip_ratio": 0.00033352292155086616, "epoch": 1.2562218039496877, "grad_norm": 0.03598187491297722, "kl": 0.006333351135253906, "learning_rate": 2.2245419026925187e-06, "loss": 0.0041, "step": 984 }, { "clip_ratio": 0.0003533332319989313, "epoch": 1.2581339069642377, "grad_norm": 0.03577181696891785, "kl": 0.006278276443481445, "learning_rate": 2.218307823535757e-06, "loss": 0.004, "step": 985 }, { "clip_ratio": 0.0, "completion_length": 522.8172650337219, "epoch": 1.2600460099787876, "grad_norm": 0.03590444475412369, "kl": 0.005995273590087891, "learning_rate": 2.2120755180196904e-06, "loss": 0.0045, "num_tokens": 101560026.0, "reward": 0.06054687811410986, "reward_std": 0.06865079078124836, "rewards/pure_accuracy_reward_math": 0.06054687619325705, "step": 986 }, { "clip_ratio": 0.00024842098838462334, "epoch": 1.2619581129933375, "grad_norm": 0.03513624891638756, "kl": 0.0059719085693359375, "learning_rate": 2.2058450253852783e-06, "loss": 0.0045, "step": 987 }, { "clip_ratio": 0.000271169978702801, "epoch": 1.2638702160078874, "grad_norm": 0.03392768278717995, "kl": 0.005938529968261719, "learning_rate": 2.1996163848620612e-06, "loss": 0.0044, "step": 988 }, { "clip_ratio": 0.0002971922116898895, "epoch": 1.2657823190224373, "grad_norm": 0.03286145627498627, "kl": 0.0060443878173828125, "learning_rate": 2.1933896356679226e-06, "loss": 0.0044, "step": 989 }, { "clip_ratio": 0.0003229031350429068, "epoch": 1.2676944220369872, "grad_norm": 0.032496001571416855, "kl": 0.006091594696044922, "learning_rate": 2.1871648170088347e-06, "loss": 0.0043, "step": 990 }, { "clip_ratio": 0.0, "completion_length": 535.8125224113464, "epoch": 1.2696065250515371, "grad_norm": 0.21526122093200684, "kl": 0.007075309753417969, "learning_rate": 2.1809419680786143e-06, "loss": 0.0072, "num_tokens": 105223050.0, "reward": 0.07421875381260179, "reward_std": 0.08054219774203375, "rewards/pure_accuracy_reward_math": 0.07421875130967237, "step": 991 }, { "clip_ratio": 0.00032863151136552915, "epoch": 1.271518628066087, "grad_norm": 0.03788222745060921, "kl": 0.006428241729736328, "learning_rate": 2.1747211280586758e-06, "loss": 0.0072, "step": 992 }, { "clip_ratio": 0.00034688404628013814, "epoch": 1.273430731080637, "grad_norm": 0.03719337284564972, "kl": 0.0064296722412109375, "learning_rate": 2.168502336117787e-06, "loss": 0.0071, "step": 993 }, { "clip_ratio": 0.00034599834629034376, "epoch": 1.275342834095187, "grad_norm": 0.036535993218421936, "kl": 0.006348133087158203, "learning_rate": 2.1622856314118178e-06, "loss": 0.0071, "step": 994 }, { "clip_ratio": 0.00036459101005448247, "epoch": 1.2772549371097368, "grad_norm": 0.03548647463321686, "kl": 0.006353855133056641, "learning_rate": 2.156071053083496e-06, "loss": 0.007, "step": 995 }, { "clip_ratio": 0.0, "completion_length": 529.536018371582, "epoch": 1.2791670401242867, "grad_norm": 0.03945273160934448, "kl": 0.006157398223876953, "learning_rate": 2.1498586402621646e-06, "loss": 0.0062, "num_tokens": 108847859.0, "reward": 0.07366071807336994, "reward_std": 0.072430647269357, "rewards/pure_accuracy_reward_math": 0.07366071533760987, "step": 996 }, { "clip_ratio": 0.0002439655858097467, "epoch": 1.2810791431388366, "grad_norm": 0.03839760273694992, "kl": 0.006161689758300781, "learning_rate": 2.1436484320635275e-06, "loss": 0.0061, "step": 997 }, { "clip_ratio": 0.0002514519866281262, "epoch": 1.2829912461533866, "grad_norm": 0.03733210638165474, "kl": 0.0061798095703125, "learning_rate": 2.1374404675894083e-06, "loss": 0.0061, "step": 998 }, { "clip_ratio": 0.0002774860670342605, "epoch": 1.2849033491679365, "grad_norm": 0.03640332072973251, "kl": 0.006183147430419922, "learning_rate": 2.131234785927505e-06, "loss": 0.006, "step": 999 }, { "clip_ratio": 0.0002877332713069336, "epoch": 1.2868154521824864, "grad_norm": 0.03559413552284241, "kl": 0.006213665008544922, "learning_rate": 2.1250314261511414e-06, "loss": 0.0059, "step": 1000 }, { "clip_ratio": 0.0, "completion_length": 528.9492444992065, "epoch": 1.2887275551970363, "grad_norm": 0.04216492921113968, "kl": 0.0073282718658447266, "learning_rate": 2.1188304273190196e-06, "loss": 0.0102, "num_tokens": 112482213.0, "reward": 0.0772879500000272, "reward_std": 0.07908701087580994, "rewards/pure_accuracy_reward_math": 0.07728794772992842, "step": 1001 }, { "clip_ratio": 0.0003075964003755871, "epoch": 1.2906396582115862, "grad_norm": 0.039000045508146286, "kl": 0.007200002670288086, "learning_rate": 2.1126318284749807e-06, "loss": 0.0102, "step": 1002 }, { "clip_ratio": 0.0003138856436635251, "epoch": 1.2925517612261361, "grad_norm": 0.036585696041584015, "kl": 0.00716710090637207, "learning_rate": 2.106435668647751e-06, "loss": 0.0101, "step": 1003 }, { "clip_ratio": 0.00033263966838603665, "epoch": 1.294463864240686, "grad_norm": 0.03634057566523552, "kl": 0.007274150848388672, "learning_rate": 2.1002419868507005e-06, "loss": 0.01, "step": 1004 }, { "clip_ratio": 0.00035104663936635916, "epoch": 1.2963759672552357, "grad_norm": 0.03524275869131088, "kl": 0.0072422027587890625, "learning_rate": 2.0940508220815978e-06, "loss": 0.01, "step": 1005 }, { "clip_ratio": 0.0, "completion_length": 519.5226221084595, "epoch": 1.2982880702697859, "grad_norm": 0.04047563299536705, "kl": 0.006965160369873047, "learning_rate": 2.087862213322362e-06, "loss": 0.0078, "num_tokens": 116078946.0, "reward": 0.06752232470898889, "reward_std": 0.08269421081058681, "rewards/pure_accuracy_reward_math": 0.0675223229045514, "step": 1006 }, { "clip_ratio": 0.00033451643105308904, "epoch": 1.3002001732843356, "grad_norm": 0.03818976879119873, "kl": 0.0069293975830078125, "learning_rate": 2.0816761995388198e-06, "loss": 0.0078, "step": 1007 }, { "clip_ratio": 0.0003828123747666723, "epoch": 1.3021122762988857, "grad_norm": 0.03969357907772064, "kl": 0.006967067718505859, "learning_rate": 2.075492819680457e-06, "loss": 0.0078, "step": 1008 }, { "clip_ratio": 0.0003832018163620887, "epoch": 1.3040243793134354, "grad_norm": 0.040100231766700745, "kl": 0.007086753845214844, "learning_rate": 2.0693121126801778e-06, "loss": 0.0077, "step": 1009 }, { "clip_ratio": 0.0003569153510625256, "epoch": 1.3059364823279855, "grad_norm": 0.037368252873420715, "kl": 0.007195472717285156, "learning_rate": 2.063134117454055e-06, "loss": 0.0076, "step": 1010 }, { "clip_ratio": 0.0, "completion_length": 514.7126340866089, "epoch": 1.3078485853425352, "grad_norm": 0.0401712991297245, "kl": 0.00678253173828125, "learning_rate": 2.0569588729010896e-06, "loss": 0.0063, "num_tokens": 119662772.0, "reward": 0.0705915214784909, "reward_std": 0.08484002540353686, "rewards/pure_accuracy_reward_math": 0.0705915190919768, "step": 1011 }, { "clip_ratio": 0.0003401347770477514, "epoch": 1.3097606883570854, "grad_norm": 0.03972383588552475, "kl": 0.006781578063964844, "learning_rate": 2.0507864179029592e-06, "loss": 0.0062, "step": 1012 }, { "clip_ratio": 0.00040657852025560715, "epoch": 1.311672791371635, "grad_norm": 0.04063359647989273, "kl": 0.006711006164550781, "learning_rate": 2.044616791323781e-06, "loss": 0.0062, "step": 1013 }, { "clip_ratio": 0.0004189488300880839, "epoch": 1.313584894386185, "grad_norm": 0.03818094730377197, "kl": 0.006552696228027344, "learning_rate": 2.0384500320098604e-06, "loss": 0.0061, "step": 1014 }, { "clip_ratio": 0.000448550158978378, "epoch": 1.315496997400735, "grad_norm": 0.03749743476510048, "kl": 0.0064678192138671875, "learning_rate": 2.032286178789454e-06, "loss": 0.006, "step": 1015 }, { "clip_ratio": 0.0, "completion_length": 529.0069990158081, "epoch": 1.3174091004152848, "grad_norm": 0.03775123134255409, "kl": 0.006552696228027344, "learning_rate": 2.0261252704725143e-06, "loss": 0.0047, "num_tokens": 123299241.0, "reward": 0.06919643163564615, "reward_std": 0.0781373989302665, "rewards/pure_accuracy_reward_math": 0.06919642994762398, "step": 1016 }, { "clip_ratio": 0.0003128642913452495, "epoch": 1.3193212034298347, "grad_norm": 0.03666616231203079, "kl": 0.006560325622558594, "learning_rate": 2.0199673458504577e-06, "loss": 0.0047, "step": 1017 }, { "clip_ratio": 0.00030665075905744743, "epoch": 1.3212333064443846, "grad_norm": 0.035805702209472656, "kl": 0.006537437438964844, "learning_rate": 2.01381244369591e-06, "loss": 0.0046, "step": 1018 }, { "clip_ratio": 0.0003063842187316368, "epoch": 1.3231454094589346, "grad_norm": 0.03492369130253792, "kl": 0.006512641906738281, "learning_rate": 2.0076606027624676e-06, "loss": 0.0046, "step": 1019 }, { "clip_ratio": 0.00033027163379983904, "epoch": 1.3250575124734845, "grad_norm": 0.03507117182016373, "kl": 0.006590366363525391, "learning_rate": 2.0015118617844516e-06, "loss": 0.0045, "step": 1020 }, { "clip_ratio": 0.0, "completion_length": 536.10493516922, "epoch": 1.3269696154880344, "grad_norm": 0.04077515751123428, "kl": 0.006287097930908203, "learning_rate": 1.9953662594766675e-06, "loss": 0.007, "num_tokens": 126958737.0, "reward": 0.0756138427532278, "reward_std": 0.08067478984594345, "rewards/pure_accuracy_reward_math": 0.07561384083237499, "step": 1021 }, { "clip_ratio": 0.0003038725464534764, "epoch": 1.3288817185025843, "grad_norm": 0.03825462609529495, "kl": 0.0063266754150390625, "learning_rate": 1.9892238345341544e-06, "loss": 0.007, "step": 1022 }, { "clip_ratio": 0.0003366774006963169, "epoch": 1.3307938215171342, "grad_norm": 0.03734288364648819, "kl": 0.006364345550537109, "learning_rate": 1.983084625631949e-06, "loss": 0.0069, "step": 1023 }, { "clip_ratio": 0.0003749641306853846, "epoch": 1.3327059245316841, "grad_norm": 0.03799683600664139, "kl": 0.006411075592041016, "learning_rate": 1.9769486714248367e-06, "loss": 0.0068, "step": 1024 }, { "clip_ratio": 0.0003729545476289786, "epoch": 1.334618027546234, "grad_norm": 0.03601997718214989, "kl": 0.006434917449951172, "learning_rate": 1.9708160105471105e-06, "loss": 0.0068, "step": 1025 }, { "clip_ratio": 0.0, "completion_length": 529.7709493637085, "epoch": 1.336530130560784, "grad_norm": 0.04102141782641411, "kl": 0.006857395172119141, "learning_rate": 1.964686681612327e-06, "loss": 0.0055, "num_tokens": 130592668.0, "reward": 0.06556919959257357, "reward_std": 0.06470447563333437, "rewards/pure_accuracy_reward_math": 0.0655691981955897, "step": 1026 }, { "clip_ratio": 0.00021823535962539609, "epoch": 1.3384422335753339, "grad_norm": 0.03428492322564125, "kl": 0.006598472595214844, "learning_rate": 1.9585607232130636e-06, "loss": 0.0054, "step": 1027 }, { "clip_ratio": 0.00024637427833340553, "epoch": 1.3403543365898838, "grad_norm": 0.032555270940065384, "kl": 0.006415843963623047, "learning_rate": 1.952438173920677e-06, "loss": 0.0054, "step": 1028 }, { "clip_ratio": 0.0002563797440870985, "epoch": 1.3422664396044337, "grad_norm": 0.03202388435602188, "kl": 0.006371498107910156, "learning_rate": 1.946319072285058e-06, "loss": 0.0053, "step": 1029 }, { "clip_ratio": 0.0002687414232696028, "epoch": 1.3441785426189836, "grad_norm": 0.03169838339090347, "kl": 0.006340980529785156, "learning_rate": 1.9402034568343888e-06, "loss": 0.0053, "step": 1030 }, { "clip_ratio": 0.0, "completion_length": 549.2184953689575, "epoch": 1.3460906456335335, "grad_norm": 0.054084766656160355, "kl": 0.006264686584472656, "learning_rate": 1.9340913660749015e-06, "loss": 0.0071, "num_tokens": 134289567.0, "reward": 0.06668527112924494, "reward_std": 0.07140090392204002, "rewards/pure_accuracy_reward_math": 0.06668526903376915, "step": 1031 }, { "clip_ratio": 0.00022883353369707038, "epoch": 1.3480027486480834, "grad_norm": 0.03612653911113739, "kl": 0.006344318389892578, "learning_rate": 1.9279828384906373e-06, "loss": 0.0071, "step": 1032 }, { "clip_ratio": 0.0002760976024376305, "epoch": 1.3499148516626334, "grad_norm": 0.036703869700431824, "kl": 0.006397724151611328, "learning_rate": 1.921877912543198e-06, "loss": 0.0071, "step": 1033 }, { "clip_ratio": 0.00027991523592163503, "epoch": 1.3518269546771833, "grad_norm": 0.036445919424295425, "kl": 0.006428718566894531, "learning_rate": 1.9157766266715142e-06, "loss": 0.007, "step": 1034 }, { "clip_ratio": 0.0003110420944381076, "epoch": 1.3537390576917332, "grad_norm": 0.032879918813705444, "kl": 0.006253242492675781, "learning_rate": 1.909679019291592e-06, "loss": 0.0069, "step": 1035 }, { "clip_ratio": 0.0, "completion_length": 525.200918674469, "epoch": 1.355651160706283, "grad_norm": 0.0374806709587574, "kl": 0.006623744964599609, "learning_rate": 1.9035851287962797e-06, "loss": 0.0088, "num_tokens": 137901395.0, "reward": 0.07170759295695461, "reward_std": 0.0834249026956968, "rewards/pure_accuracy_reward_math": 0.0717075907450635, "step": 1036 }, { "clip_ratio": 0.0002719677876825699, "epoch": 1.357563263720833, "grad_norm": 0.03692527487874031, "kl": 0.006625652313232422, "learning_rate": 1.8974949935550202e-06, "loss": 0.0088, "step": 1037 }, { "clip_ratio": 0.0003176050505544481, "epoch": 1.359475366735383, "grad_norm": 0.03605135530233383, "kl": 0.006484031677246094, "learning_rate": 1.8914086519136133e-06, "loss": 0.0088, "step": 1038 }, { "clip_ratio": 0.0003420261080577802, "epoch": 1.3613874697499329, "grad_norm": 0.03582129627466202, "kl": 0.006468296051025391, "learning_rate": 1.8853261421939718e-06, "loss": 0.0087, "step": 1039 }, { "clip_ratio": 0.00034158617637558564, "epoch": 1.3632995727644825, "grad_norm": 0.0346604622900486, "kl": 0.006458282470703125, "learning_rate": 1.8792475026938823e-06, "loss": 0.0086, "step": 1040 }, { "clip_ratio": 0.0, "completion_length": 525.6152620315552, "epoch": 1.3652116757790327, "grad_norm": 0.03809192404150963, "kl": 0.006644248962402344, "learning_rate": 1.8731727716867632e-06, "loss": 0.0098, "num_tokens": 141517968.0, "reward": 0.07477678963914514, "reward_std": 0.0749618403497152, "rewards/pure_accuracy_reward_math": 0.07477678678696975, "step": 1041 }, { "clip_ratio": 0.0002677642194726104, "epoch": 1.3671237787935824, "grad_norm": 0.0377020426094532, "kl": 0.0066089630126953125, "learning_rate": 1.8671019874214237e-06, "loss": 0.0098, "step": 1042 }, { "clip_ratio": 0.0002758102658617645, "epoch": 1.3690358818081325, "grad_norm": 0.03678804636001587, "kl": 0.006642341613769531, "learning_rate": 1.8610351881218211e-06, "loss": 0.0098, "step": 1043 }, { "clip_ratio": 0.0002790037015074631, "epoch": 1.3709479848226822, "grad_norm": 0.03615477308630943, "kl": 0.006649971008300781, "learning_rate": 1.8549724119868235e-06, "loss": 0.0097, "step": 1044 }, { "clip_ratio": 0.0002795595634097481, "epoch": 1.3728600878372323, "grad_norm": 0.03598296642303467, "kl": 0.006653785705566406, "learning_rate": 1.8489136971899658e-06, "loss": 0.0096, "step": 1045 }, { "clip_ratio": 0.0, "completion_length": 539.382839679718, "epoch": 1.374772190851782, "grad_norm": 0.03458879515528679, "kl": 0.0064601898193359375, "learning_rate": 1.8428590818792135e-06, "loss": 0.0038, "num_tokens": 145187116.0, "reward": 0.06584821731667034, "reward_std": 0.07200520334299654, "rewards/pure_accuracy_reward_math": 0.06584821562864818, "step": 1046 }, { "clip_ratio": 0.00023162108237784196, "epoch": 1.3766842938663322, "grad_norm": 0.03385276347398758, "kl": 0.006392478942871094, "learning_rate": 1.836808604176719e-06, "loss": 0.0038, "step": 1047 }, { "clip_ratio": 0.00026906593984676874, "epoch": 1.3785963968808819, "grad_norm": 0.0331512950360775, "kl": 0.0062427520751953125, "learning_rate": 1.8307623021785837e-06, "loss": 0.0037, "step": 1048 }, { "clip_ratio": 0.00025022312701139526, "epoch": 1.3805084998954318, "grad_norm": 0.032765790820121765, "kl": 0.006190299987792969, "learning_rate": 1.8247202139546155e-06, "loss": 0.0037, "step": 1049 }, { "clip_ratio": 0.0002507307134465009, "epoch": 1.3824206029099817, "grad_norm": 0.0325283482670784, "kl": 0.006188869476318359, "learning_rate": 1.8186823775480917e-06, "loss": 0.0036, "step": 1050 }, { "clip_ratio": 0.0, "completion_length": 539.5159296989441, "epoch": 1.3843327059245316, "grad_norm": 0.03628634661436081, "kl": 0.007945537567138672, "learning_rate": 1.8126488309755178e-06, "loss": 0.0101, "num_tokens": 148852261.0, "reward": 0.06194196696742438, "reward_std": 0.06792009872151539, "rewards/pure_accuracy_reward_math": 0.06194196580327116, "step": 1051 }, { "clip_ratio": 0.00025563780241100176, "epoch": 1.3862448089390815, "grad_norm": 0.035264719277620316, "kl": 0.007953643798828125, "learning_rate": 1.80661961222639e-06, "loss": 0.0101, "step": 1052 }, { "clip_ratio": 0.0002401949207069265, "epoch": 1.3881569119536314, "grad_norm": 0.034110233187675476, "kl": 0.007923126220703125, "learning_rate": 1.8005947592629551e-06, "loss": 0.0101, "step": 1053 }, { "clip_ratio": 0.00026547102737595196, "epoch": 1.3900690149681814, "grad_norm": 0.03364601358771324, "kl": 0.00788116455078125, "learning_rate": 1.7945743100199706e-06, "loss": 0.01, "step": 1054 }, { "clip_ratio": 0.0002951583905996813, "epoch": 1.3919811179827313, "grad_norm": 0.03397928550839424, "kl": 0.007859230041503906, "learning_rate": 1.788558302404466e-06, "loss": 0.0099, "step": 1055 }, { "clip_ratio": 0.0, "completion_length": 530.25337266922, "epoch": 1.3938932209972812, "grad_norm": 0.03863634541630745, "kl": 0.006538867950439453, "learning_rate": 1.7825467742955052e-06, "loss": 0.0066, "num_tokens": 152486009.0, "reward": 0.06780134289874695, "reward_std": 0.06736206321511418, "rewards/pure_accuracy_reward_math": 0.06780134057044052, "step": 1056 }, { "clip_ratio": 0.00027592373527340897, "epoch": 1.395805324011831, "grad_norm": 0.036583587527275085, "kl": 0.0065402984619140625, "learning_rate": 1.7765397635439468e-06, "loss": 0.0066, "step": 1057 }, { "clip_ratio": 0.0002849266509201698, "epoch": 1.397717427026381, "grad_norm": 0.03605053946375847, "kl": 0.006500244140625, "learning_rate": 1.7705373079722083e-06, "loss": 0.0065, "step": 1058 }, { "clip_ratio": 0.0003116865132142266, "epoch": 1.399629530040931, "grad_norm": 0.03675729036331177, "kl": 0.006489276885986328, "learning_rate": 1.7645394453740227e-06, "loss": 0.0064, "step": 1059 }, { "clip_ratio": 0.0003249485117748918, "epoch": 1.4015416330554809, "grad_norm": 0.03623329848051071, "kl": 0.006478786468505859, "learning_rate": 1.7585462135142083e-06, "loss": 0.0064, "step": 1060 }, { "clip_ratio": 0.0, "completion_length": 520.029598236084, "epoch": 1.4034537360700308, "grad_norm": 0.03506990894675255, "kl": 0.006392955780029297, "learning_rate": 1.752557650128423e-06, "loss": 0.0096, "num_tokens": 156082643.0, "reward": 0.06194196664728224, "reward_std": 0.07560620515141636, "rewards/pure_accuracy_reward_math": 0.061941966181620955, "step": 1061 }, { "clip_ratio": 0.0002744606111662051, "epoch": 1.4053658390845807, "grad_norm": 0.03450053185224533, "kl": 0.006424903869628906, "learning_rate": 1.7465737929229317e-06, "loss": 0.0096, "step": 1062 }, { "clip_ratio": 0.00027279697263793423, "epoch": 1.4072779420991306, "grad_norm": 0.033764585852622986, "kl": 0.006496906280517578, "learning_rate": 1.7405946795743665e-06, "loss": 0.0096, "step": 1063 }, { "clip_ratio": 0.000298209258943416, "epoch": 1.4091900451136805, "grad_norm": 0.03335048630833626, "kl": 0.0065898895263671875, "learning_rate": 1.7346203477294916e-06, "loss": 0.0095, "step": 1064 }, { "clip_ratio": 0.00030832760762677935, "epoch": 1.4111021481282304, "grad_norm": 0.03299354016780853, "kl": 0.006653308868408203, "learning_rate": 1.7286508350049627e-06, "loss": 0.0094, "step": 1065 }, { "clip_ratio": 0.0, "completion_length": 525.4023675918579, "epoch": 1.4130142511427803, "grad_norm": 0.04127517342567444, "kl": 0.010558605194091797, "learning_rate": 1.722686178987097e-06, "loss": 0.0076, "num_tokens": 159696133.0, "reward": 0.06640625282307155, "reward_std": 0.07264956791186705, "rewards/pure_accuracy_reward_math": 0.06640625101863407, "step": 1066 }, { "clip_ratio": 0.00030437137564831573, "epoch": 1.4149263541573303, "grad_norm": 0.039496634155511856, "kl": 0.010538101196289062, "learning_rate": 1.7167264172316273e-06, "loss": 0.0076, "step": 1067 }, { "clip_ratio": 0.0003244270092181978, "epoch": 1.4168384571718802, "grad_norm": 0.039376117289066315, "kl": 0.010515689849853516, "learning_rate": 1.7107715872634731e-06, "loss": 0.0075, "step": 1068 }, { "clip_ratio": 0.0003491952173817481, "epoch": 1.41875056018643, "grad_norm": 0.03863466531038284, "kl": 0.01038360595703125, "learning_rate": 1.7048217265764993e-06, "loss": 0.0075, "step": 1069 }, { "clip_ratio": 0.00037865171140083476, "epoch": 1.42066266320098, "grad_norm": 0.03795957565307617, "kl": 0.010157585144042969, "learning_rate": 1.6988768726332856e-06, "loss": 0.0074, "step": 1070 }, { "clip_ratio": 0.0, "completion_length": 512.8691644668579, "epoch": 1.42257476621553, "grad_norm": 0.04360206797719002, "kl": 0.0067138671875, "learning_rate": 1.6929370628648828e-06, "loss": 0.0086, "num_tokens": 163268528.0, "reward": 0.08565848623402417, "reward_std": 0.08861368341604248, "rewards/pure_accuracy_reward_math": 0.08565848384751007, "step": 1071 }, { "clip_ratio": 0.00031944918799808875, "epoch": 1.4244868692300798, "grad_norm": 0.04292250797152519, "kl": 0.006737709045410156, "learning_rate": 1.6870023346705866e-06, "loss": 0.0085, "step": 1072 }, { "clip_ratio": 0.00031442818647064996, "epoch": 1.4263989722446297, "grad_norm": 0.04044810310006142, "kl": 0.006873607635498047, "learning_rate": 1.6810727254176937e-06, "loss": 0.0085, "step": 1073 }, { "clip_ratio": 0.0003650832475727839, "epoch": 1.4283110752591797, "grad_norm": 0.04156485199928284, "kl": 0.006984233856201172, "learning_rate": 1.6751482724412716e-06, "loss": 0.0084, "step": 1074 }, { "clip_ratio": 0.0003947964444250829, "epoch": 1.4302231782737296, "grad_norm": 0.04023054987192154, "kl": 0.007004737854003906, "learning_rate": 1.669229013043921e-06, "loss": 0.0083, "step": 1075 }, { "clip_ratio": 0.0, "completion_length": 512.7343969345093, "epoch": 1.4321352812882795, "grad_norm": 0.03780645504593849, "kl": 0.006886005401611328, "learning_rate": 1.6633149844955415e-06, "loss": 0.0094, "num_tokens": 166836260.0, "reward": 0.0797991111758165, "reward_std": 0.08157813875004649, "rewards/pure_accuracy_reward_math": 0.07979910867288709, "step": 1076 }, { "clip_ratio": 0.0002608302990552147, "epoch": 1.4340473843028292, "grad_norm": 0.03681138530373573, "kl": 0.006786823272705078, "learning_rate": 1.6574062240330996e-06, "loss": 0.0093, "step": 1077 }, { "clip_ratio": 0.00031450060896531795, "epoch": 1.4359594873173793, "grad_norm": 0.036778852343559265, "kl": 0.0066986083984375, "learning_rate": 1.651502768860389e-06, "loss": 0.0093, "step": 1078 }, { "clip_ratio": 0.0003176571812559814, "epoch": 1.437871590331929, "grad_norm": 0.03592304140329361, "kl": 0.006758213043212891, "learning_rate": 1.6456046561478023e-06, "loss": 0.0092, "step": 1079 }, { "clip_ratio": 0.0003236016519281293, "epoch": 1.4397836933464792, "grad_norm": 0.03520684316754341, "kl": 0.006850242614746094, "learning_rate": 1.6397119230320919e-06, "loss": 0.0092, "step": 1080 }, { "clip_ratio": 0.0, "completion_length": 508.80498933792114, "epoch": 1.4416957963610288, "grad_norm": 0.04630957916378975, "kl": 0.01150655746459961, "learning_rate": 1.633824606616138e-06, "loss": 0.008, "num_tokens": 170392081.0, "reward": 0.07589286129223183, "reward_std": 0.08140548272058368, "rewards/pure_accuracy_reward_math": 0.07589285844005644, "step": 1081 }, { "clip_ratio": 0.00028873196572476445, "epoch": 1.443607899375579, "grad_norm": 0.04534924402832985, "kl": 0.01107931137084961, "learning_rate": 1.6279427439687154e-06, "loss": 0.008, "step": 1082 }, { "clip_ratio": 0.000319909158235987, "epoch": 1.4455200023901287, "grad_norm": 0.044707395136356354, "kl": 0.010364532470703125, "learning_rate": 1.622066372124262e-06, "loss": 0.0079, "step": 1083 }, { "clip_ratio": 0.0003388643909829625, "epoch": 1.4474321054046788, "grad_norm": 0.038643479347229004, "kl": 0.009525775909423828, "learning_rate": 1.6161955280826399e-06, "loss": 0.0078, "step": 1084 }, { "clip_ratio": 0.0003223289492098047, "epoch": 1.4493442084192285, "grad_norm": 0.12098709493875504, "kl": 0.010370254516601562, "learning_rate": 1.6103302488089104e-06, "loss": 0.0078, "step": 1085 }, { "clip_ratio": 0.0, "completion_length": 520.3169894218445, "epoch": 1.4512563114337784, "grad_norm": 0.03693209961056709, "kl": 0.006680965423583984, "learning_rate": 1.6044705712330932e-06, "loss": 0.0059, "num_tokens": 173992817.0, "reward": 0.07031250311410986, "reward_std": 0.07530715462053195, "rewards/pure_accuracy_reward_math": 0.07031250142608769, "step": 1086 }, { "clip_ratio": 0.0002918191117657898, "epoch": 1.4531684144483283, "grad_norm": 0.03641385957598686, "kl": 0.0065898895263671875, "learning_rate": 1.5986165322499398e-06, "loss": 0.0059, "step": 1087 }, { "clip_ratio": 0.0002921736467840219, "epoch": 1.4550805174628783, "grad_norm": 0.03598758950829506, "kl": 0.006548881530761719, "learning_rate": 1.5927681687186964e-06, "loss": 0.0058, "step": 1088 }, { "clip_ratio": 0.0003169650843233285, "epoch": 1.4569926204774282, "grad_norm": 0.036268141120672226, "kl": 0.006561756134033203, "learning_rate": 1.5869255174628778e-06, "loss": 0.0058, "step": 1089 }, { "clip_ratio": 0.0003259218068478731, "epoch": 1.458904723491978, "grad_norm": 0.03529893979430199, "kl": 0.006597042083740234, "learning_rate": 1.5810886152700302e-06, "loss": 0.0057, "step": 1090 }, { "clip_ratio": 0.0, "completion_length": 533.391206741333, "epoch": 1.460816826506528, "grad_norm": 0.04034799709916115, "kl": 0.006509304046630859, "learning_rate": 1.5752574988915004e-06, "loss": 0.0066, "num_tokens": 177633359.0, "reward": 0.07477678920258768, "reward_std": 0.0747891838545911, "rewards/pure_accuracy_reward_math": 0.07477678699069656, "step": 1091 }, { "clip_ratio": 0.0002679697158214367, "epoch": 1.462728929521078, "grad_norm": 0.039328683167696, "kl": 0.006606101989746094, "learning_rate": 1.5694322050422096e-06, "loss": 0.0066, "step": 1092 }, { "clip_ratio": 0.0002975759220475993, "epoch": 1.4646410325356278, "grad_norm": 0.03947217017412186, "kl": 0.00665283203125, "learning_rate": 1.5636127704004133e-06, "loss": 0.0065, "step": 1093 }, { "clip_ratio": 0.0003127538088278925, "epoch": 1.4665531355501777, "grad_norm": 0.03733786940574646, "kl": 0.006627559661865234, "learning_rate": 1.5577992316074783e-06, "loss": 0.0064, "step": 1094 }, { "clip_ratio": 0.00035554791872982605, "epoch": 1.4684652385647277, "grad_norm": 0.03660706803202629, "kl": 0.0065364837646484375, "learning_rate": 1.5519916252676482e-06, "loss": 0.0064, "step": 1095 }, { "clip_ratio": 0.0, "completion_length": 530.1163763999939, "epoch": 1.4703773415792776, "grad_norm": 0.06871657073497772, "kl": 0.010003089904785156, "learning_rate": 1.5461899879478133e-06, "loss": 0.0057, "num_tokens": 181268648.0, "reward": 0.0744977711874526, "reward_std": 0.08333237702026963, "rewards/pure_accuracy_reward_math": 0.0744977695576381, "step": 1096 }, { "clip_ratio": 0.00032988911306119917, "epoch": 1.4722894445938275, "grad_norm": 0.04868275299668312, "kl": 0.009030342102050781, "learning_rate": 1.5403943561772789e-06, "loss": 0.0057, "step": 1097 }, { "clip_ratio": 0.0003833602018517013, "epoch": 1.4742015476083774, "grad_norm": 0.04073934629559517, "kl": 0.00842428207397461, "learning_rate": 1.5346047664475422e-06, "loss": 0.0056, "step": 1098 }, { "clip_ratio": 0.00040459603366116426, "epoch": 1.4761136506229273, "grad_norm": 0.04011493921279907, "kl": 0.008179187774658203, "learning_rate": 1.5288212552120524e-06, "loss": 0.0055, "step": 1099 }, { "clip_ratio": 0.0004078742092019638, "epoch": 1.4780257536374772, "grad_norm": 0.03785649687051773, "kl": 0.008193016052246094, "learning_rate": 1.5230438588859881e-06, "loss": 0.0054, "step": 1100 }, { "clip_ratio": 0.0, "completion_length": 541.5837321281433, "epoch": 1.4799378566520272, "grad_norm": 0.04047717526555061, "kl": 0.007642269134521484, "learning_rate": 1.517272613846027e-06, "loss": 0.0051, "num_tokens": 184939348.0, "reward": 0.06863839572179131, "reward_std": 0.07131457631476223, "rewards/pure_accuracy_reward_math": 0.06863839420839213, "step": 1101 }, { "clip_ratio": 0.00026072144959243815, "epoch": 1.481849959666577, "grad_norm": 0.037731293588876724, "kl": 0.007551670074462891, "learning_rate": 1.511507556430114e-06, "loss": 0.0051, "step": 1102 }, { "clip_ratio": 0.00029216510773721893, "epoch": 1.483762062681127, "grad_norm": 0.03771767392754555, "kl": 0.007477760314941406, "learning_rate": 1.5057487229372347e-06, "loss": 0.0051, "step": 1103 }, { "clip_ratio": 0.0003181908435294645, "epoch": 1.485674165695677, "grad_norm": 0.03619125112891197, "kl": 0.0074062347412109375, "learning_rate": 1.4999961496271889e-06, "loss": 0.005, "step": 1104 }, { "clip_ratio": 0.0003646736843165854, "epoch": 1.4875862687102268, "grad_norm": 0.035048868507146835, "kl": 0.007380008697509766, "learning_rate": 1.4942498727203578e-06, "loss": 0.0049, "step": 1105 }, { "clip_ratio": 0.0, "completion_length": 541.8585615158081, "epoch": 1.4894983717247767, "grad_norm": 0.0386812798678875, "kl": 0.006747245788574219, "learning_rate": 1.4885099283974774e-06, "loss": 0.0071, "num_tokens": 188614221.0, "reward": 0.07198661062284373, "reward_std": 0.08140548341907561, "rewards/pure_accuracy_reward_math": 0.07198660864378326, "step": 1106 }, { "clip_ratio": 0.0003357146362077401, "epoch": 1.4914104747393266, "grad_norm": 0.03723128139972687, "kl": 0.006694316864013672, "learning_rate": 1.482776352799414e-06, "loss": 0.0071, "step": 1107 }, { "clip_ratio": 0.0003692662889989151, "epoch": 1.4933225777538766, "grad_norm": 0.038370903581380844, "kl": 0.006665706634521484, "learning_rate": 1.4770491820269317e-06, "loss": 0.007, "step": 1108 }, { "clip_ratio": 0.00040588962588117283, "epoch": 1.4952346807684265, "grad_norm": 0.037489671260118484, "kl": 0.006663322448730469, "learning_rate": 1.4713284521404678e-06, "loss": 0.0069, "step": 1109 }, { "clip_ratio": 0.00039138679812822375, "epoch": 1.4971467837829764, "grad_norm": 0.03641659393906593, "kl": 0.006697654724121094, "learning_rate": 1.465614199159905e-06, "loss": 0.0069, "step": 1110 }, { "clip_ratio": 0.0, "completion_length": 520.476583480835, "epoch": 1.4990588867975263, "grad_norm": 1.8961507081985474, "kl": 0.03508758544921875, "learning_rate": 1.4599064590643472e-06, "loss": 0.0056, "num_tokens": 192212657.0, "reward": 0.0753348250000272, "reward_std": 0.07783834805013612, "rewards/pure_accuracy_reward_math": 0.07533482302096672, "step": 1111 }, { "clip_ratio": 0.00029740781877762856, "epoch": 1.500970989812076, "grad_norm": 0.08476530015468597, "kl": 0.011601448059082031, "learning_rate": 1.4542052677918885e-06, "loss": 0.0047, "step": 1112 }, { "clip_ratio": 0.0003210891072171762, "epoch": 1.5028830928266261, "grad_norm": 0.04907820373773575, "kl": 0.010628223419189453, "learning_rate": 1.4485106612393897e-06, "loss": 0.0046, "step": 1113 }, { "clip_ratio": 0.00033912417364945213, "epoch": 1.5047951958411758, "grad_norm": 0.04438456520438194, "kl": 0.010659217834472656, "learning_rate": 1.4428226752622509e-06, "loss": 0.0046, "step": 1114 }, { "clip_ratio": 0.0003756833369834567, "epoch": 1.506707298855726, "grad_norm": 0.0422808900475502, "kl": 0.010442733764648438, "learning_rate": 1.437141345674189e-06, "loss": 0.0045, "step": 1115 }, { "clip_ratio": 0.0, "completion_length": 535.0778713226318, "epoch": 1.5086194018702757, "grad_norm": 0.048265133053064346, "kl": 0.007592678070068359, "learning_rate": 1.4314667082470064e-06, "loss": 0.0086, "num_tokens": 195861088.0, "reward": 0.07142857479630038, "reward_std": 0.08346496871672571, "rewards/pure_accuracy_reward_math": 0.07142857287544757, "step": 1116 }, { "clip_ratio": 0.0003429410510875641, "epoch": 1.5105315048848258, "grad_norm": 0.04287589713931084, "kl": 0.007152557373046875, "learning_rate": 1.4257987987103727e-06, "loss": 0.0085, "step": 1117 }, { "clip_ratio": 0.0003726668836634417, "epoch": 1.5124436078993755, "grad_norm": 0.0397462397813797, "kl": 0.006825447082519531, "learning_rate": 1.420137652751593e-06, "loss": 0.0085, "step": 1118 }, { "clip_ratio": 0.0003763367328133427, "epoch": 1.5143557109139256, "grad_norm": 0.03851110488176346, "kl": 0.006707668304443359, "learning_rate": 1.4144833060153887e-06, "loss": 0.0084, "step": 1119 }, { "clip_ratio": 0.0003624607439292049, "epoch": 1.5162678139284753, "grad_norm": 0.03720558434724808, "kl": 0.00676727294921875, "learning_rate": 1.408835794103669e-06, "loss": 0.0083, "step": 1120 }, { "clip_ratio": 0.0, "completion_length": 524.7569994926453, "epoch": 1.5181799169430255, "grad_norm": 0.03832938149571419, "kl": 0.008425712585449219, "learning_rate": 1.4031951525753088e-06, "loss": 0.0071, "num_tokens": 199475701.0, "reward": 0.08565848635043949, "reward_std": 0.08179086120799184, "rewards/pure_accuracy_reward_math": 0.08565848338184878, "step": 1121 }, { "clip_ratio": 0.00028257126655262255, "epoch": 1.5200920199575751, "grad_norm": 0.038414496928453445, "kl": 0.008458137512207031, "learning_rate": 1.3975614169459253e-06, "loss": 0.0071, "step": 1122 }, { "clip_ratio": 0.0003134008442202685, "epoch": 1.5220041229721253, "grad_norm": 0.03928304836153984, "kl": 0.008496284484863281, "learning_rate": 1.391934622687652e-06, "loss": 0.0071, "step": 1123 }, { "clip_ratio": 0.00030222541431612626, "epoch": 1.523916225986675, "grad_norm": 0.038087427616119385, "kl": 0.008494377136230469, "learning_rate": 1.38631480522892e-06, "loss": 0.007, "step": 1124 }, { "clip_ratio": 0.0002927070846396873, "epoch": 1.525828329001225, "grad_norm": 0.03641984984278679, "kl": 0.008457183837890625, "learning_rate": 1.3807019999542287e-06, "loss": 0.0069, "step": 1125 }, { "clip_ratio": 0.0, "completion_length": 531.1537666320801, "epoch": 1.5277404320157748, "grad_norm": 0.040940940380096436, "kl": 0.006596565246582031, "learning_rate": 1.3750962422039269e-06, "loss": 0.0058, "num_tokens": 203109136.0, "reward": 0.07254464621655643, "reward_std": 0.08217623952077702, "rewards/pure_accuracy_reward_math": 0.07254464400466532, "step": 1126 }, { "clip_ratio": 0.00031519718078243386, "epoch": 1.5296525350303247, "grad_norm": 0.038493506610393524, "kl": 0.006714344024658203, "learning_rate": 1.369497567273989e-06, "loss": 0.0058, "step": 1127 }, { "clip_ratio": 0.0003513000764314711, "epoch": 1.5315646380448746, "grad_norm": 0.039495162665843964, "kl": 0.006772041320800781, "learning_rate": 1.3639060104157964e-06, "loss": 0.0057, "step": 1128 }, { "clip_ratio": 0.00033387296190312554, "epoch": 1.5334767410594246, "grad_norm": 0.03875305503606796, "kl": 0.006872653961181641, "learning_rate": 1.3583216068359078e-06, "loss": 0.0057, "step": 1129 }, { "clip_ratio": 0.00036185752793471693, "epoch": 1.5353888440739745, "grad_norm": 0.03817266598343849, "kl": 0.006899356842041016, "learning_rate": 1.3527443916958466e-06, "loss": 0.0056, "step": 1130 }, { "clip_ratio": 0.0, "completion_length": 537.4143671989441, "epoch": 1.5373009470885244, "grad_norm": 0.035565100610256195, "kl": 0.006679058074951172, "learning_rate": 1.3471744001118718e-06, "loss": 0.0091, "num_tokens": 206769717.0, "reward": 0.07533482497092336, "reward_std": 0.07436373975360766, "rewards/pure_accuracy_reward_math": 0.07533482293365523, "step": 1131 }, { "clip_ratio": 0.00028060592541123697, "epoch": 1.5392130501030743, "grad_norm": 0.036901701241731644, "kl": 0.006720542907714844, "learning_rate": 1.3416116671547613e-06, "loss": 0.0091, "step": 1132 }, { "clip_ratio": 0.00034766932589036514, "epoch": 1.5411251531176242, "grad_norm": 0.03489091992378235, "kl": 0.006618499755859375, "learning_rate": 1.3360562278495899e-06, "loss": 0.009, "step": 1133 }, { "clip_ratio": 0.0003513962886927402, "epoch": 1.5430372561321741, "grad_norm": 0.035007573664188385, "kl": 0.0066070556640625, "learning_rate": 1.3305081171755092e-06, "loss": 0.009, "step": 1134 }, { "clip_ratio": 0.00036896456708745973, "epoch": 1.544949359146724, "grad_norm": 0.03363417461514473, "kl": 0.006587028503417969, "learning_rate": 1.3249673700655246e-06, "loss": 0.0089, "step": 1135 }, { "clip_ratio": 0.0, "completion_length": 531.2251925468445, "epoch": 1.546861462161274, "grad_norm": 0.037738338112831116, "kl": 0.006687164306640625, "learning_rate": 1.3194340214062828e-06, "loss": 0.0066, "num_tokens": 210404892.0, "reward": 0.07477678978466429, "reward_std": 0.08492635452421382, "rewards/pure_accuracy_reward_math": 0.07477678699069656, "step": 1136 }, { "clip_ratio": 0.0003166603274848967, "epoch": 1.5487735651758239, "grad_norm": 0.03711307421326637, "kl": 0.0067272186279296875, "learning_rate": 1.3139081060378423e-06, "loss": 0.0066, "step": 1137 }, { "clip_ratio": 0.00032532861348499864, "epoch": 1.5506856681903738, "grad_norm": 0.0381547249853611, "kl": 0.006831169128417969, "learning_rate": 1.3083896587534606e-06, "loss": 0.0065, "step": 1138 }, { "clip_ratio": 0.0003168874280845557, "epoch": 1.5525977712049237, "grad_norm": 0.03702245280146599, "kl": 0.0068492889404296875, "learning_rate": 1.3028787142993723e-06, "loss": 0.0064, "step": 1139 }, { "clip_ratio": 0.00031372528076190065, "epoch": 1.5545098742194736, "grad_norm": 0.035462986677885056, "kl": 0.0068511962890625, "learning_rate": 1.297375307374574e-06, "loss": 0.0063, "step": 1140 }, { "clip_ratio": 0.0, "completion_length": 528.9913792610168, "epoch": 1.5564219772340235, "grad_norm": 0.0402364507317543, "kl": 0.006835460662841797, "learning_rate": 1.2918794726306003e-06, "loss": 0.0099, "num_tokens": 214034825.0, "reward": 0.07310268151923083, "reward_std": 0.07917333993827924, "rewards/pure_accuracy_reward_math": 0.07310268000583164, "step": 1141 }, { "clip_ratio": 0.0003137970834359294, "epoch": 1.5583340802485734, "grad_norm": 0.03920648992061615, "kl": 0.006829738616943359, "learning_rate": 1.2863912446713084e-06, "loss": 0.0098, "step": 1142 }, { "clip_ratio": 0.00032378236608110456, "epoch": 1.5602461832631231, "grad_norm": 0.03806397691369057, "kl": 0.006905078887939453, "learning_rate": 1.2809106580526636e-06, "loss": 0.0098, "step": 1143 }, { "clip_ratio": 0.0003143088524097948, "epoch": 1.5621582862776733, "grad_norm": 0.03801356628537178, "kl": 0.006966590881347656, "learning_rate": 1.2754377472825153e-06, "loss": 0.0097, "step": 1144 }, { "clip_ratio": 0.00035796050920566813, "epoch": 1.564070389292223, "grad_norm": 0.036964964121580124, "kl": 0.006992816925048828, "learning_rate": 1.2699725468203832e-06, "loss": 0.0096, "step": 1145 }, { "clip_ratio": 0.0, "completion_length": 538.6370244026184, "epoch": 1.565982492306773, "grad_norm": 0.045449208468198776, "kl": 0.007224559783935547, "learning_rate": 1.2645150910772413e-06, "loss": 0.0043, "num_tokens": 217697304.0, "reward": 0.07393973600119352, "reward_std": 0.08620888477889821, "rewards/pure_accuracy_reward_math": 0.07393973361467943, "step": 1146 }, { "clip_ratio": 0.0003596847872131548, "epoch": 1.5678945953213228, "grad_norm": 0.03882161155343056, "kl": 0.006949901580810547, "learning_rate": 1.2590654144152992e-06, "loss": 0.0043, "step": 1147 }, { "clip_ratio": 0.0004527134210547956, "epoch": 1.569806698335873, "grad_norm": 0.03764580935239792, "kl": 0.00691986083984375, "learning_rate": 1.2536235511477852e-06, "loss": 0.0043, "step": 1148 }, { "clip_ratio": 0.0005161078099717997, "epoch": 1.5717188013504226, "grad_norm": 0.03833252564072609, "kl": 0.006892681121826172, "learning_rate": 1.2481895355387341e-06, "loss": 0.0042, "step": 1149 }, { "clip_ratio": 0.0005320426059824968, "epoch": 1.5736309043649728, "grad_norm": 0.03876457363367081, "kl": 0.006943702697753906, "learning_rate": 1.2427634018027673e-06, "loss": 0.0041, "step": 1150 }, { "clip_ratio": 0.0, "completion_length": 529.9707288742065, "epoch": 1.5755430073795225, "grad_norm": 0.03937402740120888, "kl": 0.007305145263671875, "learning_rate": 1.2373451841048781e-06, "loss": 0.0078, "num_tokens": 221325451.0, "reward": 0.08258928963914514, "reward_std": 0.08058846154017374, "rewards/pure_accuracy_reward_math": 0.08258928655413911, "step": 1151 }, { "clip_ratio": 0.0002857717965980555, "epoch": 1.5774551103940726, "grad_norm": 0.03863917291164398, "kl": 0.007287502288818359, "learning_rate": 1.2319349165602202e-06, "loss": 0.0078, "step": 1152 }, { "clip_ratio": 0.0002796752659151025, "epoch": 1.5793672134086223, "grad_norm": 0.03722836822271347, "kl": 0.007286548614501953, "learning_rate": 1.2265326332338875e-06, "loss": 0.0077, "step": 1153 }, { "clip_ratio": 0.00034041513032434523, "epoch": 1.5812793164231724, "grad_norm": 0.03688417002558708, "kl": 0.007335662841796875, "learning_rate": 1.2211383681407022e-06, "loss": 0.0076, "step": 1154 }, { "clip_ratio": 0.0003595712430524145, "epoch": 1.5831914194377221, "grad_norm": 0.037124987691640854, "kl": 0.007359981536865234, "learning_rate": 1.2157521552450035e-06, "loss": 0.0076, "step": 1155 }, { "clip_ratio": 0.0, "completion_length": 540.098798751831, "epoch": 1.5851035224522723, "grad_norm": 0.03577388823032379, "kl": 0.0069561004638671875, "learning_rate": 1.210374028460428e-06, "loss": 0.0065, "num_tokens": 224996253.0, "reward": 0.06863839607103728, "reward_std": 0.07376563857542351, "rewards/pure_accuracy_reward_math": 0.06863839426659979, "step": 1156 }, { "clip_ratio": 0.00025091522741149674, "epoch": 1.587015625466822, "grad_norm": 0.03386949375271797, "kl": 0.006894588470458984, "learning_rate": 1.2050040216497e-06, "loss": 0.0065, "step": 1157 }, { "clip_ratio": 0.00029767470277874963, "epoch": 1.588927728481372, "grad_norm": 0.033231545239686966, "kl": 0.0068531036376953125, "learning_rate": 1.1996421686244179e-06, "loss": 0.0064, "step": 1158 }, { "clip_ratio": 0.00030627386024661973, "epoch": 1.5908398314959218, "grad_norm": 0.0327543206512928, "kl": 0.006781578063964844, "learning_rate": 1.1942885031448397e-06, "loss": 0.0064, "step": 1159 }, { "clip_ratio": 0.00032285955057886895, "epoch": 1.5927519345104717, "grad_norm": 0.03283894062042236, "kl": 0.006725788116455078, "learning_rate": 1.1889430589196727e-06, "loss": 0.0063, "step": 1160 }, { "clip_ratio": 0.0, "completion_length": 540.7405333518982, "epoch": 1.5946640375250216, "grad_norm": 0.04240734875202179, "kl": 0.006897449493408203, "learning_rate": 1.183605869605858e-06, "loss": 0.0064, "num_tokens": 228663991.0, "reward": 0.08091518227593042, "reward_std": 0.08951703325146809, "rewards/pure_accuracy_reward_math": 0.08091518018045463, "step": 1161 }, { "clip_ratio": 0.00035278943187222467, "epoch": 1.5965761405395715, "grad_norm": 0.04050403833389282, "kl": 0.006961345672607422, "learning_rate": 1.1782769688083647e-06, "loss": 0.0064, "step": 1162 }, { "clip_ratio": 0.00034535837551175064, "epoch": 1.5984882435541214, "grad_norm": 0.03872028365731239, "kl": 0.007065296173095703, "learning_rate": 1.1729563900799695e-06, "loss": 0.0063, "step": 1163 }, { "clip_ratio": 0.00037939938943054585, "epoch": 1.6004003465686714, "grad_norm": 0.039447493851184845, "kl": 0.007191181182861328, "learning_rate": 1.1676441669210543e-06, "loss": 0.0063, "step": 1164 }, { "clip_ratio": 0.00037003348657549395, "epoch": 1.6023124495832213, "grad_norm": 0.03724885359406471, "kl": 0.0071163177490234375, "learning_rate": 1.1623403327793881e-06, "loss": 0.0061, "step": 1165 }, { "clip_ratio": 0.0, "completion_length": 531.3211750984192, "epoch": 1.6042245525977712, "grad_norm": 0.9447879791259766, "kl": 0.03227043151855469, "learning_rate": 1.1570449210499213e-06, "loss": 0.0085, "num_tokens": 232302082.0, "reward": 0.07756696781143546, "reward_std": 0.0780110054765828, "rewards/pure_accuracy_reward_math": 0.07756696577416733, "step": 1166 }, { "clip_ratio": 0.00036849399879201883, "epoch": 1.606136655612321, "grad_norm": 0.26742058992385864, "kl": 0.011518478393554688, "learning_rate": 1.1517579650745713e-06, "loss": 0.0079, "step": 1167 }, { "clip_ratio": 0.00029733346730154153, "epoch": 1.608048758626871, "grad_norm": 0.3907225728034973, "kl": 0.017581462860107422, "learning_rate": 1.1464794981420187e-06, "loss": 0.0079, "step": 1168 }, { "clip_ratio": 0.0003680569542439116, "epoch": 1.609960861641421, "grad_norm": 0.1778813600540161, "kl": 0.010699748992919922, "learning_rate": 1.1412095534874912e-06, "loss": 0.0077, "step": 1169 }, { "clip_ratio": 0.0003726620370798628, "epoch": 1.6118729646559709, "grad_norm": 0.2035137563943863, "kl": 0.01429891586303711, "learning_rate": 1.135948164292557e-06, "loss": 0.0077, "step": 1170 }, { "clip_ratio": 0.0, "completion_length": 519.0362968444824, "epoch": 1.6137850676705208, "grad_norm": 0.040138401091098785, "kl": 0.008060932159423828, "learning_rate": 1.130695363684916e-06, "loss": 0.0096, "num_tokens": 235898380.0, "reward": 0.0630580390279647, "reward_std": 0.07195894001051784, "rewards/pure_accuracy_reward_math": 0.06305803687428124, "step": 1171 }, { "clip_ratio": 0.0002708259837049809, "epoch": 1.6156971706850707, "grad_norm": 0.03859123960137367, "kl": 0.008191585540771484, "learning_rate": 1.1254511847381922e-06, "loss": 0.0096, "step": 1172 }, { "clip_ratio": 0.00029455311903348047, "epoch": 1.6176092736996206, "grad_norm": 0.03898981586098671, "kl": 0.008168697357177734, "learning_rate": 1.1202156604717234e-06, "loss": 0.0095, "step": 1173 }, { "clip_ratio": 0.0003440694692926627, "epoch": 1.6195213767141705, "grad_norm": 0.0370321087539196, "kl": 0.00800466537475586, "learning_rate": 1.1149888238503537e-06, "loss": 0.0094, "step": 1174 }, { "clip_ratio": 0.00040963905792068545, "epoch": 1.6214334797287204, "grad_norm": 0.03698049858212471, "kl": 0.007803440093994141, "learning_rate": 1.109770707784229e-06, "loss": 0.0094, "step": 1175 }, { "clip_ratio": 0.0, "completion_length": 525.937527179718, "epoch": 1.6233455827432703, "grad_norm": 0.039002615958452225, "kl": 0.007039546966552734, "learning_rate": 1.1045613451285837e-06, "loss": 0.0074, "num_tokens": 239513448.0, "reward": 0.06584821754950099, "reward_std": 0.07595151895657182, "rewards/pure_accuracy_reward_math": 0.06584821516298689, "step": 1176 }, { "clip_ratio": 0.0003209126220440339, "epoch": 1.6252576857578203, "grad_norm": 0.038693126291036606, "kl": 0.0069637298583984375, "learning_rate": 1.0993607686835408e-06, "loss": 0.0074, "step": 1177 }, { "clip_ratio": 0.0003234959946212257, "epoch": 1.62716978877237, "grad_norm": 0.03805870935320854, "kl": 0.006987094879150391, "learning_rate": 1.0941690111939002e-06, "loss": 0.0073, "step": 1178 }, { "clip_ratio": 0.0003316311403978034, "epoch": 1.62908189178692, "grad_norm": 0.03687576577067375, "kl": 0.0070285797119140625, "learning_rate": 1.0889861053489341e-06, "loss": 0.0072, "step": 1179 }, { "clip_ratio": 0.00033663610071243966, "epoch": 1.6309939948014698, "grad_norm": 0.03717907890677452, "kl": 0.007116794586181641, "learning_rate": 1.0838120837821814e-06, "loss": 0.0071, "step": 1180 }, { "clip_ratio": 0.0, "completion_length": 514.2112393379211, "epoch": 1.63290609781602, "grad_norm": 0.04346395656466484, "kl": 0.007472515106201172, "learning_rate": 1.0786469790712441e-06, "loss": 0.0059, "num_tokens": 243092265.0, "reward": 0.07700893233413808, "reward_std": 0.07526089128805324, "rewards/pure_accuracy_reward_math": 0.07700893029686995, "step": 1181 }, { "clip_ratio": 0.0002878125141592136, "epoch": 1.6348182008305696, "grad_norm": 0.03890342637896538, "kl": 0.007323265075683594, "learning_rate": 1.0734908237375783e-06, "loss": 0.0059, "step": 1182 }, { "clip_ratio": 0.00031910790164602076, "epoch": 1.6367303038451197, "grad_norm": 0.03748926892876625, "kl": 0.007243156433105469, "learning_rate": 1.0683436502462915e-06, "loss": 0.0058, "step": 1183 }, { "clip_ratio": 0.00036283263597169935, "epoch": 1.6386424068596694, "grad_norm": 0.037570755928754807, "kl": 0.007138252258300781, "learning_rate": 1.0632054910059391e-06, "loss": 0.0058, "step": 1184 }, { "clip_ratio": 0.00039574184188495565, "epoch": 1.6405545098742196, "grad_norm": 0.038306284695863724, "kl": 0.007193088531494141, "learning_rate": 1.0580763783683187e-06, "loss": 0.0057, "step": 1185 }, { "clip_ratio": 0.0, "completion_length": 518.925525188446, "epoch": 1.6424666128887693, "grad_norm": 0.04251728951931, "kl": 0.007372379302978516, "learning_rate": 1.0529563446282665e-06, "loss": 0.01, "num_tokens": 246686482.0, "reward": 0.08537946754950099, "reward_std": 0.08939063869183883, "rewards/pure_accuracy_reward_math": 0.08537946551223285, "step": 1186 }, { "clip_ratio": 0.0003136689152256622, "epoch": 1.6443787159033194, "grad_norm": 0.04087135195732117, "kl": 0.007419109344482422, "learning_rate": 1.0478454220234568e-06, "loss": 0.0099, "step": 1187 }, { "clip_ratio": 0.0003467907941399062, "epoch": 1.646290818917869, "grad_norm": 0.039666056632995605, "kl": 0.007442951202392578, "learning_rate": 1.0427436427341939e-06, "loss": 0.0099, "step": 1188 }, { "clip_ratio": 0.00038431568484043055, "epoch": 1.6482029219324192, "grad_norm": 0.0389142706990242, "kl": 0.007426738739013672, "learning_rate": 1.0376510388832147e-06, "loss": 0.0098, "step": 1189 }, { "clip_ratio": 0.000490980125164242, "epoch": 1.650115024946969, "grad_norm": 0.03956843912601471, "kl": 0.007406711578369141, "learning_rate": 1.0325676425354828e-06, "loss": 0.0097, "step": 1190 }, { "clip_ratio": 0.0, "completion_length": 508.4835596084595, "epoch": 1.652027127961519, "grad_norm": 0.04898946359753609, "kl": 0.008952617645263672, "learning_rate": 1.0274934856979876e-06, "loss": 0.0069, "num_tokens": 250241299.0, "reward": 0.07868303955183364, "reward_std": 0.08381028211442754, "rewards/pure_accuracy_reward_math": 0.07868303728173487, "step": 1191 }, { "clip_ratio": 0.0002854310730526777, "epoch": 1.6539392309760688, "grad_norm": 0.04304199293255806, "kl": 0.008716106414794922, "learning_rate": 1.0224286003195437e-06, "loss": 0.0069, "step": 1192 }, { "clip_ratio": 0.00029722766299755676, "epoch": 1.655851333990619, "grad_norm": 0.039751190692186356, "kl": 0.008554935455322266, "learning_rate": 1.017373018290588e-06, "loss": 0.0068, "step": 1193 }, { "clip_ratio": 0.00036785421832519205, "epoch": 1.6577634370051686, "grad_norm": 0.039316095411777496, "kl": 0.00851297378540039, "learning_rate": 1.0123267714429826e-06, "loss": 0.0067, "step": 1194 }, { "clip_ratio": 0.0003976103018885624, "epoch": 1.6596755400197185, "grad_norm": 0.03880908712744713, "kl": 0.008470535278320312, "learning_rate": 1.0072898915498094e-06, "loss": 0.0067, "step": 1195 }, { "clip_ratio": 0.0, "completion_length": 514.2179379463196, "epoch": 1.6615876430342684, "grad_norm": 0.04073133319616318, "kl": 0.0076427459716796875, "learning_rate": 1.0022624103251727e-06, "loss": 0.0095, "num_tokens": 253820892.0, "reward": 0.08593750416184776, "reward_std": 0.08978221646975726, "rewards/pure_accuracy_reward_math": 0.08593750165891834, "step": 1196 }, { "clip_ratio": 0.0003768215759691884, "epoch": 1.6634997460488183, "grad_norm": 0.039870597422122955, "kl": 0.007634639739990234, "learning_rate": 9.972443594239997e-07, "loss": 0.0095, "step": 1197 }, { "clip_ratio": 0.00033531371116168884, "epoch": 1.6654118490633683, "grad_norm": 0.039165791124105453, "kl": 0.007609367370605469, "learning_rate": 9.922357704418394e-07, "loss": 0.0094, "step": 1198 }, { "clip_ratio": 0.0003830786464504854, "epoch": 1.6673239520779182, "grad_norm": 0.0393473282456398, "kl": 0.0076847076416015625, "learning_rate": 9.872366749146684e-07, "loss": 0.0094, "step": 1199 }, { "clip_ratio": 0.0003766370310813727, "epoch": 1.669236055092468, "grad_norm": 0.037378448992967606, "kl": 0.007641792297363281, "learning_rate": 9.822471043186846e-07, "loss": 0.0093, "step": 1200 }, { "clip_ratio": 0.0, "completion_length": 502.35381841659546, "epoch": 1.671148158107018, "grad_norm": 0.051170479506254196, "kl": 0.008347511291503906, "learning_rate": 9.772670900701172e-07, "loss": 0.0074, "num_tokens": 257360516.0, "reward": 0.08537946784053929, "reward_std": 0.09248606633627787, "rewards/pure_accuracy_reward_math": 0.0853794660361018, "step": 1201 }, { "clip_ratio": 0.00036896339207714846, "epoch": 1.673060261121568, "grad_norm": 0.04540196433663368, "kl": 0.008112430572509766, "learning_rate": 9.722966635250222e-07, "loss": 0.0074, "step": 1202 }, { "clip_ratio": 0.00040850058093155894, "epoch": 1.6749723641361178, "grad_norm": 0.0428830124437809, "kl": 0.007869243621826172, "learning_rate": 9.673358559790892e-07, "loss": 0.0073, "step": 1203 }, { "clip_ratio": 0.0004735397765216476, "epoch": 1.6768844671506677, "grad_norm": 0.04445512220263481, "kl": 0.007699012756347656, "learning_rate": 9.623846986674417e-07, "loss": 0.0072, "step": 1204 }, { "clip_ratio": 0.00047387216932293086, "epoch": 1.6787965701652177, "grad_norm": 0.04317403957247734, "kl": 0.0076007843017578125, "learning_rate": 9.574432227644432e-07, "loss": 0.0071, "step": 1205 }, { "clip_ratio": 0.0, "completion_length": 511.88367557525635, "epoch": 1.6807086731797676, "grad_norm": 0.041338611394166946, "kl": 0.007639884948730469, "learning_rate": 9.525114593834975e-07, "loss": 0.0077, "num_tokens": 260924667.0, "reward": 0.07617187869618647, "reward_std": 0.08037573983892798, "rewards/pure_accuracy_reward_math": 0.0761718759604264, "step": 1206 }, { "clip_ratio": 0.00029646307336861355, "epoch": 1.6826207761943175, "grad_norm": 0.040457833558321, "kl": 0.007670402526855469, "learning_rate": 9.475894395768579e-07, "loss": 0.0077, "step": 1207 }, { "clip_ratio": 0.0003306309376966965, "epoch": 1.6845328792088674, "grad_norm": 0.03946809470653534, "kl": 0.0076751708984375, "learning_rate": 9.426771943354249e-07, "loss": 0.0076, "step": 1208 }, { "clip_ratio": 0.0003582578942200598, "epoch": 1.6864449822234173, "grad_norm": 0.04006471857428551, "kl": 0.007700443267822266, "learning_rate": 9.377747545885569e-07, "loss": 0.0075, "step": 1209 }, { "clip_ratio": 0.00040392828321955676, "epoch": 1.6883570852379672, "grad_norm": 0.04037889465689659, "kl": 0.007681369781494141, "learning_rate": 9.328821512038716e-07, "loss": 0.0074, "step": 1210 }, { "clip_ratio": 0.0, "completion_length": 533.6010298728943, "epoch": 1.6902691882525172, "grad_norm": 0.03628333657979965, "kl": 0.006788730621337891, "learning_rate": 9.279994149870539e-07, "loss": 0.0073, "num_tokens": 264564517.0, "reward": 0.06110491382423788, "reward_std": 0.06693661888130009, "rewards/pure_accuracy_reward_math": 0.06110491219442338, "step": 1211 }, { "clip_ratio": 0.0002594580842014693, "epoch": 1.692181291267067, "grad_norm": 0.034194085747003555, "kl": 0.006678581237792969, "learning_rate": 9.231265766816619e-07, "loss": 0.0073, "step": 1212 }, { "clip_ratio": 0.0003170226998463477, "epoch": 1.6940933942816168, "grad_norm": 0.035113800317049026, "kl": 0.006625652313232422, "learning_rate": 9.182636669689335e-07, "loss": 0.0073, "step": 1213 }, { "clip_ratio": 0.0003448430217076748, "epoch": 1.696005497296167, "grad_norm": 0.03626548498868942, "kl": 0.006573200225830078, "learning_rate": 9.134107164675898e-07, "loss": 0.0072, "step": 1214 }, { "clip_ratio": 0.00033195262278695736, "epoch": 1.6979176003107166, "grad_norm": 0.03465663269162178, "kl": 0.006582736968994141, "learning_rate": 9.085677557336465e-07, "loss": 0.0071, "step": 1215 }, { "clip_ratio": 0.0, "completion_length": 527.8440546989441, "epoch": 1.6998297033252667, "grad_norm": 0.038788389414548874, "kl": 0.009612560272216797, "learning_rate": 9.037348152602199e-07, "loss": 0.0052, "num_tokens": 268179390.0, "reward": 0.07756696798605844, "reward_std": 0.0852254037745297, "rewards/pure_accuracy_reward_math": 0.07756696571595967, "step": 1216 }, { "clip_ratio": 0.00027092215094626226, "epoch": 1.7017418063398164, "grad_norm": 0.038229282945394516, "kl": 0.009754657745361328, "learning_rate": 8.989119254773343e-07, "loss": 0.0052, "step": 1217 }, { "clip_ratio": 0.00027246196253827293, "epoch": 1.7036539093543666, "grad_norm": 0.03782220929861069, "kl": 0.009780406951904297, "learning_rate": 8.940991167517313e-07, "loss": 0.0051, "step": 1218 }, { "clip_ratio": 0.0003069629718197575, "epoch": 1.7055660123689163, "grad_norm": 0.03707100450992584, "kl": 0.00977468490600586, "learning_rate": 8.892964193866799e-07, "loss": 0.005, "step": 1219 }, { "clip_ratio": 0.0003035257008150438, "epoch": 1.7074781153834664, "grad_norm": 0.03552490472793579, "kl": 0.009665966033935547, "learning_rate": 8.845038636217818e-07, "loss": 0.0049, "step": 1220 }, { "clip_ratio": 0.0, "completion_length": 529.9601240158081, "epoch": 1.709390218398016, "grad_norm": 0.04051567241549492, "kl": 0.007312297821044922, "learning_rate": 8.797214796327843e-07, "loss": 0.0079, "num_tokens": 271808667.0, "reward": 0.08733259368455037, "reward_std": 0.08496641932288185, "rewards/pure_accuracy_reward_math": 0.0873325903667137, "step": 1221 }, { "clip_ratio": 0.00033132852740891394, "epoch": 1.7113023214125662, "grad_norm": 0.03887411206960678, "kl": 0.007235527038574219, "learning_rate": 8.749492975313897e-07, "loss": 0.0079, "step": 1222 }, { "clip_ratio": 0.0003587238066984355, "epoch": 1.713214424427116, "grad_norm": 0.04010055959224701, "kl": 0.007251739501953125, "learning_rate": 8.701873473650643e-07, "loss": 0.0079, "step": 1223 }, { "clip_ratio": 0.0003504625653079074, "epoch": 1.715126527441666, "grad_norm": 0.039550576359033585, "kl": 0.007262229919433594, "learning_rate": 8.654356591168522e-07, "loss": 0.0078, "step": 1224 }, { "clip_ratio": 0.0003497420942721874, "epoch": 1.7170386304562157, "grad_norm": 0.03883340209722519, "kl": 0.007348537445068359, "learning_rate": 8.60694262705182e-07, "loss": 0.0077, "step": 1225 }, { "clip_ratio": 0.0, "completion_length": 530.5396447181702, "epoch": 1.7189507334707659, "grad_norm": 0.037610165774822235, "kl": 0.007049083709716797, "learning_rate": 8.559631879836838e-07, "loss": 0.0065, "num_tokens": 275440789.0, "reward": 0.07896205675206147, "reward_std": 0.07938606152310967, "rewards/pure_accuracy_reward_math": 0.07896205494762398, "step": 1226 }, { "clip_ratio": 0.0002787316387298233, "epoch": 1.7208628364853156, "grad_norm": 0.03763109818100929, "kl": 0.007136821746826172, "learning_rate": 8.512424647409964e-07, "loss": 0.0065, "step": 1227 }, { "clip_ratio": 0.0003178273858566172, "epoch": 1.7227749394998657, "grad_norm": 0.037824735045433044, "kl": 0.007121562957763672, "learning_rate": 8.465321227005823e-07, "loss": 0.0065, "step": 1228 }, { "clip_ratio": 0.0002866029928725311, "epoch": 1.7246870425144154, "grad_norm": 0.03616493567824364, "kl": 0.00708770751953125, "learning_rate": 8.418321915205399e-07, "loss": 0.0064, "step": 1229 }, { "clip_ratio": 0.00031164622902224437, "epoch": 1.7265991455289653, "grad_norm": 0.03562076762318611, "kl": 0.007038593292236328, "learning_rate": 8.371427007934174e-07, "loss": 0.0063, "step": 1230 }, { "clip_ratio": 0.0, "completion_length": 536.3178272247314, "epoch": 1.7285112485435152, "grad_norm": 0.03759186714887619, "kl": 0.006800651550292969, "learning_rate": 8.324636800460242e-07, "loss": 0.0071, "num_tokens": 279097568.0, "reward": 0.07728794903960079, "reward_std": 0.07732657541055232, "rewards/pure_accuracy_reward_math": 0.07728794822469354, "step": 1231 }, { "clip_ratio": 0.00028705537579298834, "epoch": 1.7304233515580651, "grad_norm": 0.036786679178476334, "kl": 0.006786346435546875, "learning_rate": 8.277951587392505e-07, "loss": 0.0071, "step": 1232 }, { "clip_ratio": 0.000303516245821811, "epoch": 1.732335454572615, "grad_norm": 0.03563455864787102, "kl": 0.0068149566650390625, "learning_rate": 8.231371662678741e-07, "loss": 0.0071, "step": 1233 }, { "clip_ratio": 0.0003096325264095867, "epoch": 1.734247557587165, "grad_norm": 0.03413652628660202, "kl": 0.006861209869384766, "learning_rate": 8.184897319603813e-07, "loss": 0.007, "step": 1234 }, { "clip_ratio": 0.0003550405467649398, "epoch": 1.736159660601715, "grad_norm": 0.03433661162853241, "kl": 0.006935596466064453, "learning_rate": 8.138528850787792e-07, "loss": 0.0069, "step": 1235 }, { "clip_ratio": 0.0, "completion_length": 516.8069453239441, "epoch": 1.7380717636162648, "grad_norm": 0.2546544671058655, "kl": 0.012326240539550781, "learning_rate": 8.092266548184139e-07, "loss": 0.011, "num_tokens": 282683384.0, "reward": 0.07477678873692639, "reward_std": 0.08165826951153576, "rewards/pure_accuracy_reward_math": 0.07477678751456551, "step": 1236 }, { "clip_ratio": 0.00030172572752462656, "epoch": 1.7399838666308147, "grad_norm": 0.042716413736343384, "kl": 0.0078887939453125, "learning_rate": 8.046110703077839e-07, "loss": 0.0108, "step": 1237 }, { "clip_ratio": 0.00029401268267292835, "epoch": 1.7418959696453646, "grad_norm": 0.038783252239227295, "kl": 0.007707118988037109, "learning_rate": 8.000061606083579e-07, "loss": 0.0107, "step": 1238 }, { "clip_ratio": 0.00028625389199987694, "epoch": 1.7438080726599146, "grad_norm": 0.0381159707903862, "kl": 0.007790088653564453, "learning_rate": 7.954119547143935e-07, "loss": 0.0107, "step": 1239 }, { "clip_ratio": 0.00034677153644224745, "epoch": 1.7457201756744645, "grad_norm": 0.038590554147958755, "kl": 0.007785797119140625, "learning_rate": 7.90828481552752e-07, "loss": 0.0106, "step": 1240 }, { "clip_ratio": 0.0, "completion_length": 517.8047132492065, "epoch": 1.7476322786890144, "grad_norm": 0.03943649306893349, "kl": 0.007458209991455078, "learning_rate": 7.862557699827167e-07, "loss": 0.0092, "num_tokens": 286269120.0, "reward": 0.06640625282307155, "reward_std": 0.07607791275950149, "rewards/pure_accuracy_reward_math": 0.06640625130967237, "step": 1241 }, { "clip_ratio": 0.00031282668544463377, "epoch": 1.7495443817035643, "grad_norm": 0.0388050340116024, "kl": 0.007348060607910156, "learning_rate": 7.816938487958131e-07, "loss": 0.0092, "step": 1242 }, { "clip_ratio": 0.0003194147345197962, "epoch": 1.7514564847181142, "grad_norm": 0.038322921842336655, "kl": 0.007298946380615234, "learning_rate": 7.771427467156256e-07, "loss": 0.0091, "step": 1243 }, { "clip_ratio": 0.0003203335651846828, "epoch": 1.7533685877326641, "grad_norm": 0.037499312311410904, "kl": 0.007254600524902344, "learning_rate": 7.726024923976169e-07, "loss": 0.009, "step": 1244 }, { "clip_ratio": 0.00032696440513291236, "epoch": 1.755280690747214, "grad_norm": 0.03671669587492943, "kl": 0.007252693176269531, "learning_rate": 7.680731144289505e-07, "loss": 0.009, "step": 1245 }, { "clip_ratio": 0.0, "completion_length": 514.8644180297852, "epoch": 1.757192793761764, "grad_norm": 0.04826434701681137, "kl": 0.0094451904296875, "learning_rate": 7.635546413283054e-07, "loss": 0.0078, "num_tokens": 289848950.0, "reward": 0.07421875323052518, "reward_std": 0.07818366138963029, "rewards/pure_accuracy_reward_math": 0.074218751717126, "step": 1246 }, { "clip_ratio": 0.000299703156713349, "epoch": 1.7591048967763139, "grad_norm": 0.03791136294603348, "kl": 0.009324073791503906, "learning_rate": 7.590471015457002e-07, "loss": 0.0077, "step": 1247 }, { "clip_ratio": 0.00030542989918558305, "epoch": 1.7610169997908636, "grad_norm": 0.03703403100371361, "kl": 0.009335517883300781, "learning_rate": 7.545505234623152e-07, "loss": 0.0077, "step": 1248 }, { "clip_ratio": 0.0002983629839832247, "epoch": 1.7629291028054137, "grad_norm": 0.0363752581179142, "kl": 0.009361743927001953, "learning_rate": 7.500649353903092e-07, "loss": 0.0076, "step": 1249 }, { "clip_ratio": 0.0002923785563098136, "epoch": 1.7648412058199634, "grad_norm": 0.03587965667247772, "kl": 0.009373664855957031, "learning_rate": 7.455903655726437e-07, "loss": 0.0075, "step": 1250 }, { "clip_ratio": 0.0, "completion_length": 510.6543188095093, "epoch": 1.7667533088345135, "grad_norm": 0.03651593253016472, "kl": 0.008678436279296875, "learning_rate": 7.411268421829076e-07, "loss": 0.0059, "num_tokens": 293408275.0, "reward": 0.07031250264844857, "reward_std": 0.07401842658873647, "rewards/pure_accuracy_reward_math": 0.07031250160071068, "step": 1251 }, { "clip_ratio": 0.000244510552590782, "epoch": 1.7686654118490632, "grad_norm": 0.03525623679161072, "kl": 0.008609294891357422, "learning_rate": 7.366743933251349e-07, "loss": 0.0059, "step": 1252 }, { "clip_ratio": 0.000242228649824483, "epoch": 1.7705775148636134, "grad_norm": 0.035115260630846024, "kl": 0.008548259735107422, "learning_rate": 7.322330470336314e-07, "loss": 0.0058, "step": 1253 }, { "clip_ratio": 0.0002641637478291159, "epoch": 1.772489617878163, "grad_norm": 0.03518166393041611, "kl": 0.008442401885986328, "learning_rate": 7.278028312727961e-07, "loss": 0.0058, "step": 1254 }, { "clip_ratio": 0.0002555919315909705, "epoch": 1.7744017208927132, "grad_norm": 0.03385892137885094, "kl": 0.00841379165649414, "learning_rate": 7.233837739369462e-07, "loss": 0.0057, "step": 1255 }, { "clip_ratio": 0.0, "completion_length": 513.7271451950073, "epoch": 1.776313823907263, "grad_norm": 0.03341628611087799, "kl": 0.006855964660644531, "learning_rate": 7.189759028501417e-07, "loss": 0.0062, "num_tokens": 296984393.0, "reward": 0.06556919915601611, "reward_std": 0.06311669771093875, "rewards/pure_accuracy_reward_math": 0.06556919775903225, "step": 1256 }, { "clip_ratio": 0.0002122660096688378, "epoch": 1.778225926921813, "grad_norm": 0.03227659687399864, "kl": 0.006803989410400391, "learning_rate": 7.145792457660083e-07, "loss": 0.0062, "step": 1257 }, { "clip_ratio": 0.00023682935608348998, "epoch": 1.7801380299363627, "grad_norm": 0.03206360712647438, "kl": 0.006758213043212891, "learning_rate": 7.101938303675674e-07, "loss": 0.0062, "step": 1258 }, { "clip_ratio": 0.0002413284565250251, "epoch": 1.7820501329509129, "grad_norm": 0.031279318034648895, "kl": 0.006762981414794922, "learning_rate": 7.058196842670548e-07, "loss": 0.0061, "step": 1259 }, { "clip_ratio": 0.0002680151189338176, "epoch": 1.7839622359654626, "grad_norm": 0.031049314886331558, "kl": 0.006676197052001953, "learning_rate": 7.014568350057516e-07, "loss": 0.0061, "step": 1260 }, { "clip_ratio": 0.0, "completion_length": 532.2553224563599, "epoch": 1.7858743389800127, "grad_norm": 0.03635333850979805, "kl": 0.007339000701904297, "learning_rate": 6.971053100538116e-07, "loss": 0.0066, "num_tokens": 300622928.0, "reward": 0.0711495568684768, "reward_std": 0.07668221119092777, "rewards/pure_accuracy_reward_math": 0.07114955512224697, "step": 1261 }, { "clip_ratio": 0.00025942773436327116, "epoch": 1.7877864419945624, "grad_norm": 0.03595859929919243, "kl": 0.007373332977294922, "learning_rate": 6.927651368100843e-07, "loss": 0.0065, "step": 1262 }, { "clip_ratio": 0.00026420129074722354, "epoch": 1.7896985450091125, "grad_norm": 0.034778136759996414, "kl": 0.00739288330078125, "learning_rate": 6.884363426019444e-07, "loss": 0.0065, "step": 1263 }, { "clip_ratio": 0.0002875854173112202, "epoch": 1.7916106480236622, "grad_norm": 0.035560280084609985, "kl": 0.007449150085449219, "learning_rate": 6.841189546851224e-07, "loss": 0.0064, "step": 1264 }, { "clip_ratio": 0.00026737677507071567, "epoch": 1.7935227510382123, "grad_norm": 0.03407442197203636, "kl": 0.007452964782714844, "learning_rate": 6.79813000243528e-07, "loss": 0.0064, "step": 1265 }, { "clip_ratio": 0.0, "completion_length": 523.543550491333, "epoch": 1.795434854052762, "grad_norm": 0.03908964619040489, "kl": 0.008809566497802734, "learning_rate": 6.755185063890818e-07, "loss": 0.0074, "num_tokens": 304236988.0, "reward": 0.0747767890279647, "reward_std": 0.07865536957979202, "rewards/pure_accuracy_reward_math": 0.07477678745635785, "step": 1266 }, { "clip_ratio": 0.0002752643416670253, "epoch": 1.797346957067312, "grad_norm": 0.0380408875644207, "kl": 0.00884389877319336, "learning_rate": 6.71235500161545e-07, "loss": 0.0074, "step": 1267 }, { "clip_ratio": 0.0002959408872698077, "epoch": 1.7992590600818619, "grad_norm": 0.03713267296552658, "kl": 0.008931636810302734, "learning_rate": 6.669640085283479e-07, "loss": 0.0073, "step": 1268 }, { "clip_ratio": 0.0003134474755484007, "epoch": 1.8011711630964118, "grad_norm": 0.03684492036700249, "kl": 0.008975982666015625, "learning_rate": 6.627040583844199e-07, "loss": 0.0073, "step": 1269 }, { "clip_ratio": 0.0003336208075666036, "epoch": 1.8030832661109617, "grad_norm": 0.0364052951335907, "kl": 0.009007453918457031, "learning_rate": 6.584556765520231e-07, "loss": 0.0072, "step": 1270 }, { "clip_ratio": 0.0, "completion_length": 532.5468997955322, "epoch": 1.8049953691255116, "grad_norm": 0.03688374161720276, "kl": 0.006972789764404297, "learning_rate": 6.542188897805782e-07, "loss": 0.0076, "num_tokens": 307881200.0, "reward": 0.06082589610014111, "reward_std": 0.06925509008578956, "rewards/pure_accuracy_reward_math": 0.06082589423749596, "step": 1271 }, { "clip_ratio": 0.0002535940801635661, "epoch": 1.8069074721400615, "grad_norm": 0.03543318435549736, "kl": 0.006913661956787109, "learning_rate": 6.499937247465002e-07, "loss": 0.0076, "step": 1272 }, { "clip_ratio": 0.00029529011806062044, "epoch": 1.8088195751546114, "grad_norm": 0.034321434795856476, "kl": 0.006764411926269531, "learning_rate": 6.457802080530304e-07, "loss": 0.0075, "step": 1273 }, { "clip_ratio": 0.00032198404306882367, "epoch": 1.8107316781691614, "grad_norm": 0.03342648968100548, "kl": 0.006732940673828125, "learning_rate": 6.415783662300662e-07, "loss": 0.0075, "step": 1274 }, { "clip_ratio": 0.000381207836142039, "epoch": 1.8126437811837113, "grad_norm": 0.034588467329740524, "kl": 0.006687164306640625, "learning_rate": 6.373882257339964e-07, "loss": 0.0074, "step": 1275 }, { "clip_ratio": 0.0, "completion_length": 528.7452836036682, "epoch": 1.8145558841982612, "grad_norm": 0.039650533348321915, "kl": 0.012791156768798828, "learning_rate": 6.33209812947532e-07, "loss": 0.0068, "num_tokens": 311509399.0, "reward": 0.06919643239234574, "reward_std": 0.07131457643117756, "rewards/pure_accuracy_reward_math": 0.06919642988941632, "step": 1276 }, { "clip_ratio": 0.00028128568749252736, "epoch": 1.816467987212811, "grad_norm": 0.039305564016103745, "kl": 0.012639522552490234, "learning_rate": 6.290431541795456e-07, "loss": 0.0068, "step": 1277 }, { "clip_ratio": 0.00027201296376233586, "epoch": 1.818380090227361, "grad_norm": 0.038404785096645355, "kl": 0.012586116790771484, "learning_rate": 6.248882756648988e-07, "loss": 0.0067, "step": 1278 }, { "clip_ratio": 0.00027703067632955936, "epoch": 1.820292193241911, "grad_norm": 0.037614692002534866, "kl": 0.01236581802368164, "learning_rate": 6.207452035642814e-07, "loss": 0.0066, "step": 1279 }, { "clip_ratio": 0.000309511864088563, "epoch": 1.8222042962564609, "grad_norm": 0.03737355023622513, "kl": 0.012206554412841797, "learning_rate": 6.166139639640454e-07, "loss": 0.0065, "step": 1280 }, { "clip_ratio": 0.0, "completion_length": 526.473795413971, "epoch": 1.8241163992710108, "grad_norm": 0.03713076934218407, "kl": 0.007002353668212891, "learning_rate": 6.124945828760406e-07, "loss": 0.0059, "num_tokens": 315129533.0, "reward": 0.06445312840514816, "reward_std": 0.06921502435579896, "rewards/pure_accuracy_reward_math": 0.0644531259604264, "step": 1281 }, { "clip_ratio": 0.00024346445911760384, "epoch": 1.8260285022855607, "grad_norm": 0.03588669002056122, "kl": 0.006989955902099609, "learning_rate": 6.083870862374513e-07, "loss": 0.0059, "step": 1282 }, { "clip_ratio": 0.0002329723478737833, "epoch": 1.8279406053001104, "grad_norm": 0.03526683151721954, "kl": 0.007010459899902344, "learning_rate": 6.042914999106342e-07, "loss": 0.0058, "step": 1283 }, { "clip_ratio": 0.00023291378442991117, "epoch": 1.8298527083146605, "grad_norm": 0.03384559601545334, "kl": 0.007075786590576172, "learning_rate": 6.002078496829514e-07, "loss": 0.0058, "step": 1284 }, { "clip_ratio": 0.0002458733478647446, "epoch": 1.8317648113292102, "grad_norm": 0.03377237543463707, "kl": 0.0071315765380859375, "learning_rate": 5.961361612666139e-07, "loss": 0.0057, "step": 1285 }, { "clip_ratio": 0.0, "completion_length": 525.0859618186951, "epoch": 1.8336769143437603, "grad_norm": 0.0914173573255539, "kl": 0.012554645538330078, "learning_rate": 5.920764602985141e-07, "loss": 0.0058, "num_tokens": 318747025.0, "reward": 0.06612723506987095, "reward_std": 0.06865079142153263, "rewards/pure_accuracy_reward_math": 0.06612723355647177, "step": 1286 }, { "clip_ratio": 0.00025586230526641884, "epoch": 1.83558901735831, "grad_norm": 0.04225718230009079, "kl": 0.010876655578613281, "learning_rate": 5.88028772340068e-07, "loss": 0.0057, "step": 1287 }, { "clip_ratio": 0.00024814432106268214, "epoch": 1.8375011203728602, "grad_norm": 0.03636258468031883, "kl": 0.010531425476074219, "learning_rate": 5.839931228770526e-07, "loss": 0.0057, "step": 1288 }, { "clip_ratio": 0.0002984523198108491, "epoch": 1.8394132233874099, "grad_norm": 0.03610241040587425, "kl": 0.010416984558105469, "learning_rate": 5.799695373194461e-07, "loss": 0.0056, "step": 1289 }, { "clip_ratio": 0.00032527196299270145, "epoch": 1.84132532640196, "grad_norm": 0.034912850707769394, "kl": 0.010428428649902344, "learning_rate": 5.759580410012691e-07, "loss": 0.0055, "step": 1290 }, { "clip_ratio": 0.0, "completion_length": 520.4793767929077, "epoch": 1.8432374294165097, "grad_norm": 0.04220513626933098, "kl": 0.009058475494384766, "learning_rate": 5.719586591804222e-07, "loss": 0.0071, "num_tokens": 322345307.0, "reward": 0.07366071786964312, "reward_std": 0.07878176297526807, "rewards/pure_accuracy_reward_math": 0.07366071542492136, "step": 1291 }, { "clip_ratio": 0.00030183524040694465, "epoch": 1.8451495324310598, "grad_norm": 0.03849344700574875, "kl": 0.009106636047363281, "learning_rate": 5.679714170385283e-07, "loss": 0.0071, "step": 1292 }, { "clip_ratio": 0.00035880112773156725, "epoch": 1.8470616354456095, "grad_norm": 0.037096235901117325, "kl": 0.009167194366455078, "learning_rate": 5.63996339680776e-07, "loss": 0.0071, "step": 1293 }, { "clip_ratio": 0.00040293739141361584, "epoch": 1.8489737384601597, "grad_norm": 0.03884498402476311, "kl": 0.009192943572998047, "learning_rate": 5.600334521357581e-07, "loss": 0.007, "step": 1294 }, { "clip_ratio": 0.00038201194092835067, "epoch": 1.8508858414747094, "grad_norm": 0.03875093162059784, "kl": 0.009291648864746094, "learning_rate": 5.560827793553159e-07, "loss": 0.0069, "step": 1295 }, { "clip_ratio": 0.0, "completion_length": 518.3301024436951, "epoch": 1.8527979444892595, "grad_norm": 0.04254430532455444, "kl": 0.008441925048828125, "learning_rate": 5.52144346214383e-07, "loss": 0.0063, "num_tokens": 325938766.0, "reward": 0.07840402127476409, "reward_std": 0.08084744628285989, "rewards/pure_accuracy_reward_math": 0.07840401929570362, "step": 1296 }, { "clip_ratio": 0.0002986583057804637, "epoch": 1.8547100475038092, "grad_norm": 0.041676584631204605, "kl": 0.008450508117675781, "learning_rate": 5.482181775108278e-07, "loss": 0.0062, "step": 1297 }, { "clip_ratio": 0.00031948441494478175, "epoch": 1.8566221505183593, "grad_norm": 0.03955300524830818, "kl": 0.008507251739501953, "learning_rate": 5.443042979652957e-07, "loss": 0.0062, "step": 1298 }, { "clip_ratio": 0.0003085145480667961, "epoch": 1.858534253532909, "grad_norm": 0.03848061338067055, "kl": 0.008501052856445312, "learning_rate": 5.404027322210556e-07, "loss": 0.0061, "step": 1299 }, { "clip_ratio": 0.0003855731235944404, "epoch": 1.8604463565474592, "grad_norm": 0.04076399654150009, "kl": 0.00849771499633789, "learning_rate": 5.365135048438438e-07, "loss": 0.006, "step": 1300 }, { "clip_ratio": 0.0, "completion_length": 529.5170464515686, "epoch": 1.8623584595620088, "grad_norm": 0.14906181395053864, "kl": 0.007767677307128906, "learning_rate": 5.326366403217093e-07, "loss": 0.0084, "num_tokens": 329571311.0, "reward": 0.07254464630386792, "reward_std": 0.08418946416350082, "rewards/pure_accuracy_reward_math": 0.07254464438301511, "step": 1301 }, { "clip_ratio": 0.00028383656763253384, "epoch": 1.8642705625765588, "grad_norm": 0.04550671949982643, "kl": 0.008212089538574219, "learning_rate": 5.287721630648615e-07, "loss": 0.0083, "step": 1302 }, { "clip_ratio": 0.0003281467976989916, "epoch": 1.8661826655911087, "grad_norm": 0.05260877683758736, "kl": 0.008829593658447266, "learning_rate": 5.249200974055132e-07, "loss": 0.0083, "step": 1303 }, { "clip_ratio": 0.00036754867960553383, "epoch": 1.8680947686056586, "grad_norm": 0.0511869452893734, "kl": 0.008836746215820312, "learning_rate": 5.210804675977299e-07, "loss": 0.0082, "step": 1304 }, { "clip_ratio": 0.0004018283953541868, "epoch": 1.8700068716202085, "grad_norm": 0.044321924448013306, "kl": 0.008379459381103516, "learning_rate": 5.172532978172753e-07, "loss": 0.0081, "step": 1305 }, { "clip_ratio": 0.0, "completion_length": 512.9788198471069, "epoch": 1.8719189746347584, "grad_norm": 0.04202428087592125, "kl": 0.0076198577880859375, "learning_rate": 5.134386121614615e-07, "loss": 0.0072, "num_tokens": 333143795.0, "reward": 0.07421875317231752, "reward_std": 0.07986396714113653, "rewards/pure_accuracy_reward_math": 0.074218751717126, "step": 1306 }, { "clip_ratio": 0.00027569573836672134, "epoch": 1.8738310776493083, "grad_norm": 0.040443304926157, "kl": 0.007631778717041016, "learning_rate": 5.096364346489935e-07, "loss": 0.0072, "step": 1307 }, { "clip_ratio": 0.00027392168607320855, "epoch": 1.8757431806638583, "grad_norm": 0.040238041430711746, "kl": 0.007664203643798828, "learning_rate": 5.058467892198241e-07, "loss": 0.0071, "step": 1308 }, { "clip_ratio": 0.0003170029604007141, "epoch": 1.8776552836784082, "grad_norm": 0.039109617471694946, "kl": 0.007664203643798828, "learning_rate": 5.02069699734995e-07, "loss": 0.007, "step": 1309 }, { "clip_ratio": 0.0003183572773082233, "epoch": 1.879567386692958, "grad_norm": 0.03724955767393112, "kl": 0.007700443267822266, "learning_rate": 4.983051899764946e-07, "loss": 0.007, "step": 1310 }, { "clip_ratio": 0.0, "completion_length": 505.4592852592468, "epoch": 1.881479489707508, "grad_norm": 0.03964386135339737, "kl": 0.007820606231689453, "learning_rate": 4.945532836471026e-07, "loss": 0.0074, "num_tokens": 336685165.0, "reward": 0.0848214327415917, "reward_std": 0.07835631881607696, "rewards/pure_accuracy_reward_math": 0.08482142965658568, "step": 1311 }, { "clip_ratio": 0.0002873320136700386, "epoch": 1.883391592722058, "grad_norm": 0.03871289640665054, "kl": 0.007764339447021484, "learning_rate": 4.908140043702426e-07, "loss": 0.0074, "step": 1312 }, { "clip_ratio": 0.0003113469839775007, "epoch": 1.8853036957366078, "grad_norm": 0.03769771382212639, "kl": 0.007766246795654297, "learning_rate": 4.870873756898345e-07, "loss": 0.0074, "step": 1313 }, { "clip_ratio": 0.00034381698696961394, "epoch": 1.8872157987511577, "grad_norm": 0.03724011033773422, "kl": 0.007775783538818359, "learning_rate": 4.833734210701435e-07, "loss": 0.0073, "step": 1314 }, { "clip_ratio": 0.0003651243675335536, "epoch": 1.8891279017657077, "grad_norm": 0.03757576644420624, "kl": 0.007784366607666016, "learning_rate": 4.796721638956376e-07, "loss": 0.0072, "step": 1315 }, { "clip_ratio": 0.0, "completion_length": 527.5703339576721, "epoch": 1.8910400047802576, "grad_norm": 0.03592124208807945, "kl": 0.007517337799072266, "learning_rate": 4.7598362747083293e-07, "loss": 0.008, "num_tokens": 340304225.0, "reward": 0.06501116388244554, "reward_std": 0.0762443722342141, "rewards/pure_accuracy_reward_math": 0.06501116219442338, "step": 1316 }, { "clip_ratio": 0.00026663288446115985, "epoch": 1.8929521077948075, "grad_norm": 0.03529619425535202, "kl": 0.007477283477783203, "learning_rate": 4.7230783502015346e-07, "loss": 0.008, "step": 1317 }, { "clip_ratio": 0.00025462434007295087, "epoch": 1.8948642108093574, "grad_norm": 0.03387421742081642, "kl": 0.007337093353271484, "learning_rate": 4.6864480968778103e-07, "loss": 0.008, "step": 1318 }, { "clip_ratio": 0.00031681645646131074, "epoch": 1.8967763138239073, "grad_norm": 0.033014364540576935, "kl": 0.007318019866943359, "learning_rate": 4.649945745375109e-07, "loss": 0.0079, "step": 1319 }, { "clip_ratio": 0.00037019279989181086, "epoch": 1.898688416838457, "grad_norm": 0.033140987157821655, "kl": 0.007157325744628906, "learning_rate": 4.613571525526081e-07, "loss": 0.0078, "step": 1320 }, { "clip_ratio": 0.0, "completion_length": 523.3727917671204, "epoch": 1.9006005198530072, "grad_norm": 0.03997303172945976, "kl": 0.007628440856933594, "learning_rate": 4.577325666356586e-07, "loss": 0.0118, "num_tokens": 343915401.0, "reward": 0.08816964740981348, "reward_std": 0.08973595389397815, "rewards/pure_accuracy_reward_math": 0.08816964426659979, "step": 1321 }, { "clip_ratio": 0.0003053776546835252, "epoch": 1.9025126228675568, "grad_norm": 0.039738208055496216, "kl": 0.007574558258056641, "learning_rate": 4.541208396084304e-07, "loss": 0.0117, "step": 1322 }, { "clip_ratio": 0.00030029478972437573, "epoch": 1.904424725882107, "grad_norm": 0.038392502814531326, "kl": 0.007514476776123047, "learning_rate": 4.5052199421172475e-07, "loss": 0.0117, "step": 1323 }, { "clip_ratio": 0.0003343055576010556, "epoch": 1.9063368288966567, "grad_norm": 0.037236347794532776, "kl": 0.007477760314941406, "learning_rate": 4.4693605310523636e-07, "loss": 0.0116, "step": 1324 }, { "clip_ratio": 0.00032557199602933906, "epoch": 1.9082489319112068, "grad_norm": 0.03678731992840767, "kl": 0.007478237152099609, "learning_rate": 4.43363038867409e-07, "loss": 0.0115, "step": 1325 }, { "clip_ratio": 0.0, "completion_length": 513.3047099113464, "epoch": 1.9101610349257565, "grad_norm": 0.11113768815994263, "kl": 0.013922691345214844, "learning_rate": 4.39802973995295e-07, "loss": 0.0093, "num_tokens": 347490901.0, "reward": 0.09486607549479231, "reward_std": 0.09372853260720149, "rewards/pure_accuracy_reward_math": 0.09486607305007055, "step": 1326 }, { "clip_ratio": 0.00036943193325100765, "epoch": 1.9120731379403066, "grad_norm": 0.055216722190380096, "kl": 0.013732433319091797, "learning_rate": 4.362558809044107e-07, "loss": 0.0093, "step": 1327 }, { "clip_ratio": 0.0004000666916681439, "epoch": 1.9139852409548563, "grad_norm": 0.045698132365942, "kl": 0.013063907623291016, "learning_rate": 4.327217819286e-07, "loss": 0.0092, "step": 1328 }, { "clip_ratio": 0.0004443397794489101, "epoch": 1.9158973439694065, "grad_norm": 0.04273562505841255, "kl": 0.012539863586425781, "learning_rate": 4.292006993198888e-07, "loss": 0.009, "step": 1329 }, { "clip_ratio": 0.0004470848766686686, "epoch": 1.9178094469839562, "grad_norm": 0.04232070967555046, "kl": 0.012142658233642578, "learning_rate": 4.2569265524834756e-07, "loss": 0.0089, "step": 1330 }, { "clip_ratio": 0.0, "completion_length": 518.7550463676453, "epoch": 1.9197215499985063, "grad_norm": 0.03724661469459534, "kl": 0.007449150085449219, "learning_rate": 4.221976718019505e-07, "loss": 0.007, "num_tokens": 351086731.0, "reward": 0.06919643189758062, "reward_std": 0.07200520270271227, "rewards/pure_accuracy_reward_math": 0.06919642974389717, "step": 1331 }, { "clip_ratio": 0.00027471570277270985, "epoch": 1.921633653013056, "grad_norm": 0.03599303960800171, "kl": 0.007382869720458984, "learning_rate": 4.187157709864392e-07, "loss": 0.007, "step": 1332 }, { "clip_ratio": 0.0002737036326720954, "epoch": 1.9235457560276061, "grad_norm": 0.03614535927772522, "kl": 0.007375240325927734, "learning_rate": 4.152469747251794e-07, "loss": 0.0069, "step": 1333 }, { "clip_ratio": 0.00030229948259830053, "epoch": 1.9254578590421558, "grad_norm": 0.03546711429953575, "kl": 0.0072498321533203125, "learning_rate": 4.117913048590283e-07, "loss": 0.0069, "step": 1334 }, { "clip_ratio": 0.00030038867771509103, "epoch": 1.927369962056706, "grad_norm": 0.03401359170675278, "kl": 0.007149219512939453, "learning_rate": 4.0834878314619244e-07, "loss": 0.0068, "step": 1335 }, { "clip_ratio": 0.0, "completion_length": 526.2182154655457, "epoch": 1.9292820650712557, "grad_norm": 0.04080551117658615, "kl": 0.006867885589599609, "learning_rate": 4.049194312620927e-07, "loss": 0.0092, "num_tokens": 354708525.0, "reward": 0.07756696798605844, "reward_std": 0.08467356563778594, "rewards/pure_accuracy_reward_math": 0.07756696530850604, "step": 1336 }, { "clip_ratio": 0.0002796990767137686, "epoch": 1.9311941680858056, "grad_norm": 0.038895782083272934, "kl": 0.006824970245361328, "learning_rate": 4.015032707992286e-07, "loss": 0.0092, "step": 1337 }, { "clip_ratio": 0.00032694752422912643, "epoch": 1.9331062711003555, "grad_norm": 0.03889061138033867, "kl": 0.006866931915283203, "learning_rate": 3.9810032326704106e-07, "loss": 0.0091, "step": 1338 }, { "clip_ratio": 0.0003511786251237936, "epoch": 1.9350183741149054, "grad_norm": 0.03880919888615608, "kl": 0.006947994232177734, "learning_rate": 3.9471061009177693e-07, "loss": 0.009, "step": 1339 }, { "clip_ratio": 0.000323922223401496, "epoch": 1.9369304771294553, "grad_norm": 0.036964643746614456, "kl": 0.007033824920654297, "learning_rate": 3.91334152616355e-07, "loss": 0.0089, "step": 1340 }, { "clip_ratio": 0.0, "completion_length": 527.7076120376587, "epoch": 1.9388425801440052, "grad_norm": 0.04040682688355446, "kl": 0.007448673248291016, "learning_rate": 3.879709721002317e-07, "loss": 0.0052, "num_tokens": 358339045.0, "reward": 0.07896205660654232, "reward_std": 0.08278053888352588, "rewards/pure_accuracy_reward_math": 0.07896205550059676, "step": 1341 }, { "clip_ratio": 0.00029579239503618737, "epoch": 1.9407546831585551, "grad_norm": 0.03910582885146141, "kl": 0.007539272308349609, "learning_rate": 3.8462108971926564e-07, "loss": 0.0052, "step": 1342 }, { "clip_ratio": 0.0003078770084812277, "epoch": 1.942666786173105, "grad_norm": 0.03942732512950897, "kl": 0.007628440856933594, "learning_rate": 3.8128452656558623e-07, "loss": 0.0051, "step": 1343 }, { "clip_ratio": 0.0003229538778555252, "epoch": 1.944578889187655, "grad_norm": 0.03747202083468437, "kl": 0.007678031921386719, "learning_rate": 3.779613036474583e-07, "loss": 0.005, "step": 1344 }, { "clip_ratio": 0.000363169818285769, "epoch": 1.946490992202205, "grad_norm": 0.036778781563043594, "kl": 0.0076923370361328125, "learning_rate": 3.746514418891545e-07, "loss": 0.0049, "step": 1345 }, { "clip_ratio": 0.0, "completion_length": 532.7960658073425, "epoch": 1.9484030952167548, "grad_norm": 0.040943268686532974, "kl": 0.011704444885253906, "learning_rate": 3.713549621308174e-07, "loss": 0.005, "num_tokens": 361980918.0, "reward": 0.07059152092551813, "reward_std": 0.07973137585213408, "rewards/pure_accuracy_reward_math": 0.07059151900466532, "step": 1346 }, { "clip_ratio": 0.00029914512055029263, "epoch": 1.9503151982313047, "grad_norm": 0.04052672162652016, "kl": 0.0114288330078125, "learning_rate": 3.6807188512833406e-07, "loss": 0.005, "step": 1347 }, { "clip_ratio": 0.000334167169853572, "epoch": 1.9522273012458546, "grad_norm": 0.04054692015051842, "kl": 0.011135578155517578, "learning_rate": 3.648022315532007e-07, "loss": 0.0049, "step": 1348 }, { "clip_ratio": 0.00035840429575273447, "epoch": 1.9541394042604046, "grad_norm": 0.03996079042553902, "kl": 0.010680675506591797, "learning_rate": 3.615460219923955e-07, "loss": 0.0048, "step": 1349 }, { "clip_ratio": 0.00034668986540964397, "epoch": 1.9560515072749545, "grad_norm": 0.037566084414720535, "kl": 0.010373115539550781, "learning_rate": 3.5830327694824777e-07, "loss": 0.0047, "step": 1350 }, { "clip_ratio": 0.0, "completion_length": 534.6453948020935, "epoch": 1.9579636102895044, "grad_norm": 0.03812556713819504, "kl": 0.007121086120605469, "learning_rate": 3.5507401683830933e-07, "loss": 0.0114, "num_tokens": 365629991.0, "reward": 0.07672991411527619, "reward_std": 0.07831625349353999, "rewards/pure_accuracy_reward_math": 0.07672991178696975, "step": 1351 }, { "clip_ratio": 0.0003128355612602718, "epoch": 1.9598757133040543, "grad_norm": 0.03631382808089256, "kl": 0.007141590118408203, "learning_rate": 3.518582619952257e-07, "loss": 0.0114, "step": 1352 }, { "clip_ratio": 0.00033067399391484287, "epoch": 1.9617878163186042, "grad_norm": 0.03752359002828598, "kl": 0.007140636444091797, "learning_rate": 3.486560326666072e-07, "loss": 0.0113, "step": 1353 }, { "clip_ratio": 0.00037038392605381887, "epoch": 1.9636999193331541, "grad_norm": 0.03724711388349533, "kl": 0.007131099700927734, "learning_rate": 3.4546734901490466e-07, "loss": 0.0112, "step": 1354 }, { "clip_ratio": 0.00040464663743478013, "epoch": 1.9656120223477038, "grad_norm": 0.034875430166721344, "kl": 0.007108211517333984, "learning_rate": 3.42292231117278e-07, "loss": 0.0112, "step": 1355 }, { "clip_ratio": 0.0, "completion_length": 519.9101786613464, "epoch": 1.967524125362254, "grad_norm": 0.04123640060424805, "kl": 0.007243156433105469, "learning_rate": 3.3913069896547217e-07, "loss": 0.0069, "num_tokens": 369229613.0, "reward": 0.08007812878349796, "reward_std": 0.085311732836999, "rewards/pure_accuracy_reward_math": 0.0800781263387762, "step": 1356 }, { "clip_ratio": 0.00033138683619426956, "epoch": 1.9694362283768037, "grad_norm": 0.04048166796565056, "kl": 0.007332801818847656, "learning_rate": 3.3598277246569307e-07, "loss": 0.0069, "step": 1357 }, { "clip_ratio": 0.0003668193609200898, "epoch": 1.9713483313913538, "grad_norm": 0.042313288897275925, "kl": 0.007485866546630859, "learning_rate": 3.3284847143847834e-07, "loss": 0.0068, "step": 1358 }, { "clip_ratio": 0.0003713441701620468, "epoch": 1.9732604344059035, "grad_norm": 0.04199962690472603, "kl": 0.007598400115966797, "learning_rate": 3.2972781561857433e-07, "loss": 0.0067, "step": 1359 }, { "clip_ratio": 0.0003367169608736731, "epoch": 1.9751725374204536, "grad_norm": 0.03874565288424492, "kl": 0.007636547088623047, "learning_rate": 3.266208246548136e-07, "loss": 0.0066, "step": 1360 }, { "clip_ratio": 0.0, "completion_length": 516.4445023536682, "epoch": 1.9770846404350033, "grad_norm": 0.040357448160648346, "kl": 0.007414817810058594, "learning_rate": 3.2352751810998896e-07, "loss": 0.0055, "num_tokens": 372817046.0, "reward": 0.08258928993018344, "reward_std": 0.09080576250562444, "rewards/pure_accuracy_reward_math": 0.08258928690338507, "step": 1361 }, { "clip_ratio": 0.00038423701278134104, "epoch": 1.9789967434495535, "grad_norm": 0.03990958258509636, "kl": 0.007411479949951172, "learning_rate": 3.2044791546072985e-07, "loss": 0.0055, "step": 1362 }, { "clip_ratio": 0.00044172884827275993, "epoch": 1.9809088464641031, "grad_norm": 0.042212970554828644, "kl": 0.007319450378417969, "learning_rate": 3.173820360973823e-07, "loss": 0.0054, "step": 1363 }, { "clip_ratio": 0.00042502668532051757, "epoch": 1.9828209494786533, "grad_norm": 0.03946436941623688, "kl": 0.0072727203369140625, "learning_rate": 3.1432989932388416e-07, "loss": 0.0053, "step": 1364 }, { "clip_ratio": 0.00040032099315112646, "epoch": 1.984733052493203, "grad_norm": 0.03701746463775635, "kl": 0.007288455963134766, "learning_rate": 3.1129152435764473e-07, "loss": 0.0052, "step": 1365 }, { "clip_ratio": 0.0, "completion_length": 519.9707279205322, "epoch": 1.9866451555077531, "grad_norm": 0.03677362576127052, "kl": 0.00740814208984375, "learning_rate": 3.0826693032942586e-07, "loss": 0.008, "num_tokens": 376414405.0, "reward": 0.07087053926079534, "reward_std": 0.07741290412377566, "rewards/pure_accuracy_reward_math": 0.07087053710711189, "step": 1366 }, { "clip_ratio": 0.0002998853265978596, "epoch": 1.9885572585223028, "grad_norm": 0.03619634732604027, "kl": 0.0074787139892578125, "learning_rate": 3.0525613628321656e-07, "loss": 0.0079, "step": 1367 }, { "clip_ratio": 0.00031987275491474065, "epoch": 1.990469361536853, "grad_norm": 0.03580261766910553, "kl": 0.007512092590332031, "learning_rate": 3.022591611761169e-07, "loss": 0.0079, "step": 1368 }, { "clip_ratio": 0.00029055258056587263, "epoch": 1.9923814645514026, "grad_norm": 0.03512256592512131, "kl": 0.007531166076660156, "learning_rate": 2.9927602387821916e-07, "loss": 0.0078, "step": 1369 }, { "clip_ratio": 0.0003325358438814874, "epoch": 1.9942935675659528, "grad_norm": 0.03404110670089722, "kl": 0.007470130920410156, "learning_rate": 2.963067431724856e-07, "loss": 0.0077, "step": 1370 }, { "clip_ratio": 0.0, "completion_length": 524.95845079422, "epoch": 2.0019121030145497, "grad_norm": 0.03709035739302635, "kl": 0.007386684417724609, "learning_rate": 2.9335133775463266e-07, "loss": 0.011, "num_tokens": 380027444.0, "reward": 0.07198661039001308, "reward_std": 0.07208533387165517, "rewards/pure_accuracy_reward_math": 0.07198660876019858, "step": 1371 }, { "clip_ratio": 0.0002751371110321088, "epoch": 2.0038242060291, "grad_norm": 0.03661485016345978, "kl": 0.007431507110595703, "learning_rate": 2.9040982623301264e-07, "loss": 0.011, "step": 1372 }, { "clip_ratio": 0.0003175289227783651, "epoch": 2.0057363090436495, "grad_norm": 0.036799393594264984, "kl": 0.007405281066894531, "learning_rate": 2.874822271284977e-07, "loss": 0.0109, "step": 1373 }, { "clip_ratio": 0.0003284543961399322, "epoch": 2.0076484120581997, "grad_norm": 0.036977026611566544, "kl": 0.007386684417724609, "learning_rate": 2.8456855887436074e-07, "loss": 0.0108, "step": 1374 }, { "clip_ratio": 0.00032697250054525284, "epoch": 2.0095605150727494, "grad_norm": 0.03594314306974411, "kl": 0.00739288330078125, "learning_rate": 2.816688398161613e-07, "loss": 0.0108, "step": 1375 }, { "clip_ratio": 0.0, "completion_length": 524.5270891189575, "epoch": 2.0114726180872995, "grad_norm": 15.976890563964844, "kl": 0.4394536018371582, "learning_rate": 2.7878308821162964e-07, "loss": 0.0259, "num_tokens": 383639505.0, "reward": 0.08286830733413808, "reward_std": 0.08972975501092151, "rewards/pure_accuracy_reward_math": 0.08286830488941632, "step": 1376 }, { "clip_ratio": 0.0003084787746274742, "epoch": 2.013384721101849, "grad_norm": 1.2859545946121216, "kl": 0.04446220397949219, "learning_rate": 2.759113222305512e-07, "loss": 0.0102, "step": 1377 }, { "clip_ratio": 0.00034848380650487343, "epoch": 2.0152968241163993, "grad_norm": 0.0618804506957531, "kl": 0.009487152099609375, "learning_rate": 2.730535599546524e-07, "loss": 0.0087, "step": 1378 }, { "clip_ratio": 0.000346398171132023, "epoch": 2.017208927130949, "grad_norm": 0.039353594183921814, "kl": 0.008243560791015625, "learning_rate": 2.702098193774891e-07, "loss": 0.0087, "step": 1379 }, { "clip_ratio": 0.000389314118024231, "epoch": 2.019121030145499, "grad_norm": 0.03626256063580513, "kl": 0.0083465576171875, "learning_rate": 2.6738011840432817e-07, "loss": 0.0086, "step": 1380 }, { "clip_ratio": 0.0, "completion_length": 504.881441116333, "epoch": 2.021033133160049, "grad_norm": 0.03991848975419998, "kl": 0.00807046890258789, "learning_rate": 2.6456447485204014e-07, "loss": 0.0078, "num_tokens": 387180856.0, "reward": 0.07700893218861893, "reward_std": 0.0893906393321231, "rewards/pure_accuracy_reward_math": 0.07700893026776612, "step": 1381 }, { "clip_ratio": 0.00029079897933570464, "epoch": 2.022945236174599, "grad_norm": 0.03955512493848801, "kl": 0.008087635040283203, "learning_rate": 2.617629064489838e-07, "loss": 0.0078, "step": 1382 }, { "clip_ratio": 0.00034119405472665676, "epoch": 2.0248573391891487, "grad_norm": 0.04050750657916069, "kl": 0.008031845092773438, "learning_rate": 2.5897543083489544e-07, "loss": 0.0077, "step": 1383 }, { "clip_ratio": 0.0003633832532159431, "epoch": 2.026769442203699, "grad_norm": 0.03760417178273201, "kl": 0.007889270782470703, "learning_rate": 2.562020655607772e-07, "loss": 0.0076, "step": 1384 }, { "clip_ratio": 0.00040043183099669477, "epoch": 2.0286815452182485, "grad_norm": 0.036376822739839554, "kl": 0.007742404937744141, "learning_rate": 2.534428280887891e-07, "loss": 0.0076, "step": 1385 }, { "clip_ratio": 0.0, "completion_length": 521.2332820892334, "epoch": 2.0305936482327986, "grad_norm": 0.03659322112798691, "kl": 0.0079498291015625, "learning_rate": 2.50697735792135e-07, "loss": 0.0074, "num_tokens": 390784592.0, "reward": 0.0678013424621895, "reward_std": 0.07990403228905052, "rewards/pure_accuracy_reward_math": 0.06780134083237499, "step": 1386 }, { "clip_ratio": 0.0003029348101790674, "epoch": 2.0325057512473483, "grad_norm": 0.03603421524167061, "kl": 0.0077915191650390625, "learning_rate": 2.47966805954957e-07, "loss": 0.0073, "step": 1387 }, { "clip_ratio": 0.0002788126068935526, "epoch": 2.0344178542618985, "grad_norm": 0.035584706813097, "kl": 0.00768280029296875, "learning_rate": 2.4525005577222373e-07, "loss": 0.0073, "step": 1388 }, { "clip_ratio": 0.00033219700696918153, "epoch": 2.036329957276448, "grad_norm": 0.033913753926754, "kl": 0.007656097412109375, "learning_rate": 2.42547502349624e-07, "loss": 0.0072, "step": 1389 }, { "clip_ratio": 0.00034793876449157324, "epoch": 2.0382420602909983, "grad_norm": 0.033490557223558426, "kl": 0.007609367370605469, "learning_rate": 2.398591627034588e-07, "loss": 0.0072, "step": 1390 }, { "clip_ratio": 0.0, "completion_length": 534.8217334747314, "epoch": 2.040154163305548, "grad_norm": 0.04065319523215294, "kl": 0.007349491119384766, "learning_rate": 2.3718505376053246e-07, "loss": 0.0094, "num_tokens": 394433277.0, "reward": 0.07589286056463607, "reward_std": 0.09050671145087108, "rewards/pure_accuracy_reward_math": 0.07589285823632963, "step": 1391 }, { "clip_ratio": 0.00032872594630362073, "epoch": 2.042066266320098, "grad_norm": 0.0390729084610939, "kl": 0.007353305816650391, "learning_rate": 2.345251923580491e-07, "loss": 0.0094, "step": 1392 }, { "clip_ratio": 0.00038015836332760955, "epoch": 2.043978369334648, "grad_norm": 0.037973206490278244, "kl": 0.007381916046142578, "learning_rate": 2.3187959524350352e-07, "loss": 0.0093, "step": 1393 }, { "clip_ratio": 0.00041672343576237836, "epoch": 2.045890472349198, "grad_norm": 0.037547629326581955, "kl": 0.007441043853759766, "learning_rate": 2.2924827907457841e-07, "loss": 0.0092, "step": 1394 }, { "clip_ratio": 0.00047711057584365335, "epoch": 2.0478025753637477, "grad_norm": 0.037767618894577026, "kl": 0.007452487945556641, "learning_rate": 2.266312604190374e-07, "loss": 0.0091, "step": 1395 }, { "clip_ratio": 0.0, "completion_length": 520.9163165092468, "epoch": 2.049714678378298, "grad_norm": 0.039165694266557693, "kl": 0.007717609405517578, "learning_rate": 2.2402855575462152e-07, "loss": 0.0071, "num_tokens": 398030605.0, "reward": 0.07840402194415219, "reward_std": 0.08072105259634554, "rewards/pure_accuracy_reward_math": 0.07840401885914616, "step": 1396 }, { "clip_ratio": 0.0002864374472437703, "epoch": 2.0516267813928475, "grad_norm": 0.03918104246258736, "kl": 0.007798194885253906, "learning_rate": 2.2144018146894542e-07, "loss": 0.007, "step": 1397 }, { "clip_ratio": 0.00028412381868747616, "epoch": 2.0535388844073976, "grad_norm": 0.03787809982895851, "kl": 0.007855415344238281, "learning_rate": 2.1886615385939502e-07, "loss": 0.007, "step": 1398 }, { "clip_ratio": 0.0002802736350417945, "epoch": 2.0554509874219473, "grad_norm": 0.03685666248202324, "kl": 0.007898807525634766, "learning_rate": 2.1630648913302354e-07, "loss": 0.0069, "step": 1399 }, { "clip_ratio": 0.0003048399971703475, "epoch": 2.0573630904364975, "grad_norm": 0.03653446584939957, "kl": 0.0079193115234375, "learning_rate": 2.1376120340645014e-07, "loss": 0.0068, "step": 1400 }, { "clip_ratio": 0.0, "completion_length": 523.7120804786682, "epoch": 2.059275193451047, "grad_norm": 0.041400156915187836, "kl": 0.0076904296875, "learning_rate": 2.1123031270575827e-07, "loss": 0.0112, "num_tokens": 401639357.0, "reward": 0.08398437922005542, "reward_std": 0.08836089540272951, "rewards/pure_accuracy_reward_math": 0.08398437665891834, "step": 1401 }, { "clip_ratio": 0.0003276587292475597, "epoch": 2.0611872964655973, "grad_norm": 0.04058953374624252, "kl": 0.007676601409912109, "learning_rate": 2.0871383296639487e-07, "loss": 0.0112, "step": 1402 }, { "clip_ratio": 0.00033817819053183484, "epoch": 2.063099399480147, "grad_norm": 0.040160875767469406, "kl": 0.007659435272216797, "learning_rate": 2.062117800330693e-07, "loss": 0.0112, "step": 1403 }, { "clip_ratio": 0.00034579052078242967, "epoch": 2.065011502494697, "grad_norm": 0.03876737132668495, "kl": 0.007627964019775391, "learning_rate": 2.0372416965965675e-07, "loss": 0.0111, "step": 1404 }, { "clip_ratio": 0.00035969930786450277, "epoch": 2.066923605509247, "grad_norm": 0.03797266259789467, "kl": 0.007703304290771484, "learning_rate": 2.0125101750909315e-07, "loss": 0.011, "step": 1405 }, { "clip_ratio": 0.0, "completion_length": 514.2500252723694, "epoch": 2.068835708523797, "grad_norm": 0.05333253741264343, "kl": 0.010094165802001953, "learning_rate": 1.9879233915328312e-07, "loss": 0.0065, "num_tokens": 405215041.0, "reward": 0.08231027176952921, "reward_std": 0.08208991179708391, "rewards/pure_accuracy_reward_math": 0.08231026903376915, "step": 1406 }, { "clip_ratio": 0.0002884399551135175, "epoch": 2.0707478115383466, "grad_norm": 0.04066501557826996, "kl": 0.009914398193359375, "learning_rate": 1.9634815007299634e-07, "loss": 0.0065, "step": 1407 }, { "clip_ratio": 0.0003325861029566113, "epoch": 2.0726599145528963, "grad_norm": 0.03939688578248024, "kl": 0.00982666015625, "learning_rate": 1.9391846565777418e-07, "loss": 0.0064, "step": 1408 }, { "clip_ratio": 0.0003743518978467364, "epoch": 2.0745720175674465, "grad_norm": 0.03857440873980522, "kl": 0.009755611419677734, "learning_rate": 1.9150330120583012e-07, "loss": 0.0063, "step": 1409 }, { "clip_ratio": 0.0004666026043196325, "epoch": 2.076484120581996, "grad_norm": 0.03952641412615776, "kl": 0.0096588134765625, "learning_rate": 1.891026719239547e-07, "loss": 0.0062, "step": 1410 }, { "clip_ratio": 0.0, "completion_length": 516.8532605171204, "epoch": 2.0783962235965463, "grad_norm": 0.04142899066209793, "kl": 0.008448123931884766, "learning_rate": 1.8671659292742007e-07, "loss": 0.0099, "num_tokens": 408804459.0, "reward": 0.08286830742144957, "reward_std": 0.08260788215557113, "rewards/pure_accuracy_reward_math": 0.08286830509314314, "step": 1411 }, { "clip_ratio": 0.0003487231184635675, "epoch": 2.080308326611096, "grad_norm": 0.040530916303396225, "kl": 0.008367538452148438, "learning_rate": 1.8434507923988375e-07, "loss": 0.0099, "step": 1412 }, { "clip_ratio": 0.0003221970002869057, "epoch": 2.082220429625646, "grad_norm": 0.03941330686211586, "kl": 0.008350849151611328, "learning_rate": 1.8198814579329426e-07, "loss": 0.0098, "step": 1413 }, { "clip_ratio": 0.00037204451541583694, "epoch": 2.084132532640196, "grad_norm": 0.03861032798886299, "kl": 0.008304595947265625, "learning_rate": 1.7964580742779847e-07, "loss": 0.0097, "step": 1414 }, { "clip_ratio": 0.0003590778907209824, "epoch": 2.086044635654746, "grad_norm": 0.03945469483733177, "kl": 0.008287906646728516, "learning_rate": 1.7731807889164537e-07, "loss": 0.0096, "step": 1415 }, { "clip_ratio": 0.0, "completion_length": 529.592381477356, "epoch": 2.0879567386692957, "grad_norm": 0.03833872824907303, "kl": 0.0077228546142578125, "learning_rate": 1.7500497484109703e-07, "loss": 0.0109, "num_tokens": 412432506.0, "reward": 0.07449777142028324, "reward_std": 0.08200978167587891, "rewards/pure_accuracy_reward_math": 0.07449776885914616, "step": 1416 }, { "clip_ratio": 0.0002795722035671133, "epoch": 2.089868841683846, "grad_norm": 0.03684116527438164, "kl": 0.007727146148681641, "learning_rate": 1.7270650984033245e-07, "loss": 0.0108, "step": 1417 }, { "clip_ratio": 0.00033119657558700055, "epoch": 2.0917809446983955, "grad_norm": 0.03667665645480156, "kl": 0.007739067077636719, "learning_rate": 1.7042269836135882e-07, "loss": 0.0108, "step": 1418 }, { "clip_ratio": 0.00036255177064958843, "epoch": 2.0936930477129456, "grad_norm": 0.037857044488191605, "kl": 0.007757663726806641, "learning_rate": 1.6815355478391886e-07, "loss": 0.0107, "step": 1419 }, { "clip_ratio": 0.0003589615364489873, "epoch": 2.0956051507274953, "grad_norm": 0.0360855907201767, "kl": 0.007729053497314453, "learning_rate": 1.6589909339539968e-07, "loss": 0.0106, "step": 1420 }, { "clip_ratio": 0.0, "completion_length": 523.7469544410706, "epoch": 2.0975172537420455, "grad_norm": 0.041348401457071304, "kl": 0.007639408111572266, "learning_rate": 1.6365932839074532e-07, "loss": 0.0099, "num_tokens": 416048915.0, "reward": 0.07979911076836288, "reward_std": 0.08175079576903954, "rewards/pure_accuracy_reward_math": 0.07979910861467943, "step": 1421 }, { "clip_ratio": 0.00028084742956480113, "epoch": 2.099429356756595, "grad_norm": 0.03983917832374573, "kl": 0.007691860198974609, "learning_rate": 1.6143427387236455e-07, "loss": 0.0099, "step": 1422 }, { "clip_ratio": 0.00032101355429858813, "epoch": 2.1013414597711453, "grad_norm": 0.04035898670554161, "kl": 0.007829666137695312, "learning_rate": 1.592239438500434e-07, "loss": 0.0098, "step": 1423 }, { "clip_ratio": 0.00036129408920260175, "epoch": 2.103253562785695, "grad_norm": 0.03893222287297249, "kl": 0.0079498291015625, "learning_rate": 1.570283522408586e-07, "loss": 0.0097, "step": 1424 }, { "clip_ratio": 0.0003233651194136655, "epoch": 2.105165665800245, "grad_norm": 0.03798089176416397, "kl": 0.008071422576904297, "learning_rate": 1.5484751286908655e-07, "loss": 0.0097, "step": 1425 }, { "clip_ratio": 0.0, "completion_length": 515.3281455039978, "epoch": 2.107077768814795, "grad_norm": 0.04489213973283768, "kl": 0.00823831558227539, "learning_rate": 1.5268143946611802e-07, "loss": 0.01, "num_tokens": 419628171.0, "reward": 0.07952009321888909, "reward_std": 0.0892580482759513, "rewards/pure_accuracy_reward_math": 0.07952009089058265, "step": 1426 }, { "clip_ratio": 0.0003507794546067089, "epoch": 2.108989871829345, "grad_norm": 0.04182901233434677, "kl": 0.008199691772460938, "learning_rate": 1.5053014567037171e-07, "loss": 0.01, "step": 1427 }, { "clip_ratio": 0.0004634781105323782, "epoch": 2.1109019748438946, "grad_norm": 0.04111779108643532, "kl": 0.008260250091552734, "learning_rate": 1.483936450272097e-07, "loss": 0.0099, "step": 1428 }, { "clip_ratio": 0.0005032591409417364, "epoch": 2.1128140778584448, "grad_norm": 0.04071485623717308, "kl": 0.008274078369140625, "learning_rate": 1.4627195098884856e-07, "loss": 0.0098, "step": 1429 }, { "clip_ratio": 0.0005640338476382567, "epoch": 2.1147261808729945, "grad_norm": 0.041747044771909714, "kl": 0.008271217346191406, "learning_rate": 1.441650769142791e-07, "loss": 0.0097, "step": 1430 }, { "clip_ratio": 0.0, "completion_length": 527.8217334747314, "epoch": 2.1166382838875446, "grad_norm": 0.04057304188609123, "kl": 0.00798797607421875, "learning_rate": 1.4207303606917856e-07, "loss": 0.0057, "num_tokens": 423255484.0, "reward": 0.08761161076836288, "reward_std": 0.09866452467394993, "rewards/pure_accuracy_reward_math": 0.08761160855647177, "step": 1431 }, { "clip_ratio": 0.0003497144300581567, "epoch": 2.1185503869020943, "grad_norm": 0.03972388803958893, "kl": 0.007953643798828125, "learning_rate": 1.3999584162582874e-07, "loss": 0.0057, "step": 1432 }, { "clip_ratio": 0.00037741022566706306, "epoch": 2.1204624899166444, "grad_norm": 0.03924018144607544, "kl": 0.00795888900756836, "learning_rate": 1.3793350666303328e-07, "loss": 0.0056, "step": 1433 }, { "clip_ratio": 0.0003785647801350933, "epoch": 2.122374592931194, "grad_norm": 0.03913624957203865, "kl": 0.007895946502685547, "learning_rate": 1.3588604416603424e-07, "loss": 0.0055, "step": 1434 }, { "clip_ratio": 0.0003937934675377619, "epoch": 2.1242866959457443, "grad_norm": 0.03699544072151184, "kl": 0.00783538818359375, "learning_rate": 1.3385346702643188e-07, "loss": 0.0054, "step": 1435 }, { "clip_ratio": 0.0, "completion_length": 533.7888078689575, "epoch": 2.126198798960294, "grad_norm": 0.042676378041505814, "kl": 0.010451793670654297, "learning_rate": 1.3183578804210173e-07, "loss": 0.0098, "num_tokens": 426903267.0, "reward": 0.07645089671132155, "reward_std": 0.08488008996937424, "rewards/pure_accuracy_reward_math": 0.07645089426659979, "step": 1436 }, { "clip_ratio": 0.00036263700505401175, "epoch": 2.128110901974844, "grad_norm": 0.03884616866707802, "kl": 0.010242462158203125, "learning_rate": 1.2983301991711578e-07, "loss": 0.0098, "step": 1437 }, { "clip_ratio": 0.0003990789759313884, "epoch": 2.130023004989394, "grad_norm": 0.0399676114320755, "kl": 0.01007843017578125, "learning_rate": 1.278451752616608e-07, "loss": 0.0097, "step": 1438 }, { "clip_ratio": 0.0004171350746560165, "epoch": 2.131935108003944, "grad_norm": 0.039714373648166656, "kl": 0.010037422180175781, "learning_rate": 1.258722665919604e-07, "loss": 0.0097, "step": 1439 }, { "clip_ratio": 0.00039808801824392503, "epoch": 2.1338472110184936, "grad_norm": 0.03794709965586662, "kl": 0.009942054748535156, "learning_rate": 1.2391430633019452e-07, "loss": 0.0096, "step": 1440 }, { "clip_ratio": 0.0, "completion_length": 525.7826709747314, "epoch": 2.1357593140330433, "grad_norm": 0.05131447687745094, "kl": 0.00860595703125, "learning_rate": 1.2197130680442399e-07, "loss": 0.0073, "num_tokens": 430520032.0, "reward": 0.07282366428989917, "reward_std": 0.0797313749208115, "rewards/pure_accuracy_reward_math": 0.07282366172876209, "step": 1441 }, { "clip_ratio": 0.0003007381984616586, "epoch": 2.1376714170475934, "grad_norm": 0.03815394267439842, "kl": 0.008358001708984375, "learning_rate": 1.2004328024850938e-07, "loss": 0.0073, "step": 1442 }, { "clip_ratio": 0.0003256684682355626, "epoch": 2.139583520062143, "grad_norm": 0.03841105103492737, "kl": 0.008275985717773438, "learning_rate": 1.1813023880203722e-07, "loss": 0.0072, "step": 1443 }, { "clip_ratio": 0.00034418403180325186, "epoch": 2.1414956230766933, "grad_norm": 0.041511572897434235, "kl": 0.008276939392089844, "learning_rate": 1.1623219451024098e-07, "loss": 0.0071, "step": 1444 }, { "clip_ratio": 0.00032526867431670325, "epoch": 2.143407726091243, "grad_norm": 0.03922862559556961, "kl": 0.008294105529785156, "learning_rate": 1.1434915932392682e-07, "loss": 0.007, "step": 1445 }, { "clip_ratio": 0.0, "completion_length": 526.7310523986816, "epoch": 2.145319829105793, "grad_norm": 0.04134941101074219, "kl": 0.008166313171386719, "learning_rate": 1.1248114509939817e-07, "loss": 0.0067, "num_tokens": 434141592.0, "reward": 0.08342634307336994, "reward_std": 0.08578344061970711, "rewards/pure_accuracy_reward_math": 0.08342634132714011, "step": 1446 }, { "clip_ratio": 0.00029539940015865795, "epoch": 2.147231932120343, "grad_norm": 0.04034848138689995, "kl": 0.008122920989990234, "learning_rate": 1.1062816359838024e-07, "loss": 0.0066, "step": 1447 }, { "clip_ratio": 0.0003565281184592095, "epoch": 2.149144035134893, "grad_norm": 0.04018424078822136, "kl": 0.00803232192993164, "learning_rate": 1.0879022648794645e-07, "loss": 0.0066, "step": 1448 }, { "clip_ratio": 0.0003515161848781645, "epoch": 2.1510561381494426, "grad_norm": 0.03917380049824715, "kl": 0.007886886596679688, "learning_rate": 1.0696734534044629e-07, "loss": 0.0065, "step": 1449 }, { "clip_ratio": 0.0004228238227028669, "epoch": 2.1529682411639928, "grad_norm": 0.038036227226257324, "kl": 0.00785064697265625, "learning_rate": 1.0515953163342973e-07, "loss": 0.0064, "step": 1450 }, { "clip_ratio": 0.0, "completion_length": 544.0078330039978, "epoch": 2.1548803441785425, "grad_norm": 0.03814779594540596, "kl": 0.008002758026123047, "learning_rate": 1.0336679674957716e-07, "loss": 0.0113, "num_tokens": 437824108.0, "reward": 0.07533482514554635, "reward_std": 0.07659588241949677, "rewards/pure_accuracy_reward_math": 0.07533482287544757, "step": 1451 }, { "clip_ratio": 0.0002914705042371679, "epoch": 2.1567924471930926, "grad_norm": 0.03763413056731224, "kl": 0.00798654556274414, "learning_rate": 1.0158915197662628e-07, "loss": 0.0113, "step": 1452 }, { "clip_ratio": 0.0002916823746659247, "epoch": 2.1587045502076423, "grad_norm": 0.036225125193595886, "kl": 0.008030414581298828, "learning_rate": 9.982660850730269e-08, "loss": 0.0112, "step": 1453 }, { "clip_ratio": 0.0002708278207137482, "epoch": 2.1606166532221924, "grad_norm": 0.03529945760965347, "kl": 0.00803375244140625, "learning_rate": 9.807917743924838e-08, "loss": 0.0112, "step": 1454 }, { "clip_ratio": 0.0002930295025862506, "epoch": 2.162528756236742, "grad_norm": 0.03426925837993622, "kl": 0.007987022399902344, "learning_rate": 9.634686977495089e-08, "loss": 0.0111, "step": 1455 }, { "clip_ratio": 0.0, "completion_length": 517.6585068702698, "epoch": 2.1644408592512923, "grad_norm": 0.038425736129283905, "kl": 0.008115291595458984, "learning_rate": 9.462969642167613e-08, "loss": 0.0052, "num_tokens": 441407888.0, "reward": 0.07617187869618647, "reward_std": 0.0740246243076399, "rewards/pure_accuracy_reward_math": 0.07617187630967237, "step": 1456 }, { "clip_ratio": 0.00023060813538222646, "epoch": 2.166352962265842, "grad_norm": 0.03851727396249771, "kl": 0.008001327514648438, "learning_rate": 9.292766819139847e-08, "loss": 0.0052, "step": 1457 }, { "clip_ratio": 0.0002378168165932948, "epoch": 2.168265065280392, "grad_norm": 0.040155645459890366, "kl": 0.007994651794433594, "learning_rate": 9.12407958007322e-08, "loss": 0.0051, "step": 1458 }, { "clip_ratio": 0.0002497726611068174, "epoch": 2.170177168294942, "grad_norm": 0.0425233468413353, "kl": 0.007935047149658203, "learning_rate": 8.956908987086538e-08, "loss": 0.005, "step": 1459 }, { "clip_ratio": 0.00030142679486289126, "epoch": 2.172089271309492, "grad_norm": 0.03647738695144653, "kl": 0.007966041564941406, "learning_rate": 8.791256092749223e-08, "loss": 0.0049, "step": 1460 }, { "clip_ratio": 0.0, "completion_length": 520.2968997955322, "epoch": 2.1740013743240416, "grad_norm": 0.22045741975307465, "kl": 0.022356510162353516, "learning_rate": 8.627121940074645e-08, "loss": 0.0122, "num_tokens": 445010628.0, "reward": 0.08705357578583062, "reward_std": 0.08814817463280633, "rewards/pure_accuracy_reward_math": 0.08705357281723991, "step": 1461 }, { "clip_ratio": 0.00031046926528688346, "epoch": 2.1759134773385918, "grad_norm": 0.06329243630170822, "kl": 0.015823841094970703, "learning_rate": 8.464507562513657e-08, "loss": 0.0119, "step": 1462 }, { "clip_ratio": 0.0003438202776351318, "epoch": 2.1778255803531414, "grad_norm": 0.05041000247001648, "kl": 0.014271736145019531, "learning_rate": 8.303413983948017e-08, "loss": 0.0118, "step": 1463 }, { "clip_ratio": 0.0003563892260558532, "epoch": 2.1797376833676916, "grad_norm": 0.04660080000758171, "kl": 0.013462543487548828, "learning_rate": 8.143842218683862e-08, "loss": 0.0117, "step": 1464 }, { "clip_ratio": 0.0004125210731444895, "epoch": 2.1816497863822413, "grad_norm": 0.04536700248718262, "kl": 0.012927532196044922, "learning_rate": 7.985793271445636e-08, "loss": 0.0116, "step": 1465 }, { "clip_ratio": 0.0, "completion_length": 517.6127443313599, "epoch": 2.1835618893967914, "grad_norm": 0.08454474061727524, "kl": 0.010744094848632812, "learning_rate": 7.829268137369311e-08, "loss": 0.0075, "num_tokens": 448601372.0, "reward": 0.0750558071595151, "reward_std": 0.0813654173980467, "rewards/pure_accuracy_reward_math": 0.07505580488941632, "step": 1466 }, { "clip_ratio": 0.00028517025145902153, "epoch": 2.185473992411341, "grad_norm": 0.04138394817709923, "kl": 0.009669780731201172, "learning_rate": 7.674267801996427e-08, "loss": 0.0075, "step": 1467 }, { "clip_ratio": 0.00027802770790685827, "epoch": 2.1873860954258912, "grad_norm": 0.03745463490486145, "kl": 0.009511947631835938, "learning_rate": 7.52079324126792e-08, "loss": 0.0074, "step": 1468 }, { "clip_ratio": 0.0003267590287805433, "epoch": 2.189298198440441, "grad_norm": 0.036841075867414474, "kl": 0.00956106185913086, "learning_rate": 7.368845421517779e-08, "loss": 0.0073, "step": 1469 }, { "clip_ratio": 0.0003443693621534294, "epoch": 2.191210301454991, "grad_norm": 0.0362345427274704, "kl": 0.009715557098388672, "learning_rate": 7.21842529946698e-08, "loss": 0.0072, "step": 1470 }, { "clip_ratio": 0.0, "completion_length": 499.83763551712036, "epoch": 2.1931224044695408, "grad_norm": 0.0431695282459259, "kl": 0.008378028869628906, "learning_rate": 7.0695338222177e-08, "loss": 0.0093, "num_tokens": 452124382.0, "reward": 0.07756696839351207, "reward_std": 0.08685944566968828, "rewards/pure_accuracy_reward_math": 0.07756696530850604, "step": 1471 }, { "clip_ratio": 0.0003288618632950602, "epoch": 2.195034507484091, "grad_norm": 0.042445823550224304, "kl": 0.008408546447753906, "learning_rate": 6.922171927247062e-08, "loss": 0.0092, "step": 1472 }, { "clip_ratio": 0.0003429904774066017, "epoch": 2.1969466104986406, "grad_norm": 0.04231419414281845, "kl": 0.008434295654296875, "learning_rate": 6.776340542401422e-08, "loss": 0.0092, "step": 1473 }, { "clip_ratio": 0.00035230960349963425, "epoch": 2.1988587135131903, "grad_norm": 0.04162426292896271, "kl": 0.008434295654296875, "learning_rate": 6.632040585890398e-08, "loss": 0.0091, "step": 1474 }, { "clip_ratio": 0.000348456743722636, "epoch": 2.2007708165277404, "grad_norm": 0.04009128361940384, "kl": 0.008394718170166016, "learning_rate": 6.489272966281269e-08, "loss": 0.009, "step": 1475 }, { "clip_ratio": 0.0, "completion_length": 511.53015899658203, "epoch": 2.2026829195422906, "grad_norm": 0.03803718462586403, "kl": 0.008605003356933594, "learning_rate": 6.348038582493e-08, "loss": 0.0064, "num_tokens": 455697798.0, "reward": 0.06863839633297175, "reward_std": 0.0772402475704439, "rewards/pure_accuracy_reward_math": 0.06863839423749596, "step": 1476 }, { "clip_ratio": 0.0002735381897878142, "epoch": 2.2045950225568403, "grad_norm": 0.036724258214235306, "kl": 0.008575439453125, "learning_rate": 6.208338323790891e-08, "loss": 0.0064, "step": 1477 }, { "clip_ratio": 0.000271568493644736, "epoch": 2.20650712557139, "grad_norm": 0.03627302870154381, "kl": 0.008494853973388672, "learning_rate": 6.070173069780638e-08, "loss": 0.0063, "step": 1478 }, { "clip_ratio": 0.0003129301562694309, "epoch": 2.20841922858594, "grad_norm": 0.035685960203409195, "kl": 0.008512496948242188, "learning_rate": 5.933543690403082e-08, "loss": 0.0063, "step": 1479 }, { "clip_ratio": 0.0003575469975203305, "epoch": 2.21033133160049, "grad_norm": 0.03495527431368828, "kl": 0.008492469787597656, "learning_rate": 5.7984510459285215e-08, "loss": 0.0062, "step": 1480 }, { "clip_ratio": 0.0, "completion_length": 527.403482913971, "epoch": 2.21224343461504, "grad_norm": 0.041989997029304504, "kl": 0.008183956146240234, "learning_rate": 5.6648959869514965e-08, "loss": 0.0075, "num_tokens": 459321180.0, "reward": 0.07617187898722477, "reward_std": 0.0817908609751612, "rewards/pure_accuracy_reward_math": 0.07617187630967237, "step": 1481 }, { "clip_ratio": 0.0003129412224893713, "epoch": 2.2141555376295896, "grad_norm": 0.04108978435397148, "kl": 0.00823974609375, "learning_rate": 5.532879354385234e-08, "loss": 0.0075, "step": 1482 }, { "clip_ratio": 0.0003202799926498301, "epoch": 2.2160676406441397, "grad_norm": 0.03990933671593666, "kl": 0.00827646255493164, "learning_rate": 5.4024019794565176e-08, "loss": 0.0075, "step": 1483 }, { "clip_ratio": 0.0003925440155398974, "epoch": 2.2179797436586894, "grad_norm": 0.039193831384181976, "kl": 0.008234977722167969, "learning_rate": 5.273464683700352e-08, "loss": 0.0074, "step": 1484 }, { "clip_ratio": 0.0004001183214654702, "epoch": 2.2198918466732396, "grad_norm": 0.039878588169813156, "kl": 0.00826406478881836, "learning_rate": 5.1460682789547526e-08, "loss": 0.0073, "step": 1485 }, { "clip_ratio": 0.0, "completion_length": 531.470449924469, "epoch": 2.2218039496877893, "grad_norm": 0.04079683497548103, "kl": 0.011513710021972656, "learning_rate": 5.020213567355825e-08, "loss": 0.0091, "num_tokens": 462957626.0, "reward": 0.06752232459257357, "reward_std": 0.07320140459341928, "rewards/pure_accuracy_reward_math": 0.0675223229045514, "step": 1486 }, { "clip_ratio": 0.0002717390548241383, "epoch": 2.2237160527023394, "grad_norm": 0.037311483174562454, "kl": 0.011410713195800781, "learning_rate": 4.8959013413324705e-08, "loss": 0.009, "step": 1487 }, { "clip_ratio": 0.0002951391629721911, "epoch": 2.225628155716889, "grad_norm": 0.035728756338357925, "kl": 0.011387348175048828, "learning_rate": 4.773132383601664e-08, "loss": 0.009, "step": 1488 }, { "clip_ratio": 0.00030970129540719427, "epoch": 2.2275402587314392, "grad_norm": 0.03630708530545235, "kl": 0.011130332946777344, "learning_rate": 4.6519074671631805e-08, "loss": 0.0089, "step": 1489 }, { "clip_ratio": 0.00035198272149727927, "epoch": 2.229452361745989, "grad_norm": 0.035501569509506226, "kl": 0.010982990264892578, "learning_rate": 4.5322273552951265e-08, "loss": 0.0088, "step": 1490 }, { "clip_ratio": 0.0, "completion_length": 516.0912661552429, "epoch": 2.231364464760539, "grad_norm": 0.039065275341272354, "kl": 0.008381366729736328, "learning_rate": 4.4140928015488085e-08, "loss": 0.0067, "num_tokens": 466540145.0, "reward": 0.08007812951109372, "reward_std": 0.07346039032563567, "rewards/pure_accuracy_reward_math": 0.08007812619325705, "step": 1491 }, { "clip_ratio": 0.0002747246091985289, "epoch": 2.2332765677750888, "grad_norm": 0.03766880929470062, "kl": 0.008387088775634766, "learning_rate": 4.297504549744119e-08, "loss": 0.0067, "step": 1492 }, { "clip_ratio": 0.0002486348788579562, "epoch": 2.235188670789639, "grad_norm": 0.03599947690963745, "kl": 0.0084991455078125, "learning_rate": 4.182463333964909e-08, "loss": 0.0066, "step": 1493 }, { "clip_ratio": 0.0002674886795261955, "epoch": 2.2371007738041886, "grad_norm": 0.0361332893371582, "kl": 0.008679389953613281, "learning_rate": 4.068969878554263e-08, "loss": 0.0066, "step": 1494 }, { "clip_ratio": 0.00031218544620514876, "epoch": 2.2390128768187387, "grad_norm": 0.035462211817502975, "kl": 0.008719921112060547, "learning_rate": 3.957024898110007e-08, "loss": 0.0065, "step": 1495 }, { "clip_ratio": 0.0, "completion_length": 507.05945777893066, "epoch": 2.2409249798332884, "grad_norm": 0.10880274325609207, "kl": 0.012134075164794922, "learning_rate": 3.846629097480126e-08, "loss": 0.0046, "num_tokens": 470091662.0, "reward": 0.07952009330620058, "reward_std": 0.08660046098520979, "rewards/pure_accuracy_reward_math": 0.0795200907450635, "step": 1496 }, { "clip_ratio": 0.00034633993402621854, "epoch": 2.2428370828478386, "grad_norm": 0.04444468766450882, "kl": 0.010071754455566406, "learning_rate": 3.737783171758408e-08, "loss": 0.0045, "step": 1497 }, { "clip_ratio": 0.00040814166391101026, "epoch": 2.2447491858623883, "grad_norm": 0.050679393112659454, "kl": 0.009745597839355469, "learning_rate": 3.630487806280086e-08, "loss": 0.0044, "step": 1498 }, { "clip_ratio": 0.00040935890626769833, "epoch": 2.2466612888769384, "grad_norm": 0.04249563813209534, "kl": 0.009531974792480469, "learning_rate": 3.524743676617426e-08, "loss": 0.0044, "step": 1499 }, { "clip_ratio": 0.00041069585563491273, "epoch": 2.248573391891488, "grad_norm": 0.04013880342245102, "kl": 0.009422779083251953, "learning_rate": 3.42055144857556e-08, "loss": 0.0042, "step": 1500 }, { "clip_ratio": 0.0, "completion_length": 530.4908156394958, "epoch": 2.250485494906038, "grad_norm": 0.04119328781962395, "kl": 0.00858306884765625, "learning_rate": 3.3179117781882154e-08, "loss": 0.0064, "num_tokens": 473729421.0, "reward": 0.08175223629223183, "reward_std": 0.080375739664305, "rewards/pure_accuracy_reward_math": 0.08175223390571773, "step": 1501 }, { "clip_ratio": 0.00027040669908728887, "epoch": 2.252397597920588, "grad_norm": 0.03726639971137047, "kl": 0.008556365966796875, "learning_rate": 3.216825311713689e-08, "loss": 0.0064, "step": 1502 }, { "clip_ratio": 0.0003022322244419229, "epoch": 2.254309700935138, "grad_norm": 0.03740008547902107, "kl": 0.008624553680419922, "learning_rate": 3.11729268563063e-08, "loss": 0.0063, "step": 1503 }, { "clip_ratio": 0.0002972338604081415, "epoch": 2.2562218039496877, "grad_norm": 0.036019936203956604, "kl": 0.008683204650878906, "learning_rate": 3.019314526634232e-08, "loss": 0.0062, "step": 1504 }, { "clip_ratio": 0.0003317092545103151, "epoch": 2.258133906964238, "grad_norm": 0.035242002457380295, "kl": 0.008699893951416016, "learning_rate": 2.922891451632076e-08, "loss": 0.0062, "step": 1505 }, { "clip_ratio": 0.0, "completion_length": 516.8340096473694, "epoch": 2.2600460099787876, "grad_norm": 0.04786042869091034, "kl": 0.0166015625, "learning_rate": 2.8280240677403813e-08, "loss": 0.0117, "num_tokens": 477311002.0, "reward": 0.08593750389991328, "reward_std": 0.09509739134227857, "rewards/pure_accuracy_reward_math": 0.08593750139698386, "step": 1506 }, { "clip_ratio": 0.0003771551589011324, "epoch": 2.2619581129933373, "grad_norm": 0.04542854428291321, "kl": 0.016517162322998047, "learning_rate": 2.7347129722801736e-08, "loss": 0.0117, "step": 1507 }, { "clip_ratio": 0.00043879733209450933, "epoch": 2.2638702160078874, "grad_norm": 0.04336082562804222, "kl": 0.016106605529785156, "learning_rate": 2.6429587527734835e-08, "loss": 0.0116, "step": 1508 }, { "clip_ratio": 0.0005006881825977416, "epoch": 2.2657823190224375, "grad_norm": 0.04397574067115784, "kl": 0.015746116638183594, "learning_rate": 2.5527619869396003e-08, "loss": 0.0115, "step": 1509 }, { "clip_ratio": 0.0005348546662844456, "epoch": 2.2676944220369872, "grad_norm": 0.043936342000961304, "kl": 0.015500068664550781, "learning_rate": 2.464123242691574e-08, "loss": 0.0114, "step": 1510 }, { "clip_ratio": 0.0, "completion_length": 526.8474016189575, "epoch": 2.269606525051537, "grad_norm": 0.04165401682257652, "kl": 0.008256912231445312, "learning_rate": 2.377043078132496e-08, "loss": 0.0079, "num_tokens": 480935151.0, "reward": 0.08342634345171973, "reward_std": 0.09024772583507001, "rewards/pure_accuracy_reward_math": 0.08342634071595967, "step": 1511 }, { "clip_ratio": 0.0003286536882569635, "epoch": 2.271518628066087, "grad_norm": 0.04013460502028465, "kl": 0.008354663848876953, "learning_rate": 2.291522041552141e-08, "loss": 0.0079, "step": 1512 }, { "clip_ratio": 0.00034448601985559435, "epoch": 2.273430731080637, "grad_norm": 0.03929148614406586, "kl": 0.008509159088134766, "learning_rate": 2.207560671423331e-08, "loss": 0.0078, "step": 1513 }, { "clip_ratio": 0.00038580430322099346, "epoch": 2.275342834095187, "grad_norm": 0.04108521342277527, "kl": 0.008730888366699219, "learning_rate": 2.1251594963986876e-08, "loss": 0.0077, "step": 1514 }, { "clip_ratio": 0.00038072799372912414, "epoch": 2.2772549371097366, "grad_norm": 0.038887783885002136, "kl": 0.008725643157958984, "learning_rate": 2.0443190353072185e-08, "loss": 0.0076, "step": 1515 }, { "clip_ratio": 0.0, "completion_length": 519.4051609039307, "epoch": 2.2791670401242867, "grad_norm": 0.03783741220831871, "kl": 0.008581161499023438, "learning_rate": 1.9650397971510972e-08, "loss": 0.0064, "num_tokens": 484530587.0, "reward": 0.08231027124566026, "reward_std": 0.08037574036279693, "rewards/pure_accuracy_reward_math": 0.08231026897556148, "step": 1516 }, { "clip_ratio": 0.0002746778108644321, "epoch": 2.2810791431388364, "grad_norm": 0.03765445947647095, "kl": 0.008580207824707031, "learning_rate": 1.8873222811024717e-08, "loss": 0.0063, "step": 1517 }, { "clip_ratio": 0.00031986788579274616, "epoch": 2.2829912461533866, "grad_norm": 0.03684096038341522, "kl": 0.008593082427978516, "learning_rate": 1.8111669765003005e-08, "loss": 0.0063, "step": 1518 }, { "clip_ratio": 0.0003354349921380617, "epoch": 2.2849033491679362, "grad_norm": 0.03599463030695915, "kl": 0.008591175079345703, "learning_rate": 1.73657436284716e-08, "loss": 0.0062, "step": 1519 }, { "clip_ratio": 0.0003505910435706028, "epoch": 2.2868154521824864, "grad_norm": 0.035750966519117355, "kl": 0.00874948501586914, "learning_rate": 1.6635449098064972e-08, "loss": 0.0061, "step": 1520 }, { "clip_ratio": 0.0, "completion_length": 521.2455606460571, "epoch": 2.288727555197036, "grad_norm": 0.03890154883265495, "kl": 0.008922100067138672, "learning_rate": 1.5920790771993822e-08, "loss": 0.0078, "num_tokens": 488136255.0, "reward": 0.07952009289874695, "reward_std": 0.07556614064378664, "rewards/pure_accuracy_reward_math": 0.07952009068685584, "step": 1521 }, { "clip_ratio": 0.00024827225587387147, "epoch": 2.290639658211586, "grad_norm": 0.037810854613780975, "kl": 0.008934974670410156, "learning_rate": 1.5221773150017882e-08, "loss": 0.0078, "step": 1522 }, { "clip_ratio": 0.0002384709360967463, "epoch": 2.292551761226136, "grad_norm": 0.0364384800195694, "kl": 0.008936882019042969, "learning_rate": 1.4538400633417049e-08, "loss": 0.0077, "step": 1523 }, { "clip_ratio": 0.0002599185108635993, "epoch": 2.294463864240686, "grad_norm": 0.035106074064970016, "kl": 0.008829116821289062, "learning_rate": 1.387067752496335e-08, "loss": 0.0076, "step": 1524 }, { "clip_ratio": 0.0003290796867077006, "epoch": 2.2963759672552357, "grad_norm": 0.03489363566040993, "kl": 0.0086822509765625, "learning_rate": 1.3218608028895131e-08, "loss": 0.0076, "step": 1525 }, { "clip_ratio": 0.0, "completion_length": 517.0547122955322, "epoch": 2.298288070269786, "grad_norm": 0.040062014013528824, "kl": 0.008834362030029297, "learning_rate": 1.2582196250888745e-08, "loss": 0.0071, "num_tokens": 491722139.0, "reward": 0.08621652179863304, "reward_std": 0.08020308247068897, "rewards/pure_accuracy_reward_math": 0.08621651906287298, "step": 1526 }, { "clip_ratio": 0.00031514769625573535, "epoch": 2.3002001732843356, "grad_norm": 0.03938477113842964, "kl": 0.008733272552490234, "learning_rate": 1.1961446198033855e-08, "loss": 0.0071, "step": 1527 }, { "clip_ratio": 0.00030386562087869606, "epoch": 2.3021122762988857, "grad_norm": 0.03844742849469185, "kl": 0.008654594421386719, "learning_rate": 1.1356361778808167e-08, "loss": 0.007, "step": 1528 }, { "clip_ratio": 0.00034510965764411594, "epoch": 2.3040243793134354, "grad_norm": 0.03755528852343559, "kl": 0.00861358642578125, "learning_rate": 1.076694680305218e-08, "loss": 0.007, "step": 1529 }, { "clip_ratio": 0.00035207756366162357, "epoch": 2.3059364823279855, "grad_norm": 0.03696778416633606, "kl": 0.008616447448730469, "learning_rate": 1.0193204981946426e-08, "loss": 0.0069, "step": 1530 }, { "clip_ratio": 0.0, "completion_length": 516.7249145507812, "epoch": 2.3078485853425352, "grad_norm": 0.045076508074998856, "kl": 0.014521598815917969, "learning_rate": 9.63513992798676e-09, "loss": 0.0065, "num_tokens": 495305537.0, "reward": 0.07505580713041127, "reward_std": 0.07844264624873176, "rewards/pure_accuracy_reward_math": 0.07505580480210483, "step": 1531 }, { "clip_ratio": 0.0003054732096074986, "epoch": 2.3097606883570854, "grad_norm": 0.041828691959381104, "kl": 0.01419973373413086, "learning_rate": 9.092755154961886e-09, "loss": 0.0065, "step": 1532 }, { "clip_ratio": 0.00030572324658351135, "epoch": 2.311672791371635, "grad_norm": 0.03949357569217682, "kl": 0.013697624206542969, "learning_rate": 8.566054077932262e-09, "loss": 0.0064, "step": 1533 }, { "clip_ratio": 0.0003279060996987937, "epoch": 2.313584894386185, "grad_norm": 0.038545649498701096, "kl": 0.01345968246459961, "learning_rate": 8.055040013207061e-09, "loss": 0.0063, "step": 1534 }, { "clip_ratio": 0.00033917763732915773, "epoch": 2.315496997400735, "grad_norm": 0.03716408833861351, "kl": 0.01330709457397461, "learning_rate": 7.559716178325016e-09, "loss": 0.0062, "step": 1535 }, { "clip_ratio": 0.0, "completion_length": 519.2921552658081, "epoch": 2.317409100415285, "grad_norm": 0.041162386536598206, "kl": 0.008297443389892578, "learning_rate": 7.080085692032224e-09, "loss": 0.0079, "num_tokens": 498900584.0, "reward": 0.08928571816068143, "reward_std": 0.08428199036279693, "rewards/pure_accuracy_reward_math": 0.08928571571595967, "step": 1536 }, { "clip_ratio": 0.00029752771973790004, "epoch": 2.3193212034298347, "grad_norm": 0.03933210298418999, "kl": 0.008346080780029297, "learning_rate": 6.616151574264374e-09, "loss": 0.0079, "step": 1537 }, { "clip_ratio": 0.0003302163729017593, "epoch": 2.321233306444385, "grad_norm": 0.038146842271089554, "kl": 0.008320331573486328, "learning_rate": 6.1679167461262124e-09, "loss": 0.0078, "step": 1538 }, { "clip_ratio": 0.0003326926421891585, "epoch": 2.3231454094589346, "grad_norm": 0.038072116672992706, "kl": 0.008330345153808594, "learning_rate": 5.735384029874336e-09, "loss": 0.0077, "step": 1539 }, { "clip_ratio": 0.00038002995881925017, "epoch": 2.3250575124734847, "grad_norm": 0.037320397794246674, "kl": 0.008296012878417969, "learning_rate": 5.31855614889859e-09, "loss": 0.0076, "step": 1540 }, { "clip_ratio": 0.0, "completion_length": 520.1487407684326, "epoch": 2.3269696154880344, "grad_norm": 0.03688493371009827, "kl": 0.008476734161376953, "learning_rate": 4.917435727704867e-09, "loss": 0.0024, "num_tokens": 502500281.0, "reward": 0.0811942005821038, "reward_std": 0.0787416979437694, "rewards/pure_accuracy_reward_math": 0.08119419842842035, "step": 1541 }, { "clip_ratio": 0.00028201957394458077, "epoch": 2.3288817185025845, "grad_norm": 0.03607385605573654, "kl": 0.008441448211669922, "learning_rate": 4.53202529190011e-09, "loss": 0.0023, "step": 1542 }, { "clip_ratio": 0.0002742231245633775, "epoch": 2.330793821517134, "grad_norm": 0.03572804853320122, "kl": 0.00852060317993164, "learning_rate": 4.162327268173727e-09, "loss": 0.0023, "step": 1543 }, { "clip_ratio": 0.0003046261713848253, "epoch": 2.332705924531684, "grad_norm": 0.034965962171554565, "kl": 0.00861501693725586, "learning_rate": 3.80834398428509e-09, "loss": 0.0022, "step": 1544 }, { "clip_ratio": 0.0003226917802976459, "epoch": 2.334618027546234, "grad_norm": 0.034803807735443115, "kl": 0.008724212646484375, "learning_rate": 3.470077669046612e-09, "loss": 0.0021, "step": 1545 }, { "clip_ratio": 0.0, "completion_length": 538.0273699760437, "epoch": 2.336530130560784, "grad_norm": 0.034996818751096725, "kl": 0.008575439453125, "learning_rate": 3.147530452311809e-09, "loss": 0.0064, "num_tokens": 506159719.0, "reward": 0.06891741408617236, "reward_std": 0.07063014718005434, "rewards/pure_accuracy_reward_math": 0.06891741210711189, "step": 1546 }, { "clip_ratio": 0.00023073077210256088, "epoch": 2.338442233575334, "grad_norm": 0.03347066789865494, "kl": 0.008565902709960938, "learning_rate": 2.8407043649597567e-09, "loss": 0.0063, "step": 1547 }, { "clip_ratio": 0.000268154504112772, "epoch": 2.3403543365898836, "grad_norm": 0.03273630142211914, "kl": 0.008545398712158203, "learning_rate": 2.549601338883989e-09, "loss": 0.0063, "step": 1548 }, { "clip_ratio": 0.00029292683666426456, "epoch": 2.3422664396044337, "grad_norm": 0.032376162707805634, "kl": 0.008570671081542969, "learning_rate": 2.2742232069794533e-09, "loss": 0.0063, "step": 1549 }, { "clip_ratio": 0.0003443536306235728, "epoch": 2.344178542618984, "grad_norm": 0.031950000673532486, "kl": 0.008484363555908203, "learning_rate": 2.01457170313113e-09, "loss": 0.0062, "step": 1550 }, { "clip_ratio": 0.0, "completion_length": 520.7207255363464, "epoch": 2.3460906456335335, "grad_norm": 0.04171088710427284, "kl": 0.009114742279052734, "learning_rate": 1.7706484622034837e-09, "loss": 0.005, "num_tokens": 509757966.0, "reward": 0.07672991443541832, "reward_std": 0.08149181143380702, "rewards/pure_accuracy_reward_math": 0.07672991228173487, "step": 1551 }, { "clip_ratio": 0.0003305982788788242, "epoch": 2.3480027486480832, "grad_norm": 0.04123101010918617, "kl": 0.009046554565429688, "learning_rate": 1.5424550200293653e-09, "loss": 0.005, "step": 1552 }, { "clip_ratio": 0.0003486324259256435, "epoch": 2.3499148516626334, "grad_norm": 0.039809513837099075, "kl": 0.008966445922851562, "learning_rate": 1.3299928134014039e-09, "loss": 0.0049, "step": 1553 }, { "clip_ratio": 0.0003954665013452541, "epoch": 2.351826954677183, "grad_norm": 0.0393875353038311, "kl": 0.008915901184082031, "learning_rate": 1.1332631800620164e-09, "loss": 0.0049, "step": 1554 }, { "clip_ratio": 0.0004334128346954458, "epoch": 2.353739057691733, "grad_norm": 0.03990260884165764, "kl": 0.008862972259521484, "learning_rate": 9.522673586956355e-10, "loss": 0.0047, "step": 1555 }, { "clip_ratio": 0.0, "completion_length": 518.488025188446, "epoch": 2.355651160706283, "grad_norm": 0.04300679266452789, "kl": 0.009171009063720703, "learning_rate": 7.870064889206608e-10, "loss": 0.0082, "num_tokens": 513350767.0, "reward": 0.07728794994181953, "reward_std": 0.08290693227900192, "rewards/pure_accuracy_reward_math": 0.07728794743889011, "step": 1556 }, { "clip_ratio": 0.000295089724772879, "epoch": 2.357563263720833, "grad_norm": 0.04144243150949478, "kl": 0.009136676788330078, "learning_rate": 6.374816112819648e-10, "loss": 0.0082, "step": 1557 }, { "clip_ratio": 0.0003283331608940898, "epoch": 2.3594753667353827, "grad_norm": 0.039357006549835205, "kl": 0.009202003479003906, "learning_rate": 5.036936672447868e-10, "loss": 0.0081, "step": 1558 }, { "clip_ratio": 0.00036647373104869985, "epoch": 2.361387469749933, "grad_norm": 0.03904441371560097, "kl": 0.009307384490966797, "learning_rate": 3.8564349918890356e-10, "loss": 0.008, "step": 1559 }, { "clip_ratio": 0.0004084905730792343, "epoch": 2.3632995727644825, "grad_norm": 0.03901646286249161, "kl": 0.00932168960571289, "learning_rate": 2.833318504030791e-10, "loss": 0.0079, "step": 1560 }, { "clip_ratio": 0.0, "completion_length": 527.474356174469, "epoch": 2.3652116757790327, "grad_norm": 5.391517162322998, "kl": 0.0942845344543457, "learning_rate": 1.9675936507979056e-10, "loss": 0.0081, "num_tokens": 516974751.0, "reward": 0.06975446754950099, "reward_std": 0.06989945453824475, "rewards/pure_accuracy_reward_math": 0.06975446597789414, "step": 1561 }, { "clip_ratio": 0.0002886794856635788, "epoch": 2.3671237787935824, "grad_norm": 0.1764528900384903, "kl": 0.013553619384765625, "learning_rate": 1.2592658831245274e-10, "loss": 0.0049, "step": 1562 }, { "clip_ratio": 0.00028670978349509824, "epoch": 2.3690358818081325, "grad_norm": 0.03846847265958786, "kl": 0.009183406829833984, "learning_rate": 7.083396609097737e-11, "loss": 0.0047, "step": 1563 }, { "clip_ratio": 0.0002776476591748178, "epoch": 2.370947984822682, "grad_norm": 0.035545963793992996, "kl": 0.008979320526123047, "learning_rate": 3.148184529927489e-11, "loss": 0.0046, "step": 1564 }, { "clip_ratio": 0.00032522391097700165, "epoch": 2.3728600878372323, "grad_norm": 0.1538141518831253, "kl": 0.009156227111816406, "learning_rate": 7.870473713589288e-12, "loss": 0.0046, "step": 1565 }, { "clip_ratio": 0.0, "completion_length": 530.6135845184326, "epoch": 2.374772190851782, "grad_norm": 0.0368269719183445, "kl": 0.008574485778808594, "learning_rate": 0.0, "loss": 0.0087, "num_tokens": 520611370.0, "reward": 0.07142857427243143, "reward_std": 0.07900068280287087, "rewards/pure_accuracy_reward_math": 0.07142857293365523, "step": 1566 }, { "epoch": 2.374772190851782, "step": 1566, "total_flos": 0.0, "train_loss": 0.003398028112404372, "train_runtime": 273585.6306, "train_samples_per_second": 1.028, "train_steps_per_second": 0.006 } ], "logging_steps": 1, "max_steps": 1566, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }