{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5714285714285714, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_fraction": 0.0, "completion_length": 2253.854206085205, "epoch": 0.001142857142857143, "grad_norm": 0.029786353930830956, "kl": 0.0, "lambda_div_used": 0.6170438528060913, "learning_rate": 0.0, "loss": -0.0476, "reward": 0.09989889524877071, "reward_after_mean": 0.09989889524877071, "reward_after_std": 0.6247774921357632, "reward_before_mean": 0.5353203006088734, "reward_before_std": 0.5411310354247689, "reward_change_max": 0.0, "reward_change_mean": -0.4354214407503605, "reward_change_min": -0.6615581586956978, "reward_change_std": 0.2600514395162463, "reward_std": 0.6247775163501501, "rewards/accuracy_reward": 0.37500000931322575, "rewards/cosine_scaled_reward": 0.16032031644135714, "step": 1 }, { "clip_fraction": 0.0, "completion_length": 2566.395854949951, "epoch": 0.002285714285714286, "grad_norm": 0.025455903261899948, "kl": 0.0, "lambda_div_used": 0.6156510338187218, "learning_rate": 5e-08, "loss": 0.0349, "reward": 0.10292071849107742, "reward_after_mean": 0.10292071849107742, "reward_after_std": 0.598213616758585, "reward_before_mean": 0.5439198296517134, "reward_before_std": 0.5335724893957376, "reward_change_max": 0.0, "reward_change_mean": -0.440999086946249, "reward_change_min": -0.6536596268415451, "reward_change_std": 0.2629614323377609, "reward_std": 0.5982136316597462, "rewards/accuracy_reward": 0.41666667722165585, "rewards/cosine_scaled_reward": 0.12725313939154148, "step": 2 }, { "clip_fraction": 0.0, "completion_length": 2808.2083740234375, "epoch": 0.0034285714285714284, "grad_norm": 0.026795223355293274, "kl": 0.00017423927783966064, "lambda_div_used": 0.599299855530262, "learning_rate": 1e-07, "loss": -0.011, "reward": -0.23149854317307472, "reward_after_mean": -0.23149854317307472, "reward_after_std": 0.4924859032034874, "reward_before_mean": 0.05049763061106205, "reward_before_std": 0.4575129607692361, "reward_change_max": 0.0, "reward_change_mean": -0.28199618123471737, "reward_change_min": -0.46816934645175934, "reward_change_std": 0.17736693751066923, "reward_std": 0.49248590879142284, "rewards/accuracy_reward": 0.14583333767950535, "rewards/cosine_scaled_reward": -0.09533570008352399, "step": 3 }, { "clip_fraction": 0.0, "completion_length": 1537.4583435058594, "epoch": 0.004571428571428572, "grad_norm": 0.027395043522119522, "kl": 8.840858936309814e-05, "lambda_div_used": 0.6135994121432304, "learning_rate": 1.5e-07, "loss": 0.0364, "reward": 0.19573484233114868, "reward_after_mean": 0.19573484233114868, "reward_after_std": 0.5626182612031698, "reward_before_mean": 0.6533232685178518, "reward_before_std": 0.5161786610260606, "reward_change_max": 0.0, "reward_change_mean": -0.4575884137302637, "reward_change_min": -0.662933848798275, "reward_change_std": 0.26597225945442915, "reward_std": 0.5626182779669762, "rewards/accuracy_reward": 0.45833334885537624, "rewards/cosine_scaled_reward": 0.19498991407454014, "step": 4 }, { "clip_fraction": 0.0, "completion_length": 2930.541748046875, "epoch": 0.005714285714285714, "grad_norm": 0.021351713687181473, "kl": 0.00014585256576538086, "lambda_div_used": 0.618914432823658, "learning_rate": 2e-07, "loss": 0.0298, "reward": -0.07733920076861978, "reward_after_mean": -0.07733920076861978, "reward_after_std": 0.6453567277640104, "reward_before_mean": 0.276840849313885, "reward_before_std": 0.5498588550835848, "reward_change_max": 0.0, "reward_change_mean": -0.3541800267994404, "reward_change_min": -0.5702032893896103, "reward_change_std": 0.21250940579921007, "reward_std": 0.6453567445278168, "rewards/accuracy_reward": 0.2708333358168602, "rewards/cosine_scaled_reward": 0.0060075074434280396, "step": 5 }, { "clip_fraction": 0.0, "completion_length": 2457.187515258789, "epoch": 0.006857142857142857, "grad_norm": 0.03508686274290085, "kl": 0.00010536611080169678, "lambda_div_used": 0.6270971074700356, "learning_rate": 2.5e-07, "loss": -0.0326, "reward": -0.08803003467619419, "reward_after_mean": -0.08803003467619419, "reward_after_std": 0.6014832425862551, "reward_before_mean": 0.21033997228369117, "reward_before_std": 0.5929712019860744, "reward_change_max": 0.0, "reward_change_mean": -0.29837000742554665, "reward_change_min": -0.5160593837499619, "reward_change_std": 0.2064626282081008, "reward_std": 0.601483253762126, "rewards/accuracy_reward": 0.22916666977107525, "rewards/cosine_scaled_reward": -0.01882670260965824, "step": 6 }, { "clip_fraction": 0.0, "completion_length": 2581.604263305664, "epoch": 0.008, "grad_norm": 0.02246660739183426, "kl": 0.00012472271919250488, "lambda_div_used": 0.6219364404678345, "learning_rate": 3e-07, "loss": -0.0131, "reward": -0.06724199093878269, "reward_after_mean": -0.06724199093878269, "reward_after_std": 0.5799425262957811, "reward_before_mean": 0.2561297030188143, "reward_before_std": 0.5679045412689447, "reward_change_max": 0.0, "reward_change_mean": -0.3233717121183872, "reward_change_min": -0.5483178608119488, "reward_change_std": 0.213481605052948, "reward_std": 0.5799425337463617, "rewards/accuracy_reward": 0.27083334140479565, "rewards/cosine_scaled_reward": -0.014703631401062012, "step": 7 }, { "clip_fraction": 0.0, "completion_length": 1865.0625228881836, "epoch": 0.009142857142857144, "grad_norm": 0.0264554712921381, "kl": 7.349252700805664e-05, "lambda_div_used": 0.6559017673134804, "learning_rate": 3.5e-07, "loss": 0.0012, "reward": 0.30751039180904627, "reward_after_mean": 0.30751039180904627, "reward_after_std": 0.768000740557909, "reward_before_mean": 0.7620646432042122, "reward_before_std": 0.7257685504155234, "reward_change_max": 0.0, "reward_change_mean": -0.4545542187988758, "reward_change_min": -0.6947779208421707, "reward_change_std": 0.28947674110531807, "reward_std": 0.7680007480084896, "rewards/accuracy_reward": 0.5208333469927311, "rewards/cosine_scaled_reward": 0.24123129644431174, "step": 8 }, { "clip_fraction": 0.0, "completion_length": 2611.187515258789, "epoch": 0.010285714285714285, "grad_norm": 0.02845916524529457, "kl": 0.00014799833297729492, "lambda_div_used": 0.6612376719713211, "learning_rate": 4e-07, "loss": -0.0083, "reward": 0.022774726152420044, "reward_after_mean": 0.022774726152420044, "reward_after_std": 0.7959331478923559, "reward_before_mean": 0.3162382678128779, "reward_before_std": 0.7569107804447412, "reward_change_max": 0.0, "reward_change_mean": -0.29346355609595776, "reward_change_min": -0.4966486766934395, "reward_change_std": 0.20045297034084797, "reward_std": 0.7959331627935171, "rewards/accuracy_reward": 0.25000000186264515, "rewards/cosine_scaled_reward": 0.06623826455324888, "step": 9 }, { "clip_fraction": 0.0, "completion_length": 2326.8541870117188, "epoch": 0.011428571428571429, "grad_norm": 0.02597963623702526, "kl": 0.0001014266163110733, "lambda_div_used": 0.6026739403605461, "learning_rate": 4.5e-07, "loss": 0.012, "reward": 0.08584612235426903, "reward_after_mean": 0.08584612235426903, "reward_after_std": 0.5247625019401312, "reward_before_mean": 0.5338415652513504, "reward_before_std": 0.4742008354514837, "reward_change_max": 0.0, "reward_change_mean": -0.4479954708367586, "reward_change_min": -0.6653710156679153, "reward_change_std": 0.2726733274757862, "reward_std": 0.5247625187039375, "rewards/accuracy_reward": 0.3958333432674408, "rewards/cosine_scaled_reward": 0.13800824619829655, "step": 10 }, { "clip_fraction": 0.0, "completion_length": 3257.875045776367, "epoch": 0.012571428571428572, "grad_norm": 0.01903906650841236, "kl": 0.00015848875045776367, "lambda_div_used": 0.6304982751607895, "learning_rate": 5e-07, "loss": 0.037, "reward": -0.19499589689075947, "reward_after_mean": -0.19499589689075947, "reward_after_std": 0.6491430383175611, "reward_before_mean": 0.052283127792179585, "reward_before_std": 0.5964901968836784, "reward_change_max": 0.0, "reward_change_mean": -0.24727902933955193, "reward_change_min": -0.40677722357213497, "reward_change_std": 0.14844622276723385, "reward_std": 0.649143049493432, "rewards/accuracy_reward": 0.1666666716337204, "rewards/cosine_scaled_reward": -0.11438353173434734, "step": 11 }, { "clip_fraction": 0.0, "completion_length": 1922.6458587646484, "epoch": 0.013714285714285714, "grad_norm": 0.027084212750196457, "kl": 0.00012683868408203125, "lambda_div_used": 0.6053123474121094, "learning_rate": 5.5e-07, "loss": 0.0637, "reward": -0.10656415252014995, "reward_after_mean": -0.10656415252014995, "reward_after_std": 0.5768749956041574, "reward_before_mean": 0.25147354789078236, "reward_before_std": 0.4855317808687687, "reward_change_max": 0.0, "reward_change_mean": -0.35803768038749695, "reward_change_min": -0.5767498537898064, "reward_change_std": 0.21485394705086946, "reward_std": 0.5768750291317701, "rewards/accuracy_reward": 0.25000000186264515, "rewards/cosine_scaled_reward": 0.0014735234435647726, "step": 12 }, { "clip_fraction": 0.0, "completion_length": 2585.979217529297, "epoch": 0.014857142857142857, "grad_norm": 0.02171475626528263, "kl": 0.00012777745723724365, "lambda_div_used": 0.5844379514455795, "learning_rate": 6e-07, "loss": 0.0567, "reward": -0.11025732010602951, "reward_after_mean": -0.11025732010602951, "reward_after_std": 0.4723772555589676, "reward_before_mean": 0.2753620855510235, "reward_before_std": 0.39071971736848354, "reward_change_max": 0.0, "reward_change_mean": -0.3856194168329239, "reward_change_min": -0.5897834450006485, "reward_change_std": 0.2299499223008752, "reward_std": 0.4723772667348385, "rewards/accuracy_reward": 0.2708333358168602, "rewards/cosine_scaled_reward": 0.004528738558292389, "step": 13 }, { "clip_fraction": 0.0, "completion_length": 2412.1250534057617, "epoch": 0.016, "grad_norm": 0.024103164672851562, "kl": 0.00015282630920410156, "lambda_div_used": 0.6067279502749443, "learning_rate": 6.5e-07, "loss": -0.0015, "reward": -0.08361193258315325, "reward_after_mean": -0.08361193258315325, "reward_after_std": 0.5700989812612534, "reward_before_mean": 0.27934680134058, "reward_before_std": 0.49509103409945965, "reward_change_max": 0.0, "reward_change_mean": -0.36295875161886215, "reward_change_min": -0.6321466080844402, "reward_change_std": 0.2297183210030198, "reward_std": 0.570098988711834, "rewards/accuracy_reward": 0.2708333358168602, "rewards/cosine_scaled_reward": 0.0085134650580585, "step": 14 }, { "clip_fraction": 0.0, "completion_length": 2612.5833778381348, "epoch": 0.017142857142857144, "grad_norm": 0.02622115984559059, "kl": 0.00012434273958206177, "lambda_div_used": 0.5387627929449081, "learning_rate": 7e-07, "loss": -0.0354, "reward": -0.03696875274181366, "reward_after_mean": -0.03696875274181366, "reward_after_std": 0.3815008979290724, "reward_before_mean": 0.5313108433037996, "reward_before_std": 0.17025252804160118, "reward_change_max": 0.0, "reward_change_mean": -0.5682796090841293, "reward_change_min": -0.7619944997131824, "reward_change_std": 0.2890887148678303, "reward_std": 0.3815009109675884, "rewards/accuracy_reward": 0.375, "rewards/cosine_scaled_reward": 0.15631086938083172, "step": 15 }, { "clip_fraction": 0.0, "completion_length": 3487.4583435058594, "epoch": 0.018285714285714287, "grad_norm": 0.018314050510525703, "kl": 0.00019049644470214844, "lambda_div_used": 0.5788475871086121, "learning_rate": 7.5e-07, "loss": 0.0111, "reward": -0.1648220755159855, "reward_after_mean": -0.1648220755159855, "reward_after_std": 0.3886314034461975, "reward_before_mean": 0.18453767150640488, "reward_before_std": 0.3589506670832634, "reward_change_max": 0.0, "reward_change_mean": -0.34935975447297096, "reward_change_min": -0.5136087462306023, "reward_change_std": 0.2073321659117937, "reward_std": 0.38863140903413296, "rewards/accuracy_reward": 0.2083333432674408, "rewards/cosine_scaled_reward": -0.023795653134584427, "step": 16 }, { "clip_fraction": 0.0, "completion_length": 1923.895851135254, "epoch": 0.019428571428571427, "grad_norm": 0.03498728573322296, "kl": 0.00010513514280319214, "lambda_div_used": 0.6301147192716599, "learning_rate": 8e-07, "loss": 0.0516, "reward": 0.24809654615819454, "reward_after_mean": 0.24809654615819454, "reward_after_std": 0.7150995936244726, "reward_before_mean": 0.749116275459528, "reward_before_std": 0.5986730419099331, "reward_change_max": 0.0, "reward_change_mean": -0.5010197218507528, "reward_change_min": -0.7233205642551184, "reward_change_std": 0.2854560799896717, "reward_std": 0.7150996085256338, "rewards/accuracy_reward": 0.5208333432674408, "rewards/cosine_scaled_reward": 0.2282829141477123, "step": 17 }, { "clip_fraction": 0.0, "completion_length": 2648.3334197998047, "epoch": 0.02057142857142857, "grad_norm": 0.019363267347216606, "kl": 0.00012411177158355713, "lambda_div_used": 0.5733960121870041, "learning_rate": 8.499999999999999e-07, "loss": 0.0156, "reward": -4.522036761045456e-05, "reward_after_mean": -4.522036761045456e-05, "reward_after_std": 0.491512268781662, "reward_before_mean": 0.4940829328261316, "reward_before_std": 0.33302280586212873, "reward_change_max": 0.0, "reward_change_mean": -0.49412815272808075, "reward_change_min": -0.6739438865333796, "reward_change_std": 0.25992031022906303, "reward_std": 0.4915122911334038, "rewards/accuracy_reward": 0.37500000558793545, "rewards/cosine_scaled_reward": 0.11908289603888988, "step": 18 }, { "clip_fraction": 0.0, "completion_length": 2037.93754196167, "epoch": 0.021714285714285714, "grad_norm": 0.0254252627491951, "kl": 0.00010475516319274902, "lambda_div_used": 0.5972657427191734, "learning_rate": 9e-07, "loss": -0.0277, "reward": 0.3277764454251155, "reward_after_mean": 0.3277764454251155, "reward_after_std": 0.7013481389731169, "reward_before_mean": 0.9843392036855221, "reward_before_std": 0.44702923856675625, "reward_change_max": 0.0, "reward_change_mean": -0.6565627679228783, "reward_change_min": -0.9309929609298706, "reward_change_std": 0.3530767587944865, "reward_std": 0.7013481538742781, "rewards/accuracy_reward": 0.6041666697710752, "rewards/cosine_scaled_reward": 0.3801724927034229, "step": 19 }, { "clip_fraction": 0.0, "completion_length": 1404.291706085205, "epoch": 0.022857142857142857, "grad_norm": 0.028516631573438644, "kl": 5.914270877838135e-05, "lambda_div_used": 0.6129282414913177, "learning_rate": 9.499999999999999e-07, "loss": 0.0391, "reward": 0.26722301356494427, "reward_after_mean": 0.26722301356494427, "reward_after_std": 0.6774719897657633, "reward_before_mean": 0.8377129100263119, "reward_before_std": 0.5324758047936484, "reward_change_max": 0.0, "reward_change_mean": -0.5704899430274963, "reward_change_min": -0.8648187033832073, "reward_change_std": 0.3376352610066533, "reward_std": 0.6774720121175051, "rewards/accuracy_reward": 0.5416666716337204, "rewards/cosine_scaled_reward": 0.29604623932391405, "step": 20 }, { "clip_fraction": 0.0, "completion_length": 2424.6042137145996, "epoch": 0.024, "grad_norm": 0.032928258180618286, "kl": 0.000138014554977417, "lambda_div_used": 0.6448317095637321, "learning_rate": 1e-06, "loss": 0.0361, "reward": 0.23116276413202286, "reward_after_mean": 0.23116276413202286, "reward_after_std": 0.7203101813793182, "reward_before_mean": 0.6635394699405879, "reward_before_std": 0.6762269856408238, "reward_change_max": 0.0, "reward_change_mean": -0.4323767013847828, "reward_change_min": -0.7010968029499054, "reward_change_std": 0.2771869823336601, "reward_std": 0.7203102335333824, "rewards/accuracy_reward": 0.43750000931322575, "rewards/cosine_scaled_reward": 0.22603945806622505, "step": 21 }, { "clip_fraction": 0.0, "completion_length": 1410.8750381469727, "epoch": 0.025142857142857144, "grad_norm": 0.03686099499464035, "kl": 0.00010958313941955566, "lambda_div_used": 0.5709755718708038, "learning_rate": 9.99931462820376e-07, "loss": -0.0637, "reward": -0.13723512832075357, "reward_after_mean": -0.13723512832075357, "reward_after_std": 0.4940304774791002, "reward_before_mean": 0.30008178018033504, "reward_before_std": 0.32625696901232004, "reward_change_max": 0.0, "reward_change_mean": -0.4373169243335724, "reward_change_min": -0.6162486486136913, "reward_change_std": 0.23701479192823172, "reward_std": 0.4940304830670357, "rewards/accuracy_reward": 0.31250000186264515, "rewards/cosine_scaled_reward": -0.012418218422681093, "step": 22 }, { "clip_fraction": 0.0, "completion_length": 2354.062568664551, "epoch": 0.026285714285714287, "grad_norm": 0.025602027773857117, "kl": 0.00011620670557022095, "lambda_div_used": 0.6488568410277367, "learning_rate": 9.997258721585931e-07, "loss": 0.0419, "reward": -0.012333650141954422, "reward_after_mean": -0.012333650141954422, "reward_after_std": 0.698169419541955, "reward_before_mean": 0.27921401464845985, "reward_before_std": 0.6954333996400237, "reward_change_max": 0.0, "reward_change_mean": -0.29154767468571663, "reward_change_min": -0.5527428761124611, "reward_change_std": 0.2108509410172701, "reward_std": 0.6981694512069225, "rewards/accuracy_reward": 0.27083333767950535, "rewards/cosine_scaled_reward": 0.008380686864256859, "step": 23 }, { "clip_fraction": 0.0, "completion_length": 1993.1041870117188, "epoch": 0.027428571428571427, "grad_norm": 0.03096413053572178, "kl": 8.243322372436523e-05, "lambda_div_used": 0.6519145146012306, "learning_rate": 9.993832906395582e-07, "loss": 0.0795, "reward": 0.11854812642559409, "reward_after_mean": 0.11854812642559409, "reward_after_std": 0.7567200511693954, "reward_before_mean": 0.49915426783263683, "reward_before_std": 0.7109423782676458, "reward_change_max": 0.0, "reward_change_mean": -0.38060615211725235, "reward_change_min": -0.7067882716655731, "reward_change_std": 0.26373046822845936, "reward_std": 0.7567200735211372, "rewards/accuracy_reward": 0.3750000037252903, "rewards/cosine_scaled_reward": 0.12415427155792713, "step": 24 }, { "clip_fraction": 0.0, "completion_length": 2234.8333435058594, "epoch": 0.02857142857142857, "grad_norm": 0.022234002128243446, "kl": 0.00014199316501617432, "lambda_div_used": 0.6245157197117805, "learning_rate": 9.989038226169207e-07, "loss": 0.0107, "reward": 0.08800357580184937, "reward_after_mean": 0.08800357580184937, "reward_after_std": 0.5656109545379877, "reward_before_mean": 0.46775088645517826, "reward_before_std": 0.5775847099721432, "reward_change_max": 0.0, "reward_change_mean": -0.37974734231829643, "reward_change_min": -0.6116136200726032, "reward_change_std": 0.2511585932224989, "reward_std": 0.5656109638512135, "rewards/accuracy_reward": 0.354166679084301, "rewards/cosine_scaled_reward": 0.1135842353105545, "step": 25 }, { "clip_fraction": 0.0, "completion_length": 2473.708366394043, "epoch": 0.029714285714285714, "grad_norm": 0.02305966429412365, "kl": 0.00014957785606384277, "lambda_div_used": 0.57183438539505, "learning_rate": 9.982876141412855e-07, "loss": -0.0358, "reward": -0.41022508684545755, "reward_after_mean": -0.41022508684545755, "reward_after_std": 0.3927479684352875, "reward_before_mean": -0.15500983409583569, "reward_before_std": 0.32340476755052805, "reward_change_max": 0.0, "reward_change_mean": -0.25521524623036385, "reward_change_min": -0.4159896522760391, "reward_change_std": 0.1455942215397954, "reward_std": 0.3927479758858681, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.21750983409583569, "step": 26 }, { "clip_fraction": 0.0, "completion_length": 2344.5625762939453, "epoch": 0.030857142857142857, "grad_norm": 0.03154224529862404, "kl": 0.00014576315879821777, "lambda_div_used": 0.5766249001026154, "learning_rate": 9.975348529157229e-07, "loss": 0.1289, "reward": -0.04079665243625641, "reward_after_mean": -0.04079665243625641, "reward_after_std": 0.4307698383927345, "reward_before_mean": 0.39524078369140625, "reward_before_std": 0.3457355350255966, "reward_change_max": 0.0, "reward_change_mean": -0.4360374417155981, "reward_change_min": -0.6437131129205227, "reward_change_std": 0.24637807440012693, "reward_std": 0.4307698402553797, "rewards/accuracy_reward": 0.3333333432674408, "rewards/cosine_scaled_reward": 0.061907440423965454, "step": 27 }, { "clip_fraction": 0.0, "completion_length": 2371.4375381469727, "epoch": 0.032, "grad_norm": 0.030750228092074394, "kl": 0.00011971592903137207, "lambda_div_used": 0.6046403273940086, "learning_rate": 9.96645768238595e-07, "loss": 0.0725, "reward": 0.06598741095513105, "reward_after_mean": 0.06598741095513105, "reward_after_std": 0.6246800310909748, "reward_before_mean": 0.5469000339508057, "reward_before_std": 0.47788948379456997, "reward_change_max": 0.0, "reward_change_mean": -0.4809126127511263, "reward_change_min": -0.6964126750826836, "reward_change_std": 0.2671422157436609, "reward_std": 0.6246800404042006, "rewards/accuracy_reward": 0.4375000074505806, "rewards/cosine_scaled_reward": 0.10940003173891455, "step": 28 }, { "clip_fraction": 0.0, "completion_length": 2796.6250762939453, "epoch": 0.03314285714285714, "grad_norm": 0.022529419511556625, "kl": 0.00014966726303100586, "lambda_div_used": 0.5826017782092094, "learning_rate": 9.956206309337066e-07, "loss": -0.0443, "reward": -0.2186606228351593, "reward_after_mean": -0.2186606228351593, "reward_after_std": 0.4133179672062397, "reward_before_mean": 0.09334492683410645, "reward_before_std": 0.3791744504123926, "reward_change_max": 0.0, "reward_change_mean": -0.3120055440813303, "reward_change_min": -0.4988309144973755, "reward_change_std": 0.19146351423114538, "reward_std": 0.41331798397004604, "rewards/accuracy_reward": 0.1666666716337204, "rewards/cosine_scaled_reward": -0.07332174107432365, "step": 29 }, { "clip_fraction": 0.0, "completion_length": 2330.354232788086, "epoch": 0.03428571428571429, "grad_norm": 0.02580900862812996, "kl": 0.00011408329010009766, "lambda_div_used": 0.617066040635109, "learning_rate": 9.944597532678119e-07, "loss": 0.0106, "reward": -0.01328302314504981, "reward_after_mean": -0.01328302314504981, "reward_after_std": 0.6155566833913326, "reward_before_mean": 0.36568982464814326, "reward_before_std": 0.536849819123745, "reward_change_max": 0.0, "reward_change_mean": -0.37897284515202045, "reward_change_min": -0.5798661820590496, "reward_change_std": 0.22223789989948273, "reward_std": 0.6155567076057196, "rewards/accuracy_reward": 0.3125000074505806, "rewards/cosine_scaled_reward": 0.05318982107564807, "step": 30 }, { "clip_fraction": 0.0, "completion_length": 2794.4375381469727, "epoch": 0.03542857142857143, "grad_norm": 0.021817587316036224, "kl": 0.0001348257064819336, "lambda_div_used": 0.6117052882909775, "learning_rate": 9.931634888554935e-07, "loss": 0.0028, "reward": -0.07263503596186638, "reward_after_mean": -0.07263503596186638, "reward_after_std": 0.5405435804277658, "reward_before_mean": 0.26847894000820816, "reward_before_std": 0.5099836494773626, "reward_change_max": 0.0, "reward_change_mean": -0.34111399203538895, "reward_change_min": -0.5489708594977856, "reward_change_std": 0.21398558467626572, "reward_std": 0.5405435990542173, "rewards/accuracy_reward": 0.27083334140479565, "rewards/cosine_scaled_reward": -0.002354402095079422, "step": 31 }, { "clip_fraction": 0.0, "completion_length": 2270.2292098999023, "epoch": 0.036571428571428574, "grad_norm": 0.024800026789307594, "kl": 0.00010867416858673096, "lambda_div_used": 0.6270024925470352, "learning_rate": 9.917322325514487e-07, "loss": 0.0215, "reward": 0.13659005239605904, "reward_after_mean": 0.13659005239605904, "reward_after_std": 0.6418719291687012, "reward_before_mean": 0.5631819479167461, "reward_before_std": 0.5894247069954872, "reward_change_max": 0.0, "reward_change_mean": -0.4265918843448162, "reward_change_min": -0.6561249867081642, "reward_change_std": 0.26332158874720335, "reward_std": 0.6418719589710236, "rewards/accuracy_reward": 0.39583334140479565, "rewards/cosine_scaled_reward": 0.1673485841602087, "step": 32 }, { "clip_fraction": 0.0, "completion_length": 2852.000045776367, "epoch": 0.037714285714285714, "grad_norm": 0.025015637278556824, "kl": 0.00012034177780151367, "lambda_div_used": 0.6343094930052757, "learning_rate": 9.901664203302124e-07, "loss": -0.0682, "reward": 0.12994618620723486, "reward_after_mean": 0.12994618620723486, "reward_after_std": 0.641582889482379, "reward_before_mean": 0.5249918717890978, "reward_before_std": 0.6226585754193366, "reward_change_max": 0.0, "reward_change_mean": -0.39504568465054035, "reward_change_min": -0.6912417784333229, "reward_change_std": 0.2625753004103899, "reward_std": 0.6415829043835402, "rewards/accuracy_reward": 0.3958333432674408, "rewards/cosine_scaled_reward": 0.12915852759033442, "step": 33 }, { "clip_fraction": 0.0, "completion_length": 1936.0000305175781, "epoch": 0.038857142857142854, "grad_norm": 0.027393249794840813, "kl": 0.0001316368579864502, "lambda_div_used": 0.6432492136955261, "learning_rate": 9.88466529153356e-07, "loss": 0.0524, "reward": 0.22461825609207153, "reward_after_mean": 0.22461825609207153, "reward_after_std": 0.6486394293606281, "reward_before_mean": 0.6393125429749489, "reward_before_std": 0.6651058997958899, "reward_change_max": 0.0, "reward_change_mean": -0.41469427943229675, "reward_change_min": -0.6961428225040436, "reward_change_std": 0.27915553003549576, "reward_std": 0.6486394479870796, "rewards/accuracy_reward": 0.416666679084301, "rewards/cosine_scaled_reward": 0.222645852714777, "step": 34 }, { "clip_fraction": 0.0, "completion_length": 2444.270851135254, "epoch": 0.04, "grad_norm": 0.036447569727897644, "kl": 0.0001233518123626709, "lambda_div_used": 0.641115739941597, "learning_rate": 9.866330768241983e-07, "loss": 0.0574, "reward": 0.17524952441453934, "reward_after_mean": 0.17524952441453934, "reward_after_std": 0.6338076200336218, "reward_before_mean": 0.5732492320239544, "reward_before_std": 0.6535743195563555, "reward_change_max": 0.0, "reward_change_mean": -0.39799970760941505, "reward_change_min": -0.6731353290379047, "reward_change_std": 0.2706009875983, "reward_std": 0.6338076237589121, "rewards/accuracy_reward": 0.416666679084301, "rewards/cosine_scaled_reward": 0.15658256597816944, "step": 35 }, { "clip_fraction": 0.0, "completion_length": 3021.5834045410156, "epoch": 0.04114285714285714, "grad_norm": 0.023824643343687057, "kl": 0.00018006563186645508, "lambda_div_used": 0.6088642552495003, "learning_rate": 9.846666218300807e-07, "loss": -0.0045, "reward": -0.21179450303316116, "reward_after_mean": -0.21179450303316116, "reward_after_std": 0.521540641784668, "reward_before_mean": 0.07177379354834557, "reward_before_std": 0.5020219217985868, "reward_change_max": 0.0, "reward_change_mean": -0.2835683096200228, "reward_change_min": -0.5416868068277836, "reward_change_std": 0.1942979209125042, "reward_std": 0.5215406529605389, "rewards/accuracy_reward": 0.1458333358168602, "rewards/cosine_scaled_reward": -0.07405953668057919, "step": 36 }, { "clip_fraction": 0.0, "completion_length": 2800.3541717529297, "epoch": 0.04228571428571429, "grad_norm": 0.022017668932676315, "kl": 0.0001254826784133911, "lambda_div_used": 0.5779423043131828, "learning_rate": 9.825677631722435e-07, "loss": 0.0116, "reward": -0.2564888745546341, "reward_after_mean": -0.2564888745546341, "reward_after_std": 0.4095242340117693, "reward_before_mean": 0.04728756472468376, "reward_before_std": 0.3590739220380783, "reward_change_max": 0.0, "reward_change_mean": -0.3037764262408018, "reward_change_min": -0.48755551874637604, "reward_change_std": 0.18500223569571972, "reward_std": 0.40952424332499504, "rewards/accuracy_reward": 0.1458333358168602, "rewards/cosine_scaled_reward": -0.09854578226804733, "step": 37 }, { "clip_fraction": 0.0, "completion_length": 3207.8541870117188, "epoch": 0.04342857142857143, "grad_norm": 0.01808248646557331, "kl": 0.00016523152589797974, "lambda_div_used": 0.5784263759851456, "learning_rate": 9.80337140183366e-07, "loss": 0.0282, "reward": -0.26364604104310274, "reward_after_mean": -0.26364604104310274, "reward_after_std": 0.40342688001692295, "reward_before_mean": 0.05271115526556969, "reward_before_std": 0.35847953893244267, "reward_change_max": 0.0, "reward_change_mean": -0.31635717302560806, "reward_change_min": -0.5081874057650566, "reward_change_std": 0.19082189723849297, "reward_std": 0.4034268856048584, "rewards/accuracy_reward": 0.1458333395421505, "rewards/cosine_scaled_reward": -0.0931221836945042, "step": 38 }, { "clip_fraction": 0.0, "completion_length": 2402.250045776367, "epoch": 0.044571428571428574, "grad_norm": 0.02232450246810913, "kl": 0.00010313093662261963, "lambda_div_used": 0.594361886382103, "learning_rate": 9.779754323328192e-07, "loss": 0.0018, "reward": -0.1193324881605804, "reward_after_mean": -0.1193324881605804, "reward_after_std": 0.5286196451634169, "reward_before_mean": 0.25862734392285347, "reward_before_std": 0.4300632723607123, "reward_change_max": 0.0, "reward_change_mean": -0.37795985862612724, "reward_change_min": -0.5456654913723469, "reward_change_std": 0.21102675329893827, "reward_std": 0.5286196675151587, "rewards/accuracy_reward": 0.2708333395421505, "rewards/cosine_scaled_reward": -0.01220599515363574, "step": 39 }, { "clip_fraction": 0.0, "completion_length": 2159.604217529297, "epoch": 0.045714285714285714, "grad_norm": 0.02471066638827324, "kl": 0.00011989474296569824, "lambda_div_used": 0.5991866067051888, "learning_rate": 9.754833590196926e-07, "loss": 0.0737, "reward": 0.01806516945362091, "reward_after_mean": 0.01806516945362091, "reward_after_std": 0.5395061280578375, "reward_before_mean": 0.45469519402831793, "reward_before_std": 0.4553617415949702, "reward_change_max": 0.0, "reward_change_mean": -0.43663003854453564, "reward_change_min": -0.6850062496960163, "reward_change_std": 0.2616432458162308, "reward_std": 0.5395061578601599, "rewards/accuracy_reward": 0.3333333395421505, "rewards/cosine_scaled_reward": 0.12136187124997377, "step": 40 }, { "clip_fraction": 0.0, "completion_length": 2768.5000610351562, "epoch": 0.046857142857142854, "grad_norm": 0.021616969257593155, "kl": 0.00012002140283584595, "lambda_div_used": 0.6239832416176796, "learning_rate": 9.728616793536587e-07, "loss": -0.0156, "reward": 0.14593233913183212, "reward_after_mean": 0.14593233913183212, "reward_after_std": 0.63340456597507, "reward_before_mean": 0.5886982697993517, "reward_before_std": 0.5802208222448826, "reward_change_max": 0.0, "reward_change_mean": -0.442765936255455, "reward_change_min": -0.7020618245005608, "reward_change_std": 0.2809063671156764, "reward_std": 0.6334045827388763, "rewards/accuracy_reward": 0.4375000111758709, "rewards/cosine_scaled_reward": 0.1511982548981905, "step": 41 }, { "clip_fraction": 0.0, "completion_length": 2634.8333702087402, "epoch": 0.048, "grad_norm": 0.04166898876428604, "kl": 0.00016657263040542603, "lambda_div_used": 0.5780549123883247, "learning_rate": 9.701111919237408e-07, "loss": 0.0133, "reward": -0.34821823611855507, "reward_after_mean": -0.34821823611855507, "reward_after_std": 0.42264553159475327, "reward_before_mean": -0.07792945206165314, "reward_before_std": 0.3588131470605731, "reward_change_max": 0.0, "reward_change_mean": -0.2702887710183859, "reward_change_min": -0.42083460837602615, "reward_change_std": 0.15945285465568304, "reward_std": 0.42264554649591446, "rewards/accuracy_reward": 0.08333333395421505, "rewards/cosine_scaled_reward": -0.16126279765740037, "step": 42 }, { "clip_fraction": 0.0, "completion_length": 2635.833396911621, "epoch": 0.04914285714285714, "grad_norm": 0.02200184389948845, "kl": 0.0001204218715429306, "lambda_div_used": 0.6316910237073898, "learning_rate": 9.672327345550543e-07, "loss": -0.0063, "reward": 0.021529126912355423, "reward_after_mean": 0.021529126912355423, "reward_after_std": 0.7083056271076202, "reward_before_mean": 0.3985202740877867, "reward_before_std": 0.6068296208977699, "reward_change_max": 0.0, "reward_change_mean": -0.3769911602139473, "reward_change_min": -0.5825657024979591, "reward_change_std": 0.2227043965831399, "reward_std": 0.7083056569099426, "rewards/accuracy_reward": 0.31250000558793545, "rewards/cosine_scaled_reward": 0.08602028086897917, "step": 43 }, { "clip_fraction": 0.0, "completion_length": 2195.229248046875, "epoch": 0.05028571428571429, "grad_norm": 0.036139003932476044, "kl": 0.00012126564979553223, "lambda_div_used": 0.5746868774294853, "learning_rate": 9.64227184053598e-07, "loss": 0.1008, "reward": -0.02278389036655426, "reward_after_mean": -0.02278389036655426, "reward_after_std": 0.43771820329129696, "reward_before_mean": 0.4344022050499916, "reward_before_std": 0.33903054893016815, "reward_change_max": 0.0, "reward_change_mean": -0.457186084240675, "reward_change_min": -0.6404859870672226, "reward_change_std": 0.257523151114583, "reward_std": 0.43771822564303875, "rewards/accuracy_reward": 0.3333333432674408, "rewards/cosine_scaled_reward": 0.10106886364519596, "step": 44 }, { "clip_fraction": 0.0, "completion_length": 3065.2083740234375, "epoch": 0.05142857142857143, "grad_norm": 0.01725860871374607, "kl": 0.0001525580883026123, "lambda_div_used": 0.6148836985230446, "learning_rate": 9.610954559391704e-07, "loss": 0.0504, "reward": 0.07026529498398304, "reward_after_mean": 0.07026529498398304, "reward_after_std": 0.5995844416320324, "reward_before_mean": 0.49401637725532055, "reward_before_std": 0.5296608861535788, "reward_change_max": 0.0, "reward_change_mean": -0.42375105805695057, "reward_change_min": -0.6395902335643768, "reward_change_std": 0.25529387686401606, "reward_std": 0.599584462121129, "rewards/accuracy_reward": 0.37500000931322575, "rewards/cosine_scaled_reward": 0.11901635373942554, "step": 45 }, { "clip_fraction": 0.0, "completion_length": 2750.166702270508, "epoch": 0.052571428571428575, "grad_norm": 0.024388441815972328, "kl": 0.0001780986785888672, "lambda_div_used": 0.5804363936185837, "learning_rate": 9.578385041664925e-07, "loss": 0.045, "reward": -0.34557132300687954, "reward_after_mean": -0.34557132300687954, "reward_after_std": 0.41662513464689255, "reward_before_mean": -0.07300508208572865, "reward_before_std": 0.3654432473704219, "reward_change_max": 0.0, "reward_change_mean": -0.2725662402808666, "reward_change_min": -0.43397457897663116, "reward_change_std": 0.16234493535012007, "reward_std": 0.4166251439601183, "rewards/accuracy_reward": 0.08333333395421505, "rewards/cosine_scaled_reward": -0.15633842581883073, "step": 46 }, { "clip_fraction": 0.0, "completion_length": 2203.437568664551, "epoch": 0.053714285714285714, "grad_norm": 0.02950853668153286, "kl": 0.00010627508163452148, "lambda_div_used": 0.6182287782430649, "learning_rate": 9.54457320834625e-07, "loss": -0.0342, "reward": 0.08941356465220451, "reward_after_mean": 0.08941356465220451, "reward_after_std": 0.5494941845536232, "reward_before_mean": 0.48168421536684036, "reward_before_std": 0.5449010655283928, "reward_change_max": 0.0, "reward_change_mean": -0.39227062091231346, "reward_change_min": -0.6192042604088783, "reward_change_std": 0.2517077624797821, "reward_std": 0.5494942031800747, "rewards/accuracy_reward": 0.354166679084301, "rewards/cosine_scaled_reward": 0.12751753628253937, "step": 47 }, { "clip_fraction": 0.0, "completion_length": 2591.729232788086, "epoch": 0.054857142857142854, "grad_norm": 0.026953846216201782, "kl": 0.00012552738189697266, "lambda_div_used": 0.6436027362942696, "learning_rate": 9.509529358847654e-07, "loss": 0.0003, "reward": 0.07579808123409748, "reward_after_mean": 0.07579808123409748, "reward_after_std": 0.7408107332885265, "reward_before_mean": 0.4633368235081434, "reward_before_std": 0.6680604638531804, "reward_change_max": 0.0, "reward_change_mean": -0.38753873854875565, "reward_change_min": -0.686689231544733, "reward_change_std": 0.25357643235474825, "reward_std": 0.740810751914978, "rewards/accuracy_reward": 0.33333333767950535, "rewards/cosine_scaled_reward": 0.130003463011235, "step": 48 }, { "clip_fraction": 0.0, "completion_length": 1606.1875267028809, "epoch": 0.056, "grad_norm": 0.03145647421479225, "kl": 8.93324613571167e-05, "lambda_div_used": 0.6448555663228035, "learning_rate": 9.473264167865171e-07, "loss": -0.0559, "reward": 0.11411497555673122, "reward_after_mean": 0.11411497555673122, "reward_after_std": 0.6693379506468773, "reward_before_mean": 0.456937775015831, "reward_before_std": 0.679039599490352, "reward_change_max": 0.0, "reward_change_mean": -0.3428228013217449, "reward_change_min": -0.5654460191726685, "reward_change_std": 0.23543909844011068, "reward_std": 0.6693379702046514, "rewards/accuracy_reward": 0.35416667722165585, "rewards/cosine_scaled_reward": 0.10277110431343317, "step": 49 }, { "clip_fraction": 0.0, "completion_length": 2721.18754196167, "epoch": 0.05714285714285714, "grad_norm": 0.020998205989599228, "kl": 0.00010278820991516113, "lambda_div_used": 0.5569720417261124, "learning_rate": 9.43578868212728e-07, "loss": 0.0198, "reward": -0.014975886791944504, "reward_after_mean": -0.014975886791944504, "reward_after_std": 0.47250125743448734, "reward_before_mean": 0.5332869850099087, "reward_before_std": 0.25590797886252403, "reward_change_max": 0.0, "reward_change_mean": -0.5482628662139177, "reward_change_min": -0.7288287468254566, "reward_change_std": 0.2796243606135249, "reward_std": 0.4725012853741646, "rewards/accuracy_reward": 0.39583333395421505, "rewards/cosine_scaled_reward": 0.13745362346526235, "step": 50 }, { "clip_fraction": 0.0, "completion_length": 2238.145851135254, "epoch": 0.05828571428571429, "grad_norm": 0.030202677473425865, "kl": 0.00016352534294128418, "lambda_div_used": 0.589074470102787, "learning_rate": 9.397114317029974e-07, "loss": -0.0263, "reward": -0.32303538359701633, "reward_after_mean": -0.32303538359701633, "reward_after_std": 0.4511607848107815, "reward_before_mean": -0.059135761111974716, "reward_before_std": 0.41417009476572275, "reward_change_max": 0.0, "reward_change_mean": -0.26389962807297707, "reward_change_min": -0.46818122640252113, "reward_change_std": 0.1724827392026782, "reward_std": 0.4511608015745878, "rewards/accuracy_reward": 0.1041666679084301, "rewards/cosine_scaled_reward": -0.16330242389813066, "step": 51 }, { "clip_fraction": 0.0, "completion_length": 2562.2292098999023, "epoch": 0.05942857142857143, "grad_norm": 0.027546504512429237, "kl": 0.00012978166341781616, "lambda_div_used": 0.5993086248636246, "learning_rate": 9.357252853159505e-07, "loss": 0.0385, "reward": 0.08331240899860859, "reward_after_mean": 0.08331240899860859, "reward_after_std": 0.5890753846615553, "reward_before_mean": 0.5538598063867539, "reward_before_std": 0.45862336084246635, "reward_change_max": 0.0, "reward_change_mean": -0.4705474264919758, "reward_change_min": -0.6809027269482613, "reward_change_std": 0.26692016143351793, "reward_std": 0.5890753846615553, "rewards/accuracy_reward": 0.43750000558793545, "rewards/cosine_scaled_reward": 0.11635981127619743, "step": 52 }, { "clip_fraction": 0.0, "completion_length": 2494.8750762939453, "epoch": 0.060571428571428575, "grad_norm": 0.024038787931203842, "kl": 0.0001424252986907959, "lambda_div_used": 0.6427036076784134, "learning_rate": 9.316216432703916e-07, "loss": 0.0239, "reward": 0.016640717163681984, "reward_after_mean": 0.016640717163681984, "reward_after_std": 0.6831068731844425, "reward_before_mean": 0.334790101274848, "reward_before_std": 0.665732966735959, "reward_change_max": 0.0, "reward_change_mean": -0.318149384111166, "reward_change_min": -0.5409456379711628, "reward_change_std": 0.2164039220660925, "reward_std": 0.6831068824976683, "rewards/accuracy_reward": 0.29166667349636555, "rewards/cosine_scaled_reward": 0.043123436626046896, "step": 53 }, { "clip_fraction": 0.0, "completion_length": 1934.7083892822266, "epoch": 0.061714285714285715, "grad_norm": 0.0318799689412117, "kl": 9.445101022720337e-05, "lambda_div_used": 0.6365627199411392, "learning_rate": 9.274017555754407e-07, "loss": 0.0748, "reward": 0.47428043745458126, "reward_after_mean": 0.47428043745458126, "reward_after_std": 0.7259879875928164, "reward_before_mean": 1.077655490487814, "reward_before_std": 0.6342989937402308, "reward_change_max": 0.0, "reward_change_mean": -0.6033750809729099, "reward_change_min": -0.9116491675376892, "reward_change_std": 0.3683675564825535, "reward_std": 0.7259880118072033, "rewards/accuracy_reward": 0.6458333469927311, "rewards/cosine_scaled_reward": 0.43182216165587306, "step": 54 }, { "clip_fraction": 0.0, "completion_length": 2623.7708892822266, "epoch": 0.06285714285714286, "grad_norm": 0.02094973810017109, "kl": 0.00013570114970207214, "lambda_div_used": 0.645888201892376, "learning_rate": 9.230669076497687e-07, "loss": 0.0808, "reward": 0.17671905946917832, "reward_after_mean": 0.17671905946917832, "reward_after_std": 0.7367929276078939, "reward_before_mean": 0.5884007401764393, "reward_before_std": 0.6809105025604367, "reward_change_max": 0.0, "reward_change_mean": -0.4116816818714142, "reward_change_min": -0.6507021151483059, "reward_change_std": 0.258549933321774, "reward_std": 0.7367929276078939, "rewards/accuracy_reward": 0.3958333395421505, "rewards/cosine_scaled_reward": 0.19256740622222424, "step": 55 }, { "clip_fraction": 0.0, "completion_length": 2702.8750915527344, "epoch": 0.064, "grad_norm": 0.02438957802951336, "kl": 0.00014606118202209473, "lambda_div_used": 0.583276279270649, "learning_rate": 9.186184199300463e-07, "loss": 0.0126, "reward": -0.2818741099908948, "reward_after_mean": -0.2818741099908948, "reward_after_std": 0.4212169963866472, "reward_before_mean": 0.015760678332298994, "reward_before_std": 0.38252383656799793, "reward_change_max": 0.0, "reward_change_mean": -0.2976347878575325, "reward_change_min": -0.5124507918953896, "reward_change_std": 0.1885841079056263, "reward_std": 0.4212170038372278, "rewards/accuracy_reward": 0.1250000037252903, "rewards/cosine_scaled_reward": -0.1092393291182816, "step": 56 }, { "clip_fraction": 0.0, "completion_length": 3056.4583740234375, "epoch": 0.06514285714285714, "grad_norm": 0.01594528928399086, "kl": 0.00010664761066436768, "lambda_div_used": 0.6316090971231461, "learning_rate": 9.140576474687263e-07, "loss": 0.0184, "reward": -0.039651480969041586, "reward_after_mean": -0.039651480969041586, "reward_after_std": 0.6351467221975327, "reward_before_mean": 0.2791806310415268, "reward_before_std": 0.605388393625617, "reward_change_max": 0.0, "reward_change_mean": -0.3188321180641651, "reward_change_min": -0.5200116373598576, "reward_change_std": 0.20627015084028244, "reward_std": 0.6351467464119196, "rewards/accuracy_reward": 0.2708333432674408, "rewards/cosine_scaled_reward": 0.008347294380655512, "step": 57 }, { "clip_fraction": 0.0, "completion_length": 1619.8541946411133, "epoch": 0.06628571428571428, "grad_norm": 0.028398146852850914, "kl": 7.768720388412476e-05, "lambda_div_used": 0.6244253218173981, "learning_rate": 9.093859795212817e-07, "loss": -0.0744, "reward": 0.02804100140929222, "reward_after_mean": 0.02804100140929222, "reward_after_std": 0.6312676724046469, "reward_before_mean": 0.41605534171685576, "reward_before_std": 0.5784196928143501, "reward_change_max": 0.0, "reward_change_mean": -0.38801432587206364, "reward_change_min": -0.6227842308580875, "reward_change_std": 0.24862205237150192, "reward_std": 0.6312677096575499, "rewards/accuracy_reward": 0.35416666977107525, "rewards/cosine_scaled_reward": 0.061888658441603184, "step": 58 }, { "clip_fraction": 0.0, "completion_length": 2521.937530517578, "epoch": 0.06742857142857143, "grad_norm": 0.023369140923023224, "kl": 9.158626198768616e-05, "lambda_div_used": 0.6007750853896141, "learning_rate": 9.046048391230247e-07, "loss": 0.0054, "reward": -0.07107383571565151, "reward_after_mean": -0.07107383571565151, "reward_after_std": 0.49415648356080055, "reward_before_mean": 0.2814117716625333, "reward_before_std": 0.466181633528322, "reward_change_max": 0.0, "reward_change_mean": -0.3524855989962816, "reward_change_min": -0.5633360184729099, "reward_change_std": 0.22167869098484516, "reward_std": 0.49415648356080055, "rewards/accuracy_reward": 0.2500000111758709, "rewards/cosine_scaled_reward": 0.03141175117343664, "step": 59 }, { "clip_fraction": 0.0, "completion_length": 2424.3125762939453, "epoch": 0.06857142857142857, "grad_norm": 0.020705759525299072, "kl": 0.00011293590068817139, "lambda_div_used": 0.6067081466317177, "learning_rate": 8.997156826556369e-07, "loss": 0.0407, "reward": 0.07130313850939274, "reward_after_mean": 0.07130313850939274, "reward_after_std": 0.5420917756855488, "reward_before_mean": 0.5138208344578743, "reward_before_std": 0.4931760486215353, "reward_change_max": 0.0, "reward_change_mean": -0.4425176791846752, "reward_change_min": -0.6790216974914074, "reward_change_std": 0.2717863190919161, "reward_std": 0.5420917868614197, "rewards/accuracy_reward": 0.3750000074505806, "rewards/cosine_scaled_reward": 0.1388207976706326, "step": 60 }, { "clip_fraction": 0.0, "completion_length": 2410.625030517578, "epoch": 0.06971428571428571, "grad_norm": 0.0203632153570652, "kl": 9.970366954803467e-05, "lambda_div_used": 0.5674436464905739, "learning_rate": 8.9471999940354e-07, "loss": 0.0006, "reward": -0.1596299186348915, "reward_after_mean": -0.1596299186348915, "reward_after_std": 0.40844789519906044, "reward_before_mean": 0.2439738381654024, "reward_before_std": 0.3030575467273593, "reward_change_max": 0.0, "reward_change_mean": -0.40360378473997116, "reward_change_min": -0.562778364866972, "reward_change_std": 0.21779226139187813, "reward_std": 0.4084479194134474, "rewards/accuracy_reward": 0.25000000558793545, "rewards/cosine_scaled_reward": -0.006026154384016991, "step": 61 }, { "clip_fraction": 0.0, "completion_length": 1986.6875381469727, "epoch": 0.07085714285714285, "grad_norm": 0.023419808596372604, "kl": 8.377432823181152e-05, "lambda_div_used": 0.6294708475470543, "learning_rate": 8.896193111002475e-07, "loss": 0.0145, "reward": 0.127364382147789, "reward_after_mean": 0.127364382147789, "reward_after_std": 0.6607769038528204, "reward_before_mean": 0.5438444633036852, "reward_before_std": 0.6062875427305698, "reward_change_max": 0.0, "reward_change_mean": -0.41648009419441223, "reward_change_min": -0.6949719563126564, "reward_change_std": 0.27056892681866884, "reward_std": 0.6607769187539816, "rewards/accuracy_reward": 0.4166666753590107, "rewards/cosine_scaled_reward": 0.12717779609374702, "step": 62 }, { "clip_fraction": 0.0, "completion_length": 1558.8542098999023, "epoch": 0.072, "grad_norm": 0.029614871367812157, "kl": 9.24495980143547e-05, "lambda_div_used": 0.5882120281457901, "learning_rate": 8.844151714648274e-07, "loss": -0.0217, "reward": 0.24260340631008148, "reward_after_mean": 0.24260340631008148, "reward_after_std": 0.5563407000154257, "reward_before_mean": 0.8341647423803806, "reward_before_std": 0.4078605566173792, "reward_change_max": 0.0, "reward_change_mean": -0.59156134724617, "reward_change_min": -0.8355859033763409, "reward_change_std": 0.33061067573726177, "reward_std": 0.5563407260924578, "rewards/accuracy_reward": 0.5416666716337204, "rewards/cosine_scaled_reward": 0.29249807819724083, "step": 63 }, { "clip_fraction": 0.0, "completion_length": 2526.979217529297, "epoch": 0.07314285714285715, "grad_norm": 0.023758206516504288, "kl": 0.00014294683933258057, "lambda_div_used": 0.5706712529063225, "learning_rate": 8.791091657286267e-07, "loss": -0.0491, "reward": 0.005135258659720421, "reward_after_mean": 0.005135258659720421, "reward_after_std": 0.4730408936738968, "reward_before_mean": 0.509556919336319, "reward_before_std": 0.318298134021461, "reward_change_max": 0.0, "reward_change_mean": -0.5044216811656952, "reward_change_min": -0.6950805820524693, "reward_change_std": 0.2665313957259059, "reward_std": 0.4730409197509289, "rewards/accuracy_reward": 0.37500000558793545, "rewards/cosine_scaled_reward": 0.13455691374838352, "step": 64 }, { "clip_fraction": 0.0, "completion_length": 2619.6875343322754, "epoch": 0.07428571428571429, "grad_norm": 0.02972961962223053, "kl": 9.997934103012085e-05, "lambda_div_used": 0.5918664485216141, "learning_rate": 8.737029101523929e-07, "loss": 0.0061, "reward": -0.009068667888641357, "reward_after_mean": -0.009068667888641357, "reward_after_std": 0.523827837780118, "reward_before_mean": 0.42262596264481544, "reward_before_std": 0.4240496205165982, "reward_change_max": 0.0, "reward_change_mean": -0.43169461004436016, "reward_change_min": -0.6695713810622692, "reward_change_std": 0.25297324638813734, "reward_std": 0.5238278452306986, "rewards/accuracy_reward": 0.3541666679084301, "rewards/cosine_scaled_reward": 0.06845926493406296, "step": 65 }, { "clip_fraction": 0.0, "completion_length": 2048.645835876465, "epoch": 0.07542857142857143, "grad_norm": 0.029114792123436928, "kl": 8.37370753288269e-05, "lambda_div_used": 0.5725493803620338, "learning_rate": 8.681980515339463e-07, "loss": -0.0155, "reward": -0.16004172409884632, "reward_after_mean": -0.16004172409884632, "reward_after_std": 0.49360031075775623, "reward_before_mean": 0.2602699510753155, "reward_before_std": 0.33009787695482373, "reward_change_max": 0.0, "reward_change_mean": -0.4203116577118635, "reward_change_min": -0.6120298802852631, "reward_change_std": 0.2290408704429865, "reward_std": 0.49360031820833683, "rewards/accuracy_reward": 0.31250000186264515, "rewards/cosine_scaled_reward": -0.05223005823791027, "step": 66 }, { "clip_fraction": 0.0, "completion_length": 3108.1458740234375, "epoch": 0.07657142857142857, "grad_norm": 0.017497915774583817, "kl": 0.00010813027620315552, "lambda_div_used": 0.630346029996872, "learning_rate": 8.625962667065487e-07, "loss": -0.024, "reward": -0.17341885343194008, "reward_after_mean": -0.17341885343194008, "reward_after_std": 0.6313954871147871, "reward_before_mean": 0.0926114417379722, "reward_before_std": 0.6103347176685929, "reward_change_max": 0.0, "reward_change_mean": -0.26603029295802116, "reward_change_min": -0.5225372426211834, "reward_change_std": 0.1893094191327691, "reward_std": 0.6313955169171095, "rewards/accuracy_reward": 0.16666666977107525, "rewards/cosine_scaled_reward": -0.07405522745102644, "step": 67 }, { "clip_fraction": 0.0, "completion_length": 1551.3541870117188, "epoch": 0.07771428571428571, "grad_norm": 0.03526793047785759, "kl": 9.113550186157227e-05, "lambda_div_used": 0.6158603206276894, "learning_rate": 8.568992620281243e-07, "loss": -0.07, "reward": -0.04862045869231224, "reward_after_mean": -0.04862045869231224, "reward_after_std": 0.5381349269300699, "reward_before_mean": 0.3049982152879238, "reward_before_std": 0.5310354437679052, "reward_change_max": 0.0, "reward_change_mean": -0.35361868515610695, "reward_change_min": -0.6167616136372089, "reward_change_std": 0.2329900823533535, "reward_std": 0.5381349604576826, "rewards/accuracy_reward": 0.291666679084301, "rewards/cosine_scaled_reward": 0.013331551104784012, "step": 68 }, { "clip_fraction": 0.0, "completion_length": 1843.0000305175781, "epoch": 0.07885714285714286, "grad_norm": 0.034535001963377, "kl": 0.00010597705841064453, "lambda_div_used": 0.6196342781186104, "learning_rate": 8.511087728614862e-07, "loss": -0.0403, "reward": -0.16936753690242767, "reward_after_mean": -0.16936753690242767, "reward_after_std": 0.5928534604609013, "reward_before_mean": 0.10612463857978582, "reward_before_std": 0.5493131745606661, "reward_change_max": 0.0, "reward_change_mean": -0.2754921726882458, "reward_change_min": -0.4671623595058918, "reward_change_std": 0.1736066685989499, "reward_std": 0.592853469774127, "rewards/accuracy_reward": 0.1666666716337204, "rewards/cosine_scaled_reward": -0.06054202886298299, "step": 69 }, { "clip_fraction": 0.0, "completion_length": 2385.6250610351562, "epoch": 0.08, "grad_norm": 0.022255579009652138, "kl": 9.971857070922852e-05, "lambda_div_used": 0.5713987499475479, "learning_rate": 8.452265630457282e-07, "loss": 0.0293, "reward": -0.15412342175841331, "reward_after_mean": -0.15412342175841331, "reward_after_std": 0.4452939387410879, "reward_before_mean": 0.2508242540061474, "reward_before_std": 0.32261871080845594, "reward_change_max": 0.0, "reward_change_mean": -0.4049476757645607, "reward_change_min": -0.5730207115411758, "reward_change_std": 0.21870618779212236, "reward_std": 0.44529395177960396, "rewards/accuracy_reward": 0.25000000558793545, "rewards/cosine_scaled_reward": 0.0008242330513894558, "step": 70 }, { "clip_fraction": 0.0, "completion_length": 2444.6666717529297, "epoch": 0.08114285714285714, "grad_norm": 0.03434319049119949, "kl": 0.0001207888126373291, "lambda_div_used": 0.6165208369493484, "learning_rate": 8.392544243589427e-07, "loss": -0.0114, "reward": -0.04324318375438452, "reward_after_mean": -0.04324318375438452, "reward_after_std": 0.5684425849467516, "reward_before_mean": 0.3016075724735856, "reward_before_std": 0.535145154222846, "reward_change_max": 0.0, "reward_change_mean": -0.3448507599532604, "reward_change_min": -0.5520102642476559, "reward_change_std": 0.21349613554775715, "reward_std": 0.5684425886720419, "rewards/accuracy_reward": 0.27083334513008595, "rewards/cosine_scaled_reward": 0.030774242244660854, "step": 71 }, { "clip_fraction": 0.0, "completion_length": 2275.229232788086, "epoch": 0.08228571428571428, "grad_norm": 0.026326576247811317, "kl": 0.0001144111156463623, "lambda_div_used": 0.5820747911930084, "learning_rate": 8.331941759724268e-07, "loss": 0.0486, "reward": -0.22072702879086137, "reward_after_mean": -0.22072702879086137, "reward_after_std": 0.4294308237731457, "reward_before_mean": 0.11065018083900213, "reward_before_std": 0.37079737335443497, "reward_change_max": 0.0, "reward_change_mean": -0.3313772287219763, "reward_change_min": -0.49349113181233406, "reward_change_std": 0.18969058711081743, "reward_std": 0.4294308312237263, "rewards/accuracy_reward": 0.16666667349636555, "rewards/cosine_scaled_reward": -0.05601647589355707, "step": 72 }, { "clip_fraction": 0.0, "completion_length": 3070.5209350585938, "epoch": 0.08342857142857144, "grad_norm": 0.0183473639190197, "kl": 0.00015050172805786133, "lambda_div_used": 0.6504631415009499, "learning_rate": 8.270476638965461e-07, "loss": 0.0732, "reward": -0.024378453381359577, "reward_after_mean": -0.024378453381359577, "reward_after_std": 0.719476904720068, "reward_before_mean": 0.25941105699166656, "reward_before_std": 0.7029449231922626, "reward_change_max": 0.0, "reward_change_mean": -0.28378950990736485, "reward_change_min": -0.4811365678906441, "reward_change_std": 0.19341129437088966, "reward_std": 0.7194769158959389, "rewards/accuracy_reward": 0.2708333395421505, "rewards/cosine_scaled_reward": -0.01142227090895176, "step": 73 }, { "clip_fraction": 0.0, "completion_length": 2230.604202270508, "epoch": 0.08457142857142858, "grad_norm": 0.02717745117843151, "kl": 9.462237358093262e-05, "lambda_div_used": 0.6418861970305443, "learning_rate": 8.208167604184217e-07, "loss": -0.0956, "reward": -0.0033467919565737247, "reward_after_mean": -0.0033467919565737247, "reward_after_std": 0.6568808052688837, "reward_before_mean": 0.3098285049200058, "reward_before_std": 0.6575386971235275, "reward_change_max": 0.0, "reward_change_mean": -0.3131752759218216, "reward_change_min": -0.6135218031704426, "reward_change_std": 0.22661382239311934, "reward_std": 0.6568808313459158, "rewards/accuracy_reward": 0.2916666753590107, "rewards/cosine_scaled_reward": 0.018161814659833908, "step": 74 }, { "clip_fraction": 0.0, "completion_length": 2656.416732788086, "epoch": 0.08571428571428572, "grad_norm": 0.018360283225774765, "kl": 0.00010110437870025635, "lambda_div_used": 0.5670187771320343, "learning_rate": 8.145033635316128e-07, "loss": -0.0053, "reward": 0.006984639912843704, "reward_after_mean": 0.006984639912843704, "reward_after_std": 0.42523463629186153, "reward_before_mean": 0.5017829714342952, "reward_before_std": 0.3057099119760096, "reward_change_max": 0.0, "reward_change_mean": -0.49479835107922554, "reward_change_min": -0.685304194688797, "reward_change_std": 0.27397861890494823, "reward_std": 0.4252346530556679, "rewards/accuracy_reward": 0.3333333432674408, "rewards/cosine_scaled_reward": 0.1684496277011931, "step": 75 }, { "clip_fraction": 0.0, "completion_length": 2364.0000228881836, "epoch": 0.08685714285714285, "grad_norm": 0.026309814304113388, "kl": 0.00010971724987030029, "lambda_div_used": 0.5593390390276909, "learning_rate": 8.081093963579707e-07, "loss": 0.0787, "reward": -0.24861154425889254, "reward_after_mean": -0.24861154425889254, "reward_after_std": 0.3816519398242235, "reward_before_mean": 0.14198310300707817, "reward_before_std": 0.2715805321931839, "reward_change_max": 0.0, "reward_change_mean": -0.3905946556478739, "reward_change_min": -0.5774808749556541, "reward_change_std": 0.21815251000225544, "reward_std": 0.3816519435495138, "rewards/accuracy_reward": 0.1666666679084301, "rewards/cosine_scaled_reward": -0.02468355232849717, "step": 76 }, { "clip_fraction": 0.0, "completion_length": 2667.750030517578, "epoch": 0.088, "grad_norm": 0.020327381789684296, "kl": 0.00012464821338653564, "lambda_div_used": 0.5539242178201675, "learning_rate": 8.01636806561836e-07, "loss": -0.003, "reward": -0.2628857381641865, "reward_after_mean": -0.2628857381641865, "reward_after_std": 0.31219773180782795, "reward_before_mean": 0.10476060304790735, "reward_before_std": 0.24125095596536994, "reward_change_max": 0.0, "reward_change_mean": -0.36764636635780334, "reward_change_min": -0.5086396895349026, "reward_change_std": 0.20270386710762978, "reward_std": 0.3121977373957634, "rewards/accuracy_reward": 0.1875000074505806, "rewards/cosine_scaled_reward": -0.08273938857018948, "step": 77 }, { "clip_fraction": 0.0, "completion_length": 2686.8125534057617, "epoch": 0.08914285714285715, "grad_norm": 0.021180735900998116, "kl": 0.0001368001103401184, "lambda_div_used": 0.6677093878388405, "learning_rate": 7.950875657567621e-07, "loss": 0.11, "reward": 0.1766232904046774, "reward_after_mean": 0.1766232904046774, "reward_after_std": 0.8389766626060009, "reward_before_mean": 0.5535875726491213, "reward_before_std": 0.7847605030983686, "reward_change_max": 0.0, "reward_change_mean": -0.3769642859697342, "reward_change_min": -0.6617914140224457, "reward_change_std": 0.25535117369145155, "reward_std": 0.8389766924083233, "rewards/accuracy_reward": 0.3958333358168602, "rewards/cosine_scaled_reward": 0.15775423496961594, "step": 78 }, { "clip_fraction": 0.0, "completion_length": 1962.8750457763672, "epoch": 0.09028571428571429, "grad_norm": 0.030601773411035538, "kl": 8.64267349243164e-05, "lambda_div_used": 0.6305629685521126, "learning_rate": 7.884636689049422e-07, "loss": -0.0874, "reward": -0.07519742846488953, "reward_after_mean": -0.07519742846488953, "reward_after_std": 0.6089170537889004, "reward_before_mean": 0.22960891388356686, "reward_before_std": 0.611657090485096, "reward_change_max": 0.0, "reward_change_mean": -0.304806362837553, "reward_change_min": -0.5647807456552982, "reward_change_std": 0.21934652887284756, "reward_std": 0.608917074277997, "rewards/accuracy_reward": 0.2708333432674408, "rewards/cosine_scaled_reward": -0.04122441209619865, "step": 79 }, { "clip_fraction": 0.0, "completion_length": 2891.8333587646484, "epoch": 0.09142857142857143, "grad_norm": 0.020956117659807205, "kl": 0.00014778971672058105, "lambda_div_used": 0.5999866053462029, "learning_rate": 7.817671337095244e-07, "loss": 0.0016, "reward": -0.013454930856823921, "reward_after_mean": -0.013454930856823921, "reward_after_std": 0.5396539904177189, "reward_before_mean": 0.39186157658696175, "reward_before_std": 0.4594053290784359, "reward_change_max": 0.0, "reward_change_mean": -0.4053164832293987, "reward_change_min": -0.6001381352543831, "reward_change_std": 0.23710554651916027, "reward_std": 0.5396539978682995, "rewards/accuracy_reward": 0.33333334140479565, "rewards/cosine_scaled_reward": 0.058528220281004906, "step": 80 }, { "clip_fraction": 0.0, "completion_length": 2756.125057220459, "epoch": 0.09257142857142857, "grad_norm": 0.0333174467086792, "kl": 0.0001837015151977539, "lambda_div_used": 0.6196143180131912, "learning_rate": 7.75e-07, "loss": -0.019, "reward": -0.15753823146224022, "reward_after_mean": -0.15753823146224022, "reward_after_std": 0.5845062825828791, "reward_before_mean": 0.12821976901614107, "reward_before_std": 0.5542377643287182, "reward_change_max": 0.0, "reward_change_mean": -0.28575799986720085, "reward_change_min": -0.5201962739229202, "reward_change_std": 0.1913682147860527, "reward_std": 0.5845062825828791, "rewards/accuracy_reward": 0.20833334140479565, "rewards/cosine_scaled_reward": -0.08011356927454472, "step": 81 }, { "clip_fraction": 0.0, "completion_length": 2416.895851135254, "epoch": 0.09371428571428571, "grad_norm": 0.02389819547533989, "kl": 0.00010943412780761719, "lambda_div_used": 0.6039423421025276, "learning_rate": 7.681643291108517e-07, "loss": -0.0069, "reward": -0.03246039338409901, "reward_after_mean": -0.03246039338409901, "reward_after_std": 0.5534762311726809, "reward_before_mean": 0.3658472504466772, "reward_before_std": 0.4814961114898324, "reward_change_max": 0.0, "reward_change_mean": -0.3983076363801956, "reward_change_min": -0.655058030039072, "reward_change_std": 0.24730877578258514, "reward_std": 0.5534762516617775, "rewards/accuracy_reward": 0.3125000074505806, "rewards/cosine_scaled_reward": 0.05334722436964512, "step": 82 }, { "clip_fraction": 0.0, "completion_length": 2424.3125534057617, "epoch": 0.09485714285714286, "grad_norm": 0.032633859664201736, "kl": 0.00012442469596862793, "lambda_div_used": 0.6332797482609749, "learning_rate": 7.612622032536507e-07, "loss": 0.0298, "reward": 0.024711462669074535, "reward_after_mean": 0.024711462669074535, "reward_after_std": 0.6729160957038403, "reward_before_mean": 0.3993762247264385, "reward_before_std": 0.6236082511022687, "reward_change_max": 0.0, "reward_change_mean": -0.3746647536754608, "reward_change_min": -0.691291693598032, "reward_change_std": 0.2538198195397854, "reward_std": 0.6729161199182272, "rewards/accuracy_reward": 0.3125000037252903, "rewards/cosine_scaled_reward": 0.08687621541321278, "step": 83 }, { "clip_fraction": 0.0, "completion_length": 2199.125026702881, "epoch": 0.096, "grad_norm": 0.023145966231822968, "kl": 9.820610284805298e-05, "lambda_div_used": 0.6221627593040466, "learning_rate": 7.54295724882796e-07, "loss": -0.0603, "reward": 0.019974265713244677, "reward_after_mean": 0.019974265713244677, "reward_after_std": 0.6427162848412991, "reward_before_mean": 0.40708103217184544, "reward_before_std": 0.5660292999818921, "reward_change_max": 0.0, "reward_change_mean": -0.3871067576110363, "reward_change_min": -0.6353648640215397, "reward_change_std": 0.23979215417057276, "reward_std": 0.6427163053303957, "rewards/accuracy_reward": 0.33333334140479565, "rewards/cosine_scaled_reward": 0.07374768820591271, "step": 84 }, { "clip_fraction": 0.0, "completion_length": 2697.812545776367, "epoch": 0.09714285714285714, "grad_norm": 0.018655812367796898, "kl": 0.0001118779182434082, "lambda_div_used": 0.6938974410295486, "learning_rate": 7.472670160550848e-07, "loss": 0.0247, "reward": 0.22309327218681574, "reward_after_mean": 0.22309327218681574, "reward_after_std": 0.8897394463419914, "reward_before_mean": 0.5425103409215808, "reward_before_std": 0.909519312903285, "reward_change_max": 0.0, "reward_change_mean": -0.3194170743227005, "reward_change_min": -0.6195828355848789, "reward_change_std": 0.24075988680124283, "reward_std": 0.889739491045475, "rewards/accuracy_reward": 0.3750000111758709, "rewards/cosine_scaled_reward": 0.16751032788306475, "step": 85 }, { "clip_fraction": 0.0, "completion_length": 2550.145851135254, "epoch": 0.09828571428571428, "grad_norm": 0.02583778277039528, "kl": 0.00013977289199829102, "lambda_div_used": 0.5981776341795921, "learning_rate": 7.401782177833147e-07, "loss": -0.0016, "reward": -0.1776493340730667, "reward_after_mean": -0.1776493340730667, "reward_after_std": 0.4848842676728964, "reward_before_mean": 0.13707906752824783, "reward_before_std": 0.44980547949671745, "reward_change_max": 0.0, "reward_change_mean": -0.31472842022776604, "reward_change_min": -0.5129407718777657, "reward_change_std": 0.19486056733876467, "reward_std": 0.4848842900246382, "rewards/accuracy_reward": 0.16666667349636555, "rewards/cosine_scaled_reward": -0.029587595723569393, "step": 86 }, { "clip_fraction": 0.0, "completion_length": 2216.520866394043, "epoch": 0.09942857142857142, "grad_norm": 0.026928512379527092, "kl": 0.00014719367027282715, "lambda_div_used": 0.5552037805318832, "learning_rate": 7.330314893841101e-07, "loss": -0.0112, "reward": -0.14393189689144492, "reward_after_mean": -0.14393189689144492, "reward_after_std": 0.41725931130349636, "reward_before_mean": 0.3129472378641367, "reward_before_std": 0.24625429138541222, "reward_change_max": 0.0, "reward_change_mean": -0.4568791352212429, "reward_change_min": -0.6228058040142059, "reward_change_std": 0.2342971321195364, "reward_std": 0.41725931875407696, "rewards/accuracy_reward": 0.27083333395421505, "rewards/cosine_scaled_reward": 0.04211391881108284, "step": 87 }, { "clip_fraction": 0.0, "completion_length": 1673.833366394043, "epoch": 0.10057142857142858, "grad_norm": 0.03403550013899803, "kl": 9.492039680480957e-05, "lambda_div_used": 0.6714882552623749, "learning_rate": 7.258290078201731e-07, "loss": 0.1357, "reward": 0.23210743255913258, "reward_after_mean": 0.23210743255913258, "reward_after_std": 0.7776901721954346, "reward_before_mean": 0.6004263032227755, "reward_before_std": 0.8064676076173782, "reward_change_max": 0.0, "reward_change_mean": -0.3683188706636429, "reward_change_min": -0.6842552609741688, "reward_change_std": 0.2728601209819317, "reward_std": 0.7776901982724667, "rewards/accuracy_reward": 0.4375000149011612, "rewards/cosine_scaled_reward": 0.16292626922950149, "step": 88 }, { "clip_fraction": 0.0, "completion_length": 2455.875030517578, "epoch": 0.10171428571428572, "grad_norm": 0.02145099826157093, "kl": 0.00010183453559875488, "lambda_div_used": 0.6517865061759949, "learning_rate": 7.185729670371604e-07, "loss": 0.0088, "reward": 0.022896312177181244, "reward_after_mean": 0.022896312177181244, "reward_after_std": 0.7065913639962673, "reward_before_mean": 0.327204130589962, "reward_before_std": 0.7111460026353598, "reward_change_max": 0.0, "reward_change_mean": -0.30430781841278076, "reward_change_min": -0.5758539959788322, "reward_change_std": 0.22174649592489004, "reward_std": 0.7065913733094931, "rewards/accuracy_reward": 0.27083333767950535, "rewards/cosine_scaled_reward": 0.056370790116488934, "step": 89 }, { "clip_fraction": 0.0, "completion_length": 2390.3125381469727, "epoch": 0.10285714285714286, "grad_norm": 0.04259632155299187, "kl": 0.00015431642532348633, "lambda_div_used": 0.5984295755624771, "learning_rate": 7.11265577295385e-07, "loss": 0.034, "reward": -0.32630743458867073, "reward_after_mean": -0.32630743458867073, "reward_after_std": 0.5043698158115149, "reward_before_mean": -0.07746448495890945, "reward_before_std": 0.4487060569226742, "reward_change_max": 0.0, "reward_change_mean": -0.24884295091032982, "reward_change_min": -0.42664676532149315, "reward_change_std": 0.15124379005283117, "reward_std": 0.5043698251247406, "rewards/accuracy_reward": 0.0833333358168602, "rewards/cosine_scaled_reward": -0.1607978215906769, "step": 90 }, { "clip_fraction": 0.0, "completion_length": 2609.041702270508, "epoch": 0.104, "grad_norm": 0.022521013393998146, "kl": 0.0001251697540283203, "lambda_div_used": 0.6242434978485107, "learning_rate": 7.039090644965509e-07, "loss": 0.0254, "reward": 0.06410847418010235, "reward_after_mean": 0.06410847418010235, "reward_after_std": 0.6396163944154978, "reward_before_mean": 0.4679965991526842, "reward_before_std": 0.5784324184060097, "reward_change_max": 0.0, "reward_change_mean": -0.4038881305605173, "reward_change_min": -0.6435716077685356, "reward_change_std": 0.2524958234280348, "reward_std": 0.6396164130419493, "rewards/accuracy_reward": 0.35416666977107525, "rewards/cosine_scaled_reward": 0.11382993124425411, "step": 91 }, { "clip_fraction": 0.0, "completion_length": 2140.312526702881, "epoch": 0.10514285714285715, "grad_norm": 0.028272144496440887, "kl": 8.565187454223633e-05, "lambda_div_used": 0.6041740253567696, "learning_rate": 6.965056695057204e-07, "loss": -0.0131, "reward": -0.22151808440685272, "reward_after_mean": -0.22151808440685272, "reward_after_std": 0.5138680338859558, "reward_before_mean": 0.06442609056830406, "reward_before_std": 0.48106229305267334, "reward_change_max": 0.0, "reward_change_mean": -0.28594417311251163, "reward_change_min": -0.4845799170434475, "reward_change_std": 0.1817196160554886, "reward_std": 0.5138680376112461, "rewards/accuracy_reward": 0.16666667349636555, "rewards/cosine_scaled_reward": -0.10224058385938406, "step": 92 }, { "clip_fraction": 0.0, "completion_length": 3470.3333435058594, "epoch": 0.10628571428571429, "grad_norm": 0.02051234431564808, "kl": 0.00021988153457641602, "lambda_div_used": 0.550777792930603, "learning_rate": 6.890576474687263e-07, "loss": 0.0098, "reward": -0.4400870492681861, "reward_after_mean": -0.4400870492681861, "reward_after_std": 0.30945760011672974, "reward_before_mean": -0.15331347286701202, "reward_before_std": 0.22546498104929924, "reward_change_max": 0.0, "reward_change_mean": -0.2867735829204321, "reward_change_min": -0.40346677228808403, "reward_change_std": 0.15250255912542343, "reward_std": 0.3094576168805361, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.17414680309593678, "step": 93 }, { "clip_fraction": 0.0, "completion_length": 2433.375045776367, "epoch": 0.10742857142857143, "grad_norm": 0.02587928995490074, "kl": 0.0001605413854122162, "lambda_div_used": 0.5997196212410927, "learning_rate": 6.815672671252315e-07, "loss": 0.0496, "reward": -0.1711340295150876, "reward_after_mean": -0.1711340295150876, "reward_after_std": 0.5549158975481987, "reward_before_mean": 0.1783520970493555, "reward_before_std": 0.45881492272019386, "reward_change_max": 0.0, "reward_change_mean": -0.34948613308370113, "reward_change_min": -0.5870647989213467, "reward_change_std": 0.21199375297874212, "reward_std": 0.5549159198999405, "rewards/accuracy_reward": 0.22916666977107525, "rewards/cosine_scaled_reward": -0.05081457580672577, "step": 94 }, { "clip_fraction": 0.0, "completion_length": 3021.2083740234375, "epoch": 0.10857142857142857, "grad_norm": 0.016700398176908493, "kl": 0.00013339519500732422, "lambda_div_used": 0.6040999740362167, "learning_rate": 6.740368101176495e-07, "loss": 0.0501, "reward": -0.10988862998783588, "reward_after_mean": -0.10988862998783588, "reward_after_std": 0.574177211150527, "reward_before_mean": 0.259714370011352, "reward_before_std": 0.4856903199106455, "reward_change_max": 0.0, "reward_change_mean": -0.36960301361978054, "reward_change_min": -0.6000017635524273, "reward_change_std": 0.2258477583527565, "reward_std": 0.5741772279143333, "rewards/accuracy_reward": 0.25000000186264515, "rewards/cosine_scaled_reward": 0.009714371990412474, "step": 95 }, { "clip_fraction": 0.0, "completion_length": 2388.979202270508, "epoch": 0.10971428571428571, "grad_norm": 0.02693852037191391, "kl": 0.00010056048631668091, "lambda_div_used": 0.6452958509325981, "learning_rate": 6.664685702961344e-07, "loss": 0.0131, "reward": 0.1582602821290493, "reward_after_mean": 0.1582602821290493, "reward_after_std": 0.7448761742562056, "reward_before_mean": 0.5632272865623236, "reward_before_std": 0.6762064695358276, "reward_change_max": 0.0, "reward_change_mean": -0.4049670249223709, "reward_change_min": -0.6407857313752174, "reward_change_std": 0.2506500957533717, "reward_std": 0.7448761742562056, "rewards/accuracy_reward": 0.3958333395421505, "rewards/cosine_scaled_reward": 0.1673939572647214, "step": 96 }, { "clip_fraction": 0.0, "completion_length": 2700.0000534057617, "epoch": 0.11085714285714286, "grad_norm": 0.022453829646110535, "kl": 0.00012642145156860352, "lambda_div_used": 0.5770404115319252, "learning_rate": 6.588648530198504e-07, "loss": 0.0441, "reward": -0.24632946588099003, "reward_after_mean": -0.24632946588099003, "reward_after_std": 0.4705936200916767, "reward_before_mean": 0.10453066416084766, "reward_before_std": 0.3494142349809408, "reward_change_max": 0.0, "reward_change_mean": -0.35086013562977314, "reward_change_min": -0.5019582267850637, "reward_change_std": 0.18468604423105717, "reward_std": 0.4705936200916767, "rewards/accuracy_reward": 0.1666666679084301, "rewards/cosine_scaled_reward": -0.06213599909096956, "step": 97 }, { "clip_fraction": 0.0, "completion_length": 2192.8750534057617, "epoch": 0.112, "grad_norm": 0.021812062710523605, "kl": 8.285045623779297e-05, "lambda_div_used": 0.6265708059072495, "learning_rate": 6.512279744547392e-07, "loss": 0.0464, "reward": -0.06961194425821304, "reward_after_mean": -0.06961194425821304, "reward_after_std": 0.705444872379303, "reward_before_mean": 0.27397448010742664, "reward_before_std": 0.5812770891934633, "reward_change_max": 0.0, "reward_change_mean": -0.3435864243656397, "reward_change_min": -0.521237924695015, "reward_change_std": 0.19378468580543995, "reward_std": 0.7054448891431093, "rewards/accuracy_reward": 0.2500000037252903, "rewards/cosine_scaled_reward": 0.023974468291271478, "step": 98 }, { "clip_fraction": 0.0, "completion_length": 2786.1875534057617, "epoch": 0.11314285714285714, "grad_norm": 0.02357642538845539, "kl": 0.00013721734285354614, "lambda_div_used": 0.6171735525131226, "learning_rate": 6.435602608679916e-07, "loss": 0.0071, "reward": -0.024897070601582527, "reward_after_mean": -0.024897070601582527, "reward_after_std": 0.6280203014612198, "reward_before_mean": 0.3600337319076061, "reward_before_std": 0.5423067910596728, "reward_change_max": 0.0, "reward_change_mean": -0.3849308080971241, "reward_change_min": -0.6169135756790638, "reward_change_std": 0.2341146618127823, "reward_std": 0.6280203089118004, "rewards/accuracy_reward": 0.31250000558793545, "rewards/cosine_scaled_reward": 0.04753373749554157, "step": 99 }, { "clip_fraction": 0.0, "completion_length": 2387.687545776367, "epoch": 0.11428571428571428, "grad_norm": 0.023666124790906906, "kl": 0.00012056529521942139, "lambda_div_used": 0.6473532766103745, "learning_rate": 6.358640479194451e-07, "loss": -0.0079, "reward": 0.10418231040239334, "reward_after_mean": 0.10418231040239334, "reward_after_std": 0.6740316934883595, "reward_before_mean": 0.4593420661985874, "reward_before_std": 0.6859642090275884, "reward_change_max": 0.0, "reward_change_mean": -0.3551597539335489, "reward_change_min": -0.6693484336137772, "reward_change_std": 0.2530285455286503, "reward_std": 0.6740317121148109, "rewards/accuracy_reward": 0.3333333432674408, "rewards/cosine_scaled_reward": 0.12600870989263058, "step": 100 }, { "clip_fraction": 0.0, "completion_length": 2082.4167098999023, "epoch": 0.11542857142857142, "grad_norm": 0.025257760658860207, "kl": 0.00012496113777160645, "lambda_div_used": 0.6337181106209755, "learning_rate": 6.281416799501187e-07, "loss": 0.0127, "reward": 0.08042715396732092, "reward_after_mean": 0.08042715396732092, "reward_after_std": 0.6870364677160978, "reward_before_mean": 0.4686305020004511, "reward_before_std": 0.6316956970840693, "reward_change_max": 0.0, "reward_change_mean": -0.3882033359259367, "reward_change_min": -0.6386316269636154, "reward_change_std": 0.2518463246524334, "reward_std": 0.6870364770293236, "rewards/accuracy_reward": 0.33333333767950535, "rewards/cosine_scaled_reward": 0.13529715640470386, "step": 101 }, { "clip_fraction": 0.0, "completion_length": 2037.208381652832, "epoch": 0.11657142857142858, "grad_norm": 0.030245469883084297, "kl": 0.00012689828872680664, "lambda_div_used": 0.6057270467281342, "learning_rate": 6.203955092681039e-07, "loss": 0.0419, "reward": 0.04565976280719042, "reward_after_mean": 0.04565976280719042, "reward_after_std": 0.58235695771873, "reward_before_mean": 0.49184186570346355, "reward_before_std": 0.48894889652729034, "reward_change_max": 0.0, "reward_change_mean": -0.4461820814758539, "reward_change_min": -0.7175954841077328, "reward_change_std": 0.27023847959935665, "reward_std": 0.5823569800704718, "rewards/accuracy_reward": 0.3125000074505806, "rewards/cosine_scaled_reward": 0.17934184102341533, "step": 102 }, { "clip_fraction": 0.0, "completion_length": 2204.291717529297, "epoch": 0.11771428571428572, "grad_norm": 0.035654906183481216, "kl": 0.00011385977268218994, "lambda_div_used": 0.5589818432927132, "learning_rate": 6.126278954320294e-07, "loss": -0.0085, "reward": -0.2170967198908329, "reward_after_mean": -0.2170967198908329, "reward_after_std": 0.37913205102086067, "reward_before_mean": 0.17684321105480194, "reward_before_std": 0.26896010898053646, "reward_change_max": 0.0, "reward_change_mean": -0.3939399253576994, "reward_change_min": -0.5747586265206337, "reward_change_std": 0.2185236681252718, "reward_std": 0.3791320640593767, "rewards/accuracy_reward": 0.1875, "rewards/cosine_scaled_reward": -0.010656798258423805, "step": 103 }, { "clip_fraction": 0.0, "completion_length": 2460.1458435058594, "epoch": 0.11885714285714286, "grad_norm": 0.034317746758461, "kl": 0.00014722347259521484, "lambda_div_used": 0.5736617371439934, "learning_rate": 6.048412045323164e-07, "loss": -0.0349, "reward": -0.1729673482477665, "reward_after_mean": -0.1729673482477665, "reward_after_std": 0.44365703873336315, "reward_before_mean": 0.2049333555623889, "reward_before_std": 0.3348153894767165, "reward_change_max": 0.0, "reward_change_mean": -0.3779007289558649, "reward_change_min": -0.5351628288626671, "reward_change_std": 0.20819698367267847, "reward_std": 0.4436570517718792, "rewards/accuracy_reward": 0.22916666977107525, "rewards/cosine_scaled_reward": -0.024233306758105755, "step": 104 }, { "clip_fraction": 0.0, "completion_length": 2288.958381652832, "epoch": 0.12, "grad_norm": 0.02615894190967083, "kl": 0.00012151896953582764, "lambda_div_used": 0.6090050563216209, "learning_rate": 5.97037808470444e-07, "loss": 0.0354, "reward": 0.04345609247684479, "reward_after_mean": 0.04345609247684479, "reward_after_std": 0.5673400796949863, "reward_before_mean": 0.4590000305324793, "reward_before_std": 0.5081056347116828, "reward_change_max": 0.0, "reward_change_mean": -0.41554390639066696, "reward_change_min": -0.6591046005487442, "reward_change_std": 0.25887916050851345, "reward_std": 0.5673400945961475, "rewards/accuracy_reward": 0.3333333358168602, "rewards/cosine_scaled_reward": 0.1256666723638773, "step": 105 }, { "clip_fraction": 0.0, "completion_length": 1890.4791870117188, "epoch": 0.12114285714285715, "grad_norm": 0.02484404481947422, "kl": 5.9254467487335205e-05, "lambda_div_used": 0.5908190160989761, "learning_rate": 5.892200842364462e-07, "loss": -0.031, "reward": 0.13268680218607187, "reward_after_mean": 0.13268680218607187, "reward_after_std": 0.5557667016983032, "reward_before_mean": 0.6536997258663177, "reward_before_std": 0.4149925457313657, "reward_change_max": 0.0, "reward_change_mean": -0.5210129152983427, "reward_change_min": -0.7558874487876892, "reward_change_std": 0.2945323744788766, "reward_std": 0.555766711011529, "rewards/accuracy_reward": 0.4791666753590107, "rewards/cosine_scaled_reward": 0.1745330523699522, "step": 106 }, { "clip_fraction": 0.0, "completion_length": 2767.75004196167, "epoch": 0.12228571428571429, "grad_norm": 0.02508847787976265, "kl": 0.00016683340072631836, "lambda_div_used": 0.5527122691273689, "learning_rate": 5.813904131848564e-07, "loss": 0.0551, "reward": -0.10423782840371132, "reward_after_mean": -0.10423782840371132, "reward_after_std": 0.40749385207891464, "reward_before_mean": 0.3917595148086548, "reward_before_std": 0.23519209399819374, "reward_change_max": 0.0, "reward_change_mean": -0.4959973506629467, "reward_change_min": -0.6841400265693665, "reward_change_std": 0.26082153245806694, "reward_std": 0.40749386698007584, "rewards/accuracy_reward": 0.3125, "rewards/cosine_scaled_reward": 0.07925950735807419, "step": 107 }, { "clip_fraction": 0.0, "completion_length": 2765.1041870117188, "epoch": 0.12342857142857143, "grad_norm": 0.01993192359805107, "kl": 0.0001267939805984497, "lambda_div_used": 0.5970002189278603, "learning_rate": 5.735511803093248e-07, "loss": -0.0109, "reward": 0.0014873668551445007, "reward_after_mean": 0.0014873668551445007, "reward_after_std": 0.5301447622478008, "reward_before_mean": 0.4201008062809706, "reward_before_std": 0.44587238878011703, "reward_change_max": 0.0, "reward_change_mean": -0.41861344687640667, "reward_change_min": -0.6076503656804562, "reward_change_std": 0.24174897000193596, "reward_std": 0.5301447845995426, "rewards/accuracy_reward": 0.33333334140479565, "rewards/cosine_scaled_reward": 0.08676748792640865, "step": 108 }, { "clip_fraction": 0.0, "completion_length": 2777.8541946411133, "epoch": 0.12457142857142857, "grad_norm": 0.023633325472474098, "kl": 0.0001358315348625183, "lambda_div_used": 0.5594293028116226, "learning_rate": 5.657047735161255e-07, "loss": -0.0301, "reward": -0.27439120411872864, "reward_after_mean": -0.27439120411872864, "reward_after_std": 0.3761340919882059, "reward_before_mean": 0.09332936629652977, "reward_before_std": 0.26834795251488686, "reward_change_max": 0.0, "reward_change_mean": -0.36772056482732296, "reward_change_min": -0.5622114911675453, "reward_change_std": 0.20468105003237724, "reward_std": 0.37613409385085106, "rewards/accuracy_reward": 0.1666666679084301, "rewards/cosine_scaled_reward": -0.07333731185644865, "step": 109 }, { "clip_fraction": 0.0, "completion_length": 2538.7917098999023, "epoch": 0.12571428571428572, "grad_norm": 0.028952863067388535, "kl": 0.00011619925498962402, "lambda_div_used": 0.6277726292610168, "learning_rate": 5.578535828967777e-07, "loss": -0.0225, "reward": 0.13001340767368674, "reward_after_mean": 0.13001340767368674, "reward_after_std": 0.6052438467741013, "reward_before_mean": 0.5293129477649927, "reward_before_std": 0.5916427094489336, "reward_change_max": 0.0, "reward_change_mean": -0.3992995321750641, "reward_change_min": -0.6062962152063847, "reward_change_std": 0.25030655320733786, "reward_std": 0.6052438709884882, "rewards/accuracy_reward": 0.41666668467223644, "rewards/cosine_scaled_reward": 0.1126462584361434, "step": 110 }, { "clip_fraction": 0.0, "completion_length": 2590.833351135254, "epoch": 0.12685714285714286, "grad_norm": 0.023588471114635468, "kl": 0.00016444921493530273, "lambda_div_used": 0.6258220672607422, "learning_rate": 5.5e-07, "loss": 0.0326, "reward": 0.05748726427555084, "reward_after_mean": 0.05748726427555084, "reward_after_std": 0.6602962389588356, "reward_before_mean": 0.4434679429978132, "reward_before_std": 0.5859999302774668, "reward_change_max": 0.0, "reward_change_mean": -0.3859807029366493, "reward_change_min": -0.6280175969004631, "reward_change_std": 0.24166050180792809, "reward_std": 0.6602962575852871, "rewards/accuracy_reward": 0.33333333767950535, "rewards/cosine_scaled_reward": 0.11013460718095303, "step": 111 }, { "clip_fraction": 0.0, "completion_length": 2659.3750915527344, "epoch": 0.128, "grad_norm": 0.022469794377684593, "kl": 0.0001360177993774414, "lambda_div_used": 0.6310129314661026, "learning_rate": 5.421464171032224e-07, "loss": 0.0414, "reward": 0.12392518669366837, "reward_after_mean": 0.12392518669366837, "reward_after_std": 0.6471672505140305, "reward_before_mean": 0.5420562420040369, "reward_before_std": 0.6111889835447073, "reward_change_max": 0.0, "reward_change_mean": -0.4181310646235943, "reward_change_min": -0.6937783919274807, "reward_change_std": 0.27559296786785126, "reward_std": 0.6471672654151917, "rewards/accuracy_reward": 0.3958333395421505, "rewards/cosine_scaled_reward": 0.14622290851548314, "step": 112 }, { "clip_fraction": 0.0, "completion_length": 1887.0000534057617, "epoch": 0.12914285714285714, "grad_norm": 0.034688595682382584, "kl": 0.0001182258129119873, "lambda_div_used": 0.6175974532961845, "learning_rate": 5.342952264838747e-07, "loss": 0.0045, "reward": -0.10040622856467962, "reward_after_mean": -0.10040622856467962, "reward_after_std": 0.5642315912991762, "reward_before_mean": 0.21734675765037537, "reward_before_std": 0.5426071379333735, "reward_change_max": 0.0, "reward_change_mean": -0.31775298714637756, "reward_change_min": -0.5628380477428436, "reward_change_std": 0.21154501475393772, "reward_std": 0.564231613650918, "rewards/accuracy_reward": 0.22916667349636555, "rewards/cosine_scaled_reward": -0.011819899722468108, "step": 113 }, { "clip_fraction": 0.0, "completion_length": 2079.3541870117188, "epoch": 0.13028571428571428, "grad_norm": 0.023916104808449745, "kl": 8.407607674598694e-05, "lambda_div_used": 0.5991180911660194, "learning_rate": 5.264488196906752e-07, "loss": 0.0147, "reward": -0.25019002705812454, "reward_after_mean": -0.25019002705812454, "reward_after_std": 0.48966687358915806, "reward_before_mean": 0.01878603477962315, "reward_before_std": 0.4579437389038503, "reward_change_max": 0.0, "reward_change_mean": -0.26897607184946537, "reward_change_min": -0.4482330121099949, "reward_change_std": 0.17346476390957832, "reward_std": 0.4896668866276741, "rewards/accuracy_reward": 0.16666666977107525, "rewards/cosine_scaled_reward": -0.1478806473314762, "step": 114 }, { "clip_fraction": 0.0, "completion_length": 2811.6041984558105, "epoch": 0.13142857142857142, "grad_norm": 0.02382122538983822, "kl": 0.00011992454528808594, "lambda_div_used": 0.6031630709767342, "learning_rate": 5.186095868151436e-07, "loss": 0.0298, "reward": -0.024587277323007584, "reward_after_mean": -0.024587277323007584, "reward_after_std": 0.5156112629920244, "reward_before_mean": 0.36486465483903885, "reward_before_std": 0.47580901626497507, "reward_change_max": 0.0, "reward_change_mean": -0.38945191726088524, "reward_change_min": -0.6165403127670288, "reward_change_std": 0.24257665127515793, "reward_std": 0.5156112778931856, "rewards/accuracy_reward": 0.3541666716337204, "rewards/cosine_scaled_reward": 0.01069796271622181, "step": 115 }, { "clip_fraction": 0.0, "completion_length": 3174.166679382324, "epoch": 0.13257142857142856, "grad_norm": 0.0280291810631752, "kl": 0.000163152813911438, "lambda_div_used": 0.5893594026565552, "learning_rate": 5.107799157635538e-07, "loss": -0.0065, "reward": -0.2329910285770893, "reward_after_mean": -0.2329910285770893, "reward_after_std": 0.42214493826031685, "reward_before_mean": 0.07756753638386726, "reward_before_std": 0.4099442269653082, "reward_change_max": 0.0, "reward_change_mean": -0.31055857613682747, "reward_change_min": -0.5211558938026428, "reward_change_std": 0.2011605817824602, "reward_std": 0.42214495688676834, "rewards/accuracy_reward": 0.1458333358168602, "rewards/cosine_scaled_reward": -0.06826579943299294, "step": 116 }, { "clip_fraction": 0.0, "completion_length": 2800.854232788086, "epoch": 0.1337142857142857, "grad_norm": 0.021884813904762268, "kl": 0.00016610324382781982, "lambda_div_used": 0.5740129947662354, "learning_rate": 5.02962191529556e-07, "loss": -0.0155, "reward": -0.4151143445633352, "reward_after_mean": -0.4151143445633352, "reward_after_std": 0.41484479792416096, "reward_before_mean": -0.17339750938117504, "reward_before_std": 0.3358896663412452, "reward_change_max": 0.0, "reward_change_mean": -0.24171681702136993, "reward_change_min": -0.35830606892704964, "reward_change_std": 0.12972851190716028, "reward_std": 0.41484480164945126, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.21506418148055673, "step": 117 }, { "clip_fraction": 0.0, "completion_length": 2905.0000610351562, "epoch": 0.13485714285714287, "grad_norm": 0.019685380160808563, "kl": 0.00012448430061340332, "lambda_div_used": 0.6509419903159142, "learning_rate": 4.951587954676837e-07, "loss": 0.0699, "reward": 0.37886764854192734, "reward_after_mean": 0.37886764854192734, "reward_after_std": 0.8599284738302231, "reward_before_mean": 0.9192078877240419, "reward_before_std": 0.7036966122686863, "reward_change_max": 0.0, "reward_change_mean": -0.54034024477005, "reward_change_min": -0.8373602591454983, "reward_change_std": 0.3177654892206192, "reward_std": 0.8599285036325455, "rewards/accuracy_reward": 0.5833333395421505, "rewards/cosine_scaled_reward": 0.33587456680834293, "step": 118 }, { "clip_fraction": 0.0, "completion_length": 1735.0416946411133, "epoch": 0.136, "grad_norm": 0.03752947598695755, "kl": 9.926781058311462e-05, "lambda_div_used": 0.615598551928997, "learning_rate": 4.873721045679706e-07, "loss": 0.0198, "reward": 0.11371836951002479, "reward_after_mean": 0.11371836951002479, "reward_after_std": 0.608397152274847, "reward_before_mean": 0.5552113465964794, "reward_before_std": 0.5343325138092041, "reward_change_max": 0.0, "reward_change_mean": -0.44149295426905155, "reward_change_min": -0.6740178428590298, "reward_change_std": 0.26589817740023136, "reward_std": 0.6083971671760082, "rewards/accuracy_reward": 0.37500000931322575, "rewards/cosine_scaled_reward": 0.18021131958812475, "step": 119 }, { "clip_fraction": 0.0, "completion_length": 2124.3958702087402, "epoch": 0.13714285714285715, "grad_norm": 0.03277261555194855, "kl": 0.00016069412231445312, "lambda_div_used": 0.621355876326561, "learning_rate": 4.79604490731896e-07, "loss": -0.0313, "reward": 0.030127520207315683, "reward_after_mean": 0.030127520207315683, "reward_after_std": 0.6104090996086597, "reward_before_mean": 0.3965782462619245, "reward_before_std": 0.5573655245825648, "reward_change_max": 0.0, "reward_change_mean": -0.36645070649683475, "reward_change_min": -0.5546461082994938, "reward_change_std": 0.21761172730475664, "reward_std": 0.6104091145098209, "rewards/accuracy_reward": 0.3541666753590107, "rewards/cosine_scaled_reward": 0.04241155949421227, "step": 120 }, { "clip_fraction": 0.0, "completion_length": 1674.895866394043, "epoch": 0.1382857142857143, "grad_norm": 0.028996704146265984, "kl": 0.00010481476783752441, "lambda_div_used": 0.5694864094257355, "learning_rate": 4.7185832004988133e-07, "loss": 0.0117, "reward": -0.15189427509903908, "reward_after_mean": -0.15189427509903908, "reward_after_std": 0.4316890323534608, "reward_before_mean": 0.2471799086779356, "reward_before_std": 0.3141373130492866, "reward_change_max": 0.0, "reward_change_mean": -0.3990741856396198, "reward_change_min": -0.5547297224402428, "reward_change_std": 0.21437653806060553, "reward_std": 0.43168904818594456, "rewards/accuracy_reward": 0.25000000558793545, "rewards/cosine_scaled_reward": -0.002820094407070428, "step": 121 }, { "clip_fraction": 0.0, "completion_length": 2674.1666870117188, "epoch": 0.13942857142857143, "grad_norm": 0.02413249760866165, "kl": 0.00017333030700683594, "lambda_div_used": 0.5866860523819923, "learning_rate": 4.641359520805548e-07, "loss": -0.0399, "reward": 0.04396146908402443, "reward_after_mean": 0.04396146908402443, "reward_after_std": 0.5285588596016169, "reward_before_mean": 0.5353102702647448, "reward_before_std": 0.40454091038554907, "reward_change_max": 0.0, "reward_change_mean": -0.4913488235324621, "reward_change_min": -0.7136706411838531, "reward_change_std": 0.2844190578907728, "reward_std": 0.5285588726401329, "rewards/accuracy_reward": 0.3958333358168602, "rewards/cosine_scaled_reward": 0.13947694562375546, "step": 122 }, { "clip_fraction": 0.0, "completion_length": 2401.270881652832, "epoch": 0.14057142857142857, "grad_norm": 0.021569611504673958, "kl": 0.00012353062629699707, "lambda_div_used": 0.560750350356102, "learning_rate": 4.5643973913200837e-07, "loss": -0.0352, "reward": -0.11011217907071114, "reward_after_mean": -0.11011217907071114, "reward_after_std": 0.43483646027743816, "reward_before_mean": 0.3471109885722399, "reward_before_std": 0.2799733504652977, "reward_change_max": 0.0, "reward_change_mean": -0.4572231862694025, "reward_change_min": -0.6473490297794342, "reward_change_std": 0.2491364972665906, "reward_std": 0.4348364770412445, "rewards/accuracy_reward": 0.3125, "rewards/cosine_scaled_reward": 0.034610994160175323, "step": 123 }, { "clip_fraction": 0.0, "completion_length": 2023.1875381469727, "epoch": 0.1417142857142857, "grad_norm": 0.024433298036456108, "kl": 8.402764797210693e-05, "lambda_div_used": 0.5741237699985504, "learning_rate": 4.4877202554526084e-07, "loss": 0.0177, "reward": 0.11256878264248371, "reward_after_mean": 0.11256878264248371, "reward_after_std": 0.5534880999475718, "reward_before_mean": 0.7005790947005153, "reward_before_std": 0.34279677364975214, "reward_change_max": 0.0, "reward_change_mean": -0.5880102626979351, "reward_change_min": -0.8437007814645767, "reward_change_std": 0.32156766299158335, "reward_std": 0.5534881185740232, "rewards/accuracy_reward": 0.4583333358168602, "rewards/cosine_scaled_reward": 0.2422457179054618, "step": 124 }, { "clip_fraction": 0.0, "completion_length": 2390.12504196167, "epoch": 0.14285714285714285, "grad_norm": 0.027825474739074707, "kl": 9.434670209884644e-05, "lambda_div_used": 0.5816653594374657, "learning_rate": 4.4113514698014953e-07, "loss": -0.1441, "reward": -0.022026576101779938, "reward_after_mean": -0.022026576101779938, "reward_after_std": 0.5128289703279734, "reward_before_mean": 0.4427599459886551, "reward_before_std": 0.37189827114343643, "reward_change_max": 0.0, "reward_change_mean": -0.4647865351289511, "reward_change_min": -0.6719168424606323, "reward_change_std": 0.2539686840027571, "reward_std": 0.5128289721906185, "rewards/accuracy_reward": 0.33333333395421505, "rewards/cosine_scaled_reward": 0.10942662274464965, "step": 125 }, { "clip_fraction": 0.0, "completion_length": 2348.625030517578, "epoch": 0.144, "grad_norm": 0.024593623355031013, "kl": 0.0001001209020614624, "lambda_div_used": 0.5944257900118828, "learning_rate": 4.3353142970386557e-07, "loss": 0.0235, "reward": -0.036125872284173965, "reward_after_mean": -0.036125872284173965, "reward_after_std": 0.5458226818591356, "reward_before_mean": 0.3889114623889327, "reward_before_std": 0.43509659357368946, "reward_change_max": 0.0, "reward_change_mean": -0.4250373411923647, "reward_change_min": -0.6385823003947735, "reward_change_std": 0.24260270595550537, "reward_std": 0.5458226818591356, "rewards/accuracy_reward": 0.29166667349636555, "rewards/cosine_scaled_reward": 0.09724476374685764, "step": 126 }, { "clip_fraction": 0.0, "completion_length": 3143.229217529297, "epoch": 0.14514285714285713, "grad_norm": 0.021421613171696663, "kl": 0.0001538693904876709, "lambda_div_used": 0.5594507232308388, "learning_rate": 4.2596318988235037e-07, "loss": -0.0322, "reward": -0.4113082066178322, "reward_after_mean": -0.4113082066178322, "reward_after_std": 0.34261589869856834, "reward_before_mean": -0.14389780722558498, "reward_before_std": 0.269620718434453, "reward_change_max": 0.0, "reward_change_mean": -0.2674104031175375, "reward_change_min": -0.4196575991809368, "reward_change_std": 0.15403888188302517, "reward_std": 0.3426159042865038, "rewards/accuracy_reward": 0.0625, "rewards/cosine_scaled_reward": -0.20639780443161726, "step": 127 }, { "clip_fraction": 0.0, "completion_length": 2329.5208740234375, "epoch": 0.1462857142857143, "grad_norm": 0.025876455008983612, "kl": 0.00014778971672058105, "lambda_div_used": 0.5925442725419998, "learning_rate": 4.1843273287476854e-07, "loss": -0.0218, "reward": 0.04296835511922836, "reward_after_mean": 0.04296835511922836, "reward_after_std": 0.5716155916452408, "reward_before_mean": 0.5175326284952462, "reward_before_std": 0.4212250765413046, "reward_change_max": 0.0, "reward_change_mean": -0.4745642766356468, "reward_change_min": -0.6585596464574337, "reward_change_std": 0.25283501856029034, "reward_std": 0.571615606546402, "rewards/accuracy_reward": 0.3958333395421505, "rewards/cosine_scaled_reward": 0.12169927265495062, "step": 128 }, { "clip_fraction": 0.0, "completion_length": 3137.4583892822266, "epoch": 0.14742857142857144, "grad_norm": 0.018007410690188408, "kl": 0.00014001131057739258, "lambda_div_used": 0.6392767652869225, "learning_rate": 4.1094235253127374e-07, "loss": 0.0746, "reward": -0.024870820343494415, "reward_after_mean": -0.024870820343494415, "reward_after_std": 0.6713957078754902, "reward_before_mean": 0.2818590197712183, "reward_before_std": 0.6434567552059889, "reward_change_max": 0.0, "reward_change_mean": -0.30672984197735786, "reward_change_min": -0.5197948329150677, "reward_change_std": 0.20186646562069654, "reward_std": 0.6713957078754902, "rewards/accuracy_reward": 0.2500000074505806, "rewards/cosine_scaled_reward": 0.031859016977250576, "step": 129 }, { "clip_fraction": 0.0, "completion_length": 3087.104202270508, "epoch": 0.14857142857142858, "grad_norm": 0.020643344148993492, "kl": 0.00017881393432617188, "lambda_div_used": 0.6044124737381935, "learning_rate": 4.034943304942796e-07, "loss": -0.0175, "reward": -0.250907301902771, "reward_after_mean": -0.250907301902771, "reward_after_std": 0.514793710783124, "reward_before_mean": 0.01573239639401436, "reward_before_std": 0.4839063249528408, "reward_change_max": 0.0, "reward_change_mean": -0.26663970574736595, "reward_change_min": -0.5019430406391621, "reward_change_std": 0.1815296784043312, "reward_std": 0.5147937145084143, "rewards/accuracy_reward": 0.1458333358168602, "rewards/cosine_scaled_reward": -0.1301009338349104, "step": 130 }, { "clip_fraction": 0.0, "completion_length": 2290.5000381469727, "epoch": 0.14971428571428572, "grad_norm": 0.025495275855064392, "kl": 0.00014261901378631592, "lambda_div_used": 0.585864968597889, "learning_rate": 3.9609093550344907e-07, "loss": -0.0345, "reward": -0.033165013417601585, "reward_after_mean": -0.033165013417601585, "reward_after_std": 0.5260650478303432, "reward_before_mean": 0.42901195771992207, "reward_before_std": 0.3989385652821511, "reward_change_max": 0.0, "reward_change_mean": -0.4621769953519106, "reward_change_min": -0.7094772532582283, "reward_change_std": 0.26999812573194504, "reward_std": 0.5260650608688593, "rewards/accuracy_reward": 0.3541666679084301, "rewards/cosine_scaled_reward": 0.07484527863562107, "step": 131 }, { "clip_fraction": 0.0, "completion_length": 2628.979202270508, "epoch": 0.15085714285714286, "grad_norm": 0.021410632878541946, "kl": 0.00011786073446273804, "lambda_div_used": 0.6330231353640556, "learning_rate": 3.8873442270461485e-07, "loss": -0.0163, "reward": 0.21169579401612282, "reward_after_mean": 0.21169579401612282, "reward_after_std": 0.6052236501127481, "reward_before_mean": 0.6431907135993242, "reward_before_std": 0.611154742538929, "reward_change_max": 0.0, "reward_change_mean": -0.43149489536881447, "reward_change_min": -0.7013396099209785, "reward_change_std": 0.279757896438241, "reward_std": 0.605223661288619, "rewards/accuracy_reward": 0.4375000186264515, "rewards/cosine_scaled_reward": 0.20569069124758244, "step": 132 }, { "clip_fraction": 0.0, "completion_length": 3043.916702270508, "epoch": 0.152, "grad_norm": 0.020860377699136734, "kl": 0.00016830861568450928, "lambda_div_used": 0.577904686331749, "learning_rate": 3.8142703296283953e-07, "loss": -0.0229, "reward": -0.311251699924469, "reward_after_mean": -0.311251699924469, "reward_after_std": 0.40735830925405025, "reward_before_mean": -0.022546445950865746, "reward_before_std": 0.35403214395046234, "reward_change_max": 0.0, "reward_change_mean": -0.28870525024831295, "reward_change_min": -0.4795113056898117, "reward_change_std": 0.175603779964149, "reward_std": 0.40735832042992115, "rewards/accuracy_reward": 0.1458333358168602, "rewards/cosine_scaled_reward": -0.16837978060357273, "step": 133 }, { "clip_fraction": 0.0, "completion_length": 2296.2500228881836, "epoch": 0.15314285714285714, "grad_norm": 0.025448989123106003, "kl": 0.00012351572513580322, "lambda_div_used": 0.6253047212958336, "learning_rate": 3.7417099217982686e-07, "loss": 0.0232, "reward": 0.1032534665428102, "reward_after_mean": 0.1032534665428102, "reward_after_std": 0.6201032679527998, "reward_before_mean": 0.527151208370924, "reward_before_std": 0.5803719013929367, "reward_change_max": 0.0, "reward_change_mean": -0.4238977525383234, "reward_change_min": -0.7053604945540428, "reward_change_std": 0.27356533519923687, "reward_std": 0.6201032791286707, "rewards/accuracy_reward": 0.3958333432674408, "rewards/cosine_scaled_reward": 0.1313178651034832, "step": 134 }, { "clip_fraction": 0.0, "completion_length": 1375.6667098999023, "epoch": 0.15428571428571428, "grad_norm": 0.042060088366270065, "kl": 7.014349102973938e-05, "lambda_div_used": 0.6157513931393623, "learning_rate": 3.6696851061588994e-07, "loss": -0.0865, "reward": 0.32518790662288666, "reward_after_mean": 0.32518790662288666, "reward_after_std": 0.7028943486511707, "reward_before_mean": 0.9168294770643115, "reward_before_std": 0.5377248618751764, "reward_change_max": 0.0, "reward_change_mean": -0.5916415732353926, "reward_change_min": -0.8611635379493237, "reward_change_std": 0.3413122948259115, "reward_std": 0.7028943561017513, "rewards/accuracy_reward": 0.6041666734963655, "rewards/cosine_scaled_reward": 0.31266278121620417, "step": 135 }, { "clip_fraction": 0.0, "completion_length": 2304.375030517578, "epoch": 0.15542857142857142, "grad_norm": 0.0248978603631258, "kl": 0.00010488927364349365, "lambda_div_used": 0.6134930327534676, "learning_rate": 3.5982178221668533e-07, "loss": -0.0298, "reward": 0.14305459149181843, "reward_after_mean": 0.14305459149181843, "reward_after_std": 0.6303485874086618, "reward_before_mean": 0.6280363164842129, "reward_before_std": 0.5314226988703012, "reward_change_max": 0.0, "reward_change_mean": -0.4849817119538784, "reward_change_min": -0.7693819738924503, "reward_change_std": 0.29691250063478947, "reward_std": 0.6303485967218876, "rewards/accuracy_reward": 0.4375000037252903, "rewards/cosine_scaled_reward": 0.19053628714755177, "step": 136 }, { "clip_fraction": 0.0, "completion_length": 2849.312530517578, "epoch": 0.15657142857142858, "grad_norm": 0.019165532663464546, "kl": 0.00011494755744934082, "lambda_div_used": 0.6259770095348358, "learning_rate": 3.5273298394491515e-07, "loss": 0.0001, "reward": -0.2035258673131466, "reward_after_mean": -0.2035258673131466, "reward_after_std": 0.6123348288238049, "reward_before_mean": 0.05552574759349227, "reward_before_std": 0.5897263735532761, "reward_change_max": 0.0, "reward_change_mean": -0.25905160419642925, "reward_change_min": -0.49531829729676247, "reward_change_std": 0.184982025064528, "reward_std": 0.6123348399996758, "rewards/accuracy_reward": 0.16666666977107525, "rewards/cosine_scaled_reward": -0.11114091548370197, "step": 137 }, { "clip_fraction": 0.0, "completion_length": 2511.0625381469727, "epoch": 0.15771428571428572, "grad_norm": 0.02206576056778431, "kl": 9.766221046447754e-05, "lambda_div_used": 0.5779839232563972, "learning_rate": 3.45704275117204e-07, "loss": -0.0123, "reward": -0.23054278269410133, "reward_after_mean": -0.23054278269410133, "reward_after_std": 0.47196367010474205, "reward_before_mean": 0.11714623775333166, "reward_before_std": 0.35360280703753233, "reward_change_max": 0.0, "reward_change_mean": -0.34768899716436863, "reward_change_min": -0.5065655931830406, "reward_change_std": 0.18736570980399847, "reward_std": 0.4719636719673872, "rewards/accuracy_reward": 0.20833333395421505, "rewards/cosine_scaled_reward": -0.09118711110204458, "step": 138 }, { "clip_fraction": 0.0, "completion_length": 2817.166702270508, "epoch": 0.15885714285714286, "grad_norm": 0.025601239874958992, "kl": 0.00013655424118041992, "lambda_div_used": 0.6219307482242584, "learning_rate": 3.387377967463493e-07, "loss": -0.0354, "reward": -0.07538405619561672, "reward_after_mean": -0.07538405619561672, "reward_after_std": 0.5752126723527908, "reward_before_mean": 0.24527974613010883, "reward_before_std": 0.5667949663475156, "reward_change_max": 0.0, "reward_change_mean": -0.3206638339906931, "reward_change_min": -0.556602880358696, "reward_change_std": 0.21935877669602633, "reward_std": 0.5752126909792423, "rewards/accuracy_reward": 0.2708333432674408, "rewards/cosine_scaled_reward": -0.02555358811514452, "step": 139 }, { "clip_fraction": 0.0, "completion_length": 3020.0833740234375, "epoch": 0.16, "grad_norm": 0.020280320197343826, "kl": 0.00016957521438598633, "lambda_div_used": 0.623006746172905, "learning_rate": 3.3183567088914833e-07, "loss": 0.0263, "reward": 0.048921750858426094, "reward_after_mean": 0.048921750858426094, "reward_after_std": 0.6539230048656464, "reward_before_mean": 0.46376091009005904, "reward_before_std": 0.5685575436800718, "reward_change_max": 0.0, "reward_change_mean": -0.4148391764611006, "reward_change_min": -0.6308448016643524, "reward_change_std": 0.2505591865628958, "reward_std": 0.6539230197668076, "rewards/accuracy_reward": 0.3750000074505806, "rewards/cosine_scaled_reward": 0.08876088261604309, "step": 140 }, { "clip_fraction": 0.0, "completion_length": 2796.916732788086, "epoch": 0.16114285714285714, "grad_norm": 0.019921308383345604, "kl": 0.00011113286018371582, "lambda_div_used": 0.6025099903345108, "learning_rate": 3.250000000000001e-07, "loss": 0.0527, "reward": -0.28436860628426075, "reward_after_mean": -0.28436860628426075, "reward_after_std": 0.5148810762912035, "reward_before_mean": -0.02774716354906559, "reward_before_std": 0.46978663094341755, "reward_change_max": 0.0, "reward_change_mean": -0.2566214445978403, "reward_change_min": -0.41403992287814617, "reward_change_std": 0.15458690002560616, "reward_std": 0.5148810893297195, "rewards/accuracy_reward": 0.1250000037252903, "rewards/cosine_scaled_reward": -0.1527471598237753, "step": 141 }, { "clip_fraction": 0.0, "completion_length": 2619.1458587646484, "epoch": 0.16228571428571428, "grad_norm": 0.019752761349081993, "kl": 0.00013134628534317017, "lambda_div_used": 0.595375120639801, "learning_rate": 3.182328662904756e-07, "loss": -0.0208, "reward": -0.15498719364404678, "reward_after_mean": -0.15498719364404678, "reward_after_std": 0.5081784036010504, "reward_before_mean": 0.1970929354429245, "reward_before_std": 0.4446716960519552, "reward_change_max": 0.0, "reward_change_mean": -0.3520801328122616, "reward_change_min": -0.6091671586036682, "reward_change_std": 0.22200345993041992, "reward_std": 0.508178411051631, "rewards/accuracy_reward": 0.2291666679084301, "rewards/cosine_scaled_reward": -0.032073733396828175, "step": 142 }, { "clip_fraction": 0.0, "completion_length": 2416.2500762939453, "epoch": 0.16342857142857142, "grad_norm": 0.02633557841181755, "kl": 0.00012940168380737305, "lambda_div_used": 0.6070521473884583, "learning_rate": 3.115363310950578e-07, "loss": 0.0473, "reward": -0.1759424265474081, "reward_after_mean": -0.1759424265474081, "reward_after_std": 0.5199048612266779, "reward_before_mean": 0.1274722833186388, "reward_before_std": 0.4973313231021166, "reward_change_max": 0.0, "reward_change_mean": -0.3034147098660469, "reward_change_min": -0.5472985841333866, "reward_change_std": 0.20386416278779507, "reward_std": 0.5199048724025488, "rewards/accuracy_reward": 0.1666666716337204, "rewards/cosine_scaled_reward": -0.03919439576566219, "step": 143 }, { "clip_fraction": 0.0, "completion_length": 2497.0416946411133, "epoch": 0.16457142857142856, "grad_norm": 0.035163138061761856, "kl": 0.00010834634304046631, "lambda_div_used": 0.5917740762233734, "learning_rate": 3.0491243424323783e-07, "loss": 0.0908, "reward": 0.24967988207936287, "reward_after_mean": 0.24967988207936287, "reward_after_std": 0.5445964094251394, "reward_before_mean": 0.8210294228047132, "reward_before_std": 0.4194592139683664, "reward_change_max": 0.0, "reward_change_mean": -0.5713495649397373, "reward_change_min": -0.7989893518388271, "reward_change_std": 0.3215014720335603, "reward_std": 0.5445964206010103, "rewards/accuracy_reward": 0.541666679084301, "rewards/cosine_scaled_reward": 0.2793627381324768, "step": 144 }, { "clip_fraction": 0.0, "completion_length": 1880.750015258789, "epoch": 0.1657142857142857, "grad_norm": 0.03040655143558979, "kl": 9.316205978393555e-05, "lambda_div_used": 0.6018117442727089, "learning_rate": 2.9836319343816397e-07, "loss": -0.0063, "reward": 0.0018871724605560303, "reward_after_mean": 0.0018871724605560303, "reward_after_std": 0.6153606176376343, "reward_before_mean": 0.45608025789260864, "reward_before_std": 0.47093176282942295, "reward_change_max": 0.0, "reward_change_mean": -0.45419312454760075, "reward_change_min": -0.7000111788511276, "reward_change_std": 0.26173546724021435, "reward_std": 0.6153606250882149, "rewards/accuracy_reward": 0.35416666977107525, "rewards/cosine_scaled_reward": 0.10191359603777528, "step": 145 }, { "clip_fraction": 0.0, "completion_length": 1995.1041793823242, "epoch": 0.16685714285714287, "grad_norm": 0.02280835248529911, "kl": 9.866058826446533e-05, "lambda_div_used": 0.5822854116559029, "learning_rate": 2.918906036420294e-07, "loss": 0.0731, "reward": -0.3418470360338688, "reward_after_mean": -0.3418470360338688, "reward_after_std": 0.4274127297103405, "reward_before_mean": -0.07948943041265011, "reward_before_std": 0.38051687460392714, "reward_change_max": 0.0, "reward_change_mean": -0.2623576056212187, "reward_change_min": -0.453898411244154, "reward_change_std": 0.1681989086791873, "reward_std": 0.4274127408862114, "rewards/accuracy_reward": 0.1041666679084301, "rewards/cosine_scaled_reward": -0.18365611135959625, "step": 146 }, { "clip_fraction": 0.0, "completion_length": 3414.4166870117188, "epoch": 0.168, "grad_norm": 0.017901504412293434, "kl": 0.00017184019088745117, "lambda_div_used": 0.6236077323555946, "learning_rate": 2.854966364683872e-07, "loss": 0.0225, "reward": -0.0833415687084198, "reward_after_mean": -0.0833415687084198, "reward_after_std": 0.5858507957309484, "reward_before_mean": 0.216837452724576, "reward_before_std": 0.5764442849904299, "reward_change_max": 0.0, "reward_change_mean": -0.30017900839447975, "reward_change_min": -0.5037183798849583, "reward_change_std": 0.2055044947192073, "reward_std": 0.5858508311212063, "rewards/accuracy_reward": 0.22916666977107525, "rewards/cosine_scaled_reward": -0.012329218676313758, "step": 147 }, { "clip_fraction": 0.0, "completion_length": 2291.000030517578, "epoch": 0.16914285714285715, "grad_norm": 0.021171187981963158, "kl": 0.00012093409895896912, "lambda_div_used": 0.5844326093792915, "learning_rate": 2.791832395815782e-07, "loss": 0.0386, "reward": -0.15509312599897385, "reward_after_mean": -0.15509312599897385, "reward_after_std": 0.48159872740507126, "reward_before_mean": 0.22943633235991, "reward_before_std": 0.39098774176090956, "reward_change_max": 0.0, "reward_change_mean": -0.3845294751226902, "reward_change_min": -0.6122906021773815, "reward_change_std": 0.23033427819609642, "reward_std": 0.48159876093268394, "rewards/accuracy_reward": 0.2291666679084301, "rewards/cosine_scaled_reward": 0.00026967376470565796, "step": 148 }, { "clip_fraction": 0.0, "completion_length": 2509.0416870117188, "epoch": 0.1702857142857143, "grad_norm": 0.020010385662317276, "kl": 8.110702037811279e-05, "lambda_div_used": 0.5822181403636932, "learning_rate": 2.729523361034538e-07, "loss": 0.0208, "reward": 0.08701876550912857, "reward_after_mean": 0.08701876550912857, "reward_after_std": 0.5091588757932186, "reward_before_mean": 0.598341865465045, "reward_before_std": 0.3779993327334523, "reward_change_max": 0.0, "reward_change_mean": -0.5113231185823679, "reward_change_min": -0.740135669708252, "reward_change_std": 0.2875883989036083, "reward_std": 0.5091589000076056, "rewards/accuracy_reward": 0.4166666716337204, "rewards/cosine_scaled_reward": 0.18167520873248577, "step": 149 }, { "clip_fraction": 0.0, "completion_length": 2679.208381652832, "epoch": 0.17142857142857143, "grad_norm": 0.025659440085291862, "kl": 0.00014823675155639648, "lambda_div_used": 0.6290425136685371, "learning_rate": 2.6680582402757324e-07, "loss": -0.0233, "reward": -0.049642632249742746, "reward_after_mean": -0.049642632249742746, "reward_after_std": 0.5986230112612247, "reward_before_mean": 0.2686304301023483, "reward_before_std": 0.6011289358139038, "reward_change_max": 0.0, "reward_change_mean": -0.31827306374907494, "reward_change_min": -0.5840066187083721, "reward_change_std": 0.2246640883386135, "reward_std": 0.598623014986515, "rewards/accuracy_reward": 0.2708333432674408, "rewards/cosine_scaled_reward": -0.002202920615673065, "step": 150 }, { "clip_fraction": 0.0, "completion_length": 2462.5625610351562, "epoch": 0.17257142857142857, "grad_norm": 0.03043326921761036, "kl": 0.0001488029956817627, "lambda_div_used": 0.6573176011443138, "learning_rate": 2.6074557564105724e-07, "loss": 0.0929, "reward": 0.2124087940901518, "reward_after_mean": 0.2124087940901518, "reward_after_std": 0.762018321081996, "reward_before_mean": 0.6105309925042093, "reward_before_std": 0.7428378090262413, "reward_change_max": 0.0, "reward_change_mean": -0.39812218956649303, "reward_change_min": -0.6643032841384411, "reward_change_std": 0.271884405054152, "reward_std": 0.7620183527469635, "rewards/accuracy_reward": 0.43750000558793545, "rewards/cosine_scaled_reward": 0.17303097806870937, "step": 151 }, { "clip_fraction": 0.0, "completion_length": 2846.875072479248, "epoch": 0.1737142857142857, "grad_norm": 0.02812943048775196, "kl": 0.00018167495727539062, "lambda_div_used": 0.599166102707386, "learning_rate": 2.547734369542718e-07, "loss": -0.0069, "reward": -0.32120730075985193, "reward_after_mean": -0.32120730075985193, "reward_after_std": 0.5156350377947092, "reward_before_mean": -0.06628246325999498, "reward_before_std": 0.4535220582038164, "reward_change_max": 0.0, "reward_change_mean": -0.25492484122514725, "reward_change_min": -0.42022984474897385, "reward_change_std": 0.14956693351268768, "reward_std": 0.5156350489705801, "rewards/accuracy_reward": 0.0833333358168602, "rewards/cosine_scaled_reward": -0.1496157981455326, "step": 152 }, { "clip_fraction": 0.0, "completion_length": 2625.770896911621, "epoch": 0.17485714285714285, "grad_norm": 0.02562631107866764, "kl": 0.0001496821641921997, "lambda_div_used": 0.5755600407719612, "learning_rate": 2.488912271385139e-07, "loss": 0.0383, "reward": -0.2726967688649893, "reward_after_mean": -0.2726967688649893, "reward_after_std": 0.43943885155022144, "reward_before_mean": 0.054511758498847485, "reward_before_std": 0.3466099677607417, "reward_change_max": 0.0, "reward_change_mean": -0.3272085413336754, "reward_change_min": -0.4877549596130848, "reward_change_std": 0.18566382955759764, "reward_std": 0.4394388683140278, "rewards/accuracy_reward": 0.20833333395421505, "rewards/cosine_scaled_reward": -0.15382158383727074, "step": 153 }, { "clip_fraction": 0.0, "completion_length": 2930.2083587646484, "epoch": 0.176, "grad_norm": 0.01869855262339115, "kl": 0.00013381242752075195, "lambda_div_used": 0.6487007588148117, "learning_rate": 2.4310073797187573e-07, "loss": 0.0074, "reward": 0.17745468392968178, "reward_after_mean": 0.17745468392968178, "reward_after_std": 0.6683135256171227, "reward_before_mean": 0.554833997040987, "reward_before_std": 0.6959163639694452, "reward_change_max": 0.0, "reward_change_mean": -0.37737933173775673, "reward_change_min": -0.6365409940481186, "reward_change_std": 0.26569664292037487, "reward_std": 0.6683135367929935, "rewards/accuracy_reward": 0.3958333432674408, "rewards/cosine_scaled_reward": 0.15900065936148167, "step": 154 }, { "clip_fraction": 0.0, "completion_length": 2324.7083587646484, "epoch": 0.17714285714285713, "grad_norm": 0.03042282536625862, "kl": 0.0001443326473236084, "lambda_div_used": 0.6021355092525482, "learning_rate": 2.374037332934512e-07, "loss": -0.0708, "reward": 0.013361499644815922, "reward_after_mean": 0.013361499644815922, "reward_after_std": 0.608528571203351, "reward_before_mean": 0.4701185021549463, "reward_before_std": 0.4722642693668604, "reward_change_max": 0.0, "reward_change_mean": -0.4567570425570011, "reward_change_min": -0.728898536413908, "reward_change_std": 0.2705167792737484, "reward_std": 0.6085285805165768, "rewards/accuracy_reward": 0.3958333358168602, "rewards/cosine_scaled_reward": 0.07428519520908594, "step": 155 }, { "clip_fraction": 0.0, "completion_length": 2742.041702270508, "epoch": 0.1782857142857143, "grad_norm": 0.023997837677598, "kl": 0.00013341009616851807, "lambda_div_used": 0.6111876517534256, "learning_rate": 2.3180194846605364e-07, "loss": 0.0104, "reward": -0.11058625392615795, "reward_after_mean": -0.11058625392615795, "reward_after_std": 0.5431522708386183, "reward_before_mean": 0.20433677232358605, "reward_before_std": 0.5068034324795008, "reward_change_max": 0.0, "reward_change_mean": -0.31492303870618343, "reward_change_min": -0.4863894209265709, "reward_change_std": 0.18972175009548664, "reward_std": 0.5431522782891989, "rewards/accuracy_reward": 0.2291666753590107, "rewards/cosine_scaled_reward": -0.024829893372952938, "step": 156 }, { "clip_fraction": 0.0, "completion_length": 2665.875045776367, "epoch": 0.17942857142857144, "grad_norm": 0.02073371410369873, "kl": 0.00014007091522216797, "lambda_div_used": 0.5806600153446198, "learning_rate": 2.2629708984760706e-07, "loss": 0.0406, "reward": -0.2642064723186195, "reward_after_mean": -0.2642064723186195, "reward_after_std": 0.48669449612498283, "reward_before_mean": 0.06258813291788101, "reward_before_std": 0.3662749119102955, "reward_change_max": 0.0, "reward_change_mean": -0.32679460756480694, "reward_change_min": -0.473216038197279, "reward_change_std": 0.1716562630608678, "reward_std": 0.48669449612498283, "rewards/accuracy_reward": 0.1666666679084301, "rewards/cosine_scaled_reward": -0.10407854370714631, "step": 157 }, { "clip_fraction": 0.0, "completion_length": 2338.8333740234375, "epoch": 0.18057142857142858, "grad_norm": 0.02651275135576725, "kl": 0.00011517666280269623, "lambda_div_used": 0.6631434112787247, "learning_rate": 2.2089083427137329e-07, "loss": 0.0501, "reward": 0.14301904384046793, "reward_after_mean": 0.14301904384046793, "reward_after_std": 0.8312356304377317, "reward_before_mean": 0.5320943212136626, "reward_before_std": 0.7637526150792837, "reward_change_max": 0.0, "reward_change_mean": -0.38907529041171074, "reward_change_min": -0.694700576364994, "reward_change_std": 0.26256909035146236, "reward_std": 0.8312356378883123, "rewards/accuracy_reward": 0.3750000074505806, "rewards/cosine_scaled_reward": 0.15709431585855782, "step": 158 }, { "clip_fraction": 0.0, "completion_length": 3239.312530517578, "epoch": 0.18171428571428572, "grad_norm": 0.016524603590369225, "kl": 0.0001583099365234375, "lambda_div_used": 0.623667947947979, "learning_rate": 2.1558482853517253e-07, "loss": -0.0341, "reward": -0.1187703013420105, "reward_after_mean": -0.1187703013420105, "reward_after_std": 0.5951940100640059, "reward_before_mean": 0.16929386125411838, "reward_before_std": 0.5728897508233786, "reward_change_max": 0.0, "reward_change_mean": -0.28806419111788273, "reward_change_min": -0.4665379598736763, "reward_change_std": 0.1831390606239438, "reward_std": 0.5951940137892962, "rewards/accuracy_reward": 0.2291666753590107, "rewards/cosine_scaled_reward": -0.05987279675900936, "step": 159 }, { "clip_fraction": 0.0, "completion_length": 2803.312530517578, "epoch": 0.18285714285714286, "grad_norm": 0.026071852073073387, "kl": 0.00017073750495910645, "lambda_div_used": 0.6305168867111206, "learning_rate": 2.1038068889975259e-07, "loss": -0.0459, "reward": 0.010341526940464973, "reward_after_mean": 0.010341526940464973, "reward_after_std": 0.6011195741593838, "reward_before_mean": 0.3497283663600683, "reward_before_std": 0.6088744457811117, "reward_change_max": 0.0, "reward_change_mean": -0.33938686549663544, "reward_change_min": -0.5917558334767818, "reward_change_std": 0.2366197258234024, "reward_std": 0.6011195983737707, "rewards/accuracy_reward": 0.2916666753590107, "rewards/cosine_scaled_reward": 0.05806170590221882, "step": 160 }, { "clip_fraction": 0.0, "completion_length": 2208.9167098999023, "epoch": 0.184, "grad_norm": 0.0240344051271677, "kl": 0.000129062682390213, "lambda_div_used": 0.6582028865814209, "learning_rate": 2.0528000059645995e-07, "loss": 0.0279, "reward": 0.07936648279428482, "reward_after_mean": 0.07936648279428482, "reward_after_std": 0.7546874471008778, "reward_before_mean": 0.40869135939283296, "reward_before_std": 0.7339576873928308, "reward_change_max": 0.0, "reward_change_mean": -0.32932490296661854, "reward_change_min": -0.6056464668363333, "reward_change_std": 0.22207134775817394, "reward_std": 0.7546874955296516, "rewards/accuracy_reward": 0.3541666716337204, "rewards/cosine_scaled_reward": 0.05452469550073147, "step": 161 }, { "clip_fraction": 0.0, "completion_length": 3135.0208892822266, "epoch": 0.18514285714285714, "grad_norm": 0.022219210863113403, "kl": 0.0001678466796875, "lambda_div_used": 0.5934342220425606, "learning_rate": 2.0028431734436308e-07, "loss": -0.0607, "reward": -0.046884071081876755, "reward_after_mean": -0.046884071081876755, "reward_after_std": 0.5110116824507713, "reward_before_mean": 0.3527396023273468, "reward_before_std": 0.4275300269946456, "reward_change_max": 0.0, "reward_change_mean": -0.39962366595864296, "reward_change_min": -0.5722533725202084, "reward_change_std": 0.22872111946344376, "reward_std": 0.5110117141157389, "rewards/accuracy_reward": 0.33333334140479565, "rewards/cosine_scaled_reward": 0.019406253471970558, "step": 162 }, { "clip_fraction": 0.0, "completion_length": 2319.9791870117188, "epoch": 0.18628571428571428, "grad_norm": 0.025694716721773148, "kl": 0.00014371052384376526, "lambda_div_used": 0.5704625844955444, "learning_rate": 1.9539516087697517e-07, "loss": 0.0916, "reward": 0.07464592158794403, "reward_after_mean": 0.07464592158794403, "reward_after_std": 0.49546825513243675, "reward_before_mean": 0.6325086355209351, "reward_before_std": 0.32235115580260754, "reward_change_max": 0.0, "reward_change_mean": -0.557862676680088, "reward_change_min": -0.7940906882286072, "reward_change_std": 0.3045828063040972, "reward_std": 0.49546825885772705, "rewards/accuracy_reward": 0.4375, "rewards/cosine_scaled_reward": 0.19500861689448357, "step": 163 }, { "clip_fraction": 0.0, "completion_length": 2167.750045776367, "epoch": 0.18742857142857142, "grad_norm": 0.0288226380944252, "kl": 0.0001278519630432129, "lambda_div_used": 0.6208588480949402, "learning_rate": 1.9061402047871833e-07, "loss": 0.0045, "reward": 0.052167763307807036, "reward_after_mean": 0.052167763307807036, "reward_after_std": 0.627906009554863, "reward_before_mean": 0.44521861523389816, "reward_before_std": 0.5596300046890974, "reward_change_max": 0.0, "reward_change_mean": -0.39305083081126213, "reward_change_min": -0.6213876642286777, "reward_change_std": 0.24184285942465067, "reward_std": 0.6279060393571854, "rewards/accuracy_reward": 0.35416667349636555, "rewards/cosine_scaled_reward": 0.09105192590504885, "step": 164 }, { "clip_fraction": 0.0, "completion_length": 2823.8125228881836, "epoch": 0.18857142857142858, "grad_norm": 0.026144707575440407, "kl": 0.00016139447689056396, "lambda_div_used": 0.5820390656590462, "learning_rate": 1.8594235253127372e-07, "loss": -0.0708, "reward": -0.32857649284414947, "reward_after_mean": -0.32857649284414947, "reward_after_std": 0.4344688355922699, "reward_before_mean": -0.0503513365983963, "reward_before_std": 0.3760820124298334, "reward_change_max": 0.0, "reward_change_mean": -0.2782251574099064, "reward_change_min": -0.43433111906051636, "reward_change_std": 0.16556962952017784, "reward_std": 0.4344688393175602, "rewards/accuracy_reward": 0.08333333395421505, "rewards/cosine_scaled_reward": -0.13368466403335333, "step": 165 }, { "clip_fraction": 0.0, "completion_length": 2491.0208587646484, "epoch": 0.18971428571428572, "grad_norm": 0.019254038110375404, "kl": 0.0001147836446762085, "lambda_div_used": 0.6493343263864517, "learning_rate": 1.8138158006995363e-07, "loss": 0.0025, "reward": 0.10031834430992603, "reward_after_mean": 0.10031834430992603, "reward_after_std": 0.6912827659398317, "reward_before_mean": 0.4514310024678707, "reward_before_std": 0.6998845022171736, "reward_change_max": 0.0, "reward_change_mean": -0.351112674921751, "reward_change_min": -0.6428324580192566, "reward_change_std": 0.2517909351736307, "reward_std": 0.6912827901542187, "rewards/accuracy_reward": 0.3333333432674408, "rewards/cosine_scaled_reward": 0.11809766665101051, "step": 166 }, { "clip_fraction": 0.0, "completion_length": 2146.645896911621, "epoch": 0.19085714285714286, "grad_norm": 0.028959238901734352, "kl": 0.00012630224227905273, "lambda_div_used": 0.602773554623127, "learning_rate": 1.7693309235023127e-07, "loss": -0.0687, "reward": -0.08888023532927036, "reward_after_mean": -0.08888023532927036, "reward_after_std": 0.555552402511239, "reward_before_mean": 0.284214471001178, "reward_before_std": 0.4734340328723192, "reward_change_max": 0.0, "reward_change_mean": -0.37309471145272255, "reward_change_min": -0.5994942858815193, "reward_change_std": 0.2264004945755005, "reward_std": 0.5555524323135614, "rewards/accuracy_reward": 0.27083333767950535, "rewards/cosine_scaled_reward": 0.013381102122366428, "step": 167 }, { "clip_fraction": 0.0, "completion_length": 2591.562545776367, "epoch": 0.192, "grad_norm": 0.022079093381762505, "kl": 0.00013020634651184082, "lambda_div_used": 0.6182359680533409, "learning_rate": 1.7259824442455923e-07, "loss": 0.0571, "reward": 0.07187426090240479, "reward_after_mean": 0.07187426090240479, "reward_after_std": 0.6350691560655832, "reward_before_mean": 0.4906120039522648, "reward_before_std": 0.5473381988704205, "reward_change_max": 0.0, "reward_change_mean": -0.4187377579510212, "reward_change_min": -0.6163501553237438, "reward_change_std": 0.24177053570747375, "reward_std": 0.6350691728293896, "rewards/accuracy_reward": 0.3541666753590107, "rewards/cosine_scaled_reward": 0.1364453360438347, "step": 168 }, { "clip_fraction": 0.0, "completion_length": 1961.333366394043, "epoch": 0.19314285714285714, "grad_norm": 0.024920228868722916, "kl": 0.00011189281940460205, "lambda_div_used": 0.5712982937693596, "learning_rate": 1.6837835672960831e-07, "loss": -0.037, "reward": 0.20125045720487833, "reward_after_mean": 0.20125045720487833, "reward_after_std": 0.5916534103453159, "reward_before_mean": 0.8435525028035045, "reward_before_std": 0.32216374203562737, "reward_change_max": 0.0, "reward_change_mean": -0.6423020549118519, "reward_change_min": -0.836066972464323, "reward_change_std": 0.32226173765957355, "reward_std": 0.5916534326970577, "rewards/accuracy_reward": 0.5416666679084301, "rewards/cosine_scaled_reward": 0.3018858137074858, "step": 169 }, { "clip_fraction": 0.0, "completion_length": 2154.5417404174805, "epoch": 0.19428571428571428, "grad_norm": 0.028055250644683838, "kl": 9.50545072555542e-05, "lambda_div_used": 0.5548974648118019, "learning_rate": 1.6427471468404952e-07, "loss": 0.0414, "reward": -0.1357494406402111, "reward_after_mean": -0.1357494406402111, "reward_after_std": 0.41400698386132717, "reward_before_mean": 0.32883553951978683, "reward_before_std": 0.2495311009697616, "reward_change_max": 0.0, "reward_change_mean": -0.4645849745720625, "reward_change_min": -0.6499762162566185, "reward_change_std": 0.250681190751493, "reward_std": 0.41400699876248837, "rewards/accuracy_reward": 0.3125, "rewards/cosine_scaled_reward": 0.01633552461862564, "step": 170 }, { "clip_fraction": 0.0, "completion_length": 2344.3958587646484, "epoch": 0.19542857142857142, "grad_norm": 0.02545234002172947, "kl": 0.00012035667896270752, "lambda_div_used": 0.5686006918549538, "learning_rate": 1.6028856829700258e-07, "loss": -0.0076, "reward": 0.025929288007318974, "reward_after_mean": 0.025929288007318974, "reward_after_std": 0.4821996595710516, "reward_before_mean": 0.5422459719702601, "reward_before_std": 0.3103277189657092, "reward_change_max": 0.0, "reward_change_mean": -0.5163167044520378, "reward_change_min": -0.6874858625233173, "reward_change_std": 0.26997776329517365, "reward_std": 0.4821996670216322, "rewards/accuracy_reward": 0.37500000558793545, "rewards/cosine_scaled_reward": 0.1672459738329053, "step": 171 }, { "clip_fraction": 0.0, "completion_length": 2580.3958740234375, "epoch": 0.19657142857142856, "grad_norm": 0.03461524099111557, "kl": 0.00015616416931152344, "lambda_div_used": 0.5815886929631233, "learning_rate": 8.487667956935087e-07, "loss": -0.0386, "reward": -0.02652345411479473, "reward_after_mean": -0.02652345411479473, "reward_after_std": 0.5259725619107485, "reward_before_mean": 0.4387537483125925, "reward_before_std": 0.3750908151268959, "reward_change_max": 0.0, "reward_change_mean": -0.46527721732854843, "reward_change_min": -0.6695724055171013, "reward_change_std": 0.2563530644401908, "reward_std": 0.5259725674986839, "rewards/accuracy_reward": 0.33333333395421505, "rewards/cosine_scaled_reward": 0.10542040364816785, "step": 172 }, { "clip_fraction": 0.0, "completion_length": 1932.0625305175781, "epoch": 0.1977142857142857, "grad_norm": 0.03550613671541214, "kl": 9.726732969284058e-05, "lambda_div_used": 0.5629367232322693, "learning_rate": 8.464102570534061e-07, "loss": -0.0198, "reward": -0.33315238857176155, "reward_after_mean": -0.33315238857176155, "reward_after_std": 0.35940456483513117, "reward_before_mean": -0.03457173053175211, "reward_before_std": 0.2833498573163524, "reward_change_max": 0.0, "reward_change_mean": -0.29858064092695713, "reward_change_min": -0.4200097434222698, "reward_change_std": 0.16404641512781382, "reward_std": 0.35940458066761494, "rewards/accuracy_reward": 0.12500000558793545, "rewards/cosine_scaled_reward": -0.15957174729555845, "step": 173 }, { "clip_fraction": 0.0, "completion_length": 1809.7708587646484, "epoch": 0.19885714285714284, "grad_norm": 0.03420722112059593, "kl": 0.00013248249888420105, "lambda_div_used": 0.6139687895774841, "learning_rate": 8.440392717955475e-07, "loss": -0.0816, "reward": -0.07395929284393787, "reward_after_mean": -0.07395929284393787, "reward_after_std": 0.5732789468020201, "reward_before_mean": 0.25243946351110935, "reward_before_std": 0.5262010591104627, "reward_change_max": 0.0, "reward_change_mean": -0.3263987563550472, "reward_change_min": -0.5132386535406113, "reward_change_std": 0.20316704735159874, "reward_std": 0.5732789561152458, "rewards/accuracy_reward": 0.22916667349636555, "rewards/cosine_scaled_reward": 0.02327278454322368, "step": 174 }, { "clip_fraction": 0.0, "completion_length": 2270.7500495910645, "epoch": 0.2, "grad_norm": 0.023624489083886147, "kl": 0.00011564046144485474, "lambda_div_used": 0.570957601070404, "learning_rate": 8.416539554784089e-07, "loss": 0.0018, "reward": -0.09137872606515884, "reward_after_mean": -0.09137872606515884, "reward_after_std": 0.4245176389813423, "reward_before_mean": 0.3394407369196415, "reward_before_std": 0.3214457123540342, "reward_change_max": 0.0, "reward_change_mean": -0.4308194350451231, "reward_change_min": -0.6172507330775261, "reward_change_std": 0.24237936083227396, "reward_std": 0.4245176613330841, "rewards/accuracy_reward": 0.3125000074505806, "rewards/cosine_scaled_reward": 0.026940705254673958, "step": 175 }, { "clip_fraction": 0.0, "completion_length": 2428.3750534057617, "epoch": 0.20114285714285715, "grad_norm": 0.03552456945180893, "kl": 9.998679161071777e-05, "lambda_div_used": 0.6270494386553764, "learning_rate": 8.392544243589427e-07, "loss": 0.0567, "reward": 0.23782964330166578, "reward_after_mean": 0.23782964330166578, "reward_after_std": 0.6844992376863956, "reward_before_mean": 0.7466199137270451, "reward_before_std": 0.594361359719187, "reward_change_max": 0.0, "reward_change_mean": -0.5087902657687664, "reward_change_min": -0.8128968887031078, "reward_change_std": 0.31682352535426617, "reward_std": 0.6844992693513632, "rewards/accuracy_reward": 0.5000000074505806, "rewards/cosine_scaled_reward": 0.24661988578736782, "step": 176 }, { "clip_fraction": 0.0, "completion_length": 2709.5833740234375, "epoch": 0.2022857142857143, "grad_norm": 0.022738253697752953, "kl": 0.0001655668020248413, "lambda_div_used": 0.6092279329895973, "learning_rate": 8.368407953869103e-07, "loss": 0.0278, "reward": -0.16157673671841621, "reward_after_mean": -0.16157673671841621, "reward_after_std": 0.5301279928535223, "reward_before_mean": 0.14143561571836472, "reward_before_std": 0.5103836972266436, "reward_change_max": 0.0, "reward_change_mean": -0.303012328222394, "reward_change_min": -0.5294999107718468, "reward_change_std": 0.2066562958061695, "reward_std": 0.530128002166748, "rewards/accuracy_reward": 0.1875, "rewards/cosine_scaled_reward": -0.04606438986957073, "step": 177 }, { "clip_fraction": 0.0, "completion_length": 2521.562545776367, "epoch": 0.20342857142857143, "grad_norm": 0.023333929479122162, "kl": 0.00010882318019866943, "lambda_div_used": 0.5983341336250305, "learning_rate": 8.344131861991828e-07, "loss": 0.0305, "reward": -0.0399339459836483, "reward_after_mean": -0.0399339459836483, "reward_after_std": 0.544251000508666, "reward_before_mean": 0.369808804243803, "reward_before_std": 0.4518149495124817, "reward_change_max": 0.0, "reward_change_mean": -0.4097427297383547, "reward_change_min": -0.6259582564234734, "reward_change_std": 0.23724547680467367, "reward_std": 0.5442510098218918, "rewards/accuracy_reward": 0.29166667349636555, "rewards/cosine_scaled_reward": 0.07814211072400212, "step": 178 }, { "clip_fraction": 0.0, "completion_length": 2703.7083740234375, "epoch": 0.20457142857142857, "grad_norm": 0.023093275725841522, "kl": 0.00013177096843719482, "lambda_div_used": 0.5762727931141853, "learning_rate": 8.319717151140072e-07, "loss": 0.0879, "reward": -0.20519665256142616, "reward_after_mean": -0.20519665256142616, "reward_after_std": 0.39140512235462666, "reward_before_mean": 0.13847777154296637, "reward_before_std": 0.3475749148055911, "reward_change_max": 0.0, "reward_change_mean": -0.3436744213104248, "reward_change_min": -0.5246328189969063, "reward_change_std": 0.20370345003902912, "reward_std": 0.39140513353049755, "rewards/accuracy_reward": 0.1875000074505806, "rewards/cosine_scaled_reward": -0.049022228457033634, "step": 179 }, { "clip_fraction": 0.0, "completion_length": 2075.3750228881836, "epoch": 0.2057142857142857, "grad_norm": 0.03283306583762169, "kl": 0.00013710558414459229, "lambda_div_used": 0.5492931827902794, "learning_rate": 8.295165011252396e-07, "loss": -0.0592, "reward": -0.09344155341386795, "reward_after_mean": -0.09344155341386795, "reward_after_std": 0.37022680789232254, "reward_before_mean": 0.39771560952067375, "reward_before_std": 0.21920094243250787, "reward_change_max": 0.0, "reward_change_mean": -0.4911571964621544, "reward_change_min": -0.6641882658004761, "reward_change_std": 0.25923535134643316, "reward_std": 0.3702268172055483, "rewards/accuracy_reward": 0.3541666716337204, "rewards/cosine_scaled_reward": 0.0435489546507597, "step": 180 }, { "clip_fraction": 0.0, "completion_length": 2976.875030517578, "epoch": 0.20685714285714285, "grad_norm": 0.02316705696284771, "kl": 0.00017173588275909424, "lambda_div_used": 0.594053827226162, "learning_rate": 8.270476638965461e-07, "loss": -0.0029, "reward": -0.17979225050657988, "reward_after_mean": -0.17979225050657988, "reward_after_std": 0.5341140031814575, "reward_before_mean": 0.17803902877494693, "reward_before_std": 0.4284058129414916, "reward_change_max": 0.0, "reward_change_mean": -0.3578312788158655, "reward_change_min": -0.5442017950117588, "reward_change_std": 0.2014410514384508, "reward_std": 0.5341140106320381, "rewards/accuracy_reward": 0.2083333358168602, "rewards/cosine_scaled_reward": -0.03029430890455842, "step": 181 }, { "clip_fraction": 0.0, "completion_length": 1833.0833930969238, "epoch": 0.208, "grad_norm": 0.02471437305212021, "kl": 6.746500730514526e-05, "lambda_div_used": 0.6185052543878555, "learning_rate": 8.245653237555705e-07, "loss": -0.0541, "reward": 0.10658288560807705, "reward_after_mean": 0.10658288560807705, "reward_after_std": 0.6189861167222261, "reward_before_mean": 0.5465792864561081, "reward_before_std": 0.5544852227903903, "reward_change_max": 0.0, "reward_change_mean": -0.43999641202390194, "reward_change_min": -0.7109990864992142, "reward_change_std": 0.2778801778331399, "reward_std": 0.618986152112484, "rewards/accuracy_reward": 0.4375000037252903, "rewards/cosine_scaled_reward": 0.10907927341759205, "step": 182 }, { "clip_fraction": 0.0, "completion_length": 1834.4375686645508, "epoch": 0.20914285714285713, "grad_norm": 0.026828886941075325, "kl": 8.349120616912842e-05, "lambda_div_used": 0.6341942846775055, "learning_rate": 8.220696016880687e-07, "loss": 0.0145, "reward": 0.10971941862953827, "reward_after_mean": 0.10971941862953827, "reward_after_std": 0.6589909251779318, "reward_before_mean": 0.4826927953399718, "reward_before_std": 0.6231282472144812, "reward_change_max": 0.0, "reward_change_mean": -0.37297336757183075, "reward_change_min": -0.5635729804635048, "reward_change_std": 0.2322026826441288, "reward_std": 0.6589909512549639, "rewards/accuracy_reward": 0.3125000111758709, "rewards/cosine_scaled_reward": 0.1701928018592298, "step": 183 }, { "clip_fraction": 0.0, "completion_length": 2777.020854949951, "epoch": 0.2102857142857143, "grad_norm": 0.027208158746361732, "kl": 0.00016835331916809082, "lambda_div_used": 0.5735552906990051, "learning_rate": 8.195606193320136e-07, "loss": 0.0331, "reward": -0.2550749061629176, "reward_after_mean": -0.2550749061629176, "reward_after_std": 0.45949564687907696, "reward_before_mean": 0.09985405765473843, "reward_before_std": 0.33237360091879964, "reward_change_max": 0.0, "reward_change_mean": -0.3549289759248495, "reward_change_min": -0.4971868433058262, "reward_change_std": 0.18658464308828115, "reward_std": 0.45949566550552845, "rewards/accuracy_reward": 0.1666666679084301, "rewards/cosine_scaled_reward": -0.06681260792538524, "step": 184 }, { "clip_fraction": 0.0, "completion_length": 2450.4583473205566, "epoch": 0.21142857142857144, "grad_norm": 0.031349100172519684, "kl": 0.00011152029037475586, "lambda_div_used": 0.5730742663145065, "learning_rate": 8.170384989716657e-07, "loss": 0.055, "reward": -0.23662783950567245, "reward_after_mean": -0.23662783950567245, "reward_after_std": 0.3798432908952236, "reward_before_mean": 0.08804147504270077, "reward_before_std": 0.3363419594243169, "reward_change_max": 0.0, "reward_change_mean": -0.32466931641101837, "reward_change_min": -0.47964803501963615, "reward_change_std": 0.19531975220888853, "reward_std": 0.3798433095216751, "rewards/accuracy_reward": 0.1875000074505806, "rewards/cosine_scaled_reward": -0.09945851005613804, "step": 185 }, { "clip_fraction": 0.0, "completion_length": 2827.9375228881836, "epoch": 0.21257142857142858, "grad_norm": 0.018743637949228287, "kl": 0.00014129281044006348, "lambda_div_used": 0.6022143214941025, "learning_rate": 8.145033635316128e-07, "loss": -0.0243, "reward": -0.19847029261291027, "reward_after_mean": -0.19847029261291027, "reward_after_std": 0.4997438360005617, "reward_before_mean": 0.09954999759793282, "reward_before_std": 0.4699443206191063, "reward_change_max": 0.0, "reward_change_mean": -0.298020301386714, "reward_change_min": -0.4791484698653221, "reward_change_std": 0.18491498567163944, "reward_std": 0.49974384531378746, "rewards/accuracy_reward": 0.18750000558793545, "rewards/cosine_scaled_reward": -0.08794999308884144, "step": 186 }, { "clip_fraction": 0.0, "completion_length": 2602.1875228881836, "epoch": 0.21371428571428572, "grad_norm": 0.030055196955800056, "kl": 0.0001709461212158203, "lambda_div_used": 0.5778695195913315, "learning_rate": 8.119553365707802e-07, "loss": -0.0784, "reward": -0.23846351448446512, "reward_after_mean": -0.23846351448446512, "reward_after_std": 0.39364523626863956, "reward_before_mean": 0.09842715226113796, "reward_before_std": 0.35397925041615963, "reward_change_max": 0.0, "reward_change_mean": -0.3368906620889902, "reward_change_min": -0.5288374535739422, "reward_change_std": 0.20237108506262302, "reward_std": 0.3936452493071556, "rewards/accuracy_reward": 0.1458333395421505, "rewards/cosine_scaled_reward": -0.047406171914190054, "step": 187 }, { "clip_fraction": 0.0, "completion_length": 3423.875030517578, "epoch": 0.21485714285714286, "grad_norm": 0.01838378608226776, "kl": 0.00020372867584228516, "lambda_div_used": 0.5887190625071526, "learning_rate": 8.093945422764069e-07, "loss": 0.0025, "reward": -0.24391454830765724, "reward_after_mean": -0.24391454830765724, "reward_after_std": 0.43992058746516705, "reward_before_mean": 0.055846452713012695, "reward_before_std": 0.4102069940418005, "reward_change_max": 0.0, "reward_change_mean": -0.2997609917074442, "reward_change_min": -0.501131433993578, "reward_change_std": 0.19180170260369778, "reward_std": 0.4399206154048443, "rewards/accuracy_reward": 0.125, "rewards/cosine_scaled_reward": -0.0691535547375679, "step": 188 }, { "clip_fraction": 0.0, "completion_length": 1879.500057220459, "epoch": 0.216, "grad_norm": 0.034202635288238525, "kl": 0.00012063980102539062, "lambda_div_used": 0.6004914790391922, "learning_rate": 8.068211054579943e-07, "loss": -0.0391, "reward": -0.1741858683526516, "reward_after_mean": -0.1741858683526516, "reward_after_std": 0.4978427290916443, "reward_before_mean": 0.13175462279468775, "reward_before_std": 0.466338312253356, "reward_change_max": 0.0, "reward_change_mean": -0.30594046600162983, "reward_change_min": -0.47848713025450706, "reward_change_std": 0.1904344316571951, "reward_std": 0.49784273840487003, "rewards/accuracy_reward": 0.18750000558793545, "rewards/cosine_scaled_reward": -0.05574538931250572, "step": 189 }, { "clip_fraction": 0.0, "completion_length": 2554.9583740234375, "epoch": 0.21714285714285714, "grad_norm": 0.02105082757771015, "kl": 0.0001103430986404419, "lambda_div_used": 0.6010043099522591, "learning_rate": 8.04235151541222e-07, "loss": 0.0273, "reward": -0.017362398095428944, "reward_after_mean": -0.017362398095428944, "reward_after_std": 0.5378628317266703, "reward_before_mean": 0.3967582155019045, "reward_before_std": 0.46276178024709225, "reward_change_max": 0.0, "reward_change_mean": -0.4141206480562687, "reward_change_min": -0.6200221106410027, "reward_change_std": 0.24346179515123367, "reward_std": 0.5378628373146057, "rewards/accuracy_reward": 0.31250000558793545, "rewards/cosine_scaled_reward": 0.08425821363925934, "step": 190 }, { "clip_fraction": 0.0, "completion_length": 2059.0208702087402, "epoch": 0.21828571428571428, "grad_norm": 0.027044324204325676, "kl": 0.0001252889633178711, "lambda_div_used": 0.6293297410011292, "learning_rate": 8.01636806561836e-07, "loss": 0.0258, "reward": 0.11435023881494999, "reward_after_mean": 0.11435023881494999, "reward_after_std": 0.6808852814137936, "reward_before_mean": 0.5306741870008409, "reward_before_std": 0.5962998084723949, "reward_change_max": 0.0, "reward_change_mean": -0.4163239523768425, "reward_change_min": -0.6401379927992821, "reward_change_std": 0.2505673002451658, "reward_std": 0.680885311216116, "rewards/accuracy_reward": 0.416666679084301, "rewards/cosine_scaled_reward": 0.11400750931352377, "step": 191 }, { "clip_fraction": 0.0, "completion_length": 3052.4583740234375, "epoch": 0.21942857142857142, "grad_norm": 0.01929704286158085, "kl": 0.0001703500747680664, "lambda_div_used": 0.5668376535177231, "learning_rate": 7.990261971595048e-07, "loss": -0.0488, "reward": -0.18853843957185745, "reward_after_mean": -0.18853843957185745, "reward_after_std": 0.3872429598122835, "reward_before_mean": 0.20527121797204018, "reward_before_std": 0.30656870268285275, "reward_change_max": 0.0, "reward_change_mean": -0.39380968734622, "reward_change_min": -0.5667809918522835, "reward_change_std": 0.22394196968525648, "reward_std": 0.38724296167492867, "rewards/accuracy_reward": 0.2083333358168602, "rewards/cosine_scaled_reward": -0.0030621085315942764, "step": 192 }, { "clip_fraction": 0.0, "completion_length": 2665.0000610351562, "epoch": 0.22057142857142858, "grad_norm": 0.024683799594640732, "kl": 0.0001424849033355713, "lambda_div_used": 0.6040371954441071, "learning_rate": 7.964034505716476e-07, "loss": 0.1051, "reward": -0.12321672588586807, "reward_after_mean": -0.12321672588586807, "reward_after_std": 0.49566064216196537, "reward_before_mean": 0.19374842569231987, "reward_before_std": 0.4829600788652897, "reward_change_max": 0.0, "reward_change_mean": -0.316965164616704, "reward_change_min": -0.5347904153168201, "reward_change_std": 0.2084766924381256, "reward_std": 0.49566065706312656, "rewards/accuracy_reward": 0.2500000074505806, "rewards/cosine_scaled_reward": -0.05625157803297043, "step": 193 }, { "clip_fraction": 0.0, "completion_length": 2681.6666870117188, "epoch": 0.22171428571428572, "grad_norm": 0.01963093690574169, "kl": 0.0001462697982788086, "lambda_div_used": 0.6072653383016586, "learning_rate": 7.93768694627233e-07, "loss": 0.0685, "reward": 0.2620888948440552, "reward_after_mean": 0.2620888948440552, "reward_after_std": 0.5582387447357178, "reward_before_mean": 0.7925186604261398, "reward_before_std": 0.4891574867069721, "reward_change_max": 0.0, "reward_change_mean": -0.5304297637194395, "reward_change_min": -0.7805496528744698, "reward_change_std": 0.3099254406988621, "reward_std": 0.5582387670874596, "rewards/accuracy_reward": 0.541666679084301, "rewards/cosine_scaled_reward": 0.250851983204484, "step": 194 }, { "clip_fraction": 0.0, "completion_length": 2509.2708892822266, "epoch": 0.22285714285714286, "grad_norm": 0.021495619788765907, "kl": 0.00014010071754455566, "lambda_div_used": 0.6250215768814087, "learning_rate": 7.911220577405484e-07, "loss": -0.0159, "reward": -0.02809133753180504, "reward_after_mean": -0.02809133753180504, "reward_after_std": 0.5878969728946686, "reward_before_mean": 0.3000528886914253, "reward_before_std": 0.5790729988366365, "reward_change_max": 0.0, "reward_change_mean": -0.32814422622323036, "reward_change_min": -0.5872809514403343, "reward_change_std": 0.22489875741302967, "reward_std": 0.5878969803452492, "rewards/accuracy_reward": 0.2708333395421505, "rewards/cosine_scaled_reward": 0.029219531919807196, "step": 195 }, { "clip_fraction": 0.0, "completion_length": 3391.9166870117188, "epoch": 0.224, "grad_norm": 0.01908985897898674, "kl": 0.00020110607147216797, "lambda_div_used": 0.5609800890088081, "learning_rate": 7.884636689049422e-07, "loss": -0.0096, "reward": -0.2512575164437294, "reward_after_mean": -0.2512575164437294, "reward_after_std": 0.3720796424895525, "reward_before_mean": 0.114608995616436, "reward_before_std": 0.27679250249639153, "reward_change_max": 0.0, "reward_change_mean": -0.3658665083348751, "reward_change_min": -0.5327885784208775, "reward_change_std": 0.20587429776787758, "reward_std": 0.37207965552806854, "rewards/accuracy_reward": 0.2083333358168602, "rewards/cosine_scaled_reward": -0.09372434392571449, "step": 196 }, { "clip_fraction": 0.0, "completion_length": 2261.50008392334, "epoch": 0.22514285714285714, "grad_norm": 0.0338062159717083, "kl": 0.00014703720808029175, "lambda_div_used": 0.6676772907376289, "learning_rate": 7.857936576865356e-07, "loss": 0.0118, "reward": 0.3714135689660907, "reward_after_mean": 0.3714135689660907, "reward_after_std": 0.7947616018354893, "reward_before_mean": 0.8371437452733517, "reward_before_std": 0.7848006598651409, "reward_change_max": 0.0, "reward_change_mean": -0.4657301902770996, "reward_change_min": -0.7909989431500435, "reward_change_std": 0.31981481425464153, "reward_std": 0.7947616167366505, "rewards/accuracy_reward": 0.5625000149011612, "rewards/cosine_scaled_reward": 0.2746437588939443, "step": 197 }, { "clip_fraction": 0.0, "completion_length": 2588.0000381469727, "epoch": 0.22628571428571428, "grad_norm": 0.02565637417137623, "kl": 0.00015842914581298828, "lambda_div_used": 0.5861488357186317, "learning_rate": 7.831121542179086e-07, "loss": 0.0107, "reward": -0.04248452000319958, "reward_after_mean": -0.04248452000319958, "reward_after_std": 0.5402837041765451, "reward_before_mean": 0.3971143513917923, "reward_before_std": 0.3975243829190731, "reward_change_max": 0.0, "reward_change_mean": -0.4395988676697016, "reward_change_min": -0.6525615304708481, "reward_change_std": 0.24251667596399784, "reward_std": 0.5402837190777063, "rewards/accuracy_reward": 0.33333333395421505, "rewards/cosine_scaled_reward": 0.06378100253641605, "step": 198 }, { "clip_fraction": 0.0, "completion_length": 3565.500030517578, "epoch": 0.22742857142857142, "grad_norm": 0.017493488267064095, "kl": 0.00019216537475585938, "lambda_div_used": 0.6027035862207413, "learning_rate": 7.804192891917571e-07, "loss": 0.0044, "reward": -0.25367068126797676, "reward_after_mean": -0.25367068126797676, "reward_after_std": 0.5018568355590105, "reward_before_mean": 0.015577135607600212, "reward_before_std": 0.4789720713160932, "reward_change_max": 0.0, "reward_change_mean": -0.2692478112876415, "reward_change_min": -0.49732755869627, "reward_change_std": 0.18694379180669785, "reward_std": 0.501856841146946, "rewards/accuracy_reward": 0.1666666679084301, "rewards/cosine_scaled_reward": -0.15108953975141048, "step": 199 }, { "clip_fraction": 0.0, "completion_length": 2030.7708587646484, "epoch": 0.22857142857142856, "grad_norm": 0.023603590205311775, "kl": 8.529424667358398e-05, "lambda_div_used": 0.6037929654121399, "learning_rate": 7.777151938545235e-07, "loss": 0.0494, "reward": 0.20463257655501366, "reward_after_mean": 0.20463257655501366, "reward_after_std": 0.607479989528656, "reward_before_mean": 0.7358259493485093, "reward_before_std": 0.48482801718637347, "reward_change_max": 0.0, "reward_change_mean": -0.531193383038044, "reward_change_min": -0.7928582988679409, "reward_change_std": 0.3125428520143032, "reward_std": 0.6074800062924623, "rewards/accuracy_reward": 0.4583333395421505, "rewards/cosine_scaled_reward": 0.2774925837293267, "step": 200 }, { "clip_fraction": 0.0, "completion_length": 2393.2708740234375, "epoch": 0.2297142857142857, "grad_norm": 0.030843405053019524, "kl": 0.00012966245412826538, "lambda_div_used": 0.6355544030666351, "learning_rate": 7.75e-07, "loss": -0.0226, "reward": 0.36713023856282234, "reward_after_mean": 0.36713023856282234, "reward_after_std": 0.7189916651695967, "reward_before_mean": 0.8961770609021187, "reward_before_std": 0.6416445402428508, "reward_change_max": 0.0, "reward_change_mean": -0.5290468074381351, "reward_change_min": -0.790231991559267, "reward_change_std": 0.33059168234467506, "reward_std": 0.7189916893839836, "rewards/accuracy_reward": 0.5833333432674408, "rewards/cosine_scaled_reward": 0.3128436878323555, "step": 201 }, { "clip_fraction": 0.0, "completion_length": 2136.437526702881, "epoch": 0.23085714285714284, "grad_norm": 0.02494831755757332, "kl": 9.03918407857418e-05, "lambda_div_used": 0.5813711285591125, "learning_rate": 7.72273839962904e-07, "loss": -0.0212, "reward": 0.20651111379265785, "reward_after_mean": 0.20651111379265785, "reward_after_std": 0.5735384915024042, "reward_before_mean": 0.8149078581482172, "reward_before_std": 0.3815823132172227, "reward_change_max": 0.0, "reward_change_mean": -0.6083967536687851, "reward_change_min": -0.8435066714882851, "reward_change_std": 0.3385826703161001, "reward_std": 0.5735384933650494, "rewards/accuracy_reward": 0.5208333358168602, "rewards/cosine_scaled_reward": 0.2940745260566473, "step": 202 }, { "clip_fraction": 0.0, "completion_length": 3241.125030517578, "epoch": 0.232, "grad_norm": 0.019406091421842575, "kl": 0.00020945072174072266, "lambda_div_used": 0.5796335637569427, "learning_rate": 7.695368466124296e-07, "loss": 0.0001, "reward": -0.171187374740839, "reward_after_mean": -0.171187374740839, "reward_after_std": 0.4753460921347141, "reward_before_mean": 0.21428256388753653, "reward_before_std": 0.3680141428485513, "reward_change_max": 0.0, "reward_change_mean": -0.38546993769705296, "reward_change_min": -0.565359104424715, "reward_change_std": 0.21748895198106766, "reward_std": 0.4753460939973593, "rewards/accuracy_reward": 0.20833333395421505, "rewards/cosine_scaled_reward": 0.0059492262080311775, "step": 203 }, { "clip_fraction": 0.0, "completion_length": 1740.6458892822266, "epoch": 0.23314285714285715, "grad_norm": 0.0317753441631794, "kl": 0.00010880827903747559, "lambda_div_used": 0.6080649197101593, "learning_rate": 7.667891533457718e-07, "loss": 0.1061, "reward": 0.01815126556903124, "reward_after_mean": 0.01815126556903124, "reward_after_std": 0.5492583587765694, "reward_before_mean": 0.41005287505686283, "reward_before_std": 0.4979597805067897, "reward_change_max": 0.0, "reward_change_mean": -0.39190160669386387, "reward_change_min": -0.5888865925371647, "reward_change_std": 0.23326328117400408, "reward_std": 0.5492583997547626, "rewards/accuracy_reward": 0.31250000558793545, "rewards/cosine_scaled_reward": 0.09755285223945975, "step": 204 }, { "clip_fraction": 0.0, "completion_length": 2370.875068664551, "epoch": 0.2342857142857143, "grad_norm": 0.030042720958590508, "kl": 0.00014892220497131348, "lambda_div_used": 0.6162895858287811, "learning_rate": 7.640308940816239e-07, "loss": 0.1316, "reward": 0.43424203619360924, "reward_after_mean": 0.43424203619360924, "reward_after_std": 0.6284053698182106, "reward_before_mean": 1.0446599274873734, "reward_before_std": 0.5331263300031424, "reward_change_max": 0.0, "reward_change_mean": -0.610417865216732, "reward_change_min": -0.8885884135961533, "reward_change_std": 0.35387465916574, "reward_std": 0.6284053847193718, "rewards/accuracy_reward": 0.6250000149011612, "rewards/cosine_scaled_reward": 0.41965989768505096, "step": 205 }, { "clip_fraction": 0.0, "completion_length": 2687.0208587646484, "epoch": 0.23542857142857143, "grad_norm": 0.021548230201005936, "kl": 0.00012874603271484375, "lambda_div_used": 0.6292116791009903, "learning_rate": 7.612622032536507e-07, "loss": 0.0002, "reward": -0.19604980573058128, "reward_after_mean": -0.19604980573058128, "reward_after_std": 0.6235801223665476, "reward_before_mean": 0.05089604668319225, "reward_before_std": 0.606025786139071, "reward_change_max": 0.0, "reward_change_mean": -0.24694585241377354, "reward_change_min": -0.5342049337923527, "reward_change_std": 0.1862892871722579, "reward_std": 0.623580127954483, "rewards/accuracy_reward": 0.1666666716337204, "rewards/cosine_scaled_reward": -0.1157706193625927, "step": 206 }, { "clip_fraction": 0.0, "completion_length": 2933.145866394043, "epoch": 0.23657142857142857, "grad_norm": 0.02744130790233612, "kl": 0.0001799464225769043, "lambda_div_used": 0.5820295214653015, "learning_rate": 7.584832158039378e-07, "loss": -0.0561, "reward": -0.27963498421013355, "reward_after_mean": -0.27963498421013355, "reward_after_std": 0.4231335464864969, "reward_before_mean": 0.013983679935336113, "reward_before_std": 0.377011489123106, "reward_change_max": 0.0, "reward_change_mean": -0.2936186585575342, "reward_change_min": -0.4932614788413048, "reward_change_std": 0.18361396715044975, "reward_std": 0.423133572563529, "rewards/accuracy_reward": 0.1250000037252903, "rewards/cosine_scaled_reward": -0.11101631447672844, "step": 207 }, { "clip_fraction": 0.0, "completion_length": 2700.4583587646484, "epoch": 0.2377142857142857, "grad_norm": 0.022379335016012192, "kl": 0.00015173852443695068, "lambda_div_used": 0.6108261719346046, "learning_rate": 7.556940671764124e-07, "loss": 0.0624, "reward": -0.02188705001026392, "reward_after_mean": -0.02188705001026392, "reward_after_std": 0.5500882770866156, "reward_before_mean": 0.3470336627215147, "reward_before_std": 0.5068763047456741, "reward_change_max": 0.0, "reward_change_mean": -0.3689207211136818, "reward_change_min": -0.5766280777752399, "reward_change_std": 0.22263448685407639, "reward_std": 0.550088282674551, "rewards/accuracy_reward": 0.27083334513008595, "rewards/cosine_scaled_reward": 0.07620031712576747, "step": 208 }, { "clip_fraction": 0.0, "completion_length": 2061.041732788086, "epoch": 0.23885714285714285, "grad_norm": 0.029824599623680115, "kl": 9.86829400062561e-05, "lambda_div_used": 0.6090070083737373, "learning_rate": 7.528948933102438e-07, "loss": -0.0351, "reward": -0.05778682604432106, "reward_after_mean": -0.05778682604432106, "reward_after_std": 0.5228247437626123, "reward_before_mean": 0.2826935350894928, "reward_before_std": 0.501367649412714, "reward_change_max": 0.0, "reward_change_mean": -0.3404803555458784, "reward_change_min": -0.5473614186048508, "reward_change_std": 0.21792252641171217, "reward_std": 0.5228247474879026, "rewards/accuracy_reward": 0.3125000111758709, "rewards/cosine_scaled_reward": -0.029806464910507202, "step": 209 }, { "clip_fraction": 0.0, "completion_length": 2671.9583740234375, "epoch": 0.24, "grad_norm": 0.019723398610949516, "kl": 0.0001126602292060852, "lambda_div_used": 0.6178258955478668, "learning_rate": 7.500858306332172e-07, "loss": -0.046, "reward": -0.019846799783408642, "reward_after_mean": -0.019846799783408642, "reward_after_std": 0.6330938600003719, "reward_before_mean": 0.3628573752939701, "reward_before_std": 0.5453151864930987, "reward_change_max": 0.0, "reward_change_mean": -0.38270418159663677, "reward_change_min": -0.5824633538722992, "reward_change_std": 0.22786249686032534, "reward_std": 0.6330938655883074, "rewards/accuracy_reward": 0.3125000037252903, "rewards/cosine_scaled_reward": 0.05035736900754273, "step": 210 }, { "clip_fraction": 0.0, "completion_length": 2191.2500076293945, "epoch": 0.24114285714285713, "grad_norm": 0.026132306084036827, "kl": 0.00012756884098052979, "lambda_div_used": 0.5773908644914627, "learning_rate": 7.472670160550848e-07, "loss": 0.0368, "reward": -0.09260139870457351, "reward_after_mean": -0.09260139870457351, "reward_after_std": 0.5260053481906652, "reward_before_mean": 0.3510722735663876, "reward_before_std": 0.35058190673589706, "reward_change_max": 0.0, "reward_change_mean": -0.4436736721545458, "reward_change_min": -0.5985335633158684, "reward_change_std": 0.22750128898769617, "reward_std": 0.5260053630918264, "rewards/accuracy_reward": 0.2916666679084301, "rewards/cosine_scaled_reward": 0.05940559017471969, "step": 211 }, { "clip_fraction": 0.0, "completion_length": 1899.8750457763672, "epoch": 0.2422857142857143, "grad_norm": 0.03510681912302971, "kl": 0.000111408531665802, "lambda_div_used": 0.6011399254202843, "learning_rate": 7.444385869608921e-07, "loss": 0.0147, "reward": 0.018216299824416637, "reward_after_mean": 0.018216299824416637, "reward_after_std": 0.5794783290475607, "reward_before_mean": 0.4559548683464527, "reward_before_std": 0.47238516760990024, "reward_change_max": 0.0, "reward_change_mean": -0.43773859925568104, "reward_change_min": -0.6772446036338806, "reward_change_std": 0.26400116458535194, "reward_std": 0.5794783346354961, "rewards/accuracy_reward": 0.4375000074505806, "rewards/cosine_scaled_reward": 0.018454871140420437, "step": 212 }, { "clip_fraction": 0.0, "completion_length": 2187.2708587646484, "epoch": 0.24342857142857144, "grad_norm": 0.02723986841738224, "kl": 0.00015923380851745605, "lambda_div_used": 0.6033707112073898, "learning_rate": 7.416006812042827e-07, "loss": 0.0702, "reward": 0.11780917271971703, "reward_after_mean": 0.11780917271971703, "reward_after_std": 0.6044025905430317, "reward_before_mean": 0.6168809719383717, "reward_before_std": 0.4788802685216069, "reward_change_max": 0.0, "reward_change_mean": -0.4990718085318804, "reward_change_min": -0.774650864303112, "reward_change_std": 0.2952164225280285, "reward_std": 0.6044025998562574, "rewards/accuracy_reward": 0.4375000037252903, "rewards/cosine_scaled_reward": 0.17938095517456532, "step": 213 }, { "clip_fraction": 0.0, "completion_length": 2442.375072479248, "epoch": 0.24457142857142858, "grad_norm": 0.024862557649612427, "kl": 0.00014431774616241455, "lambda_div_used": 0.5444722771644592, "learning_rate": 7.387534371007797e-07, "loss": -0.0237, "reward": -0.20888542756438255, "reward_after_mean": -0.20888542756438255, "reward_after_std": 0.3397774752229452, "reward_before_mean": 0.22809876408427954, "reward_before_std": 0.19605009350925684, "reward_change_max": 0.0, "reward_change_mean": -0.436984209343791, "reward_change_min": -0.5989037677645683, "reward_change_std": 0.22710295487195253, "reward_std": 0.339777497574687, "rewards/accuracy_reward": 0.25, "rewards/cosine_scaled_reward": -0.021901232190430164, "step": 214 }, { "clip_fraction": 0.0, "completion_length": 1974.0000457763672, "epoch": 0.24571428571428572, "grad_norm": 0.03193683549761772, "kl": 0.00011608004570007324, "lambda_div_used": 0.6048371568322182, "learning_rate": 7.358969934210438e-07, "loss": -0.0834, "reward": -0.23377530556172132, "reward_after_mean": -0.23377530556172132, "reward_after_std": 0.5205546151846647, "reward_before_mean": 0.02766125090420246, "reward_before_std": 0.48529171757400036, "reward_change_max": 0.0, "reward_change_mean": -0.2614365555346012, "reward_change_min": -0.44492336362600327, "reward_change_std": 0.16870077326893806, "reward_std": 0.5205546207726002, "rewards/accuracy_reward": 0.14583333767950535, "rewards/cosine_scaled_reward": -0.11817209050059319, "step": 215 }, { "clip_fraction": 0.0, "completion_length": 2073.7917137145996, "epoch": 0.24685714285714286, "grad_norm": 0.03090011700987816, "kl": 0.00015020370483398438, "lambda_div_used": 0.5987100675702095, "learning_rate": 7.330314893841101e-07, "loss": -0.037, "reward": -0.0010385997593402863, "reward_after_mean": -0.0010385997593402863, "reward_after_std": 0.5259138215333223, "reward_before_mean": 0.42963695898652077, "reward_before_std": 0.4547121487557888, "reward_change_max": 0.0, "reward_change_mean": -0.43067559227347374, "reward_change_min": -0.6873270347714424, "reward_change_std": 0.2589325439184904, "reward_std": 0.5259138215333223, "rewards/accuracy_reward": 0.3541666679084301, "rewards/cosine_scaled_reward": 0.07547031342983246, "step": 216 }, { "clip_fraction": 0.0, "completion_length": 2313.416679382324, "epoch": 0.248, "grad_norm": 0.02645043283700943, "kl": 0.00013341009616851807, "lambda_div_used": 0.6059171706438065, "learning_rate": 7.301570646506027e-07, "loss": 0.005, "reward": 0.06280752643942833, "reward_after_mean": 0.06280752643942833, "reward_after_std": 0.5530473850667477, "reward_before_mean": 0.4911847524344921, "reward_before_std": 0.4922928689047694, "reward_change_max": 0.0, "reward_change_mean": -0.42837722785770893, "reward_change_min": -0.6544801071286201, "reward_change_std": 0.2630965141579509, "reward_std": 0.5530474036931992, "rewards/accuracy_reward": 0.3750000074505806, "rewards/cosine_scaled_reward": 0.11618476174771786, "step": 217 }, { "clip_fraction": 0.0, "completion_length": 2668.9375381469727, "epoch": 0.24914285714285714, "grad_norm": 0.023471172899007797, "kl": 0.00011789798736572266, "lambda_div_used": 0.6332497969269753, "learning_rate": 7.27273859315928e-07, "loss": -0.0061, "reward": 0.06328068673610687, "reward_after_mean": 0.06328068673610687, "reward_after_std": 0.631486464291811, "reward_before_mean": 0.4275508373975754, "reward_before_std": 0.613295029848814, "reward_change_max": 0.0, "reward_change_mean": -0.3642701506614685, "reward_change_min": -0.593178354203701, "reward_change_std": 0.23565197084099054, "reward_std": 0.6314864810556173, "rewards/accuracy_reward": 0.33333334513008595, "rewards/cosine_scaled_reward": 0.09421749995090067, "step": 218 }, { "clip_fraction": 0.0, "completion_length": 2036.7292098999023, "epoch": 0.2502857142857143, "grad_norm": 0.031107638031244278, "kl": 0.00014078617095947266, "lambda_div_used": 0.6064230278134346, "learning_rate": 7.243820139034464e-07, "loss": 0.0536, "reward": 0.17783035337924957, "reward_after_mean": 0.17783035337924957, "reward_after_std": 0.6028466131538153, "reward_before_mean": 0.6854837816208601, "reward_before_std": 0.4974929317831993, "reward_change_max": 0.0, "reward_change_mean": -0.5076534673571587, "reward_change_min": -0.7541679181158543, "reward_change_std": 0.3024881314486265, "reward_std": 0.602846622467041, "rewards/accuracy_reward": 0.4583333358168602, "rewards/cosine_scaled_reward": 0.22715043649077415, "step": 219 }, { "clip_fraction": 0.0, "completion_length": 2585.562515258789, "epoch": 0.25142857142857145, "grad_norm": 0.033569689840078354, "kl": 0.00014656782150268555, "lambda_div_used": 0.5626775473356247, "learning_rate": 7.214816693576234e-07, "loss": 0.0096, "reward": -0.44609223771840334, "reward_after_mean": -0.44609223771840334, "reward_after_std": 0.34080066718161106, "reward_before_mean": -0.19077827036380768, "reward_before_std": 0.2862963704392314, "reward_change_max": 0.0, "reward_change_mean": -0.25531397201120853, "reward_change_min": -0.4116707444190979, "reward_change_std": 0.15291727520525455, "reward_std": 0.340800691395998, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.23244493766105734, "step": 220 }, { "clip_fraction": 0.0, "completion_length": 1835.1667137145996, "epoch": 0.25257142857142856, "grad_norm": 0.026043305173516273, "kl": 9.073130786418915e-05, "lambda_div_used": 0.589106909930706, "learning_rate": 7.185729670371604e-07, "loss": -0.0167, "reward": 0.1618174184113741, "reward_after_mean": 0.1618174184113741, "reward_after_std": 0.5352318156510592, "reward_before_mean": 0.7099000085145235, "reward_before_std": 0.40553835732862353, "reward_change_max": 0.0, "reward_change_mean": -0.5480826254934072, "reward_change_min": -0.7475878298282623, "reward_change_std": 0.3015699228271842, "reward_std": 0.5352318380028009, "rewards/accuracy_reward": 0.47916667722165585, "rewards/cosine_scaled_reward": 0.23073333408683538, "step": 221 }, { "clip_fraction": 0.0, "completion_length": 2023.6875305175781, "epoch": 0.2537142857142857, "grad_norm": 0.024540327489376068, "kl": 0.0001620650291442871, "lambda_div_used": 0.5971302166581154, "learning_rate": 7.156560487081051e-07, "loss": 0.0082, "reward": 0.04625087231397629, "reward_after_mean": 0.04625087231397629, "reward_after_std": 0.5205375533550978, "reward_before_mean": 0.4992250055074692, "reward_before_std": 0.4442645478993654, "reward_change_max": 0.0, "reward_change_mean": -0.45297410897910595, "reward_change_min": -0.7014825120568275, "reward_change_std": 0.27082843892276287, "reward_std": 0.5205375626683235, "rewards/accuracy_reward": 0.3541666753590107, "rewards/cosine_scaled_reward": 0.145058311522007, "step": 222 }, { "clip_fraction": 0.0, "completion_length": 2102.104202270508, "epoch": 0.25485714285714284, "grad_norm": 0.02421215921640396, "kl": 0.00010086596012115479, "lambda_div_used": 0.6135998442769051, "learning_rate": 7.127310565369415e-07, "loss": 0.0077, "reward": -0.07060145400464535, "reward_after_mean": -0.07060145400464535, "reward_after_std": 0.5450965594500303, "reward_before_mean": 0.2666409400990233, "reward_before_std": 0.5180736510083079, "reward_change_max": 0.0, "reward_change_mean": -0.33724240958690643, "reward_change_min": -0.5495268329977989, "reward_change_std": 0.2100124368444085, "reward_std": 0.5450965687632561, "rewards/accuracy_reward": 0.27083334513008595, "rewards/cosine_scaled_reward": -0.004192400723695755, "step": 223 }, { "clip_fraction": 0.0, "completion_length": 2678.8958892822266, "epoch": 0.256, "grad_norm": 0.0209470484405756, "kl": 0.00014913082122802734, "lambda_div_used": 0.6316534802317619, "learning_rate": 7.097981330836616e-07, "loss": 0.041, "reward": 0.002024895278736949, "reward_after_mean": 0.002024895278736949, "reward_after_std": 0.6682235784828663, "reward_before_mean": 0.36611822061240673, "reward_before_std": 0.6153735313564539, "reward_change_max": 0.0, "reward_change_mean": -0.36409333534538746, "reward_change_min": -0.6180168017745018, "reward_change_std": 0.23850849829614162, "reward_std": 0.6682236194610596, "rewards/accuracy_reward": 0.33333333767950535, "rewards/cosine_scaled_reward": 0.03278488974319771, "step": 224 }, { "clip_fraction": 0.0, "completion_length": 2463.6875610351562, "epoch": 0.2571428571428571, "grad_norm": 0.026948727667331696, "kl": 0.0001347959041595459, "lambda_div_used": 0.6428607329726219, "learning_rate": 7.068574212948169e-07, "loss": 0.0403, "reward": 0.05994867905974388, "reward_after_mean": 0.05994867905974388, "reward_after_std": 0.6384792737662792, "reward_before_mean": 0.3915696498006582, "reward_before_std": 0.6680623888969421, "reward_change_max": 0.0, "reward_change_mean": -0.3316209614276886, "reward_change_min": -0.606514610350132, "reward_change_std": 0.24293010961264372, "reward_std": 0.638479296118021, "rewards/accuracy_reward": 0.3541666753590107, "rewards/cosine_scaled_reward": 0.03740297071635723, "step": 225 }, { "clip_fraction": 0.0, "completion_length": 2275.479232788086, "epoch": 0.2582857142857143, "grad_norm": 0.0262776929885149, "kl": 0.00012694299221038818, "lambda_div_used": 0.6046253740787506, "learning_rate": 7.039090644965509e-07, "loss": 0.0006, "reward": 0.08375599328428507, "reward_after_mean": 0.08375599328428507, "reward_after_std": 0.5798605680465698, "reward_before_mean": 0.5400239741429687, "reward_before_std": 0.47723895218223333, "reward_change_max": 0.0, "reward_change_mean": -0.45626799017190933, "reward_change_min": -0.6333699934184551, "reward_change_std": 0.2536289654672146, "reward_std": 0.579860582947731, "rewards/accuracy_reward": 0.3750000111758709, "rewards/cosine_scaled_reward": 0.16502396669238806, "step": 226 }, { "clip_fraction": 0.0, "completion_length": 1853.1041793823242, "epoch": 0.25942857142857145, "grad_norm": 0.03546634316444397, "kl": 0.00011576712131500244, "lambda_div_used": 0.6238459944725037, "learning_rate": 7.009532063876148e-07, "loss": -0.0356, "reward": 0.035207513719797134, "reward_after_mean": 0.035207513719797134, "reward_after_std": 0.5671821534633636, "reward_before_mean": 0.38889277167618275, "reward_before_std": 0.5772030726075172, "reward_change_max": 0.0, "reward_change_mean": -0.35368524491786957, "reward_change_min": -0.587718054652214, "reward_change_std": 0.2398481909185648, "reward_std": 0.5671821553260088, "rewards/accuracy_reward": 0.3125000074505806, "rewards/cosine_scaled_reward": 0.07639275304973125, "step": 227 }, { "clip_fraction": 0.0, "completion_length": 2130.875026702881, "epoch": 0.26057142857142856, "grad_norm": 0.03324354812502861, "kl": 0.00012753158807754517, "lambda_div_used": 0.566599652171135, "learning_rate": 6.979899910323624e-07, "loss": -0.0669, "reward": 0.011986830271780491, "reward_after_mean": 0.011986830271780491, "reward_after_std": 0.4835386872291565, "reward_before_mean": 0.5381738739088178, "reward_before_std": 0.30367479752749205, "reward_change_max": 0.0, "reward_change_mean": -0.5261870250105858, "reward_change_min": -0.6992091946303844, "reward_change_std": 0.27626297529786825, "reward_std": 0.48353871516883373, "rewards/accuracy_reward": 0.37500000558793545, "rewards/cosine_scaled_reward": 0.1631738357245922, "step": 228 }, { "clip_fraction": 0.0, "completion_length": 3159.562530517578, "epoch": 0.26171428571428573, "grad_norm": 0.026536332443356514, "kl": 0.00018978118896484375, "lambda_div_used": 0.5688631013035774, "learning_rate": 6.950195628537299e-07, "loss": -0.0295, "reward": -0.11544675379991531, "reward_after_mean": -0.11544675379991531, "reward_after_std": 0.42418220825493336, "reward_before_mean": 0.31868776679039, "reward_before_std": 0.3126910990104079, "reward_change_max": 0.0, "reward_change_mean": -0.4341345224529505, "reward_change_min": -0.63496870175004, "reward_change_std": 0.24463962391018867, "reward_std": 0.42418221198022366, "rewards/accuracy_reward": 0.2708333395421505, "rewards/cosine_scaled_reward": 0.047854430973529816, "step": 229 }, { "clip_fraction": 0.0, "completion_length": 2753.3333740234375, "epoch": 0.26285714285714284, "grad_norm": 0.020300425589084625, "kl": 0.00014778971672058105, "lambda_div_used": 0.6197129040956497, "learning_rate": 6.920420666261961e-07, "loss": 0.0738, "reward": -0.06457636877894402, "reward_after_mean": -0.06457636877894402, "reward_after_std": 0.5870498064905405, "reward_before_mean": 0.2789040170609951, "reward_before_std": 0.5545760486274958, "reward_change_max": 0.0, "reward_change_mean": -0.34348038397729397, "reward_change_min": -0.6167686618864536, "reward_change_std": 0.22966008260846138, "reward_std": 0.5870498213917017, "rewards/accuracy_reward": 0.2708333358168602, "rewards/cosine_scaled_reward": 0.008070665411651134, "step": 230 }, { "clip_fraction": 0.0, "completion_length": 2388.3750610351562, "epoch": 0.264, "grad_norm": 0.025614017620682716, "kl": 0.00012599676847457886, "lambda_div_used": 0.6009133085608482, "learning_rate": 6.890576474687263e-07, "loss": -0.0117, "reward": -0.07801849395036697, "reward_after_mean": -0.07801849395036697, "reward_after_std": 0.5619381573051214, "reward_before_mean": 0.2970875895989593, "reward_before_std": 0.4653975451365113, "reward_change_max": 0.0, "reward_change_mean": -0.37510609440505505, "reward_change_min": -0.5894374549388885, "reward_change_std": 0.2189607135951519, "reward_std": 0.5619381796568632, "rewards/accuracy_reward": 0.27083333767950535, "rewards/cosine_scaled_reward": 0.02625426184386015, "step": 231 }, { "clip_fraction": 0.0, "completion_length": 2800.4792098999023, "epoch": 0.2651428571428571, "grad_norm": 0.020755581557750702, "kl": 0.00017752498388290405, "lambda_div_used": 0.6359871402382851, "learning_rate": 6.860664508377001e-07, "loss": 0.0213, "reward": 0.09202059358358383, "reward_after_mean": 0.09202059358358383, "reward_after_std": 0.6515896432101727, "reward_before_mean": 0.49167851358652115, "reward_before_std": 0.6335036922246218, "reward_change_max": 0.0, "reward_change_mean": -0.3996579386293888, "reward_change_min": -0.6971911080181599, "reward_change_std": 0.2742554973810911, "reward_std": 0.6515896506607533, "rewards/accuracy_reward": 0.3958333358168602, "rewards/cosine_scaled_reward": 0.09584518847987056, "step": 232 }, { "clip_fraction": 0.0, "completion_length": 1828.020866394043, "epoch": 0.2662857142857143, "grad_norm": 0.028299635276198387, "kl": 0.00010451674461364746, "lambda_div_used": 0.6347432807087898, "learning_rate": 6.83068622519821e-07, "loss": -0.0488, "reward": -0.11232293955981731, "reward_after_mean": -0.11232293955981731, "reward_after_std": 0.6607348509132862, "reward_before_mean": 0.16373980697244406, "reward_before_std": 0.6230235639959574, "reward_change_max": 0.0, "reward_change_mean": -0.2760627530515194, "reward_change_min": -0.5052222050726414, "reward_change_std": 0.18505325820297003, "reward_std": 0.660734860226512, "rewards/accuracy_reward": 0.2291666716337204, "rewards/cosine_scaled_reward": -0.06542686396278441, "step": 233 }, { "clip_fraction": 0.0, "completion_length": 2573.562526702881, "epoch": 0.2674285714285714, "grad_norm": 0.027019290253520012, "kl": 0.0001166127622127533, "lambda_div_used": 0.5475329235196114, "learning_rate": 6.800643086250121e-07, "loss": 0.0132, "reward": -0.23357034847140312, "reward_after_mean": -0.23357034847140312, "reward_after_std": 0.3406812082976103, "reward_before_mean": 0.17690101824700832, "reward_before_std": 0.21493587270379066, "reward_change_max": 0.0, "reward_change_mean": -0.4104713797569275, "reward_change_min": -0.5621155239641666, "reward_change_std": 0.22167872916907072, "reward_std": 0.34068121016025543, "rewards/accuracy_reward": 0.2083333358168602, "rewards/cosine_scaled_reward": -0.03143232688307762, "step": 234 }, { "clip_fraction": 0.0, "completion_length": 2243.645851135254, "epoch": 0.26857142857142857, "grad_norm": 0.031080337241292, "kl": 0.00015339255332946777, "lambda_div_used": 0.6331789866089821, "learning_rate": 6.770536555792944e-07, "loss": -0.0167, "reward": 0.05447516264393926, "reward_after_mean": 0.05447516264393926, "reward_after_std": 0.7066546399146318, "reward_before_mean": 0.44694859720766544, "reward_before_std": 0.6171103774104267, "reward_change_max": 0.0, "reward_change_mean": -0.39247346110641956, "reward_change_min": -0.6285405829548836, "reward_change_std": 0.23825406469404697, "reward_std": 0.706654641777277, "rewards/accuracy_reward": 0.3541666753590107, "rewards/cosine_scaled_reward": 0.09278193739010021, "step": 235 }, { "clip_fraction": 0.0, "completion_length": 2630.875030517578, "epoch": 0.26971428571428574, "grad_norm": 0.022363808006048203, "kl": 0.00015428662300109863, "lambda_div_used": 0.6662941351532936, "learning_rate": 6.740368101176495e-07, "loss": 0.026, "reward": 0.1967709083110094, "reward_after_mean": 0.1967709083110094, "reward_after_std": 0.7583610694855452, "reward_before_mean": 0.5491750640794635, "reward_before_std": 0.7768743745982647, "reward_change_max": 0.0, "reward_change_mean": -0.35240414179861546, "reward_change_min": -0.625398077070713, "reward_change_std": 0.25161405000835657, "reward_std": 0.758361091837287, "rewards/accuracy_reward": 0.39583334513008595, "rewards/cosine_scaled_reward": 0.15334171522408724, "step": 236 }, { "clip_fraction": 0.0, "completion_length": 2439.7292289733887, "epoch": 0.27085714285714285, "grad_norm": 0.02510235831141472, "kl": 0.00016179680824279785, "lambda_div_used": 0.6136586889624596, "learning_rate": 6.710139192768694e-07, "loss": -0.0166, "reward": 0.04647237854078412, "reward_after_mean": 0.04647237854078412, "reward_after_std": 0.6743428651243448, "reward_before_mean": 0.47922211419790983, "reward_before_std": 0.5223582116886973, "reward_change_max": 0.0, "reward_change_mean": -0.43274970538914204, "reward_change_min": -0.6191227361559868, "reward_change_std": 0.23846820835024118, "reward_std": 0.6743428837507963, "rewards/accuracy_reward": 0.37500000186264515, "rewards/cosine_scaled_reward": 0.10422207851661369, "step": 237 }, { "clip_fraction": 0.0, "completion_length": 2402.5000762939453, "epoch": 0.272, "grad_norm": 0.021976694464683533, "kl": 0.0001609325408935547, "lambda_div_used": 0.6034338474273682, "learning_rate": 6.679851303883891e-07, "loss": 0.0784, "reward": 0.18511426215991378, "reward_after_mean": 0.18511426215991378, "reward_after_std": 0.6621626690030098, "reward_before_mean": 0.7374962608737405, "reward_before_std": 0.4762335177510977, "reward_change_max": 0.0, "reward_change_mean": -0.5523819867521524, "reward_change_min": -0.7993261553347111, "reward_change_std": 0.3063361942768097, "reward_std": 0.6621626764535904, "rewards/accuracy_reward": 0.5000000018626451, "rewards/cosine_scaled_reward": 0.23749624891206622, "step": 238 }, { "clip_fraction": 0.0, "completion_length": 1732.2292098999023, "epoch": 0.27314285714285713, "grad_norm": 0.03150353208184242, "kl": 7.70464539527893e-05, "lambda_div_used": 0.6118984445929527, "learning_rate": 6.649505910711058e-07, "loss": 0.0245, "reward": 0.22464729472994804, "reward_after_mean": 0.22464729472994804, "reward_after_std": 0.5859156623482704, "reward_before_mean": 0.7294908128678799, "reward_before_std": 0.5119684813544154, "reward_change_max": 0.0, "reward_change_mean": -0.5048435050994158, "reward_change_min": -0.7602570950984955, "reward_change_std": 0.30051624588668346, "reward_std": 0.5859156772494316, "rewards/accuracy_reward": 0.4583333469927311, "rewards/cosine_scaled_reward": 0.27115743793547153, "step": 239 }, { "clip_fraction": 0.0, "completion_length": 2970.708366394043, "epoch": 0.2742857142857143, "grad_norm": 0.02445312589406967, "kl": 0.00020241737365722656, "lambda_div_used": 0.5594572946429253, "learning_rate": 6.619104492241847e-07, "loss": 0.0141, "reward": -0.36844223737716675, "reward_after_mean": -0.36844223737716675, "reward_after_std": 0.32861490175127983, "reward_before_mean": -0.06151419784873724, "reward_before_std": 0.2699447488412261, "reward_change_max": 0.0, "reward_change_mean": -0.30692804232239723, "reward_change_min": -0.46535007655620575, "reward_change_std": 0.1784888058900833, "reward_std": 0.3286149147897959, "rewards/accuracy_reward": 0.0625, "rewards/cosine_scaled_reward": -0.12401420716196299, "step": 240 }, { "clip_fraction": 0.0, "completion_length": 2815.041702270508, "epoch": 0.2754285714285714, "grad_norm": 0.02061399444937706, "kl": 0.0001932680606842041, "lambda_div_used": 0.5598616823554039, "learning_rate": 6.588648530198504e-07, "loss": 0.0225, "reward": -0.4072608258575201, "reward_after_mean": -0.4072608258575201, "reward_after_std": 0.34731264412403107, "reward_before_mean": -0.12633821368217468, "reward_before_std": 0.2754332982003689, "reward_change_max": 0.0, "reward_change_mean": -0.2809226084500551, "reward_change_min": -0.4441990442574024, "reward_change_std": 0.1635214313864708, "reward_std": 0.34731266647577286, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.16800487786531448, "step": 241 }, { "clip_fraction": 0.0, "completion_length": 2020.520851135254, "epoch": 0.2765714285714286, "grad_norm": 0.03268786519765854, "kl": 0.0002362281084060669, "lambda_div_used": 0.5827708318829536, "learning_rate": 6.558139508961654e-07, "loss": 0.0641, "reward": -0.14442800264805555, "reward_after_mean": -0.14442800264805555, "reward_after_std": 0.488038569688797, "reward_before_mean": 0.23860891722142696, "reward_before_std": 0.3739425097592175, "reward_change_max": 0.0, "reward_change_mean": -0.38303691893815994, "reward_change_min": -0.5426856316626072, "reward_change_std": 0.20754980947822332, "reward_std": 0.4880385845899582, "rewards/accuracy_reward": 0.2708333395421505, "rewards/cosine_scaled_reward": -0.032224420458078384, "step": 242 }, { "clip_fraction": 0.0, "completion_length": 2664.4375610351562, "epoch": 0.2777142857142857, "grad_norm": 0.02101411111652851, "kl": 0.00011355429887771606, "lambda_div_used": 0.6568357795476913, "learning_rate": 6.527578915497951e-07, "loss": 0.0052, "reward": 0.12949350103735924, "reward_after_mean": 0.12949350103735924, "reward_after_std": 0.7312840819358826, "reward_before_mean": 0.46976747084409, "reward_before_std": 0.7229422759264708, "reward_change_max": 0.0, "reward_change_mean": -0.3402740005403757, "reward_change_min": -0.5887857899069786, "reward_change_std": 0.2261042231693864, "reward_std": 0.7312840968370438, "rewards/accuracy_reward": 0.354166679084301, "rewards/cosine_scaled_reward": 0.11560080386698246, "step": 243 }, { "clip_fraction": 0.0, "completion_length": 2822.6875381469727, "epoch": 0.27885714285714286, "grad_norm": 0.021215323358774185, "kl": 0.00015109777450561523, "lambda_div_used": 0.6380957439541817, "learning_rate": 6.496968239287603e-07, "loss": 0.023, "reward": 0.23113884031772614, "reward_after_mean": 0.23113884031772614, "reward_after_std": 0.6932291053235531, "reward_before_mean": 0.678146418184042, "reward_before_std": 0.6383852679282427, "reward_change_max": 0.0, "reward_change_mean": -0.4470075909048319, "reward_change_min": -0.6521747056394815, "reward_change_std": 0.2655975092202425, "reward_std": 0.6932291202247143, "rewards/accuracy_reward": 0.4583333469927311, "rewards/cosine_scaled_reward": 0.21981305815279484, "step": 244 }, { "clip_fraction": 0.0, "completion_length": 2480.2500915527344, "epoch": 0.28, "grad_norm": 0.026185423135757446, "kl": 0.00016620755195617676, "lambda_div_used": 0.6584924161434174, "learning_rate": 6.466308972251785e-07, "loss": 0.058, "reward": 0.19090854283422232, "reward_after_mean": 0.19090854283422232, "reward_after_std": 0.726154362782836, "reward_before_mean": 0.5613497914746404, "reward_before_std": 0.7352566458284855, "reward_change_max": 0.0, "reward_change_mean": -0.37044124491512775, "reward_change_min": -0.6325159706175327, "reward_change_std": 0.25586483906954527, "reward_std": 0.7261543925851583, "rewards/accuracy_reward": 0.41666668094694614, "rewards/cosine_scaled_reward": 0.14468309609219432, "step": 245 }, { "clip_fraction": 0.0, "completion_length": 2683.291717529297, "epoch": 0.28114285714285714, "grad_norm": 0.020767759531736374, "kl": 0.00016689300537109375, "lambda_div_used": 0.613718219101429, "learning_rate": 6.435602608679916e-07, "loss": 0.0867, "reward": -0.015127861872315407, "reward_after_mean": -0.015127861872315407, "reward_after_std": 0.5861052125692368, "reward_before_mean": 0.36624928191304207, "reward_before_std": 0.5300967525690794, "reward_change_max": 0.0, "reward_change_mean": -0.38137709721922874, "reward_change_min": -0.6563880071043968, "reward_change_std": 0.2452305220067501, "reward_std": 0.5861052181571722, "rewards/accuracy_reward": 0.3125000037252903, "rewards/cosine_scaled_reward": 0.05374925094656646, "step": 246 }, { "clip_fraction": 0.0, "completion_length": 3034.937545776367, "epoch": 0.2822857142857143, "grad_norm": 0.018628831952810287, "kl": 0.00017150957137346268, "lambda_div_used": 0.6288246288895607, "learning_rate": 6.404850645156841e-07, "loss": 0.0338, "reward": -0.14502286911010742, "reward_after_mean": -0.14502286911010742, "reward_after_std": 0.6141320299357176, "reward_before_mean": 0.1263586189597845, "reward_before_std": 0.6014503743499517, "reward_change_max": 0.0, "reward_change_mean": -0.2713814973831177, "reward_change_min": -0.5071048811078072, "reward_change_std": 0.19316286500543356, "reward_std": 0.61413205973804, "rewards/accuracy_reward": 0.18750000186264515, "rewards/cosine_scaled_reward": -0.06114138173870742, "step": 247 }, { "clip_fraction": 0.0, "completion_length": 2029.1250228881836, "epoch": 0.2834285714285714, "grad_norm": 0.034633222967386246, "kl": 0.00014004111289978027, "lambda_div_used": 0.6094017848372459, "learning_rate": 6.374054580489873e-07, "loss": -0.0144, "reward": 0.2783904932439327, "reward_after_mean": 0.2783904932439327, "reward_after_std": 0.6359313689172268, "reward_before_mean": 0.8294162545353174, "reward_before_std": 0.5048373020254076, "reward_change_max": 0.0, "reward_change_mean": -0.5510257538408041, "reward_change_min": -0.7822528444230556, "reward_change_std": 0.3142691068351269, "reward_std": 0.6359313875436783, "rewards/accuracy_reward": 0.5416666772216558, "rewards/cosine_scaled_reward": 0.28774956427514553, "step": 248 }, { "clip_fraction": 0.0, "completion_length": 1789.4791870117188, "epoch": 0.2845714285714286, "grad_norm": 0.028334610164165497, "kl": 7.846951484680176e-05, "lambda_div_used": 0.5886820033192635, "learning_rate": 6.343215915635761e-07, "loss": 0.0095, "reward": -0.009319216012954712, "reward_after_mean": -0.009319216012954712, "reward_after_std": 0.5895384289324284, "reward_before_mean": 0.46247682347893715, "reward_before_std": 0.4071828918531537, "reward_change_max": 0.0, "reward_change_mean": -0.47179603204131126, "reward_change_min": -0.6552967764437199, "reward_change_std": 0.2524276301264763, "reward_std": 0.589538436383009, "rewards/accuracy_reward": 0.3333333358168602, "rewards/cosine_scaled_reward": 0.1291434899903834, "step": 249 }, { "clip_fraction": 0.0, "completion_length": 2342.8750610351562, "epoch": 0.2857142857142857, "grad_norm": 0.02926229126751423, "kl": 0.00020366907119750977, "lambda_div_used": 0.6367609649896622, "learning_rate": 6.31233615362752e-07, "loss": 0.0537, "reward": 0.04997219145298004, "reward_after_mean": 0.04997219145298004, "reward_after_std": 0.6538249664008617, "reward_before_mean": 0.4029387356713414, "reward_before_std": 0.6333111096173525, "reward_change_max": 0.0, "reward_change_mean": -0.35296651534736156, "reward_change_min": -0.5975028611719608, "reward_change_std": 0.23207756876945496, "reward_std": 0.6538249738514423, "rewards/accuracy_reward": 0.31250001303851604, "rewards/cosine_scaled_reward": 0.09043872263282537, "step": 250 }, { "clip_fraction": 0.0, "completion_length": 1925.1667289733887, "epoch": 0.28685714285714287, "grad_norm": 0.03169158101081848, "kl": 0.0001310408115386963, "lambda_div_used": 0.6620426177978516, "learning_rate": 6.281416799501187e-07, "loss": -0.0552, "reward": 0.16181311733089387, "reward_after_mean": 0.16181311733089387, "reward_after_std": 0.7404435630887747, "reward_before_mean": 0.516552684828639, "reward_before_std": 0.7539111012592912, "reward_change_max": 0.0, "reward_change_mean": -0.35473958775401115, "reward_change_min": -0.6220344565808773, "reward_change_std": 0.24936181399971247, "reward_std": 0.7404435705393553, "rewards/accuracy_reward": 0.39583334885537624, "rewards/cosine_scaled_reward": 0.12071935646235943, "step": 251 }, { "clip_fraction": 0.0, "completion_length": 2385.4166831970215, "epoch": 0.288, "grad_norm": 0.027474144473671913, "kl": 0.00018829107284545898, "lambda_div_used": 0.5771610513329506, "learning_rate": 6.25045936022246e-07, "loss": 0.0396, "reward": -0.18753607827238739, "reward_after_mean": -0.18753607827238739, "reward_after_std": 0.4631412886083126, "reward_before_mean": 0.1804720275104046, "reward_before_std": 0.3519942844286561, "reward_change_max": 0.0, "reward_change_mean": -0.3680081032216549, "reward_change_min": -0.5239151008427143, "reward_change_std": 0.20504287257790565, "reward_std": 0.46314129047095776, "rewards/accuracy_reward": 0.22916666977107525, "rewards/cosine_scaled_reward": -0.04869466880336404, "step": 252 }, { "clip_fraction": 0.0, "completion_length": 2590.875045776367, "epoch": 0.28914285714285715, "grad_norm": 0.027266530320048332, "kl": 0.0001958310604095459, "lambda_div_used": 0.6256092488765717, "learning_rate": 6.219465344613258e-07, "loss": -0.0262, "reward": 0.09082555398344994, "reward_after_mean": 0.09082555398344994, "reward_after_std": 0.6417571641504765, "reward_before_mean": 0.5176093801856041, "reward_before_std": 0.5764628401957452, "reward_change_max": 0.0, "reward_change_mean": -0.4267838429659605, "reward_change_min": -0.7001185156404972, "reward_change_std": 0.26451323740184307, "reward_std": 0.6417571865022182, "rewards/accuracy_reward": 0.39583334140479565, "rewards/cosine_scaled_reward": 0.12177603470627218, "step": 253 }, { "clip_fraction": 0.0, "completion_length": 2205.3333587646484, "epoch": 0.29028571428571426, "grad_norm": 0.03558209538459778, "kl": 0.0001392364501953125, "lambda_div_used": 0.6507852524518967, "learning_rate": 6.188436263278172e-07, "loss": -0.0987, "reward": 0.18358214199543, "reward_after_mean": 0.18358214199543, "reward_after_std": 0.7428858652710915, "reward_before_mean": 0.5856727678328753, "reward_before_std": 0.7034552115947008, "reward_change_max": 0.0, "reward_change_mean": -0.40209066309034824, "reward_change_min": -0.6650605984032154, "reward_change_std": 0.2646036548539996, "reward_std": 0.7428858801722527, "rewards/accuracy_reward": 0.43750000931322575, "rewards/cosine_scaled_reward": 0.14817279600538313, "step": 254 }, { "clip_fraction": 0.0, "completion_length": 3061.5625610351562, "epoch": 0.2914285714285714, "grad_norm": 0.026399368420243263, "kl": 0.00017392635345458984, "lambda_div_used": 0.6185515820980072, "learning_rate": 6.157373628530852e-07, "loss": 0.023, "reward": -0.10978002939373255, "reward_after_mean": -0.10978002939373255, "reward_after_std": 0.5691167917102575, "reward_before_mean": 0.19686487689614296, "reward_before_std": 0.5497966632246971, "reward_change_max": 0.0, "reward_change_mean": -0.3066448848694563, "reward_change_min": -0.5226548612117767, "reward_change_std": 0.20487169921398163, "reward_std": 0.569116810336709, "rewards/accuracy_reward": 0.22916667349636555, "rewards/cosine_scaled_reward": -0.032301797065883875, "step": 255 }, { "clip_fraction": 0.0, "completion_length": 2540.5833435058594, "epoch": 0.2925714285714286, "grad_norm": 0.02308651991188526, "kl": 0.00019800662994384766, "lambda_div_used": 0.6459387838840485, "learning_rate": 6.126278954320294e-07, "loss": -0.0509, "reward": 0.1737559838220477, "reward_after_mean": 0.1737559838220477, "reward_after_std": 0.6614836137741804, "reward_before_mean": 0.5523870065808296, "reward_before_std": 0.6789623461663723, "reward_change_max": 0.0, "reward_change_mean": -0.37863098084926605, "reward_change_min": -0.6470838598906994, "reward_change_std": 0.26314268447458744, "reward_std": 0.661483621224761, "rewards/accuracy_reward": 0.3958333469927311, "rewards/cosine_scaled_reward": 0.15655364841222763, "step": 256 }, { "clip_fraction": 0.0, "completion_length": 2811.0209045410156, "epoch": 0.2937142857142857, "grad_norm": 0.021752549335360527, "kl": 0.00017321109771728516, "lambda_div_used": 0.6104201078414917, "learning_rate": 6.095153756157051e-07, "loss": 0.077, "reward": 0.32746705412864685, "reward_after_mean": 0.32746705412864685, "reward_after_std": 0.627650348469615, "reward_before_mean": 0.9031309094280005, "reward_before_std": 0.5052130986005068, "reward_change_max": 0.0, "reward_change_mean": -0.5756638199090958, "reward_change_min": -0.8629779443144798, "reward_change_std": 0.33332069404423237, "reward_std": 0.6276503596454859, "rewards/accuracy_reward": 0.5625000111758709, "rewards/cosine_scaled_reward": 0.3406308852136135, "step": 257 }, { "clip_fraction": 0.0, "completion_length": 3115.166732788086, "epoch": 0.2948571428571429, "grad_norm": 0.019492125138640404, "kl": 0.00021690130233764648, "lambda_div_used": 0.6207298263907433, "learning_rate": 6.06399955103937e-07, "loss": 0.0613, "reward": 0.0009925179183483124, "reward_after_mean": 0.0009925179183483124, "reward_after_std": 0.5666598528623581, "reward_before_mean": 0.35771505534648895, "reward_before_std": 0.5585699509829283, "reward_change_max": 0.0, "reward_change_mean": -0.3567225467413664, "reward_change_min": -0.6022392623126507, "reward_change_std": 0.23685699328780174, "reward_std": 0.5666598528623581, "rewards/accuracy_reward": 0.3125000111758709, "rewards/cosine_scaled_reward": 0.04521506559103727, "step": 258 }, { "clip_fraction": 0.0, "completion_length": 2637.2500534057617, "epoch": 0.296, "grad_norm": 0.029653117060661316, "kl": 0.00020319223403930664, "lambda_div_used": 0.6043171733617783, "learning_rate": 6.032817857379256e-07, "loss": -0.0019, "reward": -0.037313513457775116, "reward_after_mean": -0.037313513457775116, "reward_after_std": 0.5161938592791557, "reward_before_mean": 0.3289037337526679, "reward_before_std": 0.47458031587302685, "reward_change_max": 0.0, "reward_change_mean": -0.3662172295153141, "reward_change_min": -0.5700537078082561, "reward_change_std": 0.2212026845663786, "reward_std": 0.5161938853561878, "rewards/accuracy_reward": 0.33333334140479565, "rewards/cosine_scaled_reward": -0.004429628141224384, "step": 259 }, { "clip_fraction": 0.0, "completion_length": 1935.9166679382324, "epoch": 0.29714285714285715, "grad_norm": 0.028990233317017555, "kl": 0.00012747198343276978, "lambda_div_used": 0.56998710334301, "learning_rate": 6.001610194928464e-07, "loss": -0.0104, "reward": 0.24466626904904842, "reward_after_mean": 0.24466626904904842, "reward_after_std": 0.5734536852687597, "reward_before_mean": 0.9249862097203732, "reward_before_std": 0.32228614180348814, "reward_change_max": 0.0, "reward_change_mean": -0.6803199425339699, "reward_change_min": -0.9398231357336044, "reward_change_std": 0.3666897714138031, "reward_std": 0.5734537076205015, "rewards/accuracy_reward": 0.6041666679084301, "rewards/cosine_scaled_reward": 0.3208195334300399, "step": 260 }, { "clip_fraction": 0.0, "completion_length": 2764.8958892822266, "epoch": 0.29828571428571427, "grad_norm": 0.021772203966975212, "kl": 0.00015366077423095703, "lambda_div_used": 0.6116980388760567, "learning_rate": 5.97037808470444e-07, "loss": 0.0391, "reward": -0.14533425867557526, "reward_after_mean": -0.14533425867557526, "reward_after_std": 0.5442243628203869, "reward_before_mean": 0.15450917836278677, "reward_before_std": 0.5254541491158307, "reward_change_max": 0.0, "reward_change_mean": -0.29984344728291035, "reward_change_min": -0.5264418311417103, "reward_change_std": 0.2055408162996173, "reward_std": 0.5442243684083223, "rewards/accuracy_reward": 0.1875000037252903, "rewards/cosine_scaled_reward": -0.03299082722514868, "step": 261 }, { "clip_fraction": 0.0, "completion_length": 2946.541732788086, "epoch": 0.29942857142857143, "grad_norm": 0.02444116398692131, "kl": 0.00019305944442749023, "lambda_div_used": 0.5770210847258568, "learning_rate": 5.939123048916173e-07, "loss": -0.0078, "reward": -0.30842714570462704, "reward_after_mean": -0.30842714570462704, "reward_after_std": 0.4061661623418331, "reward_before_mean": -0.014746684581041336, "reward_before_std": 0.35232585947960615, "reward_change_max": 0.0, "reward_change_mean": -0.29368047416210175, "reward_change_min": -0.43233491107821465, "reward_change_std": 0.167787273414433, "reward_std": 0.40616616792976856, "rewards/accuracy_reward": 0.10416666977107525, "rewards/cosine_scaled_reward": -0.11891335435211658, "step": 262 }, { "clip_fraction": 0.0, "completion_length": 2761.7083740234375, "epoch": 0.30057142857142854, "grad_norm": 0.026775242760777473, "kl": 0.00015038251876831055, "lambda_div_used": 0.5933946445584297, "learning_rate": 5.907846610890011e-07, "loss": -0.0212, "reward": -0.28860565181821585, "reward_after_mean": -0.28860565181821585, "reward_after_std": 0.4898714739829302, "reward_before_mean": -0.010391712188720703, "reward_before_std": 0.4291188698261976, "reward_change_max": 0.0, "reward_change_mean": -0.27821394614875317, "reward_change_min": -0.43029162287712097, "reward_change_std": 0.16667384281754494, "reward_std": 0.48987148329615593, "rewards/accuracy_reward": 0.1041666679084301, "rewards/cosine_scaled_reward": -0.11455838289111853, "step": 263 }, { "clip_fraction": 0.0, "completion_length": 2586.3750534057617, "epoch": 0.3017142857142857, "grad_norm": 0.01994405686855316, "kl": 0.0001627206802368164, "lambda_div_used": 0.6219401434063911, "learning_rate": 5.87655029499542e-07, "loss": 0.043, "reward": -0.11865252908319235, "reward_after_mean": -0.11865252908319235, "reward_after_std": 0.59744056686759, "reward_before_mean": 0.1666876282542944, "reward_before_std": 0.5631906799972057, "reward_change_max": 0.0, "reward_change_mean": -0.2853401657193899, "reward_change_min": -0.4587775580585003, "reward_change_std": 0.18016593530774117, "reward_std": 0.597440579906106, "rewards/accuracy_reward": 0.2083333395421505, "rewards/cosine_scaled_reward": -0.04164570523425937, "step": 264 }, { "clip_fraction": 0.0, "completion_length": 1858.2083587646484, "epoch": 0.3028571428571429, "grad_norm": 0.02953009493649006, "kl": 0.0001354515552520752, "lambda_div_used": 0.5754420235753059, "learning_rate": 5.845235626570683e-07, "loss": 0.1211, "reward": 0.017749376595020294, "reward_after_mean": 0.017749376595020294, "reward_after_std": 0.478146318346262, "reward_before_mean": 0.4997409600764513, "reward_before_std": 0.3383461497724056, "reward_change_max": 0.0, "reward_change_mean": -0.48199158161878586, "reward_change_min": -0.6475610621273518, "reward_change_std": 0.2544400207698345, "reward_std": 0.4781463425606489, "rewards/accuracy_reward": 0.37500000558793545, "rewards/cosine_scaled_reward": 0.12474094179924577, "step": 265 }, { "clip_fraction": 0.0, "completion_length": 3198.625, "epoch": 0.304, "grad_norm": 0.019427413120865822, "kl": 0.00019751489162445068, "lambda_div_used": 0.5534809529781342, "learning_rate": 5.813904131848564e-07, "loss": -0.0091, "reward": -0.40436042100191116, "reward_after_mean": -0.40436042100191116, "reward_after_std": 0.2967198472470045, "reward_before_mean": -0.12292576022446156, "reward_before_std": 0.23972244351170957, "reward_change_max": 0.0, "reward_change_mean": -0.2814346421509981, "reward_change_min": -0.43088357895612717, "reward_change_std": 0.16106584202498198, "reward_std": 0.29671985376626253, "rewards/accuracy_reward": 0.1041666716337204, "rewards/cosine_scaled_reward": -0.22709244303405285, "step": 266 }, { "clip_fraction": 0.0, "completion_length": 2879.354202270508, "epoch": 0.30514285714285716, "grad_norm": 0.02146352268755436, "kl": 0.00019466876983642578, "lambda_div_used": 0.6196897253394127, "learning_rate": 5.78255733788191e-07, "loss": -0.0062, "reward": -0.12080054543912411, "reward_after_mean": -0.12080054543912411, "reward_after_std": 0.5834086053073406, "reward_before_mean": 0.1734671276062727, "reward_before_std": 0.5533927101641893, "reward_change_max": 0.0, "reward_change_mean": -0.29426765628159046, "reward_change_min": -0.4760690741240978, "reward_change_std": 0.18818860314786434, "reward_std": 0.5834086183458567, "rewards/accuracy_reward": 0.2083333395421505, "rewards/cosine_scaled_reward": -0.03486621752381325, "step": 267 }, { "clip_fraction": 0.0, "completion_length": 2556.5208740234375, "epoch": 0.3062857142857143, "grad_norm": 0.03222980722784996, "kl": 0.00019940733909606934, "lambda_div_used": 0.6217963546514511, "learning_rate": 5.751196772469237e-07, "loss": 0.11, "reward": -0.0901529286056757, "reward_after_mean": -0.0901529286056757, "reward_after_std": 0.5890399143099785, "reward_before_mean": 0.22261973470449448, "reward_before_std": 0.5651240181177855, "reward_change_max": 0.0, "reward_change_mean": -0.3127726651728153, "reward_change_min": -0.5852204114198685, "reward_change_std": 0.21594477724283934, "reward_std": 0.5890399310737848, "rewards/accuracy_reward": 0.2708333395421505, "rewards/cosine_scaled_reward": -0.04821361042559147, "step": 268 }, { "clip_fraction": 0.0, "completion_length": 2763.0000534057617, "epoch": 0.30742857142857144, "grad_norm": 0.023989371955394745, "kl": 0.00017099082469940186, "lambda_div_used": 0.5594866573810577, "learning_rate": 5.71982396408026e-07, "loss": 0.04, "reward": 0.10778852179646492, "reward_after_mean": 0.10778852179646492, "reward_after_std": 0.45204984955489635, "reward_before_mean": 0.7018298227339983, "reward_before_std": 0.26720918249338865, "reward_change_max": 0.0, "reward_change_mean": -0.5940412897616625, "reward_change_min": -0.8074228167533875, "reward_change_std": 0.31475239619612694, "reward_std": 0.4520498663187027, "rewards/accuracy_reward": 0.4791666716337204, "rewards/cosine_scaled_reward": 0.22266313433647156, "step": 269 }, { "clip_fraction": 0.0, "completion_length": 2451.7917404174805, "epoch": 0.30857142857142855, "grad_norm": 0.02231750823557377, "kl": 0.00018018484115600586, "lambda_div_used": 0.6626150384545326, "learning_rate": 5.688440441781398e-07, "loss": 0.0293, "reward": 0.23073547426611185, "reward_after_mean": 0.23073547426611185, "reward_after_std": 0.8142610676586628, "reward_before_mean": 0.6419318169355392, "reward_before_std": 0.7594864275306463, "reward_change_max": 0.0, "reward_change_mean": -0.4111963789910078, "reward_change_min": -0.6618837527930737, "reward_change_std": 0.26606686785817146, "reward_std": 0.8142610862851143, "rewards/accuracy_reward": 0.4583333432674408, "rewards/cosine_scaled_reward": 0.18359847948886454, "step": 270 }, { "clip_fraction": 0.0, "completion_length": 1821.8333740234375, "epoch": 0.3097142857142857, "grad_norm": 0.029715267941355705, "kl": 0.00010024569928646088, "lambda_div_used": 0.6296036839485168, "learning_rate": 5.657047735161255e-07, "loss": -0.127, "reward": 0.15136122331023216, "reward_after_mean": 0.15136122331023216, "reward_after_std": 0.6207431796938181, "reward_before_mean": 0.5634245574474335, "reward_before_std": 0.6018376401625574, "reward_change_max": 0.0, "reward_change_mean": -0.4120633378624916, "reward_change_min": -0.6677181459963322, "reward_change_std": 0.26569564640522003, "reward_std": 0.6207432132214308, "rewards/accuracy_reward": 0.4375000111758709, "rewards/cosine_scaled_reward": 0.1259245565161109, "step": 271 }, { "clip_fraction": 0.0, "completion_length": 2659.0834045410156, "epoch": 0.31085714285714283, "grad_norm": 0.025181055068969727, "kl": 0.0002090930938720703, "lambda_div_used": 0.6270301192998886, "learning_rate": 5.625647374256061e-07, "loss": 0.0218, "reward": 0.11005037371069193, "reward_after_mean": 0.11005037371069193, "reward_after_std": 0.6581083033233881, "reward_before_mean": 0.5188289349898696, "reward_before_std": 0.59244554489851, "reward_change_max": 0.0, "reward_change_mean": -0.40877855755388737, "reward_change_min": -0.6175772212445736, "reward_change_std": 0.2535307565703988, "reward_std": 0.6581083126366138, "rewards/accuracy_reward": 0.39583334140479565, "rewards/cosine_scaled_reward": 0.12299557868391275, "step": 272 }, { "clip_fraction": 0.0, "completion_length": 2516.875045776367, "epoch": 0.312, "grad_norm": 0.022818049415946007, "kl": 0.00016960501670837402, "lambda_div_used": 0.5807301178574562, "learning_rate": 5.594240889475106e-07, "loss": -0.0175, "reward": 0.09495561942458153, "reward_after_mean": 0.09495561942458153, "reward_after_std": 0.5033265259116888, "reward_before_mean": 0.6270047463476658, "reward_before_std": 0.3717161314561963, "reward_change_max": 0.0, "reward_change_mean": -0.5320491325110197, "reward_change_min": -0.7614771388471127, "reward_change_std": 0.30045478232204914, "reward_std": 0.5033265501260757, "rewards/accuracy_reward": 0.4166666716337204, "rewards/cosine_scaled_reward": 0.2103380784392357, "step": 273 }, { "clip_fraction": 0.0, "completion_length": 1573.3750228881836, "epoch": 0.31314285714285717, "grad_norm": 0.03588717430830002, "kl": 0.00010608136653900146, "lambda_div_used": 0.6433197036385536, "learning_rate": 5.562829811526154e-07, "loss": -0.0021, "reward": 0.24507278576493263, "reward_after_mean": 0.24507278576493263, "reward_after_std": 0.7453816495835781, "reward_before_mean": 0.6975303117651492, "reward_before_std": 0.6636776090599597, "reward_change_max": 0.0, "reward_change_mean": -0.4524575434625149, "reward_change_min": -0.703257791697979, "reward_change_std": 0.2789953136816621, "reward_std": 0.7453816495835781, "rewards/accuracy_reward": 0.479166679084301, "rewards/cosine_scaled_reward": 0.21836365200579166, "step": 274 }, { "clip_fraction": 0.0, "completion_length": 2231.375015258789, "epoch": 0.3142857142857143, "grad_norm": 0.02452162466943264, "kl": 0.00013327598571777344, "lambda_div_used": 0.5854036509990692, "learning_rate": 5.531415671340826e-07, "loss": 0.0351, "reward": 0.1521737277507782, "reward_after_mean": 0.1521737277507782, "reward_after_std": 0.5359849948436022, "reward_before_mean": 0.6887877276167274, "reward_before_std": 0.3889514375478029, "reward_change_max": 0.0, "reward_change_mean": -0.5366139896214008, "reward_change_min": -0.7292314879596233, "reward_change_std": 0.28961729165166616, "reward_std": 0.5359850041568279, "rewards/accuracy_reward": 0.47916667722165585, "rewards/cosine_scaled_reward": 0.20962106250226498, "step": 275 }, { "clip_fraction": 0.0, "completion_length": 2479.375015258789, "epoch": 0.31542857142857145, "grad_norm": 0.022260351106524467, "kl": 0.0001841336488723755, "lambda_div_used": 0.6080946400761604, "learning_rate": 5.5e-07, "loss": 0.0233, "reward": 0.15685568936169147, "reward_after_mean": 0.15685568936169147, "reward_after_std": 0.6163474209606647, "reward_before_mean": 0.6554346140474081, "reward_before_std": 0.5032088747248054, "reward_change_max": 0.0, "reward_change_mean": -0.49857890233397484, "reward_change_min": -0.7599121108651161, "reward_change_std": 0.2981911161914468, "reward_std": 0.6163474582135677, "rewards/accuracy_reward": 0.4583333395421505, "rewards/cosine_scaled_reward": 0.19710125587880611, "step": 276 }, { "clip_fraction": 0.0, "completion_length": 2441.625045776367, "epoch": 0.31657142857142856, "grad_norm": 0.0253590177744627, "kl": 0.0002009868621826172, "lambda_div_used": 0.662374809384346, "learning_rate": 5.468584328659172e-07, "loss": 0.0379, "reward": 0.13848626799881458, "reward_after_mean": 0.13848626799881458, "reward_after_std": 0.8288281839340925, "reward_before_mean": 0.5136084349360317, "reward_before_std": 0.7587530063465238, "reward_change_max": 0.0, "reward_change_mean": -0.37512217462062836, "reward_change_min": -0.6195148192346096, "reward_change_std": 0.24048541858792305, "reward_std": 0.8288282137364149, "rewards/accuracy_reward": 0.37500000558793545, "rewards/cosine_scaled_reward": 0.13860841654241085, "step": 277 }, { "clip_fraction": 0.0, "completion_length": 2101.729202270508, "epoch": 0.3177142857142857, "grad_norm": 0.03273219242691994, "kl": 0.0001831650733947754, "lambda_div_used": 0.5535080209374428, "learning_rate": 5.437170188473847e-07, "loss": 0.0176, "reward": 0.026296844705939293, "reward_after_mean": 0.026296844705939293, "reward_after_std": 0.47010152228176594, "reward_before_mean": 0.6100956231821328, "reward_before_std": 0.24391112057492137, "reward_change_max": 0.0, "reward_change_mean": -0.5837987624108791, "reward_change_min": -0.8103098906576633, "reward_change_std": 0.3077183160930872, "reward_std": 0.4701015278697014, "rewards/accuracy_reward": 0.4166666679084301, "rewards/cosine_scaled_reward": 0.19342893542489037, "step": 278 }, { "clip_fraction": 0.0, "completion_length": 3132.5416717529297, "epoch": 0.31885714285714284, "grad_norm": 0.021023932844400406, "kl": 0.000225067138671875, "lambda_div_used": 0.6101865917444229, "learning_rate": 5.405759110524894e-07, "loss": 0.0252, "reward": -0.24995685555040836, "reward_after_mean": -0.24995685555040836, "reward_after_std": 0.5465981848537922, "reward_before_mean": 0.00569869764149189, "reward_before_std": 0.5138198006898165, "reward_change_max": 0.0, "reward_change_mean": -0.25565553829073906, "reward_change_min": -0.42565521970391273, "reward_change_std": 0.1681571202352643, "reward_std": 0.5465981848537922, "rewards/accuracy_reward": 0.14583333395421505, "rewards/cosine_scaled_reward": -0.14013465493917465, "step": 279 }, { "clip_fraction": 0.0, "completion_length": 2124.208381652832, "epoch": 0.32, "grad_norm": 0.03544396162033081, "kl": 0.000164031982421875, "lambda_div_used": 0.6441325098276138, "learning_rate": 5.37435262574394e-07, "loss": 0.0491, "reward": 0.1208603996783495, "reward_after_mean": 0.1208603996783495, "reward_after_std": 0.6665525771677494, "reward_before_mean": 0.48556846380233765, "reward_before_std": 0.6708142012357712, "reward_change_max": 0.0, "reward_change_mean": -0.364708062261343, "reward_change_min": -0.6562114134430885, "reward_change_std": 0.25303495209664106, "reward_std": 0.66655258461833, "rewards/accuracy_reward": 0.354166679084301, "rewards/cosine_scaled_reward": 0.13140178471803665, "step": 280 }, { "clip_fraction": 0.0, "completion_length": 3433.6459045410156, "epoch": 0.3211428571428571, "grad_norm": 0.0178577471524477, "kl": 0.00023686885833740234, "lambda_div_used": 0.5564139187335968, "learning_rate": 5.342952264838747e-07, "loss": 0.0092, "reward": -0.4361804537475109, "reward_after_mean": -0.4361804537475109, "reward_after_std": 0.3096983712166548, "reward_before_mean": -0.1626793835312128, "reward_before_std": 0.2517517115920782, "reward_change_max": 0.0, "reward_change_mean": -0.2735010664910078, "reward_change_min": -0.44846677780151367, "reward_change_std": 0.1593692358583212, "reward_std": 0.3096983730792999, "rewards/accuracy_reward": 0.0625, "rewards/cosine_scaled_reward": -0.22517940029501915, "step": 281 }, { "clip_fraction": 0.0, "completion_length": 2314.645866394043, "epoch": 0.3222857142857143, "grad_norm": 0.023510051891207695, "kl": 0.00020015239715576172, "lambda_div_used": 0.5891857892274857, "learning_rate": 5.311559558218603e-07, "loss": -0.0533, "reward": 0.015196382999420166, "reward_after_mean": 0.015196382999420166, "reward_after_std": 0.506258824840188, "reward_before_mean": 0.4649945506826043, "reward_before_std": 0.40477199107408524, "reward_change_max": 0.0, "reward_change_mean": -0.449798122048378, "reward_change_min": -0.6414072066545486, "reward_change_std": 0.2532971305772662, "reward_std": 0.5062588378787041, "rewards/accuracy_reward": 0.35416667722165585, "rewards/cosine_scaled_reward": 0.11082786321640015, "step": 282 }, { "clip_fraction": 0.0, "completion_length": 2393.854232788086, "epoch": 0.32342857142857145, "grad_norm": 0.02048996463418007, "kl": 0.00015240907669067383, "lambda_div_used": 0.587490864098072, "learning_rate": 5.28017603591974e-07, "loss": 0.0139, "reward": 0.30065850354731083, "reward_after_mean": 0.30065850354731083, "reward_after_std": 0.5727098472416401, "reward_before_mean": 0.9280985994264483, "reward_before_std": 0.3965794490650296, "reward_change_max": 0.0, "reward_change_mean": -0.6274400968104601, "reward_change_min": -0.8531196974217892, "reward_change_std": 0.3378349719569087, "reward_std": 0.5727098621428013, "rewards/accuracy_reward": 0.6041666772216558, "rewards/cosine_scaled_reward": 0.3239319231361151, "step": 283 }, { "clip_fraction": 0.0, "completion_length": 2126.020851135254, "epoch": 0.32457142857142857, "grad_norm": 0.028623223304748535, "kl": 0.00013819336891174316, "lambda_div_used": 0.6661800816655159, "learning_rate": 5.248803227530763e-07, "loss": 0.0092, "reward": 0.09131545946002007, "reward_after_mean": 0.09131545946002007, "reward_after_std": 0.7503824215382338, "reward_before_mean": 0.4026043973863125, "reward_before_std": 0.788893286138773, "reward_change_max": 0.0, "reward_change_mean": -0.3112889491021633, "reward_change_min": -0.6233196444809437, "reward_change_std": 0.24689115211367607, "reward_std": 0.7503824215382338, "rewards/accuracy_reward": 0.3125000037252903, "rewards/cosine_scaled_reward": 0.09010439366102219, "step": 284 }, { "clip_fraction": 0.0, "completion_length": 1871.5000228881836, "epoch": 0.32571428571428573, "grad_norm": 0.02662411518394947, "kl": 0.00016036629676818848, "lambda_div_used": 0.576582707464695, "learning_rate": 5.21744266211809e-07, "loss": 0.0319, "reward": -0.3101608529686928, "reward_after_mean": -0.3101608529686928, "reward_after_std": 0.41523571871221066, "reward_before_mean": -0.01919533498585224, "reward_before_std": 0.3510099109262228, "reward_change_max": 0.0, "reward_change_mean": -0.2909655049443245, "reward_change_min": -0.4455920048058033, "reward_change_std": 0.16767465602606535, "reward_std": 0.4152357243001461, "rewards/accuracy_reward": 0.10416666977107525, "rewards/cosine_scaled_reward": -0.12336201290600002, "step": 285 }, { "clip_fraction": 0.0, "completion_length": 2349.000030517578, "epoch": 0.32685714285714285, "grad_norm": 0.02408268116414547, "kl": 0.00019100308418273926, "lambda_div_used": 0.6201390102505684, "learning_rate": 5.186095868151436e-07, "loss": 0.0307, "reward": 0.21592768095433712, "reward_after_mean": 0.21592768095433712, "reward_after_std": 0.66009739972651, "reward_before_mean": 0.7218069694936275, "reward_before_std": 0.554044695571065, "reward_change_max": 0.0, "reward_change_mean": -0.5058793053030968, "reward_change_min": -0.7697652019560337, "reward_change_std": 0.2988813826814294, "reward_std": 0.6600974258035421, "rewards/accuracy_reward": 0.5000000093132257, "rewards/cosine_scaled_reward": 0.22180695831775665, "step": 286 }, { "clip_fraction": 0.0, "completion_length": 1718.604175567627, "epoch": 0.328, "grad_norm": 0.03919753059744835, "kl": 0.00013002753257751465, "lambda_div_used": 0.5975622236728668, "learning_rate": 5.154764373429315e-07, "loss": -0.1056, "reward": 0.03168256084609311, "reward_after_mean": 0.03168256084609311, "reward_after_std": 0.5345403701066971, "reward_before_mean": 0.4581381119787693, "reward_before_std": 0.44751388020813465, "reward_change_max": 0.0, "reward_change_mean": -0.42645558528602123, "reward_change_min": -0.617030244320631, "reward_change_std": 0.24450463335961103, "reward_std": 0.5345403775572777, "rewards/accuracy_reward": 0.33333334140479565, "rewards/cosine_scaled_reward": 0.12480480223894119, "step": 287 }, { "clip_fraction": 0.0, "completion_length": 2906.3541717529297, "epoch": 0.3291428571428571, "grad_norm": 0.019498826935887337, "kl": 0.00022274255752563477, "lambda_div_used": 0.5806883201003075, "learning_rate": 5.123449705004581e-07, "loss": 0.0055, "reward": -0.18337237276136875, "reward_after_mean": -0.18337237276136875, "reward_after_std": 0.4604283105581999, "reward_before_mean": 0.1833638995885849, "reward_before_std": 0.3699948964640498, "reward_change_max": 0.0, "reward_change_mean": -0.36673627234995365, "reward_change_min": -0.5305228792130947, "reward_change_std": 0.20658569782972336, "reward_std": 0.4604283291846514, "rewards/accuracy_reward": 0.22916666977107525, "rewards/cosine_scaled_reward": -0.04580276645720005, "step": 288 }, { "clip_fraction": 0.0, "completion_length": 2202.666702270508, "epoch": 0.3302857142857143, "grad_norm": 0.033871494233608246, "kl": 0.00016301870346069336, "lambda_div_used": 0.5840093046426773, "learning_rate": 5.09215338910999e-07, "loss": -0.0478, "reward": 0.007814206182956696, "reward_after_mean": 0.007814206182956696, "reward_after_std": 0.4950754214078188, "reward_before_mean": 0.46548917703330517, "reward_before_std": 0.381549178622663, "reward_change_max": 0.0, "reward_change_mean": -0.45767495781183243, "reward_change_min": -0.6326170898973942, "reward_change_std": 0.24920715391635895, "reward_std": 0.49507543072104454, "rewards/accuracy_reward": 0.37500000558793545, "rewards/cosine_scaled_reward": 0.09048915281891823, "step": 289 }, { "clip_fraction": 0.0, "completion_length": 1533.6042251586914, "epoch": 0.3314285714285714, "grad_norm": 0.032007429748773575, "kl": 0.00014695525169372559, "lambda_div_used": 0.6211200878024101, "learning_rate": 5.060876951083828e-07, "loss": 0.0764, "reward": 0.04068056936375797, "reward_after_mean": 0.04068056936375797, "reward_after_std": 0.6179232522845268, "reward_before_mean": 0.4251148612238467, "reward_before_std": 0.556100070476532, "reward_change_max": 0.0, "reward_change_mean": -0.3844342865049839, "reward_change_min": -0.5800922103226185, "reward_change_std": 0.22699587792158127, "reward_std": 0.617923267185688, "rewards/accuracy_reward": 0.3333333395421505, "rewards/cosine_scaled_reward": 0.09178150352090597, "step": 290 }, { "clip_fraction": 0.0, "completion_length": 1996.187515258789, "epoch": 0.3325714285714286, "grad_norm": 0.024394486099481583, "kl": 0.00016900897026062012, "lambda_div_used": 0.67839565128088, "learning_rate": 5.02962191529556e-07, "loss": 0.0077, "reward": 0.09233464859426022, "reward_after_mean": 0.09233464859426022, "reward_after_std": 0.8132282309234142, "reward_before_mean": 0.3814069051295519, "reward_before_std": 0.8372865226119757, "reward_change_max": 0.0, "reward_change_mean": -0.2890722490847111, "reward_change_min": -0.5995339304208755, "reward_change_std": 0.22939695976674557, "reward_std": 0.8132282607257366, "rewards/accuracy_reward": 0.31250000558793545, "rewards/cosine_scaled_reward": 0.06890689395368099, "step": 291 }, { "clip_fraction": 0.0, "completion_length": 2611.541702270508, "epoch": 0.33371428571428574, "grad_norm": 0.022669553756713867, "kl": 0.00019761919975280762, "lambda_div_used": 0.6018925532698631, "learning_rate": 4.998389805071536e-07, "loss": 0.0303, "reward": -0.11102894321084023, "reward_after_mean": -0.11102894321084023, "reward_after_std": 0.5624180883169174, "reward_before_mean": 0.24086102936416864, "reward_before_std": 0.4718956621363759, "reward_change_max": 0.0, "reward_change_mean": -0.35188993997871876, "reward_change_min": -0.5550865493714809, "reward_change_std": 0.21156923100352287, "reward_std": 0.5624181143939495, "rewards/accuracy_reward": 0.27083333767950535, "rewards/cosine_scaled_reward": -0.029972338117659092, "step": 292 }, { "clip_fraction": 0.0, "completion_length": 2105.812545776367, "epoch": 0.33485714285714285, "grad_norm": 0.022291820496320724, "kl": 0.00020776689052581787, "lambda_div_used": 0.5899104326963425, "learning_rate": 4.967182142620745e-07, "loss": -0.0174, "reward": -0.15591239370405674, "reward_after_mean": -0.15591239370405674, "reward_after_std": 0.4546964541077614, "reward_before_mean": 0.18034806847572327, "reward_before_std": 0.4065048359334469, "reward_change_max": 0.0, "reward_change_mean": -0.33626046776771545, "reward_change_min": -0.5091775916516781, "reward_change_std": 0.19646561425179243, "reward_std": 0.4546964690089226, "rewards/accuracy_reward": 0.22916667722165585, "rewards/cosine_scaled_reward": -0.048818591982126236, "step": 293 }, { "clip_fraction": 0.0, "completion_length": 3101.562545776367, "epoch": 0.336, "grad_norm": 0.020145803689956665, "kl": 0.0002519190311431885, "lambda_div_used": 0.5795014202594757, "learning_rate": 4.93600044896063e-07, "loss": -0.0181, "reward": -0.29703800566494465, "reward_after_mean": -0.29703800566494465, "reward_after_std": 0.4137891363352537, "reward_before_mean": -0.004143683239817619, "reward_before_std": 0.3665550462901592, "reward_change_max": 0.0, "reward_change_mean": -0.2928943391889334, "reward_change_min": -0.4820845164358616, "reward_change_std": 0.18245024606585503, "reward_std": 0.4137891549617052, "rewards/accuracy_reward": 0.1250000037252903, "rewards/cosine_scaled_reward": -0.1291436767205596, "step": 294 }, { "clip_fraction": 0.0, "completion_length": 2967.1250076293945, "epoch": 0.33714285714285713, "grad_norm": 0.03346420079469681, "kl": 0.00022870302200317383, "lambda_div_used": 0.5998342111706734, "learning_rate": 4.904846243842949e-07, "loss": 0.0317, "reward": -0.21130692400038242, "reward_after_mean": -0.21130692400038242, "reward_after_std": 0.4944173116236925, "reward_before_mean": 0.07879196340218186, "reward_before_std": 0.4600360617041588, "reward_change_max": 0.0, "reward_change_mean": -0.2900988757610321, "reward_change_min": -0.47371478378772736, "reward_change_std": 0.18294072337448597, "reward_std": 0.49441731721162796, "rewards/accuracy_reward": 0.16666666977107525, "rewards/cosine_scaled_reward": -0.08787472359836102, "step": 295 }, { "clip_fraction": 0.0, "completion_length": 2934.3541870117188, "epoch": 0.3382857142857143, "grad_norm": 0.024312211200594902, "kl": 0.00023806095123291016, "lambda_div_used": 0.5608572289347649, "learning_rate": 4.873721045679706e-07, "loss": 0.0627, "reward": -0.18713558092713356, "reward_after_mean": -0.18713558092713356, "reward_after_std": 0.37113036401569843, "reward_before_mean": 0.2144376989454031, "reward_before_std": 0.2741664042696357, "reward_change_max": 0.0, "reward_change_mean": -0.40157328359782696, "reward_change_min": -0.5742517001926899, "reward_change_std": 0.22423129715025425, "reward_std": 0.3711303863674402, "rewards/accuracy_reward": 0.2291666716337204, "rewards/cosine_scaled_reward": -0.01472897082567215, "step": 296 }, { "clip_fraction": 0.0, "completion_length": 3350.8958435058594, "epoch": 0.3394285714285714, "grad_norm": 0.016911419108510017, "kl": 0.0002732276916503906, "lambda_div_used": 0.6085675731301308, "learning_rate": 4.842626371469149e-07, "loss": -0.0001, "reward": -0.13465989474207163, "reward_after_mean": -0.13465989474207163, "reward_after_std": 0.5554658677428961, "reward_before_mean": 0.1848100395873189, "reward_before_std": 0.494691526517272, "reward_change_max": 0.0, "reward_change_mean": -0.31946992687880993, "reward_change_min": -0.5050319209694862, "reward_change_std": 0.18696323037147522, "reward_std": 0.5554658826440573, "rewards/accuracy_reward": 0.2291666679084301, "rewards/cosine_scaled_reward": -0.04435660713352263, "step": 297 }, { "clip_fraction": 0.0, "completion_length": 2694.791748046875, "epoch": 0.3405714285714286, "grad_norm": 0.022230952978134155, "kl": 0.0001736283302307129, "lambda_div_used": 0.585621178150177, "learning_rate": 4.811563736721829e-07, "loss": -0.0641, "reward": -0.09131219866685569, "reward_after_mean": -0.09131219866685569, "reward_after_std": 0.521870668977499, "reward_before_mean": 0.32360453344881535, "reward_before_std": 0.3913488043472171, "reward_change_max": 0.0, "reward_change_mean": -0.41491674818098545, "reward_change_min": -0.6092600487172604, "reward_change_std": 0.22935225442051888, "reward_std": 0.5218706801533699, "rewards/accuracy_reward": 0.31250000186264515, "rewards/cosine_scaled_reward": 0.011104530887678266, "step": 298 }, { "clip_fraction": 0.0, "completion_length": 3055.562530517578, "epoch": 0.3417142857142857, "grad_norm": 0.016956690698862076, "kl": 0.00024694204330444336, "lambda_div_used": 0.6225644424557686, "learning_rate": 4.780534655386743e-07, "loss": -0.0092, "reward": 0.035534653812646866, "reward_after_mean": 0.035534653812646866, "reward_after_std": 0.5773687828332186, "reward_before_mean": 0.4040503818541765, "reward_before_std": 0.5686829779297113, "reward_change_max": 0.0, "reward_change_mean": -0.3685157597064972, "reward_change_min": -0.6160638965666294, "reward_change_std": 0.24328476376831532, "reward_std": 0.5773687846958637, "rewards/accuracy_reward": 0.3125000111758709, "rewards/cosine_scaled_reward": 0.09155038185417652, "step": 299 }, { "clip_fraction": 0.0, "completion_length": 3382.041717529297, "epoch": 0.34285714285714286, "grad_norm": 0.018982913345098495, "kl": 0.00028121471405029297, "lambda_div_used": 0.5635487735271454, "learning_rate": 4.749540639777539e-07, "loss": 0.0416, "reward": -0.35363302007317543, "reward_after_mean": -0.35363302007317543, "reward_after_std": 0.3348991144448519, "reward_before_mean": -0.05836603417992592, "reward_before_std": 0.2893520062789321, "reward_change_max": 0.0, "reward_change_mean": -0.2952669896185398, "reward_change_min": -0.45902693271636963, "reward_change_std": 0.17492949962615967, "reward_std": 0.33489912562072277, "rewards/accuracy_reward": 0.0833333358168602, "rewards/cosine_scaled_reward": -0.14169937558472157, "step": 300 }, { "clip_fraction": 0.0, "completion_length": 2189.666732788086, "epoch": 0.344, "grad_norm": 0.02743070013821125, "kl": 0.00020772218704223633, "lambda_div_used": 0.6162105649709702, "learning_rate": 4.7185832004988133e-07, "loss": 0.077, "reward": -0.2270398661494255, "reward_after_mean": -0.2270398661494255, "reward_after_std": 0.5807012170553207, "reward_before_mean": 0.03607312589883804, "reward_before_std": 0.5330064725130796, "reward_change_max": 0.0, "reward_change_mean": -0.263112997636199, "reward_change_min": -0.46500347927212715, "reward_change_std": 0.16810790356248617, "reward_std": 0.5807012394070625, "rewards/accuracy_reward": 0.1458333358168602, "rewards/cosine_scaled_reward": -0.10976020619273186, "step": 301 }, { "clip_fraction": 0.0, "completion_length": 2271.9167098999023, "epoch": 0.34514285714285714, "grad_norm": 0.03009362705051899, "kl": 0.00019848346710205078, "lambda_div_used": 0.6102809309959412, "learning_rate": 4.68766384637248e-07, "loss": -0.0061, "reward": 0.1712653641588986, "reward_after_mean": 0.1712653641588986, "reward_after_std": 0.6374101359397173, "reward_before_mean": 0.6755810640752316, "reward_before_std": 0.5059886500239372, "reward_change_max": 0.0, "reward_change_mean": -0.5043157208710909, "reward_change_min": -0.7193296477198601, "reward_change_std": 0.2849651984870434, "reward_std": 0.637410145252943, "rewards/accuracy_reward": 0.4583333395421505, "rewards/cosine_scaled_reward": 0.21724772220477462, "step": 302 }, { "clip_fraction": 0.0, "completion_length": 2379.8125381469727, "epoch": 0.3462857142857143, "grad_norm": 0.028283070772886276, "kl": 0.00022455304861068726, "lambda_div_used": 0.5937864035367966, "learning_rate": 4.656784084364238e-07, "loss": -0.0542, "reward": -0.11667206266429275, "reward_after_mean": -0.11667206266429275, "reward_after_std": 0.46380676329135895, "reward_before_mean": 0.24170983396470547, "reward_before_std": 0.4273997135460377, "reward_change_max": 0.0, "reward_change_mean": -0.3583819102495909, "reward_change_min": -0.5776236318051815, "reward_change_std": 0.22200697474181652, "reward_std": 0.4638067800551653, "rewards/accuracy_reward": 0.2500000111758709, "rewards/cosine_scaled_reward": -0.008290180005133152, "step": 303 }, { "clip_fraction": 0.0, "completion_length": 2483.270881652832, "epoch": 0.3474285714285714, "grad_norm": 0.02691042050719261, "kl": 0.0002352595329284668, "lambda_div_used": 0.6489763781428337, "learning_rate": 4.6259454195101267e-07, "loss": -0.0177, "reward": -0.052505167201161385, "reward_after_mean": -0.052505167201161385, "reward_after_std": 0.7108322139829397, "reward_before_mean": 0.2283354545943439, "reward_before_std": 0.6949998550117016, "reward_change_max": 0.0, "reward_change_mean": -0.28084064833819866, "reward_change_min": -0.5053656212985516, "reward_change_std": 0.1956273503601551, "reward_std": 0.7108322307467461, "rewards/accuracy_reward": 0.2291666716337204, "rewards/cosine_scaled_reward": -0.00083120446652174, "step": 304 }, { "clip_fraction": 0.0, "completion_length": 2814.229232788086, "epoch": 0.3485714285714286, "grad_norm": 0.023387128487229347, "kl": 0.0002447366714477539, "lambda_div_used": 0.5986855253577232, "learning_rate": 4.59514935484316e-07, "loss": -0.0237, "reward": -0.17212717607617378, "reward_after_mean": -0.17212717607617378, "reward_after_std": 0.5009447801858187, "reward_before_mean": 0.13333414122462273, "reward_before_std": 0.45383385568857193, "reward_change_max": 0.0, "reward_change_mean": -0.30546131171286106, "reward_change_min": -0.460513886064291, "reward_change_std": 0.1838802546262741, "reward_std": 0.5009447950869799, "rewards/accuracy_reward": 0.20833334140479565, "rewards/cosine_scaled_reward": -0.07499920274131, "step": 305 }, { "clip_fraction": 0.0, "completion_length": 2095.0000190734863, "epoch": 0.3497142857142857, "grad_norm": 0.03199386969208717, "kl": 0.00018633902072906494, "lambda_div_used": 0.595753937959671, "learning_rate": 4.5643973913200837e-07, "loss": 0.0245, "reward": 0.03899537643883377, "reward_after_mean": 0.03899537643883377, "reward_after_std": 0.528584310784936, "reward_before_mean": 0.49805059214122593, "reward_before_std": 0.4409319751430303, "reward_change_max": 0.0, "reward_change_mean": -0.45905524492263794, "reward_change_min": -0.6990172192454338, "reward_change_std": 0.2754313191398978, "reward_std": 0.528584323823452, "rewards/accuracy_reward": 0.3541666753590107, "rewards/cosine_scaled_reward": 0.14388393727131188, "step": 306 }, { "clip_fraction": 0.0, "completion_length": 2299.4166984558105, "epoch": 0.35085714285714287, "grad_norm": 0.030135195702314377, "kl": 0.00022998452186584473, "lambda_div_used": 0.6275613307952881, "learning_rate": 4.5336910277482155e-07, "loss": 0.024, "reward": -0.1547635430470109, "reward_after_mean": -0.1547635430470109, "reward_after_std": 0.6599444597959518, "reward_before_mean": 0.11526766640599817, "reward_before_std": 0.586240291595459, "reward_change_max": 0.0, "reward_change_mean": -0.270031226798892, "reward_change_min": -0.40235158428549767, "reward_change_std": 0.15426483657211065, "reward_std": 0.6599444709718227, "rewards/accuracy_reward": 0.1666666716337204, "rewards/cosine_scaled_reward": -0.05139899626374245, "step": 307 }, { "clip_fraction": 0.0, "completion_length": 3160.000030517578, "epoch": 0.352, "grad_norm": 0.018670011311769485, "kl": 0.00022402405738830566, "lambda_div_used": 0.6338188126683235, "learning_rate": 4.503031760712397e-07, "loss": -0.0036, "reward": -0.02171214483678341, "reward_after_mean": -0.02171214483678341, "reward_after_std": 0.6237165722995996, "reward_before_mean": 0.29556242609396577, "reward_before_std": 0.6284392019733787, "reward_change_max": 0.0, "reward_change_mean": -0.3172745667397976, "reward_change_min": -0.5880400538444519, "reward_change_std": 0.227503115311265, "reward_std": 0.6237165946513414, "rewards/accuracy_reward": 0.2708333395421505, "rewards/cosine_scaled_reward": 0.024729080265387893, "step": 308 }, { "clip_fraction": 0.0, "completion_length": 2601.9583740234375, "epoch": 0.35314285714285715, "grad_norm": 0.02388334833085537, "kl": 0.00020259618759155273, "lambda_div_used": 0.645737886428833, "learning_rate": 4.4724210845020494e-07, "loss": 0.0479, "reward": 0.09015273489058018, "reward_after_mean": 0.09015273489058018, "reward_after_std": 0.6656354945152998, "reward_before_mean": 0.4416396114975214, "reward_before_std": 0.6783724967390299, "reward_change_max": 0.0, "reward_change_mean": -0.3514868915081024, "reward_change_min": -0.5935809202492237, "reward_change_std": 0.24138008058071136, "reward_std": 0.6656355243176222, "rewards/accuracy_reward": 0.33333334140479565, "rewards/cosine_scaled_reward": 0.10830628499388695, "step": 309 }, { "clip_fraction": 0.0, "completion_length": 2018.3542022705078, "epoch": 0.35428571428571426, "grad_norm": 0.030827393755316734, "kl": 0.00020694732666015625, "lambda_div_used": 0.6246318891644478, "learning_rate": 4.441860491038345e-07, "loss": 0.0364, "reward": -0.13591936416924, "reward_after_mean": -0.13591936416924, "reward_after_std": 0.590507235378027, "reward_before_mean": 0.14517710404470563, "reward_before_std": 0.5790550196543336, "reward_change_max": 0.0, "reward_change_mean": -0.28109647892415524, "reward_change_min": -0.5089371241629124, "reward_change_std": 0.1950351819396019, "reward_std": 0.5905072540044785, "rewards/accuracy_reward": 0.20833333767950535, "rewards/cosine_scaled_reward": -0.06315623363479972, "step": 310 }, { "clip_fraction": 0.0, "completion_length": 2212.083335876465, "epoch": 0.3554285714285714, "grad_norm": 0.025455351918935776, "kl": 0.00022774934768676758, "lambda_div_used": 0.5723904073238373, "learning_rate": 4.4113514698014953e-07, "loss": -0.0105, "reward": 0.04124009236693382, "reward_after_mean": 0.04124009236693382, "reward_after_std": 0.47586701065301895, "reward_before_mean": 0.5763039644807577, "reward_before_std": 0.3296704487875104, "reward_change_max": 0.0, "reward_change_mean": -0.5350638851523399, "reward_change_min": -0.7587380260229111, "reward_change_std": 0.295849135145545, "reward_std": 0.47586701437830925, "rewards/accuracy_reward": 0.3958333395421505, "rewards/cosine_scaled_reward": 0.18047063890844584, "step": 311 }, { "clip_fraction": 0.0, "completion_length": 2184.833351135254, "epoch": 0.3565714285714286, "grad_norm": 0.030652416869997978, "kl": 0.0002588629722595215, "lambda_div_used": 0.5729342699050903, "learning_rate": 4.3808955077581546e-07, "loss": -0.0417, "reward": 0.07517872378230095, "reward_after_mean": 0.07517872378230095, "reward_after_std": 0.4695996157824993, "reward_before_mean": 0.596075527369976, "reward_before_std": 0.333228693343699, "reward_change_max": 0.0, "reward_change_mean": -0.5208968166261911, "reward_change_min": -0.7326644062995911, "reward_change_std": 0.28812805004417896, "reward_std": 0.46959962509572506, "rewards/accuracy_reward": 0.4375000074505806, "rewards/cosine_scaled_reward": 0.15857553109526634, "step": 312 }, { "clip_fraction": 0.0, "completion_length": 2586.1875534057617, "epoch": 0.3577142857142857, "grad_norm": 0.022464681416749954, "kl": 0.00023896992206573486, "lambda_div_used": 0.5898676738142967, "learning_rate": 4.350494089288943e-07, "loss": 0.0132, "reward": 0.004749574698507786, "reward_after_mean": 0.004749574698507786, "reward_after_std": 0.5146205350756645, "reward_before_mean": 0.4418492801487446, "reward_before_std": 0.41057482920587063, "reward_change_max": 0.0, "reward_change_mean": -0.4370996989309788, "reward_change_min": -0.6041509285569191, "reward_change_std": 0.24443841353058815, "reward_std": 0.5146205425262451, "rewards/accuracy_reward": 0.35416667722165585, "rewards/cosine_scaled_reward": 0.08768259733915329, "step": 313 }, { "clip_fraction": 0.0, "completion_length": 2277.68754196167, "epoch": 0.3588571428571429, "grad_norm": 0.029802966862916946, "kl": 0.00015804357826709747, "lambda_div_used": 0.57961256057024, "learning_rate": 4.3201486961161093e-07, "loss": -0.0058, "reward": 0.04087065905332565, "reward_after_mean": 0.04087065905332565, "reward_after_std": 0.491618013009429, "reward_before_mean": 0.5418765433132648, "reward_before_std": 0.3644657013937831, "reward_change_max": 0.0, "reward_change_mean": -0.5010058786720037, "reward_change_min": -0.7266513183712959, "reward_change_std": 0.2839649748057127, "reward_std": 0.4916180297732353, "rewards/accuracy_reward": 0.3958333358168602, "rewards/cosine_scaled_reward": 0.14604321867227554, "step": 314 }, { "clip_fraction": 0.0, "completion_length": 2722.479179382324, "epoch": 0.36, "grad_norm": 0.02163972705602646, "kl": 0.00025177001953125, "lambda_div_used": 0.5554845109581947, "learning_rate": 4.2898608072313045e-07, "loss": 0.0302, "reward": -0.1392173320055008, "reward_after_mean": -0.1392173320055008, "reward_after_std": 0.421754639595747, "reward_before_mean": 0.3322529550641775, "reward_before_std": 0.2525685231667012, "reward_change_max": 0.0, "reward_change_mean": -0.4714703354984522, "reward_change_min": -0.6779767945408821, "reward_change_std": 0.2537938868626952, "reward_std": 0.4217546433210373, "rewards/accuracy_reward": 0.2916666679084301, "rewards/cosine_scaled_reward": 0.04058632627129555, "step": 315 }, { "clip_fraction": 0.0, "completion_length": 3430.4166870117188, "epoch": 0.36114285714285715, "grad_norm": 0.021473117172718048, "kl": 0.00034546852111816406, "lambda_div_used": 0.5831886008381844, "learning_rate": 4.2596318988235037e-07, "loss": 0.0331, "reward": -0.20306292921304703, "reward_after_mean": -0.20306292921304703, "reward_after_std": 0.4045538082718849, "reward_before_mean": 0.12524887174367905, "reward_before_std": 0.37981976941227913, "reward_change_max": 0.0, "reward_change_mean": -0.3283117860555649, "reward_change_min": -0.5172865837812424, "reward_change_std": 0.20252829603850842, "reward_std": 0.40455381385982037, "rewards/accuracy_reward": 0.1875000074505806, "rewards/cosine_scaled_reward": -0.062251146882772446, "step": 316 }, { "clip_fraction": 0.0, "completion_length": 2792.6250076293945, "epoch": 0.36228571428571427, "grad_norm": 0.023409582674503326, "kl": 0.00030331313610076904, "lambda_div_used": 0.5606855005025864, "learning_rate": 4.2294634442070553e-07, "loss": 0.0162, "reward": -0.25401476211845875, "reward_after_mean": -0.25401476211845875, "reward_after_std": 0.39204780384898186, "reward_before_mean": 0.13540820218622684, "reward_before_std": 0.2758419858291745, "reward_change_max": 0.0, "reward_change_mean": -0.38942296989262104, "reward_change_min": -0.5835930369794369, "reward_change_std": 0.21769424341619015, "reward_std": 0.39204781129956245, "rewards/accuracy_reward": 0.1666666679084301, "rewards/cosine_scaled_reward": -0.03125846944749355, "step": 317 }, { "clip_fraction": 0.0, "completion_length": 1197.1875228881836, "epoch": 0.36342857142857143, "grad_norm": 0.044679004698991776, "kl": 0.00012731552124023438, "lambda_div_used": 0.6002074480056763, "learning_rate": 4.1993569137498776e-07, "loss": -0.0198, "reward": 0.03343228530138731, "reward_after_mean": 0.03343228530138731, "reward_after_std": 0.5209389794617891, "reward_before_mean": 0.4655236080288887, "reward_before_std": 0.45914868731051683, "reward_change_max": 0.0, "reward_change_mean": -0.4320913068950176, "reward_change_min": -0.6742168106138706, "reward_change_std": 0.2611297369003296, "reward_std": 0.5209389794617891, "rewards/accuracy_reward": 0.3541666753590107, "rewards/cosine_scaled_reward": 0.1113569182343781, "step": 318 }, { "clip_fraction": 0.0, "completion_length": 2743.4167098999023, "epoch": 0.36457142857142855, "grad_norm": 0.03044748492538929, "kl": 0.0002751946449279785, "lambda_div_used": 0.5535945892333984, "learning_rate": 4.1693137748017915e-07, "loss": 0.0111, "reward": -0.4908355651423335, "reward_after_mean": -0.4908355651423335, "reward_after_std": 0.3172128964215517, "reward_before_mean": -0.24739529378712177, "reward_before_std": 0.23890432622283697, "reward_change_max": 0.0, "reward_change_mean": -0.2434402648359537, "reward_change_min": -0.35953374207019806, "reward_change_std": 0.13119421433657408, "reward_std": 0.31721290200948715, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2682286258786917, "step": 319 }, { "clip_fraction": 0.0, "completion_length": 1857.3333587646484, "epoch": 0.3657142857142857, "grad_norm": 0.03547307848930359, "kl": 0.00026220083236694336, "lambda_div_used": 0.6022170931100845, "learning_rate": 4.1393354916230005e-07, "loss": 0.0462, "reward": -0.125480268150568, "reward_after_mean": -0.125480268150568, "reward_after_std": 0.505291972309351, "reward_before_mean": 0.19450905406847596, "reward_before_std": 0.4768552405294031, "reward_change_max": 0.0, "reward_change_mean": -0.31998929381370544, "reward_change_min": -0.5316651687026024, "reward_change_std": 0.20880178455263376, "reward_std": 0.505291985347867, "rewards/accuracy_reward": 0.2291666753590107, "rewards/cosine_scaled_reward": -0.034657632233574986, "step": 320 }, { "clip_fraction": 0.0, "completion_length": 1452.770881652832, "epoch": 0.3668571428571429, "grad_norm": 0.03124224953353405, "kl": 0.00018140673637390137, "lambda_div_used": 0.6199431717395782, "learning_rate": 4.1094235253127374e-07, "loss": -0.0599, "reward": 0.282286923378706, "reward_after_mean": 0.282286923378706, "reward_after_std": 0.6488686576485634, "reward_before_mean": 0.8242554701864719, "reward_before_std": 0.5566414860077202, "reward_change_max": 0.0, "reward_change_mean": -0.5419685430824757, "reward_change_min": -0.8236861452460289, "reward_change_std": 0.33083922043442726, "reward_std": 0.648868665099144, "rewards/accuracy_reward": 0.5625000074505806, "rewards/cosine_scaled_reward": 0.26175545156002045, "step": 321 }, { "clip_fraction": 0.0, "completion_length": 2662.50004196167, "epoch": 0.368, "grad_norm": 0.036272305995225906, "kl": 0.0003364086151123047, "lambda_div_used": 0.6552734896540642, "learning_rate": 4.079579333738039e-07, "loss": 0.0123, "reward": -0.039356768131256104, "reward_after_mean": -0.039356768131256104, "reward_after_std": 0.7184183727949858, "reward_before_mean": 0.23175650835037231, "reward_before_std": 0.7305669207125902, "reward_change_max": 0.0, "reward_change_mean": -0.2711132802069187, "reward_change_min": -0.5112780928611755, "reward_change_std": 0.20828023366630077, "reward_std": 0.7184183821082115, "rewards/accuracy_reward": 0.27083333395421505, "rewards/cosine_scaled_reward": -0.03907683305442333, "step": 322 }, { "clip_fraction": 0.0, "completion_length": 2875.937530517578, "epoch": 0.36914285714285716, "grad_norm": 0.023642728105187416, "kl": 0.0003116130828857422, "lambda_div_used": 0.5762533918023109, "learning_rate": 4.0498043714627006e-07, "loss": -0.0062, "reward": -0.18317513819783926, "reward_after_mean": -0.18317513819783926, "reward_after_std": 0.45206453651189804, "reward_before_mean": 0.1913843434303999, "reward_before_std": 0.34759796876460314, "reward_change_max": 0.0, "reward_change_mean": -0.3745594993233681, "reward_change_min": -0.5339466538280249, "reward_change_std": 0.2068865867331624, "reward_std": 0.4520645458251238, "rewards/accuracy_reward": 0.22916666977107525, "rewards/cosine_scaled_reward": -0.037782331462949514, "step": 323 }, { "clip_fraction": 0.0, "completion_length": 2517.541732788086, "epoch": 0.3702857142857143, "grad_norm": 0.02370315045118332, "kl": 0.0002675652503967285, "lambda_div_used": 0.582614079117775, "learning_rate": 4.020100089676376e-07, "loss": 0.0264, "reward": -0.09797720052301884, "reward_after_mean": -0.09797720052301884, "reward_after_std": 0.46359810046851635, "reward_before_mean": 0.30637714080512524, "reward_before_std": 0.37786578573286533, "reward_change_max": 0.0, "reward_change_mean": -0.4043543320149183, "reward_change_min": -0.6221220158040524, "reward_change_std": 0.2347763581201434, "reward_std": 0.46359810046851635, "rewards/accuracy_reward": 0.2708333395421505, "rewards/cosine_scaled_reward": 0.03554379381239414, "step": 324 }, { "clip_fraction": 0.0, "completion_length": 2340.104232788086, "epoch": 0.37142857142857144, "grad_norm": 0.02759523130953312, "kl": 0.0002740621566772461, "lambda_div_used": 0.6287032291293144, "learning_rate": 3.9904679361238526e-07, "loss": 0.0386, "reward": 0.3422952927649021, "reward_after_mean": 0.3422952927649021, "reward_after_std": 0.6593811456114054, "reward_before_mean": 0.8776058983057737, "reward_before_std": 0.5943253133445978, "reward_change_max": 0.0, "reward_change_mean": -0.5353106111288071, "reward_change_min": -0.8339854516088963, "reward_change_std": 0.32581204548478127, "reward_std": 0.6593811772763729, "rewards/accuracy_reward": 0.5416666753590107, "rewards/cosine_scaled_reward": 0.335939209908247, "step": 325 }, { "clip_fraction": 0.0, "completion_length": 2069.0208740234375, "epoch": 0.37257142857142855, "grad_norm": 0.026782048866152763, "kl": 0.00021858513355255127, "lambda_div_used": 0.5514643862843513, "learning_rate": 3.9609093550344907e-07, "loss": 0.0214, "reward": -0.2377523072063923, "reward_after_mean": -0.2377523072063923, "reward_after_std": 0.3420621510595083, "reward_before_mean": 0.17398178996518254, "reward_before_std": 0.2309601987944916, "reward_change_max": 0.0, "reward_change_mean": -0.41173411533236504, "reward_change_min": -0.586280532181263, "reward_change_std": 0.22657191008329391, "reward_std": 0.3420621529221535, "rewards/accuracy_reward": 0.2083333358168602, "rewards/cosine_scaled_reward": -0.034351545851677656, "step": 326 }, { "clip_fraction": 0.0, "completion_length": 2595.583339691162, "epoch": 0.3737142857142857, "grad_norm": 0.023073619231581688, "kl": 0.0003046169877052307, "lambda_div_used": 0.5742950737476349, "learning_rate": 3.931425787051832e-07, "loss": 0.016, "reward": -0.06984845921397209, "reward_after_mean": -0.06984845921397209, "reward_after_std": 0.4958471246063709, "reward_before_mean": 0.39507998805493116, "reward_before_std": 0.33821228239685297, "reward_change_max": 0.0, "reward_change_mean": -0.46492844074964523, "reward_change_min": -0.6655924804508686, "reward_change_std": 0.25432714726775885, "reward_std": 0.49584713764488697, "rewards/accuracy_reward": 0.33333333395421505, "rewards/cosine_scaled_reward": 0.06174664665013552, "step": 327 }, { "clip_fraction": 0.0, "completion_length": 3254.375030517578, "epoch": 0.37485714285714283, "grad_norm": 0.02187371626496315, "kl": 0.0003033876419067383, "lambda_div_used": 0.6039082854986191, "learning_rate": 3.902018669163384e-07, "loss": 0.0465, "reward": -0.22737010568380356, "reward_after_mean": -0.22737010568380356, "reward_after_std": 0.5139794517308474, "reward_before_mean": 0.057241520611569285, "reward_before_std": 0.47974141500890255, "reward_change_max": 0.0, "reward_change_mean": -0.28461163491010666, "reward_change_min": -0.4783761650323868, "reward_change_std": 0.18032598588615656, "reward_std": 0.5139794517308474, "rewards/accuracy_reward": 0.16666667349636555, "rewards/cosine_scaled_reward": -0.10942514054477215, "step": 328 }, { "clip_fraction": 0.0, "completion_length": 1712.7500305175781, "epoch": 0.376, "grad_norm": 0.030909525230526924, "kl": 0.00020560622215270996, "lambda_div_used": 0.5851015225052834, "learning_rate": 3.872689434630585e-07, "loss": 0.0147, "reward": -0.0780089907348156, "reward_after_mean": -0.0780089907348156, "reward_after_std": 0.4830572586506605, "reward_before_mean": 0.3372959513217211, "reward_before_std": 0.39714881777763367, "reward_change_max": 0.0, "reward_change_mean": -0.415304945781827, "reward_change_min": -0.6223411783576012, "reward_change_std": 0.24614232685416937, "reward_std": 0.48305728659033775, "rewards/accuracy_reward": 0.2708333358168602, "rewards/cosine_scaled_reward": 0.06646260246634483, "step": 329 }, { "clip_fraction": 0.0, "completion_length": 2283.437515258789, "epoch": 0.37714285714285717, "grad_norm": 0.03479500487446785, "kl": 0.00036913156509399414, "lambda_div_used": 0.629969134926796, "learning_rate": 3.843439512918949e-07, "loss": -0.0771, "reward": -0.08872065320611, "reward_after_mean": -0.08872065320611, "reward_after_std": 0.6100571732968092, "reward_before_mean": 0.20380639098584652, "reward_before_std": 0.606320459395647, "reward_change_max": 0.0, "reward_change_mean": -0.2925270590931177, "reward_change_min": -0.5179039165377617, "reward_change_std": 0.20588424988090992, "reward_std": 0.6100572124123573, "rewards/accuracy_reward": 0.22916666977107525, "rewards/cosine_scaled_reward": -0.02536027878522873, "step": 330 }, { "clip_fraction": 0.0, "completion_length": 2365.354200363159, "epoch": 0.3782857142857143, "grad_norm": 0.05943391099572182, "kl": 0.0003091096878051758, "lambda_div_used": 0.6346529722213745, "learning_rate": 3.8142703296283953e-07, "loss": -0.0337, "reward": -0.24408827535808086, "reward_after_mean": -0.24408827535808086, "reward_after_std": 0.6825795099139214, "reward_before_mean": -0.026182920671999454, "reward_before_std": 0.6199203189462423, "reward_change_max": 0.0, "reward_change_mean": -0.21790535561740398, "reward_change_min": -0.3899071477353573, "reward_change_std": 0.13575981836766005, "reward_std": 0.6825795285403728, "rewards/accuracy_reward": 0.1250000037252903, "rewards/cosine_scaled_reward": -0.1511829246301204, "step": 331 }, { "clip_fraction": 0.0, "completion_length": 2488.937530517578, "epoch": 0.37942857142857145, "grad_norm": 0.02497093193233013, "kl": 0.00024446845054626465, "lambda_div_used": 0.628095343708992, "learning_rate": 3.785183306423767e-07, "loss": 0.002, "reward": -0.0888600671896711, "reward_after_mean": -0.0888600671896711, "reward_after_std": 0.634421993046999, "reward_before_mean": 0.22087145410478115, "reward_before_std": 0.5863482365384698, "reward_change_max": 0.0, "reward_change_mean": -0.30973155051469803, "reward_change_min": -0.5146533660590649, "reward_change_std": 0.19153987523168325, "reward_std": 0.6344220079481602, "rewards/accuracy_reward": 0.2291666753590107, "rewards/cosine_scaled_reward": -0.00829521007835865, "step": 332 }, { "clip_fraction": 0.0, "completion_length": 1976.333381652832, "epoch": 0.38057142857142856, "grad_norm": 0.029570044949650764, "kl": 0.00023859739303588867, "lambda_div_used": 0.5713351741433144, "learning_rate": 3.7561798609655373e-07, "loss": 0.026, "reward": 0.11462849378585815, "reward_after_mean": 0.11462849378585815, "reward_after_std": 0.5290882792323828, "reward_before_mean": 0.6867042146623135, "reward_before_std": 0.3248422802425921, "reward_change_max": 0.0, "reward_change_mean": -0.5720757059752941, "reward_change_min": -0.7701217532157898, "reward_change_std": 0.30192676838487387, "reward_std": 0.5290882866829634, "rewards/accuracy_reward": 0.47916666977107525, "rewards/cosine_scaled_reward": 0.20753752067685127, "step": 333 }, { "clip_fraction": 0.0, "completion_length": 2970.2708892822266, "epoch": 0.38171428571428573, "grad_norm": 0.022143961861729622, "kl": 0.0002802610397338867, "lambda_div_used": 0.5814503356814384, "learning_rate": 3.72726140684072e-07, "loss": -0.0083, "reward": -0.36110448837280273, "reward_after_mean": -0.36110448837280273, "reward_after_std": 0.42284002527594566, "reward_before_mean": -0.09825330413877964, "reward_before_std": 0.3718461263924837, "reward_change_max": 0.0, "reward_change_mean": -0.26285118237137794, "reward_change_min": -0.4302907735109329, "reward_change_std": 0.16061531472951174, "reward_std": 0.42284005135297775, "rewards/accuracy_reward": 0.08333333395421505, "rewards/cosine_scaled_reward": -0.1815866343677044, "step": 334 }, { "clip_fraction": 0.0, "completion_length": 2137.7916717529297, "epoch": 0.38285714285714284, "grad_norm": 0.023810893297195435, "kl": 0.00019755959510803223, "lambda_div_used": 0.5730342343449593, "learning_rate": 3.6984293534939737e-07, "loss": -0.0589, "reward": -0.003980423090979457, "reward_after_mean": -0.003980423090979457, "reward_after_std": 0.4973279498517513, "reward_before_mean": 0.48682230338454247, "reward_before_std": 0.3303193561732769, "reward_change_max": 0.0, "reward_change_mean": -0.4908027183264494, "reward_change_min": -0.6631990969181061, "reward_change_std": 0.25930201914161444, "reward_std": 0.4973279610276222, "rewards/accuracy_reward": 0.35416666977107525, "rewards/cosine_scaled_reward": 0.132655612193048, "step": 335 }, { "clip_fraction": 0.0, "completion_length": 2604.5625762939453, "epoch": 0.384, "grad_norm": 0.019800275564193726, "kl": 0.00027495622634887695, "lambda_div_used": 0.6422952190041542, "learning_rate": 3.6696851061588994e-07, "loss": -0.0153, "reward": 0.1156077766790986, "reward_after_mean": 0.1156077766790986, "reward_after_std": 0.7227907460182905, "reward_before_mean": 0.5011630854569376, "reward_before_std": 0.6614610198885202, "reward_change_max": 0.0, "reward_change_mean": -0.3855553139001131, "reward_change_min": -0.6121690329164267, "reward_change_std": 0.23865524679422379, "reward_std": 0.7227907720953226, "rewards/accuracy_reward": 0.3750000074505806, "rewards/cosine_scaled_reward": 0.1261630654335022, "step": 336 }, { "clip_fraction": 0.0, "completion_length": 2581.6667404174805, "epoch": 0.3851428571428571, "grad_norm": 0.024305738508701324, "kl": 0.000293731689453125, "lambda_div_used": 0.5970565602183342, "learning_rate": 3.641030065789562e-07, "loss": 0.0618, "reward": -0.012716710567474365, "reward_after_mean": -0.012716710567474365, "reward_after_std": 0.5347360204905272, "reward_before_mean": 0.4010282773524523, "reward_before_std": 0.4501037606969476, "reward_change_max": 0.0, "reward_change_mean": -0.41374498419463634, "reward_change_min": -0.597484715282917, "reward_change_std": 0.24177721049636602, "reward_std": 0.5347360335290432, "rewards/accuracy_reward": 0.31250000558793545, "rewards/cosine_scaled_reward": 0.08852825942449272, "step": 337 }, { "clip_fraction": 0.0, "completion_length": 1854.9375381469727, "epoch": 0.3862857142857143, "grad_norm": 0.035376228392124176, "kl": 0.00024819374084472656, "lambda_div_used": 0.6688028946518898, "learning_rate": 3.612465628992203e-07, "loss": 0.1016, "reward": 0.3372328467667103, "reward_after_mean": 0.3372328467667103, "reward_after_std": 0.8120726235210896, "reward_before_mean": 0.783452745527029, "reward_before_std": 0.787550600245595, "reward_change_max": 0.0, "reward_change_mean": -0.44621986895799637, "reward_change_min": -0.745157428085804, "reward_change_std": 0.29823943972587585, "reward_std": 0.812072642147541, "rewards/accuracy_reward": 0.500000013038516, "rewards/cosine_scaled_reward": 0.2834526968654245, "step": 338 }, { "clip_fraction": 0.0, "completion_length": 2807.541717529297, "epoch": 0.38742857142857146, "grad_norm": 0.024581631645560265, "kl": 0.0003020763397216797, "lambda_div_used": 0.559957392513752, "learning_rate": 3.5839931879571725e-07, "loss": -0.0589, "reward": -0.2989091109484434, "reward_after_mean": -0.2989091109484434, "reward_after_std": 0.3937798347324133, "reward_before_mean": 0.06776450201869011, "reward_before_std": 0.269554709084332, "reward_change_max": 0.0, "reward_change_mean": -0.3666736055165529, "reward_change_min": -0.5128730908036232, "reward_change_std": 0.19430112652480602, "reward_std": 0.3937798459082842, "rewards/accuracy_reward": 0.14583333395421505, "rewards/cosine_scaled_reward": -0.07806883845478296, "step": 339 }, { "clip_fraction": 0.0, "completion_length": 2440.437530517578, "epoch": 0.38857142857142857, "grad_norm": 0.027588481083512306, "kl": 0.0002573728561401367, "lambda_div_used": 0.6044978573918343, "learning_rate": 3.555614130391079e-07, "loss": -0.021, "reward": -0.09739532321691513, "reward_after_mean": -0.09739532321691513, "reward_after_std": 0.5066191554069519, "reward_before_mean": 0.23490323033183813, "reward_before_std": 0.48513105837628245, "reward_change_max": 0.0, "reward_change_mean": -0.33229855448007584, "reward_change_min": -0.5539730787277222, "reward_change_std": 0.21668985951691866, "reward_std": 0.5066191554069519, "rewards/accuracy_reward": 0.2291666716337204, "rewards/cosine_scaled_reward": 0.005736543796956539, "step": 340 }, { "clip_fraction": 0.0, "completion_length": 2306.4583740234375, "epoch": 0.38971428571428574, "grad_norm": 0.025286095216870308, "kl": 0.00023421645164489746, "lambda_div_used": 0.5724563226103783, "learning_rate": 3.5273298394491515e-07, "loss": 0.0526, "reward": 0.04836506303399801, "reward_after_mean": 0.04836506303399801, "reward_after_std": 0.4924194272607565, "reward_before_mean": 0.5868019293993711, "reward_before_std": 0.33085333183407784, "reward_change_max": 0.0, "reward_change_mean": -0.5384368915110826, "reward_change_min": -0.7688934281468391, "reward_change_std": 0.29607443511486053, "reward_std": 0.492419445887208, "rewards/accuracy_reward": 0.3958333395421505, "rewards/cosine_scaled_reward": 0.19096860231366009, "step": 341 }, { "clip_fraction": 0.0, "completion_length": 2629.291690826416, "epoch": 0.39085714285714285, "grad_norm": 0.02702566236257553, "kl": 0.00029778480529785156, "lambda_div_used": 0.6460641473531723, "learning_rate": 3.4991416936678276e-07, "loss": -0.0077, "reward": 0.11682657990604639, "reward_after_mean": 0.11682657990604639, "reward_after_std": 0.6720432955771685, "reward_before_mean": 0.4811732564121485, "reward_before_std": 0.6802498865872622, "reward_change_max": 0.0, "reward_change_mean": -0.3643466793000698, "reward_change_min": -0.6579400822520256, "reward_change_std": 0.255507318302989, "reward_std": 0.6720433253794909, "rewards/accuracy_reward": 0.3750000149011612, "rewards/cosine_scaled_reward": 0.1061732517555356, "step": 342 }, { "clip_fraction": 0.0, "completion_length": 2944.5833435058594, "epoch": 0.392, "grad_norm": 0.01883111707866192, "kl": 0.0002968311309814453, "lambda_div_used": 0.6280755251646042, "learning_rate": 3.471051066897562e-07, "loss": 0.0174, "reward": 0.13441785983741283, "reward_after_mean": 0.13441785983741283, "reward_after_std": 0.62815947458148, "reward_before_mean": 0.5553851053118706, "reward_before_std": 0.5964916851371527, "reward_change_max": 0.0, "reward_change_mean": -0.42096727155148983, "reward_change_min": -0.6972850449383259, "reward_change_std": 0.27345132920891047, "reward_std": 0.6281594894826412, "rewards/accuracy_reward": 0.4166666753590107, "rewards/cosine_scaled_reward": 0.13871843740344048, "step": 343 }, { "clip_fraction": 0.0, "completion_length": 2095.5833854675293, "epoch": 0.3931428571428571, "grad_norm": 0.023183098062872887, "kl": 0.00022208690643310547, "lambda_div_used": 0.5810001268982887, "learning_rate": 3.4430593282358777e-07, "loss": -0.024, "reward": 0.11522329319268465, "reward_after_mean": 0.11522329319268465, "reward_after_std": 0.561205493286252, "reward_before_mean": 0.6688053011894226, "reward_before_std": 0.3688823012635112, "reward_change_max": 0.0, "reward_change_mean": -0.5535820256918669, "reward_change_min": -0.7913318648934364, "reward_change_std": 0.2952663041651249, "reward_std": 0.5612055025994778, "rewards/accuracy_reward": 0.47916666977107525, "rewards/cosine_scaled_reward": 0.18963862350210547, "step": 344 }, { "clip_fraction": 0.0, "completion_length": 2982.229202270508, "epoch": 0.3942857142857143, "grad_norm": 0.022071614861488342, "kl": 0.00032585859298706055, "lambda_div_used": 0.577229768037796, "learning_rate": 3.4151678419606233e-07, "loss": -0.061, "reward": -0.20981058850884438, "reward_after_mean": -0.20981058850884438, "reward_after_std": 0.40093352645635605, "reward_before_mean": 0.13120032008737326, "reward_before_std": 0.353534915484488, "reward_change_max": 0.0, "reward_change_mean": -0.3410108871757984, "reward_change_min": -0.5285827927291393, "reward_change_std": 0.20349892415106297, "reward_std": 0.40093354508280754, "rewards/accuracy_reward": 0.1666666716337204, "rewards/cosine_scaled_reward": -0.03546636272221804, "step": 345 }, { "clip_fraction": 0.0, "completion_length": 3075.916702270508, "epoch": 0.3954285714285714, "grad_norm": 0.018548911437392235, "kl": 0.0003154873847961426, "lambda_div_used": 0.5923640578985214, "learning_rate": 3.387377967463493e-07, "loss": 0.0031, "reward": -0.25945382937788963, "reward_after_mean": -0.25945382937788963, "reward_after_std": 0.47871536388993263, "reward_before_mean": 0.022648759186267853, "reward_before_std": 0.41908061131834984, "reward_change_max": 0.0, "reward_change_mean": -0.28210258670151234, "reward_change_min": -0.4310699477791786, "reward_change_std": 0.16193275339901447, "reward_std": 0.4787153732031584, "rewards/accuracy_reward": 0.1458333395421505, "rewards/cosine_scaled_reward": -0.12318458454683423, "step": 346 }, { "clip_fraction": 0.0, "completion_length": 3052.8333740234375, "epoch": 0.3965714285714286, "grad_norm": 0.01688998006284237, "kl": 0.0003132820129394531, "lambda_div_used": 0.5991570502519608, "learning_rate": 3.359691059183761e-07, "loss": 0.0425, "reward": -0.21558007411658764, "reward_after_mean": -0.21558007411658764, "reward_after_std": 0.504334045574069, "reward_before_mean": 0.08187778666615486, "reward_before_std": 0.4535634834319353, "reward_change_max": 0.0, "reward_change_mean": -0.2974578682333231, "reward_change_min": -0.48911403492093086, "reward_change_std": 0.1813967889174819, "reward_std": 0.5043340623378754, "rewards/accuracy_reward": 0.16666667349636555, "rewards/cosine_scaled_reward": -0.08478887472301722, "step": 347 }, { "clip_fraction": 0.0, "completion_length": 2511.916679382324, "epoch": 0.3977142857142857, "grad_norm": 0.031576935201883316, "kl": 0.00030159950256347656, "lambda_div_used": 0.6005090326070786, "learning_rate": 3.3321084665422803e-07, "loss": 0.048, "reward": -0.08813801780343056, "reward_after_mean": -0.08813801780343056, "reward_after_std": 0.4803820662200451, "reward_before_mean": 0.25806165859103203, "reward_before_std": 0.4620923697948456, "reward_change_max": 0.0, "reward_change_mean": -0.3461996652185917, "reward_change_min": -0.547101479023695, "reward_change_std": 0.22022681962698698, "reward_std": 0.48038206808269024, "rewards/accuracy_reward": 0.2500000074505806, "rewards/cosine_scaled_reward": 0.008061652071774006, "step": 348 }, { "clip_fraction": 0.0, "completion_length": 3160.916717529297, "epoch": 0.39885714285714285, "grad_norm": 0.020290188491344452, "kl": 0.00036334991455078125, "lambda_div_used": 0.5632076561450958, "learning_rate": 3.3046315338757026e-07, "loss": 0.0316, "reward": -0.26980413869023323, "reward_after_mean": -0.26980413869023323, "reward_after_std": 0.3867268729954958, "reward_before_mean": 0.09292120113968849, "reward_before_std": 0.2870235051959753, "reward_change_max": 0.0, "reward_change_mean": -0.3627253398299217, "reward_change_min": -0.5702350400388241, "reward_change_std": 0.20531136635690928, "reward_std": 0.386726887896657, "rewards/accuracy_reward": 0.1666666679084301, "rewards/cosine_scaled_reward": -0.0737454742193222, "step": 349 }, { "clip_fraction": 0.0, "completion_length": 2213.604202270508, "epoch": 0.4, "grad_norm": 0.030296506360173225, "kl": 0.0002709701657295227, "lambda_div_used": 0.6561494767665863, "learning_rate": 3.2772616003709616e-07, "loss": 0.0154, "reward": 0.1817268170416355, "reward_after_mean": 0.1817268170416355, "reward_after_std": 0.6988476235419512, "reward_before_mean": 0.5450442042201757, "reward_before_std": 0.7353325374424458, "reward_change_max": 0.0, "reward_change_mean": -0.3633174039423466, "reward_change_min": -0.6499125882983208, "reward_change_std": 0.26448090467602015, "reward_std": 0.6988476365804672, "rewards/accuracy_reward": 0.3750000074505806, "rewards/cosine_scaled_reward": 0.17004419304430485, "step": 350 }, { "clip_fraction": 0.0, "completion_length": 2509.5208740234375, "epoch": 0.40114285714285713, "grad_norm": 0.02334379218518734, "kl": 0.0002808868885040283, "lambda_div_used": 0.602841705083847, "learning_rate": 3.250000000000001e-07, "loss": 0.0785, "reward": -0.02256767451763153, "reward_after_mean": -0.02256767451763153, "reward_after_std": 0.545713946223259, "reward_before_mean": 0.3783569000661373, "reward_before_std": 0.47443881165236235, "reward_change_max": 0.0, "reward_change_mean": -0.40092457458376884, "reward_change_min": -0.5974738858640194, "reward_change_std": 0.24015377275645733, "reward_std": 0.5457139611244202, "rewards/accuracy_reward": 0.31250000558793545, "rewards/cosine_scaled_reward": 0.06585689261555672, "step": 351 }, { "clip_fraction": 0.0, "completion_length": 2237.7917251586914, "epoch": 0.4022857142857143, "grad_norm": 0.02218765579164028, "kl": 0.00025266408920288086, "lambda_div_used": 0.6070134416222572, "learning_rate": 3.222848061454764e-07, "loss": 0.0034, "reward": -0.07995379093335941, "reward_after_mean": -0.07995379093335941, "reward_after_std": 0.5729443337768316, "reward_before_mean": 0.291741443797946, "reward_before_std": 0.4979767380282283, "reward_change_max": 0.0, "reward_change_mean": -0.3716952446848154, "reward_change_min": -0.6056977100670338, "reward_change_std": 0.23109738621860743, "reward_std": 0.5729443468153477, "rewards/accuracy_reward": 0.2916666679084301, "rewards/cosine_scaled_reward": 7.475726306438446e-05, "step": 352 }, { "clip_fraction": 0.0, "completion_length": 2214.4583702087402, "epoch": 0.4034285714285714, "grad_norm": 0.02660507895052433, "kl": 0.0002359449863433838, "lambda_div_used": 0.670557290315628, "learning_rate": 3.195807108082429e-07, "loss": -0.0096, "reward": 0.11138913966715336, "reward_after_mean": 0.11138913966715336, "reward_after_std": 0.79796995036304, "reward_before_mean": 0.420612467918545, "reward_before_std": 0.7911950433626771, "reward_change_max": 0.0, "reward_change_mean": -0.30922332406044006, "reward_change_min": -0.5478326119482517, "reward_change_std": 0.21525408141314983, "reward_std": 0.7979699578136206, "rewards/accuracy_reward": 0.35416667722165585, "rewards/cosine_scaled_reward": 0.06644579023122787, "step": 353 }, { "clip_fraction": 0.0, "completion_length": 2018.0625228881836, "epoch": 0.4045714285714286, "grad_norm": 0.028822243213653564, "kl": 0.0002269148826599121, "lambda_div_used": 0.6141867712140083, "learning_rate": 3.168878457820915e-07, "loss": 0.0254, "reward": 0.07253095135092735, "reward_after_mean": 0.07253095135092735, "reward_after_std": 0.6088422238826752, "reward_before_mean": 0.4882249776273966, "reward_before_std": 0.5237803608179092, "reward_change_max": 0.0, "reward_change_mean": -0.4156940244138241, "reward_change_min": -0.5956555530428886, "reward_change_std": 0.23860780615359545, "reward_std": 0.6088422238826752, "rewards/accuracy_reward": 0.3541666753590107, "rewards/cosine_scaled_reward": 0.13405831216368824, "step": 354 }, { "clip_fraction": 0.0, "completion_length": 2018.0208740234375, "epoch": 0.4057142857142857, "grad_norm": 0.029382316395640373, "kl": 0.00026294589042663574, "lambda_div_used": 0.5812733992934227, "learning_rate": 3.142063423134644e-07, "loss": 0.0145, "reward": 0.23357930406928062, "reward_after_mean": 0.23357930406928062, "reward_after_std": 0.5394825823605061, "reward_before_mean": 0.8446227628737688, "reward_before_std": 0.37303208094090223, "reward_change_max": 0.0, "reward_change_mean": -0.6110434681177139, "reward_change_min": -0.8454090058803558, "reward_change_std": 0.3367054909467697, "reward_std": 0.5394826009869576, "rewards/accuracy_reward": 0.5416666716337204, "rewards/cosine_scaled_reward": 0.3029560726135969, "step": 355 }, { "clip_fraction": 0.0, "completion_length": 2752.3333740234375, "epoch": 0.40685714285714286, "grad_norm": 0.01905817724764347, "kl": 0.00031578540802001953, "lambda_div_used": 0.6255660429596901, "learning_rate": 3.115363310950578e-07, "loss": 0.0216, "reward": 0.12779017974389717, "reward_after_mean": 0.12779017974389717, "reward_after_std": 0.6442528441548347, "reward_before_mean": 0.5468167327344418, "reward_before_std": 0.5852497918531299, "reward_change_max": 0.0, "reward_change_mean": -0.41902654618024826, "reward_change_min": -0.6820830404758453, "reward_change_std": 0.2657659938558936, "reward_std": 0.6442528460174799, "rewards/accuracy_reward": 0.3958333395421505, "rewards/cosine_scaled_reward": 0.15098336525261402, "step": 356 }, { "clip_fraction": 0.0, "completion_length": 2952.520854949951, "epoch": 0.408, "grad_norm": 0.024500912055373192, "kl": 0.0002726316452026367, "lambda_div_used": 0.573906421661377, "learning_rate": 3.0887794225945143e-07, "loss": -0.0555, "reward": -0.22403784468770027, "reward_after_mean": -0.22403784468770027, "reward_after_std": 0.4447880759835243, "reward_before_mean": 0.14636994618922472, "reward_before_std": 0.3346911370754242, "reward_change_max": 0.0, "reward_change_mean": -0.3704077899456024, "reward_change_min": -0.5609126053750515, "reward_change_std": 0.20881207659840584, "reward_std": 0.4447880797088146, "rewards/accuracy_reward": 0.18750000186264515, "rewards/cosine_scaled_reward": -0.04113006801344454, "step": 357 }, { "clip_fraction": 0.0, "completion_length": 2411.8541946411133, "epoch": 0.40914285714285714, "grad_norm": 0.024526813998818398, "kl": 0.00026684999465942383, "lambda_div_used": 0.6140530630946159, "learning_rate": 3.062313053727671e-07, "loss": -0.0357, "reward": 0.30990589410066605, "reward_after_mean": 0.30990589410066605, "reward_after_std": 0.5656850170344114, "reward_before_mean": 0.8357270993292332, "reward_before_std": 0.5201095007359982, "reward_change_max": 0.0, "reward_change_mean": -0.5258211866021156, "reward_change_min": -0.7649649046361446, "reward_change_std": 0.3101219357922673, "reward_std": 0.5656850375235081, "rewards/accuracy_reward": 0.5416666865348816, "rewards/cosine_scaled_reward": 0.2940603978931904, "step": 358 }, { "clip_fraction": 0.0, "completion_length": 2100.604217529297, "epoch": 0.4102857142857143, "grad_norm": 0.029383460059762, "kl": 0.00023573637008666992, "lambda_div_used": 0.6149207651615143, "learning_rate": 3.0359654942835247e-07, "loss": -0.0725, "reward": -0.014719150494784117, "reward_after_mean": -0.014719150494784117, "reward_after_std": 0.6281173154711723, "reward_before_mean": 0.3637474989518523, "reward_before_std": 0.5245081130415201, "reward_change_max": 0.0, "reward_change_mean": -0.37846665270626545, "reward_change_min": -0.5382856801152229, "reward_change_std": 0.20675079058855772, "reward_std": 0.6281173229217529, "rewards/accuracy_reward": 0.29166667349636555, "rewards/cosine_scaled_reward": 0.07208081643329933, "step": 359 }, { "clip_fraction": 0.0, "completion_length": 2650.0625610351562, "epoch": 0.4114285714285714, "grad_norm": 0.028681648895144463, "kl": 0.00029480457305908203, "lambda_div_used": 0.5845082253217697, "learning_rate": 3.0097380284049523e-07, "loss": 0.0359, "reward": 0.034104809165000916, "reward_after_mean": 0.034104809165000916, "reward_after_std": 0.5086164381355047, "reward_before_mean": 0.5177552103996277, "reward_before_std": 0.3886600947007537, "reward_change_max": 0.0, "reward_change_mean": -0.4836503826081753, "reward_change_min": -0.7048606462776661, "reward_change_std": 0.27614138927310705, "reward_std": 0.5086164511740208, "rewards/accuracy_reward": 0.3958333358168602, "rewards/cosine_scaled_reward": 0.12192187085747719, "step": 360 }, { "clip_fraction": 0.0, "completion_length": 2715.7084197998047, "epoch": 0.4125714285714286, "grad_norm": 0.02216268703341484, "kl": 0.00035099685192108154, "lambda_div_used": 0.6144327148795128, "learning_rate": 2.9836319343816397e-07, "loss": 0.0341, "reward": 0.11957069113850594, "reward_after_mean": 0.11957069113850594, "reward_after_std": 0.5912090875208378, "reward_before_mean": 0.5671045240014791, "reward_before_std": 0.523097550496459, "reward_change_max": 0.0, "reward_change_mean": -0.4475338254123926, "reward_change_min": -0.6687514297664165, "reward_change_std": 0.2680971557274461, "reward_std": 0.5912090986967087, "rewards/accuracy_reward": 0.41666667722165585, "rewards/cosine_scaled_reward": 0.15043783793225884, "step": 361 }, { "clip_fraction": 0.0, "completion_length": 1608.958396911621, "epoch": 0.4137142857142857, "grad_norm": 0.03410353511571884, "kl": 0.0002919435501098633, "lambda_div_used": 0.5573309659957886, "learning_rate": 2.9576484845877793e-07, "loss": -0.1085, "reward": 0.017518717795610428, "reward_after_mean": 0.017518717795610428, "reward_after_std": 0.46568065509200096, "reward_before_mean": 0.5754165817052126, "reward_before_std": 0.2668048879131675, "reward_change_max": 0.0, "reward_change_mean": -0.557897862046957, "reward_change_min": -0.7572538442909718, "reward_change_std": 0.2997015379369259, "reward_std": 0.46568066254258156, "rewards/accuracy_reward": 0.4375, "rewards/cosine_scaled_reward": 0.13791657239198685, "step": 362 }, { "clip_fraction": 0.0, "completion_length": 1770.7500305175781, "epoch": 0.41485714285714287, "grad_norm": 0.036160316318273544, "kl": 0.0002955198287963867, "lambda_div_used": 0.5742382705211639, "learning_rate": 2.931788945420058e-07, "loss": 0.0235, "reward": 0.08720480650663376, "reward_after_mean": 0.08720480650663376, "reward_after_std": 0.4698806144297123, "reward_before_mean": 0.624412227421999, "reward_before_std": 0.33839546935632825, "reward_change_max": 0.0, "reward_change_mean": -0.5372074488550425, "reward_change_min": -0.7525182664394379, "reward_change_std": 0.2967495834454894, "reward_std": 0.4698806367814541, "rewards/accuracy_reward": 0.4166666716337204, "rewards/cosine_scaled_reward": 0.20774555951356888, "step": 363 }, { "clip_fraction": 0.0, "completion_length": 2871.395866394043, "epoch": 0.416, "grad_norm": 0.021756965667009354, "kl": 0.0003199577331542969, "lambda_div_used": 0.6380201950669289, "learning_rate": 2.9060545772359305e-07, "loss": 0.0094, "reward": -0.06151282729115337, "reward_after_mean": -0.06151282729115337, "reward_after_std": 0.6731106694787741, "reward_before_mean": 0.22438967041671276, "reward_before_std": 0.6390752401202917, "reward_change_max": 0.0, "reward_change_mean": -0.2859025076031685, "reward_change_min": -0.4525425359606743, "reward_change_std": 0.18198375776410103, "reward_std": 0.6731106787919998, "rewards/accuracy_reward": 0.22916667349636555, "rewards/cosine_scaled_reward": -0.004777010064572096, "step": 364 }, { "clip_fraction": 0.0, "completion_length": 2915.7708892822266, "epoch": 0.41714285714285715, "grad_norm": 0.026546625420451164, "kl": 0.00028133392333984375, "lambda_div_used": 0.6235380545258522, "learning_rate": 2.8804466342921987e-07, "loss": 0.0013, "reward": -0.19956049136817455, "reward_after_mean": -0.19956049136817455, "reward_after_std": 0.5996164344251156, "reward_before_mean": 0.056321932934224606, "reward_before_std": 0.5726153058931231, "reward_change_max": 0.0, "reward_change_mean": -0.2558824270963669, "reward_change_min": -0.486958272755146, "reward_change_std": 0.17727997712790966, "reward_std": 0.5996164586395025, "rewards/accuracy_reward": 0.16666666977107525, "rewards/cosine_scaled_reward": -0.11034472845494747, "step": 365 }, { "clip_fraction": 0.0, "completion_length": 1711.791690826416, "epoch": 0.41828571428571426, "grad_norm": 0.030281659215688705, "kl": 0.00020581483840942383, "lambda_div_used": 0.5937831178307533, "learning_rate": 2.854966364683872e-07, "loss": 0.0313, "reward": 0.19904952123761177, "reward_after_mean": 0.19904952123761177, "reward_after_std": 0.5501149389892817, "reward_before_mean": 0.7462510112673044, "reward_before_std": 0.428714738227427, "reward_change_max": 0.0, "reward_change_mean": -0.5472014844417572, "reward_change_min": -0.7698434814810753, "reward_change_std": 0.31167063396424055, "reward_std": 0.550114942714572, "rewards/accuracy_reward": 0.5208333432674408, "rewards/cosine_scaled_reward": 0.2254176577553153, "step": 366 }, { "clip_fraction": 0.0, "completion_length": 2489.0209045410156, "epoch": 0.41942857142857143, "grad_norm": 0.02355087362229824, "kl": 0.00023761391639709473, "lambda_div_used": 0.6294166967272758, "learning_rate": 2.829615010283344e-07, "loss": -0.0082, "reward": 0.1264641396701336, "reward_after_mean": 0.1264641396701336, "reward_after_std": 0.6499300934374332, "reward_before_mean": 0.5483083166182041, "reward_before_std": 0.6043886244297028, "reward_change_max": 0.0, "reward_change_mean": -0.4218441918492317, "reward_change_min": -0.6975029557943344, "reward_change_std": 0.2718219608068466, "reward_std": 0.6499301269650459, "rewards/accuracy_reward": 0.3958333432674408, "rewards/cosine_scaled_reward": 0.15247498638927937, "step": 367 }, { "clip_fraction": 0.0, "completion_length": 3003.458335876465, "epoch": 0.4205714285714286, "grad_norm": 0.030648062005639076, "kl": 0.00033867359161376953, "lambda_div_used": 0.5932426005601883, "learning_rate": 2.8043938066798645e-07, "loss": 0.0339, "reward": -0.11047623306512833, "reward_after_mean": -0.11047623306512833, "reward_after_std": 0.46016608364880085, "reward_before_mean": 0.24663935555145144, "reward_before_std": 0.42351202201098204, "reward_change_max": 0.0, "reward_change_mean": -0.3571155872195959, "reward_change_min": -0.523664090782404, "reward_change_std": 0.21024074219167233, "reward_std": 0.460166085511446, "rewards/accuracy_reward": 0.22916667722165585, "rewards/cosine_scaled_reward": 0.01747269369661808, "step": 368 }, { "clip_fraction": 0.0, "completion_length": 2393.520854949951, "epoch": 0.4217142857142857, "grad_norm": 0.03554327413439751, "kl": 0.00029546022415161133, "lambda_div_used": 0.6290072500705719, "learning_rate": 2.7793039831193133e-07, "loss": -0.1119, "reward": 0.045321037992835045, "reward_after_mean": 0.045321037992835045, "reward_after_std": 0.6632880251854658, "reward_before_mean": 0.42809890396893024, "reward_before_std": 0.6034117415547371, "reward_change_max": 0.0, "reward_change_mean": -0.3827778585255146, "reward_change_min": -0.6426752880215645, "reward_change_std": 0.24680283293128014, "reward_std": 0.6632880419492722, "rewards/accuracy_reward": 0.31250000186264515, "rewards/cosine_scaled_reward": 0.11559889325872064, "step": 369 }, { "clip_fraction": 0.0, "completion_length": 3084.6458587646484, "epoch": 0.4228571428571429, "grad_norm": 0.021294524893164635, "kl": 0.00037282705307006836, "lambda_div_used": 0.6392548009753227, "learning_rate": 2.7543467624442956e-07, "loss": 0.0248, "reward": 0.1099370252341032, "reward_after_mean": 0.1099370252341032, "reward_after_std": 0.7434613928198814, "reward_before_mean": 0.5280295421835035, "reward_before_std": 0.6512415455654263, "reward_change_max": 0.0, "reward_change_mean": -0.4180925004184246, "reward_change_min": -0.6554959528148174, "reward_change_std": 0.2566772401332855, "reward_std": 0.7434614114463329, "rewards/accuracy_reward": 0.3750000074505806, "rewards/cosine_scaled_reward": 0.15302953217178583, "step": 370 }, { "clip_fraction": 0.0, "completion_length": 1583.729190826416, "epoch": 0.424, "grad_norm": 0.03987959772348404, "kl": 0.000273287296295166, "lambda_div_used": 0.5595665127038956, "learning_rate": 2.729523361034538e-07, "loss": 0.015, "reward": -0.05838925391435623, "reward_after_mean": -0.05838925391435623, "reward_after_std": 0.44396297819912434, "reward_before_mean": 0.4467340558767319, "reward_before_std": 0.2712427484802902, "reward_change_max": 0.0, "reward_change_mean": -0.5051233097910881, "reward_change_min": -0.7045671716332436, "reward_change_std": 0.27023117896169424, "reward_std": 0.44396298564970493, "rewards/accuracy_reward": 0.4166666679084301, "rewards/cosine_scaled_reward": 0.030067380517721176, "step": 371 }, { "clip_fraction": 0.0, "completion_length": 2887.87504196167, "epoch": 0.42514285714285716, "grad_norm": 0.024510102346539497, "kl": 0.0002981424331665039, "lambda_div_used": 0.6707161664962769, "learning_rate": 2.7048349887476037e-07, "loss": -0.0009, "reward": 0.25641736947000027, "reward_after_mean": 0.25641736947000027, "reward_after_std": 0.7552758120000362, "reward_before_mean": 0.6394204869866371, "reward_before_std": 0.800614426843822, "reward_change_max": 0.0, "reward_change_mean": -0.3830030895769596, "reward_change_min": -0.689335536211729, "reward_change_std": 0.28328478895127773, "reward_std": 0.7552758287638426, "rewards/accuracy_reward": 0.4583333469927311, "rewards/cosine_scaled_reward": 0.18108712136745453, "step": 372 }, { "clip_fraction": 0.0, "completion_length": 1918.5833930969238, "epoch": 0.42628571428571427, "grad_norm": 0.03153732046484947, "kl": 0.00024700164794921875, "lambda_div_used": 0.6141555905342102, "learning_rate": 2.6802828488599294e-07, "loss": 0.0114, "reward": 0.040444918908178806, "reward_after_mean": 0.040444918908178806, "reward_after_std": 0.6271691359579563, "reward_before_mean": 0.4637982491403818, "reward_before_std": 0.5247453823685646, "reward_change_max": 0.0, "reward_change_mean": -0.42335335724055767, "reward_change_min": -0.6717953830957413, "reward_change_std": 0.2525172745808959, "reward_std": 0.6271691434085369, "rewards/accuracy_reward": 0.3541666753590107, "rewards/cosine_scaled_reward": 0.10963157773949206, "step": 373 }, { "clip_fraction": 0.0, "completion_length": 1947.4792213439941, "epoch": 0.42742857142857144, "grad_norm": 0.036099888384342194, "kl": 0.0002910494804382324, "lambda_div_used": 0.6011270731687546, "learning_rate": 2.655868138008171e-07, "loss": -0.1104, "reward": 0.05605058930814266, "reward_after_mean": 0.05605058930814266, "reward_after_std": 0.614537576213479, "reward_before_mean": 0.5277584344148636, "reward_before_std": 0.46880532428622246, "reward_change_max": 0.0, "reward_change_mean": -0.4717078376561403, "reward_change_min": -0.7001210488379002, "reward_change_std": 0.270130792632699, "reward_std": 0.6145375911146402, "rewards/accuracy_reward": 0.37500000186264515, "rewards/cosine_scaled_reward": 0.1527584195137024, "step": 374 }, { "clip_fraction": 0.0, "completion_length": 2643.3333892822266, "epoch": 0.42857142857142855, "grad_norm": 0.025029828771948814, "kl": 0.0003476142883300781, "lambda_div_used": 0.558178536593914, "learning_rate": 2.631592046130896e-07, "loss": -0.0046, "reward": 0.06168156489729881, "reward_after_mean": 0.06168156489729881, "reward_after_std": 0.46577400900423527, "reward_before_mean": 0.6490027587860823, "reward_before_std": 0.2644077790901065, "reward_change_max": 0.0, "reward_change_mean": -0.5873212069272995, "reward_change_min": -0.8054014258086681, "reward_change_std": 0.3127285521477461, "reward_std": 0.46577401272952557, "rewards/accuracy_reward": 0.4375, "rewards/cosine_scaled_reward": 0.2115027718245983, "step": 375 }, { "clip_fraction": 0.0, "completion_length": 2176.875045776367, "epoch": 0.4297142857142857, "grad_norm": 0.025618579238653183, "kl": 0.00024950504302978516, "lambda_div_used": 0.5522258281707764, "learning_rate": 2.6074557564105724e-07, "loss": 0.0245, "reward": -0.24329839646816254, "reward_after_mean": -0.24329839646816254, "reward_after_std": 0.3648342005908489, "reward_before_mean": 0.15543348528444767, "reward_before_std": 0.2328620203770697, "reward_change_max": 0.0, "reward_change_mean": -0.3987318556755781, "reward_change_min": -0.5627252347767353, "reward_change_std": 0.21180008072406054, "reward_std": 0.3648342117667198, "rewards/accuracy_reward": 0.2083333358168602, "rewards/cosine_scaled_reward": -0.052899875794537365, "step": 376 }, { "clip_fraction": 0.0, "completion_length": 3302.1875610351562, "epoch": 0.4308571428571429, "grad_norm": 0.019104059785604477, "kl": 0.0004137754440307617, "lambda_div_used": 0.6249697953462601, "learning_rate": 2.583460445215911e-07, "loss": 0.0065, "reward": -0.03637286019511521, "reward_after_mean": -0.03637286019511521, "reward_after_std": 0.6691129393875599, "reward_before_mean": 0.30246374011039734, "reward_before_std": 0.578639387153089, "reward_change_max": 0.0, "reward_change_mean": -0.3388365972787142, "reward_change_min": -0.5369448103010654, "reward_change_std": 0.20115663390606642, "reward_std": 0.6691129766404629, "rewards/accuracy_reward": 0.2916666716337204, "rewards/cosine_scaled_reward": 0.010797052644193172, "step": 377 }, { "clip_fraction": 0.0, "completion_length": 1876.0208568572998, "epoch": 0.432, "grad_norm": 0.039962347596883774, "kl": 0.0002534538507461548, "lambda_div_used": 0.6230520308017731, "learning_rate": 2.5596072820445254e-07, "loss": 0.0138, "reward": 0.20900094881653786, "reward_after_mean": 0.20900094881653786, "reward_after_std": 0.685304744169116, "reward_before_mean": 0.7048993427306414, "reward_before_std": 0.577358863549307, "reward_change_max": 0.0, "reward_change_mean": -0.4958983939141035, "reward_change_min": -0.7410164549946785, "reward_change_std": 0.3000446343794465, "reward_std": 0.6853047590702772, "rewards/accuracy_reward": 0.5000000055879354, "rewards/cosine_scaled_reward": 0.204899336444214, "step": 378 }, { "clip_fraction": 0.0, "completion_length": 2977.7917289733887, "epoch": 0.43314285714285716, "grad_norm": 0.023082684725522995, "kl": 0.00034427642822265625, "lambda_div_used": 0.564103439450264, "learning_rate": 2.5358974294659373e-07, "loss": 0.0472, "reward": -0.2481522224843502, "reward_after_mean": -0.2481522224843502, "reward_after_std": 0.3903357107192278, "reward_before_mean": 0.1231729257851839, "reward_before_std": 0.2936667911708355, "reward_change_max": 0.0, "reward_change_mean": -0.37132514640688896, "reward_change_min": -0.5592780411243439, "reward_change_std": 0.21076095290482044, "reward_std": 0.39033573493361473, "rewards/accuracy_reward": 0.1875, "rewards/cosine_scaled_reward": -0.06432707794010639, "step": 379 }, { "clip_fraction": 0.0, "completion_length": 2197.2708740234375, "epoch": 0.4342857142857143, "grad_norm": 0.0468011274933815, "kl": 0.0003237128257751465, "lambda_div_used": 0.6098255217075348, "learning_rate": 2.512332043064913e-07, "loss": -0.1385, "reward": -0.01926261931657791, "reward_after_mean": -0.01926261931657791, "reward_after_std": 0.582806745544076, "reward_before_mean": 0.3642494883388281, "reward_before_std": 0.5125604961067438, "reward_change_max": 0.0, "reward_change_mean": -0.38351211696863174, "reward_change_min": -0.6093635484576225, "reward_change_std": 0.24249585159122944, "reward_std": 0.5828067641705275, "rewards/accuracy_reward": 0.3333333358168602, "rewards/cosine_scaled_reward": 0.030916159972548485, "step": 380 }, { "clip_fraction": 0.0, "completion_length": 2804.291702270508, "epoch": 0.43542857142857144, "grad_norm": 0.03009038046002388, "kl": 0.00038611888885498047, "lambda_div_used": 0.6074136793613434, "learning_rate": 2.488912271385139e-07, "loss": 0.0773, "reward": -0.2030959241092205, "reward_after_mean": -0.2030959241092205, "reward_after_std": 0.5294999033212662, "reward_before_mean": 0.0848972403910011, "reward_before_std": 0.4909538859501481, "reward_change_max": 0.0, "reward_change_mean": -0.287993174046278, "reward_change_min": -0.4920281432569027, "reward_change_std": 0.1843261569738388, "reward_std": 0.5294999219477177, "rewards/accuracy_reward": 0.1666666716337204, "rewards/cosine_scaled_reward": -0.08176943706348538, "step": 381 }, { "clip_fraction": 0.0, "completion_length": 1741.7292022705078, "epoch": 0.43657142857142855, "grad_norm": 0.02836902253329754, "kl": 0.000269085168838501, "lambda_div_used": 0.593373216688633, "learning_rate": 2.465639255873246e-07, "loss": 0.0017, "reward": -0.2921748459339142, "reward_after_mean": -0.2921748459339142, "reward_after_std": 0.4851537048816681, "reward_before_mean": -0.02789500029757619, "reward_before_std": 0.4276847830042243, "reward_change_max": 0.0, "reward_change_mean": -0.2642798572778702, "reward_change_min": -0.4220714569091797, "reward_change_std": 0.156461289152503, "reward_std": 0.485153716057539, "rewards/accuracy_reward": 0.1250000037252903, "rewards/cosine_scaled_reward": -0.152894988656044, "step": 382 }, { "clip_fraction": 0.0, "completion_length": 2540.208366394043, "epoch": 0.4377142857142857, "grad_norm": 0.027282925322651863, "kl": 0.00039958953857421875, "lambda_div_used": 0.6196694001555443, "learning_rate": 2.4425141308231765e-07, "loss": -0.0249, "reward": -0.009514571633189917, "reward_after_mean": -0.009514571633189917, "reward_after_std": 0.6358621753752232, "reward_before_mean": 0.3640937558375299, "reward_before_std": 0.5521808844059706, "reward_change_max": 0.0, "reward_change_mean": -0.3736083246767521, "reward_change_min": -0.5938910432159901, "reward_change_std": 0.22305074147880077, "reward_std": 0.6358622014522552, "rewards/accuracy_reward": 0.2916666716337204, "rewards/cosine_scaled_reward": 0.07242710120044649, "step": 383 }, { "clip_fraction": 0.0, "completion_length": 2133.9166984558105, "epoch": 0.43885714285714283, "grad_norm": 0.031124358996748924, "kl": 0.00030043721199035645, "lambda_div_used": 0.6193316578865051, "learning_rate": 2.4195380233209006e-07, "loss": -0.0227, "reward": 0.32511539570987225, "reward_after_mean": 0.32511539570987225, "reward_after_std": 0.7301186248660088, "reward_before_mean": 0.913657930213958, "reward_before_std": 0.5516102942638099, "reward_change_max": 0.0, "reward_change_mean": -0.588542552664876, "reward_change_min": -0.8305801004171371, "reward_change_std": 0.32795302756130695, "reward_std": 0.7301186472177505, "rewards/accuracy_reward": 0.5833333395421505, "rewards/cosine_scaled_reward": 0.3303246097639203, "step": 384 }, { "clip_fraction": 0.0, "completion_length": 2633.770851135254, "epoch": 0.44, "grad_norm": 0.018747175112366676, "kl": 0.00027829408645629883, "lambda_div_used": 0.59091367572546, "learning_rate": 2.3967120531894857e-07, "loss": 0.0052, "reward": -0.16111253947019577, "reward_after_mean": -0.16111253947019577, "reward_after_std": 0.4700228702276945, "reward_before_mean": 0.17826138995587826, "reward_before_std": 0.41598498076200485, "reward_change_max": 0.0, "reward_change_mean": -0.3393739238381386, "reward_change_min": -0.5320228524506092, "reward_change_std": 0.20085815154016018, "reward_std": 0.4700228702276945, "rewards/accuracy_reward": 0.22916666977107525, "rewards/cosine_scaled_reward": -0.05090527608990669, "step": 385 }, { "clip_fraction": 0.0, "completion_length": 2641.6250762939453, "epoch": 0.44114285714285717, "grad_norm": 0.022516794502735138, "kl": 0.00033086538314819336, "lambda_div_used": 0.5835114791989326, "learning_rate": 2.374037332934512e-07, "loss": -0.0898, "reward": -0.16643539629876614, "reward_after_mean": -0.16643539629876614, "reward_after_std": 0.47341430373489857, "reward_before_mean": 0.20273982174694538, "reward_before_std": 0.38260515965521336, "reward_change_max": 0.0, "reward_change_mean": -0.36917522735893726, "reward_change_min": -0.5404521636664867, "reward_change_std": 0.20916004106402397, "reward_std": 0.4734143167734146, "rewards/accuracy_reward": 0.22916666977107525, "rewards/cosine_scaled_reward": -0.02642684616148472, "step": 386 }, { "clip_fraction": 0.0, "completion_length": 2953.6875610351562, "epoch": 0.4422857142857143, "grad_norm": 0.023601215332746506, "kl": 0.0003770887851715088, "lambda_div_used": 0.6255086436867714, "learning_rate": 2.3515149676898552e-07, "loss": 0.013, "reward": -0.055427778512239456, "reward_after_mean": -0.055427778512239456, "reward_after_std": 0.6565756388008595, "reward_before_mean": 0.2887960313819349, "reward_before_std": 0.582161720842123, "reward_change_max": 0.0, "reward_change_mean": -0.3442238178104162, "reward_change_min": -0.555218169465661, "reward_change_std": 0.2120614117011428, "reward_std": 0.6565756406635046, "rewards/accuracy_reward": 0.2916666716337204, "rewards/cosine_scaled_reward": -0.0028706385055556893, "step": 387 }, { "clip_fraction": 0.0, "completion_length": 2360.6667251586914, "epoch": 0.44342857142857145, "grad_norm": 0.026174332946538925, "kl": 0.00029283761978149414, "lambda_div_used": 0.5821729674935341, "learning_rate": 2.3291460551638237e-07, "loss": -0.0288, "reward": -0.10938419215381145, "reward_after_mean": -0.10938419215381145, "reward_after_std": 0.4824158512055874, "reward_before_mean": 0.2964679952710867, "reward_before_std": 0.37308686412870884, "reward_change_max": 0.0, "reward_change_mean": -0.405852185562253, "reward_change_min": -0.5691216923296452, "reward_change_std": 0.2231999458745122, "reward_std": 0.4824158661067486, "rewards/accuracy_reward": 0.2708333395421505, "rewards/cosine_scaled_reward": 0.025634657591581345, "step": 388 }, { "clip_fraction": 0.0, "completion_length": 2439.229232788086, "epoch": 0.44457142857142856, "grad_norm": 0.023821894079446793, "kl": 0.00028192996978759766, "lambda_div_used": 0.6302470341324806, "learning_rate": 2.306931685585657e-07, "loss": 0.0324, "reward": 0.04070591554045677, "reward_after_mean": 0.04070591554045677, "reward_after_std": 0.6089895591139793, "reward_before_mean": 0.3929547220468521, "reward_before_std": 0.6121768653392792, "reward_change_max": 0.0, "reward_change_mean": -0.35224880650639534, "reward_change_min": -0.6027859784662724, "reward_change_std": 0.2432803064584732, "reward_std": 0.6089895665645599, "rewards/accuracy_reward": 0.2916666716337204, "rewards/cosine_scaled_reward": 0.10128805413842201, "step": 389 }, { "clip_fraction": 0.0, "completion_length": 2221.3750610351562, "epoch": 0.44571428571428573, "grad_norm": 0.026545513421297073, "kl": 0.0002053976058959961, "lambda_div_used": 0.5998165532946587, "learning_rate": 2.2848729416523859e-07, "loss": 0.0542, "reward": 0.11247721314430237, "reward_after_mean": 0.11247721314430237, "reward_after_std": 0.5243697017431259, "reward_before_mean": 0.5878691142424941, "reward_before_std": 0.4597471170127392, "reward_change_max": 0.0, "reward_change_mean": -0.4753919020295143, "reward_change_min": -0.7015566639602184, "reward_change_std": 0.28336913883686066, "reward_std": 0.5243697185069323, "rewards/accuracy_reward": 0.3750000074505806, "rewards/cosine_scaled_reward": 0.2128690993413329, "step": 390 }, { "clip_fraction": 0.0, "completion_length": 2397.791736602783, "epoch": 0.44685714285714284, "grad_norm": 0.030633823946118355, "kl": 0.0002848505973815918, "lambda_div_used": 0.6453134343028069, "learning_rate": 2.2629708984760706e-07, "loss": 0.0079, "reward": 0.32186132250353694, "reward_after_mean": 0.32186132250353694, "reward_after_std": 0.7361500542610884, "reward_before_mean": 0.8155574453994632, "reward_before_std": 0.6687694359570742, "reward_change_max": 0.0, "reward_change_mean": -0.4936961196362972, "reward_change_min": -0.7406940795481205, "reward_change_std": 0.2951981630176306, "reward_std": 0.7361500766128302, "rewards/accuracy_reward": 0.5208333507180214, "rewards/cosine_scaled_reward": 0.2947241172660142, "step": 391 }, { "clip_fraction": 0.0, "completion_length": 1701.3541946411133, "epoch": 0.448, "grad_norm": 0.030896564945578575, "kl": 0.00023484230041503906, "lambda_div_used": 0.5580313578248024, "learning_rate": 2.2412266235313973e-07, "loss": 0.0276, "reward": -0.1474175527691841, "reward_after_mean": -0.1474175527691841, "reward_after_std": 0.4200621973723173, "reward_before_mean": 0.3056653430685401, "reward_before_std": 0.2651460962370038, "reward_change_max": 0.0, "reward_change_mean": -0.45308290608227253, "reward_change_min": -0.6591883115470409, "reward_change_std": 0.24698374886065722, "reward_std": 0.42006222531199455, "rewards/accuracy_reward": 0.2916666679084301, "rewards/cosine_scaled_reward": 0.013998678419739008, "step": 392 }, { "clip_fraction": 0.0, "completion_length": 2144.7708892822266, "epoch": 0.4491428571428571, "grad_norm": 0.025813451036810875, "kl": 0.00031191110610961914, "lambda_div_used": 0.6718219220638275, "learning_rate": 2.2196411766036487e-07, "loss": -0.0357, "reward": 0.048092352226376534, "reward_after_mean": 0.048092352226376534, "reward_after_std": 0.7942012958228588, "reward_before_mean": 0.32687312876805663, "reward_before_std": 0.8060374613851309, "reward_change_max": 0.0, "reward_change_mean": -0.27878078632056713, "reward_change_min": -0.5697837248444557, "reward_change_std": 0.22098596021533012, "reward_std": 0.7942013349384069, "rewards/accuracy_reward": 0.31250000558793545, "rewards/cosine_scaled_reward": 0.014373119222000241, "step": 393 }, { "clip_fraction": 0.0, "completion_length": 3021.5833740234375, "epoch": 0.4502857142857143, "grad_norm": 0.02360186167061329, "kl": 0.00038444995880126953, "lambda_div_used": 0.6319213733077049, "learning_rate": 2.1982156097370557e-07, "loss": 0.052, "reward": -0.20860249735414982, "reward_after_mean": -0.20860249735414982, "reward_after_std": 0.6297264527529478, "reward_before_mean": 0.03818079084157944, "reward_before_std": 0.6159613355994225, "reward_change_max": 0.0, "reward_change_mean": -0.2467832900583744, "reward_change_min": -0.5391863323748112, "reward_change_std": 0.1899872226640582, "reward_std": 0.6297264751046896, "rewards/accuracy_reward": 0.1666666716337204, "rewards/cosine_scaled_reward": -0.12848588544875383, "step": 394 }, { "clip_fraction": 0.0, "completion_length": 2135.8333473205566, "epoch": 0.4514285714285714, "grad_norm": 0.04946435987949371, "kl": 0.00024586915969848633, "lambda_div_used": 0.576830618083477, "learning_rate": 2.1769509671835223e-07, "loss": -0.0301, "reward": -0.24670689832419157, "reward_after_mean": -0.24670689832419157, "reward_after_std": 0.4664156064391136, "reward_before_mean": 0.09556343220174313, "reward_before_std": 0.35311094112694263, "reward_change_max": 0.0, "reward_change_mean": -0.3422703631222248, "reward_change_min": -0.5164104513823986, "reward_change_std": 0.19275081250816584, "reward_std": 0.466415636241436, "rewards/accuracy_reward": 0.18750000186264515, "rewards/cosine_scaled_reward": -0.09193656174466014, "step": 395 }, { "clip_fraction": 0.0, "completion_length": 2614.750068664551, "epoch": 0.45257142857142857, "grad_norm": 0.021299051120877266, "kl": 0.00028631091117858887, "lambda_div_used": 0.6214602738618851, "learning_rate": 2.1558482853517253e-07, "loss": 0.0454, "reward": 0.09535084664821625, "reward_after_mean": 0.09535084664821625, "reward_after_std": 0.6240330375730991, "reward_before_mean": 0.5079526733607054, "reward_before_std": 0.5631251083686948, "reward_change_max": 0.0, "reward_change_mean": -0.41260186582803726, "reward_change_min": -0.6420417241752148, "reward_change_std": 0.2541531687602401, "reward_std": 0.6240330524742603, "rewards/accuracy_reward": 0.37500000558793545, "rewards/cosine_scaled_reward": 0.13295269757509232, "step": 396 }, { "clip_fraction": 0.0, "completion_length": 2262.5625610351562, "epoch": 0.45371428571428574, "grad_norm": 0.023174487054347992, "kl": 0.00028808414936065674, "lambda_div_used": 0.620752289891243, "learning_rate": 2.134908592756607e-07, "loss": -0.0334, "reward": 0.0681952117010951, "reward_after_mean": 0.0681952117010951, "reward_after_std": 0.6165110263973475, "reward_before_mean": 0.48235524632036686, "reward_before_std": 0.5585681181401014, "reward_change_max": 0.0, "reward_change_mean": -0.41416002810001373, "reward_change_min": -0.6492009982466698, "reward_change_std": 0.2585160303860903, "reward_std": 0.6165110506117344, "rewards/accuracy_reward": 0.35416667349636555, "rewards/cosine_scaled_reward": 0.1281885566713754, "step": 397 }, { "clip_fraction": 0.0, "completion_length": 2219.1875534057617, "epoch": 0.45485714285714285, "grad_norm": 0.02591089904308319, "kl": 0.0003045201301574707, "lambda_div_used": 0.6068568229675293, "learning_rate": 2.1141329099692406e-07, "loss": 0.0302, "reward": -0.10764243453741074, "reward_after_mean": -0.10764243453741074, "reward_after_std": 0.5858584549278021, "reward_before_mean": 0.2538683768361807, "reward_before_std": 0.4971063416451216, "reward_change_max": 0.0, "reward_change_mean": -0.36151083186268806, "reward_change_min": -0.6307843886315823, "reward_change_std": 0.2273276075720787, "reward_std": 0.5858584903180599, "rewards/accuracy_reward": 0.2500000037252903, "rewards/cosine_scaled_reward": 0.003868376836180687, "step": 398 }, { "clip_fraction": 0.0, "completion_length": 2028.7292175292969, "epoch": 0.456, "grad_norm": 0.023573419079184532, "kl": 0.00022789835929870605, "lambda_div_used": 0.6212376356124878, "learning_rate": 2.0935222495670968e-07, "loss": 0.0692, "reward": 0.19389131292700768, "reward_after_mean": 0.19389131292700768, "reward_after_std": 0.6119374781847, "reward_before_mean": 0.656364331021905, "reward_before_std": 0.5594985205680132, "reward_change_max": 0.0, "reward_change_mean": -0.4624730013310909, "reward_change_min": -0.6958450116217136, "reward_change_std": 0.28642346803098917, "reward_std": 0.6119375005364418, "rewards/accuracy_reward": 0.479166679084301, "rewards/cosine_scaled_reward": 0.1771976239979267, "step": 399 }, { "clip_fraction": 0.0, "completion_length": 1497.520881652832, "epoch": 0.45714285714285713, "grad_norm": 0.034730274230241776, "kl": 0.00023819506168365479, "lambda_div_used": 0.6499348282814026, "learning_rate": 2.0730776160846853e-07, "loss": -0.0384, "reward": 0.35817267652601004, "reward_after_mean": 0.35817267652601004, "reward_after_std": 0.7002598587423563, "reward_before_mean": 0.8351266942918301, "reward_before_std": 0.6965709868818521, "reward_change_max": 0.0, "reward_change_mean": -0.47695402428507805, "reward_change_min": -0.7681624032557011, "reward_change_std": 0.3101581484079361, "reward_std": 0.7002598755061626, "rewards/accuracy_reward": 0.5208333469927311, "rewards/cosine_scaled_reward": 0.31429335149005055, "step": 400 }, { "clip_fraction": 0.0, "completion_length": 2817.020851135254, "epoch": 0.4582857142857143, "grad_norm": 0.024576053023338318, "kl": 0.00034928321838378906, "lambda_div_used": 0.5570264235138893, "learning_rate": 2.0528000059645995e-07, "loss": 0.0255, "reward": -0.17014812678098679, "reward_after_mean": -0.17014812678098679, "reward_after_std": 0.3594451379030943, "reward_before_mean": 0.25060533825308084, "reward_before_std": 0.25669852178543806, "reward_change_max": 0.0, "reward_change_mean": -0.4207534771412611, "reward_change_min": -0.5965141579508781, "reward_change_std": 0.2299406472593546, "reward_std": 0.35944515466690063, "rewards/accuracy_reward": 0.2291666716337204, "rewards/cosine_scaled_reward": 0.02143866289407015, "step": 401 }, { "clip_fraction": 0.0, "completion_length": 2323.062530517578, "epoch": 0.4594285714285714, "grad_norm": 0.02968554012477398, "kl": 0.00033861398696899414, "lambda_div_used": 0.579738162457943, "learning_rate": 2.032690407508949e-07, "loss": -0.0104, "reward": -0.20102215744554996, "reward_after_mean": -0.20102215744554996, "reward_after_std": 0.4696238599717617, "reward_before_mean": 0.16348140873014927, "reward_before_std": 0.36563692055642605, "reward_change_max": 0.0, "reward_change_mean": -0.36450355127453804, "reward_change_min": -0.5928855016827583, "reward_change_std": 0.21332142874598503, "reward_std": 0.46962387673556805, "rewards/accuracy_reward": 0.2083333358168602, "rewards/cosine_scaled_reward": -0.044851938262581825, "step": 402 }, { "clip_fraction": 0.0, "completion_length": 1727.2708587646484, "epoch": 0.4605714285714286, "grad_norm": 0.03782231733202934, "kl": 0.0002620220184326172, "lambda_div_used": 0.5557271614670753, "learning_rate": 2.0127498008311922e-07, "loss": 0.0459, "reward": -0.10220484808087349, "reward_after_mean": -0.10220484808087349, "reward_after_std": 0.4127990063279867, "reward_before_mean": 0.37698337249457836, "reward_before_std": 0.25216651428490877, "reward_change_max": 0.0, "reward_change_mean": -0.4791882447898388, "reward_change_min": -0.6764676049351692, "reward_change_std": 0.25756980665028095, "reward_std": 0.4127990175038576, "rewards/accuracy_reward": 0.3125, "rewards/cosine_scaled_reward": 0.06448337621986866, "step": 403 }, { "clip_fraction": 0.0, "completion_length": 2469.5000076293945, "epoch": 0.4617142857142857, "grad_norm": 0.0334598608314991, "kl": 0.00029639899730682373, "lambda_div_used": 0.5738808363676071, "learning_rate": 1.9929791578083655e-07, "loss": 0.0173, "reward": 0.004905553534626961, "reward_after_mean": 0.004905553534626961, "reward_after_std": 0.48677791468799114, "reward_before_mean": 0.506166247650981, "reward_before_std": 0.33276718482375145, "reward_change_max": 0.0, "reward_change_mean": -0.5012606829404831, "reward_change_min": -0.6952128820121288, "reward_change_std": 0.26712857093662024, "reward_std": 0.4867779165506363, "rewards/accuracy_reward": 0.37500000558793545, "rewards/cosine_scaled_reward": 0.1311662346124649, "step": 404 }, { "clip_fraction": 0.0, "completion_length": 1970.9167022705078, "epoch": 0.46285714285714286, "grad_norm": 0.03668729215860367, "kl": 0.00032258033752441406, "lambda_div_used": 0.6146402955055237, "learning_rate": 1.9733794420337213e-07, "loss": 0.0422, "reward": 0.1430281363427639, "reward_after_mean": 0.1430281363427639, "reward_after_std": 0.5927682984620333, "reward_before_mean": 0.610472509637475, "reward_before_std": 0.530167305842042, "reward_change_max": 0.0, "reward_change_mean": -0.4674443490803242, "reward_change_min": -0.726629700511694, "reward_change_std": 0.28909510001540184, "reward_std": 0.592768307775259, "rewards/accuracy_reward": 0.4375000111758709, "rewards/cosine_scaled_reward": 0.17297248914837837, "step": 405 }, { "clip_fraction": 0.0, "completion_length": 2001.6875228881836, "epoch": 0.464, "grad_norm": 0.022773489356040955, "kl": 0.0002652406692504883, "lambda_div_used": 0.6149442344903946, "learning_rate": 1.9539516087697517e-07, "loss": -0.01, "reward": 0.13723283261060715, "reward_after_mean": 0.13723283261060715, "reward_after_std": 0.6057328097522259, "reward_before_mean": 0.5846194308251143, "reward_before_std": 0.5296543845906854, "reward_change_max": 0.0, "reward_change_mean": -0.44738658517599106, "reward_change_min": -0.6640791147947311, "reward_change_std": 0.267610440030694, "reward_std": 0.6057328246533871, "rewards/accuracy_reward": 0.41666667722165585, "rewards/cosine_scaled_reward": 0.16795274708420038, "step": 406 }, { "clip_fraction": 0.0, "completion_length": 2183.3333892822266, "epoch": 0.46514285714285714, "grad_norm": 0.029050234705209732, "kl": 0.00023674964904785156, "lambda_div_used": 0.5692102611064911, "learning_rate": 1.934696604901642e-07, "loss": -0.002, "reward": 0.08383433520793915, "reward_after_mean": 0.08383433520793915, "reward_after_std": 0.5238823061808944, "reward_before_mean": 0.6431381715228781, "reward_before_std": 0.31836726085748523, "reward_change_max": 0.0, "reward_change_mean": -0.5593038275837898, "reward_change_min": -0.7351665589958429, "reward_change_std": 0.29199546575546265, "reward_std": 0.5238823387771845, "rewards/accuracy_reward": 0.47916666977107525, "rewards/cosine_scaled_reward": 0.16397148557007313, "step": 407 }, { "clip_fraction": 0.0, "completion_length": 2340.208396911621, "epoch": 0.4662857142857143, "grad_norm": 0.0308608990162611, "kl": 0.0002751350402832031, "lambda_div_used": 0.595428429543972, "learning_rate": 1.915615368891117e-07, "loss": -0.0448, "reward": -0.14165206719189882, "reward_after_mean": -0.14165206719189882, "reward_after_std": 0.5383005198091269, "reward_before_mean": 0.22720495285466313, "reward_before_std": 0.4390671527944505, "reward_change_max": 0.0, "reward_change_mean": -0.3688570037484169, "reward_change_min": -0.560546163469553, "reward_change_std": 0.21708335354924202, "reward_std": 0.5383005253970623, "rewards/accuracy_reward": 0.27083333767950535, "rewards/cosine_scaled_reward": -0.043628389947116375, "step": 408 }, { "clip_fraction": 0.0, "completion_length": 3338.3333740234375, "epoch": 0.4674285714285714, "grad_norm": 0.017489202320575714, "kl": 0.0003743171691894531, "lambda_div_used": 0.641681618988514, "learning_rate": 1.8967088307307e-07, "loss": 0.0221, "reward": 0.0034925403306260705, "reward_after_mean": 0.0034925403306260705, "reward_after_std": 0.7239628247916698, "reward_before_mean": 0.3585042329505086, "reward_before_std": 0.6582456473261118, "reward_change_max": 0.0, "reward_change_mean": -0.3550117015838623, "reward_change_min": -0.627179455012083, "reward_change_std": 0.23101032618433237, "reward_std": 0.7239628490060568, "rewards/accuracy_reward": 0.3125000037252903, "rewards/cosine_scaled_reward": 0.04600422829389572, "step": 409 }, { "clip_fraction": 0.0, "completion_length": 2387.291690826416, "epoch": 0.4685714285714286, "grad_norm": 0.028829436749219894, "kl": 0.000278472900390625, "lambda_div_used": 0.6026698350906372, "learning_rate": 1.8779779118983867e-07, "loss": -0.0184, "reward": -0.08084386587142944, "reward_after_mean": -0.08084386587142944, "reward_after_std": 0.5610201843082905, "reward_before_mean": 0.2896402692422271, "reward_before_std": 0.4759355755522847, "reward_change_max": 0.0, "reward_change_mean": -0.37048413045704365, "reward_change_min": -0.5802675113081932, "reward_change_std": 0.2220335192978382, "reward_std": 0.561020215973258, "rewards/accuracy_reward": 0.27083333767950535, "rewards/cosine_scaled_reward": 0.018806922249495983, "step": 410 }, { "clip_fraction": 0.0, "completion_length": 2483.9791717529297, "epoch": 0.4697142857142857, "grad_norm": 0.02735401690006256, "kl": 0.00030410289764404297, "lambda_div_used": 0.5891791060566902, "learning_rate": 1.8594235253127372e-07, "loss": 0.0464, "reward": -0.09278726205229759, "reward_after_mean": -0.09278726205229759, "reward_after_std": 0.48578726314008236, "reward_before_mean": 0.29704072792083025, "reward_before_std": 0.41402094066143036, "reward_change_max": 0.0, "reward_change_mean": -0.3898279666900635, "reward_change_min": -0.6011387817561626, "reward_change_std": 0.234495647251606, "reward_std": 0.48578727059066296, "rewards/accuracy_reward": 0.2708333358168602, "rewards/cosine_scaled_reward": 0.026207380928099155, "step": 411 }, { "clip_fraction": 0.0, "completion_length": 2443.000057220459, "epoch": 0.47085714285714286, "grad_norm": 0.027080198749899864, "kl": 0.0003039836883544922, "lambda_div_used": 0.626535639166832, "learning_rate": 1.8410465752883758e-07, "loss": 0.0382, "reward": 0.18809181079268456, "reward_after_mean": 0.18809181079268456, "reward_after_std": 0.6282138898968697, "reward_before_mean": 0.6274868324398994, "reward_before_std": 0.5877205710858107, "reward_change_max": 0.0, "reward_change_mean": -0.4393950141966343, "reward_change_min": -0.6751919612288475, "reward_change_std": 0.2793376138433814, "reward_std": 0.6282139029353857, "rewards/accuracy_reward": 0.4583333432674408, "rewards/cosine_scaled_reward": 0.16915349289774895, "step": 412 }, { "clip_fraction": 0.0, "completion_length": 2277.0417251586914, "epoch": 0.472, "grad_norm": 0.02998846024274826, "kl": 0.00023829936981201172, "lambda_div_used": 0.6537542790174484, "learning_rate": 1.822847957491922e-07, "loss": 0.023, "reward": 0.0677551869302988, "reward_after_mean": 0.0677551869302988, "reward_after_std": 0.6939267106354237, "reward_before_mean": 0.3922595623880625, "reward_before_std": 0.7187845781445503, "reward_change_max": 0.0, "reward_change_mean": -0.3245043680071831, "reward_change_min": -0.6250169165432453, "reward_change_std": 0.24318813905119896, "reward_std": 0.6939267329871655, "rewards/accuracy_reward": 0.3333333432674408, "rewards/cosine_scaled_reward": 0.05892622594546992, "step": 413 }, { "clip_fraction": 0.0, "completion_length": 3080.5625, "epoch": 0.47314285714285714, "grad_norm": 0.016316330060362816, "kl": 0.00028955936431884766, "lambda_div_used": 0.6257347464561462, "learning_rate": 1.804828558898332e-07, "loss": 0.0235, "reward": -0.13909682049416006, "reward_after_mean": -0.13909682049416006, "reward_after_std": 0.6051931101828814, "reward_before_mean": 0.1403335351496935, "reward_before_std": 0.583410625346005, "reward_change_max": 0.0, "reward_change_mean": -0.2794303596019745, "reward_change_min": -0.45995376631617546, "reward_change_std": 0.1819247854873538, "reward_std": 0.6051931362599134, "rewards/accuracy_reward": 0.1875000037252903, "rewards/cosine_scaled_reward": -0.04716646298766136, "step": 414 }, { "clip_fraction": 0.0, "completion_length": 3129.0208740234375, "epoch": 0.4742857142857143, "grad_norm": 0.020433053374290466, "kl": 0.0003305673599243164, "lambda_div_used": 0.6250654757022858, "learning_rate": 1.7869892577476722e-07, "loss": 0.0036, "reward": -0.22648247238248587, "reward_after_mean": -0.22648247238248587, "reward_after_std": 0.6230853609740734, "reward_before_mean": 0.01668240688741207, "reward_before_std": 0.5801385007798672, "reward_change_max": 0.0, "reward_change_mean": -0.24316489323973656, "reward_change_min": -0.43964531272649765, "reward_change_std": 0.1610111938789487, "reward_std": 0.6230853945016861, "rewards/accuracy_reward": 0.1458333358168602, "rewards/cosine_scaled_reward": -0.12915092520415783, "step": 415 }, { "clip_fraction": 0.0, "completion_length": 1350.31254196167, "epoch": 0.4754285714285714, "grad_norm": 0.029406633228063583, "kl": 0.00015437602996826172, "lambda_div_used": 0.6057035326957703, "learning_rate": 1.7693309235023127e-07, "loss": -0.006, "reward": -0.15187997743487358, "reward_after_mean": -0.15187997743487358, "reward_after_std": 0.500312227755785, "reward_before_mean": 0.1518111266195774, "reward_before_std": 0.4886645954102278, "reward_change_max": 0.0, "reward_change_mean": -0.3036911189556122, "reward_change_min": -0.5342599004507065, "reward_change_std": 0.20331810228526592, "reward_std": 0.5003122296184301, "rewards/accuracy_reward": 0.2083333395421505, "rewards/cosine_scaled_reward": -0.05652220547199249, "step": 416 }, { "clip_fraction": 0.0, "completion_length": 3118.062530517578, "epoch": 0.4765714285714286, "grad_norm": 0.022452035918831825, "kl": 0.00038176774978637695, "lambda_div_used": 0.5929878354072571, "learning_rate": 1.7518544168045524e-07, "loss": -0.0269, "reward": -0.31791230058297515, "reward_after_mean": -0.31791230058297515, "reward_after_std": 0.5058948453515768, "reward_before_mean": -0.05459975823760033, "reward_before_std": 0.4272688911296427, "reward_change_max": 0.0, "reward_change_mean": -0.2633125390857458, "reward_change_min": -0.4145249240100384, "reward_change_std": 0.15364530310034752, "reward_std": 0.5058948528021574, "rewards/accuracy_reward": 0.0833333358168602, "rewards/cosine_scaled_reward": -0.13793309262837283, "step": 417 }, { "clip_fraction": 0.0, "completion_length": 2075.0417289733887, "epoch": 0.4777142857142857, "grad_norm": 0.03444957733154297, "kl": 0.00030410289764404297, "lambda_div_used": 0.5850719586014748, "learning_rate": 1.7345605894346726e-07, "loss": -0.0603, "reward": 0.03764221305027604, "reward_after_mean": 0.03764221305027604, "reward_after_std": 0.5699926447123289, "reward_before_mean": 0.5322978757321835, "reward_before_std": 0.3904368221992627, "reward_change_max": 0.0, "reward_change_mean": -0.4946556333452463, "reward_change_min": -0.6658468469977379, "reward_change_std": 0.2617466766387224, "reward_std": 0.5699926633387804, "rewards/accuracy_reward": 0.3750000037252903, "rewards/cosine_scaled_reward": 0.15729784907307476, "step": 418 }, { "clip_fraction": 0.0, "completion_length": 2422.062515258789, "epoch": 0.47885714285714287, "grad_norm": 0.023973578587174416, "kl": 0.0002415478229522705, "lambda_div_used": 0.5876604542136192, "learning_rate": 1.7174502842694212e-07, "loss": 0.0464, "reward": -0.01938623934984207, "reward_after_mean": -0.01938623934984207, "reward_after_std": 0.531193170696497, "reward_before_mean": 0.4354735445231199, "reward_before_std": 0.4033324606716633, "reward_change_max": 0.0, "reward_change_mean": -0.45485977828502655, "reward_change_min": -0.6743562705814838, "reward_change_std": 0.2555869175121188, "reward_std": 0.5311931855976582, "rewards/accuracy_reward": 0.33333333395421505, "rewards/cosine_scaled_reward": 0.10214020684361458, "step": 419 }, { "clip_fraction": 0.0, "completion_length": 1696.083381652832, "epoch": 0.48, "grad_norm": 0.04010794684290886, "kl": 0.0002671480178833008, "lambda_div_used": 0.5893955454230309, "learning_rate": 1.7005243352409333e-07, "loss": -0.0499, "reward": -0.18572357669472694, "reward_after_mean": -0.18572357669472694, "reward_after_std": 0.4607646930962801, "reward_before_mean": 0.12988583371043205, "reward_before_std": 0.416760787833482, "reward_change_max": 0.0, "reward_change_mean": -0.31560939736664295, "reward_change_min": -0.5215952098369598, "reward_change_std": 0.1989196827635169, "reward_std": 0.4607647117227316, "rewards/accuracy_reward": 0.2708333358168602, "rewards/cosine_scaled_reward": -0.14094750862568617, "step": 420 }, { "clip_fraction": 0.0, "completion_length": 2925.3333892822266, "epoch": 0.48114285714285715, "grad_norm": 0.025533905252814293, "kl": 0.0003809928894042969, "lambda_div_used": 0.6258808895945549, "learning_rate": 1.6837835672960831e-07, "loss": 0.0252, "reward": -0.1876915767788887, "reward_after_mean": -0.1876915767788887, "reward_after_std": 0.6121686920523643, "reward_before_mean": 0.07189313881099224, "reward_before_std": 0.5847431821748614, "reward_change_max": 0.0, "reward_change_mean": -0.25958471931517124, "reward_change_min": -0.5139855779707432, "reward_change_std": 0.1843523010611534, "reward_std": 0.6121687144041061, "rewards/accuracy_reward": 0.16666666977107525, "rewards/cosine_scaled_reward": -0.09477353328838944, "step": 421 }, { "clip_fraction": 0.0, "completion_length": 2811.4166946411133, "epoch": 0.48228571428571426, "grad_norm": 0.020569510757923126, "kl": 0.00035816431045532227, "lambda_div_used": 0.5612113624811172, "learning_rate": 1.6672287963562852e-07, "loss": 0.0257, "reward": -0.22088398411870003, "reward_after_mean": -0.22088398411870003, "reward_after_std": 0.37868294678628445, "reward_before_mean": 0.17238148115575314, "reward_before_std": 0.27797973807901144, "reward_change_max": 0.0, "reward_change_mean": -0.3932654559612274, "reward_change_min": -0.5825164802372456, "reward_change_std": 0.22011223249137402, "reward_std": 0.3786829560995102, "rewards/accuracy_reward": 0.1875, "rewards/cosine_scaled_reward": -0.015118520706892014, "step": 422 }, { "clip_fraction": 0.0, "completion_length": 2948.2916870117188, "epoch": 0.48342857142857143, "grad_norm": 0.021468866616487503, "kl": 0.0003040432929992676, "lambda_div_used": 0.6255258545279503, "learning_rate": 1.6508608292777203e-07, "loss": -0.0097, "reward": -0.10086626000702381, "reward_after_mean": -0.10086626000702381, "reward_after_std": 0.5887319762259722, "reward_before_mean": 0.19233786687254906, "reward_before_std": 0.5806169025599957, "reward_change_max": 0.0, "reward_change_mean": -0.2932041045278311, "reward_change_min": -0.5194867514073849, "reward_change_std": 0.19756229128688574, "reward_std": 0.5887319948524237, "rewards/accuracy_reward": 0.22916667349636555, "rewards/cosine_scaled_reward": -0.03682881221175194, "step": 423 }, { "clip_fraction": 0.0, "completion_length": 2725.6041946411133, "epoch": 0.4845714285714286, "grad_norm": 0.025675497949123383, "kl": 0.00032907724380493164, "lambda_div_used": 0.6430495753884315, "learning_rate": 1.6346804638120098e-07, "loss": -0.0044, "reward": -0.06141174025833607, "reward_after_mean": -0.06141174025833607, "reward_after_std": 0.6873566564172506, "reward_before_mean": 0.2285282697994262, "reward_before_std": 0.664992194622755, "reward_change_max": 0.0, "reward_change_mean": -0.28994001634418964, "reward_change_min": -0.5423923581838608, "reward_change_std": 0.20123817585408688, "reward_std": 0.6873566769063473, "rewards/accuracy_reward": 0.25000000931322575, "rewards/cosine_scaled_reward": -0.021471746265888214, "step": 424 }, { "clip_fraction": 0.0, "completion_length": 1639.2292175292969, "epoch": 0.4857142857142857, "grad_norm": 0.025542836636304855, "kl": 0.00018703937530517578, "lambda_div_used": 0.6579956188797951, "learning_rate": 1.6186884885673413e-07, "loss": 0.0441, "reward": 0.5941350422799587, "reward_after_mean": 0.5941350422799587, "reward_after_std": 0.8043180033564568, "reward_before_mean": 1.2064684219658375, "reward_before_std": 0.7356861205771565, "reward_change_max": 0.0, "reward_change_mean": -0.6123334169387817, "reward_change_min": -0.9342719316482544, "reward_change_std": 0.38359352573752403, "reward_std": 0.804318018257618, "rewards/accuracy_reward": 0.7500000186264515, "rewards/cosine_scaled_reward": 0.456468403339386, "step": 425 }, { "clip_fraction": 0.0, "completion_length": 2131.9375381469727, "epoch": 0.4868571428571429, "grad_norm": 0.029882676899433136, "kl": 0.00031810998916625977, "lambda_div_used": 0.6198792308568954, "learning_rate": 1.6028856829700258e-07, "loss": -0.0153, "reward": 0.03275429271161556, "reward_after_mean": 0.03275429271161556, "reward_after_std": 0.6299177911132574, "reward_before_mean": 0.4296752456575632, "reward_before_std": 0.5544933034107089, "reward_change_max": 0.0, "reward_change_mean": -0.3969209287315607, "reward_change_min": -0.6235288828611374, "reward_change_std": 0.24328004382550716, "reward_std": 0.6299178209155798, "rewards/accuracy_reward": 0.37500000931322575, "rewards/cosine_scaled_reward": 0.05467522703111172, "step": 426 }, { "clip_fraction": 0.0, "completion_length": 3065.3958587646484, "epoch": 0.488, "grad_norm": 0.024560794234275818, "kl": 0.0003638267517089844, "lambda_div_used": 0.5728301778435707, "learning_rate": 1.5872728172265146e-07, "loss": 0.0713, "reward": -0.181079788133502, "reward_after_mean": -0.181079788133502, "reward_after_std": 0.4291039705276489, "reward_before_mean": 0.19762573204934597, "reward_before_std": 0.3286724528297782, "reward_change_max": 0.0, "reward_change_mean": -0.3787055220454931, "reward_change_min": -0.5433184914290905, "reward_change_std": 0.20709905866533518, "reward_std": 0.4291039779782295, "rewards/accuracy_reward": 0.25000000558793545, "rewards/cosine_scaled_reward": -0.05237427353858948, "step": 427 }, { "clip_fraction": 0.0, "completion_length": 2331.416702270508, "epoch": 0.48914285714285716, "grad_norm": 0.027837947010993958, "kl": 0.00027292966842651367, "lambda_div_used": 0.6576317623257637, "learning_rate": 1.5718506522858572e-07, "loss": 0.0627, "reward": 0.037938148714601994, "reward_after_mean": 0.037938148714601994, "reward_after_std": 0.7411033473908901, "reward_before_mean": 0.3408977910876274, "reward_before_std": 0.7301055882126093, "reward_change_max": 0.0, "reward_change_mean": -0.3029596321284771, "reward_change_min": -0.5550829358398914, "reward_change_std": 0.21431603003293276, "reward_std": 0.7411033622920513, "rewards/accuracy_reward": 0.3125000074505806, "rewards/cosine_scaled_reward": 0.028397773392498493, "step": 428 }, { "clip_fraction": 0.0, "completion_length": 2035.8750381469727, "epoch": 0.49028571428571427, "grad_norm": 0.0320022888481617, "kl": 0.00036203861236572266, "lambda_div_used": 0.6039041504263878, "learning_rate": 1.5566199398026147e-07, "loss": -0.0493, "reward": -0.09264844097197056, "reward_after_mean": -0.09264844097197056, "reward_after_std": 0.5593565441668034, "reward_before_mean": 0.28112196549773216, "reward_before_std": 0.4800149817019701, "reward_change_max": 0.0, "reward_change_mean": -0.3737703934311867, "reward_change_min": -0.602749090641737, "reward_change_std": 0.22795243095606565, "reward_std": 0.5593565553426743, "rewards/accuracy_reward": 0.27083333767950535, "rewards/cosine_scaled_reward": 0.010288612451404333, "step": 429 }, { "clip_fraction": 0.0, "completion_length": 2334.7083778381348, "epoch": 0.49142857142857144, "grad_norm": 0.02567731775343418, "kl": 0.00031498074531555176, "lambda_div_used": 0.609040379524231, "learning_rate": 1.5415814221002265e-07, "loss": 0.0017, "reward": -0.11068469006568193, "reward_after_mean": -0.11068469006568193, "reward_after_std": 0.5843690279871225, "reward_before_mean": 0.24647100269794464, "reward_before_std": 0.5133073255419731, "reward_change_max": 0.0, "reward_change_mean": -0.35715569369494915, "reward_change_min": -0.6265733800828457, "reward_change_std": 0.23320611286908388, "reward_std": 0.5843690391629934, "rewards/accuracy_reward": 0.2708333358168602, "rewards/cosine_scaled_reward": -0.02436233009211719, "step": 430 }, { "clip_fraction": 0.0, "completion_length": 2203.125015258789, "epoch": 0.49257142857142855, "grad_norm": 0.030070627108216286, "kl": 0.0003618001937866211, "lambda_div_used": 0.5612699165940285, "learning_rate": 1.5267358321348285e-07, "loss": 0.0147, "reward": -0.21507295966148376, "reward_after_mean": -0.21507295966148376, "reward_after_std": 0.37470250017941, "reward_before_mean": 0.18050049245357513, "reward_before_std": 0.2745527196675539, "reward_change_max": 0.0, "reward_change_mean": -0.39557345397770405, "reward_change_min": -0.5842532999813557, "reward_change_std": 0.21895906049758196, "reward_std": 0.37470250204205513, "rewards/accuracy_reward": 0.2083333358168602, "rewards/cosine_scaled_reward": -0.027832843363285065, "step": 431 }, { "clip_fraction": 0.0, "completion_length": 2875.2917098999023, "epoch": 0.4937142857142857, "grad_norm": 0.021773481741547585, "kl": 0.00034308433532714844, "lambda_div_used": 0.6463272646069527, "learning_rate": 1.5120838934595337e-07, "loss": -0.0175, "reward": 0.0671270489692688, "reward_after_mean": 0.0671270489692688, "reward_after_std": 0.6703518275171518, "reward_before_mean": 0.3971955068409443, "reward_before_std": 0.6849911892786622, "reward_change_max": 0.0, "reward_change_mean": -0.3300684615969658, "reward_change_min": -0.6204027272760868, "reward_change_std": 0.24090207554399967, "reward_std": 0.6703518535941839, "rewards/accuracy_reward": 0.3333333432674408, "rewards/cosine_scaled_reward": 0.06386216171085835, "step": 432 }, { "clip_fraction": 0.0, "completion_length": 2982.0000610351562, "epoch": 0.4948571428571429, "grad_norm": 0.021818527951836586, "kl": 0.0003089308738708496, "lambda_div_used": 0.5629478171467781, "learning_rate": 1.4976263201891613e-07, "loss": -0.0006, "reward": -0.06292321160435677, "reward_after_mean": -0.06292321160435677, "reward_after_std": 0.43814039044082165, "reward_before_mean": 0.4110525958240032, "reward_before_std": 0.28899803664535284, "reward_change_max": 0.0, "reward_change_mean": -0.4739758223295212, "reward_change_min": -0.672174334526062, "reward_change_std": 0.25748884305357933, "reward_std": 0.4381403960287571, "rewards/accuracy_reward": 0.3333333358168602, "rewards/cosine_scaled_reward": 0.07771925255656242, "step": 433 }, { "clip_fraction": 0.0, "completion_length": 2822.6458587646484, "epoch": 0.496, "grad_norm": 0.024085119366645813, "kl": 0.0003399848937988281, "lambda_div_used": 0.5637771561741829, "learning_rate": 1.483363816965435e-07, "loss": 0.0353, "reward": -0.39865921065211296, "reward_after_mean": -0.39865921065211296, "reward_after_std": 0.34102493710815907, "reward_before_mean": -0.12386159785091877, "reward_before_std": 0.2913210419937968, "reward_change_max": 0.0, "reward_change_mean": -0.2747976202517748, "reward_change_min": -0.4381771683692932, "reward_change_std": 0.16477954387664795, "reward_std": 0.34102495945990086, "rewards/accuracy_reward": 0.0625, "rewards/cosine_scaled_reward": -0.18636160157620907, "step": 434 }, { "clip_fraction": 0.0, "completion_length": 2098.333366394043, "epoch": 0.49714285714285716, "grad_norm": 0.03476332873106003, "kl": 0.0004093945026397705, "lambda_div_used": 0.561927042901516, "learning_rate": 1.469297078922642e-07, "loss": 0.012, "reward": -0.23742017894983292, "reward_after_mean": -0.23742017894983292, "reward_after_std": 0.3864587936550379, "reward_before_mean": 0.1438802983611822, "reward_before_std": 0.28451414965093136, "reward_change_max": 0.0, "reward_change_mean": -0.3813004810363054, "reward_change_min": -0.5687282234430313, "reward_change_std": 0.21544194873422384, "reward_std": 0.3864588178694248, "rewards/accuracy_reward": 0.1875, "rewards/cosine_scaled_reward": -0.043619705364108086, "step": 435 }, { "clip_fraction": 0.0, "completion_length": 1956.6666984558105, "epoch": 0.4982857142857143, "grad_norm": 0.03724474087357521, "kl": 0.00024643540382385254, "lambda_div_used": 0.5972427576780319, "learning_rate": 1.4554267916537495e-07, "loss": -0.0215, "reward": 0.24212833493947983, "reward_after_mean": 0.24212833493947983, "reward_after_std": 0.5586434360593557, "reward_before_mean": 0.7999392561614513, "reward_before_std": 0.44878256041556597, "reward_change_max": 0.0, "reward_change_mean": -0.5578109100461006, "reward_change_min": -0.789992418140173, "reward_change_std": 0.32050481624901295, "reward_std": 0.5586434435099363, "rewards/accuracy_reward": 0.5208333432674408, "rewards/cosine_scaled_reward": 0.27910589799284935, "step": 436 }, { "clip_fraction": 0.0, "completion_length": 2701.1458587646484, "epoch": 0.49942857142857144, "grad_norm": 0.02587887831032276, "kl": 0.0003287792205810547, "lambda_div_used": 0.5530718564987183, "learning_rate": 1.4417536311769885e-07, "loss": -0.0406, "reward": -0.2597166027408093, "reward_after_mean": -0.2597166027408093, "reward_after_std": 0.3689497411251068, "reward_before_mean": 0.13261681143194437, "reward_before_std": 0.23820086661726236, "reward_change_max": 0.0, "reward_change_mean": -0.39233342185616493, "reward_change_min": -0.5372026227414608, "reward_change_std": 0.20350486412644386, "reward_std": 0.3689497448503971, "rewards/accuracy_reward": 0.14583333395421505, "rewards/cosine_scaled_reward": -0.013216521823778749, "step": 437 }, { "clip_fraction": 0.0, "completion_length": 2737.6667404174805, "epoch": 0.5005714285714286, "grad_norm": 0.02056184597313404, "kl": 0.0002714395523071289, "lambda_div_used": 0.6291297823190689, "learning_rate": 1.4282782639029128e-07, "loss": -0.0391, "reward": 0.011486291885375977, "reward_after_mean": 0.011486291885375977, "reward_after_std": 0.6032967660576105, "reward_before_mean": 0.34824367985129356, "reward_before_std": 0.6035201866179705, "reward_change_max": 0.0, "reward_change_mean": -0.3367573842406273, "reward_change_min": -0.588931929320097, "reward_change_std": 0.23199212551116943, "reward_std": 0.6032967790961266, "rewards/accuracy_reward": 0.2916666753590107, "rewards/cosine_scaled_reward": 0.05657700449228287, "step": 438 }, { "clip_fraction": 0.0, "completion_length": 2417.875057220459, "epoch": 0.5017142857142857, "grad_norm": 0.02567973919212818, "kl": 0.00029343366622924805, "lambda_div_used": 0.6277910619974136, "learning_rate": 1.4150013466019114e-07, "loss": 0.013, "reward": -0.039602138102054596, "reward_after_mean": -0.039602138102054596, "reward_after_std": 0.6028888281434774, "reward_before_mean": 0.2732508610934019, "reward_before_std": 0.5980509808287024, "reward_change_max": 0.0, "reward_change_mean": -0.3128530103713274, "reward_change_min": -0.5787924043834209, "reward_change_std": 0.21938505861908197, "reward_std": 0.6028888486325741, "rewards/accuracy_reward": 0.2500000074505806, "rewards/cosine_scaled_reward": 0.023250849917531013, "step": 439 }, { "clip_fraction": 0.0, "completion_length": 2942.770835876465, "epoch": 0.5028571428571429, "grad_norm": 0.028702648356556892, "kl": 0.00038042664527893066, "lambda_div_used": 0.5716730058193207, "learning_rate": 1.4019235263722034e-07, "loss": -0.0684, "reward": -0.40989339258521795, "reward_after_mean": -0.40989339258521795, "reward_after_std": 0.42086669616401196, "reward_before_mean": -0.15851380862295628, "reward_before_std": 0.3258522395044565, "reward_change_max": 0.0, "reward_change_mean": -0.2513795755803585, "reward_change_min": -0.3500328026711941, "reward_change_std": 0.13096946012228727, "reward_std": 0.42086671106517315, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.2001804756000638, "step": 440 }, { "clip_fraction": 0.0, "completion_length": 2898.250015258789, "epoch": 0.504, "grad_norm": 0.025851793587207794, "kl": 0.0004258155822753906, "lambda_div_used": 0.5947419032454491, "learning_rate": 1.3890454406082956e-07, "loss": 0.0188, "reward": -0.09268893860280514, "reward_after_mean": -0.09268893860280514, "reward_after_std": 0.527678394690156, "reward_before_mean": 0.28031823271885514, "reward_before_std": 0.43342068372294307, "reward_change_max": 0.0, "reward_change_mean": -0.373007170855999, "reward_change_min": -0.5688190311193466, "reward_change_std": 0.2149599390104413, "reward_std": 0.5276784114539623, "rewards/accuracy_reward": 0.31250000558793545, "rewards/cosine_scaled_reward": -0.03218177333474159, "step": 441 }, { "clip_fraction": 0.0, "completion_length": 2738.312530517578, "epoch": 0.5051428571428571, "grad_norm": 0.024453362450003624, "kl": 0.0003438591957092285, "lambda_div_used": 0.6373191103339195, "learning_rate": 1.3763677169699217e-07, "loss": -0.0098, "reward": -0.11879788711667061, "reward_after_mean": -0.11879788711667061, "reward_after_std": 0.6699451506137848, "reward_before_mean": 0.15645072294864804, "reward_before_std": 0.6331657916307449, "reward_change_max": 0.0, "reward_change_mean": -0.27524859458208084, "reward_change_min": -0.4614573121070862, "reward_change_std": 0.1758969947695732, "reward_std": 0.669945165514946, "rewards/accuracy_reward": 0.20833334140479565, "rewards/cosine_scaled_reward": -0.051882621832191944, "step": 442 }, { "clip_fraction": 0.0, "completion_length": 3111.3541870117188, "epoch": 0.5062857142857143, "grad_norm": 0.018424130976200104, "kl": 0.0003415346145629883, "lambda_div_used": 0.594095878303051, "learning_rate": 1.3638909733514452e-07, "loss": -0.0021, "reward": -0.13722801115363836, "reward_after_mean": -0.13722801115363836, "reward_after_std": 0.5484136454761028, "reward_before_mean": 0.23274649307131767, "reward_before_std": 0.4324050806462765, "reward_change_max": 0.0, "reward_change_mean": -0.36997453309595585, "reward_change_min": -0.530858725309372, "reward_change_std": 0.2040829285979271, "reward_std": 0.5484136454761028, "rewards/accuracy_reward": 0.2500000037252903, "rewards/cosine_scaled_reward": -0.01725347526371479, "step": 443 }, { "clip_fraction": 0.0, "completion_length": 2850.6666984558105, "epoch": 0.5074285714285715, "grad_norm": 0.023973438888788223, "kl": 0.0003078579902648926, "lambda_div_used": 0.5948461815714836, "learning_rate": 1.351615817851748e-07, "loss": -0.0055, "reward": -0.1747817099094391, "reward_after_mean": -0.1747817099094391, "reward_after_std": 0.4692641645669937, "reward_before_mean": 0.1552269384264946, "reward_before_std": 0.4344164803624153, "reward_change_max": 0.0, "reward_change_mean": -0.3300086557865143, "reward_change_min": -0.4955419562757015, "reward_change_std": 0.19987357687205076, "reward_std": 0.4692641757428646, "rewards/accuracy_reward": 0.20833334140479565, "rewards/cosine_scaled_reward": -0.05310639180243015, "step": 444 }, { "clip_fraction": 0.0, "completion_length": 2896.1458892822266, "epoch": 0.5085714285714286, "grad_norm": 0.021344272419810295, "kl": 0.000371396541595459, "lambda_div_used": 0.5911799594759941, "learning_rate": 1.3395428487445914e-07, "loss": 0.0204, "reward": -0.09663986414670944, "reward_after_mean": -0.09663986414670944, "reward_after_std": 0.5167377535253763, "reward_before_mean": 0.29358627926558256, "reward_before_std": 0.4120226204395294, "reward_change_max": 0.0, "reward_change_mean": -0.3902261406183243, "reward_change_min": -0.5846884902566671, "reward_change_std": 0.21900581941008568, "reward_std": 0.5167377851903439, "rewards/accuracy_reward": 0.31250000558793545, "rewards/cosine_scaled_reward": -0.018913742154836655, "step": 445 }, { "clip_fraction": 0.0, "completion_length": 2849.7083892822266, "epoch": 0.5097142857142857, "grad_norm": 0.020637815818190575, "kl": 0.0003114938735961914, "lambda_div_used": 0.5921742841601372, "learning_rate": 1.3276726544494571e-07, "loss": 0.0358, "reward": -0.19162439927458763, "reward_after_mean": -0.19162439927458763, "reward_after_std": 0.4613885171711445, "reward_before_mean": 0.12132475152611732, "reward_before_std": 0.42135200183838606, "reward_change_max": 0.0, "reward_change_mean": -0.31294916570186615, "reward_change_min": -0.5426378659904003, "reward_change_std": 0.196873115375638, "reward_std": 0.4613885283470154, "rewards/accuracy_reward": 0.1875000074505806, "rewards/cosine_scaled_reward": -0.06617523916065693, "step": 446 }, { "clip_fraction": 0.0, "completion_length": 1833.833351135254, "epoch": 0.5108571428571429, "grad_norm": 0.03296150267124176, "kl": 0.00031435489654541016, "lambda_div_used": 0.5761597007513046, "learning_rate": 1.316005813502869e-07, "loss": -0.0092, "reward": -0.1308344192802906, "reward_after_mean": -0.1308344192802906, "reward_after_std": 0.44760639779269695, "reward_before_mean": 0.27170680463314056, "reward_before_std": 0.3468586690723896, "reward_change_max": 0.0, "reward_change_mean": -0.40254124999046326, "reward_change_min": -0.6140144616365433, "reward_change_std": 0.23130866140127182, "reward_std": 0.44760641269385815, "rewards/accuracy_reward": 0.2708333395421505, "rewards/cosine_scaled_reward": 0.0008734846487641335, "step": 447 }, { "clip_fraction": 0.0, "completion_length": 2031.1875228881836, "epoch": 0.512, "grad_norm": 0.03561374545097351, "kl": 0.0002592802047729492, "lambda_div_used": 0.5793976187705994, "learning_rate": 1.3045428945301953e-07, "loss": 0.0625, "reward": -0.15141154546290636, "reward_after_mean": -0.15141154546290636, "reward_after_std": 0.47303674556314945, "reward_before_mean": 0.2355510238558054, "reward_before_std": 0.36865185387432575, "reward_change_max": 0.0, "reward_change_mean": -0.38696256279945374, "reward_change_min": -0.6033525615930557, "reward_change_std": 0.22703420650213957, "reward_std": 0.47303677164018154, "rewards/accuracy_reward": 0.2291666679084301, "rewards/cosine_scaled_reward": 0.00638435548171401, "step": 448 }, { "clip_fraction": 0.0, "completion_length": 2527.416679382324, "epoch": 0.5131428571428571, "grad_norm": 0.028000032529234886, "kl": 0.0003337860107421875, "lambda_div_used": 0.5869953334331512, "learning_rate": 1.2932844562179352e-07, "loss": -0.0384, "reward": -0.2571147223934531, "reward_after_mean": -0.2571147223934531, "reward_after_std": 0.44822895526885986, "reward_before_mean": 0.04588266555219889, "reward_before_std": 0.39473184011876583, "reward_change_max": 0.0, "reward_change_mean": -0.30299739353358746, "reward_change_min": -0.4681765213608742, "reward_change_std": 0.17783036269247532, "reward_std": 0.4482289757579565, "rewards/accuracy_reward": 0.20833334140479565, "rewards/cosine_scaled_reward": -0.1624506814405322, "step": 449 }, { "clip_fraction": 0.0, "completion_length": 2360.3125228881836, "epoch": 0.5142857142857142, "grad_norm": 0.028984738513827324, "kl": 0.00030869245529174805, "lambda_div_used": 0.5782932788133621, "learning_rate": 1.2822310472864885e-07, "loss": -0.0103, "reward": -0.1581341177225113, "reward_after_mean": -0.1581341177225113, "reward_after_std": 0.45438094437122345, "reward_before_mean": 0.22925141779705882, "reward_before_std": 0.3587577445432544, "reward_change_max": 0.0, "reward_change_mean": -0.3873855248093605, "reward_change_min": -0.5598408095538616, "reward_change_std": 0.21874273754656315, "reward_std": 0.4543809536844492, "rewards/accuracy_reward": 0.22916666977107525, "rewards/cosine_scaled_reward": 8.474662899971008e-05, "step": 450 }, { "clip_fraction": 0.0, "completion_length": 2468.1667098999023, "epoch": 0.5154285714285715, "grad_norm": 0.035742077976465225, "kl": 0.0004057884216308594, "lambda_div_used": 0.5319794341921806, "learning_rate": 1.2713832064634125e-07, "loss": -0.0206, "reward": -0.24930068850517273, "reward_after_mean": -0.24930068850517273, "reward_after_std": 0.3233966138213873, "reward_before_mean": 0.2043198449537158, "reward_before_std": 0.1404099608771503, "reward_change_max": 0.0, "reward_change_mean": -0.4536205381155014, "reward_change_min": -0.6192456483840942, "reward_change_std": 0.23171372152864933, "reward_std": 0.3233966249972582, "rewards/accuracy_reward": 0.25, "rewards/cosine_scaled_reward": -0.04568016994744539, "step": 451 }, { "clip_fraction": 0.0, "completion_length": 2740.4166831970215, "epoch": 0.5165714285714286, "grad_norm": 0.02392762340605259, "kl": 0.0003084242343902588, "lambda_div_used": 0.6302760690450668, "learning_rate": 1.260741462457165e-07, "loss": -0.0038, "reward": 0.021876126527786255, "reward_after_mean": 0.021876126527786255, "reward_after_std": 0.5877971854060888, "reward_before_mean": 0.37123518623411655, "reward_before_std": 0.6054041795432568, "reward_change_max": 0.0, "reward_change_mean": -0.34935908019542694, "reward_change_min": -0.593828123062849, "reward_change_std": 0.2436074260622263, "reward_std": 0.5877972133457661, "rewards/accuracy_reward": 0.3333333432674408, "rewards/cosine_scaled_reward": 0.03790186531841755, "step": 452 }, { "clip_fraction": 0.0, "completion_length": 2600.1667518615723, "epoch": 0.5177142857142857, "grad_norm": 0.03355313092470169, "kl": 0.0003637075424194336, "lambda_div_used": 0.6487660184502602, "learning_rate": 1.2503063339313356e-07, "loss": 0.0766, "reward": 0.08280018530786037, "reward_after_mean": 0.08280018530786037, "reward_after_std": 0.721244465559721, "reward_before_mean": 0.43781263194978237, "reward_before_std": 0.6883194223046303, "reward_change_max": 0.0, "reward_change_mean": -0.35501245222985744, "reward_change_min": -0.6027462910860777, "reward_change_std": 0.2280629277229309, "reward_std": 0.7212444879114628, "rewards/accuracy_reward": 0.3541666753590107, "rewards/cosine_scaled_reward": 0.08364596217870712, "step": 453 }, { "clip_fraction": 0.0, "completion_length": 2229.520866394043, "epoch": 0.5188571428571429, "grad_norm": 0.026976440101861954, "kl": 0.0003204345703125, "lambda_div_used": 0.586439348757267, "learning_rate": 1.2400783294793668e-07, "loss": 0.0213, "reward": -0.06522449851036072, "reward_after_mean": -0.06522449851036072, "reward_after_std": 0.4758566189557314, "reward_before_mean": 0.34830280393362045, "reward_before_std": 0.4015544820576906, "reward_change_max": 0.0, "reward_change_mean": -0.4135272856801748, "reward_change_min": -0.6215804703533649, "reward_change_std": 0.24525572545826435, "reward_std": 0.47585663571953773, "rewards/accuracy_reward": 0.2916666716337204, "rewards/cosine_scaled_reward": 0.05663611739873886, "step": 454 }, { "clip_fraction": 0.0, "completion_length": 2910.729217529297, "epoch": 0.52, "grad_norm": 0.022959912195801735, "kl": 0.0003757178783416748, "lambda_div_used": 0.575455017387867, "learning_rate": 1.2300579475997657e-07, "loss": 0.0464, "reward": -0.396820537163876, "reward_after_mean": -0.396820537163876, "reward_after_std": 0.42159392312169075, "reward_before_mean": -0.14198972191661596, "reward_before_std": 0.3413227070122957, "reward_change_max": 0.0, "reward_change_mean": -0.2548308204859495, "reward_change_min": -0.36302991211414337, "reward_change_std": 0.13639382366091013, "reward_std": 0.4215939249843359, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.18365638982504606, "step": 455 }, { "clip_fraction": 0.0, "completion_length": 2996.0417098999023, "epoch": 0.5211428571428571, "grad_norm": 0.022153589874505997, "kl": 0.0003063082695007324, "lambda_div_used": 0.6117950826883316, "learning_rate": 1.220245676671809e-07, "loss": 0.03, "reward": -0.22662064619362354, "reward_after_mean": -0.22662064619362354, "reward_after_std": 0.5622703209519386, "reward_before_mean": 0.03414517780765891, "reward_before_std": 0.5151250278577209, "reward_change_max": 0.0, "reward_change_mean": -0.26076582819223404, "reward_change_min": -0.43476971983909607, "reward_change_std": 0.165956006385386, "reward_std": 0.5622703321278095, "rewards/accuracy_reward": 0.1666666716337204, "rewards/cosine_scaled_reward": -0.13252148625906557, "step": 456 }, { "clip_fraction": 0.0, "completion_length": 2978.187530517578, "epoch": 0.5222857142857142, "grad_norm": 0.0224411953240633, "kl": 0.0004011392593383789, "lambda_div_used": 0.5359829142689705, "learning_rate": 1.2106419949317388e-07, "loss": -0.0311, "reward": -0.2614034563302994, "reward_after_mean": -0.2614034563302994, "reward_after_std": 0.33165648579597473, "reward_before_mean": 0.16703256964683533, "reward_before_std": 0.157981239259243, "reward_change_max": 0.0, "reward_change_mean": -0.42843602411448956, "reward_change_min": -0.5874424390494823, "reward_change_std": 0.21858325507491827, "reward_std": 0.3316564913839102, "rewards/accuracy_reward": 0.25, "rewards/cosine_scaled_reward": -0.08296743780374527, "step": 457 }, { "clip_fraction": 0.0, "completion_length": 2119.250026702881, "epoch": 0.5234285714285715, "grad_norm": 0.03472839295864105, "kl": 0.00028389692306518555, "lambda_div_used": 0.6622679010033607, "learning_rate": 1.2012473704494537e-07, "loss": 0.0396, "reward": 0.09622732177376747, "reward_after_mean": 0.09622732177376747, "reward_after_std": 0.7549249790608883, "reward_before_mean": 0.4134064484387636, "reward_before_std": 0.7546715997159481, "reward_change_max": 0.0, "reward_change_mean": -0.3171791285276413, "reward_change_min": -0.5532067231833935, "reward_change_std": 0.2206783127039671, "reward_std": 0.7549249865114689, "rewards/accuracy_reward": 0.3541666753590107, "rewards/cosine_scaled_reward": 0.05923975070982124, "step": 458 }, { "clip_fraction": 0.0, "completion_length": 1308.0000534057617, "epoch": 0.5245714285714286, "grad_norm": 0.034576416015625, "kl": 0.00019240379333496094, "lambda_div_used": 0.6211641430854797, "learning_rate": 1.1920622611056974e-07, "loss": 0.0136, "reward": 0.10968395322561264, "reward_after_mean": 0.10968395322561264, "reward_after_std": 0.642698410898447, "reward_before_mean": 0.5380105744116008, "reward_before_std": 0.5641936575993896, "reward_change_max": 0.0, "reward_change_mean": -0.42832658998668194, "reward_change_min": -0.647167906165123, "reward_change_std": 0.26214463263750076, "reward_std": 0.6426984257996082, "rewards/accuracy_reward": 0.37500000558793545, "rewards/cosine_scaled_reward": 0.16301053576171398, "step": 459 }, { "clip_fraction": 0.0, "completion_length": 2947.2916870117188, "epoch": 0.5257142857142857, "grad_norm": 0.022817425429821014, "kl": 0.00036215782165527344, "lambda_div_used": 0.6313923373818398, "learning_rate": 1.1830871145697412e-07, "loss": 0.0374, "reward": -0.143511475995183, "reward_after_mean": -0.143511475995183, "reward_after_std": 0.6310819126665592, "reward_before_mean": 0.12019729614257812, "reward_before_std": 0.6178826270624995, "reward_change_max": 0.0, "reward_change_mean": -0.26370877772569656, "reward_change_min": -0.5500058270990849, "reward_change_std": 0.197435456328094, "reward_std": 0.6310819499194622, "rewards/accuracy_reward": 0.1875000037252903, "rewards/cosine_scaled_reward": -0.06730269826948643, "step": 460 }, { "clip_fraction": 0.0, "completion_length": 3020.604232788086, "epoch": 0.5268571428571428, "grad_norm": 0.019895615056157112, "kl": 0.0003611445426940918, "lambda_div_used": 0.6468427553772926, "learning_rate": 1.1743223682775649e-07, "loss": 0.0226, "reward": 0.13486449420452118, "reward_after_mean": 0.13486449420452118, "reward_after_std": 0.6662128213793039, "reward_before_mean": 0.5074618738144636, "reward_before_std": 0.6824358962476254, "reward_change_max": 0.0, "reward_change_mean": -0.3725973889231682, "reward_change_min": -0.659518338739872, "reward_change_std": 0.26068645529448986, "reward_std": 0.6662128381431103, "rewards/accuracy_reward": 0.3750000111758709, "rewards/cosine_scaled_reward": 0.1324618849903345, "step": 461 }, { "clip_fraction": 0.0, "completion_length": 2896.958354949951, "epoch": 0.528, "grad_norm": 0.027659112587571144, "kl": 0.0003952980041503906, "lambda_div_used": 0.5935230925679207, "learning_rate": 1.1657684494105386e-07, "loss": -0.0084, "reward": -0.32939455355517566, "reward_after_mean": -0.32939455355517566, "reward_after_std": 0.4861418064683676, "reward_before_mean": -0.06984398560598493, "reward_before_std": 0.4248745897784829, "reward_change_max": 0.0, "reward_change_mean": -0.25955056957900524, "reward_change_min": -0.4244098737835884, "reward_change_std": 0.1560937762260437, "reward_std": 0.4861418195068836, "rewards/accuracy_reward": 0.0833333358168602, "rewards/cosine_scaled_reward": -0.15317732468247414, "step": 462 }, { "clip_fraction": 0.0, "completion_length": 2485.375030517578, "epoch": 0.5291428571428571, "grad_norm": 0.02032829262316227, "kl": 0.00022867321968078613, "lambda_div_used": 0.6013497039675713, "learning_rate": 1.1574257748745986e-07, "loss": 0.1063, "reward": 0.004297456704080105, "reward_after_mean": 0.004297456704080105, "reward_after_std": 0.5376875698566437, "reward_before_mean": 0.42904988676309586, "reward_before_std": 0.46889279037714005, "reward_change_max": 0.0, "reward_change_mean": -0.4247523993253708, "reward_change_min": -0.6676977872848511, "reward_change_std": 0.26014791429042816, "reward_std": 0.5376875959336758, "rewards/accuracy_reward": 0.3333333395421505, "rewards/cosine_scaled_reward": 0.09571653697639704, "step": 463 }, { "clip_fraction": 0.0, "completion_length": 1664.770881652832, "epoch": 0.5302857142857142, "grad_norm": 0.03104904294013977, "kl": 0.0002219080924987793, "lambda_div_used": 0.6030265837907791, "learning_rate": 1.1492947512799328e-07, "loss": -0.0781, "reward": 0.027926755137741566, "reward_after_mean": 0.027926755137741566, "reward_after_std": 0.5750233307480812, "reward_before_mean": 0.4541918604518287, "reward_before_std": 0.4690140914171934, "reward_change_max": 0.0, "reward_change_mean": -0.4262651167809963, "reward_change_min": -0.597686804831028, "reward_change_std": 0.22887016367167234, "reward_std": 0.5750233307480812, "rewards/accuracy_reward": 0.4166666679084301, "rewards/cosine_scaled_reward": 0.03752519562840462, "step": 464 }, { "clip_fraction": 0.0, "completion_length": 2696.979248046875, "epoch": 0.5314285714285715, "grad_norm": 0.02595655620098114, "kl": 0.0003947019577026367, "lambda_div_used": 0.6469813883304596, "learning_rate": 1.1413757749211602e-07, "loss": 0.0015, "reward": -0.005829358473420143, "reward_after_mean": -0.005829358473420143, "reward_after_std": 0.7018453720957041, "reward_before_mean": 0.2963540703058243, "reward_before_std": 0.687056201742962, "reward_change_max": 0.0, "reward_change_mean": -0.30218344181776047, "reward_change_min": -0.5677898563444614, "reward_change_std": 0.2154219476506114, "reward_std": 0.7018453869968653, "rewards/accuracy_reward": 0.25000000558793545, "rewards/cosine_scaled_reward": 0.04635407403111458, "step": 465 }, { "clip_fraction": 0.0, "completion_length": 3051.750030517578, "epoch": 0.5325714285714286, "grad_norm": 0.022232649847865105, "kl": 0.00035768747329711914, "lambda_div_used": 0.6076068878173828, "learning_rate": 1.1336692317580158e-07, "loss": 0.0096, "reward": -0.09472141414880753, "reward_after_mean": -0.09472141414880753, "reward_after_std": 0.5900444928556681, "reward_before_mean": 0.2662593559361994, "reward_before_std": 0.5045403479598463, "reward_change_max": 0.0, "reward_change_mean": -0.36098079197108746, "reward_change_min": -0.5804308690130711, "reward_change_std": 0.22072006110101938, "reward_std": 0.5900445096194744, "rewards/accuracy_reward": 0.25000000186264515, "rewards/cosine_scaled_reward": 0.016259355936199427, "step": 466 }, { "clip_fraction": 0.0, "completion_length": 2831.6458702087402, "epoch": 0.5337142857142857, "grad_norm": 0.027867048978805542, "kl": 0.0003955364227294922, "lambda_div_used": 0.5972240790724754, "learning_rate": 1.1261754973965422e-07, "loss": 0.0064, "reward": -0.16292368434369564, "reward_after_mean": -0.16292368434369564, "reward_after_std": 0.5499103963375092, "reward_before_mean": 0.19015199813293293, "reward_before_std": 0.4444003812968731, "reward_change_max": 0.0, "reward_change_mean": -0.3530757036060095, "reward_change_min": -0.5379382502287626, "reward_change_std": 0.1972823329269886, "reward_std": 0.5499104224145412, "rewards/accuracy_reward": 0.2083333358168602, "rewards/cosine_scaled_reward": -0.018181337043642998, "step": 467 }, { "clip_fraction": 0.0, "completion_length": 2942.583354949951, "epoch": 0.5348571428571428, "grad_norm": 0.021755212917923927, "kl": 0.0003478527069091797, "lambda_div_used": 0.5782695487141609, "learning_rate": 1.1188949370707787e-07, "loss": -0.0004, "reward": -0.2234923504292965, "reward_after_mean": -0.2234923504292965, "reward_after_std": 0.4573483895510435, "reward_before_mean": 0.13846815121360123, "reward_before_std": 0.36004857218358666, "reward_change_max": 0.0, "reward_change_mean": -0.36196050979197025, "reward_change_min": -0.5418836548924446, "reward_change_std": 0.2078724503517151, "reward_std": 0.45734839886426926, "rewards/accuracy_reward": 0.20833333395421505, "rewards/cosine_scaled_reward": -0.06986518204212189, "step": 468 }, { "clip_fraction": 0.0, "completion_length": 2679.208396911621, "epoch": 0.536, "grad_norm": 0.02693931572139263, "kl": 0.00039780139923095703, "lambda_div_used": 0.6250492706894875, "learning_rate": 1.1118279056249653e-07, "loss": 0.0181, "reward": 0.01080943364650011, "reward_after_mean": 0.01080943364650011, "reward_after_std": 0.6565821636468172, "reward_before_mean": 0.38899326138198376, "reward_before_std": 0.5838505062274635, "reward_change_max": 0.0, "reward_change_mean": -0.37818380631506443, "reward_change_min": -0.6336386613547802, "reward_change_std": 0.23969437181949615, "reward_std": 0.6565821878612041, "rewards/accuracy_reward": 0.31250000558793545, "rewards/cosine_scaled_reward": 0.07649324322119355, "step": 469 }, { "clip_fraction": 0.0, "completion_length": 2651.8333740234375, "epoch": 0.5371428571428571, "grad_norm": 0.022814009338617325, "kl": 0.0003286600112915039, "lambda_div_used": 0.5820565819740295, "learning_rate": 1.1049747474962444e-07, "loss": 0.062, "reward": -0.1548374481499195, "reward_after_mean": -0.1548374481499195, "reward_after_std": 0.46373489685356617, "reward_before_mean": 0.21819785539992154, "reward_before_std": 0.37470409646630287, "reward_change_max": 0.0, "reward_change_mean": -0.3730353116989136, "reward_change_min": -0.5532267577946186, "reward_change_std": 0.2110730605199933, "reward_std": 0.46373490430414677, "rewards/accuracy_reward": 0.22916666977107525, "rewards/cosine_scaled_reward": -0.010968813672661781, "step": 470 }, { "clip_fraction": 0.0, "completion_length": 2898.7916946411133, "epoch": 0.5382857142857143, "grad_norm": 0.01961471140384674, "kl": 0.00035419315099716187, "lambda_div_used": 0.580303005874157, "learning_rate": 1.0983357966978745e-07, "loss": -0.0592, "reward": -0.22997316345572472, "reward_after_mean": -0.22997316345572472, "reward_after_std": 0.4005854483693838, "reward_before_mean": 0.09628471545875072, "reward_before_std": 0.36470284312963486, "reward_change_max": 0.0, "reward_change_mean": -0.3262578770518303, "reward_change_min": -0.5184299051761627, "reward_change_std": 0.1985421497374773, "reward_std": 0.40058545023202896, "rewards/accuracy_reward": 0.1666666716337204, "rewards/cosine_scaled_reward": -0.07038197666406631, "step": 471 }, { "clip_fraction": 0.0, "completion_length": 2656.125030517578, "epoch": 0.5394285714285715, "grad_norm": 0.020427875220775604, "kl": 0.00032085180282592773, "lambda_div_used": 0.6381691917777061, "learning_rate": 1.0919113768029517e-07, "loss": -0.0459, "reward": -0.11792396203964017, "reward_after_mean": -0.11792396203964017, "reward_after_std": 0.6783455964177847, "reward_before_mean": 0.15370581997558475, "reward_before_std": 0.6358637362718582, "reward_change_max": 0.0, "reward_change_mean": -0.2716297823935747, "reward_change_min": -0.4567250721156597, "reward_change_std": 0.1737262774258852, "reward_std": 0.678345600143075, "rewards/accuracy_reward": 0.18750000558793545, "rewards/cosine_scaled_reward": -0.033794180024415255, "step": 472 }, { "clip_fraction": 0.0, "completion_length": 2983.750015258789, "epoch": 0.5405714285714286, "grad_norm": 0.02453148551285267, "kl": 0.0003961324691772461, "lambda_div_used": 0.6016801968216896, "learning_rate": 1.0857018009286381e-07, "loss": 0.0472, "reward": -0.006093651056289673, "reward_after_mean": -0.006093651056289673, "reward_after_std": 0.5333354268223047, "reward_before_mean": 0.4089024979621172, "reward_before_std": 0.4693258060142398, "reward_change_max": 0.0, "reward_change_mean": -0.4149961844086647, "reward_change_min": -0.6490769572556019, "reward_change_std": 0.25427413638681173, "reward_std": 0.5333354473114014, "rewards/accuracy_reward": 0.3333333395421505, "rewards/cosine_scaled_reward": 0.07556918449699879, "step": 473 }, { "clip_fraction": 0.0, "completion_length": 2252.104202270508, "epoch": 0.5417142857142857, "grad_norm": 0.02836003340780735, "kl": 0.00038546323776245117, "lambda_div_used": 0.5747748166322708, "learning_rate": 1.0797073717209013e-07, "loss": 0.0473, "reward": 0.11940666288137436, "reward_after_mean": 0.11940666288137436, "reward_after_std": 0.5397788472473621, "reward_before_mean": 0.7007539421319962, "reward_before_std": 0.3393053291365504, "reward_change_max": 0.0, "reward_change_mean": -0.5813473239541054, "reward_change_min": -0.7977702841162682, "reward_change_std": 0.3083681631833315, "reward_std": 0.5397788546979427, "rewards/accuracy_reward": 0.47916666977107525, "rewards/cosine_scaled_reward": 0.22158729657530785, "step": 474 }, { "clip_fraction": 0.0, "completion_length": 2133.562568664551, "epoch": 0.5428571428571428, "grad_norm": 0.02848704345524311, "kl": 0.0002709627151489258, "lambda_div_used": 0.6917188391089439, "learning_rate": 1.0739283813397639e-07, "loss": 0.0352, "reward": 0.2483121044933796, "reward_after_mean": 0.2483121044933796, "reward_after_std": 0.851221090182662, "reward_before_mean": 0.5981418825685978, "reward_before_std": 0.9012213246896863, "reward_change_max": 0.0, "reward_change_mean": -0.34982976876199245, "reward_change_min": -0.7087801471352577, "reward_change_std": 0.28033955581486225, "reward_std": 0.8512210976332426, "rewards/accuracy_reward": 0.4375000111758709, "rewards/cosine_scaled_reward": 0.16064186580479145, "step": 475 }, { "clip_fraction": 0.0, "completion_length": 2319.3750381469727, "epoch": 0.544, "grad_norm": 0.02834029123187065, "kl": 0.00039076805114746094, "lambda_div_used": 0.6543590575456619, "learning_rate": 1.068365111445064e-07, "loss": 0.0905, "reward": 0.3722646813839674, "reward_after_mean": 0.3722646813839674, "reward_after_std": 0.76073794439435, "reward_before_mean": 0.8793929517269135, "reward_before_std": 0.7163430340588093, "reward_change_max": 0.0, "reward_change_mean": -0.5071282722055912, "reward_change_min": -0.8575163669884205, "reward_change_std": 0.32718705013394356, "reward_std": 0.7607379760593176, "rewards/accuracy_reward": 0.5625000074505806, "rewards/cosine_scaled_reward": 0.31689293240197003, "step": 476 }, { "clip_fraction": 0.0, "completion_length": 1388.479190826416, "epoch": 0.5451428571428572, "grad_norm": 0.03197532892227173, "kl": 0.00019982457160949707, "lambda_div_used": 0.6069512218236923, "learning_rate": 1.063017833182728e-07, "loss": 0.0104, "reward": 0.2797765755094588, "reward_after_mean": 0.2797765755094588, "reward_after_std": 0.6170946378260851, "reward_before_mean": 0.8468085322529078, "reward_before_std": 0.48898326186463237, "reward_change_max": 0.0, "reward_change_mean": -0.5670319274067879, "reward_change_min": -0.8104716204106808, "reward_change_std": 0.32000066339969635, "reward_std": 0.6170946676284075, "rewards/accuracy_reward": 0.520833345130086, "rewards/cosine_scaled_reward": 0.325975195504725, "step": 477 }, { "clip_fraction": 0.0, "completion_length": 2758.9375228881836, "epoch": 0.5462857142857143, "grad_norm": 0.023568512871861458, "kl": 0.0003166794776916504, "lambda_div_used": 0.6045826748013496, "learning_rate": 1.0578868071715544e-07, "loss": 0.0424, "reward": 0.06877373531460762, "reward_after_mean": 0.06877373531460762, "reward_after_std": 0.5457048490643501, "reward_before_mean": 0.5106075219810009, "reward_before_std": 0.4815037827938795, "reward_change_max": 0.0, "reward_change_mean": -0.4418338183313608, "reward_change_min": -0.6827114932239056, "reward_change_std": 0.2656334117054939, "reward_std": 0.5457048676908016, "rewards/accuracy_reward": 0.3750000111758709, "rewards/cosine_scaled_reward": 0.1356075219810009, "step": 478 }, { "clip_fraction": 0.0, "completion_length": 2932.895866394043, "epoch": 0.5474285714285714, "grad_norm": 0.025046760216355324, "kl": 0.00037872791290283203, "lambda_div_used": 0.5930827036499977, "learning_rate": 1.0529722834905125e-07, "loss": 0.0013, "reward": -0.29585114773362875, "reward_after_mean": -0.29585114773362875, "reward_after_std": 0.4981949180364609, "reward_before_mean": -0.02313473215326667, "reward_before_std": 0.4286517295986414, "reward_change_max": 0.0, "reward_change_mean": -0.27271641232073307, "reward_change_min": -0.4339797645807266, "reward_change_std": 0.1596519472077489, "reward_std": 0.4981949217617512, "rewards/accuracy_reward": 0.1041666679084301, "rewards/cosine_scaled_reward": -0.12730140378698707, "step": 479 }, { "clip_fraction": 0.0, "completion_length": 2238.062515258789, "epoch": 0.5485714285714286, "grad_norm": 0.03429793193936348, "kl": 0.0003292560577392578, "lambda_div_used": 0.6356522366404533, "learning_rate": 1.0482745016665526e-07, "loss": 0.0622, "reward": -0.19213765393942595, "reward_after_mean": -0.19213765393942595, "reward_after_std": 0.6865857243537903, "reward_before_mean": 0.048641178291291, "reward_before_std": 0.6263551618903875, "reward_change_max": 0.0, "reward_change_mean": -0.24077884666621685, "reward_change_min": -0.43974681198596954, "reward_change_std": 0.15421691350638866, "reward_std": 0.6865857467055321, "rewards/accuracy_reward": 0.14583333767950535, "rewards/cosine_scaled_reward": -0.09719215868972242, "step": 480 }, { "clip_fraction": 0.0, "completion_length": 2533.666732788086, "epoch": 0.5497142857142857, "grad_norm": 0.01749316044151783, "kl": 0.00031572580337524414, "lambda_div_used": 0.6021129563450813, "learning_rate": 1.0437936906629334e-07, "loss": 0.0031, "reward": -0.24226298835128546, "reward_after_mean": -0.24226298835128546, "reward_after_std": 0.5046281572431326, "reward_before_mean": 0.02957908995449543, "reward_before_std": 0.47462415788322687, "reward_change_max": 0.0, "reward_change_mean": -0.2718420699238777, "reward_change_min": -0.45962032303214073, "reward_change_std": 0.1774886343628168, "reward_std": 0.5046281665563583, "rewards/accuracy_reward": 0.14583333767950535, "rewards/cosine_scaled_reward": -0.11625425447709858, "step": 481 }, { "clip_fraction": 0.0, "completion_length": 2584.3958702087402, "epoch": 0.5508571428571428, "grad_norm": 0.027141164988279343, "kl": 0.0004488229751586914, "lambda_div_used": 0.6000719964504242, "learning_rate": 1.0395300688680625e-07, "loss": -0.0155, "reward": 0.2671008687466383, "reward_after_mean": 0.2671008687466383, "reward_after_std": 0.6417696066200733, "reward_before_mean": 0.8570855539292097, "reward_before_std": 0.46050422452390194, "reward_change_max": 0.0, "reward_change_mean": -0.5899846386164427, "reward_change_min": -0.8298100866377354, "reward_change_std": 0.3227744400501251, "reward_std": 0.6417696103453636, "rewards/accuracy_reward": 0.5625000055879354, "rewards/cosine_scaled_reward": 0.29458553344011307, "step": 482 }, { "clip_fraction": 0.0, "completion_length": 2577.833366394043, "epoch": 0.552, "grad_norm": 0.022445959970355034, "kl": 0.00040972232818603516, "lambda_div_used": 0.5583987012505531, "learning_rate": 1.0354838440848501e-07, "loss": -0.0221, "reward": -0.2919683400541544, "reward_after_mean": -0.2919683400541544, "reward_after_std": 0.37157695554196835, "reward_before_mean": 0.06092929560691118, "reward_before_std": 0.26688239723443985, "reward_change_max": 0.0, "reward_change_mean": -0.35289763286709785, "reward_change_min": -0.5401931628584862, "reward_change_std": 0.19714731443673372, "reward_std": 0.37157696671783924, "rewards/accuracy_reward": 0.1666666679084301, "rewards/cosine_scaled_reward": -0.10573737230151892, "step": 483 }, { "clip_fraction": 0.0, "completion_length": 2279.06254196167, "epoch": 0.5531428571428572, "grad_norm": 0.027125045657157898, "kl": 0.0002903938293457031, "lambda_div_used": 0.6015297621488571, "learning_rate": 1.0316552135205837e-07, "loss": -0.0284, "reward": 0.08303672191686928, "reward_after_mean": 0.08303672191686928, "reward_after_std": 0.5998833030462265, "reward_before_mean": 0.5530420504510403, "reward_before_std": 0.4744609510526061, "reward_change_max": 0.0, "reward_change_mean": -0.47000533528625965, "reward_change_min": -0.6774297691881657, "reward_change_std": 0.2718982184305787, "reward_std": 0.5998833123594522, "rewards/accuracy_reward": 0.41666666977107525, "rewards/cosine_scaled_reward": 0.13637536205351353, "step": 484 }, { "clip_fraction": 0.0, "completion_length": 1672.458351135254, "epoch": 0.5542857142857143, "grad_norm": 0.03484974429011345, "kl": 0.00024706125259399414, "lambda_div_used": 0.5606790855526924, "learning_rate": 1.0280443637773163e-07, "loss": -0.0009, "reward": -0.24691498838365078, "reward_after_mean": -0.24691498838365078, "reward_after_std": 0.4085298776626587, "reward_before_mean": 0.13868734147399664, "reward_before_std": 0.2728640455752611, "reward_change_max": 0.0, "reward_change_mean": -0.38560234755277634, "reward_change_min": -0.5570618100464344, "reward_change_std": 0.20243172626942396, "reward_std": 0.408529881387949, "rewards/accuracy_reward": 0.18750000186264515, "rewards/cosine_scaled_reward": -0.04881266225129366, "step": 485 }, { "clip_fraction": 0.0, "completion_length": 1802.2083473205566, "epoch": 0.5554285714285714, "grad_norm": 0.03193674981594086, "kl": 0.000291287899017334, "lambda_div_used": 0.6145108714699745, "learning_rate": 1.0246514708427701e-07, "loss": 0.0763, "reward": -0.15824460261501372, "reward_after_mean": -0.15824460261501372, "reward_after_std": 0.5667215548455715, "reward_before_mean": 0.12157449871301651, "reward_before_std": 0.5284877885133028, "reward_change_max": 0.0, "reward_change_mean": -0.2798191010951996, "reward_change_min": -0.43175532296299934, "reward_change_std": 0.17297773249447346, "reward_std": 0.566721560433507, "rewards/accuracy_reward": 0.2083333395421505, "rewards/cosine_scaled_reward": -0.0867588329128921, "step": 486 }, { "clip_fraction": 0.0, "completion_length": 1764.979190826416, "epoch": 0.5565714285714286, "grad_norm": 0.03538261726498604, "kl": 0.0003072023391723633, "lambda_div_used": 0.6353137269616127, "learning_rate": 1.0214767000817596e-07, "loss": 0.0034, "reward": 0.4021994969807565, "reward_after_mean": 0.4021994969807565, "reward_after_std": 0.7458214424550533, "reward_before_mean": 0.9773663654923439, "reward_before_std": 0.6311542720068246, "reward_change_max": 0.0, "reward_change_mean": -0.5751669164747, "reward_change_min": -0.8273528963327408, "reward_change_std": 0.3425491387024522, "reward_std": 0.745821475982666, "rewards/accuracy_reward": 0.6041666772216558, "rewards/cosine_scaled_reward": 0.37319972552359104, "step": 487 }, { "clip_fraction": 0.0, "completion_length": 2063.020866394043, "epoch": 0.5577142857142857, "grad_norm": 0.027028201147913933, "kl": 0.0002803802490234375, "lambda_div_used": 0.5719931498169899, "learning_rate": 1.0185202062281336e-07, "loss": 0.0247, "reward": -0.2594454251229763, "reward_after_mean": -0.2594454251229763, "reward_after_std": 0.458538630977273, "reward_before_mean": 0.09588468819856644, "reward_before_std": 0.3299330030567944, "reward_change_max": 0.0, "reward_change_mean": -0.35533010959625244, "reward_change_min": -0.5170786269009113, "reward_change_std": 0.19317798037081957, "reward_std": 0.45853864029049873, "rewards/accuracy_reward": 0.18750000186264515, "rewards/cosine_scaled_reward": -0.09161530435085297, "step": 488 }, { "clip_fraction": 0.0, "completion_length": 3084.8958892822266, "epoch": 0.5588571428571428, "grad_norm": 0.02722005918622017, "kl": 0.00041961669921875, "lambda_div_used": 0.5843943357467651, "learning_rate": 1.0157821333772304e-07, "loss": -0.012, "reward": -0.31660015136003494, "reward_after_mean": -0.31660015136003494, "reward_after_std": 0.43002712167799473, "reward_before_mean": -0.0431265402585268, "reward_before_std": 0.38921575900167227, "reward_change_max": 0.0, "reward_change_mean": -0.2734736017882824, "reward_change_min": -0.4792550317943096, "reward_change_std": 0.17569494806230068, "reward_std": 0.4300271272659302, "rewards/accuracy_reward": 0.1041666679084301, "rewards/cosine_scaled_reward": -0.14729320164769888, "step": 489 }, { "clip_fraction": 0.0, "completion_length": 2155.020851135254, "epoch": 0.56, "grad_norm": 0.024445833638310432, "kl": 0.00034758448600769043, "lambda_div_used": 0.5682637020945549, "learning_rate": 1.013262614978859e-07, "loss": -0.0323, "reward": -0.13516036188229918, "reward_after_mean": -0.13516036188229918, "reward_after_std": 0.41841856203973293, "reward_before_mean": 0.28103313967585564, "reward_before_std": 0.307465685531497, "reward_change_max": 0.0, "reward_change_mean": -0.41619347035884857, "reward_change_min": -0.5914728902280331, "reward_change_std": 0.22987399622797966, "reward_std": 0.4184185788035393, "rewards/accuracy_reward": 0.2916666716337204, "rewards/cosine_scaled_reward": -0.01063353568315506, "step": 490 }, { "clip_fraction": 0.0, "completion_length": 2476.8958740234375, "epoch": 0.5611428571428572, "grad_norm": 0.026141280308365822, "kl": 0.0003236532211303711, "lambda_div_used": 0.61311075091362, "learning_rate": 1.0109617738307911e-07, "loss": -0.0399, "reward": 0.14587094401940703, "reward_after_mean": 0.14587094401940703, "reward_after_std": 0.6566581912338734, "reward_before_mean": 0.6461835531517863, "reward_before_std": 0.5190226640552282, "reward_change_max": 0.0, "reward_change_mean": -0.500312577933073, "reward_change_min": -0.7899926863610744, "reward_change_std": 0.29301502648741007, "reward_std": 0.6566582024097443, "rewards/accuracy_reward": 0.45833334140479565, "rewards/cosine_scaled_reward": 0.1878501633182168, "step": 491 }, { "clip_fraction": 0.0, "completion_length": 2644.770835876465, "epoch": 0.5622857142857143, "grad_norm": 0.04309506341814995, "kl": 0.000333636999130249, "lambda_div_used": 0.5899444594979286, "learning_rate": 1.0088797220727779e-07, "loss": 0.012, "reward": -0.1085394024848938, "reward_after_mean": -0.1085394024848938, "reward_after_std": 0.4626395758241415, "reward_before_mean": 0.25940042175352573, "reward_before_std": 0.4149730410426855, "reward_change_max": 0.0, "reward_change_mean": -0.3679398000240326, "reward_change_min": -0.5678628534078598, "reward_change_std": 0.223101656883955, "reward_std": 0.46263957768678665, "rewards/accuracy_reward": 0.2708333358168602, "rewards/cosine_scaled_reward": -0.011432923376560211, "step": 492 }, { "clip_fraction": 0.0, "completion_length": 1723.0000457763672, "epoch": 0.5634285714285714, "grad_norm": 0.029525646939873695, "kl": 0.00028970837593078613, "lambda_div_used": 0.6680872738361359, "learning_rate": 1.0070165611810855e-07, "loss": 0.0137, "reward": 0.13114306051284075, "reward_after_mean": 0.13114306051284075, "reward_after_std": 0.8511836100369692, "reward_before_mean": 0.4935412285849452, "reward_before_std": 0.7856986094266176, "reward_change_max": 0.0, "reward_change_mean": -0.3623981699347496, "reward_change_min": -0.587810892611742, "reward_change_std": 0.22921003215014935, "reward_std": 0.8511836417019367, "rewards/accuracy_reward": 0.37500000558793545, "rewards/cosine_scaled_reward": 0.11854122020304203, "step": 493 }, { "clip_fraction": 0.0, "completion_length": 1625.3125915527344, "epoch": 0.5645714285714286, "grad_norm": 0.031883496791124344, "kl": 0.00029462575912475586, "lambda_div_used": 0.6554152071475983, "learning_rate": 1.005372381963547e-07, "loss": -0.0192, "reward": 0.2886330671608448, "reward_after_mean": 0.2886330671608448, "reward_after_std": 0.7760931197553873, "reward_before_mean": 0.7313874992541969, "reward_before_std": 0.7224944466724992, "reward_change_max": 0.0, "reward_change_mean": -0.4427544269710779, "reward_change_min": -0.7037210427224636, "reward_change_std": 0.28225341718643904, "reward_std": 0.776093129068613, "rewards/accuracy_reward": 0.4583333432674408, "rewards/cosine_scaled_reward": 0.2730541592463851, "step": 494 }, { "clip_fraction": 0.0, "completion_length": 2404.104217529297, "epoch": 0.5657142857142857, "grad_norm": 0.025816943496465683, "kl": 0.00029496103525161743, "lambda_div_used": 0.6370417103171349, "learning_rate": 1.0039472645551372e-07, "loss": 0.0444, "reward": -0.002785082906484604, "reward_after_mean": -0.002785082906484604, "reward_after_std": 0.7017532214522362, "reward_before_mean": 0.35137681500054896, "reward_before_std": 0.6334654297679663, "reward_change_max": 0.0, "reward_change_mean": -0.3541618827730417, "reward_change_min": -0.61022624745965, "reward_change_std": 0.2296114508062601, "reward_std": 0.7017532512545586, "rewards/accuracy_reward": 0.3333333395421505, "rewards/cosine_scaled_reward": 0.018043467309325933, "step": 495 }, { "clip_fraction": 0.0, "completion_length": 1731.9791717529297, "epoch": 0.5668571428571428, "grad_norm": 0.03966463729739189, "kl": 0.00029587745666503906, "lambda_div_used": 0.6256460249423981, "learning_rate": 1.002741278414069e-07, "loss": 0.0666, "reward": 0.17603341676294804, "reward_after_mean": 0.17603341676294804, "reward_after_std": 0.636838972568512, "reward_before_mean": 0.6388222957029939, "reward_before_std": 0.586422567255795, "reward_change_max": 0.0, "reward_change_mean": -0.4627888761460781, "reward_change_min": -0.7264026664197445, "reward_change_std": 0.29389980621635914, "reward_std": 0.6368389893323183, "rewards/accuracy_reward": 0.4166666753590107, "rewards/cosine_scaled_reward": 0.22215561103075743, "step": 496 }, { "clip_fraction": 0.0, "completion_length": 2216.1250534057617, "epoch": 0.568, "grad_norm": 0.028931519016623497, "kl": 0.0002690255641937256, "lambda_div_used": 0.6521744430065155, "learning_rate": 1.0017544823184055e-07, "loss": 0.0104, "reward": 0.271162249147892, "reward_after_mean": 0.271162249147892, "reward_after_std": 0.7392647787928581, "reward_before_mean": 0.7059557363390923, "reward_before_std": 0.7143486840650439, "reward_change_max": 0.0, "reward_change_mean": -0.43479350954294205, "reward_change_min": -0.7290924154222012, "reward_change_std": 0.2943303110077977, "reward_std": 0.7392647992819548, "rewards/accuracy_reward": 0.4791666753590107, "rewards/cosine_scaled_reward": 0.2267890479415655, "step": 497 }, { "clip_fraction": 0.0, "completion_length": 2327.6250534057617, "epoch": 0.5691428571428572, "grad_norm": 0.02442491240799427, "kl": 0.0003362894058227539, "lambda_div_used": 0.659889928996563, "learning_rate": 1.0009869243631952e-07, "loss": 0.0113, "reward": 0.31018914096057415, "reward_after_mean": 0.31018914096057415, "reward_after_std": 0.7232479602098465, "reward_before_mean": 0.7269191518425941, "reward_before_std": 0.741204846650362, "reward_change_max": 0.0, "reward_change_mean": -0.41672998666763306, "reward_change_min": -0.7166927941143513, "reward_change_std": 0.2858916409313679, "reward_std": 0.7232479825615883, "rewards/accuracy_reward": 0.5000000111758709, "rewards/cosine_scaled_reward": 0.22691912204027176, "step": 498 }, { "clip_fraction": 0.0, "completion_length": 2263.500026702881, "epoch": 0.5702857142857143, "grad_norm": 0.024736450985074043, "kl": 0.0002918243408203125, "lambda_div_used": 0.60347481071949, "learning_rate": 1.000438641958131e-07, "loss": -0.0248, "reward": 0.009699596092104912, "reward_after_mean": 0.009699596092104912, "reward_after_std": 0.5504223238676786, "reward_before_mean": 0.4187678713351488, "reward_before_std": 0.483045837841928, "reward_change_max": 0.0, "reward_change_mean": -0.4090682379901409, "reward_change_min": -0.6463241390883923, "reward_change_std": 0.2534347465261817, "reward_std": 0.5504223313182592, "rewards/accuracy_reward": 0.3333333395421505, "rewards/cosine_scaled_reward": 0.08543450571596622, "step": 499 }, { "clip_fraction": 0.0, "completion_length": 2641.000030517578, "epoch": 0.5714285714285714, "grad_norm": 0.02830589935183525, "kl": 0.00043022632598876953, "lambda_div_used": 0.6449039503931999, "learning_rate": 1.0001096618257236e-07, "loss": 0.0892, "reward": -0.016204694285988808, "reward_after_mean": -0.016204694285988808, "reward_after_std": 0.6894690785557032, "reward_before_mean": 0.2851157810073346, "reward_before_std": 0.6750934664160013, "reward_change_max": 0.0, "reward_change_mean": -0.3013204652816057, "reward_change_min": -0.5549999382346869, "reward_change_std": 0.20787928439676762, "reward_std": 0.6894691102206707, "rewards/accuracy_reward": 0.29166667722165585, "rewards/cosine_scaled_reward": -0.006550896912813187, "step": 500 }, { "epoch": 0.5714285714285714, "step": 500, "total_flos": 0.0, "train_loss": 0.0046342451156378955, "train_runtime": 100887.1622, "train_samples_per_second": 0.238, "train_steps_per_second": 0.005 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }