{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.997032640949555, "eval_steps": 500, "global_step": 168, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantage/logodds_epsilon": 1e-06, "advantage/max": 2.7076480388641357, "advantage/mean": -0.0003062831237912178, "advantage/min": -2.7068095207214355, "advantage/std": 1.0714755058288574, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 910.609375, "completions/mean_terminated_length": 721.625, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.011869436201780416, "grad_norm": 1.140625, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0339, "num_tokens": 133270.0, "rewards//mean": 0.31050193309783936, "rewards//std": 0.4752292037010193, "step": 1 }, { "advantage/logodds_epsilon": 1.2892857142857144e-05, "advantage/max": 2.707470655441284, "advantage/mean": -0.021119937300682068, "advantage/min": -2.7074711322784424, "advantage/std": 1.1238324642181396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 907.1328125, "completions/mean_terminated_length": 730.686279296875, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 0.02373887240356083, "grad_norm": 1.015625, "kl": 0.0, "learning_rate": 1.1764705882352942e-06, "loss": 0.0521, "num_tokens": 266647.0, "rewards//mean": 0.34321528673171997, "rewards//std": 0.4855186939239502, "step": 2 }, { "advantage/logodds_epsilon": 2.4785714285714287e-05, "advantage/max": 2.7072935104370117, "advantage/mean": 0.01818234473466873, "advantage/min": -2.707293748855591, "advantage/std": 0.7201385498046875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 785.921875, "completions/mean_terminated_length": 681.5955200195312, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.03560830860534125, "grad_norm": 0.69140625, "kl": 0.0008829391736071557, "learning_rate": 2.3529411764705885e-06, "loss": -0.001, "num_tokens": 382637.0, "rewards//mean": 0.6515579223632812, "rewards//std": 0.4833427369594574, "step": 3 }, { "advantage/logodds_epsilon": 3.667857142857143e-05, "advantage/max": 2.70711612701416, "advantage/mean": 0.018909499049186707, "advantage/min": -2.70711612701416, "advantage/std": 1.1025551557540894, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 864.203125, "completions/mean_terminated_length": 754.868408203125, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 0.04747774480712166, "grad_norm": 1.1484375, "kl": 0.0009217608167091385, "learning_rate": 3.529411764705883e-06, "loss": -0.0083, "num_tokens": 506791.0, "rewards//mean": 0.3201034367084503, "rewards//std": 0.47702425718307495, "step": 4 }, { "advantage/logodds_epsilon": 4.857142857142857e-05, "advantage/max": 2.7069385051727295, "advantage/mean": 0.02930786833167076, "advantage/min": -2.700977325439453, "advantage/std": 1.028095006942749, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 873.8125, "completions/mean_terminated_length": 680.7142944335938, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.05934718100890208, "grad_norm": 1.0546875, "kl": 0.0010906819879892282, "learning_rate": 4.705882352941177e-06, "loss": -0.0306, "num_tokens": 637655.0, "rewards//mean": 0.3020392656326294, "rewards//std": 0.47242945432662964, "step": 5 }, { "advantage/logodds_epsilon": 6.0464285714285715e-05, "advantage/max": 2.68178391456604, "advantage/mean": 0.006097273901104927, "advantage/min": -2.706761360168457, "advantage/std": 0.9648444056510925, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 827.3125, "completions/mean_terminated_length": 659.1304321289062, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.0712166172106825, "grad_norm": 0.90234375, "kl": 0.0010743636812549084, "learning_rate": 5.882352941176471e-06, "loss": 0.007, "num_tokens": 760639.0, "rewards//mean": 0.492506206035614, "rewards//std": 0.509504497051239, "step": 6 }, { "advantage/logodds_epsilon": 7.235714285714286e-05, "advantage/max": 2.7065823078155518, "advantage/mean": -0.03362283855676651, "advantage/min": -2.7065842151641846, "advantage/std": 1.066990613937378, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 822.859375, "completions/mean_terminated_length": 639.7313232421875, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.0830860534124629, "grad_norm": 1.0546875, "kl": 0.0011787074290623423, "learning_rate": 7.058823529411766e-06, "loss": 0.0606, "num_tokens": 884397.0, "rewards//mean": 0.3580586016178131, "rewards//std": 0.49095040559768677, "step": 7 }, { "advantage/logodds_epsilon": 8.425e-05, "advantage/max": 2.706406354904175, "advantage/mean": 0.0027046892791986465, "advantage/min": -2.6997547149658203, "advantage/std": 1.0536103248596191, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 928.8671875, "completions/mean_terminated_length": 827.5967407226562, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.09495548961424333, "grad_norm": 0.95703125, "kl": 0.0014505159560940228, "learning_rate": 8.23529411764706e-06, "loss": 0.0164, "num_tokens": 1024724.0, "rewards//mean": 0.28668415546417236, "rewards//std": 0.46535518765449524, "step": 8 }, { "advantage/logodds_epsilon": 9.614285714285714e-05, "advantage/max": 2.7062294483184814, "advantage/mean": 0.04780469834804535, "advantage/min": -2.5455925464630127, "advantage/std": 0.9474127292633057, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 795.640625, "completions/mean_terminated_length": 676.0238037109375, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.10682492581602374, "grad_norm": 0.9921875, "kl": 0.002061821702227462, "learning_rate": 9.411764705882354e-06, "loss": -0.0054, "num_tokens": 1140126.0, "rewards//mean": 0.3750962018966675, "rewards//std": 0.4941062927246094, "step": 9 }, { "advantage/logodds_epsilon": 0.0001080357142857143, "advantage/max": 2.062196731567383, "advantage/mean": -0.00547090545296669, "advantage/min": -2.706052303314209, "advantage/std": 0.9570024013519287, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 804.234375, "completions/mean_terminated_length": 700.6666870117188, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.11869436201780416, "grad_norm": 0.90625, "kl": 0.002494313448551111, "learning_rate": 1.0588235294117648e-05, "loss": 0.0128, "num_tokens": 1257956.0, "rewards//mean": 0.5335986614227295, "rewards//std": 0.5063714385032654, "step": 10 }, { "advantage/logodds_epsilon": 0.00011992857142857143, "advantage/max": 2.705874443054199, "advantage/mean": -0.017507879063487053, "advantage/min": -2.7058751583099365, "advantage/std": 1.0383834838867188, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 818.03125, "completions/mean_terminated_length": 636.2941284179688, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.13056379821958458, "grad_norm": 0.9921875, "kl": 0.00399306406325195, "learning_rate": 1.1764705882352942e-05, "loss": 0.0496, "num_tokens": 1379048.0, "rewards//mean": 0.3989332616329193, "rewards//std": 0.49916210770606995, "step": 11 }, { "advantage/logodds_epsilon": 0.0001318214285714286, "advantage/max": 2.705698013305664, "advantage/mean": 0.00386135745793581, "advantage/min": -2.705698013305664, "advantage/std": 0.9110799431800842, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 828.6953125, "completions/mean_terminated_length": 650.8805541992188, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.142433234421365, "grad_norm": 0.8125, "kl": 0.005920416064327583, "learning_rate": 1.2941176470588238e-05, "loss": 0.0131, "num_tokens": 1504145.0, "rewards//mean": 0.5091220140457153, "rewards//std": 0.5084678530693054, "step": 12 }, { "advantage/logodds_epsilon": 0.00014371428571428573, "advantage/max": 2.2751290798187256, "advantage/mean": -0.02284279279410839, "advantage/min": -2.7055208683013916, "advantage/std": 1.0446645021438599, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 811.8203125, "completions/mean_terminated_length": 666.6447143554688, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.1543026706231454, "grad_norm": 2.875, "kl": 0.01130726927658543, "learning_rate": 1.4117647058823532e-05, "loss": 0.0248, "num_tokens": 1623906.0, "rewards//mean": 0.4440869688987732, "rewards//std": 0.5080564618110657, "step": 13 }, { "advantage/logodds_epsilon": 0.00015560714285714288, "advantage/max": 2.705343723297119, "advantage/mean": -0.011263464577496052, "advantage/min": -2.705343723297119, "advantage/std": 0.894480288028717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 724.546875, "completions/mean_terminated_length": 573.058837890625, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.1661721068249258, "grad_norm": 0.76953125, "kl": 0.012325245566898957, "learning_rate": 1.5294117647058822e-05, "loss": 0.0273, "num_tokens": 1731216.0, "rewards//mean": 0.5954668521881104, "rewards//std": 0.49903228878974915, "step": 14 }, { "advantage/logodds_epsilon": 0.0001675, "advantage/max": 2.7051665782928467, "advantage/mean": -0.022828437387943268, "advantage/min": -2.705166816711426, "advantage/std": 0.8195313215255737, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 746.6328125, "completions/mean_terminated_length": 665.3838500976562, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.17804154302670624, "grad_norm": 0.87109375, "kl": 0.013961711403680965, "learning_rate": 1.647058823529412e-05, "loss": 0.0358, "num_tokens": 1841201.0, "rewards//mean": 0.7065119743347168, "rewards//std": 0.46209007501602173, "step": 15 }, { "advantage/logodds_epsilon": 0.00017939285714285716, "advantage/max": 2.704983711242676, "advantage/mean": 0.0014846273697912693, "advantage/min": -2.1927168369293213, "advantage/std": 0.8186325430870056, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 672.0, "completions/mean_terminated_length": 544.6808471679688, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.18991097922848665, "grad_norm": 0.91015625, "kl": 0.01810563838807866, "learning_rate": 1.7647058823529414e-05, "loss": -0.014, "num_tokens": 1942193.0, "rewards//mean": 0.5964069962501526, "rewards//std": 0.4978735148906708, "step": 16 }, { "advantage/logodds_epsilon": 0.00019128571428571428, "advantage/max": 2.7048118114471436, "advantage/mean": -0.02652079612016678, "advantage/min": -2.70481276512146, "advantage/std": 0.8032910227775574, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 723.3984375, "completions/mean_terminated_length": 627.3298950195312, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.20178041543026706, "grad_norm": 0.85546875, "kl": 0.017719955882057548, "learning_rate": 1.8823529411764708e-05, "loss": 0.0519, "num_tokens": 2047660.0, "rewards//mean": 0.7063321471214294, "rewards//std": 0.46237605810165405, "step": 17 }, { "advantage/logodds_epsilon": 0.00020317857142857144, "advantage/max": 2.7046356201171875, "advantage/mean": -0.013976034708321095, "advantage/min": -2.7046358585357666, "advantage/std": 0.7605122327804565, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 728.5625, "completions/mean_terminated_length": 621.7020874023438, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.21364985163204747, "grad_norm": 0.7890625, "kl": 0.018178344005718827, "learning_rate": 2e-05, "loss": 0.0528, "num_tokens": 2154476.0, "rewards//mean": 0.7472767233848572, "rewards//std": 0.43945369124412537, "step": 18 }, { "advantage/logodds_epsilon": 0.0002150714285714286, "advantage/max": 2.704457998275757, "advantage/mean": -0.005952497012913227, "advantage/min": -2.7044589519500732, "advantage/std": 0.7705256342887878, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 747.2734375, "completions/mean_terminated_length": 643.1290283203125, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.22551928783382788, "grad_norm": 0.69921875, "kl": 0.02424622658872977, "learning_rate": 1.999783578606323e-05, "loss": 0.0195, "num_tokens": 2270471.0, "rewards//mean": 0.6908018589019775, "rewards//std": 0.4689319431781769, "step": 19 }, { "advantage/logodds_epsilon": 0.00022696428571428574, "advantage/max": 2.704281806945801, "advantage/mean": -0.012495366856455803, "advantage/min": -2.70428204536438, "advantage/std": 0.9640302062034607, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 864.2421875, "completions/mean_terminated_length": 699.4127197265625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.23738872403560832, "grad_norm": 0.90234375, "kl": 0.01943658519303426, "learning_rate": 1.9991344081017312e-05, "loss": 0.0451, "num_tokens": 2398374.0, "rewards//mean": 0.4931441843509674, "rewards//std": 0.5088549256324768, "step": 20 }, { "advantage/logodds_epsilon": 0.00023885714285714287, "advantage/max": 2.2589097023010254, "advantage/mean": -0.004630962386727333, "advantage/min": -2.7041053771972656, "advantage/std": 0.8182272911071777, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 801.796875, "completions/mean_terminated_length": 681.3252563476562, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.24925816023738873, "grad_norm": 0.87890625, "kl": 0.025031055964063853, "learning_rate": 1.9980527694749952e-05, "loss": 0.0144, "num_tokens": 2516852.0, "rewards//mean": 0.6364069581031799, "rewards//std": 0.4873649477958679, "step": 21 }, { "advantage/logodds_epsilon": 0.00025075000000000005, "advantage/max": 2.703927993774414, "advantage/mean": -0.012429304420948029, "advantage/min": -2.7039284706115723, "advantage/std": 0.8538786768913269, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 718.7890625, "completions/mean_terminated_length": 541.6913452148438, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.26112759643916916, "grad_norm": 0.8828125, "kl": 0.029131366696674377, "learning_rate": 1.996539130905593e-05, "loss": 0.0149, "num_tokens": 2626337.0, "rewards//mean": 0.587719202041626, "rewards//std": 0.5004004836082458, "step": 22 }, { "advantage/logodds_epsilon": 0.00026264285714285723, "advantage/max": 2.7037513256073, "advantage/mean": -0.0014824382960796356, "advantage/min": -2.703751802444458, "advantage/std": 0.5372014045715332, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 681.2109375, "completions/mean_terminated_length": 621.4586791992188, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.27299703264094954, "grad_norm": 0.62109375, "kl": 0.031289953854866326, "learning_rate": 1.9945941475610623e-05, "loss": 0.0109, "num_tokens": 2727476.0, "rewards//mean": 0.8566299676895142, "rewards//std": 0.35583803057670593, "step": 23 }, { "advantage/logodds_epsilon": 0.00027453571428571436, "advantage/max": 1.6469597816467285, "advantage/mean": -0.041688546538352966, "advantage/min": -2.7035748958587646, "advantage/std": 0.8940851092338562, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 767.953125, "completions/mean_terminated_length": 667.7608642578125, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.28486646884273, "grad_norm": 0.93359375, "kl": 0.027894207509234548, "learning_rate": 1.9922186613134152e-05, "loss": 0.1136, "num_tokens": 2840230.0, "rewards//mean": 0.7074149250984192, "rewards//std": 0.46066340804100037, "step": 24 }, { "advantage/logodds_epsilon": 0.0002864285714285715, "advantage/max": 1.9371424913406372, "advantage/mean": -0.019469894468784332, "advantage/min": -2.7033982276916504, "advantage/std": 0.8306273221969604, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 742.0078125, "completions/mean_terminated_length": 599.3529663085938, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.29673590504451036, "grad_norm": 0.81640625, "kl": 0.02968023216817528, "learning_rate": 1.9894137003747404e-05, "loss": 0.0728, "num_tokens": 2952367.0, "rewards//mean": 0.6998242139816284, "rewards//std": 0.4637812674045563, "step": 25 }, { "advantage/logodds_epsilon": 0.0002983214285714286, "advantage/max": 2.703221559524536, "advantage/mean": -0.009083299897611141, "advantage/min": -2.703221559524536, "advantage/std": 0.8210997581481934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 796.15625, "completions/mean_terminated_length": 680.8941040039062, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.3086053412462908, "grad_norm": 0.83203125, "kl": 0.024444299924653023, "learning_rate": 1.986180478852149e-05, "loss": 0.041, "num_tokens": 3070579.0, "rewards//mean": 0.6674760580062866, "rewards//std": 0.47770819067955017, "step": 26 }, { "advantage/logodds_epsilon": 0.0003102142857142858, "advantage/max": 2.7030446529388428, "advantage/mean": 0.03770775347948074, "advantage/min": -2.703045129776001, "advantage/std": 0.9571543335914612, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 868.8828125, "completions/mean_terminated_length": 656.3148193359375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.32047477744807124, "grad_norm": 0.82421875, "kl": 0.021971712238155305, "learning_rate": 1.9825203962222573e-05, "loss": -0.0093, "num_tokens": 3199700.0, "rewards//mean": 0.4137590527534485, "rewards//std": 0.5027788281440735, "step": 27 }, { "advantage/logodds_epsilon": 0.0003221071428571429, "advantage/max": 2.0144219398498535, "advantage/mean": -0.008568160235881805, "advantage/min": -2.691011428833008, "advantage/std": 0.9483392238616943, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 833.6875, "completions/mean_terminated_length": 699.2000122070312, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.3323442136498516, "grad_norm": 0.875, "kl": 0.027091103547718376, "learning_rate": 1.9784350367254322e-05, "loss": 0.0383, "num_tokens": 3321380.0, "rewards//mean": 0.5257835984230042, "rewards//std": 0.5068398714065552, "step": 28 }, { "advantage/logodds_epsilon": 0.00033400000000000004, "advantage/max": 2.7026913166046143, "advantage/mean": 0.013357289135456085, "advantage/min": -2.7026920318603516, "advantage/std": 0.7889476418495178, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 796.6484375, "completions/mean_terminated_length": 660.2374877929688, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.34421364985163205, "grad_norm": 0.6484375, "kl": 0.025885087088681757, "learning_rate": 1.9739261686800662e-05, "loss": 0.0067, "num_tokens": 3441263.0, "rewards//mean": 0.6671923995018005, "rewards//std": 0.47810864448547363, "step": 29 }, { "advantage/logodds_epsilon": 0.0003458928571428572, "advantage/max": 2.7025139331817627, "advantage/mean": -0.003880620002746582, "advantage/min": -2.7025153636932373, "advantage/std": 0.7040800452232361, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 713.0703125, "completions/mean_terminated_length": 591.4021606445312, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.3560830860534125, "grad_norm": 0.703125, "kl": 0.032981967844534665, "learning_rate": 1.968995743717171e-05, "loss": 0.0277, "num_tokens": 3546704.0, "rewards//mean": 0.7451189756393433, "rewards//std": 0.44322526454925537, "step": 30 }, { "advantage/logodds_epsilon": 0.00035778571428571435, "advantage/max": 2.702338695526123, "advantage/mean": -0.0032848306000232697, "advantage/min": -2.613555431365967, "advantage/std": 0.8725407719612122, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 826.203125, "completions/mean_terminated_length": 634.4923095703125, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.36795252225519287, "grad_norm": 0.75, "kl": 0.02586214942857623, "learning_rate": 1.963645895935632e-05, "loss": 0.0233, "num_tokens": 3667562.0, "rewards//mean": 0.5240461826324463, "rewards//std": 0.5086991190910339, "step": 31 }, { "advantage/logodds_epsilon": 0.00036967857142857153, "advantage/max": 2.5066890716552734, "advantage/mean": -0.031669870018959045, "advantage/min": -2.702162504196167, "advantage/std": 0.8189276456832886, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 795.7734375, "completions/mean_terminated_length": 658.8375244140625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.3798219584569733, "grad_norm": 0.734375, "kl": 0.029125412344001234, "learning_rate": 1.9578789409784727e-05, "loss": 0.0437, "num_tokens": 3787557.0, "rewards//mean": 0.6826491355895996, "rewards//std": 0.47256773710250854, "step": 32 }, { "advantage/logodds_epsilon": 0.0003815714285714286, "advantage/max": 2.7019858360290527, "advantage/mean": 0.0026858258061110973, "advantage/min": -1.8257653713226318, "advantage/std": 0.5741036534309387, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 757.546875, "completions/mean_terminated_length": 686.3168334960938, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.3916913946587537, "grad_norm": 0.458984375, "kl": 0.025140227226074785, "learning_rate": 1.951697375030553e-05, "loss": 0.0053, "num_tokens": 3898155.0, "rewards//mean": 0.7945812940597534, "rewards//std": 0.40846800804138184, "step": 33 }, { "advantage/logodds_epsilon": 0.0003934642857142858, "advantage/max": 2.7018091678619385, "advantage/mean": -0.0054876613430678844, "advantage/min": -2.598515272140503, "advantage/std": 0.8233418464660645, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 831.125, "completions/mean_terminated_length": 715.4000244140625, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.4035608308605341, "grad_norm": 0.69140625, "kl": 0.029167757485993207, "learning_rate": 1.9451038737381078e-05, "loss": 0.0223, "num_tokens": 4020891.0, "rewards//mean": 0.6431732177734375, "rewards//std": 0.4865179657936096, "step": 34 }, { "advantage/logodds_epsilon": 0.0004053571428571429, "advantage/max": 2.701632261276245, "advantage/mean": -0.02064463123679161, "advantage/min": -2.7016334533691406, "advantage/std": 0.8611717224121094, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 805.0, "completions/mean_terminated_length": 694.2117919921875, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.41543026706231456, "grad_norm": 0.7265625, "kl": 0.029243278899230063, "learning_rate": 1.9381012910506146e-05, "loss": 0.0591, "num_tokens": 4138995.0, "rewards//mean": 0.6594928503036499, "rewards//std": 0.4806313216686249, "step": 35 }, { "advantage/logodds_epsilon": 0.0004172500000000001, "advantage/max": 2.7014570236206055, "advantage/mean": 0.010569433681666851, "advantage/min": -2.7014570236206055, "advantage/std": 0.6529413461685181, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 725.75, "completions/mean_terminated_length": 634.448974609375, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.42729970326409494, "grad_norm": 0.5234375, "kl": 0.03295414987951517, "learning_rate": 1.930692657985482e-05, "loss": 0.0061, "num_tokens": 4247299.0, "rewards//mean": 0.7556747198104858, "rewards//std": 0.4338902533054352, "step": 36 }, { "advantage/logodds_epsilon": 0.0004291428571428572, "advantage/max": 2.418079137802124, "advantage/mean": -0.012974461540579796, "advantage/min": -2.7012808322906494, "advantage/std": 0.728173017501831, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 735.921875, "completions/mean_terminated_length": 639.8958740234375, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.4391691394658754, "grad_norm": 0.734375, "kl": 0.03324733709450811, "learning_rate": 1.9228811813160972e-05, "loss": 0.025, "num_tokens": 4359721.0, "rewards//mean": 0.7533758282661438, "rewards//std": 0.43797916173934937, "step": 37 }, { "advantage/logodds_epsilon": 0.00044103571428571434, "advantage/max": 2.324380397796631, "advantage/mean": -0.024011407047510147, "advantage/min": -2.7011046409606934, "advantage/std": 0.6718021631240845, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 743.9765625, "completions/mean_terminated_length": 654.4844970703125, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.45103857566765576, "grad_norm": 0.7421875, "kl": 0.032304306514561176, "learning_rate": 1.9146702421837952e-05, "loss": 0.0519, "num_tokens": 4470342.0, "rewards//mean": 0.8252986669540405, "rewards//std": 0.3849862813949585, "step": 38 }, { "advantage/logodds_epsilon": 0.0004529285714285715, "advantage/max": 1.0875418186187744, "advantage/mean": -0.006378441117703915, "advantage/min": -1.9391565322875977, "advantage/std": 0.46930885314941406, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 654.1015625, "completions/mean_terminated_length": 612.2869262695312, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.4629080118694362, "grad_norm": 0.4140625, "kl": 0.03506830852711573, "learning_rate": 1.906063394634356e-05, "loss": 0.0236, "num_tokens": 4566963.0, "rewards//mean": 0.9207130670547485, "rewards//std": 0.27343931794166565, "step": 39 }, { "advantage/logodds_epsilon": 0.00046482142857142864, "advantage/max": 2.700751781463623, "advantage/mean": -0.01085714902728796, "advantage/min": -2.7007522583007812, "advantage/std": 0.6957706212997437, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 756.15625, "completions/mean_terminated_length": 659.2765502929688, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.47477744807121663, "grad_norm": 0.58984375, "kl": 0.03276660805568099, "learning_rate": 1.8970643640796642e-05, "loss": 0.0285, "num_tokens": 4678071.0, "rewards//mean": 0.7705238461494446, "rewards//std": 0.4256618022918701, "step": 40 }, { "advantage/logodds_epsilon": 0.00047671428571428577, "advantage/max": 1.4527703523635864, "advantage/mean": -0.016703419387340546, "advantage/min": -2.7005763053894043, "advantage/std": 0.6522344350814819, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 669.84375, "completions/mean_terminated_length": 575.1683349609375, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.486646884272997, "grad_norm": 0.9140625, "kl": 0.03814881236758083, "learning_rate": 1.887677045685188e-05, "loss": 0.066, "num_tokens": 4780747.0, "rewards//mean": 0.8180162906646729, "rewards//std": 0.3903641104698181, "step": 41 }, { "advantage/logodds_epsilon": 0.0004886071428571428, "advantage/max": 2.700399160385132, "advantage/mean": -0.006951633840799332, "advantage/min": -2.7004001140594482, "advantage/std": 0.8440094590187073, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 781.7890625, "completions/mean_terminated_length": 631.5570068359375, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.49851632047477745, "grad_norm": 0.7890625, "kl": 0.035947895026765764, "learning_rate": 1.877905502683987e-05, "loss": 0.0346, "num_tokens": 4898672.0, "rewards//mean": 0.6191579103469849, "rewards//std": 0.49360737204551697, "step": 42 }, { "advantage/logodds_epsilon": 0.0005005000000000001, "advantage/max": 1.9356884956359863, "advantage/mean": -0.026186538860201836, "advantage/min": -2.7002241611480713, "advantage/std": 0.7014551162719727, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 749.828125, "completions/mean_terminated_length": 689.771484375, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.5103857566765578, "grad_norm": 0.61328125, "kl": 0.033930204808712006, "learning_rate": 1.8677539646179706e-05, "loss": 0.0662, "num_tokens": 5009066.0, "rewards//mean": 0.818067729473114, "rewards//std": 0.39025673270225525, "step": 43 }, { "advantage/logodds_epsilon": 0.0005123928571428572, "advantage/max": 2.6708412170410156, "advantage/mean": -0.011584467254579067, "advantage/min": -2.7000479698181152, "advantage/std": 0.9008896350860596, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 833.59375, "completions/mean_terminated_length": 715.4937133789062, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.5222551928783383, "grad_norm": 0.78515625, "kl": 0.032038538600318134, "learning_rate": 1.8572268255071718e-05, "loss": 0.0502, "num_tokens": 5132766.0, "rewards//mean": 0.6196942329406738, "rewards//std": 0.4929179847240448, "step": 44 }, { "advantage/logodds_epsilon": 0.0005242857142857144, "advantage/max": 2.6998708248138428, "advantage/mean": -0.006081804633140564, "advantage/min": -2.6998720169067383, "advantage/std": 0.8706440925598145, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 800.7109375, "completions/mean_terminated_length": 621.45068359375, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.5341246290801187, "grad_norm": 0.74609375, "kl": 0.036316482815891504, "learning_rate": 1.8463286419478256e-05, "loss": 0.0214, "num_tokens": 5254737.0, "rewards//mean": 0.5557836294174194, "rewards//std": 0.5056889653205872, "step": 45 }, { "advantage/logodds_epsilon": 0.0005361785714285715, "advantage/max": 1.452530026435852, "advantage/mean": -0.0004155319184064865, "advantage/min": -2.6996960639953613, "advantage/std": 0.45691579580307007, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 710.0625, "completions/mean_terminated_length": 668.389404296875, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.5459940652818991, "grad_norm": 0.390625, "kl": 0.03480621299240738, "learning_rate": 1.8350641311400813e-05, "loss": 0.0041, "num_tokens": 5359401.0, "rewards//mean": 0.9130905866622925, "rewards//std": 0.28455793857574463, "step": 46 }, { "advantage/logodds_epsilon": 0.0005480714285714287, "advantage/max": 2.6995198726654053, "advantage/mean": -0.008102089166641235, "advantage/min": -2.6995203495025635, "advantage/std": 0.6858307123184204, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 764.671875, "completions/mean_terminated_length": 685.2857055664062, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.5578635014836796, "grad_norm": 0.60546875, "kl": 0.02857328619575128, "learning_rate": 1.8234381688461943e-05, "loss": 0.0463, "num_tokens": 5473399.0, "rewards//mean": 0.7786081433296204, "rewards//std": 0.42004773020744324, "step": 47 }, { "advantage/logodds_epsilon": 0.0005599642857142858, "advantage/max": 2.6993439197540283, "advantage/mean": -0.018411144614219666, "advantage/min": -2.6993443965911865, "advantage/std": 0.7576082348823547, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 755.4296875, "completions/mean_terminated_length": 650.3369750976562, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.56973293768546, "grad_norm": 0.59765625, "kl": 0.028340922435745597, "learning_rate": 1.8114557872800906e-05, "loss": 0.0362, "num_tokens": 5583862.0, "rewards//mean": 0.7149896621704102, "rewards//std": 0.4574246108531952, "step": 48 }, { "advantage/logodds_epsilon": 0.0005718571428571429, "advantage/max": 2.1167187690734863, "advantage/mean": -0.013100487180054188, "advantage/min": -2.6991684436798096, "advantage/std": 0.6667536497116089, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 708.8984375, "completions/mean_terminated_length": 643.5, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.5816023738872403, "grad_norm": 0.65234375, "kl": 0.03176001494284719, "learning_rate": 1.799122172929206e-05, "loss": 0.0367, "num_tokens": 5689025.0, "rewards//mean": 0.8245458602905273, "rewards//std": 0.3866560459136963, "step": 49 }, { "advantage/logodds_epsilon": 0.0005837500000000002, "advantage/max": 2.512308120727539, "advantage/mean": -0.006615880876779556, "advantage/min": -2.5892391204833984, "advantage/std": 0.9890721440315247, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5546875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 846.890625, "completions/mean_terminated_length": 626.2807006835938, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.5934718100890207, "grad_norm": 0.8359375, "kl": 0.028011595306452364, "learning_rate": 1.7864426643095537e-05, "loss": 0.0215, "num_tokens": 5818339.0, "rewards//mean": 0.4374961256980896, "rewards//std": 0.5059982538223267, "step": 50 }, { "advantage/logodds_epsilon": 0.0005956428571428572, "advantage/max": 2.1364798545837402, "advantage/mean": -0.01754511147737503, "advantage/min": -2.698817014694214, "advantage/std": 0.6074024438858032, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 700.421875, "completions/mean_terminated_length": 640.5, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.6053412462908012, "grad_norm": 0.5546875, "kl": 0.03255216917023063, "learning_rate": 1.773422749654988e-05, "loss": 0.0416, "num_tokens": 5923825.0, "rewards//mean": 0.8574329614639282, "rewards//std": 0.3538227379322052, "step": 51 }, { "advantage/logodds_epsilon": 0.0006075357142857143, "advantage/max": 2.5419423580169678, "advantage/mean": -0.006520539056509733, "advantage/min": -2.077256917953491, "advantage/std": 0.6562984585762024, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 681.4765625, "completions/mean_terminated_length": 594.1666870117188, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.6172106824925816, "grad_norm": 0.546875, "kl": 0.041263859427999705, "learning_rate": 1.7600680645416583e-05, "loss": 0.0254, "num_tokens": 6025430.0, "rewards//mean": 0.8023372888565063, "rewards//std": 0.40279263257980347, "step": 52 }, { "advantage/logodds_epsilon": 0.0006194285714285716, "advantage/max": 2.698464870452881, "advantage/mean": -0.01813482493162155, "advantage/min": -2.698465585708618, "advantage/std": 0.8114321231842041, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 827.1953125, "completions/mean_terminated_length": 731.0814208984375, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.629080118694362, "grad_norm": 0.640625, "kl": 0.028205811278894544, "learning_rate": 1.746384389448694e-05, "loss": 0.0416, "num_tokens": 6149983.0, "rewards//mean": 0.7141739130020142, "rewards//std": 0.45872655510902405, "step": 53 }, { "advantage/logodds_epsilon": 0.0006313214285714287, "advantage/max": 2.698289394378662, "advantage/mean": -0.004583118483424187, "advantage/min": -2.6982898712158203, "advantage/std": 0.7579176425933838, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 799.671875, "completions/mean_terminated_length": 660.5316772460938, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.6409495548961425, "grad_norm": 0.640625, "kl": 0.03153213975019753, "learning_rate": 1.7323776472561625e-05, "loss": 0.0272, "num_tokens": 6268077.0, "rewards//mean": 0.73039710521698, "rewards//std": 0.4500512182712555, "step": 54 }, { "advantage/logodds_epsilon": 0.0006432142857142858, "advantage/max": 2.6981141567230225, "advantage/mean": 0.0031758034601807594, "advantage/min": -2.6981143951416016, "advantage/std": 0.6641281247138977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 745.0390625, "completions/mean_terminated_length": 659.642822265625, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.6528189910979229, "grad_norm": 0.8359375, "kl": 0.030679657589644194, "learning_rate": 1.7180539006813973e-05, "loss": 0.0237, "num_tokens": 6378738.0, "rewards//mean": 0.7700124979019165, "rewards//std": 0.42661646008491516, "step": 55 }, { "advantage/logodds_epsilon": 0.0006551071428571429, "advantage/max": 2.6979339122772217, "advantage/mean": -0.009846007451415062, "advantage/min": -2.6979386806488037, "advantage/std": 0.5840507745742798, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 711.046875, "completions/mean_terminated_length": 649.6261596679688, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.6646884272997032, "grad_norm": 1.390625, "kl": 0.028229560470208526, "learning_rate": 1.7034193496547903e-05, "loss": 0.0326, "num_tokens": 6483208.0, "rewards//mean": 0.8656327724456787, "rewards//std": 0.34470248222351074, "step": 56 }, { "advantage/logodds_epsilon": 0.0006670000000000001, "advantage/max": 2.5979971885681152, "advantage/mean": -0.017020640894770622, "advantage/min": -2.697763204574585, "advantage/std": 1.0236399173736572, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 916.8515625, "completions/mean_terminated_length": 799.1638793945312, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.6765578635014837, "grad_norm": 2.65625, "kl": 0.027361205138731748, "learning_rate": 1.6884803286362e-05, "loss": 0.0522, "num_tokens": 6615989.0, "rewards//mean": 0.5157707333564758, "rewards//std": 0.5094924569129944, "step": 57 }, { "advantage/logodds_epsilon": 0.0006788928571428573, "advantage/max": 1.9344719648361206, "advantage/mean": -0.010600470006465912, "advantage/min": -2.697587728500366, "advantage/std": 0.7744669914245605, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 799.8984375, "completions/mean_terminated_length": 674.1829223632812, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.6884272997032641, "grad_norm": 0.65625, "kl": 0.029555415618233383, "learning_rate": 1.6732433038731245e-05, "loss": 0.0248, "num_tokens": 6735664.0, "rewards//mean": 0.7059880495071411, "rewards//std": 0.462918758392334, "step": 58 }, { "advantage/logodds_epsilon": 0.0006907857142857144, "advantage/max": 2.6974117755889893, "advantage/mean": -0.013238908722996712, "advantage/min": -2.6974122524261475, "advantage/std": 0.677533745765686, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 726.625, "completions/mean_terminated_length": 647.1287231445312, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.7002967359050445, "grad_norm": 0.625, "kl": 0.025012695870827883, "learning_rate": 1.657714870601833e-05, "loss": 0.0425, "num_tokens": 6841864.0, "rewards//mean": 0.794950544834137, "rewards//std": 0.40773335099220276, "step": 59 }, { "advantage/logodds_epsilon": 0.0007026785714285714, "advantage/max": 1.642048954963684, "advantage/mean": -0.03683529794216156, "advantage/min": -2.6972367763519287, "advantage/std": 0.8866681456565857, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 797.8671875, "completions/mean_terminated_length": 675.2650146484375, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.712166172106825, "grad_norm": 0.84765625, "kl": 0.0283702181186527, "learning_rate": 1.641901750192666e-05, "loss": 0.0942, "num_tokens": 6959047.0, "rewards//mean": 0.6587684750556946, "rewards//std": 0.4816700518131256, "step": 60 }, { "advantage/logodds_epsilon": 0.0007145714285714287, "advantage/max": 2.6065056324005127, "advantage/mean": -0.024096639826893806, "advantage/min": -2.69706130027771, "advantage/std": 0.8914181590080261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 808.078125, "completions/mean_terminated_length": 686.951171875, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.7240356083086054, "grad_norm": 0.703125, "kl": 0.02628780552186072, "learning_rate": 1.6258107872407376e-05, "loss": 0.0564, "num_tokens": 7077337.0, "rewards//mean": 0.6672515273094177, "rewards//std": 0.47803038358688354, "step": 61 }, { "advantage/logodds_epsilon": 0.0007264642857142858, "advantage/max": 1.9922456741333008, "advantage/mean": -0.023510506376624107, "advantage/min": -2.694539785385132, "advantage/std": 0.8509385585784912, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 797.3671875, "completions/mean_terminated_length": 661.3875122070312, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.7359050445103857, "grad_norm": 0.6953125, "kl": 0.025755617709364742, "learning_rate": 1.609448946603304e-05, "loss": 0.0489, "num_tokens": 7197120.0, "rewards//mean": 0.6514172554016113, "rewards//std": 0.483541876077652, "step": 62 }, { "advantage/logodds_epsilon": 0.000738357142857143, "advantage/max": 1.9340767860412598, "advantage/mean": -0.014695411548018456, "advantage/min": -2.6967105865478516, "advantage/std": 0.7965668439865112, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 811.0546875, "completions/mean_terminated_length": 730.9140014648438, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.7477744807121661, "grad_norm": 0.78515625, "kl": 0.02421920670894906, "learning_rate": 1.592823310385073e-05, "loss": 0.05, "num_tokens": 7316143.0, "rewards//mean": 0.7625675797462463, "rewards//std": 0.43082737922668457, "step": 63 }, { "advantage/logodds_epsilon": 0.0007502500000000002, "advantage/max": 2.6965343952178955, "advantage/mean": -0.025378329679369926, "advantage/min": -2.696535348892212, "advantage/std": 0.92302006483078, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 825.9765625, "completions/mean_terminated_length": 699.0384521484375, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.7596439169139466, "grad_norm": 0.83984375, "kl": 0.0288353796931915, "learning_rate": 1.5759410748727663e-05, "loss": 0.0559, "num_tokens": 7438428.0, "rewards//mean": 0.6124646663665771, "rewards//std": 0.49401599168777466, "step": 64 }, { "advantage/logodds_epsilon": 0.0007621428571428572, "advantage/max": 2.696359872817993, "advantage/mean": -0.007194130681455135, "advantage/min": -2.6963601112365723, "advantage/std": 0.9335386157035828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 871.6015625, "completions/mean_terminated_length": 737.1323852539062, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.771513353115727, "grad_norm": 0.84375, "kl": 0.02507934661116451, "learning_rate": 1.5588095474202597e-05, "loss": 0.0377, "num_tokens": 7568553.0, "rewards//mean": 0.549358606338501, "rewards//std": 0.5049357414245605, "step": 65 }, { "advantage/logodds_epsilon": 0.0007740357142857144, "advantage/max": 2.696183681488037, "advantage/mean": -0.023947857320308685, "advantage/min": -2.6961848735809326, "advantage/std": 0.8393827676773071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 784.75, "completions/mean_terminated_length": 667.906982421875, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.7833827893175074, "grad_norm": 0.75, "kl": 0.026422377151902765, "learning_rate": 1.5414361432856475e-05, "loss": 0.0404, "num_tokens": 7684065.0, "rewards//mean": 0.6828888654708862, "rewards//std": 0.4722091257572174, "step": 66 }, { "advantage/logodds_epsilon": 0.0007859285714285715, "advantage/max": 1.9337598085403442, "advantage/mean": -0.028962209820747375, "advantage/min": -2.696009635925293, "advantage/std": 0.7832530736923218, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 777.4375, "completions/mean_terminated_length": 711.5247192382812, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.7952522255192879, "grad_norm": 0.75390625, "kl": 0.02613440022105351, "learning_rate": 1.5238283824216015e-05, "loss": 0.0734, "num_tokens": 7798321.0, "rewards//mean": 0.8016011118888855, "rewards//std": 0.4042954444885254, "step": 67 }, { "advantage/logodds_epsilon": 0.0007978214285714288, "advantage/max": 1.451474905014038, "advantage/mean": -0.0028909663669764996, "advantage/min": -1.9345837831497192, "advantage/std": 0.3807392120361328, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 654.34375, "completions/mean_terminated_length": 632.9586791992188, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.8071216617210683, "grad_norm": 0.33984375, "kl": 0.03040262428112328, "learning_rate": 1.5059938862204126e-05, "loss": 0.0118, "num_tokens": 7896805.0, "rewards//mean": 0.9445071220397949, "rewards//std": 0.23162488639354706, "step": 68 }, { "advantage/logodds_epsilon": 0.0008097142857142858, "advantage/max": 2.6956591606140137, "advantage/mean": 0.04149559512734413, "advantage/min": -1.9395079612731934, "advantage/std": 0.7863659858703613, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 824.3125, "completions/mean_terminated_length": 636.727294921875, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.8189910979228486, "grad_norm": 0.6875, "kl": 0.026372475258540362, "learning_rate": 1.4879403742151283e-05, "loss": -0.0153, "num_tokens": 8017661.0, "rewards//mean": 0.5564619302749634, "rewards//std": 0.5049146413803101, "step": 69 }, { "advantage/logodds_epsilon": 0.0008216071428571429, "advantage/max": 1.9278475046157837, "advantage/mean": -0.023548610508441925, "advantage/min": -2.695484161376953, "advantage/std": 0.6201450824737549, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 706.578125, "completions/mean_terminated_length": 644.2803344726562, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.8308605341246291, "grad_norm": 0.59765625, "kl": 0.02705864212475717, "learning_rate": 1.469675660738206e-05, "loss": 0.0395, "num_tokens": 8121991.0, "rewards//mean": 0.8400956392288208, "rewards//std": 0.3730737268924713, "step": 70 }, { "advantage/logodds_epsilon": 0.0008335000000000001, "advantage/max": 1.933438777923584, "advantage/mean": -0.000690137967467308, "advantage/min": -2.6953091621398926, "advantage/std": 0.8231402039527893, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 791.5234375, "completions/mean_terminated_length": 627.239990234375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.8427299703264095, "grad_norm": 0.69140625, "kl": 0.031296103610657156, "learning_rate": 1.4512076515391375e-05, "loss": 0.0387, "num_tokens": 8237898.0, "rewards//mean": 0.6578705310821533, "rewards//std": 0.48294711112976074, "step": 71 }, { "advantage/logodds_epsilon": 0.0008453928571428573, "advantage/max": 2.6951327323913574, "advantage/mean": -0.011506309732794762, "advantage/min": -2.695134162902832, "advantage/std": 0.7828315496444702, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 771.1328125, "completions/mean_terminated_length": 614.2911376953125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.8545994065281899, "grad_norm": 0.64453125, "kl": 0.028297776996623725, "learning_rate": 1.4325443403625012e-05, "loss": 0.0231, "num_tokens": 8350979.0, "rewards//mean": 0.6668483018875122, "rewards//std": 0.4786072373390198, "step": 72 }, { "advantage/logodds_epsilon": 0.0008572857142857144, "advantage/max": 2.201094388961792, "advantage/mean": -0.009981157258152962, "advantage/min": -2.6949591636657715, "advantage/std": 0.7409846186637878, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 788.46875, "completions/mean_terminated_length": 677.4712524414062, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 0.8664688427299704, "grad_norm": 0.6640625, "kl": 0.026707628334406763, "learning_rate": 1.4136938054879284e-05, "loss": 0.0315, "num_tokens": 8467959.0, "rewards//mean": 0.7310893535614014, "rewards//std": 0.4489072263240814, "step": 73 }, { "advantage/logodds_epsilon": 0.0008691785714285715, "advantage/max": 2.6035120487213135, "advantage/mean": -0.02285468950867653, "advantage/min": -2.694784164428711, "advantage/std": 0.8432738184928894, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 747.890625, "completions/mean_terminated_length": 603.2619018554688, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.8783382789317508, "grad_norm": 0.7890625, "kl": 0.025898948137182742, "learning_rate": 1.3946642062334765e-05, "loss": 0.0603, "num_tokens": 8582449.0, "rewards//mean": 0.6668868660926819, "rewards//std": 0.4785531759262085, "step": 74 }, { "advantage/logodds_epsilon": 0.0008810714285714287, "advantage/max": 2.1163926124572754, "advantage/mean": 0.0008571594953536987, "advantage/min": -2.6946091651916504, "advantage/std": 0.7458250522613525, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 761.109375, "completions/mean_terminated_length": 632.720947265625, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.8902077151335311, "grad_norm": 0.72265625, "kl": 0.027754372102208436, "learning_rate": 1.3754637794239303e-05, "loss": 0.012, "num_tokens": 8697839.0, "rewards//mean": 0.6986972689628601, "rewards//std": 0.46553197503089905, "step": 75 }, { "advantage/logodds_epsilon": 0.0008929642857142859, "advantage/max": 1.0881941318511963, "advantage/mean": -0.027605965733528137, "advantage/min": -2.694434404373169, "advantage/std": 0.5187329053878784, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 667.125, "completions/mean_terminated_length": 630.2069091796875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.9020771513353115, "grad_norm": 0.59765625, "kl": 0.024963520525489002, "learning_rate": 1.356100835825547e-05, "loss": 0.0458, "num_tokens": 8797575.0, "rewards//mean": 0.9130833148956299, "rewards//std": 0.2845827043056488, "step": 76 }, { "advantage/logodds_epsilon": 0.000904857142857143, "advantage/max": 1.451042890548706, "advantage/mean": -0.024881720542907715, "advantage/min": -2.6942596435546875, "advantage/std": 0.6890738010406494, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 695.6796875, "completions/mean_terminated_length": 619.9135131835938, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.913946587537092, "grad_norm": 0.65234375, "kl": 0.025639849482104182, "learning_rate": 1.3365837565488065e-05, "loss": 0.0641, "num_tokens": 8900822.0, "rewards//mean": 0.8573490977287292, "rewards//std": 0.3540341556072235, "step": 77 }, { "advantage/logodds_epsilon": 0.00091675, "advantage/max": 2.3416223526000977, "advantage/mean": -0.04060763865709305, "advantage/min": -2.694084644317627, "advantage/std": 0.9484633803367615, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 847.3984375, "completions/mean_terminated_length": 761.1511840820312, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.9258160237388724, "grad_norm": 0.89453125, "kl": 0.024255894881207496, "learning_rate": 1.316920989420703e-05, "loss": 0.0742, "num_tokens": 9024257.0, "rewards//mean": 0.611142635345459, "rewards//std": 0.49569782614707947, "step": 78 }, { "advantage/logodds_epsilon": 0.0009286428571428573, "advantage/max": 2.6939098834991455, "advantage/mean": -0.004008917137980461, "advantage/min": -2.6939098834991455, "advantage/std": 0.7015594840049744, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 719.7578125, "completions/mean_terminated_length": 614.07373046875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.9376854599406528, "grad_norm": 0.875, "kl": 0.026439815410412848, "learning_rate": 1.2971210453281675e-05, "loss": 0.024, "num_tokens": 9131522.0, "rewards//mean": 0.7622599005699158, "rewards//std": 0.4313811957836151, "step": 79 }, { "advantage/logodds_epsilon": 0.0009405357142857144, "advantage/max": 2.693735122680664, "advantage/mean": 0.014469900168478489, "advantage/min": -2.693735122680664, "advantage/std": 0.8637734055519104, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 851.75, "completions/mean_terminated_length": 704.4638061523438, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.9495548961424333, "grad_norm": 0.6484375, "kl": 0.021778338530566543, "learning_rate": 1.2771924945341906e-05, "loss": 0.0056, "num_tokens": 9258122.0, "rewards//mean": 0.5738894939422607, "rewards//std": 0.500784695148468, "step": 80 }, { "advantage/logodds_epsilon": 0.0009524285714285715, "advantage/max": 1.8007384538650513, "advantage/mean": -0.010520991869270802, "advantage/min": -2.6935606002807617, "advantage/std": 0.5462207794189453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 650.9921875, "completions/mean_terminated_length": 593.8648681640625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.9614243323442137, "grad_norm": 0.5078125, "kl": 0.026499023369979113, "learning_rate": 1.257143962968246e-05, "loss": 0.0293, "num_tokens": 9354697.0, "rewards//mean": 0.8731256723403931, "rewards//std": 0.3370078206062317, "step": 81 }, { "advantage/logodds_epsilon": 0.0009643214285714288, "advantage/max": 2.693384885787964, "advantage/mean": -0.013431357219815254, "advantage/min": -2.6933858394622803, "advantage/std": 0.7807292342185974, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 772.140625, "completions/mean_terminated_length": 673.5869750976562, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.973293768545994, "grad_norm": 0.78125, "kl": 0.03216716053429991, "learning_rate": 1.236984128492619e-05, "loss": 0.0495, "num_tokens": 9470627.0, "rewards//mean": 0.7139476537704468, "rewards//std": 0.4590918719768524, "step": 82 }, { "advantage/logodds_epsilon": 0.0009762142857142858, "advantage/max": 1.9324731826782227, "advantage/mean": -0.009806068614125252, "advantage/min": -2.178335189819336, "advantage/std": 0.7266980409622192, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 767.484375, "completions/mean_terminated_length": 681.9791870117188, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.9851632047477745, "grad_norm": 1.125, "kl": 0.030359028896782547, "learning_rate": 1.2167217171462566e-05, "loss": 0.0428, "num_tokens": 9582473.0, "rewards//mean": 0.7615800499916077, "rewards//std": 0.43262478709220886, "step": 83 }, { "advantage/logodds_epsilon": 0.000988107142857143, "advantage/max": 2.693035840988159, "advantage/mean": 0.011945459991693497, "advantage/min": -2.6930365562438965, "advantage/std": 0.6034679412841797, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.27941176470588236, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 769.8235473632812, "completions/mean_terminated_length": 671.2653198242188, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.9970326409495549, "grad_norm": 0.71484375, "kl": 0.026008586573880166, "learning_rate": 1.1963654993677645e-05, "loss": 0.0113, "num_tokens": 9693520.0, "rewards//mean": 0.6994150280952454, "rewards//std": 0.46441495418548584, "step": 84 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.21393369138240814, "advantage/mean": -0.006712201051414013, "advantage/min": -1.9274110794067383, "advantage/std": 0.2519371211528778, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.078125, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 555.453125, "completions/mean_terminated_length": 602.5254516601562, "completions/min_length": 0.0, "completions/min_terminated_length": 392.0, "epoch": 1.0118694362017804, "grad_norm": 0.0849609375, "kl": NaN, "learning_rate": 1.1759242861991855e-05, "loss": -0.0265, "num_tokens": 9780978.0, "rewards//mean": 0.9203323721885681, "rewards//std": 0.27474451065063477, "step": 85 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.91951584815979, "advantage/mean": -0.0033463314175605774, "advantage/min": -1.9267429113388062, "advantage/std": 0.38252419233322144, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 533.28125, "completions/mean_terminated_length": 626.238525390625, "completions/min_length": 0.0, "completions/min_terminated_length": 269.0, "epoch": 1.0237388724035608, "grad_norm": 1.125, "kl": 0.025204737728927284, "learning_rate": 1.155406925472205e-05, "loss": -0.203, "num_tokens": 9862646.0, "rewards//mean": 0.8974473476409912, "rewards//std": 0.3062177896499634, "step": 86 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.9081087112426758, "advantage/mean": -0.0033246770035475492, "advantage/min": -1.9195520877838135, "advantage/std": 0.2888992726802826, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1015625, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 614.8828125, "completions/mean_terminated_length": 684.3912963867188, "completions/min_length": 0.0, "completions/min_terminated_length": 309.0, "epoch": 1.0356083086053411, "grad_norm": 0.265625, "kl": NaN, "learning_rate": 1.1348222979784289e-05, "loss": -0.0699, "num_tokens": 9954815.0, "rewards//mean": 0.8970840573310852, "rewards//std": 0.30730167031288147, "step": 87 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.9123625755310059, "advantage/mean": -0.012081624940037727, "advantage/min": -1.9555816650390625, "advantage/std": 0.4595913589000702, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 639.546875, "completions/mean_terminated_length": 705.7069091796875, "completions/min_length": 0.0, "completions/min_terminated_length": 418.0, "epoch": 1.0474777448071217, "grad_norm": 14.3125, "kl": 0.19445816351799294, "learning_rate": 1.1141793136253987e-05, "loss": -0.1501, "num_tokens": 10049965.0, "rewards//mean": 0.9127764701843262, "rewards//std": 0.28559643030166626, "step": 88 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.21387545764446259, "advantage/mean": -0.0033545014448463917, "advantage/min": -1.926504373550415, "advantage/std": 0.17813949286937714, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 582.8828125, "completions/mean_terminated_length": 621.74169921875, "completions/min_length": 0.0, "completions/min_terminated_length": 342.0, "epoch": 1.0593471810089021, "grad_norm": 0.058349609375, "kl": 0.02404517779359594, "learning_rate": 1.09348690758e-05, "loss": -0.0131, "num_tokens": 10137102.0, "rewards//mean": 0.992063581943512, "rewards//std": 0.08979016542434692, "step": 89 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.5069457292556763, "advantage/mean": -0.007181974593549967, "advantage/min": -1.9241938591003418, "advantage/std": 0.5587527751922607, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 569.7734375, "completions/mean_terminated_length": 657.0360717773438, "completions/min_length": 0.0, "completions/min_terminated_length": 302.0, "epoch": 1.0712166172106825, "grad_norm": 0.58984375, "kl": 0.02127398713491857, "learning_rate": 1.072754036400944e-05, "loss": -0.2267, "num_tokens": 10224241.0, "rewards//mean": 0.8261239528656006, "rewards//std": 0.38317152857780457, "step": 90 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.9089659452438354, "advantage/mean": -0.011963944882154465, "advantage/min": -1.9220253229141235, "advantage/std": 0.4585069715976715, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 561.8671875, "completions/mean_terminated_length": 619.9913940429688, "completions/min_length": 0.0, "completions/min_terminated_length": 370.0, "epoch": 1.083086053412463, "grad_norm": 0.38671875, "kl": 0.02332187572028488, "learning_rate": 1.0519896741619803e-05, "loss": -0.1516, "num_tokens": 10309520.0, "rewards//mean": 0.9129678606987, "rewards//std": 0.2849576473236084, "step": 91 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.9065917730331421, "advantage/mean": -0.010066758841276169, "advantage/min": -1.9456684589385986, "advantage/std": 0.4456890821456909, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 610.1640625, "completions/mean_terminated_length": 691.1593017578125, "completions/min_length": 0.0, "completions/min_terminated_length": 481.0, "epoch": 1.0949554896142433, "grad_norm": 0.44921875, "kl": 0.020190397452097386, "learning_rate": 1.0312028085675393e-05, "loss": -0.1532, "num_tokens": 10400749.0, "rewards//mean": 0.9132087230682373, "rewards//std": 0.2841741144657135, "step": 92 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9331722259521484, "advantage/mean": -0.008558844216167927, "advantage/min": -1.9150750637054443, "advantage/std": 0.436222642660141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 550.3046875, "completions/mean_terminated_length": 646.2293090820312, "completions/min_length": 0.0, "completions/min_terminated_length": 333.0, "epoch": 1.1068249258160237, "grad_norm": 0.67578125, "kl": 0.03286611312068999, "learning_rate": 1.0104024370624644e-05, "loss": -0.1458, "num_tokens": 10486644.0, "rewards//mean": 0.8887128829956055, "rewards//std": 0.31881752610206604, "step": 93 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.5136432647705078, "advantage/mean": -0.003937204834073782, "advantage/min": -1.9212960004806519, "advantage/std": 0.4224414825439453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 574.9140625, "completions/mean_terminated_length": 639.904296875, "completions/min_length": 0.0, "completions/min_terminated_length": 312.0, "epoch": 1.1186943620178043, "grad_norm": 0.83203125, "kl": 0.02204144821735099, "learning_rate": 9.89597562937536e-06, "loss": -0.1908, "num_tokens": 10573689.0, "rewards//mean": 0.8965259194374084, "rewards//std": 0.3089805841445923, "step": 94 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9286977052688599, "advantage/mean": 0.003363626077771187, "advantage/min": -0.22882387042045593, "advantage/std": 0.17835266888141632, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 606.4609375, "completions/mean_terminated_length": 652.3277587890625, "completions/min_length": 0.0, "completions/min_terminated_length": 327.0, "epoch": 1.1305637982195846, "grad_norm": 1.46875, "kl": 0.020696879364550114, "learning_rate": 9.687971914324607e-06, "loss": -0.1203, "num_tokens": 10665316.0, "rewards//mean": 0.9443669319152832, "rewards//std": 0.2322123646736145, "step": 95 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.1837575435638428, "advantage/mean": -0.004825676325708628, "advantage/min": -1.9381605386734009, "advantage/std": 0.337948203086853, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.109375, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 552.4296875, "completions/mean_terminated_length": 620.2719116210938, "completions/min_length": 0.0, "completions/min_terminated_length": 382.0, "epoch": 1.142433234421365, "grad_norm": 0.55859375, "kl": NaN, "learning_rate": 9.480103258380198e-06, "loss": -0.083, "num_tokens": 10751243.0, "rewards//mean": 0.8810471296310425, "rewards//std": 0.32778117060661316, "step": 96 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.21301649510860443, "advantage/mean": -0.0032973499037325382, "advantage/min": -1.9131762981414795, "advantage/std": 0.17694886028766632, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 573.46875, "completions/mean_terminated_length": 591.9677124023438, "completions/min_length": 0.0, "completions/min_terminated_length": 333.0, "epoch": 1.1543026706231454, "grad_norm": 0.0595703125, "kl": 0.021307097631506622, "learning_rate": 9.272459635990563e-06, "loss": -0.0131, "num_tokens": 10838335.0, "rewards//mean": 0.9921300411224365, "rewards//std": 0.08903875201940536, "step": 97 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.43134191632270813, "advantage/mean": -0.00999000109732151, "advantage/min": -1.9251052141189575, "advantage/std": 0.3280654847621918, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 533.0, "completions/mean_terminated_length": 631.7037353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 322.0, "epoch": 1.1661721068249258, "grad_norm": 0.185546875, "kl": NaN, "learning_rate": 9.065130924199998e-06, "loss": -0.0342, "num_tokens": 10920343.0, "rewards//mean": 0.8421245813369751, "rewards//std": 0.3683176040649414, "step": 98 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.21354015171527863, "advantage/mean": -0.006645852699875832, "advantage/min": -1.9212852716445923, "advantage/std": 0.25095441937446594, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 644.578125, "completions/mean_terminated_length": 660.0480346679688, "completions/min_length": 0.0, "completions/min_terminated_length": 344.0, "epoch": 1.1780415430267062, "grad_norm": 0.08056640625, "kl": 0.027586172567680478, "learning_rate": 8.858206863746018e-06, "loss": -0.0264, "num_tokens": 11016305.0, "rewards//mean": 0.9842005968093872, "rewards//std": 0.12589696049690247, "step": 99 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.9087148904800415, "advantage/mean": -0.0066377646289765835, "advantage/min": -1.9224987030029297, "advantage/std": 0.339020699262619, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1640625, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 543.90625, "completions/mean_terminated_length": 650.6541748046875, "completions/min_length": 0.0, "completions/min_terminated_length": 312.0, "epoch": 1.1899109792284865, "grad_norm": 0.328125, "kl": NaN, "learning_rate": 8.651777020215713e-06, "loss": -0.0832, "num_tokens": 11104733.0, "rewards//mean": 0.8897290229797363, "rewards//std": 0.3159048557281494, "step": 100 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.43116751313209534, "advantage/mean": -0.006669612601399422, "advantage/min": -1.9255216121673584, "advantage/std": 0.2759830057621002, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 573.8515625, "completions/mean_terminated_length": 592.3629150390625, "completions/min_length": 0.0, "completions/min_terminated_length": 359.0, "epoch": 1.2017804154302671, "grad_norm": 0.185546875, "kl": 0.025343034823890775, "learning_rate": 8.445930745277953e-06, "loss": -0.0401, "num_tokens": 11190650.0, "rewards//mean": 0.9763144254684448, "rewards//std": 0.15349189937114716, "step": 101 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.9046536087989807, "advantage/mean": -0.006608244962990284, "advantage/min": -1.923117995262146, "advantage/std": 0.33801230788230896, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 676.984375, "completions/mean_terminated_length": 728.1849365234375, "completions/min_length": 0.0, "completions/min_terminated_length": 365.0, "epoch": 1.2136498516320475, "grad_norm": 0.2373046875, "kl": 0.023159809643402696, "learning_rate": 8.240757138008149e-06, "loss": -0.0829, "num_tokens": 11293840.0, "rewards//mean": 0.9528600573539734, "rewards//std": 0.21340253949165344, "step": 102 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.9051553606987, "advantage/mean": -0.0019344771280884743, "advantage/min": -1.1846874952316284, "advantage/std": 0.318952351808548, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 570.0078125, "completions/mean_terminated_length": 657.3063354492188, "completions/min_length": 0.0, "completions/min_terminated_length": 338.0, "epoch": 1.225519287833828, "grad_norm": 0.55078125, "kl": 0.021536219341214746, "learning_rate": 8.036345006322358e-06, "loss": -0.0976, "num_tokens": 11382169.0, "rewards//mean": 0.8820893168449402, "rewards//std": 0.32490062713623047, "step": 103 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9108253717422485, "advantage/mean": -0.003229904919862747, "advantage/min": -1.917354941368103, "advantage/std": 0.4715126156806946, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1640625, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 562.8984375, "completions/mean_terminated_length": 673.3738403320312, "completions/min_length": 0.0, "completions/min_terminated_length": 312.0, "epoch": 1.2373887240356083, "grad_norm": 0.91015625, "kl": NaN, "learning_rate": 7.832782828537437e-06, "loss": -0.1913, "num_tokens": 11468252.0, "rewards//mean": 0.8420026302337646, "rewards//std": 0.3686053454875946, "step": 104 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9139962196350098, "advantage/mean": 2.95368954539299e-06, "advantage/min": -1.9132927656173706, "advantage/std": 0.2503484785556793, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 574.703125, "completions/mean_terminated_length": 607.9503784179688, "completions/min_length": 0.0, "completions/min_terminated_length": 308.0, "epoch": 1.2492581602373887, "grad_norm": 0.6640625, "kl": 0.022887362516485155, "learning_rate": 7.630158715073813e-06, "loss": -0.1163, "num_tokens": 11558062.0, "rewards//mean": 0.9370107650756836, "rewards//std": 0.2449151873588562, "step": 105 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9214539527893066, "advantage/mean": 0.006719962228089571, "advantage/min": -1.935305118560791, "advantage/std": 0.4798320233821869, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.375, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 412.5078125, "completions/mean_terminated_length": 660.0125122070312, "completions/min_length": 0.0, "completions/min_terminated_length": 387.0, "epoch": 1.2611275964391693, "grad_norm": 1.5234375, "kl": NaN, "learning_rate": 7.428560370317542e-06, "loss": -0.2411, "num_tokens": 11631071.0, "rewards//mean": 0.6590052247047424, "rewards//std": 0.4813198745250702, "step": 106 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.917263150215149, "advantage/mean": -0.002014155499637127, "advantage/min": -1.9239046573638916, "advantage/std": 0.4786885380744934, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1796875, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 518.59375, "completions/mean_terminated_length": 632.1904907226562, "completions/min_length": 0.0, "completions/min_terminated_length": 439.0, "epoch": 1.2729970326409497, "grad_norm": 1.4609375, "kl": NaN, "learning_rate": 7.228075054658096e-06, "loss": -0.2252, "num_tokens": 11711595.0, "rewards//mean": 0.8180580139160156, "rewards//std": 0.39027684926986694, "step": 107 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9152618646621704, "advantage/mean": -0.008685033768415451, "advantage/min": -1.9196232557296753, "advantage/std": 0.5053057670593262, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.2109375, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 519.5234375, "completions/mean_terminated_length": 658.4059448242188, "completions/min_length": 0.0, "completions/min_terminated_length": 327.0, "epoch": 1.28486646884273, "grad_norm": 1.03125, "kl": NaN, "learning_rate": 7.028789546718327e-06, "loss": -0.2851, "num_tokens": 11793686.0, "rewards//mean": 0.7860877513885498, "rewards//std": 0.41535890102386475, "step": 108 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.43442410230636597, "advantage/mean": -0.013488471508026123, "advantage/min": -1.912493109703064, "advantage/std": 0.4084751605987549, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.25, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 507.8671875, "completions/mean_terminated_length": 677.15625, "completions/min_length": 0.0, "completions/min_terminated_length": 330.0, "epoch": 1.2967359050445104, "grad_norm": 0.2255859375, "kl": NaN, "learning_rate": 6.8307901057929735e-06, "loss": -0.0944, "num_tokens": 11875269.0, "rewards//mean": 0.7525733113288879, "rewards//std": 0.43941107392311096, "step": 109 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.1851118803024292, "advantage/mean": -0.0014121305430307984, "advantage/min": -1.520728349685669, "advantage/std": 0.30872422456741333, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.328125, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 432.234375, "completions/mean_terminated_length": 643.3255615234375, "completions/min_length": 0.0, "completions/min_terminated_length": 368.0, "epoch": 1.3086053412462908, "grad_norm": 0.458984375, "kl": NaN, "learning_rate": 6.634162434511939e-06, "loss": -0.1009, "num_tokens": 11946891.0, "rewards//mean": 0.690848708152771, "rewards//std": 0.46886134147644043, "step": 110 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.9072984457015991, "advantage/mean": -0.008659638464450836, "advantage/min": -1.9238336086273193, "advantage/std": 0.4230641722679138, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.21875, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 547.6875, "completions/mean_terminated_length": 701.0399780273438, "completions/min_length": 0.0, "completions/min_terminated_length": 359.0, "epoch": 1.3204747774480712, "grad_norm": 0.4375, "kl": NaN, "learning_rate": 6.438991641744531e-06, "loss": -0.1383, "num_tokens": 12035875.0, "rewards//mean": 0.7936596870422363, "rewards//std": 0.4103051424026489, "step": 111 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9188259840011597, "advantage/mean": 0.0033213591668754816, "advantage/min": -0.22531408071517944, "advantage/std": 0.1774940937757492, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.2421875, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 461.28125, "completions/mean_terminated_length": 608.7009887695312, "completions/min_length": 0.0, "completions/min_terminated_length": 341.0, "epoch": 1.3323442136498516, "grad_norm": 0.7890625, "kl": NaN, "learning_rate": 6.245362205760703e-06, "loss": -0.1197, "num_tokens": 12112143.0, "rewards//mean": 0.7544768452644348, "rewards//std": 0.43602651357650757, "step": 112 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9209489822387695, "advantage/mean": 0.0013821604661643505, "advantage/min": -1.1878427267074585, "advantage/std": 0.36598560214042664, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 477.5703125, "completions/mean_terminated_length": 686.8427124023438, "completions/min_length": 0.0, "completions/min_terminated_length": 358.0, "epoch": 1.344213649851632, "grad_norm": 1.234375, "kl": NaN, "learning_rate": 6.053357937665237e-06, "loss": -0.2179, "num_tokens": 12189344.0, "rewards//mean": 0.699212908744812, "rewards//std": 0.46472683548927307, "step": 113 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9212185144424438, "advantage/mean": -0.003442929359152913, "advantage/min": -1.9365557432174683, "advantage/std": 0.4007267653942108, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.5, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 351.875, "completions/mean_terminated_length": 703.75, "completions/min_length": 0.0, "completions/min_terminated_length": 407.0, "epoch": 1.3560830860534125, "grad_norm": 1.203125, "kl": NaN, "learning_rate": 5.863061945120719e-06, "loss": -0.2171, "num_tokens": 12259648.0, "rewards//mean": 0.5091111063957214, "rewards//std": 0.5084741711616516, "step": 114 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.517584204673767, "advantage/mean": -0.001994214951992035, "advantage/min": -1.9394543170928955, "advantage/std": 0.4252905547618866, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.5078125, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 319.71875, "completions/mean_terminated_length": 649.5873413085938, "completions/min_length": 0.0, "completions/min_terminated_length": 284.0, "epoch": 1.367952522255193, "grad_norm": 0.8125, "kl": NaN, "learning_rate": 5.674556596374993e-06, "loss": -0.2067, "num_tokens": 12325268.0, "rewards//mean": 0.5073824524879456, "rewards//std": 0.5102729201316833, "step": 115 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9323999881744385, "advantage/mean": 0.00012079346925020218, "advantage/min": -1.9256129264831543, "advantage/std": 0.37475845217704773, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.4296875, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 391.75, "completions/mean_terminated_length": 686.9041137695312, "completions/min_length": 0.0, "completions/min_terminated_length": 273.0, "epoch": 1.3798219584569733, "grad_norm": 1.3046875, "kl": NaN, "learning_rate": 5.487923484608629e-06, "loss": -0.2424, "num_tokens": 12399532.0, "rewards//mean": 0.5627472400665283, "rewards//std": 0.5057460069656372, "step": 116 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.9130436182022095, "advantage/mean": -0.0118994927033782, "advantage/min": -1.9188669919967651, "advantage/std": 0.45863908529281616, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.328125, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 448.140625, "completions/mean_terminated_length": 667.0, "completions/min_length": 0.0, "completions/min_terminated_length": 383.0, "epoch": 1.3916913946587537, "grad_norm": 0.4453125, "kl": NaN, "learning_rate": 5.3032433926179395e-06, "loss": -0.1343, "num_tokens": 12480790.0, "rewards//mean": 0.6595108509063721, "rewards//std": 0.480607807636261, "step": 117 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9190807342529297, "advantage/mean": -8.095055818557739e-05, "advantage/min": -1.9297515153884888, "advantage/std": 0.4942281246185303, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 432.6328125, "completions/mean_terminated_length": 700.9747314453125, "completions/min_length": 0.0, "completions/min_terminated_length": 443.0, "epoch": 1.403560830860534, "grad_norm": 1.515625, "kl": NaN, "learning_rate": 5.120596257848716e-06, "loss": -0.3544, "num_tokens": 12554311.0, "rewards//mean": 0.6272512674331665, "rewards//std": 0.49126946926116943, "step": 118 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.930235743522644, "advantage/mean": -0.00047219451516866684, "advantage/min": -1.9093363285064697, "advantage/std": 0.44188758730888367, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.4296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 386.15625, "completions/mean_terminated_length": 677.0958862304688, "completions/min_length": 0.0, "completions/min_terminated_length": 358.0, "epoch": 1.4154302670623147, "grad_norm": 1.7265625, "kl": NaN, "learning_rate": 4.940061137795876e-06, "loss": -0.3363, "num_tokens": 12626651.0, "rewards//mean": 0.5810023546218872, "rewards//std": 0.500396192073822, "step": 119 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9287009239196777, "advantage/mean": -0.0020257970318198204, "advantage/min": -1.9468318223953247, "advantage/std": 0.49218693375587463, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 544.4609375, "completions/mean_terminated_length": 801.0459594726562, "completions/min_length": 0.0, "completions/min_terminated_length": 454.0, "epoch": 1.427299703264095, "grad_norm": 0.68359375, "kl": NaN, "learning_rate": 4.7617161757839895e-06, "loss": -0.2186, "num_tokens": 12711518.0, "rewards//mean": 0.699238657951355, "rewards//std": 0.4646891951560974, "step": 120 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.930548071861267, "advantage/mean": 0.011293224059045315, "advantage/min": -1.929141879081726, "advantage/std": 0.6371313333511353, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.6484375, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 271.2265625, "completions/mean_terminated_length": 771.4888916015625, "completions/min_length": 0.0, "completions/min_terminated_length": 444.0, "epoch": 1.4391691394658754, "grad_norm": 2.171875, "kl": NaN, "learning_rate": 4.5856385671435285e-06, "loss": -0.7817, "num_tokens": 12762531.0, "rewards//mean": 0.35258346796035767, "rewards//std": 0.486828476190567, "step": 121 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.917569875717163, "advantage/mean": -0.006770431064069271, "advantage/min": -1.933653473854065, "advantage/std": 0.4526441693305969, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.4140625, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 420.2421875, "completions/mean_terminated_length": 717.21337890625, "completions/min_length": 0.0, "completions/min_terminated_length": 427.0, "epoch": 1.4510385756676558, "grad_norm": 0.99609375, "kl": NaN, "learning_rate": 4.411904525797408e-06, "loss": -0.1954, "num_tokens": 12832634.0, "rewards//mean": 0.5567037463188171, "rewards//std": 0.5046417713165283, "step": 122 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9111862182617188, "advantage/mean": -0.0014686351642012596, "advantage/min": -1.923195481300354, "advantage/std": 0.5580769777297974, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.5390625, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 329.859375, "completions/mean_terminated_length": 715.6271362304688, "completions/min_length": 0.0, "completions/min_terminated_length": 343.0, "epoch": 1.4629080118694362, "grad_norm": 1.390625, "kl": NaN, "learning_rate": 4.240589251272342e-06, "loss": -0.2905, "num_tokens": 12889200.0, "rewards//mean": 0.46842291951179504, "rewards//std": 0.5092353820800781, "step": 123 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.1872975826263428, "advantage/mean": -0.0027686082758009434, "advantage/min": -1.9116945266723633, "advantage/std": 0.5272735953330994, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.609375, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 294.1640625, "completions/mean_terminated_length": 753.0599975585938, "completions/min_length": 0.0, "completions/min_terminated_length": 456.0, "epoch": 1.4747774480712166, "grad_norm": 0.6484375, "kl": NaN, "learning_rate": 4.0717668961492725e-06, "loss": -0.1927, "num_tokens": 12943093.0, "rewards//mean": 0.4538976550102234, "rewards//std": 0.5069791674613953, "step": 124 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.517442226409912, "advantage/mean": 0.002009883988648653, "advantage/min": -1.930153250694275, "advantage/std": 0.7648005485534668, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.546875, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 351.2890625, "completions/mean_terminated_length": 775.2586059570312, "completions/min_length": 0.0, "completions/min_terminated_length": 461.0, "epoch": 1.486646884272997, "grad_norm": 1.6328125, "kl": NaN, "learning_rate": 3.905510533966959e-06, "loss": -0.5738, "num_tokens": 13002234.0, "rewards//mean": 0.5327528715133667, "rewards//std": 0.5072947144508362, "step": 125 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9298175573349, "advantage/mean": -0.0018671993166208267, "advantage/min": -1.921625018119812, "advantage/std": 0.5027943849563599, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.40625, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 425.0859375, "completions/mean_terminated_length": 715.9342041015625, "completions/min_length": 0.0, "completions/min_terminated_length": 399.0, "epoch": 1.4985163204747773, "grad_norm": 1.484375, "kl": NaN, "learning_rate": 3.7418921275926245e-06, "loss": -0.3583, "num_tokens": 13071213.0, "rewards//mean": 0.6043626666069031, "rewards//std": 0.49610209465026855, "step": 126 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9274595975875854, "advantage/mean": 0.016750790178775787, "advantage/min": -1.9122111797332764, "advantage/std": 0.5896896123886108, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.546875, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 341.2421875, "completions/mean_terminated_length": 753.086181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 450.0, "epoch": 1.5103857566765577, "grad_norm": 1.9296875, "kl": NaN, "learning_rate": 3.5809824980733445e-06, "loss": -0.5838, "num_tokens": 13128844.0, "rewards//mean": 0.48590797185897827, "rewards//std": 0.5081177353858948, "step": 127 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9181711673736572, "advantage/mean": -0.0008597676642239094, "advantage/min": -1.9336591958999634, "advantage/std": 0.5854839086532593, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 436.5546875, "completions/mean_terminated_length": 707.3291015625, "completions/min_length": 0.0, "completions/min_terminated_length": 347.0, "epoch": 1.5222551928783383, "grad_norm": 1.0390625, "kl": NaN, "learning_rate": 3.422851293981676e-06, "loss": -0.2994, "num_tokens": 13198155.0, "rewards//mean": 0.6356991529464722, "rewards//std": 0.48831233382225037, "step": 128 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9237265586853027, "advantage/mean": 0.008581508882343769, "advantage/min": -1.51692533493042, "advantage/std": 0.5177173614501953, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.4921875, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 347.5078125, "completions/mean_terminated_length": 684.3230590820312, "completions/min_length": 0.0, "completions/min_terminated_length": 356.0, "epoch": 1.5341246290801187, "grad_norm": 3.625, "kl": NaN, "learning_rate": 3.2675669612687565e-06, "loss": -0.4465, "num_tokens": 13259236.0, "rewards//mean": 0.5096697807312012, "rewards//std": 0.5079067945480347, "step": 129 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9265636205673218, "advantage/mean": 0.008637513034045696, "advantage/min": -1.9244918823242188, "advantage/std": 0.6367316246032715, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 455.984375, "completions/mean_terminated_length": 748.2820434570312, "completions/min_length": 0.0, "completions/min_terminated_length": 402.0, "epoch": 1.545994065281899, "grad_norm": 1.5703125, "kl": NaN, "learning_rate": 3.115196713638e-06, "loss": -0.5401, "num_tokens": 13331842.0, "rewards//mean": 0.6362784504890442, "rewards//std": 0.4875337481498718, "step": 130 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.5149365663528442, "advantage/mean": -0.008159506134688854, "advantage/min": -1.940788984298706, "advantage/std": 0.6248160600662231, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.4296875, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 424.234375, "completions/mean_terminated_length": 743.863037109375, "completions/min_length": 0.0, "completions/min_terminated_length": 380.0, "epoch": 1.5578635014836797, "grad_norm": 0.8515625, "kl": NaN, "learning_rate": 2.965806503452098e-06, "loss": -0.3634, "num_tokens": 13399568.0, "rewards//mean": 0.5797673463821411, "rewards//std": 0.5018887519836426, "step": 131 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.43188193440437317, "advantage/mean": -0.016634924337267876, "advantage/min": -1.9263575077056885, "advantage/std": 0.4129045605659485, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1171875, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 623.640625, "completions/mean_terminated_length": 706.4248046875, "completions/min_length": 0.0, "completions/min_terminated_length": 418.0, "epoch": 1.56973293768546, "grad_norm": 0.330078125, "kl": NaN, "learning_rate": 2.819460993186032e-06, "loss": -0.0477, "num_tokens": 13491794.0, "rewards//mean": 0.8886122703552246, "rewards//std": 0.31911179423332214, "step": 132 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9200443029403687, "advantage/mean": -0.011932725086808205, "advantage/min": -1.921685814857483, "advantage/std": 0.5687617063522339, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.28125, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 541.3984375, "completions/mean_terminated_length": 753.25, "completions/min_length": 0.0, "completions/min_terminated_length": 428.0, "epoch": 1.5816023738872405, "grad_norm": 0.7109375, "kl": NaN, "learning_rate": 2.6762235274383775e-06, "loss": -0.243, "num_tokens": 13574709.0, "rewards//mean": 0.7469240427017212, "rewards//std": 0.4400651156902313, "step": 133 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.5213267803192139, "advantage/mean": -0.003202416468411684, "advantage/min": -1.9273756742477417, "advantage/std": 0.614234983921051, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 517.7578125, "completions/mean_terminated_length": 736.36669921875, "completions/min_length": 0.0, "completions/min_terminated_length": 440.0, "epoch": 1.5934718100890208, "grad_norm": 1.5625, "kl": 0.036314057069830596, "learning_rate": 2.5361561055130625e-06, "loss": -0.4678, "num_tokens": 13653710.0, "rewards//mean": 0.7460275888442993, "rewards//std": 0.4416311979293823, "step": 134 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.5160837173461914, "advantage/mean": -0.0012700525112450123, "advantage/min": -1.9151767492294312, "advantage/std": 0.6045173406600952, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.3984375, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 467.28125, "completions/mean_terminated_length": 776.7792358398438, "completions/min_length": 0.0, "completions/min_terminated_length": 417.0, "epoch": 1.6053412462908012, "grad_norm": 1.296875, "kl": NaN, "learning_rate": 2.3993193545834182e-06, "loss": -0.4441, "num_tokens": 13728250.0, "rewards//mean": 0.6352561712265015, "rewards//std": 0.4889114499092102, "step": 135 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.5056098699569702, "advantage/mean": -0.008613115176558495, "advantage/min": -1.9262280464172363, "advantage/std": 0.5049684047698975, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.328125, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 449.8984375, "completions/mean_terminated_length": 669.6162719726562, "completions/min_length": 0.0, "completions/min_terminated_length": 373.0, "epoch": 1.6172106824925816, "grad_norm": 0.765625, "kl": NaN, "learning_rate": 2.265772503450122e-06, "loss": -0.2458, "num_tokens": 13800061.0, "rewards//mean": 0.6753218770027161, "rewards//std": 0.47482624650001526, "step": 136 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9252156019210815, "advantage/mean": -0.006081531755626202, "advantage/min": -1.9355872869491577, "advantage/std": 0.6520951986312866, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.4375, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 406.21875, "completions/mean_terminated_length": 722.1666870117188, "completions/min_length": 0.0, "completions/min_terminated_length": 348.0, "epoch": 1.629080118694362, "grad_norm": 1.453125, "kl": NaN, "learning_rate": 2.1355733569044633e-06, "loss": -0.5079, "num_tokens": 13865777.0, "rewards//mean": 0.6048412322998047, "rewards//std": 0.4955025315284729, "step": 137 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.6603893637657166, "advantage/mean": -0.015362613834440708, "advantage/min": -1.927195429801941, "advantage/std": 0.4361397624015808, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1796875, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 550.1796875, "completions/mean_terminated_length": 670.6952514648438, "completions/min_length": 0.0, "completions/min_terminated_length": 366.0, "epoch": 1.6409495548961424, "grad_norm": 0.263671875, "kl": NaN, "learning_rate": 2.008778270707944e-06, "loss": -0.081, "num_tokens": 13952608.0, "rewards//mean": 0.8092094659805298, "rewards//std": 0.39872902631759644, "step": 138 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.191937804222107, "advantage/mean": -0.004746164195239544, "advantage/min": -1.928928017616272, "advantage/std": 0.3572968542575836, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 383.6875, "completions/mean_terminated_length": 506.30926513671875, "completions/min_length": 0.0, "completions/min_terminated_length": 239.0, "epoch": 1.6528189910979227, "grad_norm": 0.5546875, "kl": NaN, "learning_rate": 1.8854421271990964e-06, "loss": -0.093, "num_tokens": 14018752.0, "rewards//mean": 0.7453782558441162, "rewards//std": 0.44275522232055664, "step": 139 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.510254144668579, "advantage/mean": -0.004046538844704628, "advantage/min": -1.5299463272094727, "advantage/std": 0.4380718469619751, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 489.0625, "completions/mean_terminated_length": 535.0427856445312, "completions/min_length": 0.0, "completions/min_terminated_length": 330.0, "epoch": 1.6646884272997031, "grad_norm": 0.95703125, "kl": 0.03683288465254009, "learning_rate": 1.7656183115380577e-06, "loss": -0.1419, "num_tokens": 14096584.0, "rewards//mean": 0.8887147307395935, "rewards//std": 0.31882667541503906, "step": 140 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.1849887371063232, "advantage/mean": -0.004679612349718809, "advantage/min": -1.9216474294662476, "advantage/std": 0.4064004123210907, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 511.734375, "completions/mean_terminated_length": 555.1016845703125, "completions/min_length": 0.0, "completions/min_terminated_length": 253.0, "epoch": 1.6765578635014837, "grad_norm": 0.62890625, "kl": 0.030216948012821376, "learning_rate": 1.6493586885991908e-06, "loss": -0.1395, "num_tokens": 14175462.0, "rewards//mean": 0.9131076335906982, "rewards//std": 0.28450077772140503, "step": 141 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.6602617502212524, "advantage/mean": -0.008524436503648758, "advantage/min": -1.9139026403427124, "advantage/std": 0.33574238419532776, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 481.90625, "completions/mean_terminated_length": 497.45159912109375, "completions/min_length": 0.0, "completions/min_terminated_length": 258.0, "epoch": 1.688427299703264, "grad_norm": 0.33984375, "kl": 0.033639343455433846, "learning_rate": 1.536713580521746e-06, "loss": -0.0497, "num_tokens": 14249642.0, "rewards//mean": 0.9606139659881592, "rewards//std": 0.19611631333827972, "step": 142 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.21360832452774048, "advantage/mean": -0.0066238404251635075, "advantage/min": -1.92234206199646, "advantage/std": 0.2506479322910309, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.140625, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 423.6015625, "completions/mean_terminated_length": 492.91815185546875, "completions/min_length": 0.0, "completions/min_terminated_length": 247.0, "epoch": 1.7002967359050445, "grad_norm": 0.0859375, "kl": NaN, "learning_rate": 1.4277317449282834e-06, "loss": -0.0263, "num_tokens": 14320559.0, "rewards//mean": 0.857833743095398, "rewards//std": 0.3528285622596741, "step": 143 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.184670329093933, "advantage/mean": -0.004762329161167145, "advantage/min": -1.935533881187439, "advantage/std": 0.33737996220588684, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 500.84375, "completions/mean_terminated_length": 525.4754028320312, "completions/min_length": 0.0, "completions/min_terminated_length": 286.0, "epoch": 1.712166172106825, "grad_norm": 0.43359375, "kl": 0.04113235871773213, "learning_rate": 1.3224603538202929e-06, "loss": -0.0835, "num_tokens": 14399243.0, "rewards//mean": 0.9446145296096802, "rewards//std": 0.23117810487747192, "step": 144 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.9115270376205444, "advantage/mean": -0.003408966585993767, "advantage/min": -1.93901789188385, "advantage/std": 0.2906622588634491, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 481.109375, "completions/mean_terminated_length": 535.49560546875, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 1.7240356083086055, "grad_norm": 0.306640625, "kl": 0.045802103355526924, "learning_rate": 1.220944973160133e-06, "loss": -0.0697, "num_tokens": 14478265.0, "rewards//mean": 0.8966078758239746, "rewards//std": 0.3087267577648163, "step": 145 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9281952381134033, "advantage/mean": 0.006729425396770239, "advantage/min": -0.45143717527389526, "advantage/std": 0.27702268958091736, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1484375, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 461.4375, "completions/mean_terminated_length": 541.8715209960938, "completions/min_length": 0.0, "completions/min_terminated_length": 282.0, "epoch": 1.7359050445103859, "grad_norm": 0.240234375, "kl": NaN, "learning_rate": 1.1232295431481222e-06, "loss": -0.0139, "num_tokens": 14553449.0, "rewards//mean": 0.832952082157135, "rewards//std": 0.3785593509674072, "step": 146 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.4327715039253235, "advantage/mean": -0.006765198893845081, "advantage/min": -1.937062382698059, "advantage/std": 0.27742907404899597, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 571.734375, "completions/mean_terminated_length": 585.4560546875, "completions/min_length": 0.0, "completions/min_terminated_length": 330.0, "epoch": 1.7477744807121662, "grad_norm": 0.1337890625, "kl": 0.033991911564953625, "learning_rate": 1.0293563592033595e-06, "loss": -0.0401, "num_tokens": 14639831.0, "rewards//mean": 0.9761366844177246, "rewards//std": 0.15464277565479279, "step": 147 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.21408027410507202, "advantage/mean": -0.009968165308237076, "advantage/min": -1.9296997785568237, "advantage/std": 0.30730822682380676, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.2109375, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 472.7734375, "completions/mean_terminated_length": 599.1583862304688, "completions/min_length": 0.0, "completions/min_terminated_length": 297.0, "epoch": 1.7596439169139466, "grad_norm": 0.10400390625, "kl": NaN, "learning_rate": 9.393660536564408e-07, "loss": -0.0397, "num_tokens": 14719362.0, "rewards//mean": 0.7863461375236511, "rewards//std": 0.4148584008216858, "step": 148 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.43295615911483765, "advantage/mean": -0.0066620223224163055, "advantage/min": -1.911831259727478, "advantage/std": 0.27607157826423645, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 544.515625, "completions/mean_terminated_length": 557.5840454101562, "completions/min_length": 0.0, "completions/min_terminated_length": 292.0, "epoch": 1.771513353115727, "grad_norm": 0.130859375, "kl": 0.03622849879320711, "learning_rate": 8.532975781620511e-07, "loss": -0.04, "num_tokens": 14802332.0, "rewards//mean": 0.9762489199638367, "rewards//std": 0.1539168804883957, "step": 149 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.5160192251205444, "advantage/mean": -0.0032945233397185802, "advantage/min": -1.9176921844482422, "advantage/std": 0.3483807146549225, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 585.75, "completions/mean_terminated_length": 635.3898315429688, "completions/min_length": 0.0, "completions/min_terminated_length": 375.0, "epoch": 1.7833827893175074, "grad_norm": 0.578125, "kl": 0.035929113859310746, "learning_rate": 7.711881868390292e-07, "loss": -0.1348, "num_tokens": 14890124.0, "rewards//mean": 0.9285244941711426, "rewards//std": 0.2609255909919739, "step": 150 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.4329414665699005, "advantage/mean": -0.006688450463116169, "advantage/min": -1.9181170463562012, "advantage/std": 0.2764665484428406, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.265625, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 436.453125, "completions/mean_terminated_length": 594.3191528320312, "completions/min_length": 0.0, "completions/min_terminated_length": 292.0, "epoch": 1.7952522255192878, "grad_norm": 0.267578125, "kl": NaN, "learning_rate": 6.930734201451817e-07, "loss": -0.0248, "num_tokens": 14966814.0, "rewards//mean": 0.7219593524932861, "rewards//std": 0.45501524209976196, "step": 151 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9238147735595703, "advantage/mean": -0.010101165622472763, "advantage/min": -1.9405475854873657, "advantage/std": 0.4145474135875702, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.2109375, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 400.640625, "completions/mean_terminated_length": 507.7425537109375, "completions/min_length": 0.0, "completions/min_terminated_length": 256.0, "epoch": 1.8071216617210681, "grad_norm": 0.98828125, "kl": NaN, "learning_rate": 6.189870894938587e-07, "loss": -0.1643, "num_tokens": 15035704.0, "rewards//mean": 0.7770117521286011, "rewards//std": 0.42307209968566895, "step": 152 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.1940398216247559, "advantage/mean": -0.0013529672287404537, "advantage/min": -1.511359453201294, "advantage/std": 0.30959567427635193, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 563.0859375, "completions/mean_terminated_length": 585.9755859375, "completions/min_length": 0.0, "completions/min_terminated_length": 291.0, "epoch": 1.8189910979228485, "grad_norm": 0.390625, "kl": 0.03943067276850343, "learning_rate": 5.489612626189245e-07, "loss": -0.0539, "num_tokens": 15121051.0, "rewards//mean": 0.9441364407539368, "rewards//std": 0.23318061232566833, "step": 153 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.5175542831420898, "advantage/mean": -0.0018498229328542948, "advantage/min": -1.9130256175994873, "advantage/std": 0.3559645414352417, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1796875, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 488.4140625, "completions/mean_terminated_length": 595.4000244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 361.0, "epoch": 1.8308605341246291, "grad_norm": 0.57421875, "kl": NaN, "learning_rate": 4.830262496944693e-07, "loss": -0.0694, "num_tokens": 15200952.0, "rewards//mean": 0.7931093573570251, "rewards//std": 0.41140785813331604, "step": 154 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.43180593848228455, "advantage/mean": -0.013412997126579285, "advantage/min": -1.9487814903259277, "advantage/std": 0.3739129304885864, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 534.5390625, "completions/mean_terminated_length": 551.7822265625, "completions/min_length": 0.0, "completions/min_terminated_length": 258.0, "epoch": 1.8427299703264095, "grad_norm": 0.275390625, "kl": 0.03880139789544046, "learning_rate": 4.21210590215273e-07, "loss": -0.0508, "num_tokens": 15284765.0, "rewards//mean": 0.9603752493858337, "rewards//std": 0.19731158018112183, "step": 155 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.6636139154434204, "advantage/mean": -0.00197417801246047, "advantage/min": -1.1930924654006958, "advantage/std": 0.22539697587490082, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0859375, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 567.4140625, "completions/mean_terminated_length": 620.7607421875, "completions/min_length": 0.0, "completions/min_terminated_length": 283.0, "epoch": 1.8545994065281899, "grad_norm": 0.1875, "kl": NaN, "learning_rate": 3.635410406436857e-07, "loss": -0.0412, "num_tokens": 15370994.0, "rewards//mean": 0.8494309186935425, "rewards//std": 0.3620583713054657, "step": 156 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.5063209533691406, "advantage/mean": -0.008625751361250877, "advantage/min": -1.9249359369277954, "advantage/std": 0.45040178298950195, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1484375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 481.1328125, "completions/mean_terminated_length": 565.0, "completions/min_length": 0.0, "completions/min_terminated_length": 336.0, "epoch": 1.8664688427299705, "grad_norm": 1.90625, "kl": NaN, "learning_rate": 3.100425628282899e-07, "loss": -0.146, "num_tokens": 15453427.0, "rewards//mean": 0.8336179256439209, "rewards//std": 0.377052903175354, "step": 157 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 0.21544617414474487, "advantage/mean": -0.010093946941196918, "advantage/min": -1.9512163400650024, "advantage/std": 0.30881720781326294, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1328125, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 475.921875, "completions/mean_terminated_length": 548.8108520507812, "completions/min_length": 0.0, "completions/min_terminated_length": 250.0, "epoch": 1.8783382789317509, "grad_norm": 0.201171875, "kl": NaN, "learning_rate": 2.607383131993424e-07, "loss": -0.0267, "num_tokens": 15530473.0, "rewards//mean": 0.8492758274078369, "rewards//std": 0.3624357283115387, "step": 158 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9234334230422974, "advantage/mean": 0.01999136619269848, "advantage/min": -1.1871734857559204, "advantage/std": 0.6579956412315369, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.6953125, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 231.125, "completions/mean_terminated_length": 758.5640869140625, "completions/min_length": 0.0, "completions/min_terminated_length": 516.0, "epoch": 1.8902077151335313, "grad_norm": 1.921875, "kl": NaN, "learning_rate": 2.1564963274568028e-07, "loss": -0.5375, "num_tokens": 15578097.0, "rewards//mean": 0.37501630187034607, "rewards//std": 0.4941561222076416, "step": 159 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9284110069274902, "advantage/mean": -0.001977311447262764, "advantage/min": -1.5255322456359863, "advantage/std": 0.3568601608276367, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.6484375, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 255.625, "completions/mean_terminated_length": 727.1111450195312, "completions/min_length": 0.0, "completions/min_terminated_length": 382.0, "epoch": 1.9020771513353116, "grad_norm": 0.2119140625, "kl": NaN, "learning_rate": 1.7479603777742937e-07, "loss": -0.0679, "num_tokens": 15625785.0, "rewards//mean": 0.3347344398498535, "rewards//std": 0.48339539766311646, "step": 160 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9180108308792114, "advantage/mean": -2.1150190150365233e-05, "advantage/min": -1.922570824623108, "advantage/std": 0.3901103436946869, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.4375, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 400.0625, "completions/mean_terminated_length": 711.2222290039062, "completions/min_length": 0.0, "completions/min_terminated_length": 306.0, "epoch": 1.913946587537092, "grad_norm": 1.0703125, "kl": NaN, "learning_rate": 1.3819521147851122e-07, "loss": -0.2541, "num_tokens": 15692793.0, "rewards//mean": 0.5556085705757141, "rewards//std": 0.505899965763092, "step": 161 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.18680739402771, "advantage/mean": -0.006663821171969175, "advantage/min": -1.9252969026565552, "advantage/std": 0.5181385278701782, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.328125, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 461.046875, "completions/mean_terminated_length": 686.2092895507812, "completions/min_length": 0.0, "completions/min_terminated_length": 291.0, "epoch": 1.9258160237388724, "grad_norm": 0.78515625, "kl": NaN, "learning_rate": 1.0586299625259699e-07, "loss": -0.1893, "num_tokens": 15769607.0, "rewards//mean": 0.6662224531173706, "rewards//std": 0.4795023798942566, "step": 162 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.5068639516830444, "advantage/mean": -0.013799908570945263, "advantage/min": -1.9225540161132812, "advantage/std": 0.5686584711074829, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.34375, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 494.484375, "completions/mean_terminated_length": 753.5, "completions/min_length": 0.0, "completions/min_terminated_length": 411.0, "epoch": 1.9376854599406528, "grad_norm": 0.921875, "kl": NaN, "learning_rate": 7.781338686584928e-08, "loss": -0.2712, "num_tokens": 15850349.0, "rewards//mean": 0.6513202786445618, "rewards//std": 0.4836761951446533, "step": 163 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.9118238687515259, "advantage/mean": -0.0020243192557245493, "advantage/min": -1.52021324634552, "advantage/std": 0.47649678587913513, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.4140625, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 408.46875, "completions/mean_terminated_length": 697.1199951171875, "completions/min_length": 0.0, "completions/min_terminated_length": 435.0, "epoch": 1.9495548961424332, "grad_norm": 1.046875, "kl": NaN, "learning_rate": 5.405852438937764e-08, "loss": -0.3028, "num_tokens": 15917585.0, "rewards//mean": 0.5886790156364441, "rewards//std": 0.4992271661758423, "step": 164 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.929634928703308, "advantage/mean": -0.004954553674906492, "advantage/min": -1.9497406482696533, "advantage/std": 0.667103111743927, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.625, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 314.578125, "completions/mean_terminated_length": 838.875, "completions/min_length": 0.0, "completions/min_terminated_length": 601.0, "epoch": 1.9614243323442135, "grad_norm": 1.4609375, "kl": NaN, "learning_rate": 3.460869094407127e-08, "loss": -0.6181, "num_tokens": 15973155.0, "rewards//mean": 0.4300598204135895, "rewards//std": 0.5046306848526001, "step": 165 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.187256097793579, "advantage/mean": -0.004739911761134863, "advantage/min": -1.919519066810608, "advantage/std": 0.5733982920646667, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.421875, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 413.28125, "completions/mean_terminated_length": 714.8648681640625, "completions/min_length": 0.0, "completions/min_terminated_length": 375.0, "epoch": 1.973293768545994, "grad_norm": 0.8203125, "kl": NaN, "learning_rate": 1.947230525005006e-08, "loss": -0.2885, "num_tokens": 16043895.0, "rewards//mean": 0.5547634363174438, "rewards//std": 0.506853461265564, "step": 166 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.50879967212677, "advantage/mean": -0.0034448225051164627, "advantage/min": -1.9508203268051147, "advantage/std": 0.4751599431037903, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.375, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 400.4375, "completions/mean_terminated_length": 640.7000122070312, "completions/min_length": 0.0, "completions/min_terminated_length": 298.0, "epoch": 1.9851632047477745, "grad_norm": 0.6328125, "kl": NaN, "learning_rate": 8.655918982689582e-09, "loss": -0.21, "num_tokens": 16112703.0, "rewards//mean": 0.611525297164917, "rewards//std": 0.49521562457084656, "step": 167 }, { "advantage/logodds_epsilon": 0.001, "advantage/max": 1.947137713432312, "advantage/mean": -0.05683013051748276, "advantage/min": -1.9427815675735474, "advantage/std": 0.6950681209564209, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.3529411764705882, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 503.1323547363281, "completions/mean_terminated_length": 777.5681762695312, "completions/min_length": 0.0, "completions/min_terminated_length": 486.0, "epoch": 1.997032640949555, "grad_norm": 1.71875, "kl": NaN, "learning_rate": 2.164213936770576e-09, "loss": -0.3532, "num_tokens": 16184585.0, "rewards//mean": 0.5477771759033203, "rewards//std": 0.5067213177680969, "step": 168 }, { "epoch": 1.997032640949555, "step": 168, "total_flos": 0.0, "train_loss": -0.09854654141236097, "train_runtime": 3102.2738, "train_samples_per_second": 0.869, "train_steps_per_second": 0.054 } ], "logging_steps": 1, "max_steps": 168, "num_input_tokens_seen": 16184585, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }