4748 lines
174 KiB
JSON
4748 lines
174 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.997032640949555,
|
|
"eval_steps": 500,
|
|
"global_step": 168,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"advantage/logodds_epsilon": 1e-06,
|
|
"advantage/max": 2.7076480388641357,
|
|
"advantage/mean": -0.0003062831237912178,
|
|
"advantage/min": -2.7068095207214355,
|
|
"advantage/std": 1.0714755058288574,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1013.0,
|
|
"completions/mean_length": 910.609375,
|
|
"completions/mean_terminated_length": 721.625,
|
|
"completions/min_length": 409.0,
|
|
"completions/min_terminated_length": 409.0,
|
|
"epoch": 0.011869436201780416,
|
|
"grad_norm": 1.140625,
|
|
"kl": 0.0,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.0339,
|
|
"num_tokens": 133270.0,
|
|
"rewards/<lambda>/mean": 0.31050193309783936,
|
|
"rewards/<lambda>/std": 0.4752292037010193,
|
|
"step": 1
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 1.2892857142857144e-05,
|
|
"advantage/max": 2.707470655441284,
|
|
"advantage/mean": -0.021119937300682068,
|
|
"advantage/min": -2.7074711322784424,
|
|
"advantage/std": 1.1238324642181396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.6015625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1017.0,
|
|
"completions/mean_length": 907.1328125,
|
|
"completions/mean_terminated_length": 730.686279296875,
|
|
"completions/min_length": 458.0,
|
|
"completions/min_terminated_length": 458.0,
|
|
"epoch": 0.02373887240356083,
|
|
"grad_norm": 1.015625,
|
|
"kl": 0.0,
|
|
"learning_rate": 1.1764705882352942e-06,
|
|
"loss": 0.0521,
|
|
"num_tokens": 266647.0,
|
|
"rewards/<lambda>/mean": 0.34321528673171997,
|
|
"rewards/<lambda>/std": 0.4855186939239502,
|
|
"step": 2
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 2.4785714285714287e-05,
|
|
"advantage/max": 2.7072935104370117,
|
|
"advantage/mean": 0.01818234473466873,
|
|
"advantage/min": -2.707293748855591,
|
|
"advantage/std": 0.7201385498046875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.3046875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1008.0,
|
|
"completions/mean_length": 785.921875,
|
|
"completions/mean_terminated_length": 681.5955200195312,
|
|
"completions/min_length": 343.0,
|
|
"completions/min_terminated_length": 343.0,
|
|
"epoch": 0.03560830860534125,
|
|
"grad_norm": 0.69140625,
|
|
"kl": 0.0008829391736071557,
|
|
"learning_rate": 2.3529411764705885e-06,
|
|
"loss": -0.001,
|
|
"num_tokens": 382637.0,
|
|
"rewards/<lambda>/mean": 0.6515579223632812,
|
|
"rewards/<lambda>/std": 0.4833427369594574,
|
|
"step": 3
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 3.667857142857143e-05,
|
|
"advantage/max": 2.70711612701416,
|
|
"advantage/mean": 0.018909499049186707,
|
|
"advantage/min": -2.70711612701416,
|
|
"advantage/std": 1.1025551557540894,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.40625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1018.0,
|
|
"completions/mean_length": 864.203125,
|
|
"completions/mean_terminated_length": 754.868408203125,
|
|
"completions/min_length": 481.0,
|
|
"completions/min_terminated_length": 481.0,
|
|
"epoch": 0.04747774480712166,
|
|
"grad_norm": 1.1484375,
|
|
"kl": 0.0009217608167091385,
|
|
"learning_rate": 3.529411764705883e-06,
|
|
"loss": -0.0083,
|
|
"num_tokens": 506791.0,
|
|
"rewards/<lambda>/mean": 0.3201034367084503,
|
|
"rewards/<lambda>/std": 0.47702425718307495,
|
|
"step": 4
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 4.857142857142857e-05,
|
|
"advantage/max": 2.7069385051727295,
|
|
"advantage/mean": 0.02930786833167076,
|
|
"advantage/min": -2.700977325439453,
|
|
"advantage/std": 1.028095006942749,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.5625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1009.0,
|
|
"completions/mean_length": 873.8125,
|
|
"completions/mean_terminated_length": 680.7142944335938,
|
|
"completions/min_length": 308.0,
|
|
"completions/min_terminated_length": 308.0,
|
|
"epoch": 0.05934718100890208,
|
|
"grad_norm": 1.0546875,
|
|
"kl": 0.0010906819879892282,
|
|
"learning_rate": 4.705882352941177e-06,
|
|
"loss": -0.0306,
|
|
"num_tokens": 637655.0,
|
|
"rewards/<lambda>/mean": 0.3020392656326294,
|
|
"rewards/<lambda>/std": 0.47242945432662964,
|
|
"step": 5
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 6.0464285714285715e-05,
|
|
"advantage/max": 2.68178391456604,
|
|
"advantage/mean": 0.006097273901104927,
|
|
"advantage/min": -2.706761360168457,
|
|
"advantage/std": 0.9648444056510925,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.4609375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1018.0,
|
|
"completions/mean_length": 827.3125,
|
|
"completions/mean_terminated_length": 659.1304321289062,
|
|
"completions/min_length": 375.0,
|
|
"completions/min_terminated_length": 375.0,
|
|
"epoch": 0.0712166172106825,
|
|
"grad_norm": 0.90234375,
|
|
"kl": 0.0010743636812549084,
|
|
"learning_rate": 5.882352941176471e-06,
|
|
"loss": 0.007,
|
|
"num_tokens": 760639.0,
|
|
"rewards/<lambda>/mean": 0.492506206035614,
|
|
"rewards/<lambda>/std": 0.509504497051239,
|
|
"step": 6
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 7.235714285714286e-05,
|
|
"advantage/max": 2.7065823078155518,
|
|
"advantage/mean": -0.03362283855676651,
|
|
"advantage/min": -2.7065842151641846,
|
|
"advantage/std": 1.066990613937378,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.4765625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1005.0,
|
|
"completions/mean_length": 822.859375,
|
|
"completions/mean_terminated_length": 639.7313232421875,
|
|
"completions/min_length": 358.0,
|
|
"completions/min_terminated_length": 358.0,
|
|
"epoch": 0.0830860534124629,
|
|
"grad_norm": 1.0546875,
|
|
"kl": 0.0011787074290623423,
|
|
"learning_rate": 7.058823529411766e-06,
|
|
"loss": 0.0606,
|
|
"num_tokens": 884397.0,
|
|
"rewards/<lambda>/mean": 0.3580586016178131,
|
|
"rewards/<lambda>/std": 0.49095040559768677,
|
|
"step": 7
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 8.425e-05,
|
|
"advantage/max": 2.706406354904175,
|
|
"advantage/mean": 0.0027046892791986465,
|
|
"advantage/min": -2.6997547149658203,
|
|
"advantage/std": 1.0536103248596191,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.515625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1014.0,
|
|
"completions/mean_length": 928.8671875,
|
|
"completions/mean_terminated_length": 827.5967407226562,
|
|
"completions/min_length": 461.0,
|
|
"completions/min_terminated_length": 461.0,
|
|
"epoch": 0.09495548961424333,
|
|
"grad_norm": 0.95703125,
|
|
"kl": 0.0014505159560940228,
|
|
"learning_rate": 8.23529411764706e-06,
|
|
"loss": 0.0164,
|
|
"num_tokens": 1024724.0,
|
|
"rewards/<lambda>/mean": 0.28668415546417236,
|
|
"rewards/<lambda>/std": 0.46535518765449524,
|
|
"step": 8
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 9.614285714285714e-05,
|
|
"advantage/max": 2.7062294483184814,
|
|
"advantage/mean": 0.04780469834804535,
|
|
"advantage/min": -2.5455925464630127,
|
|
"advantage/std": 0.9474127292633057,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.34375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 986.0,
|
|
"completions/mean_length": 795.640625,
|
|
"completions/mean_terminated_length": 676.0238037109375,
|
|
"completions/min_length": 378.0,
|
|
"completions/min_terminated_length": 378.0,
|
|
"epoch": 0.10682492581602374,
|
|
"grad_norm": 0.9921875,
|
|
"kl": 0.002061821702227462,
|
|
"learning_rate": 9.411764705882354e-06,
|
|
"loss": -0.0054,
|
|
"num_tokens": 1140126.0,
|
|
"rewards/<lambda>/mean": 0.3750962018966675,
|
|
"rewards/<lambda>/std": 0.4941062927246094,
|
|
"step": 9
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0001080357142857143,
|
|
"advantage/max": 2.062196731567383,
|
|
"advantage/mean": -0.00547090545296669,
|
|
"advantage/min": -2.706052303314209,
|
|
"advantage/std": 0.9570024013519287,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.3203125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1019.0,
|
|
"completions/mean_length": 804.234375,
|
|
"completions/mean_terminated_length": 700.6666870117188,
|
|
"completions/min_length": 351.0,
|
|
"completions/min_terminated_length": 351.0,
|
|
"epoch": 0.11869436201780416,
|
|
"grad_norm": 0.90625,
|
|
"kl": 0.002494313448551111,
|
|
"learning_rate": 1.0588235294117648e-05,
|
|
"loss": 0.0128,
|
|
"num_tokens": 1257956.0,
|
|
"rewards/<lambda>/mean": 0.5335986614227295,
|
|
"rewards/<lambda>/std": 0.5063714385032654,
|
|
"step": 10
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.00011992857142857143,
|
|
"advantage/max": 2.705874443054199,
|
|
"advantage/mean": -0.017507879063487053,
|
|
"advantage/min": -2.7058751583099365,
|
|
"advantage/std": 1.0383834838867188,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.46875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 941.0,
|
|
"completions/mean_length": 818.03125,
|
|
"completions/mean_terminated_length": 636.2941284179688,
|
|
"completions/min_length": 281.0,
|
|
"completions/min_terminated_length": 281.0,
|
|
"epoch": 0.13056379821958458,
|
|
"grad_norm": 0.9921875,
|
|
"kl": 0.00399306406325195,
|
|
"learning_rate": 1.1764705882352942e-05,
|
|
"loss": 0.0496,
|
|
"num_tokens": 1379048.0,
|
|
"rewards/<lambda>/mean": 0.3989332616329193,
|
|
"rewards/<lambda>/std": 0.49916210770606995,
|
|
"step": 11
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0001318214285714286,
|
|
"advantage/max": 2.705698013305664,
|
|
"advantage/mean": 0.00386135745793581,
|
|
"advantage/min": -2.705698013305664,
|
|
"advantage/std": 0.9110799431800842,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.4765625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1010.0,
|
|
"completions/mean_length": 828.6953125,
|
|
"completions/mean_terminated_length": 650.8805541992188,
|
|
"completions/min_length": 347.0,
|
|
"completions/min_terminated_length": 347.0,
|
|
"epoch": 0.142433234421365,
|
|
"grad_norm": 0.8125,
|
|
"kl": 0.005920416064327583,
|
|
"learning_rate": 1.2941176470588238e-05,
|
|
"loss": 0.0131,
|
|
"num_tokens": 1504145.0,
|
|
"rewards/<lambda>/mean": 0.5091220140457153,
|
|
"rewards/<lambda>/std": 0.5084678530693054,
|
|
"step": 12
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.00014371428571428573,
|
|
"advantage/max": 2.2751290798187256,
|
|
"advantage/mean": -0.02284279279410839,
|
|
"advantage/min": -2.7055208683013916,
|
|
"advantage/std": 1.0446645021438599,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.40625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1000.0,
|
|
"completions/mean_length": 811.8203125,
|
|
"completions/mean_terminated_length": 666.6447143554688,
|
|
"completions/min_length": 374.0,
|
|
"completions/min_terminated_length": 374.0,
|
|
"epoch": 0.1543026706231454,
|
|
"grad_norm": 2.875,
|
|
"kl": 0.01130726927658543,
|
|
"learning_rate": 1.4117647058823532e-05,
|
|
"loss": 0.0248,
|
|
"num_tokens": 1623906.0,
|
|
"rewards/<lambda>/mean": 0.4440869688987732,
|
|
"rewards/<lambda>/std": 0.5080564618110657,
|
|
"step": 13
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.00015560714285714288,
|
|
"advantage/max": 2.705343723297119,
|
|
"advantage/mean": -0.011263464577496052,
|
|
"advantage/min": -2.705343723297119,
|
|
"advantage/std": 0.894480288028717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.3359375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 970.0,
|
|
"completions/mean_length": 724.546875,
|
|
"completions/mean_terminated_length": 573.058837890625,
|
|
"completions/min_length": 320.0,
|
|
"completions/min_terminated_length": 320.0,
|
|
"epoch": 0.1661721068249258,
|
|
"grad_norm": 0.76953125,
|
|
"kl": 0.012325245566898957,
|
|
"learning_rate": 1.5294117647058822e-05,
|
|
"loss": 0.0273,
|
|
"num_tokens": 1731216.0,
|
|
"rewards/<lambda>/mean": 0.5954668521881104,
|
|
"rewards/<lambda>/std": 0.49903228878974915,
|
|
"step": 14
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0001675,
|
|
"advantage/max": 2.7051665782928467,
|
|
"advantage/mean": -0.022828437387943268,
|
|
"advantage/min": -2.705166816711426,
|
|
"advantage/std": 0.8195313215255737,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.2265625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1018.0,
|
|
"completions/mean_length": 746.6328125,
|
|
"completions/mean_terminated_length": 665.3838500976562,
|
|
"completions/min_length": 346.0,
|
|
"completions/min_terminated_length": 346.0,
|
|
"epoch": 0.17804154302670624,
|
|
"grad_norm": 0.87109375,
|
|
"kl": 0.013961711403680965,
|
|
"learning_rate": 1.647058823529412e-05,
|
|
"loss": 0.0358,
|
|
"num_tokens": 1841201.0,
|
|
"rewards/<lambda>/mean": 0.7065119743347168,
|
|
"rewards/<lambda>/std": 0.46209007501602173,
|
|
"step": 15
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.00017939285714285716,
|
|
"advantage/max": 2.704983711242676,
|
|
"advantage/mean": 0.0014846273697912693,
|
|
"advantage/min": -2.1927168369293213,
|
|
"advantage/std": 0.8186325430870056,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.265625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 966.0,
|
|
"completions/mean_length": 672.0,
|
|
"completions/mean_terminated_length": 544.6808471679688,
|
|
"completions/min_length": 283.0,
|
|
"completions/min_terminated_length": 283.0,
|
|
"epoch": 0.18991097922848665,
|
|
"grad_norm": 0.91015625,
|
|
"kl": 0.01810563838807866,
|
|
"learning_rate": 1.7647058823529414e-05,
|
|
"loss": -0.014,
|
|
"num_tokens": 1942193.0,
|
|
"rewards/<lambda>/mean": 0.5964069962501526,
|
|
"rewards/<lambda>/std": 0.4978735148906708,
|
|
"step": 16
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.00019128571428571428,
|
|
"advantage/max": 2.7048118114471436,
|
|
"advantage/mean": -0.02652079612016678,
|
|
"advantage/min": -2.70481276512146,
|
|
"advantage/std": 0.8032910227775574,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.2421875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 983.0,
|
|
"completions/mean_length": 723.3984375,
|
|
"completions/mean_terminated_length": 627.3298950195312,
|
|
"completions/min_length": 337.0,
|
|
"completions/min_terminated_length": 337.0,
|
|
"epoch": 0.20178041543026706,
|
|
"grad_norm": 0.85546875,
|
|
"kl": 0.017719955882057548,
|
|
"learning_rate": 1.8823529411764708e-05,
|
|
"loss": 0.0519,
|
|
"num_tokens": 2047660.0,
|
|
"rewards/<lambda>/mean": 0.7063321471214294,
|
|
"rewards/<lambda>/std": 0.46237605810165405,
|
|
"step": 17
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.00020317857142857144,
|
|
"advantage/max": 2.7046356201171875,
|
|
"advantage/mean": -0.013976034708321095,
|
|
"advantage/min": -2.7046358585357666,
|
|
"advantage/std": 0.7605122327804565,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.265625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1000.0,
|
|
"completions/mean_length": 728.5625,
|
|
"completions/mean_terminated_length": 621.7020874023438,
|
|
"completions/min_length": 257.0,
|
|
"completions/min_terminated_length": 257.0,
|
|
"epoch": 0.21364985163204747,
|
|
"grad_norm": 0.7890625,
|
|
"kl": 0.018178344005718827,
|
|
"learning_rate": 2e-05,
|
|
"loss": 0.0528,
|
|
"num_tokens": 2154476.0,
|
|
"rewards/<lambda>/mean": 0.7472767233848572,
|
|
"rewards/<lambda>/std": 0.43945369124412537,
|
|
"step": 18
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0002150714285714286,
|
|
"advantage/max": 2.704457998275757,
|
|
"advantage/mean": -0.005952497012913227,
|
|
"advantage/min": -2.7044589519500732,
|
|
"advantage/std": 0.7705256342887878,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.2734375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 990.0,
|
|
"completions/mean_length": 747.2734375,
|
|
"completions/mean_terminated_length": 643.1290283203125,
|
|
"completions/min_length": 359.0,
|
|
"completions/min_terminated_length": 359.0,
|
|
"epoch": 0.22551928783382788,
|
|
"grad_norm": 0.69921875,
|
|
"kl": 0.02424622658872977,
|
|
"learning_rate": 1.999783578606323e-05,
|
|
"loss": 0.0195,
|
|
"num_tokens": 2270471.0,
|
|
"rewards/<lambda>/mean": 0.6908018589019775,
|
|
"rewards/<lambda>/std": 0.4689319431781769,
|
|
"step": 19
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.00022696428571428574,
|
|
"advantage/max": 2.704281806945801,
|
|
"advantage/mean": -0.012495366856455803,
|
|
"advantage/min": -2.70428204536438,
|
|
"advantage/std": 0.9640302062034607,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.5078125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 993.0,
|
|
"completions/mean_length": 864.2421875,
|
|
"completions/mean_terminated_length": 699.4127197265625,
|
|
"completions/min_length": 281.0,
|
|
"completions/min_terminated_length": 281.0,
|
|
"epoch": 0.23738872403560832,
|
|
"grad_norm": 0.90234375,
|
|
"kl": 0.01943658519303426,
|
|
"learning_rate": 1.9991344081017312e-05,
|
|
"loss": 0.0451,
|
|
"num_tokens": 2398374.0,
|
|
"rewards/<lambda>/mean": 0.4931441843509674,
|
|
"rewards/<lambda>/std": 0.5088549256324768,
|
|
"step": 20
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.00023885714285714287,
|
|
"advantage/max": 2.2589097023010254,
|
|
"advantage/mean": -0.004630962386727333,
|
|
"advantage/min": -2.7041053771972656,
|
|
"advantage/std": 0.8182272911071777,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.3515625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1022.0,
|
|
"completions/mean_length": 801.796875,
|
|
"completions/mean_terminated_length": 681.3252563476562,
|
|
"completions/min_length": 302.0,
|
|
"completions/min_terminated_length": 302.0,
|
|
"epoch": 0.24925816023738873,
|
|
"grad_norm": 0.87890625,
|
|
"kl": 0.025031055964063853,
|
|
"learning_rate": 1.9980527694749952e-05,
|
|
"loss": 0.0144,
|
|
"num_tokens": 2516852.0,
|
|
"rewards/<lambda>/mean": 0.6364069581031799,
|
|
"rewards/<lambda>/std": 0.4873649477958679,
|
|
"step": 21
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.00025075000000000005,
|
|
"advantage/max": 2.703927993774414,
|
|
"advantage/mean": -0.012429304420948029,
|
|
"advantage/min": -2.7039284706115723,
|
|
"advantage/std": 0.8538786768913269,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.3671875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1020.0,
|
|
"completions/mean_length": 718.7890625,
|
|
"completions/mean_terminated_length": 541.6913452148438,
|
|
"completions/min_length": 250.0,
|
|
"completions/min_terminated_length": 250.0,
|
|
"epoch": 0.26112759643916916,
|
|
"grad_norm": 0.8828125,
|
|
"kl": 0.029131366696674377,
|
|
"learning_rate": 1.996539130905593e-05,
|
|
"loss": 0.0149,
|
|
"num_tokens": 2626337.0,
|
|
"rewards/<lambda>/mean": 0.587719202041626,
|
|
"rewards/<lambda>/std": 0.5004004836082458,
|
|
"step": 22
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.00026264285714285723,
|
|
"advantage/max": 2.7037513256073,
|
|
"advantage/mean": -0.0014824382960796356,
|
|
"advantage/min": -2.703751802444458,
|
|
"advantage/std": 0.5372014045715332,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1484375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1021.0,
|
|
"completions/mean_length": 681.2109375,
|
|
"completions/mean_terminated_length": 621.4586791992188,
|
|
"completions/min_length": 317.0,
|
|
"completions/min_terminated_length": 317.0,
|
|
"epoch": 0.27299703264094954,
|
|
"grad_norm": 0.62109375,
|
|
"kl": 0.031289953854866326,
|
|
"learning_rate": 1.9945941475610623e-05,
|
|
"loss": 0.0109,
|
|
"num_tokens": 2727476.0,
|
|
"rewards/<lambda>/mean": 0.8566299676895142,
|
|
"rewards/<lambda>/std": 0.35583803057670593,
|
|
"step": 23
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.00027453571428571436,
|
|
"advantage/max": 1.6469597816467285,
|
|
"advantage/mean": -0.041688546538352966,
|
|
"advantage/min": -2.7035748958587646,
|
|
"advantage/std": 0.8940851092338562,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.28125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 988.0,
|
|
"completions/mean_length": 767.953125,
|
|
"completions/mean_terminated_length": 667.7608642578125,
|
|
"completions/min_length": 342.0,
|
|
"completions/min_terminated_length": 342.0,
|
|
"epoch": 0.28486646884273,
|
|
"grad_norm": 0.93359375,
|
|
"kl": 0.027894207509234548,
|
|
"learning_rate": 1.9922186613134152e-05,
|
|
"loss": 0.1136,
|
|
"num_tokens": 2840230.0,
|
|
"rewards/<lambda>/mean": 0.7074149250984192,
|
|
"rewards/<lambda>/std": 0.46066340804100037,
|
|
"step": 24
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0002864285714285715,
|
|
"advantage/max": 1.9371424913406372,
|
|
"advantage/mean": -0.019469894468784332,
|
|
"advantage/min": -2.7033982276916504,
|
|
"advantage/std": 0.8306273221969604,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.3359375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 993.0,
|
|
"completions/mean_length": 742.0078125,
|
|
"completions/mean_terminated_length": 599.3529663085938,
|
|
"completions/min_length": 354.0,
|
|
"completions/min_terminated_length": 354.0,
|
|
"epoch": 0.29673590504451036,
|
|
"grad_norm": 0.81640625,
|
|
"kl": 0.02968023216817528,
|
|
"learning_rate": 1.9894137003747404e-05,
|
|
"loss": 0.0728,
|
|
"num_tokens": 2952367.0,
|
|
"rewards/<lambda>/mean": 0.6998242139816284,
|
|
"rewards/<lambda>/std": 0.4637812674045563,
|
|
"step": 25
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0002983214285714286,
|
|
"advantage/max": 2.703221559524536,
|
|
"advantage/mean": -0.009083299897611141,
|
|
"advantage/min": -2.703221559524536,
|
|
"advantage/std": 0.8210997581481934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.3359375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1019.0,
|
|
"completions/mean_length": 796.15625,
|
|
"completions/mean_terminated_length": 680.8941040039062,
|
|
"completions/min_length": 291.0,
|
|
"completions/min_terminated_length": 291.0,
|
|
"epoch": 0.3086053412462908,
|
|
"grad_norm": 0.83203125,
|
|
"kl": 0.024444299924653023,
|
|
"learning_rate": 1.986180478852149e-05,
|
|
"loss": 0.041,
|
|
"num_tokens": 3070579.0,
|
|
"rewards/<lambda>/mean": 0.6674760580062866,
|
|
"rewards/<lambda>/std": 0.47770819067955017,
|
|
"step": 26
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0003102142857142858,
|
|
"advantage/max": 2.7030446529388428,
|
|
"advantage/mean": 0.03770775347948074,
|
|
"advantage/min": -2.703045129776001,
|
|
"advantage/std": 0.9571543335914612,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.578125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1015.0,
|
|
"completions/mean_length": 868.8828125,
|
|
"completions/mean_terminated_length": 656.3148193359375,
|
|
"completions/min_length": 330.0,
|
|
"completions/min_terminated_length": 330.0,
|
|
"epoch": 0.32047477744807124,
|
|
"grad_norm": 0.82421875,
|
|
"kl": 0.021971712238155305,
|
|
"learning_rate": 1.9825203962222573e-05,
|
|
"loss": -0.0093,
|
|
"num_tokens": 3199700.0,
|
|
"rewards/<lambda>/mean": 0.4137590527534485,
|
|
"rewards/<lambda>/std": 0.5027788281440735,
|
|
"step": 27
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0003221071428571429,
|
|
"advantage/max": 2.0144219398498535,
|
|
"advantage/mean": -0.008568160235881805,
|
|
"advantage/min": -2.691011428833008,
|
|
"advantage/std": 0.9483392238616943,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.4140625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1012.0,
|
|
"completions/mean_length": 833.6875,
|
|
"completions/mean_terminated_length": 699.2000122070312,
|
|
"completions/min_length": 247.0,
|
|
"completions/min_terminated_length": 247.0,
|
|
"epoch": 0.3323442136498516,
|
|
"grad_norm": 0.875,
|
|
"kl": 0.027091103547718376,
|
|
"learning_rate": 1.9784350367254322e-05,
|
|
"loss": 0.0383,
|
|
"num_tokens": 3321380.0,
|
|
"rewards/<lambda>/mean": 0.5257835984230042,
|
|
"rewards/<lambda>/std": 0.5068398714065552,
|
|
"step": 28
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.00033400000000000004,
|
|
"advantage/max": 2.7026913166046143,
|
|
"advantage/mean": 0.013357289135456085,
|
|
"advantage/min": -2.7026920318603516,
|
|
"advantage/std": 0.7889476418495178,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1020.0,
|
|
"completions/mean_length": 796.6484375,
|
|
"completions/mean_terminated_length": 660.2374877929688,
|
|
"completions/min_length": 308.0,
|
|
"completions/min_terminated_length": 308.0,
|
|
"epoch": 0.34421364985163205,
|
|
"grad_norm": 0.6484375,
|
|
"kl": 0.025885087088681757,
|
|
"learning_rate": 1.9739261686800662e-05,
|
|
"loss": 0.0067,
|
|
"num_tokens": 3441263.0,
|
|
"rewards/<lambda>/mean": 0.6671923995018005,
|
|
"rewards/<lambda>/std": 0.47810864448547363,
|
|
"step": 29
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0003458928571428572,
|
|
"advantage/max": 2.7025139331817627,
|
|
"advantage/mean": -0.003880620002746582,
|
|
"advantage/min": -2.7025153636932373,
|
|
"advantage/std": 0.7040800452232361,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.28125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 960.0,
|
|
"completions/mean_length": 713.0703125,
|
|
"completions/mean_terminated_length": 591.4021606445312,
|
|
"completions/min_length": 268.0,
|
|
"completions/min_terminated_length": 268.0,
|
|
"epoch": 0.3560830860534125,
|
|
"grad_norm": 0.703125,
|
|
"kl": 0.032981967844534665,
|
|
"learning_rate": 1.968995743717171e-05,
|
|
"loss": 0.0277,
|
|
"num_tokens": 3546704.0,
|
|
"rewards/<lambda>/mean": 0.7451189756393433,
|
|
"rewards/<lambda>/std": 0.44322526454925537,
|
|
"step": 30
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.00035778571428571435,
|
|
"advantage/max": 2.702338695526123,
|
|
"advantage/mean": -0.0032848306000232697,
|
|
"advantage/min": -2.613555431365967,
|
|
"advantage/std": 0.8725407719612122,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.4921875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1012.0,
|
|
"completions/mean_length": 826.203125,
|
|
"completions/mean_terminated_length": 634.4923095703125,
|
|
"completions/min_length": 304.0,
|
|
"completions/min_terminated_length": 304.0,
|
|
"epoch": 0.36795252225519287,
|
|
"grad_norm": 0.75,
|
|
"kl": 0.02586214942857623,
|
|
"learning_rate": 1.963645895935632e-05,
|
|
"loss": 0.0233,
|
|
"num_tokens": 3667562.0,
|
|
"rewards/<lambda>/mean": 0.5240461826324463,
|
|
"rewards/<lambda>/std": 0.5086991190910339,
|
|
"step": 31
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.00036967857142857153,
|
|
"advantage/max": 2.5066890716552734,
|
|
"advantage/mean": -0.031669870018959045,
|
|
"advantage/min": -2.702162504196167,
|
|
"advantage/std": 0.8189276456832886,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 997.0,
|
|
"completions/mean_length": 795.7734375,
|
|
"completions/mean_terminated_length": 658.8375244140625,
|
|
"completions/min_length": 281.0,
|
|
"completions/min_terminated_length": 281.0,
|
|
"epoch": 0.3798219584569733,
|
|
"grad_norm": 0.734375,
|
|
"kl": 0.029125412344001234,
|
|
"learning_rate": 1.9578789409784727e-05,
|
|
"loss": 0.0437,
|
|
"num_tokens": 3787557.0,
|
|
"rewards/<lambda>/mean": 0.6826491355895996,
|
|
"rewards/<lambda>/std": 0.47256773710250854,
|
|
"step": 32
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0003815714285714286,
|
|
"advantage/max": 2.7019858360290527,
|
|
"advantage/mean": 0.0026858258061110973,
|
|
"advantage/min": -1.8257653713226318,
|
|
"advantage/std": 0.5741036534309387,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.2109375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1005.0,
|
|
"completions/mean_length": 757.546875,
|
|
"completions/mean_terminated_length": 686.3168334960938,
|
|
"completions/min_length": 353.0,
|
|
"completions/min_terminated_length": 353.0,
|
|
"epoch": 0.3916913946587537,
|
|
"grad_norm": 0.458984375,
|
|
"kl": 0.025140227226074785,
|
|
"learning_rate": 1.951697375030553e-05,
|
|
"loss": 0.0053,
|
|
"num_tokens": 3898155.0,
|
|
"rewards/<lambda>/mean": 0.7945812940597534,
|
|
"rewards/<lambda>/std": 0.40846800804138184,
|
|
"step": 33
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0003934642857142858,
|
|
"advantage/max": 2.7018091678619385,
|
|
"advantage/mean": -0.0054876613430678844,
|
|
"advantage/min": -2.598515272140503,
|
|
"advantage/std": 0.8233418464660645,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1019.0,
|
|
"completions/mean_length": 831.125,
|
|
"completions/mean_terminated_length": 715.4000244140625,
|
|
"completions/min_length": 356.0,
|
|
"completions/min_terminated_length": 356.0,
|
|
"epoch": 0.4035608308605341,
|
|
"grad_norm": 0.69140625,
|
|
"kl": 0.029167757485993207,
|
|
"learning_rate": 1.9451038737381078e-05,
|
|
"loss": 0.0223,
|
|
"num_tokens": 4020891.0,
|
|
"rewards/<lambda>/mean": 0.6431732177734375,
|
|
"rewards/<lambda>/std": 0.4865179657936096,
|
|
"step": 34
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0004053571428571429,
|
|
"advantage/max": 2.701632261276245,
|
|
"advantage/mean": -0.02064463123679161,
|
|
"advantage/min": -2.7016334533691406,
|
|
"advantage/std": 0.8611717224121094,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.3359375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1020.0,
|
|
"completions/mean_length": 805.0,
|
|
"completions/mean_terminated_length": 694.2117919921875,
|
|
"completions/min_length": 354.0,
|
|
"completions/min_terminated_length": 354.0,
|
|
"epoch": 0.41543026706231456,
|
|
"grad_norm": 0.7265625,
|
|
"kl": 0.029243278899230063,
|
|
"learning_rate": 1.9381012910506146e-05,
|
|
"loss": 0.0591,
|
|
"num_tokens": 4138995.0,
|
|
"rewards/<lambda>/mean": 0.6594928503036499,
|
|
"rewards/<lambda>/std": 0.4806313216686249,
|
|
"step": 35
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0004172500000000001,
|
|
"advantage/max": 2.7014570236206055,
|
|
"advantage/mean": 0.010569433681666851,
|
|
"advantage/min": -2.7014570236206055,
|
|
"advantage/std": 0.6529413461685181,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.234375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1016.0,
|
|
"completions/mean_length": 725.75,
|
|
"completions/mean_terminated_length": 634.448974609375,
|
|
"completions/min_length": 315.0,
|
|
"completions/min_terminated_length": 315.0,
|
|
"epoch": 0.42729970326409494,
|
|
"grad_norm": 0.5234375,
|
|
"kl": 0.03295414987951517,
|
|
"learning_rate": 1.930692657985482e-05,
|
|
"loss": 0.0061,
|
|
"num_tokens": 4247299.0,
|
|
"rewards/<lambda>/mean": 0.7556747198104858,
|
|
"rewards/<lambda>/std": 0.4338902533054352,
|
|
"step": 36
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0004291428571428572,
|
|
"advantage/max": 2.418079137802124,
|
|
"advantage/mean": -0.012974461540579796,
|
|
"advantage/min": -2.7012808322906494,
|
|
"advantage/std": 0.728173017501831,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1018.0,
|
|
"completions/mean_length": 735.921875,
|
|
"completions/mean_terminated_length": 639.8958740234375,
|
|
"completions/min_length": 287.0,
|
|
"completions/min_terminated_length": 287.0,
|
|
"epoch": 0.4391691394658754,
|
|
"grad_norm": 0.734375,
|
|
"kl": 0.03324733709450811,
|
|
"learning_rate": 1.9228811813160972e-05,
|
|
"loss": 0.025,
|
|
"num_tokens": 4359721.0,
|
|
"rewards/<lambda>/mean": 0.7533758282661438,
|
|
"rewards/<lambda>/std": 0.43797916173934937,
|
|
"step": 37
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.00044103571428571434,
|
|
"advantage/max": 2.324380397796631,
|
|
"advantage/mean": -0.024011407047510147,
|
|
"advantage/min": -2.7011046409606934,
|
|
"advantage/std": 0.6718021631240845,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.2421875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 967.0,
|
|
"completions/mean_length": 743.9765625,
|
|
"completions/mean_terminated_length": 654.4844970703125,
|
|
"completions/min_length": 332.0,
|
|
"completions/min_terminated_length": 332.0,
|
|
"epoch": 0.45103857566765576,
|
|
"grad_norm": 0.7421875,
|
|
"kl": 0.032304306514561176,
|
|
"learning_rate": 1.9146702421837952e-05,
|
|
"loss": 0.0519,
|
|
"num_tokens": 4470342.0,
|
|
"rewards/<lambda>/mean": 0.8252986669540405,
|
|
"rewards/<lambda>/std": 0.3849862813949585,
|
|
"step": 38
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0004529285714285715,
|
|
"advantage/max": 1.0875418186187744,
|
|
"advantage/mean": -0.006378441117703915,
|
|
"advantage/min": -1.9391565322875977,
|
|
"advantage/std": 0.46930885314941406,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1015625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1019.0,
|
|
"completions/mean_length": 654.1015625,
|
|
"completions/mean_terminated_length": 612.2869262695312,
|
|
"completions/min_length": 350.0,
|
|
"completions/min_terminated_length": 350.0,
|
|
"epoch": 0.4629080118694362,
|
|
"grad_norm": 0.4140625,
|
|
"kl": 0.03506830852711573,
|
|
"learning_rate": 1.906063394634356e-05,
|
|
"loss": 0.0236,
|
|
"num_tokens": 4566963.0,
|
|
"rewards/<lambda>/mean": 0.9207130670547485,
|
|
"rewards/<lambda>/std": 0.27343931794166565,
|
|
"step": 39
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.00046482142857142864,
|
|
"advantage/max": 2.700751781463623,
|
|
"advantage/mean": -0.01085714902728796,
|
|
"advantage/min": -2.7007522583007812,
|
|
"advantage/std": 0.6957706212997437,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.265625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1014.0,
|
|
"completions/mean_length": 756.15625,
|
|
"completions/mean_terminated_length": 659.2765502929688,
|
|
"completions/min_length": 311.0,
|
|
"completions/min_terminated_length": 311.0,
|
|
"epoch": 0.47477744807121663,
|
|
"grad_norm": 0.58984375,
|
|
"kl": 0.03276660805568099,
|
|
"learning_rate": 1.8970643640796642e-05,
|
|
"loss": 0.0285,
|
|
"num_tokens": 4678071.0,
|
|
"rewards/<lambda>/mean": 0.7705238461494446,
|
|
"rewards/<lambda>/std": 0.4256618022918701,
|
|
"step": 40
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.00047671428571428577,
|
|
"advantage/max": 1.4527703523635864,
|
|
"advantage/mean": -0.016703419387340546,
|
|
"advantage/min": -2.7005763053894043,
|
|
"advantage/std": 0.6522344350814819,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.2109375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 962.0,
|
|
"completions/mean_length": 669.84375,
|
|
"completions/mean_terminated_length": 575.1683349609375,
|
|
"completions/min_length": 323.0,
|
|
"completions/min_terminated_length": 323.0,
|
|
"epoch": 0.486646884272997,
|
|
"grad_norm": 0.9140625,
|
|
"kl": 0.03814881236758083,
|
|
"learning_rate": 1.887677045685188e-05,
|
|
"loss": 0.066,
|
|
"num_tokens": 4780747.0,
|
|
"rewards/<lambda>/mean": 0.8180162906646729,
|
|
"rewards/<lambda>/std": 0.3903641104698181,
|
|
"step": 41
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0004886071428571428,
|
|
"advantage/max": 2.700399160385132,
|
|
"advantage/mean": -0.006951633840799332,
|
|
"advantage/min": -2.7004001140594482,
|
|
"advantage/std": 0.8440094590187073,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.3828125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 993.0,
|
|
"completions/mean_length": 781.7890625,
|
|
"completions/mean_terminated_length": 631.5570068359375,
|
|
"completions/min_length": 352.0,
|
|
"completions/min_terminated_length": 352.0,
|
|
"epoch": 0.49851632047477745,
|
|
"grad_norm": 0.7890625,
|
|
"kl": 0.035947895026765764,
|
|
"learning_rate": 1.877905502683987e-05,
|
|
"loss": 0.0346,
|
|
"num_tokens": 4898672.0,
|
|
"rewards/<lambda>/mean": 0.6191579103469849,
|
|
"rewards/<lambda>/std": 0.49360737204551697,
|
|
"step": 42
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0005005000000000001,
|
|
"advantage/max": 1.9356884956359863,
|
|
"advantage/mean": -0.026186538860201836,
|
|
"advantage/min": -2.7002241611480713,
|
|
"advantage/std": 0.7014551162719727,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1796875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1009.0,
|
|
"completions/mean_length": 749.828125,
|
|
"completions/mean_terminated_length": 689.771484375,
|
|
"completions/min_length": 413.0,
|
|
"completions/min_terminated_length": 413.0,
|
|
"epoch": 0.5103857566765578,
|
|
"grad_norm": 0.61328125,
|
|
"kl": 0.033930204808712006,
|
|
"learning_rate": 1.8677539646179706e-05,
|
|
"loss": 0.0662,
|
|
"num_tokens": 5009066.0,
|
|
"rewards/<lambda>/mean": 0.818067729473114,
|
|
"rewards/<lambda>/std": 0.39025673270225525,
|
|
"step": 43
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0005123928571428572,
|
|
"advantage/max": 2.6708412170410156,
|
|
"advantage/mean": -0.011584467254579067,
|
|
"advantage/min": -2.7000479698181152,
|
|
"advantage/std": 0.9008896350860596,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.3828125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1014.0,
|
|
"completions/mean_length": 833.59375,
|
|
"completions/mean_terminated_length": 715.4937133789062,
|
|
"completions/min_length": 362.0,
|
|
"completions/min_terminated_length": 362.0,
|
|
"epoch": 0.5222551928783383,
|
|
"grad_norm": 0.78515625,
|
|
"kl": 0.032038538600318134,
|
|
"learning_rate": 1.8572268255071718e-05,
|
|
"loss": 0.0502,
|
|
"num_tokens": 5132766.0,
|
|
"rewards/<lambda>/mean": 0.6196942329406738,
|
|
"rewards/<lambda>/std": 0.4929179847240448,
|
|
"step": 44
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0005242857142857144,
|
|
"advantage/max": 2.6998708248138428,
|
|
"advantage/mean": -0.006081804633140564,
|
|
"advantage/min": -2.6998720169067383,
|
|
"advantage/std": 0.8706440925598145,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.4453125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1000.0,
|
|
"completions/mean_length": 800.7109375,
|
|
"completions/mean_terminated_length": 621.45068359375,
|
|
"completions/min_length": 308.0,
|
|
"completions/min_terminated_length": 308.0,
|
|
"epoch": 0.5341246290801187,
|
|
"grad_norm": 0.74609375,
|
|
"kl": 0.036316482815891504,
|
|
"learning_rate": 1.8463286419478256e-05,
|
|
"loss": 0.0214,
|
|
"num_tokens": 5254737.0,
|
|
"rewards/<lambda>/mean": 0.5557836294174194,
|
|
"rewards/<lambda>/std": 0.5056889653205872,
|
|
"step": 45
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0005361785714285715,
|
|
"advantage/max": 1.452530026435852,
|
|
"advantage/mean": -0.0004155319184064865,
|
|
"advantage/min": -2.6996960639953613,
|
|
"advantage/std": 0.45691579580307007,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1171875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1008.0,
|
|
"completions/mean_length": 710.0625,
|
|
"completions/mean_terminated_length": 668.389404296875,
|
|
"completions/min_length": 375.0,
|
|
"completions/min_terminated_length": 375.0,
|
|
"epoch": 0.5459940652818991,
|
|
"grad_norm": 0.390625,
|
|
"kl": 0.03480621299240738,
|
|
"learning_rate": 1.8350641311400813e-05,
|
|
"loss": 0.0041,
|
|
"num_tokens": 5359401.0,
|
|
"rewards/<lambda>/mean": 0.9130905866622925,
|
|
"rewards/<lambda>/std": 0.28455793857574463,
|
|
"step": 46
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0005480714285714287,
|
|
"advantage/max": 2.6995198726654053,
|
|
"advantage/mean": -0.008102089166641235,
|
|
"advantage/min": -2.6995203495025635,
|
|
"advantage/std": 0.6858307123184204,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.234375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 996.0,
|
|
"completions/mean_length": 764.671875,
|
|
"completions/mean_terminated_length": 685.2857055664062,
|
|
"completions/min_length": 291.0,
|
|
"completions/min_terminated_length": 291.0,
|
|
"epoch": 0.5578635014836796,
|
|
"grad_norm": 0.60546875,
|
|
"kl": 0.02857328619575128,
|
|
"learning_rate": 1.8234381688461943e-05,
|
|
"loss": 0.0463,
|
|
"num_tokens": 5473399.0,
|
|
"rewards/<lambda>/mean": 0.7786081433296204,
|
|
"rewards/<lambda>/std": 0.42004773020744324,
|
|
"step": 47
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0005599642857142858,
|
|
"advantage/max": 2.6993439197540283,
|
|
"advantage/mean": -0.018411144614219666,
|
|
"advantage/min": -2.6993443965911865,
|
|
"advantage/std": 0.7576082348823547,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.28125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 987.0,
|
|
"completions/mean_length": 755.4296875,
|
|
"completions/mean_terminated_length": 650.3369750976562,
|
|
"completions/min_length": 374.0,
|
|
"completions/min_terminated_length": 374.0,
|
|
"epoch": 0.56973293768546,
|
|
"grad_norm": 0.59765625,
|
|
"kl": 0.028340922435745597,
|
|
"learning_rate": 1.8114557872800906e-05,
|
|
"loss": 0.0362,
|
|
"num_tokens": 5583862.0,
|
|
"rewards/<lambda>/mean": 0.7149896621704102,
|
|
"rewards/<lambda>/std": 0.4574246108531952,
|
|
"step": 48
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0005718571428571429,
|
|
"advantage/max": 2.1167187690734863,
|
|
"advantage/mean": -0.013100487180054188,
|
|
"advantage/min": -2.6991684436798096,
|
|
"advantage/std": 0.6667536497116089,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.171875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1023.0,
|
|
"completions/mean_length": 708.8984375,
|
|
"completions/mean_terminated_length": 643.5,
|
|
"completions/min_length": 258.0,
|
|
"completions/min_terminated_length": 258.0,
|
|
"epoch": 0.5816023738872403,
|
|
"grad_norm": 0.65234375,
|
|
"kl": 0.03176001494284719,
|
|
"learning_rate": 1.799122172929206e-05,
|
|
"loss": 0.0367,
|
|
"num_tokens": 5689025.0,
|
|
"rewards/<lambda>/mean": 0.8245458602905273,
|
|
"rewards/<lambda>/std": 0.3866560459136963,
|
|
"step": 49
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0005837500000000002,
|
|
"advantage/max": 2.512308120727539,
|
|
"advantage/mean": -0.006615880876779556,
|
|
"advantage/min": -2.5892391204833984,
|
|
"advantage/std": 0.9890721440315247,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.5546875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1000.0,
|
|
"completions/mean_length": 846.890625,
|
|
"completions/mean_terminated_length": 626.2807006835938,
|
|
"completions/min_length": 313.0,
|
|
"completions/min_terminated_length": 313.0,
|
|
"epoch": 0.5934718100890207,
|
|
"grad_norm": 0.8359375,
|
|
"kl": 0.028011595306452364,
|
|
"learning_rate": 1.7864426643095537e-05,
|
|
"loss": 0.0215,
|
|
"num_tokens": 5818339.0,
|
|
"rewards/<lambda>/mean": 0.4374961256980896,
|
|
"rewards/<lambda>/std": 0.5059982538223267,
|
|
"step": 50
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0005956428571428572,
|
|
"advantage/max": 2.1364798545837402,
|
|
"advantage/mean": -0.01754511147737503,
|
|
"advantage/min": -2.698817014694214,
|
|
"advantage/std": 0.6074024438858032,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.15625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 963.0,
|
|
"completions/mean_length": 700.421875,
|
|
"completions/mean_terminated_length": 640.5,
|
|
"completions/min_length": 271.0,
|
|
"completions/min_terminated_length": 271.0,
|
|
"epoch": 0.6053412462908012,
|
|
"grad_norm": 0.5546875,
|
|
"kl": 0.03255216917023063,
|
|
"learning_rate": 1.773422749654988e-05,
|
|
"loss": 0.0416,
|
|
"num_tokens": 5923825.0,
|
|
"rewards/<lambda>/mean": 0.8574329614639282,
|
|
"rewards/<lambda>/std": 0.3538227379322052,
|
|
"step": 51
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0006075357142857143,
|
|
"advantage/max": 2.5419423580169678,
|
|
"advantage/mean": -0.006520539056509733,
|
|
"advantage/min": -2.077256917953491,
|
|
"advantage/std": 0.6562984585762024,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.203125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1004.0,
|
|
"completions/mean_length": 681.4765625,
|
|
"completions/mean_terminated_length": 594.1666870117188,
|
|
"completions/min_length": 230.0,
|
|
"completions/min_terminated_length": 230.0,
|
|
"epoch": 0.6172106824925816,
|
|
"grad_norm": 0.546875,
|
|
"kl": 0.041263859427999705,
|
|
"learning_rate": 1.7600680645416583e-05,
|
|
"loss": 0.0254,
|
|
"num_tokens": 6025430.0,
|
|
"rewards/<lambda>/mean": 0.8023372888565063,
|
|
"rewards/<lambda>/std": 0.40279263257980347,
|
|
"step": 52
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0006194285714285716,
|
|
"advantage/max": 2.698464870452881,
|
|
"advantage/mean": -0.01813482493162155,
|
|
"advantage/min": -2.698465585708618,
|
|
"advantage/std": 0.8114321231842041,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.328125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1024.0,
|
|
"completions/mean_length": 827.1953125,
|
|
"completions/mean_terminated_length": 731.0814208984375,
|
|
"completions/min_length": 401.0,
|
|
"completions/min_terminated_length": 401.0,
|
|
"epoch": 0.629080118694362,
|
|
"grad_norm": 0.640625,
|
|
"kl": 0.028205811278894544,
|
|
"learning_rate": 1.746384389448694e-05,
|
|
"loss": 0.0416,
|
|
"num_tokens": 6149983.0,
|
|
"rewards/<lambda>/mean": 0.7141739130020142,
|
|
"rewards/<lambda>/std": 0.45872655510902405,
|
|
"step": 53
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0006313214285714287,
|
|
"advantage/max": 2.698289394378662,
|
|
"advantage/mean": -0.004583118483424187,
|
|
"advantage/min": -2.6982898712158203,
|
|
"advantage/std": 0.7579176425933838,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.3828125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1019.0,
|
|
"completions/mean_length": 799.671875,
|
|
"completions/mean_terminated_length": 660.5316772460938,
|
|
"completions/min_length": 343.0,
|
|
"completions/min_terminated_length": 343.0,
|
|
"epoch": 0.6409495548961425,
|
|
"grad_norm": 0.640625,
|
|
"kl": 0.03153213975019753,
|
|
"learning_rate": 1.7323776472561625e-05,
|
|
"loss": 0.0272,
|
|
"num_tokens": 6268077.0,
|
|
"rewards/<lambda>/mean": 0.73039710521698,
|
|
"rewards/<lambda>/std": 0.4500512182712555,
|
|
"step": 54
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0006432142857142858,
|
|
"advantage/max": 2.6981141567230225,
|
|
"advantage/mean": 0.0031758034601807594,
|
|
"advantage/min": -2.6981143951416016,
|
|
"advantage/std": 0.6641281247138977,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.234375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1013.0,
|
|
"completions/mean_length": 745.0390625,
|
|
"completions/mean_terminated_length": 659.642822265625,
|
|
"completions/min_length": 328.0,
|
|
"completions/min_terminated_length": 328.0,
|
|
"epoch": 0.6528189910979229,
|
|
"grad_norm": 0.8359375,
|
|
"kl": 0.030679657589644194,
|
|
"learning_rate": 1.7180539006813973e-05,
|
|
"loss": 0.0237,
|
|
"num_tokens": 6378738.0,
|
|
"rewards/<lambda>/mean": 0.7700124979019165,
|
|
"rewards/<lambda>/std": 0.42661646008491516,
|
|
"step": 55
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0006551071428571429,
|
|
"advantage/max": 2.6979339122772217,
|
|
"advantage/mean": -0.009846007451415062,
|
|
"advantage/min": -2.6979386806488037,
|
|
"advantage/std": 0.5840507745742798,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1640625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1009.0,
|
|
"completions/mean_length": 711.046875,
|
|
"completions/mean_terminated_length": 649.6261596679688,
|
|
"completions/min_length": 287.0,
|
|
"completions/min_terminated_length": 287.0,
|
|
"epoch": 0.6646884272997032,
|
|
"grad_norm": 1.390625,
|
|
"kl": 0.028229560470208526,
|
|
"learning_rate": 1.7034193496547903e-05,
|
|
"loss": 0.0326,
|
|
"num_tokens": 6483208.0,
|
|
"rewards/<lambda>/mean": 0.8656327724456787,
|
|
"rewards/<lambda>/std": 0.34470248222351074,
|
|
"step": 56
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0006670000000000001,
|
|
"advantage/max": 2.5979971885681152,
|
|
"advantage/mean": -0.017020640894770622,
|
|
"advantage/min": -2.697763204574585,
|
|
"advantage/std": 1.0236399173736572,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.5234375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1024.0,
|
|
"completions/mean_length": 916.8515625,
|
|
"completions/mean_terminated_length": 799.1638793945312,
|
|
"completions/min_length": 504.0,
|
|
"completions/min_terminated_length": 504.0,
|
|
"epoch": 0.6765578635014837,
|
|
"grad_norm": 2.65625,
|
|
"kl": 0.027361205138731748,
|
|
"learning_rate": 1.6884803286362e-05,
|
|
"loss": 0.0522,
|
|
"num_tokens": 6615989.0,
|
|
"rewards/<lambda>/mean": 0.5157707333564758,
|
|
"rewards/<lambda>/std": 0.5094924569129944,
|
|
"step": 57
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0006788928571428573,
|
|
"advantage/max": 1.9344719648361206,
|
|
"advantage/mean": -0.010600470006465912,
|
|
"advantage/min": -2.697587728500366,
|
|
"advantage/std": 0.7744669914245605,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.359375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1021.0,
|
|
"completions/mean_length": 799.8984375,
|
|
"completions/mean_terminated_length": 674.1829223632812,
|
|
"completions/min_length": 354.0,
|
|
"completions/min_terminated_length": 354.0,
|
|
"epoch": 0.6884272997032641,
|
|
"grad_norm": 0.65625,
|
|
"kl": 0.029555415618233383,
|
|
"learning_rate": 1.6732433038731245e-05,
|
|
"loss": 0.0248,
|
|
"num_tokens": 6735664.0,
|
|
"rewards/<lambda>/mean": 0.7059880495071411,
|
|
"rewards/<lambda>/std": 0.462918758392334,
|
|
"step": 58
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0006907857142857144,
|
|
"advantage/max": 2.6974117755889893,
|
|
"advantage/mean": -0.013238908722996712,
|
|
"advantage/min": -2.6974122524261475,
|
|
"advantage/std": 0.677533745765686,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.2109375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1006.0,
|
|
"completions/mean_length": 726.625,
|
|
"completions/mean_terminated_length": 647.1287231445312,
|
|
"completions/min_length": 332.0,
|
|
"completions/min_terminated_length": 332.0,
|
|
"epoch": 0.7002967359050445,
|
|
"grad_norm": 0.625,
|
|
"kl": 0.025012695870827883,
|
|
"learning_rate": 1.657714870601833e-05,
|
|
"loss": 0.0425,
|
|
"num_tokens": 6841864.0,
|
|
"rewards/<lambda>/mean": 0.794950544834137,
|
|
"rewards/<lambda>/std": 0.40773335099220276,
|
|
"step": 59
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0007026785714285714,
|
|
"advantage/max": 1.642048954963684,
|
|
"advantage/mean": -0.03683529794216156,
|
|
"advantage/min": -2.6972367763519287,
|
|
"advantage/std": 0.8866681456565857,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.3515625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1022.0,
|
|
"completions/mean_length": 797.8671875,
|
|
"completions/mean_terminated_length": 675.2650146484375,
|
|
"completions/min_length": 309.0,
|
|
"completions/min_terminated_length": 309.0,
|
|
"epoch": 0.712166172106825,
|
|
"grad_norm": 0.84765625,
|
|
"kl": 0.0283702181186527,
|
|
"learning_rate": 1.641901750192666e-05,
|
|
"loss": 0.0942,
|
|
"num_tokens": 6959047.0,
|
|
"rewards/<lambda>/mean": 0.6587684750556946,
|
|
"rewards/<lambda>/std": 0.4816700518131256,
|
|
"step": 60
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0007145714285714287,
|
|
"advantage/max": 2.6065056324005127,
|
|
"advantage/mean": -0.024096639826893806,
|
|
"advantage/min": -2.69706130027771,
|
|
"advantage/std": 0.8914181590080261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.359375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1023.0,
|
|
"completions/mean_length": 808.078125,
|
|
"completions/mean_terminated_length": 686.951171875,
|
|
"completions/min_length": 317.0,
|
|
"completions/min_terminated_length": 317.0,
|
|
"epoch": 0.7240356083086054,
|
|
"grad_norm": 0.703125,
|
|
"kl": 0.02628780552186072,
|
|
"learning_rate": 1.6258107872407376e-05,
|
|
"loss": 0.0564,
|
|
"num_tokens": 7077337.0,
|
|
"rewards/<lambda>/mean": 0.6672515273094177,
|
|
"rewards/<lambda>/std": 0.47803038358688354,
|
|
"step": 61
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0007264642857142858,
|
|
"advantage/max": 1.9922456741333008,
|
|
"advantage/mean": -0.023510506376624107,
|
|
"advantage/min": -2.694539785385132,
|
|
"advantage/std": 0.8509385585784912,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1012.0,
|
|
"completions/mean_length": 797.3671875,
|
|
"completions/mean_terminated_length": 661.3875122070312,
|
|
"completions/min_length": 241.0,
|
|
"completions/min_terminated_length": 241.0,
|
|
"epoch": 0.7359050445103857,
|
|
"grad_norm": 0.6953125,
|
|
"kl": 0.025755617709364742,
|
|
"learning_rate": 1.609448946603304e-05,
|
|
"loss": 0.0489,
|
|
"num_tokens": 7197120.0,
|
|
"rewards/<lambda>/mean": 0.6514172554016113,
|
|
"rewards/<lambda>/std": 0.483541876077652,
|
|
"step": 62
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.000738357142857143,
|
|
"advantage/max": 1.9340767860412598,
|
|
"advantage/mean": -0.014695411548018456,
|
|
"advantage/min": -2.6967105865478516,
|
|
"advantage/std": 0.7965668439865112,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.2734375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1021.0,
|
|
"completions/mean_length": 811.0546875,
|
|
"completions/mean_terminated_length": 730.9140014648438,
|
|
"completions/min_length": 450.0,
|
|
"completions/min_terminated_length": 450.0,
|
|
"epoch": 0.7477744807121661,
|
|
"grad_norm": 0.78515625,
|
|
"kl": 0.02421920670894906,
|
|
"learning_rate": 1.592823310385073e-05,
|
|
"loss": 0.05,
|
|
"num_tokens": 7316143.0,
|
|
"rewards/<lambda>/mean": 0.7625675797462463,
|
|
"rewards/<lambda>/std": 0.43082737922668457,
|
|
"step": 63
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0007502500000000002,
|
|
"advantage/max": 2.6965343952178955,
|
|
"advantage/mean": -0.025378329679369926,
|
|
"advantage/min": -2.696535348892212,
|
|
"advantage/std": 0.92302006483078,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.390625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1021.0,
|
|
"completions/mean_length": 825.9765625,
|
|
"completions/mean_terminated_length": 699.0384521484375,
|
|
"completions/min_length": 316.0,
|
|
"completions/min_terminated_length": 316.0,
|
|
"epoch": 0.7596439169139466,
|
|
"grad_norm": 0.83984375,
|
|
"kl": 0.0288353796931915,
|
|
"learning_rate": 1.5759410748727663e-05,
|
|
"loss": 0.0559,
|
|
"num_tokens": 7438428.0,
|
|
"rewards/<lambda>/mean": 0.6124646663665771,
|
|
"rewards/<lambda>/std": 0.49401599168777466,
|
|
"step": 64
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0007621428571428572,
|
|
"advantage/max": 2.696359872817993,
|
|
"advantage/mean": -0.007194130681455135,
|
|
"advantage/min": -2.6963601112365723,
|
|
"advantage/std": 0.9335386157035828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.46875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1013.0,
|
|
"completions/mean_length": 871.6015625,
|
|
"completions/mean_terminated_length": 737.1323852539062,
|
|
"completions/min_length": 404.0,
|
|
"completions/min_terminated_length": 404.0,
|
|
"epoch": 0.771513353115727,
|
|
"grad_norm": 0.84375,
|
|
"kl": 0.02507934661116451,
|
|
"learning_rate": 1.5588095474202597e-05,
|
|
"loss": 0.0377,
|
|
"num_tokens": 7568553.0,
|
|
"rewards/<lambda>/mean": 0.549358606338501,
|
|
"rewards/<lambda>/std": 0.5049357414245605,
|
|
"step": 65
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0007740357142857144,
|
|
"advantage/max": 2.696183681488037,
|
|
"advantage/mean": -0.023947857320308685,
|
|
"advantage/min": -2.6961848735809326,
|
|
"advantage/std": 0.8393827676773071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.328125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 997.0,
|
|
"completions/mean_length": 784.75,
|
|
"completions/mean_terminated_length": 667.906982421875,
|
|
"completions/min_length": 328.0,
|
|
"completions/min_terminated_length": 328.0,
|
|
"epoch": 0.7833827893175074,
|
|
"grad_norm": 0.75,
|
|
"kl": 0.026422377151902765,
|
|
"learning_rate": 1.5414361432856475e-05,
|
|
"loss": 0.0404,
|
|
"num_tokens": 7684065.0,
|
|
"rewards/<lambda>/mean": 0.6828888654708862,
|
|
"rewards/<lambda>/std": 0.4722091257572174,
|
|
"step": 66
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0007859285714285715,
|
|
"advantage/max": 1.9337598085403442,
|
|
"advantage/mean": -0.028962209820747375,
|
|
"advantage/min": -2.696009635925293,
|
|
"advantage/std": 0.7832530736923218,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.2109375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1005.0,
|
|
"completions/mean_length": 777.4375,
|
|
"completions/mean_terminated_length": 711.5247192382812,
|
|
"completions/min_length": 292.0,
|
|
"completions/min_terminated_length": 292.0,
|
|
"epoch": 0.7952522255192879,
|
|
"grad_norm": 0.75390625,
|
|
"kl": 0.02613440022105351,
|
|
"learning_rate": 1.5238283824216015e-05,
|
|
"loss": 0.0734,
|
|
"num_tokens": 7798321.0,
|
|
"rewards/<lambda>/mean": 0.8016011118888855,
|
|
"rewards/<lambda>/std": 0.4042954444885254,
|
|
"step": 67
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0007978214285714288,
|
|
"advantage/max": 1.451474905014038,
|
|
"advantage/mean": -0.0028909663669764996,
|
|
"advantage/min": -1.9345837831497192,
|
|
"advantage/std": 0.3807392120361328,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0546875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1009.0,
|
|
"completions/mean_length": 654.34375,
|
|
"completions/mean_terminated_length": 632.9586791992188,
|
|
"completions/min_length": 384.0,
|
|
"completions/min_terminated_length": 384.0,
|
|
"epoch": 0.8071216617210683,
|
|
"grad_norm": 0.33984375,
|
|
"kl": 0.03040262428112328,
|
|
"learning_rate": 1.5059938862204126e-05,
|
|
"loss": 0.0118,
|
|
"num_tokens": 7896805.0,
|
|
"rewards/<lambda>/mean": 0.9445071220397949,
|
|
"rewards/<lambda>/std": 0.23162488639354706,
|
|
"step": 68
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0008097142857142858,
|
|
"advantage/max": 2.6956591606140137,
|
|
"advantage/mean": 0.04149559512734413,
|
|
"advantage/min": -1.9395079612731934,
|
|
"advantage/std": 0.7863659858703613,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.484375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 980.0,
|
|
"completions/mean_length": 824.3125,
|
|
"completions/mean_terminated_length": 636.727294921875,
|
|
"completions/min_length": 338.0,
|
|
"completions/min_terminated_length": 338.0,
|
|
"epoch": 0.8189910979228486,
|
|
"grad_norm": 0.6875,
|
|
"kl": 0.026372475258540362,
|
|
"learning_rate": 1.4879403742151283e-05,
|
|
"loss": -0.0153,
|
|
"num_tokens": 8017661.0,
|
|
"rewards/<lambda>/mean": 0.5564619302749634,
|
|
"rewards/<lambda>/std": 0.5049146413803101,
|
|
"step": 69
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0008216071428571429,
|
|
"advantage/max": 1.9278475046157837,
|
|
"advantage/mean": -0.023548610508441925,
|
|
"advantage/min": -2.695484161376953,
|
|
"advantage/std": 0.6201450824737549,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1640625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1013.0,
|
|
"completions/mean_length": 706.578125,
|
|
"completions/mean_terminated_length": 644.2803344726562,
|
|
"completions/min_length": 320.0,
|
|
"completions/min_terminated_length": 320.0,
|
|
"epoch": 0.8308605341246291,
|
|
"grad_norm": 0.59765625,
|
|
"kl": 0.02705864212475717,
|
|
"learning_rate": 1.469675660738206e-05,
|
|
"loss": 0.0395,
|
|
"num_tokens": 8121991.0,
|
|
"rewards/<lambda>/mean": 0.8400956392288208,
|
|
"rewards/<lambda>/std": 0.3730737268924713,
|
|
"step": 70
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0008335000000000001,
|
|
"advantage/max": 1.933438777923584,
|
|
"advantage/mean": -0.000690137967467308,
|
|
"advantage/min": -2.6953091621398926,
|
|
"advantage/std": 0.8231402039527893,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.4140625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 981.0,
|
|
"completions/mean_length": 791.5234375,
|
|
"completions/mean_terminated_length": 627.239990234375,
|
|
"completions/min_length": 301.0,
|
|
"completions/min_terminated_length": 301.0,
|
|
"epoch": 0.8427299703264095,
|
|
"grad_norm": 0.69140625,
|
|
"kl": 0.031296103610657156,
|
|
"learning_rate": 1.4512076515391375e-05,
|
|
"loss": 0.0387,
|
|
"num_tokens": 8237898.0,
|
|
"rewards/<lambda>/mean": 0.6578705310821533,
|
|
"rewards/<lambda>/std": 0.48294711112976074,
|
|
"step": 71
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0008453928571428573,
|
|
"advantage/max": 2.6951327323913574,
|
|
"advantage/mean": -0.011506309732794762,
|
|
"advantage/min": -2.695134162902832,
|
|
"advantage/std": 0.7828315496444702,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.3828125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1024.0,
|
|
"completions/mean_length": 771.1328125,
|
|
"completions/mean_terminated_length": 614.2911376953125,
|
|
"completions/min_length": 315.0,
|
|
"completions/min_terminated_length": 315.0,
|
|
"epoch": 0.8545994065281899,
|
|
"grad_norm": 0.64453125,
|
|
"kl": 0.028297776996623725,
|
|
"learning_rate": 1.4325443403625012e-05,
|
|
"loss": 0.0231,
|
|
"num_tokens": 8350979.0,
|
|
"rewards/<lambda>/mean": 0.6668483018875122,
|
|
"rewards/<lambda>/std": 0.4786072373390198,
|
|
"step": 72
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0008572857142857144,
|
|
"advantage/max": 2.201094388961792,
|
|
"advantage/mean": -0.009981157258152962,
|
|
"advantage/min": -2.6949591636657715,
|
|
"advantage/std": 0.7409846186637878,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.3203125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1009.0,
|
|
"completions/mean_length": 788.46875,
|
|
"completions/mean_terminated_length": 677.4712524414062,
|
|
"completions/min_length": 468.0,
|
|
"completions/min_terminated_length": 468.0,
|
|
"epoch": 0.8664688427299704,
|
|
"grad_norm": 0.6640625,
|
|
"kl": 0.026707628334406763,
|
|
"learning_rate": 1.4136938054879284e-05,
|
|
"loss": 0.0315,
|
|
"num_tokens": 8467959.0,
|
|
"rewards/<lambda>/mean": 0.7310893535614014,
|
|
"rewards/<lambda>/std": 0.4489072263240814,
|
|
"step": 73
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0008691785714285715,
|
|
"advantage/max": 2.6035120487213135,
|
|
"advantage/mean": -0.02285468950867653,
|
|
"advantage/min": -2.694784164428711,
|
|
"advantage/std": 0.8432738184928894,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.34375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 993.0,
|
|
"completions/mean_length": 747.890625,
|
|
"completions/mean_terminated_length": 603.2619018554688,
|
|
"completions/min_length": 275.0,
|
|
"completions/min_terminated_length": 275.0,
|
|
"epoch": 0.8783382789317508,
|
|
"grad_norm": 0.7890625,
|
|
"kl": 0.025898948137182742,
|
|
"learning_rate": 1.3946642062334765e-05,
|
|
"loss": 0.0603,
|
|
"num_tokens": 8582449.0,
|
|
"rewards/<lambda>/mean": 0.6668868660926819,
|
|
"rewards/<lambda>/std": 0.4785531759262085,
|
|
"step": 74
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0008810714285714287,
|
|
"advantage/max": 2.1163926124572754,
|
|
"advantage/mean": 0.0008571594953536987,
|
|
"advantage/min": -2.6946091651916504,
|
|
"advantage/std": 0.7458250522613525,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.328125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 985.0,
|
|
"completions/mean_length": 761.109375,
|
|
"completions/mean_terminated_length": 632.720947265625,
|
|
"completions/min_length": 304.0,
|
|
"completions/min_terminated_length": 304.0,
|
|
"epoch": 0.8902077151335311,
|
|
"grad_norm": 0.72265625,
|
|
"kl": 0.027754372102208436,
|
|
"learning_rate": 1.3754637794239303e-05,
|
|
"loss": 0.012,
|
|
"num_tokens": 8697839.0,
|
|
"rewards/<lambda>/mean": 0.6986972689628601,
|
|
"rewards/<lambda>/std": 0.46553197503089905,
|
|
"step": 75
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0008929642857142859,
|
|
"advantage/max": 1.0881941318511963,
|
|
"advantage/mean": -0.027605965733528137,
|
|
"advantage/min": -2.694434404373169,
|
|
"advantage/std": 0.5187329053878784,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.09375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1016.0,
|
|
"completions/mean_length": 667.125,
|
|
"completions/mean_terminated_length": 630.2069091796875,
|
|
"completions/min_length": 310.0,
|
|
"completions/min_terminated_length": 310.0,
|
|
"epoch": 0.9020771513353115,
|
|
"grad_norm": 0.59765625,
|
|
"kl": 0.024963520525489002,
|
|
"learning_rate": 1.356100835825547e-05,
|
|
"loss": 0.0458,
|
|
"num_tokens": 8797575.0,
|
|
"rewards/<lambda>/mean": 0.9130833148956299,
|
|
"rewards/<lambda>/std": 0.2845827043056488,
|
|
"step": 76
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.000904857142857143,
|
|
"advantage/max": 1.451042890548706,
|
|
"advantage/mean": -0.024881720542907715,
|
|
"advantage/min": -2.6942596435546875,
|
|
"advantage/std": 0.6890738010406494,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1009.0,
|
|
"completions/mean_length": 695.6796875,
|
|
"completions/mean_terminated_length": 619.9135131835938,
|
|
"completions/min_length": 370.0,
|
|
"completions/min_terminated_length": 370.0,
|
|
"epoch": 0.913946587537092,
|
|
"grad_norm": 0.65234375,
|
|
"kl": 0.025639849482104182,
|
|
"learning_rate": 1.3365837565488065e-05,
|
|
"loss": 0.0641,
|
|
"num_tokens": 8900822.0,
|
|
"rewards/<lambda>/mean": 0.8573490977287292,
|
|
"rewards/<lambda>/std": 0.3540341556072235,
|
|
"step": 77
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.00091675,
|
|
"advantage/max": 2.3416223526000977,
|
|
"advantage/mean": -0.04060763865709305,
|
|
"advantage/min": -2.694084644317627,
|
|
"advantage/std": 0.9484633803367615,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.328125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1019.0,
|
|
"completions/mean_length": 847.3984375,
|
|
"completions/mean_terminated_length": 761.1511840820312,
|
|
"completions/min_length": 422.0,
|
|
"completions/min_terminated_length": 422.0,
|
|
"epoch": 0.9258160237388724,
|
|
"grad_norm": 0.89453125,
|
|
"kl": 0.024255894881207496,
|
|
"learning_rate": 1.316920989420703e-05,
|
|
"loss": 0.0742,
|
|
"num_tokens": 9024257.0,
|
|
"rewards/<lambda>/mean": 0.611142635345459,
|
|
"rewards/<lambda>/std": 0.49569782614707947,
|
|
"step": 78
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0009286428571428573,
|
|
"advantage/max": 2.6939098834991455,
|
|
"advantage/mean": -0.004008917137980461,
|
|
"advantage/min": -2.6939098834991455,
|
|
"advantage/std": 0.7015594840049744,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.2578125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1013.0,
|
|
"completions/mean_length": 719.7578125,
|
|
"completions/mean_terminated_length": 614.07373046875,
|
|
"completions/min_length": 282.0,
|
|
"completions/min_terminated_length": 282.0,
|
|
"epoch": 0.9376854599406528,
|
|
"grad_norm": 0.875,
|
|
"kl": 0.026439815410412848,
|
|
"learning_rate": 1.2971210453281675e-05,
|
|
"loss": 0.024,
|
|
"num_tokens": 9131522.0,
|
|
"rewards/<lambda>/mean": 0.7622599005699158,
|
|
"rewards/<lambda>/std": 0.4313811957836151,
|
|
"step": 79
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0009405357142857144,
|
|
"advantage/max": 2.693735122680664,
|
|
"advantage/mean": 0.014469900168478489,
|
|
"advantage/min": -2.693735122680664,
|
|
"advantage/std": 0.8637734055519104,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.4609375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1020.0,
|
|
"completions/mean_length": 851.75,
|
|
"completions/mean_terminated_length": 704.4638061523438,
|
|
"completions/min_length": 286.0,
|
|
"completions/min_terminated_length": 286.0,
|
|
"epoch": 0.9495548961424333,
|
|
"grad_norm": 0.6484375,
|
|
"kl": 0.021778338530566543,
|
|
"learning_rate": 1.2771924945341906e-05,
|
|
"loss": 0.0056,
|
|
"num_tokens": 9258122.0,
|
|
"rewards/<lambda>/mean": 0.5738894939422607,
|
|
"rewards/<lambda>/std": 0.500784695148468,
|
|
"step": 80
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0009524285714285715,
|
|
"advantage/max": 1.8007384538650513,
|
|
"advantage/mean": -0.010520991869270802,
|
|
"advantage/min": -2.6935606002807617,
|
|
"advantage/std": 0.5462207794189453,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1328125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1001.0,
|
|
"completions/mean_length": 650.9921875,
|
|
"completions/mean_terminated_length": 593.8648681640625,
|
|
"completions/min_length": 314.0,
|
|
"completions/min_terminated_length": 314.0,
|
|
"epoch": 0.9614243323442137,
|
|
"grad_norm": 0.5078125,
|
|
"kl": 0.026499023369979113,
|
|
"learning_rate": 1.257143962968246e-05,
|
|
"loss": 0.0293,
|
|
"num_tokens": 9354697.0,
|
|
"rewards/<lambda>/mean": 0.8731256723403931,
|
|
"rewards/<lambda>/std": 0.3370078206062317,
|
|
"step": 81
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0009643214285714288,
|
|
"advantage/max": 2.693384885787964,
|
|
"advantage/mean": -0.013431357219815254,
|
|
"advantage/min": -2.6933858394622803,
|
|
"advantage/std": 0.7807292342185974,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.28125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1024.0,
|
|
"completions/mean_length": 772.140625,
|
|
"completions/mean_terminated_length": 673.5869750976562,
|
|
"completions/min_length": 368.0,
|
|
"completions/min_terminated_length": 368.0,
|
|
"epoch": 0.973293768545994,
|
|
"grad_norm": 0.78125,
|
|
"kl": 0.03216716053429991,
|
|
"learning_rate": 1.236984128492619e-05,
|
|
"loss": 0.0495,
|
|
"num_tokens": 9470627.0,
|
|
"rewards/<lambda>/mean": 0.7139476537704468,
|
|
"rewards/<lambda>/std": 0.4590918719768524,
|
|
"step": 82
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.0009762142857142858,
|
|
"advantage/max": 1.9324731826782227,
|
|
"advantage/mean": -0.009806068614125252,
|
|
"advantage/min": -2.178335189819336,
|
|
"advantage/std": 0.7266980409622192,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1006.0,
|
|
"completions/mean_length": 767.484375,
|
|
"completions/mean_terminated_length": 681.9791870117188,
|
|
"completions/min_length": 365.0,
|
|
"completions/min_terminated_length": 365.0,
|
|
"epoch": 0.9851632047477745,
|
|
"grad_norm": 1.125,
|
|
"kl": 0.030359028896782547,
|
|
"learning_rate": 1.2167217171462566e-05,
|
|
"loss": 0.0428,
|
|
"num_tokens": 9582473.0,
|
|
"rewards/<lambda>/mean": 0.7615800499916077,
|
|
"rewards/<lambda>/std": 0.43262478709220886,
|
|
"step": 83
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.000988107142857143,
|
|
"advantage/max": 2.693035840988159,
|
|
"advantage/mean": 0.011945459991693497,
|
|
"advantage/min": -2.6930365562438965,
|
|
"advantage/std": 0.6034679412841797,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.27941176470588236,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1004.0,
|
|
"completions/mean_length": 769.8235473632812,
|
|
"completions/mean_terminated_length": 671.2653198242188,
|
|
"completions/min_length": 347.0,
|
|
"completions/min_terminated_length": 347.0,
|
|
"epoch": 0.9970326409495549,
|
|
"grad_norm": 0.71484375,
|
|
"kl": 0.026008586573880166,
|
|
"learning_rate": 1.1963654993677645e-05,
|
|
"loss": 0.0113,
|
|
"num_tokens": 9693520.0,
|
|
"rewards/<lambda>/mean": 0.6994150280952454,
|
|
"rewards/<lambda>/std": 0.46441495418548584,
|
|
"step": 84
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.21393369138240814,
|
|
"advantage/mean": -0.006712201051414013,
|
|
"advantage/min": -1.9274110794067383,
|
|
"advantage/std": 0.2519371211528778,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.078125,
|
|
"completions/max_length": 996.0,
|
|
"completions/max_terminated_length": 996.0,
|
|
"completions/mean_length": 555.453125,
|
|
"completions/mean_terminated_length": 602.5254516601562,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 392.0,
|
|
"epoch": 1.0118694362017804,
|
|
"grad_norm": 0.0849609375,
|
|
"kl": NaN,
|
|
"learning_rate": 1.1759242861991855e-05,
|
|
"loss": -0.0265,
|
|
"num_tokens": 9780978.0,
|
|
"rewards/<lambda>/mean": 0.9203323721885681,
|
|
"rewards/<lambda>/std": 0.27474451065063477,
|
|
"step": 85
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.91951584815979,
|
|
"advantage/mean": -0.0033463314175605774,
|
|
"advantage/min": -1.9267429113388062,
|
|
"advantage/std": 0.38252419233322144,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1484375,
|
|
"completions/max_length": 1009.0,
|
|
"completions/max_terminated_length": 1009.0,
|
|
"completions/mean_length": 533.28125,
|
|
"completions/mean_terminated_length": 626.238525390625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 269.0,
|
|
"epoch": 1.0237388724035608,
|
|
"grad_norm": 1.125,
|
|
"kl": 0.025204737728927284,
|
|
"learning_rate": 1.155406925472205e-05,
|
|
"loss": -0.203,
|
|
"num_tokens": 9862646.0,
|
|
"rewards/<lambda>/mean": 0.8974473476409912,
|
|
"rewards/<lambda>/std": 0.3062177896499634,
|
|
"step": 86
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.9081087112426758,
|
|
"advantage/mean": -0.0033246770035475492,
|
|
"advantage/min": -1.9195520877838135,
|
|
"advantage/std": 0.2888992726802826,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.1015625,
|
|
"completions/max_length": 1014.0,
|
|
"completions/max_terminated_length": 1014.0,
|
|
"completions/mean_length": 614.8828125,
|
|
"completions/mean_terminated_length": 684.3912963867188,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 309.0,
|
|
"epoch": 1.0356083086053411,
|
|
"grad_norm": 0.265625,
|
|
"kl": NaN,
|
|
"learning_rate": 1.1348222979784289e-05,
|
|
"loss": -0.0699,
|
|
"num_tokens": 9954815.0,
|
|
"rewards/<lambda>/mean": 0.8970840573310852,
|
|
"rewards/<lambda>/std": 0.30730167031288147,
|
|
"step": 87
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.9123625755310059,
|
|
"advantage/mean": -0.012081624940037727,
|
|
"advantage/min": -1.9555816650390625,
|
|
"advantage/std": 0.4595913589000702,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.09375,
|
|
"completions/max_length": 1019.0,
|
|
"completions/max_terminated_length": 1019.0,
|
|
"completions/mean_length": 639.546875,
|
|
"completions/mean_terminated_length": 705.7069091796875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 418.0,
|
|
"epoch": 1.0474777448071217,
|
|
"grad_norm": 14.3125,
|
|
"kl": 0.19445816351799294,
|
|
"learning_rate": 1.1141793136253987e-05,
|
|
"loss": -0.1501,
|
|
"num_tokens": 10049965.0,
|
|
"rewards/<lambda>/mean": 0.9127764701843262,
|
|
"rewards/<lambda>/std": 0.28559643030166626,
|
|
"step": 88
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.21387545764446259,
|
|
"advantage/mean": -0.0033545014448463917,
|
|
"advantage/min": -1.926504373550415,
|
|
"advantage/std": 0.17813949286937714,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0625,
|
|
"completions/max_length": 1021.0,
|
|
"completions/max_terminated_length": 1021.0,
|
|
"completions/mean_length": 582.8828125,
|
|
"completions/mean_terminated_length": 621.74169921875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 342.0,
|
|
"epoch": 1.0593471810089021,
|
|
"grad_norm": 0.058349609375,
|
|
"kl": 0.02404517779359594,
|
|
"learning_rate": 1.09348690758e-05,
|
|
"loss": -0.0131,
|
|
"num_tokens": 10137102.0,
|
|
"rewards/<lambda>/mean": 0.992063581943512,
|
|
"rewards/<lambda>/std": 0.08979016542434692,
|
|
"step": 89
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.5069457292556763,
|
|
"advantage/mean": -0.007181974593549967,
|
|
"advantage/min": -1.9241938591003418,
|
|
"advantage/std": 0.5587527751922607,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1328125,
|
|
"completions/max_length": 1021.0,
|
|
"completions/max_terminated_length": 1021.0,
|
|
"completions/mean_length": 569.7734375,
|
|
"completions/mean_terminated_length": 657.0360717773438,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 302.0,
|
|
"epoch": 1.0712166172106825,
|
|
"grad_norm": 0.58984375,
|
|
"kl": 0.02127398713491857,
|
|
"learning_rate": 1.072754036400944e-05,
|
|
"loss": -0.2267,
|
|
"num_tokens": 10224241.0,
|
|
"rewards/<lambda>/mean": 0.8261239528656006,
|
|
"rewards/<lambda>/std": 0.38317152857780457,
|
|
"step": 90
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.9089659452438354,
|
|
"advantage/mean": -0.011963944882154465,
|
|
"advantage/min": -1.9220253229141235,
|
|
"advantage/std": 0.4585069715976715,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.09375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1024.0,
|
|
"completions/mean_length": 561.8671875,
|
|
"completions/mean_terminated_length": 619.9913940429688,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 370.0,
|
|
"epoch": 1.083086053412463,
|
|
"grad_norm": 0.38671875,
|
|
"kl": 0.02332187572028488,
|
|
"learning_rate": 1.0519896741619803e-05,
|
|
"loss": -0.1516,
|
|
"num_tokens": 10309520.0,
|
|
"rewards/<lambda>/mean": 0.9129678606987,
|
|
"rewards/<lambda>/std": 0.2849576473236084,
|
|
"step": 91
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.9065917730331421,
|
|
"advantage/mean": -0.010066758841276169,
|
|
"advantage/min": -1.9456684589385986,
|
|
"advantage/std": 0.4456890821456909,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1171875,
|
|
"completions/max_length": 995.0,
|
|
"completions/max_terminated_length": 995.0,
|
|
"completions/mean_length": 610.1640625,
|
|
"completions/mean_terminated_length": 691.1593017578125,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 481.0,
|
|
"epoch": 1.0949554896142433,
|
|
"grad_norm": 0.44921875,
|
|
"kl": 0.020190397452097386,
|
|
"learning_rate": 1.0312028085675393e-05,
|
|
"loss": -0.1532,
|
|
"num_tokens": 10400749.0,
|
|
"rewards/<lambda>/mean": 0.9132087230682373,
|
|
"rewards/<lambda>/std": 0.2841741144657135,
|
|
"step": 92
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9331722259521484,
|
|
"advantage/mean": -0.008558844216167927,
|
|
"advantage/min": -1.9150750637054443,
|
|
"advantage/std": 0.436222642660141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1484375,
|
|
"completions/max_length": 1008.0,
|
|
"completions/max_terminated_length": 1008.0,
|
|
"completions/mean_length": 550.3046875,
|
|
"completions/mean_terminated_length": 646.2293090820312,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 333.0,
|
|
"epoch": 1.1068249258160237,
|
|
"grad_norm": 0.67578125,
|
|
"kl": 0.03286611312068999,
|
|
"learning_rate": 1.0104024370624644e-05,
|
|
"loss": -0.1458,
|
|
"num_tokens": 10486644.0,
|
|
"rewards/<lambda>/mean": 0.8887128829956055,
|
|
"rewards/<lambda>/std": 0.31881752610206604,
|
|
"step": 93
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.5136432647705078,
|
|
"advantage/mean": -0.003937204834073782,
|
|
"advantage/min": -1.9212960004806519,
|
|
"advantage/std": 0.4224414825439453,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1015625,
|
|
"completions/max_length": 1013.0,
|
|
"completions/max_terminated_length": 1013.0,
|
|
"completions/mean_length": 574.9140625,
|
|
"completions/mean_terminated_length": 639.904296875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 312.0,
|
|
"epoch": 1.1186943620178043,
|
|
"grad_norm": 0.83203125,
|
|
"kl": 0.02204144821735099,
|
|
"learning_rate": 9.89597562937536e-06,
|
|
"loss": -0.1908,
|
|
"num_tokens": 10573689.0,
|
|
"rewards/<lambda>/mean": 0.8965259194374084,
|
|
"rewards/<lambda>/std": 0.3089805841445923,
|
|
"step": 94
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9286977052688599,
|
|
"advantage/mean": 0.003363626077771187,
|
|
"advantage/min": -0.22882387042045593,
|
|
"advantage/std": 0.17835266888141632,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0703125,
|
|
"completions/max_length": 978.0,
|
|
"completions/max_terminated_length": 978.0,
|
|
"completions/mean_length": 606.4609375,
|
|
"completions/mean_terminated_length": 652.3277587890625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 327.0,
|
|
"epoch": 1.1305637982195846,
|
|
"grad_norm": 1.46875,
|
|
"kl": 0.020696879364550114,
|
|
"learning_rate": 9.687971914324607e-06,
|
|
"loss": -0.1203,
|
|
"num_tokens": 10665316.0,
|
|
"rewards/<lambda>/mean": 0.9443669319152832,
|
|
"rewards/<lambda>/std": 0.2322123646736145,
|
|
"step": 95
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.1837575435638428,
|
|
"advantage/mean": -0.004825676325708628,
|
|
"advantage/min": -1.9381605386734009,
|
|
"advantage/std": 0.337948203086853,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.109375,
|
|
"completions/max_length": 1006.0,
|
|
"completions/max_terminated_length": 1006.0,
|
|
"completions/mean_length": 552.4296875,
|
|
"completions/mean_terminated_length": 620.2719116210938,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 382.0,
|
|
"epoch": 1.142433234421365,
|
|
"grad_norm": 0.55859375,
|
|
"kl": NaN,
|
|
"learning_rate": 9.480103258380198e-06,
|
|
"loss": -0.083,
|
|
"num_tokens": 10751243.0,
|
|
"rewards/<lambda>/mean": 0.8810471296310425,
|
|
"rewards/<lambda>/std": 0.32778117060661316,
|
|
"step": 96
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.21301649510860443,
|
|
"advantage/mean": -0.0032973499037325382,
|
|
"advantage/min": -1.9131762981414795,
|
|
"advantage/std": 0.17694886028766632,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 1003.0,
|
|
"completions/max_terminated_length": 1003.0,
|
|
"completions/mean_length": 573.46875,
|
|
"completions/mean_terminated_length": 591.9677124023438,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 333.0,
|
|
"epoch": 1.1543026706231454,
|
|
"grad_norm": 0.0595703125,
|
|
"kl": 0.021307097631506622,
|
|
"learning_rate": 9.272459635990563e-06,
|
|
"loss": -0.0131,
|
|
"num_tokens": 10838335.0,
|
|
"rewards/<lambda>/mean": 0.9921300411224365,
|
|
"rewards/<lambda>/std": 0.08903875201940536,
|
|
"step": 97
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.43134191632270813,
|
|
"advantage/mean": -0.00999000109732151,
|
|
"advantage/min": -1.9251052141189575,
|
|
"advantage/std": 0.3280654847621918,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.15625,
|
|
"completions/max_length": 1021.0,
|
|
"completions/max_terminated_length": 1021.0,
|
|
"completions/mean_length": 533.0,
|
|
"completions/mean_terminated_length": 631.7037353515625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 322.0,
|
|
"epoch": 1.1661721068249258,
|
|
"grad_norm": 0.185546875,
|
|
"kl": NaN,
|
|
"learning_rate": 9.065130924199998e-06,
|
|
"loss": -0.0342,
|
|
"num_tokens": 10920343.0,
|
|
"rewards/<lambda>/mean": 0.8421245813369751,
|
|
"rewards/<lambda>/std": 0.3683176040649414,
|
|
"step": 98
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.21354015171527863,
|
|
"advantage/mean": -0.006645852699875832,
|
|
"advantage/min": -1.9212852716445923,
|
|
"advantage/std": 0.25095441937446594,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0234375,
|
|
"completions/max_length": 1023.0,
|
|
"completions/max_terminated_length": 1023.0,
|
|
"completions/mean_length": 644.578125,
|
|
"completions/mean_terminated_length": 660.0480346679688,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 344.0,
|
|
"epoch": 1.1780415430267062,
|
|
"grad_norm": 0.08056640625,
|
|
"kl": 0.027586172567680478,
|
|
"learning_rate": 8.858206863746018e-06,
|
|
"loss": -0.0264,
|
|
"num_tokens": 11016305.0,
|
|
"rewards/<lambda>/mean": 0.9842005968093872,
|
|
"rewards/<lambda>/std": 0.12589696049690247,
|
|
"step": 99
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.9087148904800415,
|
|
"advantage/mean": -0.0066377646289765835,
|
|
"advantage/min": -1.9224987030029297,
|
|
"advantage/std": 0.339020699262619,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.1640625,
|
|
"completions/max_length": 1006.0,
|
|
"completions/max_terminated_length": 1006.0,
|
|
"completions/mean_length": 543.90625,
|
|
"completions/mean_terminated_length": 650.6541748046875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 312.0,
|
|
"epoch": 1.1899109792284865,
|
|
"grad_norm": 0.328125,
|
|
"kl": NaN,
|
|
"learning_rate": 8.651777020215713e-06,
|
|
"loss": -0.0832,
|
|
"num_tokens": 11104733.0,
|
|
"rewards/<lambda>/mean": 0.8897290229797363,
|
|
"rewards/<lambda>/std": 0.3159048557281494,
|
|
"step": 100
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.43116751313209534,
|
|
"advantage/mean": -0.006669612601399422,
|
|
"advantage/min": -1.9255216121673584,
|
|
"advantage/std": 0.2759830057621002,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 1021.0,
|
|
"completions/max_terminated_length": 1021.0,
|
|
"completions/mean_length": 573.8515625,
|
|
"completions/mean_terminated_length": 592.3629150390625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 359.0,
|
|
"epoch": 1.2017804154302671,
|
|
"grad_norm": 0.185546875,
|
|
"kl": 0.025343034823890775,
|
|
"learning_rate": 8.445930745277953e-06,
|
|
"loss": -0.0401,
|
|
"num_tokens": 11190650.0,
|
|
"rewards/<lambda>/mean": 0.9763144254684448,
|
|
"rewards/<lambda>/std": 0.15349189937114716,
|
|
"step": 101
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.9046536087989807,
|
|
"advantage/mean": -0.006608244962990284,
|
|
"advantage/min": -1.923117995262146,
|
|
"advantage/std": 0.33801230788230896,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0703125,
|
|
"completions/max_length": 1017.0,
|
|
"completions/max_terminated_length": 1017.0,
|
|
"completions/mean_length": 676.984375,
|
|
"completions/mean_terminated_length": 728.1849365234375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 365.0,
|
|
"epoch": 1.2136498516320475,
|
|
"grad_norm": 0.2373046875,
|
|
"kl": 0.023159809643402696,
|
|
"learning_rate": 8.240757138008149e-06,
|
|
"loss": -0.0829,
|
|
"num_tokens": 11293840.0,
|
|
"rewards/<lambda>/mean": 0.9528600573539734,
|
|
"rewards/<lambda>/std": 0.21340253949165344,
|
|
"step": 102
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.9051553606987,
|
|
"advantage/mean": -0.0019344771280884743,
|
|
"advantage/min": -1.1846874952316284,
|
|
"advantage/std": 0.318952351808548,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1328125,
|
|
"completions/max_length": 1021.0,
|
|
"completions/max_terminated_length": 1021.0,
|
|
"completions/mean_length": 570.0078125,
|
|
"completions/mean_terminated_length": 657.3063354492188,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 338.0,
|
|
"epoch": 1.225519287833828,
|
|
"grad_norm": 0.55078125,
|
|
"kl": 0.021536219341214746,
|
|
"learning_rate": 8.036345006322358e-06,
|
|
"loss": -0.0976,
|
|
"num_tokens": 11382169.0,
|
|
"rewards/<lambda>/mean": 0.8820893168449402,
|
|
"rewards/<lambda>/std": 0.32490062713623047,
|
|
"step": 103
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9108253717422485,
|
|
"advantage/mean": -0.003229904919862747,
|
|
"advantage/min": -1.917354941368103,
|
|
"advantage/std": 0.4715126156806946,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.1640625,
|
|
"completions/max_length": 990.0,
|
|
"completions/max_terminated_length": 990.0,
|
|
"completions/mean_length": 562.8984375,
|
|
"completions/mean_terminated_length": 673.3738403320312,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 312.0,
|
|
"epoch": 1.2373887240356083,
|
|
"grad_norm": 0.91015625,
|
|
"kl": NaN,
|
|
"learning_rate": 7.832782828537437e-06,
|
|
"loss": -0.1913,
|
|
"num_tokens": 11468252.0,
|
|
"rewards/<lambda>/mean": 0.8420026302337646,
|
|
"rewards/<lambda>/std": 0.3686053454875946,
|
|
"step": 104
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9139962196350098,
|
|
"advantage/mean": 2.95368954539299e-06,
|
|
"advantage/min": -1.9132927656173706,
|
|
"advantage/std": 0.2503484785556793,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0546875,
|
|
"completions/max_length": 977.0,
|
|
"completions/max_terminated_length": 977.0,
|
|
"completions/mean_length": 574.703125,
|
|
"completions/mean_terminated_length": 607.9503784179688,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 308.0,
|
|
"epoch": 1.2492581602373887,
|
|
"grad_norm": 0.6640625,
|
|
"kl": 0.022887362516485155,
|
|
"learning_rate": 7.630158715073813e-06,
|
|
"loss": -0.1163,
|
|
"num_tokens": 11558062.0,
|
|
"rewards/<lambda>/mean": 0.9370107650756836,
|
|
"rewards/<lambda>/std": 0.2449151873588562,
|
|
"step": 105
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9214539527893066,
|
|
"advantage/mean": 0.006719962228089571,
|
|
"advantage/min": -1.935305118560791,
|
|
"advantage/std": 0.4798320233821869,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.375,
|
|
"completions/max_length": 1006.0,
|
|
"completions/max_terminated_length": 1006.0,
|
|
"completions/mean_length": 412.5078125,
|
|
"completions/mean_terminated_length": 660.0125122070312,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 387.0,
|
|
"epoch": 1.2611275964391693,
|
|
"grad_norm": 1.5234375,
|
|
"kl": NaN,
|
|
"learning_rate": 7.428560370317542e-06,
|
|
"loss": -0.2411,
|
|
"num_tokens": 11631071.0,
|
|
"rewards/<lambda>/mean": 0.6590052247047424,
|
|
"rewards/<lambda>/std": 0.4813198745250702,
|
|
"step": 106
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.917263150215149,
|
|
"advantage/mean": -0.002014155499637127,
|
|
"advantage/min": -1.9239046573638916,
|
|
"advantage/std": 0.4786885380744934,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.1796875,
|
|
"completions/max_length": 969.0,
|
|
"completions/max_terminated_length": 969.0,
|
|
"completions/mean_length": 518.59375,
|
|
"completions/mean_terminated_length": 632.1904907226562,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 439.0,
|
|
"epoch": 1.2729970326409497,
|
|
"grad_norm": 1.4609375,
|
|
"kl": NaN,
|
|
"learning_rate": 7.228075054658096e-06,
|
|
"loss": -0.2252,
|
|
"num_tokens": 11711595.0,
|
|
"rewards/<lambda>/mean": 0.8180580139160156,
|
|
"rewards/<lambda>/std": 0.39027684926986694,
|
|
"step": 107
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9152618646621704,
|
|
"advantage/mean": -0.008685033768415451,
|
|
"advantage/min": -1.9196232557296753,
|
|
"advantage/std": 0.5053057670593262,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.2109375,
|
|
"completions/max_length": 1021.0,
|
|
"completions/max_terminated_length": 1021.0,
|
|
"completions/mean_length": 519.5234375,
|
|
"completions/mean_terminated_length": 658.4059448242188,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 327.0,
|
|
"epoch": 1.28486646884273,
|
|
"grad_norm": 1.03125,
|
|
"kl": NaN,
|
|
"learning_rate": 7.028789546718327e-06,
|
|
"loss": -0.2851,
|
|
"num_tokens": 11793686.0,
|
|
"rewards/<lambda>/mean": 0.7860877513885498,
|
|
"rewards/<lambda>/std": 0.41535890102386475,
|
|
"step": 108
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.43442410230636597,
|
|
"advantage/mean": -0.013488471508026123,
|
|
"advantage/min": -1.912493109703064,
|
|
"advantage/std": 0.4084751605987549,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 1012.0,
|
|
"completions/max_terminated_length": 1012.0,
|
|
"completions/mean_length": 507.8671875,
|
|
"completions/mean_terminated_length": 677.15625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 330.0,
|
|
"epoch": 1.2967359050445104,
|
|
"grad_norm": 0.2255859375,
|
|
"kl": NaN,
|
|
"learning_rate": 6.8307901057929735e-06,
|
|
"loss": -0.0944,
|
|
"num_tokens": 11875269.0,
|
|
"rewards/<lambda>/mean": 0.7525733113288879,
|
|
"rewards/<lambda>/std": 0.43941107392311096,
|
|
"step": 109
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.1851118803024292,
|
|
"advantage/mean": -0.0014121305430307984,
|
|
"advantage/min": -1.520728349685669,
|
|
"advantage/std": 0.30872422456741333,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.328125,
|
|
"completions/max_length": 1011.0,
|
|
"completions/max_terminated_length": 1011.0,
|
|
"completions/mean_length": 432.234375,
|
|
"completions/mean_terminated_length": 643.3255615234375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 368.0,
|
|
"epoch": 1.3086053412462908,
|
|
"grad_norm": 0.458984375,
|
|
"kl": NaN,
|
|
"learning_rate": 6.634162434511939e-06,
|
|
"loss": -0.1009,
|
|
"num_tokens": 11946891.0,
|
|
"rewards/<lambda>/mean": 0.690848708152771,
|
|
"rewards/<lambda>/std": 0.46886134147644043,
|
|
"step": 110
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.9072984457015991,
|
|
"advantage/mean": -0.008659638464450836,
|
|
"advantage/min": -1.9238336086273193,
|
|
"advantage/std": 0.4230641722679138,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.21875,
|
|
"completions/max_length": 999.0,
|
|
"completions/max_terminated_length": 999.0,
|
|
"completions/mean_length": 547.6875,
|
|
"completions/mean_terminated_length": 701.0399780273438,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 359.0,
|
|
"epoch": 1.3204747774480712,
|
|
"grad_norm": 0.4375,
|
|
"kl": NaN,
|
|
"learning_rate": 6.438991641744531e-06,
|
|
"loss": -0.1383,
|
|
"num_tokens": 12035875.0,
|
|
"rewards/<lambda>/mean": 0.7936596870422363,
|
|
"rewards/<lambda>/std": 0.4103051424026489,
|
|
"step": 111
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9188259840011597,
|
|
"advantage/mean": 0.0033213591668754816,
|
|
"advantage/min": -0.22531408071517944,
|
|
"advantage/std": 0.1774940937757492,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.2421875,
|
|
"completions/max_length": 995.0,
|
|
"completions/max_terminated_length": 995.0,
|
|
"completions/mean_length": 461.28125,
|
|
"completions/mean_terminated_length": 608.7009887695312,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 341.0,
|
|
"epoch": 1.3323442136498516,
|
|
"grad_norm": 0.7890625,
|
|
"kl": NaN,
|
|
"learning_rate": 6.245362205760703e-06,
|
|
"loss": -0.1197,
|
|
"num_tokens": 12112143.0,
|
|
"rewards/<lambda>/mean": 0.7544768452644348,
|
|
"rewards/<lambda>/std": 0.43602651357650757,
|
|
"step": 112
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9209489822387695,
|
|
"advantage/mean": 0.0013821604661643505,
|
|
"advantage/min": -1.1878427267074585,
|
|
"advantage/std": 0.36598560214042664,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.3046875,
|
|
"completions/max_length": 1021.0,
|
|
"completions/max_terminated_length": 1021.0,
|
|
"completions/mean_length": 477.5703125,
|
|
"completions/mean_terminated_length": 686.8427124023438,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 358.0,
|
|
"epoch": 1.344213649851632,
|
|
"grad_norm": 1.234375,
|
|
"kl": NaN,
|
|
"learning_rate": 6.053357937665237e-06,
|
|
"loss": -0.2179,
|
|
"num_tokens": 12189344.0,
|
|
"rewards/<lambda>/mean": 0.699212908744812,
|
|
"rewards/<lambda>/std": 0.46472683548927307,
|
|
"step": 113
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9212185144424438,
|
|
"advantage/mean": -0.003442929359152913,
|
|
"advantage/min": -1.9365557432174683,
|
|
"advantage/std": 0.4007267653942108,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 1015.0,
|
|
"completions/max_terminated_length": 1015.0,
|
|
"completions/mean_length": 351.875,
|
|
"completions/mean_terminated_length": 703.75,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 407.0,
|
|
"epoch": 1.3560830860534125,
|
|
"grad_norm": 1.203125,
|
|
"kl": NaN,
|
|
"learning_rate": 5.863061945120719e-06,
|
|
"loss": -0.2171,
|
|
"num_tokens": 12259648.0,
|
|
"rewards/<lambda>/mean": 0.5091111063957214,
|
|
"rewards/<lambda>/std": 0.5084741711616516,
|
|
"step": 114
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.517584204673767,
|
|
"advantage/mean": -0.001994214951992035,
|
|
"advantage/min": -1.9394543170928955,
|
|
"advantage/std": 0.4252905547618866,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.5078125,
|
|
"completions/max_length": 1012.0,
|
|
"completions/max_terminated_length": 1012.0,
|
|
"completions/mean_length": 319.71875,
|
|
"completions/mean_terminated_length": 649.5873413085938,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 284.0,
|
|
"epoch": 1.367952522255193,
|
|
"grad_norm": 0.8125,
|
|
"kl": NaN,
|
|
"learning_rate": 5.674556596374993e-06,
|
|
"loss": -0.2067,
|
|
"num_tokens": 12325268.0,
|
|
"rewards/<lambda>/mean": 0.5073824524879456,
|
|
"rewards/<lambda>/std": 0.5102729201316833,
|
|
"step": 115
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9323999881744385,
|
|
"advantage/mean": 0.00012079346925020218,
|
|
"advantage/min": -1.9256129264831543,
|
|
"advantage/std": 0.37475845217704773,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.4296875,
|
|
"completions/max_length": 1016.0,
|
|
"completions/max_terminated_length": 1016.0,
|
|
"completions/mean_length": 391.75,
|
|
"completions/mean_terminated_length": 686.9041137695312,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 273.0,
|
|
"epoch": 1.3798219584569733,
|
|
"grad_norm": 1.3046875,
|
|
"kl": NaN,
|
|
"learning_rate": 5.487923484608629e-06,
|
|
"loss": -0.2424,
|
|
"num_tokens": 12399532.0,
|
|
"rewards/<lambda>/mean": 0.5627472400665283,
|
|
"rewards/<lambda>/std": 0.5057460069656372,
|
|
"step": 116
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.9130436182022095,
|
|
"advantage/mean": -0.0118994927033782,
|
|
"advantage/min": -1.9188669919967651,
|
|
"advantage/std": 0.45863908529281616,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.328125,
|
|
"completions/max_length": 1005.0,
|
|
"completions/max_terminated_length": 1005.0,
|
|
"completions/mean_length": 448.140625,
|
|
"completions/mean_terminated_length": 667.0,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 383.0,
|
|
"epoch": 1.3916913946587537,
|
|
"grad_norm": 0.4453125,
|
|
"kl": NaN,
|
|
"learning_rate": 5.3032433926179395e-06,
|
|
"loss": -0.1343,
|
|
"num_tokens": 12480790.0,
|
|
"rewards/<lambda>/mean": 0.6595108509063721,
|
|
"rewards/<lambda>/std": 0.480607807636261,
|
|
"step": 117
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9190807342529297,
|
|
"advantage/mean": -8.095055818557739e-05,
|
|
"advantage/min": -1.9297515153884888,
|
|
"advantage/std": 0.4942281246185303,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.3828125,
|
|
"completions/max_length": 1023.0,
|
|
"completions/max_terminated_length": 1023.0,
|
|
"completions/mean_length": 432.6328125,
|
|
"completions/mean_terminated_length": 700.9747314453125,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 443.0,
|
|
"epoch": 1.403560830860534,
|
|
"grad_norm": 1.515625,
|
|
"kl": NaN,
|
|
"learning_rate": 5.120596257848716e-06,
|
|
"loss": -0.3544,
|
|
"num_tokens": 12554311.0,
|
|
"rewards/<lambda>/mean": 0.6272512674331665,
|
|
"rewards/<lambda>/std": 0.49126946926116943,
|
|
"step": 118
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.930235743522644,
|
|
"advantage/mean": -0.00047219451516866684,
|
|
"advantage/min": -1.9093363285064697,
|
|
"advantage/std": 0.44188758730888367,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.4296875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1024.0,
|
|
"completions/mean_length": 386.15625,
|
|
"completions/mean_terminated_length": 677.0958862304688,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 358.0,
|
|
"epoch": 1.4154302670623147,
|
|
"grad_norm": 1.7265625,
|
|
"kl": NaN,
|
|
"learning_rate": 4.940061137795876e-06,
|
|
"loss": -0.3363,
|
|
"num_tokens": 12626651.0,
|
|
"rewards/<lambda>/mean": 0.5810023546218872,
|
|
"rewards/<lambda>/std": 0.500396192073822,
|
|
"step": 119
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9287009239196777,
|
|
"advantage/mean": -0.0020257970318198204,
|
|
"advantage/min": -1.9468318223953247,
|
|
"advantage/std": 0.49218693375587463,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.3203125,
|
|
"completions/max_length": 1003.0,
|
|
"completions/max_terminated_length": 1003.0,
|
|
"completions/mean_length": 544.4609375,
|
|
"completions/mean_terminated_length": 801.0459594726562,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 454.0,
|
|
"epoch": 1.427299703264095,
|
|
"grad_norm": 0.68359375,
|
|
"kl": NaN,
|
|
"learning_rate": 4.7617161757839895e-06,
|
|
"loss": -0.2186,
|
|
"num_tokens": 12711518.0,
|
|
"rewards/<lambda>/mean": 0.699238657951355,
|
|
"rewards/<lambda>/std": 0.4646891951560974,
|
|
"step": 120
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.930548071861267,
|
|
"advantage/mean": 0.011293224059045315,
|
|
"advantage/min": -1.929141879081726,
|
|
"advantage/std": 0.6371313333511353,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.6484375,
|
|
"completions/max_length": 1016.0,
|
|
"completions/max_terminated_length": 1016.0,
|
|
"completions/mean_length": 271.2265625,
|
|
"completions/mean_terminated_length": 771.4888916015625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 444.0,
|
|
"epoch": 1.4391691394658754,
|
|
"grad_norm": 2.171875,
|
|
"kl": NaN,
|
|
"learning_rate": 4.5856385671435285e-06,
|
|
"loss": -0.7817,
|
|
"num_tokens": 12762531.0,
|
|
"rewards/<lambda>/mean": 0.35258346796035767,
|
|
"rewards/<lambda>/std": 0.486828476190567,
|
|
"step": 121
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.917569875717163,
|
|
"advantage/mean": -0.006770431064069271,
|
|
"advantage/min": -1.933653473854065,
|
|
"advantage/std": 0.4526441693305969,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.4140625,
|
|
"completions/max_length": 1008.0,
|
|
"completions/max_terminated_length": 1008.0,
|
|
"completions/mean_length": 420.2421875,
|
|
"completions/mean_terminated_length": 717.21337890625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 427.0,
|
|
"epoch": 1.4510385756676558,
|
|
"grad_norm": 0.99609375,
|
|
"kl": NaN,
|
|
"learning_rate": 4.411904525797408e-06,
|
|
"loss": -0.1954,
|
|
"num_tokens": 12832634.0,
|
|
"rewards/<lambda>/mean": 0.5567037463188171,
|
|
"rewards/<lambda>/std": 0.5046417713165283,
|
|
"step": 122
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9111862182617188,
|
|
"advantage/mean": -0.0014686351642012596,
|
|
"advantage/min": -1.923195481300354,
|
|
"advantage/std": 0.5580769777297974,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.5390625,
|
|
"completions/max_length": 1006.0,
|
|
"completions/max_terminated_length": 1006.0,
|
|
"completions/mean_length": 329.859375,
|
|
"completions/mean_terminated_length": 715.6271362304688,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 343.0,
|
|
"epoch": 1.4629080118694362,
|
|
"grad_norm": 1.390625,
|
|
"kl": NaN,
|
|
"learning_rate": 4.240589251272342e-06,
|
|
"loss": -0.2905,
|
|
"num_tokens": 12889200.0,
|
|
"rewards/<lambda>/mean": 0.46842291951179504,
|
|
"rewards/<lambda>/std": 0.5092353820800781,
|
|
"step": 123
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.1872975826263428,
|
|
"advantage/mean": -0.0027686082758009434,
|
|
"advantage/min": -1.9116945266723633,
|
|
"advantage/std": 0.5272735953330994,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.609375,
|
|
"completions/max_length": 1021.0,
|
|
"completions/max_terminated_length": 1021.0,
|
|
"completions/mean_length": 294.1640625,
|
|
"completions/mean_terminated_length": 753.0599975585938,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 456.0,
|
|
"epoch": 1.4747774480712166,
|
|
"grad_norm": 0.6484375,
|
|
"kl": NaN,
|
|
"learning_rate": 4.0717668961492725e-06,
|
|
"loss": -0.1927,
|
|
"num_tokens": 12943093.0,
|
|
"rewards/<lambda>/mean": 0.4538976550102234,
|
|
"rewards/<lambda>/std": 0.5069791674613953,
|
|
"step": 124
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.517442226409912,
|
|
"advantage/mean": 0.002009883988648653,
|
|
"advantage/min": -1.930153250694275,
|
|
"advantage/std": 0.7648005485534668,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.546875,
|
|
"completions/max_length": 1023.0,
|
|
"completions/max_terminated_length": 1023.0,
|
|
"completions/mean_length": 351.2890625,
|
|
"completions/mean_terminated_length": 775.2586059570312,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 461.0,
|
|
"epoch": 1.486646884272997,
|
|
"grad_norm": 1.6328125,
|
|
"kl": NaN,
|
|
"learning_rate": 3.905510533966959e-06,
|
|
"loss": -0.5738,
|
|
"num_tokens": 13002234.0,
|
|
"rewards/<lambda>/mean": 0.5327528715133667,
|
|
"rewards/<lambda>/std": 0.5072947144508362,
|
|
"step": 125
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9298175573349,
|
|
"advantage/mean": -0.0018671993166208267,
|
|
"advantage/min": -1.921625018119812,
|
|
"advantage/std": 0.5027943849563599,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.40625,
|
|
"completions/max_length": 1001.0,
|
|
"completions/max_terminated_length": 1001.0,
|
|
"completions/mean_length": 425.0859375,
|
|
"completions/mean_terminated_length": 715.9342041015625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 399.0,
|
|
"epoch": 1.4985163204747773,
|
|
"grad_norm": 1.484375,
|
|
"kl": NaN,
|
|
"learning_rate": 3.7418921275926245e-06,
|
|
"loss": -0.3583,
|
|
"num_tokens": 13071213.0,
|
|
"rewards/<lambda>/mean": 0.6043626666069031,
|
|
"rewards/<lambda>/std": 0.49610209465026855,
|
|
"step": 126
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9274595975875854,
|
|
"advantage/mean": 0.016750790178775787,
|
|
"advantage/min": -1.9122111797332764,
|
|
"advantage/std": 0.5896896123886108,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.546875,
|
|
"completions/max_length": 1013.0,
|
|
"completions/max_terminated_length": 1013.0,
|
|
"completions/mean_length": 341.2421875,
|
|
"completions/mean_terminated_length": 753.086181640625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 450.0,
|
|
"epoch": 1.5103857566765577,
|
|
"grad_norm": 1.9296875,
|
|
"kl": NaN,
|
|
"learning_rate": 3.5809824980733445e-06,
|
|
"loss": -0.5838,
|
|
"num_tokens": 13128844.0,
|
|
"rewards/<lambda>/mean": 0.48590797185897827,
|
|
"rewards/<lambda>/std": 0.5081177353858948,
|
|
"step": 127
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9181711673736572,
|
|
"advantage/mean": -0.0008597676642239094,
|
|
"advantage/min": -1.9336591958999634,
|
|
"advantage/std": 0.5854839086532593,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.3828125,
|
|
"completions/max_length": 1020.0,
|
|
"completions/max_terminated_length": 1020.0,
|
|
"completions/mean_length": 436.5546875,
|
|
"completions/mean_terminated_length": 707.3291015625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 347.0,
|
|
"epoch": 1.5222551928783383,
|
|
"grad_norm": 1.0390625,
|
|
"kl": NaN,
|
|
"learning_rate": 3.422851293981676e-06,
|
|
"loss": -0.2994,
|
|
"num_tokens": 13198155.0,
|
|
"rewards/<lambda>/mean": 0.6356991529464722,
|
|
"rewards/<lambda>/std": 0.48831233382225037,
|
|
"step": 128
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9237265586853027,
|
|
"advantage/mean": 0.008581508882343769,
|
|
"advantage/min": -1.51692533493042,
|
|
"advantage/std": 0.5177173614501953,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.4921875,
|
|
"completions/max_length": 1021.0,
|
|
"completions/max_terminated_length": 1021.0,
|
|
"completions/mean_length": 347.5078125,
|
|
"completions/mean_terminated_length": 684.3230590820312,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 356.0,
|
|
"epoch": 1.5341246290801187,
|
|
"grad_norm": 3.625,
|
|
"kl": NaN,
|
|
"learning_rate": 3.2675669612687565e-06,
|
|
"loss": -0.4465,
|
|
"num_tokens": 13259236.0,
|
|
"rewards/<lambda>/mean": 0.5096697807312012,
|
|
"rewards/<lambda>/std": 0.5079067945480347,
|
|
"step": 129
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9265636205673218,
|
|
"advantage/mean": 0.008637513034045696,
|
|
"advantage/min": -1.9244918823242188,
|
|
"advantage/std": 0.6367316246032715,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.390625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1024.0,
|
|
"completions/mean_length": 455.984375,
|
|
"completions/mean_terminated_length": 748.2820434570312,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 402.0,
|
|
"epoch": 1.545994065281899,
|
|
"grad_norm": 1.5703125,
|
|
"kl": NaN,
|
|
"learning_rate": 3.115196713638e-06,
|
|
"loss": -0.5401,
|
|
"num_tokens": 13331842.0,
|
|
"rewards/<lambda>/mean": 0.6362784504890442,
|
|
"rewards/<lambda>/std": 0.4875337481498718,
|
|
"step": 130
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.5149365663528442,
|
|
"advantage/mean": -0.008159506134688854,
|
|
"advantage/min": -1.940788984298706,
|
|
"advantage/std": 0.6248160600662231,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.4296875,
|
|
"completions/max_length": 1023.0,
|
|
"completions/max_terminated_length": 1023.0,
|
|
"completions/mean_length": 424.234375,
|
|
"completions/mean_terminated_length": 743.863037109375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 380.0,
|
|
"epoch": 1.5578635014836797,
|
|
"grad_norm": 0.8515625,
|
|
"kl": NaN,
|
|
"learning_rate": 2.965806503452098e-06,
|
|
"loss": -0.3634,
|
|
"num_tokens": 13399568.0,
|
|
"rewards/<lambda>/mean": 0.5797673463821411,
|
|
"rewards/<lambda>/std": 0.5018887519836426,
|
|
"step": 131
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.43188193440437317,
|
|
"advantage/mean": -0.016634924337267876,
|
|
"advantage/min": -1.9263575077056885,
|
|
"advantage/std": 0.4129045605659485,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.1171875,
|
|
"completions/max_length": 1008.0,
|
|
"completions/max_terminated_length": 1008.0,
|
|
"completions/mean_length": 623.640625,
|
|
"completions/mean_terminated_length": 706.4248046875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 418.0,
|
|
"epoch": 1.56973293768546,
|
|
"grad_norm": 0.330078125,
|
|
"kl": NaN,
|
|
"learning_rate": 2.819460993186032e-06,
|
|
"loss": -0.0477,
|
|
"num_tokens": 13491794.0,
|
|
"rewards/<lambda>/mean": 0.8886122703552246,
|
|
"rewards/<lambda>/std": 0.31911179423332214,
|
|
"step": 132
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9200443029403687,
|
|
"advantage/mean": -0.011932725086808205,
|
|
"advantage/min": -1.921685814857483,
|
|
"advantage/std": 0.5687617063522339,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.28125,
|
|
"completions/max_length": 1004.0,
|
|
"completions/max_terminated_length": 1004.0,
|
|
"completions/mean_length": 541.3984375,
|
|
"completions/mean_terminated_length": 753.25,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 428.0,
|
|
"epoch": 1.5816023738872405,
|
|
"grad_norm": 0.7109375,
|
|
"kl": NaN,
|
|
"learning_rate": 2.6762235274383775e-06,
|
|
"loss": -0.243,
|
|
"num_tokens": 13574709.0,
|
|
"rewards/<lambda>/mean": 0.7469240427017212,
|
|
"rewards/<lambda>/std": 0.4400651156902313,
|
|
"step": 133
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.5213267803192139,
|
|
"advantage/mean": -0.003202416468411684,
|
|
"advantage/min": -1.9273756742477417,
|
|
"advantage/std": 0.614234983921051,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.296875,
|
|
"completions/max_length": 1018.0,
|
|
"completions/max_terminated_length": 1018.0,
|
|
"completions/mean_length": 517.7578125,
|
|
"completions/mean_terminated_length": 736.36669921875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 440.0,
|
|
"epoch": 1.5934718100890208,
|
|
"grad_norm": 1.5625,
|
|
"kl": 0.036314057069830596,
|
|
"learning_rate": 2.5361561055130625e-06,
|
|
"loss": -0.4678,
|
|
"num_tokens": 13653710.0,
|
|
"rewards/<lambda>/mean": 0.7460275888442993,
|
|
"rewards/<lambda>/std": 0.4416311979293823,
|
|
"step": 134
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.5160837173461914,
|
|
"advantage/mean": -0.0012700525112450123,
|
|
"advantage/min": -1.9151767492294312,
|
|
"advantage/std": 0.6045173406600952,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.3984375,
|
|
"completions/max_length": 1019.0,
|
|
"completions/max_terminated_length": 1019.0,
|
|
"completions/mean_length": 467.28125,
|
|
"completions/mean_terminated_length": 776.7792358398438,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 417.0,
|
|
"epoch": 1.6053412462908012,
|
|
"grad_norm": 1.296875,
|
|
"kl": NaN,
|
|
"learning_rate": 2.3993193545834182e-06,
|
|
"loss": -0.4441,
|
|
"num_tokens": 13728250.0,
|
|
"rewards/<lambda>/mean": 0.6352561712265015,
|
|
"rewards/<lambda>/std": 0.4889114499092102,
|
|
"step": 135
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.5056098699569702,
|
|
"advantage/mean": -0.008613115176558495,
|
|
"advantage/min": -1.9262280464172363,
|
|
"advantage/std": 0.5049684047698975,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.328125,
|
|
"completions/max_length": 1006.0,
|
|
"completions/max_terminated_length": 1006.0,
|
|
"completions/mean_length": 449.8984375,
|
|
"completions/mean_terminated_length": 669.6162719726562,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 373.0,
|
|
"epoch": 1.6172106824925816,
|
|
"grad_norm": 0.765625,
|
|
"kl": NaN,
|
|
"learning_rate": 2.265772503450122e-06,
|
|
"loss": -0.2458,
|
|
"num_tokens": 13800061.0,
|
|
"rewards/<lambda>/mean": 0.6753218770027161,
|
|
"rewards/<lambda>/std": 0.47482624650001526,
|
|
"step": 136
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9252156019210815,
|
|
"advantage/mean": -0.006081531755626202,
|
|
"advantage/min": -1.9355872869491577,
|
|
"advantage/std": 0.6520951986312866,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.4375,
|
|
"completions/max_length": 1018.0,
|
|
"completions/max_terminated_length": 1018.0,
|
|
"completions/mean_length": 406.21875,
|
|
"completions/mean_terminated_length": 722.1666870117188,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 348.0,
|
|
"epoch": 1.629080118694362,
|
|
"grad_norm": 1.453125,
|
|
"kl": NaN,
|
|
"learning_rate": 2.1355733569044633e-06,
|
|
"loss": -0.5079,
|
|
"num_tokens": 13865777.0,
|
|
"rewards/<lambda>/mean": 0.6048412322998047,
|
|
"rewards/<lambda>/std": 0.4955025315284729,
|
|
"step": 137
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.6603893637657166,
|
|
"advantage/mean": -0.015362613834440708,
|
|
"advantage/min": -1.927195429801941,
|
|
"advantage/std": 0.4361397624015808,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.1796875,
|
|
"completions/max_length": 1016.0,
|
|
"completions/max_terminated_length": 1016.0,
|
|
"completions/mean_length": 550.1796875,
|
|
"completions/mean_terminated_length": 670.6952514648438,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 366.0,
|
|
"epoch": 1.6409495548961424,
|
|
"grad_norm": 0.263671875,
|
|
"kl": NaN,
|
|
"learning_rate": 2.008778270707944e-06,
|
|
"loss": -0.081,
|
|
"num_tokens": 13952608.0,
|
|
"rewards/<lambda>/mean": 0.8092094659805298,
|
|
"rewards/<lambda>/std": 0.39872902631759644,
|
|
"step": 138
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.191937804222107,
|
|
"advantage/mean": -0.004746164195239544,
|
|
"advantage/min": -1.928928017616272,
|
|
"advantage/std": 0.3572968542575836,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.2421875,
|
|
"completions/max_length": 1021.0,
|
|
"completions/max_terminated_length": 1021.0,
|
|
"completions/mean_length": 383.6875,
|
|
"completions/mean_terminated_length": 506.30926513671875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 239.0,
|
|
"epoch": 1.6528189910979227,
|
|
"grad_norm": 0.5546875,
|
|
"kl": NaN,
|
|
"learning_rate": 1.8854421271990964e-06,
|
|
"loss": -0.093,
|
|
"num_tokens": 14018752.0,
|
|
"rewards/<lambda>/mean": 0.7453782558441162,
|
|
"rewards/<lambda>/std": 0.44275522232055664,
|
|
"step": 139
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.510254144668579,
|
|
"advantage/mean": -0.004046538844704628,
|
|
"advantage/min": -1.5299463272094727,
|
|
"advantage/std": 0.4380718469619751,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0859375,
|
|
"completions/max_length": 1020.0,
|
|
"completions/max_terminated_length": 1020.0,
|
|
"completions/mean_length": 489.0625,
|
|
"completions/mean_terminated_length": 535.0427856445312,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 330.0,
|
|
"epoch": 1.6646884272997031,
|
|
"grad_norm": 0.95703125,
|
|
"kl": 0.03683288465254009,
|
|
"learning_rate": 1.7656183115380577e-06,
|
|
"loss": -0.1419,
|
|
"num_tokens": 14096584.0,
|
|
"rewards/<lambda>/mean": 0.8887147307395935,
|
|
"rewards/<lambda>/std": 0.31882667541503906,
|
|
"step": 140
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.1849887371063232,
|
|
"advantage/mean": -0.004679612349718809,
|
|
"advantage/min": -1.9216474294662476,
|
|
"advantage/std": 0.4064004123210907,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.078125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1024.0,
|
|
"completions/mean_length": 511.734375,
|
|
"completions/mean_terminated_length": 555.1016845703125,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 253.0,
|
|
"epoch": 1.6765578635014837,
|
|
"grad_norm": 0.62890625,
|
|
"kl": 0.030216948012821376,
|
|
"learning_rate": 1.6493586885991908e-06,
|
|
"loss": -0.1395,
|
|
"num_tokens": 14175462.0,
|
|
"rewards/<lambda>/mean": 0.9131076335906982,
|
|
"rewards/<lambda>/std": 0.28450077772140503,
|
|
"step": 141
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.6602617502212524,
|
|
"advantage/mean": -0.008524436503648758,
|
|
"advantage/min": -1.9139026403427124,
|
|
"advantage/std": 0.33574238419532776,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 1016.0,
|
|
"completions/max_terminated_length": 1016.0,
|
|
"completions/mean_length": 481.90625,
|
|
"completions/mean_terminated_length": 497.45159912109375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 258.0,
|
|
"epoch": 1.688427299703264,
|
|
"grad_norm": 0.33984375,
|
|
"kl": 0.033639343455433846,
|
|
"learning_rate": 1.536713580521746e-06,
|
|
"loss": -0.0497,
|
|
"num_tokens": 14249642.0,
|
|
"rewards/<lambda>/mean": 0.9606139659881592,
|
|
"rewards/<lambda>/std": 0.19611631333827972,
|
|
"step": 142
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.21360832452774048,
|
|
"advantage/mean": -0.0066238404251635075,
|
|
"advantage/min": -1.92234206199646,
|
|
"advantage/std": 0.2506479322910309,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.140625,
|
|
"completions/max_length": 943.0,
|
|
"completions/max_terminated_length": 943.0,
|
|
"completions/mean_length": 423.6015625,
|
|
"completions/mean_terminated_length": 492.91815185546875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 247.0,
|
|
"epoch": 1.7002967359050445,
|
|
"grad_norm": 0.0859375,
|
|
"kl": NaN,
|
|
"learning_rate": 1.4277317449282834e-06,
|
|
"loss": -0.0263,
|
|
"num_tokens": 14320559.0,
|
|
"rewards/<lambda>/mean": 0.857833743095398,
|
|
"rewards/<lambda>/std": 0.3528285622596741,
|
|
"step": 143
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.184670329093933,
|
|
"advantage/mean": -0.004762329161167145,
|
|
"advantage/min": -1.935533881187439,
|
|
"advantage/std": 0.33737996220588684,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.046875,
|
|
"completions/max_length": 933.0,
|
|
"completions/max_terminated_length": 933.0,
|
|
"completions/mean_length": 500.84375,
|
|
"completions/mean_terminated_length": 525.4754028320312,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 286.0,
|
|
"epoch": 1.712166172106825,
|
|
"grad_norm": 0.43359375,
|
|
"kl": 0.04113235871773213,
|
|
"learning_rate": 1.3224603538202929e-06,
|
|
"loss": -0.0835,
|
|
"num_tokens": 14399243.0,
|
|
"rewards/<lambda>/mean": 0.9446145296096802,
|
|
"rewards/<lambda>/std": 0.23117810487747192,
|
|
"step": 144
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.9115270376205444,
|
|
"advantage/mean": -0.003408966585993767,
|
|
"advantage/min": -1.93901789188385,
|
|
"advantage/std": 0.2906622588634491,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1015625,
|
|
"completions/max_length": 990.0,
|
|
"completions/max_terminated_length": 990.0,
|
|
"completions/mean_length": 481.109375,
|
|
"completions/mean_terminated_length": 535.49560546875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 209.0,
|
|
"epoch": 1.7240356083086055,
|
|
"grad_norm": 0.306640625,
|
|
"kl": 0.045802103355526924,
|
|
"learning_rate": 1.220944973160133e-06,
|
|
"loss": -0.0697,
|
|
"num_tokens": 14478265.0,
|
|
"rewards/<lambda>/mean": 0.8966078758239746,
|
|
"rewards/<lambda>/std": 0.3087267577648163,
|
|
"step": 145
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9281952381134033,
|
|
"advantage/mean": 0.006729425396770239,
|
|
"advantage/min": -0.45143717527389526,
|
|
"advantage/std": 0.27702268958091736,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.1484375,
|
|
"completions/max_length": 956.0,
|
|
"completions/max_terminated_length": 956.0,
|
|
"completions/mean_length": 461.4375,
|
|
"completions/mean_terminated_length": 541.8715209960938,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 282.0,
|
|
"epoch": 1.7359050445103859,
|
|
"grad_norm": 0.240234375,
|
|
"kl": NaN,
|
|
"learning_rate": 1.1232295431481222e-06,
|
|
"loss": -0.0139,
|
|
"num_tokens": 14553449.0,
|
|
"rewards/<lambda>/mean": 0.832952082157135,
|
|
"rewards/<lambda>/std": 0.3785593509674072,
|
|
"step": 146
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.4327715039253235,
|
|
"advantage/mean": -0.006765198893845081,
|
|
"advantage/min": -1.937062382698059,
|
|
"advantage/std": 0.27742907404899597,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0234375,
|
|
"completions/max_length": 1014.0,
|
|
"completions/max_terminated_length": 1014.0,
|
|
"completions/mean_length": 571.734375,
|
|
"completions/mean_terminated_length": 585.4560546875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 330.0,
|
|
"epoch": 1.7477744807121662,
|
|
"grad_norm": 0.1337890625,
|
|
"kl": 0.033991911564953625,
|
|
"learning_rate": 1.0293563592033595e-06,
|
|
"loss": -0.0401,
|
|
"num_tokens": 14639831.0,
|
|
"rewards/<lambda>/mean": 0.9761366844177246,
|
|
"rewards/<lambda>/std": 0.15464277565479279,
|
|
"step": 147
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.21408027410507202,
|
|
"advantage/mean": -0.009968165308237076,
|
|
"advantage/min": -1.9296997785568237,
|
|
"advantage/std": 0.30730822682380676,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.2109375,
|
|
"completions/max_length": 945.0,
|
|
"completions/max_terminated_length": 945.0,
|
|
"completions/mean_length": 472.7734375,
|
|
"completions/mean_terminated_length": 599.1583862304688,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 297.0,
|
|
"epoch": 1.7596439169139466,
|
|
"grad_norm": 0.10400390625,
|
|
"kl": NaN,
|
|
"learning_rate": 9.393660536564408e-07,
|
|
"loss": -0.0397,
|
|
"num_tokens": 14719362.0,
|
|
"rewards/<lambda>/mean": 0.7863461375236511,
|
|
"rewards/<lambda>/std": 0.4148584008216858,
|
|
"step": 148
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.43295615911483765,
|
|
"advantage/mean": -0.0066620223224163055,
|
|
"advantage/min": -1.911831259727478,
|
|
"advantage/std": 0.27607157826423645,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0234375,
|
|
"completions/max_length": 970.0,
|
|
"completions/max_terminated_length": 970.0,
|
|
"completions/mean_length": 544.515625,
|
|
"completions/mean_terminated_length": 557.5840454101562,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 292.0,
|
|
"epoch": 1.771513353115727,
|
|
"grad_norm": 0.130859375,
|
|
"kl": 0.03622849879320711,
|
|
"learning_rate": 8.532975781620511e-07,
|
|
"loss": -0.04,
|
|
"num_tokens": 14802332.0,
|
|
"rewards/<lambda>/mean": 0.9762489199638367,
|
|
"rewards/<lambda>/std": 0.1539168804883957,
|
|
"step": 149
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.5160192251205444,
|
|
"advantage/mean": -0.0032945233397185802,
|
|
"advantage/min": -1.9176921844482422,
|
|
"advantage/std": 0.3483807146549225,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.078125,
|
|
"completions/max_length": 1015.0,
|
|
"completions/max_terminated_length": 1015.0,
|
|
"completions/mean_length": 585.75,
|
|
"completions/mean_terminated_length": 635.3898315429688,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 375.0,
|
|
"epoch": 1.7833827893175074,
|
|
"grad_norm": 0.578125,
|
|
"kl": 0.035929113859310746,
|
|
"learning_rate": 7.711881868390292e-07,
|
|
"loss": -0.1348,
|
|
"num_tokens": 14890124.0,
|
|
"rewards/<lambda>/mean": 0.9285244941711426,
|
|
"rewards/<lambda>/std": 0.2609255909919739,
|
|
"step": 150
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.4329414665699005,
|
|
"advantage/mean": -0.006688450463116169,
|
|
"advantage/min": -1.9181170463562012,
|
|
"advantage/std": 0.2764665484428406,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.265625,
|
|
"completions/max_length": 998.0,
|
|
"completions/max_terminated_length": 998.0,
|
|
"completions/mean_length": 436.453125,
|
|
"completions/mean_terminated_length": 594.3191528320312,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 292.0,
|
|
"epoch": 1.7952522255192878,
|
|
"grad_norm": 0.267578125,
|
|
"kl": NaN,
|
|
"learning_rate": 6.930734201451817e-07,
|
|
"loss": -0.0248,
|
|
"num_tokens": 14966814.0,
|
|
"rewards/<lambda>/mean": 0.7219593524932861,
|
|
"rewards/<lambda>/std": 0.45501524209976196,
|
|
"step": 151
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9238147735595703,
|
|
"advantage/mean": -0.010101165622472763,
|
|
"advantage/min": -1.9405475854873657,
|
|
"advantage/std": 0.4145474135875702,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.2109375,
|
|
"completions/max_length": 1006.0,
|
|
"completions/max_terminated_length": 1006.0,
|
|
"completions/mean_length": 400.640625,
|
|
"completions/mean_terminated_length": 507.7425537109375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 256.0,
|
|
"epoch": 1.8071216617210681,
|
|
"grad_norm": 0.98828125,
|
|
"kl": NaN,
|
|
"learning_rate": 6.189870894938587e-07,
|
|
"loss": -0.1643,
|
|
"num_tokens": 15035704.0,
|
|
"rewards/<lambda>/mean": 0.7770117521286011,
|
|
"rewards/<lambda>/std": 0.42307209968566895,
|
|
"step": 152
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.1940398216247559,
|
|
"advantage/mean": -0.0013529672287404537,
|
|
"advantage/min": -1.511359453201294,
|
|
"advantage/std": 0.30959567427635193,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0390625,
|
|
"completions/max_length": 962.0,
|
|
"completions/max_terminated_length": 962.0,
|
|
"completions/mean_length": 563.0859375,
|
|
"completions/mean_terminated_length": 585.9755859375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 291.0,
|
|
"epoch": 1.8189910979228485,
|
|
"grad_norm": 0.390625,
|
|
"kl": 0.03943067276850343,
|
|
"learning_rate": 5.489612626189245e-07,
|
|
"loss": -0.0539,
|
|
"num_tokens": 15121051.0,
|
|
"rewards/<lambda>/mean": 0.9441364407539368,
|
|
"rewards/<lambda>/std": 0.23318061232566833,
|
|
"step": 153
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.5175542831420898,
|
|
"advantage/mean": -0.0018498229328542948,
|
|
"advantage/min": -1.9130256175994873,
|
|
"advantage/std": 0.3559645414352417,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.1796875,
|
|
"completions/max_length": 971.0,
|
|
"completions/max_terminated_length": 971.0,
|
|
"completions/mean_length": 488.4140625,
|
|
"completions/mean_terminated_length": 595.4000244140625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 361.0,
|
|
"epoch": 1.8308605341246291,
|
|
"grad_norm": 0.57421875,
|
|
"kl": NaN,
|
|
"learning_rate": 4.830262496944693e-07,
|
|
"loss": -0.0694,
|
|
"num_tokens": 15200952.0,
|
|
"rewards/<lambda>/mean": 0.7931093573570251,
|
|
"rewards/<lambda>/std": 0.41140785813331604,
|
|
"step": 154
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.43180593848228455,
|
|
"advantage/mean": -0.013412997126579285,
|
|
"advantage/min": -1.9487814903259277,
|
|
"advantage/std": 0.3739129304885864,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 966.0,
|
|
"completions/max_terminated_length": 966.0,
|
|
"completions/mean_length": 534.5390625,
|
|
"completions/mean_terminated_length": 551.7822265625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 258.0,
|
|
"epoch": 1.8427299703264095,
|
|
"grad_norm": 0.275390625,
|
|
"kl": 0.03880139789544046,
|
|
"learning_rate": 4.21210590215273e-07,
|
|
"loss": -0.0508,
|
|
"num_tokens": 15284765.0,
|
|
"rewards/<lambda>/mean": 0.9603752493858337,
|
|
"rewards/<lambda>/std": 0.19731158018112183,
|
|
"step": 155
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.6636139154434204,
|
|
"advantage/mean": -0.00197417801246047,
|
|
"advantage/min": -1.1930924654006958,
|
|
"advantage/std": 0.22539697587490082,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.0859375,
|
|
"completions/max_length": 995.0,
|
|
"completions/max_terminated_length": 995.0,
|
|
"completions/mean_length": 567.4140625,
|
|
"completions/mean_terminated_length": 620.7607421875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 283.0,
|
|
"epoch": 1.8545994065281899,
|
|
"grad_norm": 0.1875,
|
|
"kl": NaN,
|
|
"learning_rate": 3.635410406436857e-07,
|
|
"loss": -0.0412,
|
|
"num_tokens": 15370994.0,
|
|
"rewards/<lambda>/mean": 0.8494309186935425,
|
|
"rewards/<lambda>/std": 0.3620583713054657,
|
|
"step": 156
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.5063209533691406,
|
|
"advantage/mean": -0.008625751361250877,
|
|
"advantage/min": -1.9249359369277954,
|
|
"advantage/std": 0.45040178298950195,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.1484375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1024.0,
|
|
"completions/mean_length": 481.1328125,
|
|
"completions/mean_terminated_length": 565.0,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 336.0,
|
|
"epoch": 1.8664688427299705,
|
|
"grad_norm": 1.90625,
|
|
"kl": NaN,
|
|
"learning_rate": 3.100425628282899e-07,
|
|
"loss": -0.146,
|
|
"num_tokens": 15453427.0,
|
|
"rewards/<lambda>/mean": 0.8336179256439209,
|
|
"rewards/<lambda>/std": 0.377052903175354,
|
|
"step": 157
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 0.21544617414474487,
|
|
"advantage/mean": -0.010093946941196918,
|
|
"advantage/min": -1.9512163400650024,
|
|
"advantage/std": 0.30881720781326294,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.1328125,
|
|
"completions/max_length": 1011.0,
|
|
"completions/max_terminated_length": 1011.0,
|
|
"completions/mean_length": 475.921875,
|
|
"completions/mean_terminated_length": 548.8108520507812,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 250.0,
|
|
"epoch": 1.8783382789317509,
|
|
"grad_norm": 0.201171875,
|
|
"kl": NaN,
|
|
"learning_rate": 2.607383131993424e-07,
|
|
"loss": -0.0267,
|
|
"num_tokens": 15530473.0,
|
|
"rewards/<lambda>/mean": 0.8492758274078369,
|
|
"rewards/<lambda>/std": 0.3624357283115387,
|
|
"step": 158
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9234334230422974,
|
|
"advantage/mean": 0.01999136619269848,
|
|
"advantage/min": -1.1871734857559204,
|
|
"advantage/std": 0.6579956412315369,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.6953125,
|
|
"completions/max_length": 1014.0,
|
|
"completions/max_terminated_length": 1014.0,
|
|
"completions/mean_length": 231.125,
|
|
"completions/mean_terminated_length": 758.5640869140625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 516.0,
|
|
"epoch": 1.8902077151335313,
|
|
"grad_norm": 1.921875,
|
|
"kl": NaN,
|
|
"learning_rate": 2.1564963274568028e-07,
|
|
"loss": -0.5375,
|
|
"num_tokens": 15578097.0,
|
|
"rewards/<lambda>/mean": 0.37501630187034607,
|
|
"rewards/<lambda>/std": 0.4941561222076416,
|
|
"step": 159
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9284110069274902,
|
|
"advantage/mean": -0.001977311447262764,
|
|
"advantage/min": -1.5255322456359863,
|
|
"advantage/std": 0.3568601608276367,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.6484375,
|
|
"completions/max_length": 1014.0,
|
|
"completions/max_terminated_length": 1014.0,
|
|
"completions/mean_length": 255.625,
|
|
"completions/mean_terminated_length": 727.1111450195312,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 382.0,
|
|
"epoch": 1.9020771513353116,
|
|
"grad_norm": 0.2119140625,
|
|
"kl": NaN,
|
|
"learning_rate": 1.7479603777742937e-07,
|
|
"loss": -0.0679,
|
|
"num_tokens": 15625785.0,
|
|
"rewards/<lambda>/mean": 0.3347344398498535,
|
|
"rewards/<lambda>/std": 0.48339539766311646,
|
|
"step": 160
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9180108308792114,
|
|
"advantage/mean": -2.1150190150365233e-05,
|
|
"advantage/min": -1.922570824623108,
|
|
"advantage/std": 0.3901103436946869,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.4375,
|
|
"completions/max_length": 1000.0,
|
|
"completions/max_terminated_length": 1000.0,
|
|
"completions/mean_length": 400.0625,
|
|
"completions/mean_terminated_length": 711.2222290039062,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 306.0,
|
|
"epoch": 1.913946587537092,
|
|
"grad_norm": 1.0703125,
|
|
"kl": NaN,
|
|
"learning_rate": 1.3819521147851122e-07,
|
|
"loss": -0.2541,
|
|
"num_tokens": 15692793.0,
|
|
"rewards/<lambda>/mean": 0.5556085705757141,
|
|
"rewards/<lambda>/std": 0.505899965763092,
|
|
"step": 161
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.18680739402771,
|
|
"advantage/mean": -0.006663821171969175,
|
|
"advantage/min": -1.9252969026565552,
|
|
"advantage/std": 0.5181385278701782,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.328125,
|
|
"completions/max_length": 1023.0,
|
|
"completions/max_terminated_length": 1023.0,
|
|
"completions/mean_length": 461.046875,
|
|
"completions/mean_terminated_length": 686.2092895507812,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 291.0,
|
|
"epoch": 1.9258160237388724,
|
|
"grad_norm": 0.78515625,
|
|
"kl": NaN,
|
|
"learning_rate": 1.0586299625259699e-07,
|
|
"loss": -0.1893,
|
|
"num_tokens": 15769607.0,
|
|
"rewards/<lambda>/mean": 0.6662224531173706,
|
|
"rewards/<lambda>/std": 0.4795023798942566,
|
|
"step": 162
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.5068639516830444,
|
|
"advantage/mean": -0.013799908570945263,
|
|
"advantage/min": -1.9225540161132812,
|
|
"advantage/std": 0.5686584711074829,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.34375,
|
|
"completions/max_length": 1005.0,
|
|
"completions/max_terminated_length": 1005.0,
|
|
"completions/mean_length": 494.484375,
|
|
"completions/mean_terminated_length": 753.5,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 411.0,
|
|
"epoch": 1.9376854599406528,
|
|
"grad_norm": 0.921875,
|
|
"kl": NaN,
|
|
"learning_rate": 7.781338686584928e-08,
|
|
"loss": -0.2712,
|
|
"num_tokens": 15850349.0,
|
|
"rewards/<lambda>/mean": 0.6513202786445618,
|
|
"rewards/<lambda>/std": 0.4836761951446533,
|
|
"step": 163
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.9118238687515259,
|
|
"advantage/mean": -0.0020243192557245493,
|
|
"advantage/min": -1.52021324634552,
|
|
"advantage/std": 0.47649678587913513,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.4140625,
|
|
"completions/max_length": 1018.0,
|
|
"completions/max_terminated_length": 1018.0,
|
|
"completions/mean_length": 408.46875,
|
|
"completions/mean_terminated_length": 697.1199951171875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 435.0,
|
|
"epoch": 1.9495548961424332,
|
|
"grad_norm": 1.046875,
|
|
"kl": NaN,
|
|
"learning_rate": 5.405852438937764e-08,
|
|
"loss": -0.3028,
|
|
"num_tokens": 15917585.0,
|
|
"rewards/<lambda>/mean": 0.5886790156364441,
|
|
"rewards/<lambda>/std": 0.4992271661758423,
|
|
"step": 164
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.929634928703308,
|
|
"advantage/mean": -0.004954553674906492,
|
|
"advantage/min": -1.9497406482696533,
|
|
"advantage/std": 0.667103111743927,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.625,
|
|
"completions/max_length": 1009.0,
|
|
"completions/max_terminated_length": 1009.0,
|
|
"completions/mean_length": 314.578125,
|
|
"completions/mean_terminated_length": 838.875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 601.0,
|
|
"epoch": 1.9614243323442135,
|
|
"grad_norm": 1.4609375,
|
|
"kl": NaN,
|
|
"learning_rate": 3.460869094407127e-08,
|
|
"loss": -0.6181,
|
|
"num_tokens": 15973155.0,
|
|
"rewards/<lambda>/mean": 0.4300598204135895,
|
|
"rewards/<lambda>/std": 0.5046306848526001,
|
|
"step": 165
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.187256097793579,
|
|
"advantage/mean": -0.004739911761134863,
|
|
"advantage/min": -1.919519066810608,
|
|
"advantage/std": 0.5733982920646667,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.421875,
|
|
"completions/max_length": 1014.0,
|
|
"completions/max_terminated_length": 1014.0,
|
|
"completions/mean_length": 413.28125,
|
|
"completions/mean_terminated_length": 714.8648681640625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 375.0,
|
|
"epoch": 1.973293768545994,
|
|
"grad_norm": 0.8203125,
|
|
"kl": NaN,
|
|
"learning_rate": 1.947230525005006e-08,
|
|
"loss": -0.2885,
|
|
"num_tokens": 16043895.0,
|
|
"rewards/<lambda>/mean": 0.5547634363174438,
|
|
"rewards/<lambda>/std": 0.506853461265564,
|
|
"step": 166
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.50879967212677,
|
|
"advantage/mean": -0.0034448225051164627,
|
|
"advantage/min": -1.9508203268051147,
|
|
"advantage/std": 0.4751599431037903,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.375,
|
|
"completions/max_length": 1009.0,
|
|
"completions/max_terminated_length": 1009.0,
|
|
"completions/mean_length": 400.4375,
|
|
"completions/mean_terminated_length": 640.7000122070312,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 298.0,
|
|
"epoch": 1.9851632047477745,
|
|
"grad_norm": 0.6328125,
|
|
"kl": NaN,
|
|
"learning_rate": 8.655918982689582e-09,
|
|
"loss": -0.21,
|
|
"num_tokens": 16112703.0,
|
|
"rewards/<lambda>/mean": 0.611525297164917,
|
|
"rewards/<lambda>/std": 0.49521562457084656,
|
|
"step": 167
|
|
},
|
|
{
|
|
"advantage/logodds_epsilon": 0.001,
|
|
"advantage/max": 1.947137713432312,
|
|
"advantage/mean": -0.05683013051748276,
|
|
"advantage/min": -1.9427815675735474,
|
|
"advantage/std": 0.6950681209564209,
|
|
"clip_ratio/high_max": NaN,
|
|
"clip_ratio/high_mean": NaN,
|
|
"clip_ratio/low_mean": NaN,
|
|
"clip_ratio/low_min": NaN,
|
|
"clip_ratio/region_mean": NaN,
|
|
"completions/clipped_ratio": 0.3529411764705882,
|
|
"completions/max_length": 1020.0,
|
|
"completions/max_terminated_length": 1020.0,
|
|
"completions/mean_length": 503.1323547363281,
|
|
"completions/mean_terminated_length": 777.5681762695312,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 486.0,
|
|
"epoch": 1.997032640949555,
|
|
"grad_norm": 1.71875,
|
|
"kl": NaN,
|
|
"learning_rate": 2.164213936770576e-09,
|
|
"loss": -0.3532,
|
|
"num_tokens": 16184585.0,
|
|
"rewards/<lambda>/mean": 0.5477771759033203,
|
|
"rewards/<lambda>/std": 0.5067213177680969,
|
|
"step": 168
|
|
},
|
|
{
|
|
"epoch": 1.997032640949555,
|
|
"step": 168,
|
|
"total_flos": 0.0,
|
|
"train_loss": -0.09854654141236097,
|
|
"train_runtime": 3102.2738,
|
|
"train_samples_per_second": 0.869,
|
|
"train_steps_per_second": 0.054
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 168,
|
|
"num_input_tokens_seen": 16184585,
|
|
"num_train_epochs": 2,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 8,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|