4174 lines
198 KiB
JSON
4174 lines
198 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.0,
|
|
"eval_steps": 12,
|
|
"global_step": 64,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.015625,
|
|
"grad_norm": 99.72795104980469,
|
|
"kl/sequence_policy_ref": 0.0,
|
|
"kl/vocab_forward": 0.0,
|
|
"kl/vocab_js": 1.071127120333415e-08,
|
|
"kl/vocab_reverse": 0.0,
|
|
"kl/vocab_symmetric": 0.0,
|
|
"learning_rate": 0.0,
|
|
"logps/chosen": -742.8982963562012,
|
|
"logps/rejected": -1508.9743347167969,
|
|
"loss": 0.8464,
|
|
"loss/dpo": 0.6931471824645996,
|
|
"misalign/J": 15.325548589229584,
|
|
"misalign/J_aux_loss": 0.15325548127293587,
|
|
"misalign/J_aux_loss_raw": 15.325548589229584,
|
|
"misalign/J_over_reverse_kl": 128560019.5,
|
|
"misalign/J_per_token": 0.019860354135744274,
|
|
"misalign/compressed_reward_absmax": 4573.125,
|
|
"misalign/compressed_reward_range": 6496.04150390625,
|
|
"misalign/entropy_a": 1394.3690948486328,
|
|
"misalign/entropy_b": 1394.3689727783203,
|
|
"misalign/forward_kl_divergence": 0.0,
|
|
"misalign/forward_kl_divergence_per_token": 0.0,
|
|
"misalign/gamma_abs_times_reward_std": 3358948.609375,
|
|
"misalign/gamma_bracketed_rate": 0.9989435374736786,
|
|
"misalign/gamma_reward_residual": -3.5030208209718694e-05,
|
|
"misalign/gamma_star": 6581222.96875,
|
|
"misalign/js_divergence": 1.071127120333415e-08,
|
|
"misalign/reverse_kl_divergence": 0.0,
|
|
"misalign/reverse_kl_divergence_per_token": 0.0,
|
|
"misalign/reward_a": -14.673600375652313,
|
|
"misalign/reward_b": -14.673597425222397,
|
|
"misalign/reward_improvement": 0.0,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.0,
|
|
"misalign/reward_improvement_per_token": 0.0,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -492.57253074645996,
|
|
"misalign/reward_vocab_std": 775.2254867553711,
|
|
"misalign/symmetric_kl": 0.0,
|
|
"misalign/tv_distance": 0.0,
|
|
"num_tokens": 178419.0,
|
|
"rewards/accuracies": 0.0,
|
|
"rewards/chosen": 0.0,
|
|
"rewards/margins": 0.0,
|
|
"rewards/rejected": 0.0,
|
|
"step": 1,
|
|
"support/residual_count": 151893.28125,
|
|
"support/residual_mass_policy": 0.043290185276418924,
|
|
"support/residual_mass_reference": 0.043290185276418924,
|
|
"support/residual_reward": -0.42998734675347805,
|
|
"support/runtime_width": 42.71537160873413,
|
|
"support/sampled_loser_rank": 0.6159256920218468,
|
|
"support/sampled_reward_rank": -0.34261330030858517,
|
|
"support/sampled_token_added_rate": 0.03961701481603086,
|
|
"support/sampled_winner_rank": 0.6312572248280048,
|
|
"support/selected_width": 42.71537160873413,
|
|
"support/stored_width": 42.71537160873413
|
|
},
|
|
{
|
|
"epoch": 0.03125,
|
|
"grad_norm": 90.65897369384766,
|
|
"kl/sequence_policy_ref": 0.0,
|
|
"kl/vocab_forward": 0.0,
|
|
"kl/vocab_js": 1.1555954915198896e-08,
|
|
"kl/vocab_reverse": 0.0,
|
|
"kl/vocab_symmetric": 0.0,
|
|
"learning_rate": 2.857142857142857e-07,
|
|
"logps/chosen": -639.7089538574219,
|
|
"logps/rejected": -1256.9280319213867,
|
|
"loss": 0.8522,
|
|
"loss/dpo": 0.6931471824645996,
|
|
"misalign/J": 15.909758105874062,
|
|
"misalign/J_aux_loss": 0.15909757697954774,
|
|
"misalign/J_aux_loss_raw": 15.909758105874062,
|
|
"misalign/J_over_reverse_kl": 133460724.125,
|
|
"misalign/J_per_token": 0.022011274049873464,
|
|
"misalign/compressed_reward_absmax": 4128.2406005859375,
|
|
"misalign/compressed_reward_range": 5892.38916015625,
|
|
"misalign/entropy_a": 1163.6351776123047,
|
|
"misalign/entropy_b": 1163.6351852416992,
|
|
"misalign/forward_kl_divergence": 0.0,
|
|
"misalign/forward_kl_divergence_per_token": 0.0,
|
|
"misalign/gamma_abs_times_reward_std": 3731882.666015625,
|
|
"misalign/gamma_bracketed_rate": 0.99904465675354,
|
|
"misalign/gamma_reward_residual": -2.5868400712170114e-05,
|
|
"misalign/gamma_star": 6485823.03515625,
|
|
"misalign/js_divergence": 1.1555954915198896e-08,
|
|
"misalign/reverse_kl_divergence": 0.0,
|
|
"misalign/reverse_kl_divergence_per_token": 0.0,
|
|
"misalign/reward_a": -14.091559052467346,
|
|
"misalign/reward_b": -14.091563642024994,
|
|
"misalign/reward_improvement": 0.0,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.0,
|
|
"misalign/reward_improvement_per_token": 0.0,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -373.4254665374756,
|
|
"misalign/reward_vocab_std": 714.1293716430664,
|
|
"misalign/symmetric_kl": 0.0,
|
|
"misalign/tv_distance": 0.0,
|
|
"num_tokens": 353406.0,
|
|
"rewards/accuracies": 0.0,
|
|
"rewards/chosen": 0.0,
|
|
"rewards/margins": 0.0,
|
|
"rewards/rejected": 0.0,
|
|
"step": 2,
|
|
"support/residual_count": 151893.015625,
|
|
"support/residual_mass_policy": 0.04986313637346029,
|
|
"support/residual_mass_reference": 0.04986313637346029,
|
|
"support/residual_reward": -0.33420879393815994,
|
|
"support/runtime_width": 42.98376941680908,
|
|
"support/sampled_loser_rank": 0.5725675709545612,
|
|
"support/sampled_reward_rank": -0.46486284397542477,
|
|
"support/sampled_token_added_rate": 0.039943449199199677,
|
|
"support/sampled_winner_rank": 0.6429276466369629,
|
|
"support/selected_width": 42.98376941680908,
|
|
"support/stored_width": 42.98376941680908
|
|
},
|
|
{
|
|
"epoch": 0.046875,
|
|
"grad_norm": 93.3929672241211,
|
|
"kl/sequence_policy_ref": 0.012497343122959137,
|
|
"kl/vocab_forward": 0.583718778565526,
|
|
"kl/vocab_js": 0.14564759889617562,
|
|
"kl/vocab_reverse": 0.5832288525998592,
|
|
"kl/vocab_symmetric": 1.1669474430382252,
|
|
"learning_rate": 5.714285714285714e-07,
|
|
"logps/chosen": -682.0669765472412,
|
|
"logps/rejected": -1253.8106155395508,
|
|
"loss": 0.8984,
|
|
"loss/dpo": 0.7069794088602066,
|
|
"misalign/J": 19.14525681734085,
|
|
"misalign/J_aux_loss": 0.1914525495376438,
|
|
"misalign/J_aux_loss_raw": 19.14525681734085,
|
|
"misalign/J_over_reverse_kl": 181.960098862648,
|
|
"misalign/J_per_token": 0.11238381525618024,
|
|
"misalign/compressed_reward_absmax": 4042.6752319335938,
|
|
"misalign/compressed_reward_range": 5874.695739746094,
|
|
"misalign/entropy_a": 1187.622413635254,
|
|
"misalign/entropy_b": 1187.9832382202148,
|
|
"misalign/forward_kl_divergence": 0.583718778565526,
|
|
"misalign/forward_kl_divergence_per_token": 0.0007106075063347816,
|
|
"misalign/gamma_abs_times_reward_std": 4061511.5234375,
|
|
"misalign/gamma_bracketed_rate": 0.9965383857488632,
|
|
"misalign/gamma_reward_residual": -1.116046053084574e-05,
|
|
"misalign/gamma_star": 6953297.8359375,
|
|
"misalign/js_divergence": 0.14564759889617562,
|
|
"misalign/reverse_kl_divergence": 0.5832288525998592,
|
|
"misalign/reverse_kl_divergence_per_token": 0.0007096838744473644,
|
|
"misalign/reward_a": -14.631245076656342,
|
|
"misalign/reward_b": -14.68013870716095,
|
|
"misalign/reward_improvement": 0.04889671457931399,
|
|
"misalign/reward_improvement_over_reverse_kl": -0.01731939986348152,
|
|
"misalign/reward_improvement_per_token": 0.0011197524540875747,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -477.72042179107666,
|
|
"misalign/reward_vocab_std": 708.0760498046875,
|
|
"misalign/symmetric_kl": 1.1669474430382252,
|
|
"misalign/tv_distance": 7.252490729093552,
|
|
"num_tokens": 522739.0,
|
|
"rewards/accuracies": 0.4375,
|
|
"rewards/chosen": -0.009032575355377048,
|
|
"rewards/margins": -0.020564619218930602,
|
|
"rewards/rejected": 0.011532044620253146,
|
|
"step": 3,
|
|
"support/residual_count": 151893.173828125,
|
|
"support/residual_mass_policy": 0.041550057008862495,
|
|
"support/residual_mass_reference": 0.04161923169158399,
|
|
"support/residual_reward": -0.48134796414524317,
|
|
"support/runtime_width": 42.824374198913574,
|
|
"support/sampled_loser_rank": 0.6047959439456463,
|
|
"support/sampled_reward_rank": -0.43707933463156223,
|
|
"support/sampled_token_added_rate": 0.03209112957119942,
|
|
"support/sampled_winner_rank": 0.6245042234659195,
|
|
"support/selected_width": 42.824374198913574,
|
|
"support/stored_width": 42.824374198913574
|
|
},
|
|
{
|
|
"epoch": 0.0625,
|
|
"grad_norm": 92.23185729980469,
|
|
"kl/sequence_policy_ref": 0.12927092611789703,
|
|
"kl/vocab_forward": 0.5669154338538647,
|
|
"kl/vocab_js": 0.14143066108226776,
|
|
"kl/vocab_reverse": 0.5671258866786957,
|
|
"kl/vocab_symmetric": 1.1340412348508835,
|
|
"learning_rate": 8.57142857142857e-07,
|
|
"logps/chosen": -605.7631340026855,
|
|
"logps/rejected": -1292.5442504882812,
|
|
"loss": 0.8402,
|
|
"loss/dpo": 0.6932987496256828,
|
|
"misalign/J": 14.686549186706543,
|
|
"misalign/J_aux_loss": 0.14686548942700028,
|
|
"misalign/J_aux_loss_raw": 14.686549186706543,
|
|
"misalign/J_over_reverse_kl": 23.49239319562912,
|
|
"misalign/J_per_token": 0.015179012378212065,
|
|
"misalign/compressed_reward_absmax": 4190.518157958984,
|
|
"misalign/compressed_reward_range": 6010.22119140625,
|
|
"misalign/entropy_a": 1174.9087295532227,
|
|
"misalign/entropy_b": 1174.8447265625,
|
|
"misalign/forward_kl_divergence": 0.5669154338538647,
|
|
"misalign/forward_kl_divergence_per_token": 0.0007894696027506143,
|
|
"misalign/gamma_abs_times_reward_std": 3162996.953125,
|
|
"misalign/gamma_bracketed_rate": 0.9992709010839462,
|
|
"misalign/gamma_reward_residual": -2.8679768774964032e-05,
|
|
"misalign/gamma_star": 6157349.4375,
|
|
"misalign/js_divergence": 0.14143066108226776,
|
|
"misalign/reverse_kl_divergence": 0.5671258866786957,
|
|
"misalign/reverse_kl_divergence_per_token": 0.000786922340921592,
|
|
"misalign/reward_a": -18.330436378717422,
|
|
"misalign/reward_b": -18.468554601073265,
|
|
"misalign/reward_improvement": 0.1381110306829214,
|
|
"misalign/reward_improvement_over_reverse_kl": 1.340573588386178,
|
|
"misalign/reward_improvement_per_token": 0.0015522810608672444,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -531.1860427856445,
|
|
"misalign/reward_vocab_std": 741.7677001953125,
|
|
"misalign/symmetric_kl": 1.1340412348508835,
|
|
"misalign/tv_distance": 6.983255743980408,
|
|
"num_tokens": 693958.0,
|
|
"rewards/accuracies": 0.5,
|
|
"rewards/chosen": 0.01616873685270548,
|
|
"rewards/margins": 0.006483289762400091,
|
|
"rewards/rejected": 0.009685446973890066,
|
|
"step": 4,
|
|
"support/residual_count": 151893.24609375,
|
|
"support/residual_mass_policy": 0.03730555484071374,
|
|
"support/residual_mass_reference": 0.03720196895301342,
|
|
"support/residual_reward": -0.5474085882306099,
|
|
"support/runtime_width": 42.75570487976074,
|
|
"support/sampled_loser_rank": 0.5622387044131756,
|
|
"support/sampled_reward_rank": -0.3531896872445941,
|
|
"support/sampled_token_added_rate": 0.03257534769363701,
|
|
"support/sampled_winner_rank": 0.5690335519611835,
|
|
"support/selected_width": 42.75570487976074,
|
|
"support/stored_width": 42.75570487976074
|
|
},
|
|
{
|
|
"epoch": 0.078125,
|
|
"grad_norm": 117.50943756103516,
|
|
"kl/sequence_policy_ref": -0.02329159528017044,
|
|
"kl/vocab_forward": 0.7679473981261253,
|
|
"kl/vocab_js": 0.1914622224867344,
|
|
"kl/vocab_reverse": 0.7658381760120392,
|
|
"kl/vocab_symmetric": 1.5337853729724884,
|
|
"learning_rate": 1.1428571428571428e-06,
|
|
"logps/chosen": -888.985481262207,
|
|
"logps/rejected": -1439.7103576660156,
|
|
"loss": 0.9771,
|
|
"loss/dpo": 0.6643600016832352,
|
|
"misalign/J": 31.272010684013367,
|
|
"misalign/J_aux_loss": 0.3127200985327363,
|
|
"misalign/J_aux_loss_raw": 31.272010684013367,
|
|
"misalign/J_over_reverse_kl": 76.80600309371948,
|
|
"misalign/J_per_token": 0.034490690915845335,
|
|
"misalign/compressed_reward_absmax": 4926.2969970703125,
|
|
"misalign/compressed_reward_range": 7034.768737792969,
|
|
"misalign/entropy_a": 1437.912109375,
|
|
"misalign/entropy_b": 1440.320785522461,
|
|
"misalign/forward_kl_divergence": 0.7679473981261253,
|
|
"misalign/forward_kl_divergence_per_token": 0.0006905792761244811,
|
|
"misalign/gamma_abs_times_reward_std": 6929239.875,
|
|
"misalign/gamma_bracketed_rate": 0.9981078654527664,
|
|
"misalign/gamma_reward_residual": -2.4677944281847886e-05,
|
|
"misalign/gamma_star": 13895793.9375,
|
|
"misalign/js_divergence": 0.1914622224867344,
|
|
"misalign/reverse_kl_divergence": 0.7658381760120392,
|
|
"misalign/reverse_kl_divergence_per_token": 0.0006879112333990633,
|
|
"misalign/reward_a": -16.945995092391968,
|
|
"misalign/reward_b": -18.123961448669434,
|
|
"misalign/reward_improvement": 1.1779705435037613,
|
|
"misalign/reward_improvement_over_reverse_kl": 1.8247669339179993,
|
|
"misalign/reward_improvement_per_token": 0.0014292692139861174,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -799.9156303405762,
|
|
"misalign/reward_vocab_std": 839.3828201293945,
|
|
"misalign/symmetric_kl": 1.5337853729724884,
|
|
"misalign/tv_distance": 9.24750828742981,
|
|
"num_tokens": 885705.0,
|
|
"rewards/accuracies": 0.703125,
|
|
"rewards/chosen": 0.03075969440396875,
|
|
"rewards/margins": 0.06617770798038691,
|
|
"rewards/rejected": -0.035418014391325414,
|
|
"step": 5,
|
|
"support/residual_count": 151893.498046875,
|
|
"support/residual_mass_policy": 0.0339660148601979,
|
|
"support/residual_mass_reference": 0.03398727998137474,
|
|
"support/residual_reward": -0.5825221054255962,
|
|
"support/runtime_width": 42.503896713256836,
|
|
"support/sampled_loser_rank": 0.6222354024648666,
|
|
"support/sampled_reward_rank": -0.31187077052891254,
|
|
"support/sampled_token_added_rate": 0.03087039408273995,
|
|
"support/sampled_winner_rank": 0.6193482503294945,
|
|
"support/selected_width": 42.503896713256836,
|
|
"support/stored_width": 42.503896713256836
|
|
},
|
|
{
|
|
"epoch": 0.09375,
|
|
"grad_norm": 85.93820190429688,
|
|
"kl/sequence_policy_ref": -0.15299414843320847,
|
|
"kl/vocab_forward": 0.7101510316133499,
|
|
"kl/vocab_js": 0.1767149232327938,
|
|
"kl/vocab_reverse": 0.7063371688127518,
|
|
"kl/vocab_symmetric": 1.4164880961179733,
|
|
"learning_rate": 1.4285714285714286e-06,
|
|
"logps/chosen": -632.6824798583984,
|
|
"logps/rejected": -1370.8635635375977,
|
|
"loss": 0.7987,
|
|
"loss/dpo": 0.6448436826467514,
|
|
"misalign/J": 15.385070085525513,
|
|
"misalign/J_aux_loss": 0.15385069977492094,
|
|
"misalign/J_aux_loss_raw": 15.385070085525513,
|
|
"misalign/J_over_reverse_kl": 83.44263046979904,
|
|
"misalign/J_per_token": 0.0304049692931585,
|
|
"misalign/compressed_reward_absmax": 4116.264617919922,
|
|
"misalign/compressed_reward_range": 5825.951171875,
|
|
"misalign/entropy_a": 1233.836441040039,
|
|
"misalign/entropy_b": 1237.5170669555664,
|
|
"misalign/forward_kl_divergence": 0.7101510316133499,
|
|
"misalign/forward_kl_divergence_per_token": 0.0011770288765546866,
|
|
"misalign/gamma_abs_times_reward_std": 3832625.03125,
|
|
"misalign/gamma_bracketed_rate": 0.9984977394342422,
|
|
"misalign/gamma_reward_residual": -2.866266277123941e-05,
|
|
"misalign/gamma_star": 7119063.15625,
|
|
"misalign/js_divergence": 0.1767149232327938,
|
|
"misalign/reverse_kl_divergence": 0.7063371688127518,
|
|
"misalign/reverse_kl_divergence_per_token": 0.001158414474048186,
|
|
"misalign/reward_a": -12.991081476211548,
|
|
"misalign/reward_b": -15.306684225797653,
|
|
"misalign/reward_improvement": 2.315602958202362,
|
|
"misalign/reward_improvement_over_reverse_kl": 4.326970279216766,
|
|
"misalign/reward_improvement_per_token": 0.00793102516036015,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -391.66842794418335,
|
|
"misalign/reward_vocab_std": 709.3101501464844,
|
|
"misalign/symmetric_kl": 1.4164880961179733,
|
|
"misalign/tv_distance": 8.171427369117737,
|
|
"num_tokens": 1051197.0,
|
|
"rewards/accuracies": 0.734375,
|
|
"rewards/chosen": 0.03977742395363748,
|
|
"rewards/margins": 0.11015367973595858,
|
|
"rewards/rejected": -0.07037625554949045,
|
|
"step": 6,
|
|
"support/residual_count": 151893.294921875,
|
|
"support/residual_mass_policy": 0.04003174742683768,
|
|
"support/residual_mass_reference": 0.039974321611225605,
|
|
"support/residual_reward": -0.44555533304810524,
|
|
"support/runtime_width": 42.70325422286987,
|
|
"support/sampled_loser_rank": 0.5838432982563972,
|
|
"support/sampled_reward_rank": -0.33837335743010044,
|
|
"support/sampled_token_added_rate": 0.035395737970247865,
|
|
"support/sampled_winner_rank": 0.5852662436664104,
|
|
"support/selected_width": 42.70325422286987,
|
|
"support/stored_width": 42.70325422286987
|
|
},
|
|
{
|
|
"epoch": 0.109375,
|
|
"grad_norm": 88.38682556152344,
|
|
"kl/sequence_policy_ref": -0.3988112509250641,
|
|
"kl/vocab_forward": 0.9821137934923172,
|
|
"kl/vocab_js": 0.2425236813724041,
|
|
"kl/vocab_reverse": 0.9639514982700348,
|
|
"kl/vocab_symmetric": 1.9460650980472565,
|
|
"learning_rate": 1.714285714285714e-06,
|
|
"logps/chosen": -645.1111679077148,
|
|
"logps/rejected": -1122.6695709228516,
|
|
"loss": 0.8321,
|
|
"loss/dpo": 0.5667471960186958,
|
|
"misalign/J": 26.53605580329895,
|
|
"misalign/J_aux_loss": 0.26536055374890566,
|
|
"misalign/J_aux_loss_raw": 26.53605580329895,
|
|
"misalign/J_over_reverse_kl": 24.359424114227295,
|
|
"misalign/J_per_token": 0.027136333868838847,
|
|
"misalign/compressed_reward_absmax": 4269.828765869141,
|
|
"misalign/compressed_reward_range": 6126.886657714844,
|
|
"misalign/entropy_a": 1084.383560180664,
|
|
"misalign/entropy_b": 1096.2449645996094,
|
|
"misalign/forward_kl_divergence": 0.9821137934923172,
|
|
"misalign/forward_kl_divergence_per_token": 0.0023296478029806167,
|
|
"misalign/gamma_abs_times_reward_std": 6525904.78125,
|
|
"misalign/gamma_bracketed_rate": 0.9985537081956863,
|
|
"misalign/gamma_reward_residual": -9.58655687099963e-06,
|
|
"misalign/gamma_star": 13095489.4375,
|
|
"misalign/js_divergence": 0.2425236813724041,
|
|
"misalign/reverse_kl_divergence": 0.9639514982700348,
|
|
"misalign/reverse_kl_divergence_per_token": 0.0022518262558151037,
|
|
"misalign/reward_a": -4.632188588380814,
|
|
"misalign/reward_b": -9.89891766011715,
|
|
"misalign/reward_improvement": 5.2667356133461,
|
|
"misalign/reward_improvement_over_reverse_kl": 5.781014442443848,
|
|
"misalign/reward_improvement_per_token": 0.021993891743477434,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -737.9532012939453,
|
|
"misalign/reward_vocab_std": 779.6220245361328,
|
|
"misalign/symmetric_kl": 1.9460650980472565,
|
|
"misalign/tv_distance": 9.269986510276794,
|
|
"num_tokens": 1226725.0,
|
|
"rewards/accuracies": 0.859375,
|
|
"rewards/chosen": 0.10787314153276384,
|
|
"rewards/margins": 0.29550853557884693,
|
|
"rewards/rejected": -0.18763539008796215,
|
|
"step": 7,
|
|
"support/residual_count": 151893.15234375,
|
|
"support/residual_mass_policy": 0.036638311110436916,
|
|
"support/residual_mass_reference": 0.03580877371132374,
|
|
"support/residual_reward": -0.6131154783070087,
|
|
"support/runtime_width": 42.84639596939087,
|
|
"support/sampled_loser_rank": 0.5267594642937183,
|
|
"support/sampled_reward_rank": -0.37450613733381033,
|
|
"support/sampled_token_added_rate": 0.02907218923792243,
|
|
"support/sampled_winner_rank": 0.5602664463222027,
|
|
"support/selected_width": 42.84639596939087,
|
|
"support/stored_width": 42.84639596939087
|
|
},
|
|
{
|
|
"epoch": 0.125,
|
|
"grad_norm": 69.52674102783203,
|
|
"kl/sequence_policy_ref": -1.1963431239128113,
|
|
"kl/vocab_forward": 1.3841880485415459,
|
|
"kl/vocab_js": 0.33685372211039066,
|
|
"kl/vocab_reverse": 1.3344588950276375,
|
|
"kl/vocab_symmetric": 2.7186470329761505,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -520.5281848907471,
|
|
"logps/rejected": -1275.8439178466797,
|
|
"loss": 0.6933,
|
|
"loss/dpo": 0.5151687189936638,
|
|
"misalign/J": 17.81585144996643,
|
|
"misalign/J_aux_loss": 0.17815851839259267,
|
|
"misalign/J_aux_loss_raw": 17.81585144996643,
|
|
"misalign/J_over_reverse_kl": 10.14825189113617,
|
|
"misalign/J_per_token": 0.029440748097840697,
|
|
"misalign/compressed_reward_absmax": 3982.9888916015625,
|
|
"misalign/compressed_reward_range": 5621.152526855469,
|
|
"misalign/entropy_a": 1110.4090042114258,
|
|
"misalign/entropy_b": 1124.430030822754,
|
|
"misalign/forward_kl_divergence": 1.3841880485415459,
|
|
"misalign/forward_kl_divergence_per_token": 0.005764905363321304,
|
|
"misalign/gamma_abs_times_reward_std": 3656771.703125,
|
|
"misalign/gamma_bracketed_rate": 0.9985413998365402,
|
|
"misalign/gamma_reward_residual": -2.2492993480227597e-05,
|
|
"misalign/gamma_star": 6422516.171875,
|
|
"misalign/js_divergence": 0.33685372211039066,
|
|
"misalign/reverse_kl_divergence": 1.3344588950276375,
|
|
"misalign/reverse_kl_divergence_per_token": 0.005282549886032939,
|
|
"misalign/reward_a": -9.868786454200745,
|
|
"misalign/reward_b": -17.78406047821045,
|
|
"misalign/reward_improvement": 7.915277659893036,
|
|
"misalign/reward_improvement_over_reverse_kl": 5.6070281863212585,
|
|
"misalign/reward_improvement_per_token": 0.02902397490106523,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -359.259729385376,
|
|
"misalign/reward_vocab_std": 703.0287933349609,
|
|
"misalign/symmetric_kl": 2.7186470329761505,
|
|
"misalign/tv_distance": 10.146484673023224,
|
|
"num_tokens": 1388327.0,
|
|
"rewards/accuracies": 0.9375,
|
|
"rewards/chosen": 0.0993000838207081,
|
|
"rewards/margins": 0.43786880001425743,
|
|
"rewards/rejected": -0.3385687116533518,
|
|
"step": 8,
|
|
"support/residual_count": 151893.01171875,
|
|
"support/residual_mass_policy": 0.039598725736141205,
|
|
"support/residual_mass_reference": 0.03887052624486387,
|
|
"support/residual_reward": -0.35539715457707644,
|
|
"support/runtime_width": 42.98911476135254,
|
|
"support/sampled_loser_rank": 0.5893515609204769,
|
|
"support/sampled_reward_rank": -0.47250746935606003,
|
|
"support/sampled_token_added_rate": 0.03269299981184304,
|
|
"support/sampled_winner_rank": 0.5859329588711262,
|
|
"support/selected_width": 42.98911476135254,
|
|
"support/stored_width": 42.98911476135254
|
|
},
|
|
{
|
|
"epoch": 0.140625,
|
|
"grad_norm": 67.0599136352539,
|
|
"kl/sequence_policy_ref": -2.3771479576826096,
|
|
"kl/vocab_forward": 2.6128047704696655,
|
|
"kl/vocab_js": 0.6251252107322216,
|
|
"kl/vocab_reverse": 2.4535476565361023,
|
|
"kl/vocab_symmetric": 5.06635195016861,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -615.9590682983398,
|
|
"logps/rejected": -1274.1997985839844,
|
|
"loss": 0.6781,
|
|
"loss/dpo": 0.4201922379434109,
|
|
"misalign/J": 25.791358947753906,
|
|
"misalign/J_aux_loss": 0.2579135838896036,
|
|
"misalign/J_aux_loss_raw": 25.791358947753906,
|
|
"misalign/J_over_reverse_kl": 12.780362248420715,
|
|
"misalign/J_per_token": 0.05547305219806731,
|
|
"misalign/compressed_reward_absmax": 3792.750274658203,
|
|
"misalign/compressed_reward_range": 5447.807708740234,
|
|
"misalign/entropy_a": 1148.869960784912,
|
|
"misalign/entropy_b": 1178.2729797363281,
|
|
"misalign/forward_kl_divergence": 2.6128047704696655,
|
|
"misalign/forward_kl_divergence_per_token": 0.006875867606140673,
|
|
"misalign/gamma_abs_times_reward_std": 5016933.359375,
|
|
"misalign/gamma_bracketed_rate": 0.9971183687448502,
|
|
"misalign/gamma_reward_residual": -1.4140535995466053e-05,
|
|
"misalign/gamma_star": 9540094.09375,
|
|
"misalign/js_divergence": 0.6251252107322216,
|
|
"misalign/reverse_kl_divergence": 2.4535476565361023,
|
|
"misalign/reverse_kl_divergence_per_token": 0.006300629815086722,
|
|
"misalign/reward_a": -4.234024614095688,
|
|
"misalign/reward_b": -17.268560528755188,
|
|
"misalign/reward_improvement": 13.034542560577393,
|
|
"misalign/reward_improvement_over_reverse_kl": 4.815744161605835,
|
|
"misalign/reward_improvement_per_token": 0.030366417719051242,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -434.87897872924805,
|
|
"misalign/reward_vocab_std": 651.9961624145508,
|
|
"misalign/symmetric_kl": 5.06635195016861,
|
|
"misalign/tv_distance": 14.232259213924408,
|
|
"num_tokens": 1550382.0,
|
|
"rewards/accuracies": 0.953125,
|
|
"rewards/chosen": 0.12840933166444302,
|
|
"rewards/margins": 0.7322482690215111,
|
|
"rewards/rejected": -0.6038389429450035,
|
|
"step": 9,
|
|
"support/residual_count": 151893.478515625,
|
|
"support/residual_mass_policy": 0.03697363403625786,
|
|
"support/residual_mass_reference": 0.037607218604534864,
|
|
"support/residual_reward": -0.5447902157902718,
|
|
"support/runtime_width": 42.5230073928833,
|
|
"support/sampled_loser_rank": 0.6056095100939274,
|
|
"support/sampled_reward_rank": -0.31354507617652416,
|
|
"support/sampled_token_added_rate": 0.03456789907068014,
|
|
"support/sampled_winner_rank": 0.6345465183258057,
|
|
"support/selected_width": 42.5230073928833,
|
|
"support/stored_width": 42.5230073928833
|
|
},
|
|
{
|
|
"epoch": 0.15625,
|
|
"grad_norm": 87.10169982910156,
|
|
"kl/sequence_policy_ref": -6.291569083929062,
|
|
"kl/vocab_forward": 5.60641685128212,
|
|
"kl/vocab_js": 1.291469193994999,
|
|
"kl/vocab_reverse": 5.0420292019844055,
|
|
"kl/vocab_symmetric": 10.648443281650543,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -468.0983257293701,
|
|
"logps/rejected": -1285.5955200195312,
|
|
"loss": 0.9025,
|
|
"loss/dpo": 0.3525316398590803,
|
|
"misalign/J": 54.998396158218384,
|
|
"misalign/J_aux_loss": 0.5499839466065168,
|
|
"misalign/J_aux_loss_raw": 54.998396158218384,
|
|
"misalign/J_over_reverse_kl": 9.880838811397552,
|
|
"misalign/J_per_token": 0.07741667260415852,
|
|
"misalign/compressed_reward_absmax": 3692.7807006835938,
|
|
"misalign/compressed_reward_range": 5182.304748535156,
|
|
"misalign/entropy_a": 1031.7844772338867,
|
|
"misalign/entropy_b": 1076.0891571044922,
|
|
"misalign/forward_kl_divergence": 5.60641685128212,
|
|
"misalign/forward_kl_divergence_per_token": 0.01369796262588352,
|
|
"misalign/gamma_abs_times_reward_std": 11830781.9375,
|
|
"misalign/gamma_bracketed_rate": 0.9962232336401939,
|
|
"misalign/gamma_reward_residual": 9.550651896006457e-06,
|
|
"misalign/gamma_star": 24339454.65625,
|
|
"misalign/js_divergence": 1.291469193994999,
|
|
"misalign/reverse_kl_divergence": 5.0420292019844055,
|
|
"misalign/reverse_kl_divergence_per_token": 0.011788319039624184,
|
|
"misalign/reward_a": -0.19761592149734497,
|
|
"misalign/reward_b": -18.621128231287003,
|
|
"misalign/reward_improvement": 18.42351984977722,
|
|
"misalign/reward_improvement_over_reverse_kl": 3.5281217098236084,
|
|
"misalign/reward_improvement_per_token": 0.044580711517482996,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -340.7347640991211,
|
|
"misalign/reward_vocab_std": 628.4291152954102,
|
|
"misalign/symmetric_kl": 10.648443281650543,
|
|
"misalign/tv_distance": 18.700318098068237,
|
|
"num_tokens": 1705187.0,
|
|
"rewards/accuracies": 0.9375,
|
|
"rewards/chosen": -0.055842478293925524,
|
|
"rewards/margins": 1.1466289013624191,
|
|
"rewards/rejected": -1.2024713829159737,
|
|
"step": 10,
|
|
"support/residual_count": 151893.390625,
|
|
"support/residual_mass_policy": 0.03804836701601744,
|
|
"support/residual_mass_reference": 0.03930599056184292,
|
|
"support/residual_reward": -0.42244877200573683,
|
|
"support/runtime_width": 42.61000061035156,
|
|
"support/sampled_loser_rank": 0.6178636997938156,
|
|
"support/sampled_reward_rank": -0.380008390173316,
|
|
"support/sampled_token_added_rate": 0.03352847881615162,
|
|
"support/sampled_winner_rank": 0.6379577368497849,
|
|
"support/selected_width": 42.61000061035156,
|
|
"support/stored_width": 42.61000061035156
|
|
},
|
|
{
|
|
"epoch": 0.171875,
|
|
"grad_norm": 105.65762329101562,
|
|
"kl/sequence_policy_ref": -11.886906266212463,
|
|
"kl/vocab_forward": 10.033158540725708,
|
|
"kl/vocab_js": 2.2728197276592255,
|
|
"kl/vocab_reverse": 8.8297780752182,
|
|
"kl/vocab_symmetric": 18.862935781478882,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -864.007137298584,
|
|
"logps/rejected": -1354.3248596191406,
|
|
"loss": 0.957,
|
|
"loss/dpo": 0.25358179584145546,
|
|
"misalign/J": 70.34657621383667,
|
|
"misalign/J_aux_loss": 0.7034657262265682,
|
|
"misalign/J_aux_loss_raw": 70.34657621383667,
|
|
"misalign/J_over_reverse_kl": 7.123360276222229,
|
|
"misalign/J_per_token": 0.07085963152348995,
|
|
"misalign/compressed_reward_absmax": 4522.3524169921875,
|
|
"misalign/compressed_reward_range": 6510.963439941406,
|
|
"misalign/entropy_a": 1291.0633544921875,
|
|
"misalign/entropy_b": 1368.3875427246094,
|
|
"misalign/forward_kl_divergence": 10.033158540725708,
|
|
"misalign/forward_kl_divergence_per_token": 0.015985821490176022,
|
|
"misalign/gamma_abs_times_reward_std": 16111157.5625,
|
|
"misalign/gamma_bracketed_rate": 0.9964210242033005,
|
|
"misalign/gamma_reward_residual": 2.2860098624732927e-05,
|
|
"misalign/gamma_star": 28422147.375,
|
|
"misalign/js_divergence": 2.2728197276592255,
|
|
"misalign/reverse_kl_divergence": 8.8297780752182,
|
|
"misalign/reverse_kl_divergence_per_token": 0.013298386707901955,
|
|
"misalign/reward_a": 11.391705840826035,
|
|
"misalign/reward_b": -19.758893489837646,
|
|
"misalign/reward_improvement": 31.150599718093872,
|
|
"misalign/reward_improvement_over_reverse_kl": 3.3264004588127136,
|
|
"misalign/reward_improvement_per_token": 0.04476729570887983,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -656.2439975738525,
|
|
"misalign/reward_vocab_std": 782.3782577514648,
|
|
"misalign/symmetric_kl": 18.862935781478882,
|
|
"misalign/tv_distance": 28.730425596237183,
|
|
"num_tokens": 1887093.0,
|
|
"rewards/accuracies": 0.96875,
|
|
"rewards/chosen": -0.3179726116359234,
|
|
"rewards/margins": 1.741436019539833,
|
|
"rewards/rejected": -2.0594086199998856,
|
|
"step": 11,
|
|
"support/residual_count": 151893.275390625,
|
|
"support/residual_mass_policy": 0.035074356477707624,
|
|
"support/residual_mass_reference": 0.037040101597085595,
|
|
"support/residual_reward": -0.4819745300337672,
|
|
"support/runtime_width": 42.72910785675049,
|
|
"support/sampled_loser_rank": 0.6259545609354973,
|
|
"support/sampled_reward_rank": -0.3720765591133386,
|
|
"support/sampled_token_added_rate": 0.03333452111110091,
|
|
"support/sampled_winner_rank": 0.6261583790183067,
|
|
"support/selected_width": 42.72910785675049,
|
|
"support/stored_width": 42.72910785675049
|
|
},
|
|
{
|
|
"epoch": 0.1875,
|
|
"grad_norm": 113.49061584472656,
|
|
"kl/sequence_policy_ref": -10.061190009117126,
|
|
"kl/vocab_forward": 11.224669754505157,
|
|
"kl/vocab_js": 2.5259178578853607,
|
|
"kl/vocab_reverse": 9.792702317237854,
|
|
"kl/vocab_symmetric": 21.017370462417603,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -895.415397644043,
|
|
"logps/rejected": -1483.4818572998047,
|
|
"loss": 1.1275,
|
|
"loss/dpo": 0.3059763703495264,
|
|
"misalign/J": 82.15509986877441,
|
|
"misalign/J_aux_loss": 0.8215509578585625,
|
|
"misalign/J_aux_loss_raw": 82.15509986877441,
|
|
"misalign/J_over_reverse_kl": 9.914618968963623,
|
|
"misalign/J_per_token": 0.1017089462839067,
|
|
"misalign/compressed_reward_absmax": 4446.288055419922,
|
|
"misalign/compressed_reward_range": 6255.252502441406,
|
|
"misalign/entropy_a": 1389.9479522705078,
|
|
"misalign/entropy_b": 1475.2511596679688,
|
|
"misalign/forward_kl_divergence": 11.224669754505157,
|
|
"misalign/forward_kl_divergence_per_token": 0.017279054620303214,
|
|
"misalign/gamma_abs_times_reward_std": 19089820.0,
|
|
"misalign/gamma_bracketed_rate": 0.994603119790554,
|
|
"misalign/gamma_reward_residual": 1.673686114145312e-05,
|
|
"misalign/gamma_star": 34029449.5,
|
|
"misalign/js_divergence": 2.5259178578853607,
|
|
"misalign/reverse_kl_divergence": 9.792702317237854,
|
|
"misalign/reverse_kl_divergence_per_token": 0.015302568324841559,
|
|
"misalign/reward_a": 18.606368482112885,
|
|
"misalign/reward_b": -13.686601161956787,
|
|
"misalign/reward_improvement": 32.2929527759552,
|
|
"misalign/reward_improvement_over_reverse_kl": 3.0961980521678925,
|
|
"misalign/reward_improvement_per_token": 0.04442322696559131,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -424.1544990539551,
|
|
"misalign/reward_vocab_std": 740.4278564453125,
|
|
"misalign/symmetric_kl": 21.017370462417603,
|
|
"misalign/tv_distance": 30.929341316223145,
|
|
"num_tokens": 2067343.0,
|
|
"rewards/accuracies": 0.921875,
|
|
"rewards/chosen": -0.20568968541920185,
|
|
"rewards/margins": 1.6008586585521698,
|
|
"rewards/rejected": -1.8065483272075653,
|
|
"step": 12,
|
|
"support/residual_count": 151893.384765625,
|
|
"support/residual_mass_policy": 0.04051102139055729,
|
|
"support/residual_mass_reference": 0.042460997588932514,
|
|
"support/residual_reward": -0.3618390057235956,
|
|
"support/runtime_width": 42.61893367767334,
|
|
"support/sampled_loser_rank": 0.6536427140235901,
|
|
"support/sampled_reward_rank": -0.37369205243885517,
|
|
"support/sampled_token_added_rate": 0.037714328384026885,
|
|
"support/sampled_winner_rank": 0.6573853716254234,
|
|
"support/selected_width": 42.61893367767334,
|
|
"support/stored_width": 42.61893367767334
|
|
},
|
|
{
|
|
"epoch": 0.1875,
|
|
"eval_kl/sequence_policy_ref": -17.01151405274868,
|
|
"eval_kl/vocab_forward": 15.591387048363686,
|
|
"eval_kl/vocab_js": 3.397782253101468,
|
|
"eval_kl/vocab_reverse": 13.213865287601948,
|
|
"eval_kl/vocab_symmetric": 28.80525030195713,
|
|
"eval_logps/chosen": -734.3654553890228,
|
|
"eval_logps/rejected": -1399.9234561920166,
|
|
"eval_loss": 1.0719456672668457,
|
|
"eval_loss/dpo": 0.21070287289330736,
|
|
"eval_misalign/J": 86.12427139282227,
|
|
"eval_misalign/J_aux_loss": 0.8612426882609725,
|
|
"eval_misalign/J_aux_loss_raw": 86.12427139282227,
|
|
"eval_misalign/J_over_reverse_kl": 7.439761482179165,
|
|
"eval_misalign/J_per_token": 0.11331278597936034,
|
|
"eval_misalign/compressed_reward_absmax": 4236.706287384033,
|
|
"eval_misalign/compressed_reward_range": 6027.056529998779,
|
|
"eval_misalign/entropy_a": 1207.335482597351,
|
|
"eval_misalign/entropy_b": 1298.0544576644897,
|
|
"eval_misalign/forward_kl_divergence": 15.591387048363686,
|
|
"eval_misalign/forward_kl_divergence_per_token": 0.027479787677293643,
|
|
"eval_misalign/gamma_abs_times_reward_std": 20802586.9921875,
|
|
"eval_misalign/gamma_bracketed_rate": 0.9936437727883458,
|
|
"eval_misalign/gamma_reward_residual": 2.4353206566019026e-05,
|
|
"eval_misalign/gamma_star": 37987713.1640625,
|
|
"eval_misalign/js_divergence": 3.397782253101468,
|
|
"eval_misalign/reverse_kl_divergence": 13.213865287601948,
|
|
"eval_misalign/reverse_kl_divergence_per_token": 0.022622147007496096,
|
|
"eval_misalign/reward_a": 23.110429362626746,
|
|
"eval_misalign/reward_b": -14.30728217586875,
|
|
"eval_misalign/reward_improvement": 37.41771391034126,
|
|
"eval_misalign/reward_improvement_over_reverse_kl": 2.6861571483314037,
|
|
"eval_misalign/reward_improvement_per_token": 0.05757526887464337,
|
|
"eval_misalign/reward_signal_low_rate": 0.0,
|
|
"eval_misalign/reward_vocab_mean": -490.11555767059326,
|
|
"eval_misalign/reward_vocab_std": 731.3202633857727,
|
|
"eval_misalign/symmetric_kl": 28.80525030195713,
|
|
"eval_misalign/tv_distance": 33.611202627420425,
|
|
"eval_rewards/accuracies": 0.931640625,
|
|
"eval_rewards/chosen": -0.5189501565182582,
|
|
"eval_rewards/margins": 2.3644025400280952,
|
|
"eval_rewards/rejected": -2.883352691307664,
|
|
"eval_runtime": 100.9771,
|
|
"eval_samples_per_second": 5.07,
|
|
"eval_steps_per_second": 0.634,
|
|
"eval_support/residual_count": 151893.29125976562,
|
|
"eval_support/residual_mass_policy": 0.04048109907307662,
|
|
"eval_support/residual_mass_reference": 0.04279232310364023,
|
|
"eval_support/residual_reward": -0.4695481152739376,
|
|
"eval_support/runtime_width": 42.70963191986084,
|
|
"eval_support/sampled_loser_rank": 0.6486562248319387,
|
|
"eval_support/sampled_reward_rank": -0.37071577250026166,
|
|
"eval_support/sampled_token_added_rate": 0.037317203474231064,
|
|
"eval_support/sampled_winner_rank": 0.6506854901090264,
|
|
"eval_support/selected_width": 42.70963191986084,
|
|
"eval_support/stored_width": 42.70963191986084,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.203125,
|
|
"grad_norm": 143.4099884033203,
|
|
"kl/sequence_policy_ref": -17.67092001438141,
|
|
"kl/vocab_forward": 16.044883847236633,
|
|
"kl/vocab_js": 3.4656281918287277,
|
|
"kl/vocab_reverse": 13.393466770648956,
|
|
"kl/vocab_symmetric": 29.438353061676025,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -670.5424919128418,
|
|
"logps/rejected": -1345.9356536865234,
|
|
"loss": 1.3177,
|
|
"loss/dpo": 0.21267282590270042,
|
|
"misalign/J": 110.50714302062988,
|
|
"misalign/J_aux_loss": 1.1050714254379272,
|
|
"misalign/J_aux_loss_raw": 110.50714302062988,
|
|
"misalign/J_over_reverse_kl": 8.267582476139069,
|
|
"misalign/J_per_token": 0.11648859549313784,
|
|
"misalign/compressed_reward_absmax": 4362.797546386719,
|
|
"misalign/compressed_reward_range": 6223.5970458984375,
|
|
"misalign/entropy_a": 1131.1556549072266,
|
|
"misalign/entropy_b": 1221.6925888061523,
|
|
"misalign/forward_kl_divergence": 16.044883847236633,
|
|
"misalign/forward_kl_divergence_per_token": 0.02564867539331317,
|
|
"misalign/gamma_abs_times_reward_std": 24747786.5,
|
|
"misalign/gamma_bracketed_rate": 0.9937806725502014,
|
|
"misalign/gamma_reward_residual": 4.918595743674814e-05,
|
|
"misalign/gamma_star": 46901569.25,
|
|
"misalign/js_divergence": 3.4656281918287277,
|
|
"misalign/reverse_kl_divergence": 13.393466770648956,
|
|
"misalign/reverse_kl_divergence_per_token": 0.02141062926966697,
|
|
"misalign/reward_a": 23.634789615869522,
|
|
"misalign/reward_b": -12.488820567727089,
|
|
"misalign/reward_improvement": 36.12361431121826,
|
|
"misalign/reward_improvement_over_reverse_kl": 2.6490939259529114,
|
|
"misalign/reward_improvement_per_token": 0.05916513060219586,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -522.8331413269043,
|
|
"misalign/reward_vocab_std": 745.3072738647461,
|
|
"misalign/symmetric_kl": 29.438353061676025,
|
|
"misalign/tv_distance": 33.53639495372772,
|
|
"num_tokens": 2250359.0,
|
|
"rewards/accuracies": 0.953125,
|
|
"rewards/chosen": -0.6528808567672968,
|
|
"rewards/margins": 2.228422373533249,
|
|
"rewards/rejected": -2.8813031911849976,
|
|
"step": 13,
|
|
"support/residual_count": 151893.263671875,
|
|
"support/residual_mass_policy": 0.03615651559084654,
|
|
"support/residual_mass_reference": 0.03887659031897783,
|
|
"support/residual_reward": -0.4554095212370157,
|
|
"support/runtime_width": 42.7385835647583,
|
|
"support/sampled_loser_rank": 0.6245415061712265,
|
|
"support/sampled_reward_rank": -0.40703338757157326,
|
|
"support/sampled_token_added_rate": 0.032946799183264375,
|
|
"support/sampled_winner_rank": 0.6250991076231003,
|
|
"support/selected_width": 42.7385835647583,
|
|
"support/stored_width": 42.7385835647583
|
|
},
|
|
{
|
|
"epoch": 0.21875,
|
|
"grad_norm": 129.94761657714844,
|
|
"kl/sequence_policy_ref": -28.61777091026306,
|
|
"kl/vocab_forward": 24.432228088378906,
|
|
"kl/vocab_js": 5.003187119960785,
|
|
"kl/vocab_reverse": 19.63451886177063,
|
|
"kl/vocab_symmetric": 44.0667519569397,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -519.6495475769043,
|
|
"logps/rejected": -1307.973617553711,
|
|
"loss": 1.2097,
|
|
"loss/dpo": 0.13245126977562904,
|
|
"misalign/J": 107.72635746002197,
|
|
"misalign/J_aux_loss": 1.0772635713219643,
|
|
"misalign/J_aux_loss_raw": 107.72635746002197,
|
|
"misalign/J_over_reverse_kl": 6.927842974662781,
|
|
"misalign/J_per_token": 0.1508565410040319,
|
|
"misalign/compressed_reward_absmax": 3698.0084838867188,
|
|
"misalign/compressed_reward_range": 5319.326965332031,
|
|
"misalign/entropy_a": 1003.0962753295898,
|
|
"misalign/entropy_b": 1097.5271453857422,
|
|
"misalign/forward_kl_divergence": 24.432228088378906,
|
|
"misalign/forward_kl_divergence_per_token": 0.05230529000982642,
|
|
"misalign/gamma_abs_times_reward_std": 28536720.375,
|
|
"misalign/gamma_bracketed_rate": 0.9909562915563583,
|
|
"misalign/gamma_reward_residual": 4.479655626710155e-05,
|
|
"misalign/gamma_star": 52695571.75,
|
|
"misalign/js_divergence": 5.003187119960785,
|
|
"misalign/reverse_kl_divergence": 19.63451886177063,
|
|
"misalign/reverse_kl_divergence_per_token": 0.04139284580014646,
|
|
"misalign/reward_a": 25.409843683242798,
|
|
"misalign/reward_b": -16.79699671268463,
|
|
"misalign/reward_improvement": 42.20684003829956,
|
|
"misalign/reward_improvement_over_reverse_kl": 2.0897003561258316,
|
|
"misalign/reward_improvement_per_token": 0.08610636787489057,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -519.6820330619812,
|
|
"misalign/reward_vocab_std": 664.3161697387695,
|
|
"misalign/symmetric_kl": 44.0667519569397,
|
|
"misalign/tv_distance": 37.26307129859924,
|
|
"num_tokens": 2405248.0,
|
|
"rewards/accuracies": 0.984375,
|
|
"rewards/chosen": -0.8268119320273399,
|
|
"rewards/margins": 4.069930404424667,
|
|
"rewards/rejected": -4.896742224693298,
|
|
"step": 14,
|
|
"support/residual_count": 151893.26953125,
|
|
"support/residual_mass_policy": 0.03416757704690099,
|
|
"support/residual_mass_reference": 0.03690562699921429,
|
|
"support/residual_reward": -0.5111633716151118,
|
|
"support/runtime_width": 42.731074810028076,
|
|
"support/sampled_loser_rank": 0.5858863964676857,
|
|
"support/sampled_reward_rank": -0.4269859306514263,
|
|
"support/sampled_token_added_rate": 0.03297502198256552,
|
|
"support/sampled_winner_rank": 0.5992331206798553,
|
|
"support/selected_width": 42.731074810028076,
|
|
"support/stored_width": 42.731074810028076
|
|
},
|
|
{
|
|
"epoch": 0.234375,
|
|
"grad_norm": 181.91363525390625,
|
|
"kl/sequence_policy_ref": -37.09248995780945,
|
|
"kl/vocab_forward": 36.39622640609741,
|
|
"kl/vocab_js": 7.3817285895347595,
|
|
"kl/vocab_reverse": 28.830130338668823,
|
|
"kl/vocab_symmetric": 65.22635078430176,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -861.3702850341797,
|
|
"logps/rejected": -1442.107177734375,
|
|
"loss": 1.6622,
|
|
"loss/dpo": 0.07950026832986623,
|
|
"misalign/J": 158.2665023803711,
|
|
"misalign/J_aux_loss": 1.582665003836155,
|
|
"misalign/J_aux_loss_raw": 158.2665023803711,
|
|
"misalign/J_over_reverse_kl": 6.344271242618561,
|
|
"misalign/J_per_token": 0.14481874648481607,
|
|
"misalign/compressed_reward_absmax": 4787.8853759765625,
|
|
"misalign/compressed_reward_range": 6777.99462890625,
|
|
"misalign/entropy_a": 1239.900390625,
|
|
"misalign/entropy_b": 1391.7532577514648,
|
|
"misalign/forward_kl_divergence": 36.39622640609741,
|
|
"misalign/forward_kl_divergence_per_token": 0.046229132916778326,
|
|
"misalign/gamma_abs_times_reward_std": 40674196.875,
|
|
"misalign/gamma_bracketed_rate": 0.991030216217041,
|
|
"misalign/gamma_reward_residual": 7.2218020022774e-05,
|
|
"misalign/gamma_star": 71395850.0,
|
|
"misalign/js_divergence": 7.3817285895347595,
|
|
"misalign/reverse_kl_divergence": 28.830130338668823,
|
|
"misalign/reverse_kl_divergence_per_token": 0.034299688413739204,
|
|
"misalign/reward_a": 44.823195934295654,
|
|
"misalign/reward_b": -15.68426263332367,
|
|
"misalign/reward_improvement": 60.507455825805664,
|
|
"misalign/reward_improvement_over_reverse_kl": 2.0816327780485153,
|
|
"misalign/reward_improvement_per_token": 0.07316383346915245,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -574.4606781005859,
|
|
"misalign/reward_vocab_std": 834.6710662841797,
|
|
"misalign/symmetric_kl": 65.22635078430176,
|
|
"misalign/tv_distance": 52.97303628921509,
|
|
"num_tokens": 2598130.0,
|
|
"rewards/accuracies": 0.984375,
|
|
"rewards/chosen": -1.3571155220270157,
|
|
"rewards/margins": 4.704267263412476,
|
|
"rewards/rejected": -6.06138277053833,
|
|
"step": 15,
|
|
"support/residual_count": 151893.16015625,
|
|
"support/residual_mass_policy": 0.0329043276142329,
|
|
"support/residual_mass_reference": 0.0375856957398355,
|
|
"support/residual_reward": -0.433580182492733,
|
|
"support/runtime_width": 42.837427616119385,
|
|
"support/sampled_loser_rank": 0.6224480830132961,
|
|
"support/sampled_reward_rank": -0.4383445642888546,
|
|
"support/sampled_token_added_rate": 0.03260477026924491,
|
|
"support/sampled_winner_rank": 0.6216517090797424,
|
|
"support/selected_width": 42.837427616119385,
|
|
"support/stored_width": 42.837427616119385
|
|
},
|
|
{
|
|
"epoch": 0.25,
|
|
"grad_norm": 197.3428497314453,
|
|
"kl/sequence_policy_ref": -48.6728458404541,
|
|
"kl/vocab_forward": 41.45539164543152,
|
|
"kl/vocab_js": 8.142653048038483,
|
|
"kl/vocab_reverse": 31.82970356941223,
|
|
"kl/vocab_symmetric": 73.28509140014648,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -746.3570137023926,
|
|
"logps/rejected": -1378.0999755859375,
|
|
"loss": 1.7196,
|
|
"loss/dpo": 0.21162739349529147,
|
|
"misalign/J": 150.7994818687439,
|
|
"misalign/J_aux_loss": 1.5079948231577873,
|
|
"misalign/J_aux_loss_raw": 150.7994818687439,
|
|
"misalign/J_over_reverse_kl": 6.891877442598343,
|
|
"misalign/J_per_token": 0.1977673191577196,
|
|
"misalign/compressed_reward_absmax": 4197.204010009766,
|
|
"misalign/compressed_reward_range": 6020.658874511719,
|
|
"misalign/entropy_a": 1102.5961456298828,
|
|
"misalign/entropy_b": 1252.7644424438477,
|
|
"misalign/forward_kl_divergence": 41.45539164543152,
|
|
"misalign/forward_kl_divergence_per_token": 0.07184931915253401,
|
|
"misalign/gamma_abs_times_reward_std": 42797047.125,
|
|
"misalign/gamma_bracketed_rate": 0.9879247918725014,
|
|
"misalign/gamma_reward_residual": 7.625430021107604e-05,
|
|
"misalign/gamma_star": 70593251.625,
|
|
"misalign/js_divergence": 8.142653048038483,
|
|
"misalign/reverse_kl_divergence": 31.82970356941223,
|
|
"misalign/reverse_kl_divergence_per_token": 0.04737356537953019,
|
|
"misalign/reward_a": 40.68007683753967,
|
|
"misalign/reward_b": -19.660471826791763,
|
|
"misalign/reward_improvement": 60.34055471420288,
|
|
"misalign/reward_improvement_over_reverse_kl": 1.6718981862068176,
|
|
"misalign/reward_improvement_per_token": 0.06326864054426551,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -500.27739334106445,
|
|
"misalign/reward_vocab_std": 718.7537536621094,
|
|
"misalign/symmetric_kl": 73.28509140014648,
|
|
"misalign/tv_distance": 52.97745728492737,
|
|
"num_tokens": 2767079.0,
|
|
"rewards/accuracies": 0.9375,
|
|
"rewards/chosen": -2.074222356081009,
|
|
"rewards/margins": 5.586124628782272,
|
|
"rewards/rejected": -7.660347044467926,
|
|
"step": 16,
|
|
"support/residual_count": 151892.984375,
|
|
"support/residual_mass_policy": 0.04499144037254155,
|
|
"support/residual_mass_reference": 0.0501289798412472,
|
|
"support/residual_reward": -0.45880454778671265,
|
|
"support/runtime_width": 43.018609046936035,
|
|
"support/sampled_loser_rank": 0.5783994421362877,
|
|
"support/sampled_reward_rank": -0.40191334672272205,
|
|
"support/sampled_token_added_rate": 0.035224413964897394,
|
|
"support/sampled_winner_rank": 0.6004925258457661,
|
|
"support/selected_width": 43.018609046936035,
|
|
"support/stored_width": 43.018609046936035
|
|
},
|
|
{
|
|
"epoch": 0.265625,
|
|
"grad_norm": 180.9165802001953,
|
|
"kl/sequence_policy_ref": -57.82066249847412,
|
|
"kl/vocab_forward": 49.91004800796509,
|
|
"kl/vocab_js": 9.40449070930481,
|
|
"kl/vocab_reverse": 36.53464651107788,
|
|
"kl/vocab_symmetric": 86.44468975067139,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -609.6646957397461,
|
|
"logps/rejected": -1610.3834075927734,
|
|
"loss": 1.7231,
|
|
"loss/dpo": 0.09814724331954494,
|
|
"misalign/J": 162.49910640716553,
|
|
"misalign/J_aux_loss": 1.624990999698639,
|
|
"misalign/J_aux_loss_raw": 162.49910640716553,
|
|
"misalign/J_over_reverse_kl": 5.963241904973984,
|
|
"misalign/J_per_token": 0.25427408143877983,
|
|
"misalign/compressed_reward_absmax": 4177.482116699219,
|
|
"misalign/compressed_reward_range": 5814.5203857421875,
|
|
"misalign/entropy_a": 1132.220069885254,
|
|
"misalign/entropy_b": 1293.5945892333984,
|
|
"misalign/forward_kl_divergence": 49.91004800796509,
|
|
"misalign/forward_kl_divergence_per_token": 0.08865668019279838,
|
|
"misalign/gamma_abs_times_reward_std": 45817248.5,
|
|
"misalign/gamma_bracketed_rate": 0.9841953068971634,
|
|
"misalign/gamma_reward_residual": 6.89706002958701e-05,
|
|
"misalign/gamma_star": 86042709.0,
|
|
"misalign/js_divergence": 9.40449070930481,
|
|
"misalign/reverse_kl_divergence": 36.53464651107788,
|
|
"misalign/reverse_kl_divergence_per_token": 0.05950516927987337,
|
|
"misalign/reward_a": 46.90000104904175,
|
|
"misalign/reward_b": -18.144596874713898,
|
|
"misalign/reward_improvement": 65.04460048675537,
|
|
"misalign/reward_improvement_over_reverse_kl": 1.706669107079506,
|
|
"misalign/reward_improvement_per_token": 0.0920080472715199,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -442.9830160140991,
|
|
"misalign/reward_vocab_std": 696.7616271972656,
|
|
"misalign/symmetric_kl": 86.44468975067139,
|
|
"misalign/tv_distance": 57.59225845336914,
|
|
"num_tokens": 2937506.0,
|
|
"rewards/accuracies": 0.984375,
|
|
"rewards/chosen": -1.9826722741127014,
|
|
"rewards/margins": 7.598788321018219,
|
|
"rewards/rejected": -9.581460356712341,
|
|
"step": 17,
|
|
"support/residual_count": 151893.49609375,
|
|
"support/residual_mass_policy": 0.0333835429046303,
|
|
"support/residual_mass_reference": 0.03812613524496555,
|
|
"support/residual_reward": -0.4731739591807127,
|
|
"support/runtime_width": 42.50527763366699,
|
|
"support/sampled_loser_rank": 0.6437485739588737,
|
|
"support/sampled_reward_rank": -0.3562327502295375,
|
|
"support/sampled_token_added_rate": 0.035286844009533525,
|
|
"support/sampled_winner_rank": 0.6677471101284027,
|
|
"support/selected_width": 42.50527763366699,
|
|
"support/stored_width": 42.50527763366699
|
|
},
|
|
{
|
|
"epoch": 0.28125,
|
|
"grad_norm": 149.1407928466797,
|
|
"kl/sequence_policy_ref": -51.036746978759766,
|
|
"kl/vocab_forward": 44.88189697265625,
|
|
"kl/vocab_js": 8.306392669677734,
|
|
"kl/vocab_reverse": 32.83620524406433,
|
|
"kl/vocab_symmetric": 77.71810150146484,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -560.9642581939697,
|
|
"logps/rejected": -1253.7114028930664,
|
|
"loss": 1.5028,
|
|
"loss/dpo": 0.17861688579432666,
|
|
"misalign/J": 132.42161083221436,
|
|
"misalign/J_aux_loss": 1.324216105043888,
|
|
"misalign/J_aux_loss_raw": 132.42161083221436,
|
|
"misalign/J_over_reverse_kl": 8.582664713263512,
|
|
"misalign/J_per_token": 0.27548689767718315,
|
|
"misalign/compressed_reward_absmax": 3723.6834411621094,
|
|
"misalign/compressed_reward_range": 5332.2451171875,
|
|
"misalign/entropy_a": 928.9597320556641,
|
|
"misalign/entropy_b": 1068.3447341918945,
|
|
"misalign/forward_kl_divergence": 44.88189697265625,
|
|
"misalign/forward_kl_divergence_per_token": 0.11330410279333591,
|
|
"misalign/gamma_abs_times_reward_std": 38645046.25,
|
|
"misalign/gamma_bracketed_rate": 0.9823459088802338,
|
|
"misalign/gamma_reward_residual": 7.946674531922326e-05,
|
|
"misalign/gamma_star": 71871948.75,
|
|
"misalign/js_divergence": 8.306392669677734,
|
|
"misalign/reverse_kl_divergence": 32.83620524406433,
|
|
"misalign/reverse_kl_divergence_per_token": 0.08078084606677294,
|
|
"misalign/reward_a": 35.60689043998718,
|
|
"misalign/reward_b": -16.05292272567749,
|
|
"misalign/reward_improvement": 51.65981483459473,
|
|
"misalign/reward_improvement_over_reverse_kl": 1.344532385468483,
|
|
"misalign/reward_improvement_per_token": 0.06508979946374893,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -361.802695274353,
|
|
"misalign/reward_vocab_std": 639.2480430603027,
|
|
"misalign/symmetric_kl": 77.71810150146484,
|
|
"misalign/tv_distance": 48.04610013961792,
|
|
"num_tokens": 3089181.0,
|
|
"rewards/accuracies": 0.953125,
|
|
"rewards/chosen": -2.0581542253494263,
|
|
"rewards/margins": 6.091041147708893,
|
|
"rewards/rejected": -8.149195373058319,
|
|
"step": 18,
|
|
"support/residual_count": 151893.052734375,
|
|
"support/residual_mass_policy": 0.039642924442887306,
|
|
"support/residual_mass_reference": 0.049897957127541304,
|
|
"support/residual_reward": -0.4336713273078203,
|
|
"support/runtime_width": 42.948453426361084,
|
|
"support/sampled_loser_rank": 0.6003664061427116,
|
|
"support/sampled_reward_rank": -0.5117907077074051,
|
|
"support/sampled_token_added_rate": 0.03761800215579569,
|
|
"support/sampled_winner_rank": 0.6974114552140236,
|
|
"support/selected_width": 42.948453426361084,
|
|
"support/stored_width": 42.948453426361084
|
|
},
|
|
{
|
|
"epoch": 0.296875,
|
|
"grad_norm": 236.4145965576172,
|
|
"kl/sequence_policy_ref": -70.85580825805664,
|
|
"kl/vocab_forward": 61.658048152923584,
|
|
"kl/vocab_js": 11.049875855445862,
|
|
"kl/vocab_reverse": 42.84557771682739,
|
|
"kl/vocab_symmetric": 104.50362396240234,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -680.9906196594238,
|
|
"logps/rejected": -1526.8777770996094,
|
|
"loss": 2.3197,
|
|
"loss/dpo": 0.15028794163845305,
|
|
"misalign/J": 216.9369659423828,
|
|
"misalign/J_aux_loss": 2.169369585812092,
|
|
"misalign/J_aux_loss_raw": 216.9369659423828,
|
|
"misalign/J_over_reverse_kl": 6.0987227857112885,
|
|
"misalign/J_per_token": 0.24055337626487017,
|
|
"misalign/compressed_reward_absmax": 4234.713134765625,
|
|
"misalign/compressed_reward_range": 5953.008850097656,
|
|
"misalign/entropy_a": 1090.484504699707,
|
|
"misalign/entropy_b": 1275.7961730957031,
|
|
"misalign/forward_kl_divergence": 61.658048152923584,
|
|
"misalign/forward_kl_divergence_per_token": 0.0996482465416193,
|
|
"misalign/gamma_abs_times_reward_std": 67395967.375,
|
|
"misalign/gamma_bracketed_rate": 0.9844093844294548,
|
|
"misalign/gamma_reward_residual": 0.00012266499561519595,
|
|
"misalign/gamma_star": 125619402.0,
|
|
"misalign/js_divergence": 11.049875855445862,
|
|
"misalign/reverse_kl_divergence": 42.84557771682739,
|
|
"misalign/reverse_kl_divergence_per_token": 0.06486277049407363,
|
|
"misalign/reward_a": 51.76768445968628,
|
|
"misalign/reward_b": -14.660561382770538,
|
|
"misalign/reward_improvement": 66.42824363708496,
|
|
"misalign/reward_improvement_over_reverse_kl": 1.57049061357975,
|
|
"misalign/reward_improvement_per_token": 0.11498506926000118,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -568.4373540878296,
|
|
"misalign/reward_vocab_std": 725.4858703613281,
|
|
"misalign/symmetric_kl": 104.50362396240234,
|
|
"misalign/tv_distance": 62.00803565979004,
|
|
"num_tokens": 3255573.0,
|
|
"rewards/accuracies": 0.953125,
|
|
"rewards/chosen": -3.0821579098701477,
|
|
"rewards/margins": 8.006846249103546,
|
|
"rewards/rejected": -11.089004278182983,
|
|
"step": 19,
|
|
"support/residual_count": 151893.365234375,
|
|
"support/residual_mass_policy": 0.04206296429038048,
|
|
"support/residual_mass_reference": 0.046698169549927115,
|
|
"support/residual_reward": -0.47489158436656,
|
|
"support/runtime_width": 42.64014720916748,
|
|
"support/sampled_loser_rank": 0.605025552213192,
|
|
"support/sampled_reward_rank": -0.3768458142876625,
|
|
"support/sampled_token_added_rate": 0.045746787916868925,
|
|
"support/sampled_winner_rank": 0.6195746287703514,
|
|
"support/selected_width": 42.64014720916748,
|
|
"support/stored_width": 42.64014720916748
|
|
},
|
|
{
|
|
"epoch": 0.3125,
|
|
"grad_norm": 116.85738372802734,
|
|
"kl/sequence_policy_ref": -65.05696249008179,
|
|
"kl/vocab_forward": 57.7905068397522,
|
|
"kl/vocab_js": 10.413756370544434,
|
|
"kl/vocab_reverse": 40.30951166152954,
|
|
"kl/vocab_symmetric": 98.10001516342163,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -673.3448905944824,
|
|
"logps/rejected": -1367.0256805419922,
|
|
"loss": 1.323,
|
|
"loss/dpo": 0.1641167537018191,
|
|
"misalign/J": 115.88617134094238,
|
|
"misalign/J_aux_loss": 1.1588616967201233,
|
|
"misalign/J_aux_loss_raw": 115.88617134094238,
|
|
"misalign/J_over_reverse_kl": 3.0085965991020203,
|
|
"misalign/J_per_token": 0.16471682861447334,
|
|
"misalign/compressed_reward_absmax": 4075.5525817871094,
|
|
"misalign/compressed_reward_range": 5858.878479003906,
|
|
"misalign/entropy_a": 1006.4973907470703,
|
|
"misalign/entropy_b": 1188.8379135131836,
|
|
"misalign/forward_kl_divergence": 57.7905068397522,
|
|
"misalign/forward_kl_divergence_per_token": 0.10363293252885342,
|
|
"misalign/gamma_abs_times_reward_std": 35261591.5,
|
|
"misalign/gamma_bracketed_rate": 0.9897296130657196,
|
|
"misalign/gamma_reward_residual": 6.685065909550758e-05,
|
|
"misalign/gamma_star": 56798719.5,
|
|
"misalign/js_divergence": 10.413756370544434,
|
|
"misalign/reverse_kl_divergence": 40.30951166152954,
|
|
"misalign/reverse_kl_divergence_per_token": 0.07322107395157218,
|
|
"misalign/reward_a": 52.47555136680603,
|
|
"misalign/reward_b": -10.230955243110657,
|
|
"misalign/reward_improvement": 62.70651149749756,
|
|
"misalign/reward_improvement_over_reverse_kl": 1.4691433906555176,
|
|
"misalign/reward_improvement_per_token": 0.10327118635177612,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -398.2197914123535,
|
|
"misalign/reward_vocab_std": 702.5999450683594,
|
|
"misalign/symmetric_kl": 98.10001516342163,
|
|
"misalign/tv_distance": 58.26440095901489,
|
|
"num_tokens": 3415287.0,
|
|
"rewards/accuracies": 0.96875,
|
|
"rewards/chosen": -3.106875829398632,
|
|
"rewards/margins": 6.797641098499298,
|
|
"rewards/rejected": -9.904516816139221,
|
|
"step": 20,
|
|
"support/residual_count": 151892.931640625,
|
|
"support/residual_mass_policy": 0.03865605127066374,
|
|
"support/residual_mass_reference": 0.04426591098308563,
|
|
"support/residual_reward": -0.37147790379822254,
|
|
"support/runtime_width": 43.06875991821289,
|
|
"support/sampled_loser_rank": 0.6501528844237328,
|
|
"support/sampled_reward_rank": -0.459061823785305,
|
|
"support/sampled_token_added_rate": 0.037211825139820576,
|
|
"support/sampled_winner_rank": 0.6419277414679527,
|
|
"support/selected_width": 43.06875991821289,
|
|
"support/stored_width": 43.06875991821289
|
|
},
|
|
{
|
|
"epoch": 0.328125,
|
|
"grad_norm": 167.24000549316406,
|
|
"kl/sequence_policy_ref": -87.30181169509888,
|
|
"kl/vocab_forward": 80.11595010757446,
|
|
"kl/vocab_js": 13.753250360488892,
|
|
"kl/vocab_reverse": 52.890267848968506,
|
|
"kl/vocab_symmetric": 133.00623035430908,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -860.2458534240723,
|
|
"logps/rejected": -1500.8129959106445,
|
|
"loss": 1.9906,
|
|
"loss/dpo": 0.29799531144089997,
|
|
"misalign/J": 169.26398181915283,
|
|
"misalign/J_aux_loss": 1.6926398500800133,
|
|
"misalign/J_aux_loss_raw": 169.26398181915283,
|
|
"misalign/J_over_reverse_kl": 4.501902684569359,
|
|
"misalign/J_per_token": 0.20157606061547995,
|
|
"misalign/compressed_reward_absmax": 4415.365386962891,
|
|
"misalign/compressed_reward_range": 6252.089111328125,
|
|
"misalign/entropy_a": 1110.4772415161133,
|
|
"misalign/entropy_b": 1350.9320755004883,
|
|
"misalign/forward_kl_divergence": 80.11595010757446,
|
|
"misalign/forward_kl_divergence_per_token": 0.10054660588502884,
|
|
"misalign/gamma_abs_times_reward_std": 54722816.875,
|
|
"misalign/gamma_bracketed_rate": 0.9853794500231743,
|
|
"misalign/gamma_reward_residual": 0.00010197218091434479,
|
|
"misalign/gamma_star": 109237338.75,
|
|
"misalign/js_divergence": 13.753250360488892,
|
|
"misalign/reverse_kl_divergence": 52.890267848968506,
|
|
"misalign/reverse_kl_divergence_per_token": 0.06223124684765935,
|
|
"misalign/reward_a": 63.141523361206055,
|
|
"misalign/reward_b": -12.613204658031464,
|
|
"misalign/reward_improvement": 75.75473356246948,
|
|
"misalign/reward_improvement_over_reverse_kl": 1.376510500907898,
|
|
"misalign/reward_improvement_per_token": 0.08627395983785391,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -474.14845275878906,
|
|
"misalign/reward_vocab_std": 756.6061401367188,
|
|
"misalign/symmetric_kl": 133.00623035430908,
|
|
"misalign/tv_distance": 71.77893543243408,
|
|
"num_tokens": 3596201.0,
|
|
"rewards/accuracies": 0.90625,
|
|
"rewards/chosen": -5.184990763664246,
|
|
"rewards/margins": 7.090380907058716,
|
|
"rewards/rejected": -12.275371551513672,
|
|
"step": 21,
|
|
"support/residual_count": 151893.302734375,
|
|
"support/residual_mass_policy": 0.03361299750395119,
|
|
"support/residual_mass_reference": 0.03923962963744998,
|
|
"support/residual_reward": -0.44669216219335794,
|
|
"support/runtime_width": 42.699111461639404,
|
|
"support/sampled_loser_rank": 0.5900973714888096,
|
|
"support/sampled_reward_rank": -0.3974966434761882,
|
|
"support/sampled_token_added_rate": 0.034115204587578773,
|
|
"support/sampled_winner_rank": 0.6001664698123932,
|
|
"support/selected_width": 42.699111461639404,
|
|
"support/stored_width": 42.699111461639404
|
|
},
|
|
{
|
|
"epoch": 0.34375,
|
|
"grad_norm": 182.3348846435547,
|
|
"kl/sequence_policy_ref": -98.32647848129272,
|
|
"kl/vocab_forward": 87.50644826889038,
|
|
"kl/vocab_js": 14.100131571292877,
|
|
"kl/vocab_reverse": 54.41349792480469,
|
|
"kl/vocab_symmetric": 141.9199457168579,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -720.9393196105957,
|
|
"logps/rejected": -1401.5432891845703,
|
|
"loss": 1.9288,
|
|
"loss/dpo": 0.18852760957088321,
|
|
"misalign/J": 174.02261638641357,
|
|
"misalign/J_aux_loss": 1.7402261197566986,
|
|
"misalign/J_aux_loss_raw": 174.02261638641357,
|
|
"misalign/J_over_reverse_kl": 3.6353148818016052,
|
|
"misalign/J_per_token": 0.21223169937729836,
|
|
"misalign/compressed_reward_absmax": 3982.8233032226562,
|
|
"misalign/compressed_reward_range": 5637.1624755859375,
|
|
"misalign/entropy_a": 958.0937767028809,
|
|
"misalign/entropy_b": 1190.8169174194336,
|
|
"misalign/forward_kl_divergence": 87.50644826889038,
|
|
"misalign/forward_kl_divergence_per_token": 0.14662323985248804,
|
|
"misalign/gamma_abs_times_reward_std": 59248922.75,
|
|
"misalign/gamma_bracketed_rate": 0.9862400367856026,
|
|
"misalign/gamma_reward_residual": 0.0001503152491295623,
|
|
"misalign/gamma_star": 115726537.75,
|
|
"misalign/js_divergence": 14.100131571292877,
|
|
"misalign/reverse_kl_divergence": 54.41349792480469,
|
|
"misalign/reverse_kl_divergence_per_token": 0.08750392450019717,
|
|
"misalign/reward_a": 56.57697582244873,
|
|
"misalign/reward_b": -11.629800856113434,
|
|
"misalign/reward_improvement": 68.20676565170288,
|
|
"misalign/reward_improvement_over_reverse_kl": 1.19433955848217,
|
|
"misalign/reward_improvement_per_token": 0.09249244816601276,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -674.6355495452881,
|
|
"misalign/reward_vocab_std": 699.2789764404297,
|
|
"misalign/symmetric_kl": 141.9199457168579,
|
|
"misalign/tv_distance": 68.87815427780151,
|
|
"num_tokens": 3758885.0,
|
|
"rewards/accuracies": 0.96875,
|
|
"rewards/chosen": -4.974303662776947,
|
|
"rewards/margins": 9.71668916940689,
|
|
"rewards/rejected": -14.69099223613739,
|
|
"step": 22,
|
|
"support/residual_count": 151893.119140625,
|
|
"support/residual_mass_policy": 0.033487192122265697,
|
|
"support/residual_mass_reference": 0.041745478520169854,
|
|
"support/residual_reward": -0.5358738675713539,
|
|
"support/runtime_width": 42.88250732421875,
|
|
"support/sampled_loser_rank": 0.6002072133123875,
|
|
"support/sampled_reward_rank": -0.3628186024725437,
|
|
"support/sampled_token_added_rate": 0.036499075358733535,
|
|
"support/sampled_winner_rank": 0.6214041896164417,
|
|
"support/selected_width": 42.88250732421875,
|
|
"support/stored_width": 42.88250732421875
|
|
},
|
|
{
|
|
"epoch": 0.359375,
|
|
"grad_norm": 169.51712036132812,
|
|
"kl/sequence_policy_ref": -149.24591064453125,
|
|
"kl/vocab_forward": 133.08092784881592,
|
|
"kl/vocab_js": 19.9760000705719,
|
|
"kl/vocab_reverse": 77.37433004379272,
|
|
"kl/vocab_symmetric": 210.455228805542,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -737.869384765625,
|
|
"logps/rejected": -1762.2871856689453,
|
|
"loss": 1.9918,
|
|
"loss/dpo": 0.2823996262759465,
|
|
"misalign/J": 170.94224166870117,
|
|
"misalign/J_aux_loss": 1.7094224244356155,
|
|
"misalign/J_aux_loss_raw": 170.94224166870117,
|
|
"misalign/J_over_reverse_kl": 3.2799622118473053,
|
|
"misalign/J_per_token": 0.24247757904231548,
|
|
"misalign/compressed_reward_absmax": 4777.976379394531,
|
|
"misalign/compressed_reward_range": 6731.566955566406,
|
|
"misalign/entropy_a": 1061.749008178711,
|
|
"misalign/entropy_b": 1356.131118774414,
|
|
"misalign/forward_kl_divergence": 133.08092784881592,
|
|
"misalign/forward_kl_divergence_per_token": 0.1782828439027071,
|
|
"misalign/gamma_abs_times_reward_std": 53340090.0,
|
|
"misalign/gamma_bracketed_rate": 0.9852898493409157,
|
|
"misalign/gamma_reward_residual": 5.0128826615036814e-05,
|
|
"misalign/gamma_star": 100226211.0,
|
|
"misalign/js_divergence": 19.9760000705719,
|
|
"misalign/reverse_kl_divergence": 77.37433004379272,
|
|
"misalign/reverse_kl_divergence_per_token": 0.09528437815606594,
|
|
"misalign/reward_a": 69.03681755065918,
|
|
"misalign/reward_b": -18.53004103899002,
|
|
"misalign/reward_improvement": 87.56686687469482,
|
|
"misalign/reward_improvement_over_reverse_kl": 1.0574140399694443,
|
|
"misalign/reward_improvement_per_token": 0.08220357168465853,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -386.6137237548828,
|
|
"misalign/reward_vocab_std": 798.2211227416992,
|
|
"misalign/symmetric_kl": 210.455228805542,
|
|
"misalign/tv_distance": 88.67370319366455,
|
|
"num_tokens": 3948274.0,
|
|
"rewards/accuracies": 0.953125,
|
|
"rewards/chosen": -6.972290515899658,
|
|
"rewards/margins": 15.904601573944092,
|
|
"rewards/rejected": -22.876891613006592,
|
|
"step": 23,
|
|
"support/residual_count": 151893.173828125,
|
|
"support/residual_mass_policy": 0.027609078446403146,
|
|
"support/residual_mass_reference": 0.03574479790404439,
|
|
"support/residual_reward": -0.29973831586539745,
|
|
"support/runtime_width": 42.827510833740234,
|
|
"support/sampled_loser_rank": 0.6080890074372292,
|
|
"support/sampled_reward_rank": -0.44763438403606415,
|
|
"support/sampled_token_added_rate": 0.03303293650969863,
|
|
"support/sampled_winner_rank": 0.6279079839587212,
|
|
"support/selected_width": 42.827510833740234,
|
|
"support/stored_width": 42.827510833740234
|
|
},
|
|
{
|
|
"epoch": 0.375,
|
|
"grad_norm": 140.47096252441406,
|
|
"kl/sequence_policy_ref": -144.5389518737793,
|
|
"kl/vocab_forward": 128.69008922576904,
|
|
"kl/vocab_js": 18.313786387443542,
|
|
"kl/vocab_reverse": 71.02430152893066,
|
|
"kl/vocab_symmetric": 199.71442413330078,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -572.0278053283691,
|
|
"logps/rejected": -1404.0182037353516,
|
|
"loss": 2.6061,
|
|
"loss/dpo": 1.0913660326041281,
|
|
"misalign/J": 151.477144241333,
|
|
"misalign/J_aux_loss": 1.5147713869810104,
|
|
"misalign/J_aux_loss_raw": 151.477144241333,
|
|
"misalign/J_over_reverse_kl": 3.1627804189920425,
|
|
"misalign/J_per_token": 0.24758470617234707,
|
|
"misalign/compressed_reward_absmax": 3778.4814453125,
|
|
"misalign/compressed_reward_range": 5379.040588378906,
|
|
"misalign/entropy_a": 782.0902366638184,
|
|
"misalign/entropy_b": 1035.1530227661133,
|
|
"misalign/forward_kl_divergence": 128.69008922576904,
|
|
"misalign/forward_kl_divergence_per_token": 0.2601375840604305,
|
|
"misalign/gamma_abs_times_reward_std": 47515251.5,
|
|
"misalign/gamma_bracketed_rate": 0.9839186295866966,
|
|
"misalign/gamma_reward_residual": 0.00012259059758434887,
|
|
"misalign/gamma_star": 87554911.0,
|
|
"misalign/js_divergence": 18.313786387443542,
|
|
"misalign/reverse_kl_divergence": 71.02430152893066,
|
|
"misalign/reverse_kl_divergence_per_token": 0.13188489899039268,
|
|
"misalign/reward_a": 56.8120379447937,
|
|
"misalign/reward_b": -14.587293282151222,
|
|
"misalign/reward_improvement": 71.39935445785522,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.8568570390343666,
|
|
"misalign/reward_improvement_per_token": 0.08758416399359703,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -452.94491720199585,
|
|
"misalign/reward_vocab_std": 656.8603706359863,
|
|
"misalign/symmetric_kl": 199.71442413330078,
|
|
"misalign/tv_distance": 75.44480800628662,
|
|
"num_tokens": 4099141.0,
|
|
"rewards/accuracies": 0.890625,
|
|
"rewards/chosen": -7.403442680835724,
|
|
"rewards/margins": 14.100905060768127,
|
|
"rewards/rejected": -21.504348874092102,
|
|
"step": 24,
|
|
"support/residual_count": 151893.353515625,
|
|
"support/residual_mass_policy": 0.03428218117915094,
|
|
"support/residual_mass_reference": 0.04287252272479236,
|
|
"support/residual_reward": -0.5751103330403566,
|
|
"support/runtime_width": 42.65069341659546,
|
|
"support/sampled_loser_rank": 0.573052179068327,
|
|
"support/sampled_reward_rank": -0.40498005226254463,
|
|
"support/sampled_token_added_rate": 0.03546261019073427,
|
|
"support/sampled_winner_rank": 0.5977285951375961,
|
|
"support/selected_width": 42.65069341659546,
|
|
"support/stored_width": 42.65069341659546
|
|
},
|
|
{
|
|
"epoch": 0.375,
|
|
"eval_kl/sequence_policy_ref": -176.45206832885742,
|
|
"eval_kl/vocab_forward": 159.24169623851776,
|
|
"eval_kl/vocab_js": 22.288581863045692,
|
|
"eval_kl/vocab_reverse": 86.69451874494553,
|
|
"eval_kl/vocab_symmetric": 245.93626713752747,
|
|
"eval_logps/chosen": -822.1732840538025,
|
|
"eval_logps/rejected": -1630.9967403411865,
|
|
"eval_loss": 2.032864570617676,
|
|
"eval_loss/dpo": 0.513789746529512,
|
|
"eval_misalign/J": 151.90749096870422,
|
|
"eval_misalign/J_aux_loss": 1.519074865616858,
|
|
"eval_misalign/J_aux_loss_raw": 151.90749096870422,
|
|
"eval_misalign/J_over_reverse_kl": 2.979654673486948,
|
|
"eval_misalign/J_per_token": 0.2187155862338841,
|
|
"eval_misalign/compressed_reward_absmax": 4236.706275939941,
|
|
"eval_misalign/compressed_reward_range": 6027.056526184082,
|
|
"eval_misalign/entropy_a": 972.2038908004761,
|
|
"eval_misalign/entropy_b": 1298.0544624328613,
|
|
"eval_misalign/forward_kl_divergence": 159.24169623851776,
|
|
"eval_misalign/forward_kl_divergence_per_token": 0.28451165836304426,
|
|
"eval_misalign/gamma_abs_times_reward_std": 44413324.9375,
|
|
"eval_misalign/gamma_bracketed_rate": 0.9879434006288648,
|
|
"eval_misalign/gamma_reward_residual": 9.759679009846423e-05,
|
|
"eval_misalign/gamma_star": 76999057.8125,
|
|
"eval_misalign/js_divergence": 22.288581863045692,
|
|
"eval_misalign/reverse_kl_divergence": 86.69451874494553,
|
|
"eval_misalign/reverse_kl_divergence_per_token": 0.14481490349862725,
|
|
"eval_misalign/reward_a": 73.34608280658722,
|
|
"eval_misalign/reward_b": -14.307282455265522,
|
|
"eval_misalign/reward_improvement": 87.65336620807648,
|
|
"eval_misalign/reward_improvement_over_reverse_kl": 0.9149683965370059,
|
|
"eval_misalign/reward_improvement_per_token": 0.08359824417857453,
|
|
"eval_misalign/reward_signal_low_rate": 0.0,
|
|
"eval_misalign/reward_vocab_mean": -490.11556124687195,
|
|
"eval_misalign/reward_vocab_std": 731.3202571868896,
|
|
"eval_misalign/symmetric_kl": 245.93626713752747,
|
|
"eval_misalign/tv_distance": 92.47358250617981,
|
|
"eval_rewards/accuracies": 0.91796875,
|
|
"eval_rewards/chosen": -9.299732282757759,
|
|
"eval_rewards/margins": 16.690949447453022,
|
|
"eval_rewards/rejected": -25.99068196117878,
|
|
"eval_runtime": 101.3791,
|
|
"eval_samples_per_second": 5.05,
|
|
"eval_steps_per_second": 0.631,
|
|
"eval_support/residual_count": 151893.29125976562,
|
|
"eval_support/residual_mass_policy": 0.031916850464767776,
|
|
"eval_support/residual_mass_reference": 0.04279232310364023,
|
|
"eval_support/residual_reward": -0.4695481152739376,
|
|
"eval_support/runtime_width": 42.70963191986084,
|
|
"eval_support/sampled_loser_rank": 0.6486562248319387,
|
|
"eval_support/sampled_reward_rank": -0.37071577250026166,
|
|
"eval_support/sampled_token_added_rate": 0.037317203474231064,
|
|
"eval_support/sampled_winner_rank": 0.6506854901090264,
|
|
"eval_support/selected_width": 42.70963191986084,
|
|
"eval_support/stored_width": 42.70963191986084,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.390625,
|
|
"grad_norm": 103.1752700805664,
|
|
"kl/sequence_policy_ref": -175.5179786682129,
|
|
"kl/vocab_forward": 165.2755527496338,
|
|
"kl/vocab_js": 22.945865869522095,
|
|
"kl/vocab_reverse": 88.68728542327881,
|
|
"kl/vocab_symmetric": 253.96291160583496,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -823.9939117431641,
|
|
"logps/rejected": -1613.8134307861328,
|
|
"loss": 1.7244,
|
|
"loss/dpo": 0.30688550411116466,
|
|
"misalign/J": 141.75555324554443,
|
|
"misalign/J_aux_loss": 1.4175555855035782,
|
|
"misalign/J_aux_loss_raw": 141.75555324554443,
|
|
"misalign/J_over_reverse_kl": 1.7237665206193924,
|
|
"misalign/J_per_token": 0.20317152328789234,
|
|
"misalign/compressed_reward_absmax": 4355.498870849609,
|
|
"misalign/compressed_reward_range": 6246.024597167969,
|
|
"misalign/entropy_a": 947.64599609375,
|
|
"misalign/entropy_b": 1290.7304077148438,
|
|
"misalign/forward_kl_divergence": 165.2755527496338,
|
|
"misalign/forward_kl_divergence_per_token": 0.3030826188623905,
|
|
"misalign/gamma_abs_times_reward_std": 42652071.0,
|
|
"misalign/gamma_bracketed_rate": 0.9883464574813843,
|
|
"misalign/gamma_reward_residual": 5.872455130884191e-05,
|
|
"misalign/gamma_star": 72449817.125,
|
|
"misalign/js_divergence": 22.945865869522095,
|
|
"misalign/reverse_kl_divergence": 88.68728542327881,
|
|
"misalign/reverse_kl_divergence_per_token": 0.13877300918102264,
|
|
"misalign/reward_a": 74.63873767852783,
|
|
"misalign/reward_b": -13.649469316005707,
|
|
"misalign/reward_improvement": 88.28820371627808,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.9083794951438904,
|
|
"misalign/reward_improvement_per_token": 0.09202832682058215,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -502.7140769958496,
|
|
"misalign/reward_vocab_std": 763.2846450805664,
|
|
"misalign/symmetric_kl": 253.96291160583496,
|
|
"misalign/tv_distance": 94.16235828399658,
|
|
"num_tokens": 4267525.0,
|
|
"rewards/accuracies": 0.96875,
|
|
"rewards/chosen": -9.091135799884796,
|
|
"rewards/margins": 16.92132532596588,
|
|
"rewards/rejected": -26.012461185455322,
|
|
"step": 25,
|
|
"support/residual_count": 151893.228515625,
|
|
"support/residual_mass_policy": 0.02821849100291729,
|
|
"support/residual_mass_reference": 0.03902764664962888,
|
|
"support/residual_reward": -0.4604416638612747,
|
|
"support/runtime_width": 42.77067279815674,
|
|
"support/sampled_loser_rank": 0.6332324221730232,
|
|
"support/sampled_reward_rank": -0.385429447516799,
|
|
"support/sampled_token_added_rate": 0.03589798975735903,
|
|
"support/sampled_winner_rank": 0.6475523337721825,
|
|
"support/selected_width": 42.77067279815674,
|
|
"support/stored_width": 42.77067279815674
|
|
},
|
|
{
|
|
"epoch": 0.40625,
|
|
"grad_norm": 174.2784423828125,
|
|
"kl/sequence_policy_ref": -163.66453552246094,
|
|
"kl/vocab_forward": 142.52412605285645,
|
|
"kl/vocab_js": 19.224056720733643,
|
|
"kl/vocab_reverse": 76.05754041671753,
|
|
"kl/vocab_symmetric": 218.58167839050293,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -591.3704357147217,
|
|
"logps/rejected": -1515.6297912597656,
|
|
"loss": 1.9078,
|
|
"loss/dpo": 0.4340968047727074,
|
|
"misalign/J": 147.37513256072998,
|
|
"misalign/J_aux_loss": 1.473751276731491,
|
|
"misalign/J_aux_loss_raw": 147.37513256072998,
|
|
"misalign/J_over_reverse_kl": 2.757804274559021,
|
|
"misalign/J_per_token": 0.28767452389001846,
|
|
"misalign/compressed_reward_absmax": 4011.147979736328,
|
|
"misalign/compressed_reward_range": 5775.938781738281,
|
|
"misalign/entropy_a": 838.7755889892578,
|
|
"misalign/entropy_b": 1110.4411010742188,
|
|
"misalign/forward_kl_divergence": 142.52412605285645,
|
|
"misalign/forward_kl_divergence_per_token": 0.36550967395305634,
|
|
"misalign/gamma_abs_times_reward_std": 50544949.5,
|
|
"misalign/gamma_bracketed_rate": 0.9852296411991119,
|
|
"misalign/gamma_reward_residual": 4.505999822868034e-05,
|
|
"misalign/gamma_star": 83547619.75,
|
|
"misalign/js_divergence": 19.224056720733643,
|
|
"misalign/reverse_kl_divergence": 76.05754041671753,
|
|
"misalign/reverse_kl_divergence_per_token": 0.18261760007590055,
|
|
"misalign/reward_a": 57.265894651412964,
|
|
"misalign/reward_b": -16.07930701971054,
|
|
"misalign/reward_improvement": 73.3451886177063,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.8678321242332458,
|
|
"misalign/reward_improvement_per_token": 0.11522631160914898,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -398.8517951965332,
|
|
"misalign/reward_vocab_std": 701.5557250976562,
|
|
"misalign/symmetric_kl": 218.58167839050293,
|
|
"misalign/tv_distance": 77.64586639404297,
|
|
"num_tokens": 4432914.0,
|
|
"rewards/accuracies": 0.984375,
|
|
"rewards/chosen": -6.874026566743851,
|
|
"rewards/margins": 18.98485553264618,
|
|
"rewards/rejected": -25.85888135433197,
|
|
"step": 26,
|
|
"support/residual_count": 151893.0546875,
|
|
"support/residual_mass_policy": 0.034172143787145615,
|
|
"support/residual_mass_reference": 0.045072893146425486,
|
|
"support/residual_reward": -0.4141153208911419,
|
|
"support/runtime_width": 42.94584655761719,
|
|
"support/sampled_loser_rank": 0.5826500616967678,
|
|
"support/sampled_reward_rank": -0.4691601078957319,
|
|
"support/sampled_token_added_rate": 0.04121970618143678,
|
|
"support/sampled_winner_rank": 0.5887424424290657,
|
|
"support/selected_width": 42.94584655761719,
|
|
"support/stored_width": 42.94584655761719
|
|
},
|
|
{
|
|
"epoch": 0.421875,
|
|
"grad_norm": 136.3807830810547,
|
|
"kl/sequence_policy_ref": -203.77911186218262,
|
|
"kl/vocab_forward": 183.4600429534912,
|
|
"kl/vocab_js": 24.132691860198975,
|
|
"kl/vocab_reverse": 93.94395637512207,
|
|
"kl/vocab_symmetric": 277.40405654907227,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -713.9009399414062,
|
|
"logps/rejected": -1687.3765106201172,
|
|
"loss": 2.0315,
|
|
"loss/dpo": 0.32593174448902573,
|
|
"misalign/J": 170.55465126037598,
|
|
"misalign/J_aux_loss": 1.7055464833974838,
|
|
"misalign/J_aux_loss_raw": 170.55465126037598,
|
|
"misalign/J_over_reverse_kl": 2.438810557126999,
|
|
"misalign/J_per_token": 0.20938482321798801,
|
|
"misalign/compressed_reward_absmax": 4179.306243896484,
|
|
"misalign/compressed_reward_range": 5945.363037109375,
|
|
"misalign/entropy_a": 905.4970626831055,
|
|
"misalign/entropy_b": 1233.0114974975586,
|
|
"misalign/forward_kl_divergence": 183.4600429534912,
|
|
"misalign/forward_kl_divergence_per_token": 0.35007214546203613,
|
|
"misalign/gamma_abs_times_reward_std": 59701588.25,
|
|
"misalign/gamma_bracketed_rate": 0.9896951243281364,
|
|
"misalign/gamma_reward_residual": 4.522019105479558e-05,
|
|
"misalign/gamma_star": 83829036.25,
|
|
"misalign/js_divergence": 24.132691860198975,
|
|
"misalign/reverse_kl_divergence": 93.94395637512207,
|
|
"misalign/reverse_kl_divergence_per_token": 0.1601700335741043,
|
|
"misalign/reward_a": 70.40993356704712,
|
|
"misalign/reward_b": -15.953831195831299,
|
|
"misalign/reward_improvement": 86.36374855041504,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.9155527576804161,
|
|
"misalign/reward_improvement_per_token": 0.10616821236908436,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -455.97426986694336,
|
|
"misalign/reward_vocab_std": 724.7982482910156,
|
|
"misalign/symmetric_kl": 277.40405654907227,
|
|
"misalign/tv_distance": 94.7668981552124,
|
|
"num_tokens": 4606090.0,
|
|
"rewards/accuracies": 0.953125,
|
|
"rewards/chosen": -9.796147882938385,
|
|
"rewards/margins": 21.163527250289917,
|
|
"rewards/rejected": -30.959676027297974,
|
|
"step": 27,
|
|
"support/residual_count": 151893.162109375,
|
|
"support/residual_mass_policy": 0.03356131399050355,
|
|
"support/residual_mass_reference": 0.041941048577427864,
|
|
"support/residual_reward": -0.38252438232302666,
|
|
"support/runtime_width": 42.837002754211426,
|
|
"support/sampled_loser_rank": 0.6207218393683434,
|
|
"support/sampled_reward_rank": -0.4833753891289234,
|
|
"support/sampled_token_added_rate": 0.031088492134585977,
|
|
"support/sampled_winner_rank": 0.6683962419629097,
|
|
"support/selected_width": 42.837002754211426,
|
|
"support/stored_width": 42.837002754211426
|
|
},
|
|
{
|
|
"epoch": 0.4375,
|
|
"grad_norm": 196.56495666503906,
|
|
"kl/sequence_policy_ref": -229.19044494628906,
|
|
"kl/vocab_forward": 210.6693572998047,
|
|
"kl/vocab_js": 27.503621578216553,
|
|
"kl/vocab_reverse": 107.11871337890625,
|
|
"kl/vocab_symmetric": 317.7881450653076,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -1008.6218376159668,
|
|
"logps/rejected": -1565.5734558105469,
|
|
"loss": 2.9609,
|
|
"loss/dpo": 1.097994428826496,
|
|
"misalign/J": 186.2927417755127,
|
|
"misalign/J_aux_loss": 1.8629273921251297,
|
|
"misalign/J_aux_loss_raw": 186.2927417755127,
|
|
"misalign/J_over_reverse_kl": 1.8979013413190842,
|
|
"misalign/J_per_token": 0.27667875960469246,
|
|
"misalign/compressed_reward_absmax": 4361.166076660156,
|
|
"misalign/compressed_reward_range": 6270.502258300781,
|
|
"misalign/entropy_a": 908.857048034668,
|
|
"misalign/entropy_b": 1304.3608856201172,
|
|
"misalign/forward_kl_divergence": 210.6693572998047,
|
|
"misalign/forward_kl_divergence_per_token": 0.3658239506185055,
|
|
"misalign/gamma_abs_times_reward_std": 59128653.0,
|
|
"misalign/gamma_bracketed_rate": 0.981993056833744,
|
|
"misalign/gamma_reward_residual": 0.0002913941991664615,
|
|
"misalign/gamma_star": 100789119.5,
|
|
"misalign/js_divergence": 27.503621578216553,
|
|
"misalign/reverse_kl_divergence": 107.11871337890625,
|
|
"misalign/reverse_kl_divergence_per_token": 0.1738772690296173,
|
|
"misalign/reward_a": 89.05416059494019,
|
|
"misalign/reward_b": -10.882870197296143,
|
|
"misalign/reward_improvement": 99.93703746795654,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.8969720676541328,
|
|
"misalign/reward_improvement_per_token": 0.14762359578162432,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -650.0788879394531,
|
|
"misalign/reward_vocab_std": 765.3363800048828,
|
|
"misalign/symmetric_kl": 317.7881450653076,
|
|
"misalign/tv_distance": 106.59637641906738,
|
|
"num_tokens": 4786127.0,
|
|
"rewards/accuracies": 0.859375,
|
|
"rewards/chosen": -13.720831990242004,
|
|
"rewards/margins": 18.396424293518066,
|
|
"rewards/rejected": -32.1172571182251,
|
|
"step": 28,
|
|
"support/residual_count": 151893.193359375,
|
|
"support/residual_mass_policy": 0.03176172194071114,
|
|
"support/residual_mass_reference": 0.041156242368742824,
|
|
"support/residual_reward": -0.6319293715059757,
|
|
"support/runtime_width": 42.804439544677734,
|
|
"support/sampled_loser_rank": 0.6620542109012604,
|
|
"support/sampled_reward_rank": -0.30558538623154163,
|
|
"support/sampled_token_added_rate": 0.03132295864634216,
|
|
"support/sampled_winner_rank": 0.7198682501912117,
|
|
"support/selected_width": 42.804439544677734,
|
|
"support/stored_width": 42.804439544677734
|
|
},
|
|
{
|
|
"epoch": 0.453125,
|
|
"grad_norm": 121.38319396972656,
|
|
"kl/sequence_policy_ref": -252.93916511535645,
|
|
"kl/vocab_forward": 226.2414608001709,
|
|
"kl/vocab_js": 28.960803031921387,
|
|
"kl/vocab_reverse": 114.66798114776611,
|
|
"kl/vocab_symmetric": 340.90957260131836,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -779.5240745544434,
|
|
"logps/rejected": -1872.5811767578125,
|
|
"loss": 2.4603,
|
|
"loss/dpo": 0.9713822825047167,
|
|
"misalign/J": 148.88968753814697,
|
|
"misalign/J_aux_loss": 1.4888968467712402,
|
|
"misalign/J_aux_loss_raw": 148.88968753814697,
|
|
"misalign/J_over_reverse_kl": 2.155982196331024,
|
|
"misalign/J_per_token": 0.26285428553819656,
|
|
"misalign/compressed_reward_absmax": 4548.9989013671875,
|
|
"misalign/compressed_reward_range": 6488.315979003906,
|
|
"misalign/entropy_a": 937.3698120117188,
|
|
"misalign/entropy_b": 1328.4651641845703,
|
|
"misalign/forward_kl_divergence": 226.2414608001709,
|
|
"misalign/forward_kl_divergence_per_token": 0.329929880797863,
|
|
"misalign/gamma_abs_times_reward_std": 42800947.75,
|
|
"misalign/gamma_bracketed_rate": 0.9842586368322372,
|
|
"misalign/gamma_reward_residual": 7.247616258609924e-05,
|
|
"misalign/gamma_star": 66069326.0,
|
|
"misalign/js_divergence": 28.960803031921387,
|
|
"misalign/reverse_kl_divergence": 114.66798114776611,
|
|
"misalign/reverse_kl_divergence_per_token": 0.15985783841460943,
|
|
"misalign/reward_a": 79.22361898422241,
|
|
"misalign/reward_b": -19.311943411827087,
|
|
"misalign/reward_improvement": 98.53554153442383,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.820253424346447,
|
|
"misalign/reward_improvement_per_token": 0.09689361555501819,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -195.40936851501465,
|
|
"misalign/reward_vocab_std": 769.0406875610352,
|
|
"misalign/symmetric_kl": 340.90957260131836,
|
|
"misalign/tv_distance": 107.74736499786377,
|
|
"num_tokens": 4960206.0,
|
|
"rewards/accuracies": 0.9375,
|
|
"rewards/chosen": -11.959493935108185,
|
|
"rewards/margins": 26.66884672641754,
|
|
"rewards/rejected": -38.62834072113037,
|
|
"step": 29,
|
|
"support/residual_count": 151892.96875,
|
|
"support/residual_mass_policy": 0.033485232619568706,
|
|
"support/residual_mass_reference": 0.0447953250259161,
|
|
"support/residual_reward": -0.18092468939721584,
|
|
"support/runtime_width": 43.02750873565674,
|
|
"support/sampled_loser_rank": 0.6013398505747318,
|
|
"support/sampled_reward_rank": -0.48561038076877594,
|
|
"support/sampled_token_added_rate": 0.042466682847589254,
|
|
"support/sampled_winner_rank": 0.5962688289582729,
|
|
"support/selected_width": 43.02750873565674,
|
|
"support/stored_width": 43.02750873565674
|
|
},
|
|
{
|
|
"epoch": 0.46875,
|
|
"grad_norm": 191.35165405273438,
|
|
"kl/sequence_policy_ref": -265.8605842590332,
|
|
"kl/vocab_forward": 241.83115577697754,
|
|
"kl/vocab_js": 29.952810764312744,
|
|
"kl/vocab_reverse": 117.4506607055664,
|
|
"kl/vocab_symmetric": 359.28198051452637,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -902.8605346679688,
|
|
"logps/rejected": -1808.0689544677734,
|
|
"loss": 2.3052,
|
|
"loss/dpo": 0.2984987065605812,
|
|
"misalign/J": 200.67014503479004,
|
|
"misalign/J_aux_loss": 2.0067013800144196,
|
|
"misalign/J_aux_loss_raw": 200.67014503479004,
|
|
"misalign/J_over_reverse_kl": 1.9287290573120117,
|
|
"misalign/J_per_token": 0.2536418605595827,
|
|
"misalign/compressed_reward_absmax": 4862.0013427734375,
|
|
"misalign/compressed_reward_range": 6898.528076171875,
|
|
"misalign/entropy_a": 945.1607437133789,
|
|
"misalign/entropy_b": 1346.2866134643555,
|
|
"misalign/forward_kl_divergence": 241.83115577697754,
|
|
"misalign/forward_kl_divergence_per_token": 0.34030735678970814,
|
|
"misalign/gamma_abs_times_reward_std": 66157058.0,
|
|
"misalign/gamma_bracketed_rate": 0.9856267645955086,
|
|
"misalign/gamma_reward_residual": 0.000264001724190166,
|
|
"misalign/gamma_star": 109703980.5,
|
|
"misalign/js_divergence": 29.952810764312744,
|
|
"misalign/reverse_kl_divergence": 117.4506607055664,
|
|
"misalign/reverse_kl_divergence_per_token": 0.1695484183728695,
|
|
"misalign/reward_a": 87.51940584182739,
|
|
"misalign/reward_b": -12.946220338344574,
|
|
"misalign/reward_improvement": 100.46563053131104,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.804968811571598,
|
|
"misalign/reward_improvement_per_token": 0.11112680193036795,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -619.1823959350586,
|
|
"misalign/reward_vocab_std": 829.3263244628906,
|
|
"misalign/symmetric_kl": 359.28198051452637,
|
|
"misalign/tv_distance": 112.75522899627686,
|
|
"num_tokens": 5145509.0,
|
|
"rewards/accuracies": 0.953125,
|
|
"rewards/chosen": -15.322714567184448,
|
|
"rewards/margins": 22.526688814163208,
|
|
"rewards/rejected": -37.84940314292908,
|
|
"step": 30,
|
|
"support/residual_count": 151893.208984375,
|
|
"support/residual_mass_policy": 0.034048222471028566,
|
|
"support/residual_mass_reference": 0.04346996312960982,
|
|
"support/residual_reward": -0.5197547674179077,
|
|
"support/runtime_width": 42.78953218460083,
|
|
"support/sampled_loser_rank": 0.5563570559024811,
|
|
"support/sampled_reward_rank": -0.39020144287496805,
|
|
"support/sampled_token_added_rate": 0.03398139285854995,
|
|
"support/sampled_winner_rank": 0.5739484503865242,
|
|
"support/selected_width": 42.78953218460083,
|
|
"support/stored_width": 42.78953218460083
|
|
},
|
|
{
|
|
"epoch": 0.484375,
|
|
"grad_norm": 157.51846313476562,
|
|
"kl/sequence_policy_ref": -242.7173252105713,
|
|
"kl/vocab_forward": 220.59081268310547,
|
|
"kl/vocab_js": 26.569517850875854,
|
|
"kl/vocab_reverse": 104.70345973968506,
|
|
"kl/vocab_symmetric": 325.2943916320801,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -677.5861015319824,
|
|
"logps/rejected": -1586.1520690917969,
|
|
"loss": 1.9096,
|
|
"loss/dpo": 0.26202132055277616,
|
|
"misalign/J": 164.75636100769043,
|
|
"misalign/J_aux_loss": 1.6475635841488838,
|
|
"misalign/J_aux_loss_raw": 164.75636100769043,
|
|
"misalign/J_over_reverse_kl": 1.940863698720932,
|
|
"misalign/J_per_token": 0.33494884334504604,
|
|
"misalign/compressed_reward_absmax": 3763.813751220703,
|
|
"misalign/compressed_reward_range": 5305.598388671875,
|
|
"misalign/entropy_a": 752.1155014038086,
|
|
"misalign/entropy_b": 1103.1325607299805,
|
|
"misalign/forward_kl_divergence": 220.59081268310547,
|
|
"misalign/forward_kl_divergence_per_token": 0.5400971993803978,
|
|
"misalign/gamma_abs_times_reward_std": 49524192.5,
|
|
"misalign/gamma_bracketed_rate": 0.9842683598399162,
|
|
"misalign/gamma_reward_residual": 0.00013795335132726905,
|
|
"misalign/gamma_star": 76385996.75,
|
|
"misalign/js_divergence": 26.569517850875854,
|
|
"misalign/reverse_kl_divergence": 104.70345973968506,
|
|
"misalign/reverse_kl_divergence_per_token": 0.2469406109303236,
|
|
"misalign/reward_a": 72.4141092300415,
|
|
"misalign/reward_b": -14.692016035318375,
|
|
"misalign/reward_improvement": 87.10610628128052,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.7758874297142029,
|
|
"misalign/reward_improvement_per_token": 0.11436527967453003,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -336.6119108200073,
|
|
"misalign/reward_vocab_std": 635.3649368286133,
|
|
"misalign/symmetric_kl": 325.2943916320801,
|
|
"misalign/tv_distance": 95.78610897064209,
|
|
"num_tokens": 5302053.0,
|
|
"rewards/accuracies": 0.9375,
|
|
"rewards/chosen": -11.388235569000244,
|
|
"rewards/margins": 25.766995549201965,
|
|
"rewards/rejected": -37.155229806900024,
|
|
"step": 31,
|
|
"support/residual_count": 151893.30859375,
|
|
"support/residual_mass_policy": 0.026038944022729993,
|
|
"support/residual_mass_reference": 0.04330639448016882,
|
|
"support/residual_reward": -0.4683221112936735,
|
|
"support/runtime_width": 42.69255495071411,
|
|
"support/sampled_loser_rank": 0.6677984669804573,
|
|
"support/sampled_reward_rank": -0.38564055040478706,
|
|
"support/sampled_token_added_rate": 0.03589020320214331,
|
|
"support/sampled_winner_rank": 0.6767471358180046,
|
|
"support/selected_width": 42.69255495071411,
|
|
"support/stored_width": 42.69255495071411
|
|
},
|
|
{
|
|
"epoch": 0.5,
|
|
"grad_norm": 115.89986419677734,
|
|
"kl/sequence_policy_ref": -291.1677303314209,
|
|
"kl/vocab_forward": 264.76793098449707,
|
|
"kl/vocab_js": 31.6591854095459,
|
|
"kl/vocab_reverse": 126.53115463256836,
|
|
"kl/vocab_symmetric": 391.29920196533203,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -758.1873931884766,
|
|
"logps/rejected": -1748.2294845581055,
|
|
"loss": 1.9955,
|
|
"loss/dpo": 0.4298266823877448,
|
|
"misalign/J": 156.56506061553955,
|
|
"misalign/J_aux_loss": 1.5656505972146988,
|
|
"misalign/J_aux_loss_raw": 156.56506061553955,
|
|
"misalign/J_over_reverse_kl": 1.3217194080352783,
|
|
"misalign/J_per_token": 0.21796293556690216,
|
|
"misalign/compressed_reward_absmax": 4141.018249511719,
|
|
"misalign/compressed_reward_range": 5932.4368896484375,
|
|
"misalign/entropy_a": 796.6573028564453,
|
|
"misalign/entropy_b": 1197.2570190429688,
|
|
"misalign/forward_kl_divergence": 264.76793098449707,
|
|
"misalign/forward_kl_divergence_per_token": 0.42143452540040016,
|
|
"misalign/gamma_abs_times_reward_std": 42270098.25,
|
|
"misalign/gamma_bracketed_rate": 0.9892738536000252,
|
|
"misalign/gamma_reward_residual": 0.0006614696701490175,
|
|
"misalign/gamma_star": 77172042.5,
|
|
"misalign/js_divergence": 31.6591854095459,
|
|
"misalign/reverse_kl_divergence": 126.53115463256836,
|
|
"misalign/reverse_kl_divergence_per_token": 0.20403443090617657,
|
|
"misalign/reward_a": 84.93575382232666,
|
|
"misalign/reward_b": -16.301965177059174,
|
|
"misalign/reward_improvement": 101.23770046234131,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.6962632201611996,
|
|
"misalign/reward_improvement_per_token": 0.06460105488076806,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -461.3788146972656,
|
|
"misalign/reward_vocab_std": 712.9334030151367,
|
|
"misalign/symmetric_kl": 391.29920196533203,
|
|
"misalign/tv_distance": 110.64313316345215,
|
|
"num_tokens": 5464702.0,
|
|
"rewards/accuracies": 0.890625,
|
|
"rewards/chosen": -13.653896808624268,
|
|
"rewards/margins": 30.925754070281982,
|
|
"rewards/rejected": -44.57965087890625,
|
|
"step": 32,
|
|
"support/residual_count": 151893.287109375,
|
|
"support/residual_mass_policy": 0.026596042443998158,
|
|
"support/residual_mass_reference": 0.044471810571849346,
|
|
"support/residual_reward": -0.42068428732454777,
|
|
"support/runtime_width": 42.70789432525635,
|
|
"support/sampled_loser_rank": 0.6153440810739994,
|
|
"support/sampled_reward_rank": -0.4130892716348171,
|
|
"support/sampled_token_added_rate": 0.034831034019589424,
|
|
"support/sampled_winner_rank": 0.6262499615550041,
|
|
"support/selected_width": 42.70789432525635,
|
|
"support/stored_width": 42.70789432525635
|
|
},
|
|
{
|
|
"epoch": 0.515625,
|
|
"grad_norm": 126.86652374267578,
|
|
"kl/sequence_policy_ref": -330.00819396972656,
|
|
"kl/vocab_forward": 300.4244632720947,
|
|
"kl/vocab_js": 34.439491748809814,
|
|
"kl/vocab_reverse": 136.73623180389404,
|
|
"kl/vocab_symmetric": 437.1608543395996,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -748.5886764526367,
|
|
"logps/rejected": -1879.9018249511719,
|
|
"loss": 1.9269,
|
|
"loss/dpo": 0.15133077676370377,
|
|
"misalign/J": 177.56014442443848,
|
|
"misalign/J_aux_loss": 1.77560143917799,
|
|
"misalign/J_aux_loss_raw": 177.56014442443848,
|
|
"misalign/J_over_reverse_kl": 1.510396808385849,
|
|
"misalign/J_per_token": 0.25404511764645576,
|
|
"misalign/compressed_reward_absmax": 4184.280670166016,
|
|
"misalign/compressed_reward_range": 5938.451599121094,
|
|
"misalign/entropy_a": 794.0090255737305,
|
|
"misalign/entropy_b": 1205.6077117919922,
|
|
"misalign/forward_kl_divergence": 300.4244632720947,
|
|
"misalign/forward_kl_divergence_per_token": 0.5809952989220619,
|
|
"misalign/gamma_abs_times_reward_std": 47938589.5,
|
|
"misalign/gamma_bracketed_rate": 0.9874916970729828,
|
|
"misalign/gamma_reward_residual": 0.0018580270816528355,
|
|
"misalign/gamma_star": 84848901.75,
|
|
"misalign/js_divergence": 34.439491748809814,
|
|
"misalign/reverse_kl_divergence": 136.73623180389404,
|
|
"misalign/reverse_kl_divergence_per_token": 0.22614295408129692,
|
|
"misalign/reward_a": 93.50389242172241,
|
|
"misalign/reward_b": -11.571515798568726,
|
|
"misalign/reward_improvement": 105.07538223266602,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.7212403789162636,
|
|
"misalign/reward_improvement_per_token": 0.08620550157502294,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -539.4033613204956,
|
|
"misalign/reward_vocab_std": 717.4302520751953,
|
|
"misalign/symmetric_kl": 437.1608543395996,
|
|
"misalign/tv_distance": 118.05486106872559,
|
|
"num_tokens": 5633244.0,
|
|
"rewards/accuracies": 0.96875,
|
|
"rewards/chosen": -14.54743093252182,
|
|
"rewards/margins": 36.90677738189697,
|
|
"rewards/rejected": -51.454208850860596,
|
|
"step": 33,
|
|
"support/residual_count": 151893.3203125,
|
|
"support/residual_mass_policy": 0.029681737767532468,
|
|
"support/residual_mass_reference": 0.04252100153826177,
|
|
"support/residual_reward": -0.4979167296551168,
|
|
"support/runtime_width": 42.67843770980835,
|
|
"support/sampled_loser_rank": 0.6283881887793541,
|
|
"support/sampled_reward_rank": -0.38332303427159786,
|
|
"support/sampled_token_added_rate": 0.03253701771609485,
|
|
"support/sampled_winner_rank": 0.6316058188676834,
|
|
"support/selected_width": 42.67843770980835,
|
|
"support/stored_width": 42.67843770980835
|
|
},
|
|
{
|
|
"epoch": 0.53125,
|
|
"grad_norm": 189.66943359375,
|
|
"kl/sequence_policy_ref": -295.09754180908203,
|
|
"kl/vocab_forward": 265.09803009033203,
|
|
"kl/vocab_js": 31.023924469947815,
|
|
"kl/vocab_reverse": 124.39066219329834,
|
|
"kl/vocab_symmetric": 389.488920211792,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -869.112964630127,
|
|
"logps/rejected": -1561.075454711914,
|
|
"loss": 2.3314,
|
|
"loss/dpo": 0.4870968231589359,
|
|
"misalign/J": 184.4345703125,
|
|
"misalign/J_aux_loss": 1.8443456441164017,
|
|
"misalign/J_aux_loss_raw": 184.4345703125,
|
|
"misalign/J_over_reverse_kl": 2.160892277956009,
|
|
"misalign/J_per_token": 0.36055343225598335,
|
|
"misalign/compressed_reward_absmax": 4057.2051391601562,
|
|
"misalign/compressed_reward_range": 5788.110046386719,
|
|
"misalign/entropy_a": 733.9059944152832,
|
|
"misalign/entropy_b": 1137.9230575561523,
|
|
"misalign/forward_kl_divergence": 265.09803009033203,
|
|
"misalign/forward_kl_divergence_per_token": 0.5678062625229359,
|
|
"misalign/gamma_abs_times_reward_std": 71265453.5,
|
|
"misalign/gamma_bracketed_rate": 0.9808945804834366,
|
|
"misalign/gamma_reward_residual": 4.436415292730089e-05,
|
|
"misalign/gamma_star": 82635977.5,
|
|
"misalign/js_divergence": 31.023924469947815,
|
|
"misalign/reverse_kl_divergence": 124.39066219329834,
|
|
"misalign/reverse_kl_divergence_per_token": 0.22786439768970013,
|
|
"misalign/reward_a": 93.78890228271484,
|
|
"misalign/reward_b": -7.1546797305345535,
|
|
"misalign/reward_improvement": 100.94355964660645,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.6350699551403522,
|
|
"misalign/reward_improvement_per_token": 0.05376583803445101,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -452.1529130935669,
|
|
"misalign/reward_vocab_std": 700.5487747192383,
|
|
"misalign/symmetric_kl": 389.488920211792,
|
|
"misalign/tv_distance": 107.56306266784668,
|
|
"num_tokens": 5794166.0,
|
|
"rewards/accuracies": 0.90625,
|
|
"rewards/chosen": -14.67345118522644,
|
|
"rewards/margins": 29.672606945037842,
|
|
"rewards/rejected": -44.34605646133423,
|
|
"step": 34,
|
|
"support/residual_count": 151893.341796875,
|
|
"support/residual_mass_policy": 0.023359368089586496,
|
|
"support/residual_mass_reference": 0.03612355049699545,
|
|
"support/residual_reward": -0.4855753555893898,
|
|
"support/runtime_width": 42.65322256088257,
|
|
"support/sampled_loser_rank": 0.5644064396619797,
|
|
"support/sampled_reward_rank": -0.3794688871130347,
|
|
"support/sampled_token_added_rate": 0.030243139481171966,
|
|
"support/sampled_winner_rank": 0.5993468686938286,
|
|
"support/selected_width": 42.65322256088257,
|
|
"support/stored_width": 42.65322256088257
|
|
},
|
|
{
|
|
"epoch": 0.546875,
|
|
"grad_norm": 255.15330505371094,
|
|
"kl/sequence_policy_ref": -334.76751708984375,
|
|
"kl/vocab_forward": 300.60124015808105,
|
|
"kl/vocab_js": 34.761489152908325,
|
|
"kl/vocab_reverse": 139.5803165435791,
|
|
"kl/vocab_symmetric": 440.1817283630371,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -799.9229431152344,
|
|
"logps/rejected": -1850.2046356201172,
|
|
"loss": 2.7865,
|
|
"loss/dpo": 0.628988600539742,
|
|
"misalign/J": 215.75445175170898,
|
|
"misalign/J_aux_loss": 2.1575444042682648,
|
|
"misalign/J_aux_loss_raw": 215.75445175170898,
|
|
"misalign/J_over_reverse_kl": 1.6279902905225754,
|
|
"misalign/J_per_token": 0.23844042047858238,
|
|
"misalign/compressed_reward_absmax": 4261.061584472656,
|
|
"misalign/compressed_reward_range": 6006.7581787109375,
|
|
"misalign/entropy_a": 801.278564453125,
|
|
"misalign/entropy_b": 1216.669448852539,
|
|
"misalign/forward_kl_divergence": 300.60124015808105,
|
|
"misalign/forward_kl_divergence_per_token": 0.4489123970270157,
|
|
"misalign/gamma_abs_times_reward_std": 66620462.0,
|
|
"misalign/gamma_bracketed_rate": 0.9877287149429321,
|
|
"misalign/gamma_reward_residual": 0.00013351680354389828,
|
|
"misalign/gamma_star": 108750856.5,
|
|
"misalign/js_divergence": 34.761489152908325,
|
|
"misalign/reverse_kl_divergence": 139.5803165435791,
|
|
"misalign/reverse_kl_divergence_per_token": 0.20873420871794224,
|
|
"misalign/reward_a": 96.30846405029297,
|
|
"misalign/reward_b": -10.182962775230408,
|
|
"misalign/reward_improvement": 106.4914083480835,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.7169731482863426,
|
|
"misalign/reward_improvement_per_token": 0.1033505480736494,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -652.5512466430664,
|
|
"misalign/reward_vocab_std": 728.8263397216797,
|
|
"misalign/symmetric_kl": 440.1817283630371,
|
|
"misalign/tv_distance": 117.93358135223389,
|
|
"num_tokens": 5967652.0,
|
|
"rewards/accuracies": 0.9375,
|
|
"rewards/chosen": -16.057342648506165,
|
|
"rewards/margins": 34.83882117271423,
|
|
"rewards/rejected": -50.89616346359253,
|
|
"step": 35,
|
|
"support/residual_count": 151893.296875,
|
|
"support/residual_mass_policy": 0.025821004761382937,
|
|
"support/residual_mass_reference": 0.039951348677277565,
|
|
"support/residual_reward": -0.5654018372297287,
|
|
"support/runtime_width": 42.70614957809448,
|
|
"support/sampled_loser_rank": 0.6046793200075626,
|
|
"support/sampled_reward_rank": -0.2875976013019681,
|
|
"support/sampled_token_added_rate": 0.03660787723492831,
|
|
"support/sampled_winner_rank": 0.5936227701604366,
|
|
"support/selected_width": 42.70614957809448,
|
|
"support/stored_width": 42.70614957809448
|
|
},
|
|
{
|
|
"epoch": 0.5625,
|
|
"grad_norm": 372.3753967285156,
|
|
"kl/sequence_policy_ref": -301.4326972961426,
|
|
"kl/vocab_forward": 272.3033618927002,
|
|
"kl/vocab_js": 31.47647452354431,
|
|
"kl/vocab_reverse": 129.38499641418457,
|
|
"kl/vocab_symmetric": 401.688533782959,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -617.8253440856934,
|
|
"logps/rejected": -1656.3383178710938,
|
|
"loss": 2.0888,
|
|
"loss/dpo": 0.20348351792887343,
|
|
"misalign/J": 188.5364990234375,
|
|
"misalign/J_aux_loss": 1.8853649497032166,
|
|
"misalign/J_aux_loss_raw": 188.5364990234375,
|
|
"misalign/J_over_reverse_kl": 1.918866515159607,
|
|
"misalign/J_per_token": 0.2657326404005289,
|
|
"misalign/compressed_reward_absmax": 3813.5306091308594,
|
|
"misalign/compressed_reward_range": 5372.776062011719,
|
|
"misalign/entropy_a": 678.8123512268066,
|
|
"misalign/entropy_b": 1029.1281280517578,
|
|
"misalign/forward_kl_divergence": 272.3033618927002,
|
|
"misalign/forward_kl_divergence_per_token": 0.5031169354915619,
|
|
"misalign/gamma_abs_times_reward_std": 54824950.5,
|
|
"misalign/gamma_bracketed_rate": 0.984458789229393,
|
|
"misalign/gamma_reward_residual": 9.551036919219769e-05,
|
|
"misalign/gamma_star": 76879923.5,
|
|
"misalign/js_divergence": 31.47647452354431,
|
|
"misalign/reverse_kl_divergence": 129.38499641418457,
|
|
"misalign/reverse_kl_divergence_per_token": 0.20918168872594833,
|
|
"misalign/reward_a": 85.22445583343506,
|
|
"misalign/reward_b": -13.628453433513641,
|
|
"misalign/reward_improvement": 98.85287952423096,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.6402187570929527,
|
|
"misalign/reward_improvement_per_token": 0.06344311079010367,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -478.2143135070801,
|
|
"misalign/reward_vocab_std": 661.5251770019531,
|
|
"misalign/symmetric_kl": 401.688533782959,
|
|
"misalign/tv_distance": 104.53119087219238,
|
|
"num_tokens": 6125949.0,
|
|
"rewards/accuracies": 0.953125,
|
|
"rewards/chosen": -12.255744695663452,
|
|
"rewards/margins": 35.775049448013306,
|
|
"rewards/rejected": -48.03079414367676,
|
|
"step": 36,
|
|
"support/residual_count": 151893.57421875,
|
|
"support/residual_mass_policy": 0.0228013499872759,
|
|
"support/residual_mass_reference": 0.03604377689771354,
|
|
"support/residual_reward": -0.5941759124398232,
|
|
"support/runtime_width": 42.42575168609619,
|
|
"support/sampled_loser_rank": 0.579292468726635,
|
|
"support/sampled_reward_rank": -0.31505878642201424,
|
|
"support/sampled_token_added_rate": 0.027777738636359572,
|
|
"support/sampled_winner_rank": 0.6083027757704258,
|
|
"support/selected_width": 42.42575168609619,
|
|
"support/stored_width": 42.42575168609619
|
|
},
|
|
{
|
|
"epoch": 0.5625,
|
|
"eval_kl/sequence_policy_ref": -370.0239589214325,
|
|
"eval_kl/vocab_forward": 326.3884401321411,
|
|
"eval_kl/vocab_js": 39.170413970947266,
|
|
"eval_kl/vocab_reverse": 163.1976842880249,
|
|
"eval_kl/vocab_symmetric": 489.5863370895386,
|
|
"eval_logps/chosen": -878.7963104248047,
|
|
"eval_logps/rejected": -1961.517490386963,
|
|
"eval_loss": 1.9275543689727783,
|
|
"eval_loss/dpo": 0.07911084008669506,
|
|
"eval_misalign/J": 184.84435880184174,
|
|
"eval_misalign/J_aux_loss": 1.8484435249119997,
|
|
"eval_misalign/J_aux_loss_raw": 184.84435880184174,
|
|
"eval_misalign/J_over_reverse_kl": 1.7581309108063579,
|
|
"eval_misalign/J_per_token": 0.26096369861625135,
|
|
"eval_misalign/compressed_reward_absmax": 4236.706245422363,
|
|
"eval_misalign/compressed_reward_range": 6027.056587219238,
|
|
"eval_misalign/entropy_a": 859.5940890312195,
|
|
"eval_misalign/entropy_b": 1298.0544710159302,
|
|
"eval_misalign/forward_kl_divergence": 326.3884401321411,
|
|
"eval_misalign/forward_kl_divergence_per_token": 0.5667336815968156,
|
|
"eval_misalign/gamma_abs_times_reward_std": 44952110.546875,
|
|
"eval_misalign/gamma_bracketed_rate": 0.9881090503185987,
|
|
"eval_misalign/gamma_reward_residual": 0.0003100246618572555,
|
|
"eval_misalign/gamma_star": 68325932.625,
|
|
"eval_misalign/js_divergence": 39.170413970947266,
|
|
"eval_misalign/reverse_kl_divergence": 163.1976842880249,
|
|
"eval_misalign/reverse_kl_divergence_per_token": 0.2310976292937994,
|
|
"eval_misalign/reward_a": 111.55592322349548,
|
|
"eval_misalign/reward_b": -14.307281229645014,
|
|
"eval_misalign/reward_improvement": 125.86316466331482,
|
|
"eval_misalign/reward_improvement_over_reverse_kl": 0.6893182648345828,
|
|
"eval_misalign/reward_improvement_per_token": 0.08624049881473184,
|
|
"eval_misalign/reward_signal_low_rate": 0.0,
|
|
"eval_misalign/reward_vocab_mean": -490.1155492067337,
|
|
"eval_misalign/reward_vocab_std": 731.3202810287476,
|
|
"eval_misalign/symmetric_kl": 489.5863370895386,
|
|
"eval_misalign/tv_distance": 129.546555519104,
|
|
"eval_rewards/accuracies": 0.98046875,
|
|
"eval_rewards/chosen": -14.962035872042179,
|
|
"eval_rewards/margins": 44.080720245838165,
|
|
"eval_rewards/rejected": -59.04275727272034,
|
|
"eval_runtime": 100.6796,
|
|
"eval_samples_per_second": 5.085,
|
|
"eval_steps_per_second": 0.636,
|
|
"eval_support/residual_count": 151893.29125976562,
|
|
"eval_support/residual_mass_policy": 0.02652598696295172,
|
|
"eval_support/residual_mass_reference": 0.04279232310364023,
|
|
"eval_support/residual_reward": -0.4695481152739376,
|
|
"eval_support/runtime_width": 42.70963191986084,
|
|
"eval_support/sampled_loser_rank": 0.6486562248319387,
|
|
"eval_support/sampled_reward_rank": -0.37071577250026166,
|
|
"eval_support/sampled_token_added_rate": 0.037317203474231064,
|
|
"eval_support/sampled_winner_rank": 0.6506854901090264,
|
|
"eval_support/selected_width": 42.70963191986084,
|
|
"eval_support/stored_width": 42.70963191986084,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.578125,
|
|
"grad_norm": 538.4682006835938,
|
|
"kl/sequence_policy_ref": -340.5377769470215,
|
|
"kl/vocab_forward": 290.87598991394043,
|
|
"kl/vocab_js": 34.37863755226135,
|
|
"kl/vocab_reverse": 144.22690105438232,
|
|
"kl/vocab_symmetric": 435.10305404663086,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -651.7600479125977,
|
|
"logps/rejected": -1730.038101196289,
|
|
"loss": 2.6095,
|
|
"loss/dpo": 0.025369518539697822,
|
|
"misalign/J": 258.41460514068604,
|
|
"misalign/J_aux_loss": 2.58414613455534,
|
|
"misalign/J_aux_loss_raw": 258.41460514068604,
|
|
"misalign/J_over_reverse_kl": 3.293791249394417,
|
|
"misalign/J_per_token": 0.3237530868500471,
|
|
"misalign/compressed_reward_absmax": 3946.171417236328,
|
|
"misalign/compressed_reward_range": 5592.2857666015625,
|
|
"misalign/entropy_a": 679.9681549072266,
|
|
"misalign/entropy_b": 1041.4010391235352,
|
|
"misalign/forward_kl_divergence": 290.87598991394043,
|
|
"misalign/forward_kl_divergence_per_token": 0.6766270510852337,
|
|
"misalign/gamma_abs_times_reward_std": 77732721.625,
|
|
"misalign/gamma_bracketed_rate": 0.9842484146356583,
|
|
"misalign/gamma_reward_residual": 2.0939698629263148e-05,
|
|
"misalign/gamma_star": 120903140.125,
|
|
"misalign/js_divergence": 34.37863755226135,
|
|
"misalign/reverse_kl_divergence": 144.22690105438232,
|
|
"misalign/reverse_kl_divergence_per_token": 0.24700743332505226,
|
|
"misalign/reward_a": 94.92676162719727,
|
|
"misalign/reward_b": -12.971765249967575,
|
|
"misalign/reward_improvement": 107.89847755432129,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.6338806599378586,
|
|
"misalign/reward_improvement_per_token": 0.060572607442736626,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -481.5314302444458,
|
|
"misalign/reward_vocab_std": 672.1194839477539,
|
|
"misalign/symmetric_kl": 435.10305404663086,
|
|
"misalign/tv_distance": 110.60170650482178,
|
|
"num_tokens": 6285208.0,
|
|
"rewards/accuracies": 1.0,
|
|
"rewards/chosen": -12.246494770050049,
|
|
"rewards/margins": 43.61456775665283,
|
|
"rewards/rejected": -55.861063957214355,
|
|
"step": 37,
|
|
"support/residual_count": 151893.3203125,
|
|
"support/residual_mass_policy": 0.023932685144245625,
|
|
"support/residual_mass_reference": 0.041495030745863914,
|
|
"support/residual_reward": -0.5419074520468712,
|
|
"support/runtime_width": 42.680593967437744,
|
|
"support/sampled_loser_rank": 0.5905993320047855,
|
|
"support/sampled_reward_rank": -0.3638652637600899,
|
|
"support/sampled_token_added_rate": 0.030867979861795902,
|
|
"support/sampled_winner_rank": 0.6219254210591316,
|
|
"support/selected_width": 42.680593967437744,
|
|
"support/stored_width": 42.680593967437744
|
|
},
|
|
{
|
|
"epoch": 0.59375,
|
|
"grad_norm": 103.29216766357422,
|
|
"kl/sequence_policy_ref": -322.09266471862793,
|
|
"kl/vocab_forward": 284.572021484375,
|
|
"kl/vocab_js": 32.71440887451172,
|
|
"kl/vocab_reverse": 135.91109657287598,
|
|
"kl/vocab_symmetric": 420.4832000732422,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -666.5978660583496,
|
|
"logps/rejected": -1681.1878814697266,
|
|
"loss": 1.7536,
|
|
"loss/dpo": 0.07719759906125201,
|
|
"misalign/J": 167.64227294921875,
|
|
"misalign/J_aux_loss": 1.6764226853847504,
|
|
"misalign/J_aux_loss_raw": 167.64227294921875,
|
|
"misalign/J_over_reverse_kl": 1.7161841690540314,
|
|
"misalign/J_per_token": 0.27796192467212677,
|
|
"misalign/compressed_reward_absmax": 3628.3260498046875,
|
|
"misalign/compressed_reward_range": 5132.099914550781,
|
|
"misalign/entropy_a": 690.4412384033203,
|
|
"misalign/entropy_b": 1045.768310546875,
|
|
"misalign/forward_kl_divergence": 284.572021484375,
|
|
"misalign/forward_kl_divergence_per_token": 0.6853830218315125,
|
|
"misalign/gamma_abs_times_reward_std": 51078868.0,
|
|
"misalign/gamma_bracketed_rate": 0.9857224076986313,
|
|
"misalign/gamma_reward_residual": 0.00023307789706450421,
|
|
"misalign/gamma_star": 63338430.75,
|
|
"misalign/js_divergence": 32.71440887451172,
|
|
"misalign/reverse_kl_divergence": 135.91109657287598,
|
|
"misalign/reverse_kl_divergence_per_token": 0.26742945425212383,
|
|
"misalign/reward_a": 82.06362342834473,
|
|
"misalign/reward_b": -14.572960376739502,
|
|
"misalign/reward_improvement": 96.63658332824707,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.6172222569584846,
|
|
"misalign/reward_improvement_per_token": 0.09521574154496193,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -462.82337760925293,
|
|
"misalign/reward_vocab_std": 629.3218460083008,
|
|
"misalign/symmetric_kl": 420.4832000732422,
|
|
"misalign/tv_distance": 107.11642932891846,
|
|
"num_tokens": 6439380.0,
|
|
"rewards/accuracies": 0.96875,
|
|
"rewards/chosen": -13.20527732372284,
|
|
"rewards/margins": 38.00797891616821,
|
|
"rewards/rejected": -51.21325659751892,
|
|
"step": 38,
|
|
"support/residual_count": 151893.580078125,
|
|
"support/residual_mass_policy": 0.02642570063471794,
|
|
"support/residual_mass_reference": 0.0395309254527092,
|
|
"support/residual_reward": -0.619575060904026,
|
|
"support/runtime_width": 42.41612482070923,
|
|
"support/sampled_loser_rank": 0.6446574702858925,
|
|
"support/sampled_reward_rank": -0.3532958813011646,
|
|
"support/sampled_token_added_rate": 0.029658236424438655,
|
|
"support/sampled_winner_rank": 0.6769233047962189,
|
|
"support/selected_width": 42.41612482070923,
|
|
"support/stored_width": 42.41612482070923
|
|
},
|
|
{
|
|
"epoch": 0.609375,
|
|
"grad_norm": 1020.2838745117188,
|
|
"kl/sequence_policy_ref": -395.0605163574219,
|
|
"kl/vocab_forward": 347.51882553100586,
|
|
"kl/vocab_js": 42.35039806365967,
|
|
"kl/vocab_reverse": 184.1147804260254,
|
|
"kl/vocab_symmetric": 531.6339111328125,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -740.7064399719238,
|
|
"logps/rejected": -2187.0342712402344,
|
|
"loss": 3.4136,
|
|
"loss/dpo": 0.032895612518908,
|
|
"misalign/J": 338.06642150878906,
|
|
"misalign/J_aux_loss": 3.3806639164686203,
|
|
"misalign/J_aux_loss_raw": 338.06642150878906,
|
|
"misalign/J_over_reverse_kl": 2.6226917803287506,
|
|
"misalign/J_per_token": 0.3244504798203707,
|
|
"misalign/compressed_reward_absmax": 4427.0225830078125,
|
|
"misalign/compressed_reward_range": 6178.325927734375,
|
|
"misalign/entropy_a": 868.1625289916992,
|
|
"misalign/entropy_b": 1331.6126098632812,
|
|
"misalign/forward_kl_divergence": 347.51882553100586,
|
|
"misalign/forward_kl_divergence_per_token": 0.5268885493278503,
|
|
"misalign/gamma_abs_times_reward_std": 78415596.0,
|
|
"misalign/gamma_bracketed_rate": 0.9849846512079239,
|
|
"misalign/gamma_reward_residual": 0.00048569267073617084,
|
|
"misalign/gamma_star": 86910211.0,
|
|
"misalign/js_divergence": 42.35039806365967,
|
|
"misalign/reverse_kl_divergence": 184.1147804260254,
|
|
"misalign/reverse_kl_divergence_per_token": 0.2407812997698784,
|
|
"misalign/reward_a": 94.71883296966553,
|
|
"misalign/reward_b": -19.811948537826538,
|
|
"misalign/reward_improvement": 114.53073215484619,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.6183451376855373,
|
|
"misalign/reward_improvement_per_token": 0.07184543320909142,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -322.06157875061035,
|
|
"misalign/reward_vocab_std": 766.7786178588867,
|
|
"misalign/symmetric_kl": 531.6339111328125,
|
|
"misalign/tv_distance": 135.00493335723877,
|
|
"num_tokens": 6618112.0,
|
|
"rewards/accuracies": 0.984375,
|
|
"rewards/chosen": -14.374651193618774,
|
|
"rewards/margins": 50.26280069351196,
|
|
"rewards/rejected": -64.63745069503784,
|
|
"step": 39,
|
|
"support/residual_count": 151892.978515625,
|
|
"support/residual_mass_policy": 0.030612861970439553,
|
|
"support/residual_mass_reference": 0.04724447149783373,
|
|
"support/residual_reward": -0.27408459782600403,
|
|
"support/runtime_width": 43.020020484924316,
|
|
"support/sampled_loser_rank": 0.6019720807671547,
|
|
"support/sampled_reward_rank": -0.4871169701218605,
|
|
"support/sampled_token_added_rate": 0.038056216202676296,
|
|
"support/sampled_winner_rank": 0.6120708398520947,
|
|
"support/selected_width": 43.020020484924316,
|
|
"support/stored_width": 43.020020484924316
|
|
},
|
|
{
|
|
"epoch": 0.625,
|
|
"grad_norm": 1029.6490478515625,
|
|
"kl/sequence_policy_ref": -391.72465896606445,
|
|
"kl/vocab_forward": 347.66312408447266,
|
|
"kl/vocab_js": 43.16923809051514,
|
|
"kl/vocab_reverse": 182.91918754577637,
|
|
"kl/vocab_symmetric": 530.5825958251953,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -836.8339767456055,
|
|
"logps/rejected": -2054.7322845458984,
|
|
"loss": 2.4484,
|
|
"loss/dpo": 0.08931858758296585,
|
|
"misalign/J": 235.91084098815918,
|
|
"misalign/J_aux_loss": 2.3591084629297256,
|
|
"misalign/J_aux_loss_raw": 235.91084098815918,
|
|
"misalign/J_over_reverse_kl": 2.021958939731121,
|
|
"misalign/J_per_token": 0.2609993116930127,
|
|
"misalign/compressed_reward_absmax": 4498.334259033203,
|
|
"misalign/compressed_reward_range": 6353.229675292969,
|
|
"misalign/entropy_a": 859.5539245605469,
|
|
"misalign/entropy_b": 1316.3250732421875,
|
|
"misalign/forward_kl_divergence": 347.66312408447266,
|
|
"misalign/forward_kl_divergence_per_token": 0.4745451509952545,
|
|
"misalign/gamma_abs_times_reward_std": 67711939.75,
|
|
"misalign/gamma_bracketed_rate": 0.9849048256874084,
|
|
"misalign/gamma_reward_residual": 6.0246247358008986e-05,
|
|
"misalign/gamma_star": 116100731.0,
|
|
"misalign/js_divergence": 43.16923809051514,
|
|
"misalign/reverse_kl_divergence": 182.91918754577637,
|
|
"misalign/reverse_kl_divergence_per_token": 0.22102734073996544,
|
|
"misalign/reward_a": 119.16459369659424,
|
|
"misalign/reward_b": -17.058857798576355,
|
|
"misalign/reward_improvement": 136.22340965270996,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.6773070022463799,
|
|
"misalign/reward_improvement_per_token": 0.09231107356026769,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -541.3807773590088,
|
|
"misalign/reward_vocab_std": 752.7018508911133,
|
|
"misalign/symmetric_kl": 530.5825958251953,
|
|
"misalign/tv_distance": 137.86173248291016,
|
|
"num_tokens": 6793032.0,
|
|
"rewards/accuracies": 0.96875,
|
|
"rewards/chosen": -15.797016501426697,
|
|
"rewards/margins": 46.75089979171753,
|
|
"rewards/rejected": -62.547916412353516,
|
|
"step": 40,
|
|
"support/residual_count": 151893.34375,
|
|
"support/residual_mass_policy": 0.02554402849636972,
|
|
"support/residual_mass_reference": 0.04131174925714731,
|
|
"support/residual_reward": -0.43343046586960554,
|
|
"support/runtime_width": 42.65411186218262,
|
|
"support/sampled_loser_rank": 0.635368824005127,
|
|
"support/sampled_reward_rank": -0.3783828802406788,
|
|
"support/sampled_token_added_rate": 0.03763708798214793,
|
|
"support/sampled_winner_rank": 0.632044330239296,
|
|
"support/selected_width": 42.65411186218262,
|
|
"support/stored_width": 42.65411186218262
|
|
},
|
|
{
|
|
"epoch": 0.640625,
|
|
"grad_norm": 123.83010864257812,
|
|
"kl/sequence_policy_ref": -379.3154487609863,
|
|
"kl/vocab_forward": 331.34838104248047,
|
|
"kl/vocab_js": 39.656344413757324,
|
|
"kl/vocab_reverse": 165.32555389404297,
|
|
"kl/vocab_symmetric": 496.6741180419922,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -821.5552520751953,
|
|
"logps/rejected": -2054.2618560791016,
|
|
"loss": 2.1358,
|
|
"loss/dpo": 0.3257599932614975,
|
|
"misalign/J": 181.00886344909668,
|
|
"misalign/J_aux_loss": 1.8100886344909668,
|
|
"misalign/J_aux_loss_raw": 181.00886344909668,
|
|
"misalign/J_over_reverse_kl": 1.6241952329874039,
|
|
"misalign/J_per_token": 0.2575971782207489,
|
|
"misalign/compressed_reward_absmax": 4115.048797607422,
|
|
"misalign/compressed_reward_range": 5863.2059326171875,
|
|
"misalign/entropy_a": 867.4101486206055,
|
|
"misalign/entropy_b": 1317.5307006835938,
|
|
"misalign/forward_kl_divergence": 331.34838104248047,
|
|
"misalign/forward_kl_divergence_per_token": 0.5753209926187992,
|
|
"misalign/gamma_abs_times_reward_std": 46303981.5,
|
|
"misalign/gamma_bracketed_rate": 0.9869667664170265,
|
|
"misalign/gamma_reward_residual": 8.94081304068095e-05,
|
|
"misalign/gamma_star": 55120054.25,
|
|
"misalign/js_divergence": 39.656344413757324,
|
|
"misalign/reverse_kl_divergence": 165.32555389404297,
|
|
"misalign/reverse_kl_divergence_per_token": 0.2224746011197567,
|
|
"misalign/reward_a": 104.61750793457031,
|
|
"misalign/reward_b": -15.681229546666145,
|
|
"misalign/reward_improvement": 120.29868698120117,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.6759593263268471,
|
|
"misalign/reward_improvement_per_token": 0.08714451128616929,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -421.68537425994873,
|
|
"misalign/reward_vocab_std": 710.1155014038086,
|
|
"misalign/symmetric_kl": 496.6741180419922,
|
|
"misalign/tv_distance": 129.80499076843262,
|
|
"num_tokens": 6964060.0,
|
|
"rewards/accuracies": 0.96875,
|
|
"rewards/chosen": -13.713181614875793,
|
|
"rewards/margins": 48.436728954315186,
|
|
"rewards/rejected": -62.14991092681885,
|
|
"step": 41,
|
|
"support/residual_count": 151893.20703125,
|
|
"support/residual_mass_policy": 0.031454769195988774,
|
|
"support/residual_mass_reference": 0.04673444852232933,
|
|
"support/residual_reward": -0.3744997123721987,
|
|
"support/runtime_width": 42.79198360443115,
|
|
"support/sampled_loser_rank": 0.6497581750154495,
|
|
"support/sampled_reward_rank": -0.433025848120451,
|
|
"support/sampled_token_added_rate": 0.039118685061112046,
|
|
"support/sampled_winner_rank": 0.6626867726445198,
|
|
"support/selected_width": 42.79198360443115,
|
|
"support/stored_width": 42.79198360443115
|
|
},
|
|
{
|
|
"epoch": 0.65625,
|
|
"grad_norm": 150.85992431640625,
|
|
"kl/sequence_policy_ref": -384.6586265563965,
|
|
"kl/vocab_forward": 345.101411819458,
|
|
"kl/vocab_js": 41.772791624069214,
|
|
"kl/vocab_reverse": 176.38330841064453,
|
|
"kl/vocab_symmetric": 521.48486328125,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -863.8736877441406,
|
|
"logps/rejected": -1970.995620727539,
|
|
"loss": 1.7582,
|
|
"loss/dpo": 0.017987981137206566,
|
|
"misalign/J": 174.01841640472412,
|
|
"misalign/J_aux_loss": 1.7401841282844543,
|
|
"misalign/J_aux_loss_raw": 174.01841640472412,
|
|
"misalign/J_over_reverse_kl": 1.5599696189165115,
|
|
"misalign/J_per_token": 0.22202685475349426,
|
|
"misalign/compressed_reward_absmax": 4183.230377197266,
|
|
"misalign/compressed_reward_range": 5966.482116699219,
|
|
"misalign/entropy_a": 820.5932464599609,
|
|
"misalign/entropy_b": 1289.2757186889648,
|
|
"misalign/forward_kl_divergence": 345.101411819458,
|
|
"misalign/forward_kl_divergence_per_token": 0.5078705288469791,
|
|
"misalign/gamma_abs_times_reward_std": 39975210.5,
|
|
"misalign/gamma_bracketed_rate": 0.9893263578414917,
|
|
"misalign/gamma_reward_residual": 7.978630480920401e-05,
|
|
"misalign/gamma_star": 61503279.625,
|
|
"misalign/js_divergence": 41.772791624069214,
|
|
"misalign/reverse_kl_divergence": 176.38330841064453,
|
|
"misalign/reverse_kl_divergence_per_token": 0.2160295583307743,
|
|
"misalign/reward_a": 117.82565593719482,
|
|
"misalign/reward_b": -13.596548825502396,
|
|
"misalign/reward_improvement": 131.42217826843262,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.661291316151619,
|
|
"misalign/reward_improvement_per_token": 0.08064441289752722,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -524.60085105896,
|
|
"misalign/reward_vocab_std": 725.3326187133789,
|
|
"misalign/symmetric_kl": 521.48486328125,
|
|
"misalign/tv_distance": 134.1311206817627,
|
|
"num_tokens": 7130526.0,
|
|
"rewards/accuracies": 1.0,
|
|
"rewards/chosen": -14.464305222034454,
|
|
"rewards/margins": 48.003116607666016,
|
|
"rewards/rejected": -62.467421531677246,
|
|
"step": 42,
|
|
"support/residual_count": 151893.248046875,
|
|
"support/residual_mass_policy": 0.023710966343060136,
|
|
"support/residual_mass_reference": 0.03921722201630473,
|
|
"support/residual_reward": -0.5094128809869289,
|
|
"support/runtime_width": 42.75533056259155,
|
|
"support/sampled_loser_rank": 0.6201684325933456,
|
|
"support/sampled_reward_rank": -0.38710433803498745,
|
|
"support/sampled_token_added_rate": 0.034145432990044355,
|
|
"support/sampled_winner_rank": 0.6373011693358421,
|
|
"support/selected_width": 42.75533056259155,
|
|
"support/stored_width": 42.75533056259155
|
|
},
|
|
{
|
|
"epoch": 0.671875,
|
|
"grad_norm": 80.83043670654297,
|
|
"kl/sequence_policy_ref": -430.89680099487305,
|
|
"kl/vocab_forward": 383.08553886413574,
|
|
"kl/vocab_js": 42.64789915084839,
|
|
"kl/vocab_reverse": 176.26643562316895,
|
|
"kl/vocab_symmetric": 559.3521347045898,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -795.0814590454102,
|
|
"logps/rejected": -2073.403335571289,
|
|
"loss": 1.947,
|
|
"loss/dpo": 0.20042237156056064,
|
|
"misalign/J": 174.65477752685547,
|
|
"misalign/J_aux_loss": 1.7465477734804153,
|
|
"misalign/J_aux_loss_raw": 174.65477752685547,
|
|
"misalign/J_over_reverse_kl": 1.2193461656570435,
|
|
"misalign/J_per_token": 0.21656284667551517,
|
|
"misalign/compressed_reward_absmax": 4145.209014892578,
|
|
"misalign/compressed_reward_range": 5894.319396972656,
|
|
"misalign/entropy_a": 773.8354606628418,
|
|
"misalign/entropy_b": 1241.6957702636719,
|
|
"misalign/forward_kl_divergence": 383.08553886413574,
|
|
"misalign/forward_kl_divergence_per_token": 0.6106686592102051,
|
|
"misalign/gamma_abs_times_reward_std": 42811992.0,
|
|
"misalign/gamma_bracketed_rate": 0.9881041571497917,
|
|
"misalign/gamma_reward_residual": 0.0022615561604197865,
|
|
"misalign/gamma_star": 57841907.5,
|
|
"misalign/js_divergence": 42.64789915084839,
|
|
"misalign/reverse_kl_divergence": 176.26643562316895,
|
|
"misalign/reverse_kl_divergence_per_token": 0.20993488654494286,
|
|
"misalign/reward_a": 112.22536277770996,
|
|
"misalign/reward_b": -13.715306758880615,
|
|
"misalign/reward_improvement": 125.94063758850098,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.7269639819860458,
|
|
"misalign/reward_improvement_per_token": 0.13059457764029503,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -529.42356300354,
|
|
"misalign/reward_vocab_std": 729.1826972961426,
|
|
"misalign/symmetric_kl": 559.3521347045898,
|
|
"misalign/tv_distance": 134.69511699676514,
|
|
"num_tokens": 7295429.0,
|
|
"rewards/accuracies": 0.96875,
|
|
"rewards/chosen": -16.980572760105133,
|
|
"rewards/margins": 52.21821975708008,
|
|
"rewards/rejected": -69.19879198074341,
|
|
"step": 43,
|
|
"support/residual_count": 151893.271484375,
|
|
"support/residual_mass_policy": 0.020681084133684635,
|
|
"support/residual_mass_reference": 0.03672680747695267,
|
|
"support/residual_reward": -0.5217800214886665,
|
|
"support/runtime_width": 42.73170518875122,
|
|
"support/sampled_loser_rank": 0.6487637758255005,
|
|
"support/sampled_reward_rank": -0.33589007146656513,
|
|
"support/sampled_token_added_rate": 0.0326521759852767,
|
|
"support/sampled_winner_rank": 0.6681758984923363,
|
|
"support/selected_width": 42.73170518875122,
|
|
"support/stored_width": 42.73170518875122
|
|
},
|
|
{
|
|
"epoch": 0.6875,
|
|
"grad_norm": 126.96430206298828,
|
|
"kl/sequence_policy_ref": -369.60080337524414,
|
|
"kl/vocab_forward": 333.5518455505371,
|
|
"kl/vocab_js": 38.65760946273804,
|
|
"kl/vocab_reverse": 158.4586296081543,
|
|
"kl/vocab_symmetric": 492.01073837280273,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -845.3204116821289,
|
|
"logps/rejected": -1843.438232421875,
|
|
"loss": 2.4664,
|
|
"loss/dpo": 0.5317850863039725,
|
|
"misalign/J": 193.46406745910645,
|
|
"misalign/J_aux_loss": 1.9346406310796738,
|
|
"misalign/J_aux_loss_raw": 193.46406745910645,
|
|
"misalign/J_over_reverse_kl": 1.8410249948501587,
|
|
"misalign/J_per_token": 0.21784362383186817,
|
|
"misalign/compressed_reward_absmax": 4217.687438964844,
|
|
"misalign/compressed_reward_range": 6031.9879150390625,
|
|
"misalign/entropy_a": 751.3342437744141,
|
|
"misalign/entropy_b": 1211.2969436645508,
|
|
"misalign/forward_kl_divergence": 333.5518455505371,
|
|
"misalign/forward_kl_divergence_per_token": 0.4570343755185604,
|
|
"misalign/gamma_abs_times_reward_std": 54839250.375,
|
|
"misalign/gamma_bracketed_rate": 0.9886480942368507,
|
|
"misalign/gamma_reward_residual": 0.0009870923743164894,
|
|
"misalign/gamma_star": 66258410.125,
|
|
"misalign/js_divergence": 38.65760946273804,
|
|
"misalign/reverse_kl_divergence": 158.4586296081543,
|
|
"misalign/reverse_kl_divergence_per_token": 0.20796416513621807,
|
|
"misalign/reward_a": 101.46306848526001,
|
|
"misalign/reward_b": -14.63782051205635,
|
|
"misalign/reward_improvement": 116.10085487365723,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.6505585312843323,
|
|
"misalign/reward_improvement_per_token": 0.08616658858954906,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -558.583703994751,
|
|
"misalign/reward_vocab_std": 734.9819030761719,
|
|
"misalign/symmetric_kl": 492.01073837280273,
|
|
"misalign/tv_distance": 125.39980697631836,
|
|
"num_tokens": 7467937.0,
|
|
"rewards/accuracies": 0.984375,
|
|
"rewards/chosen": -16.166881680488586,
|
|
"rewards/margins": 41.5863995552063,
|
|
"rewards/rejected": -57.75328016281128,
|
|
"step": 44,
|
|
"support/residual_count": 151893.1171875,
|
|
"support/residual_mass_policy": 0.02624459331855178,
|
|
"support/residual_mass_reference": 0.04203576873987913,
|
|
"support/residual_reward": -0.41040224581956863,
|
|
"support/runtime_width": 42.882601737976074,
|
|
"support/sampled_loser_rank": 0.597820907831192,
|
|
"support/sampled_reward_rank": -0.4271346926689148,
|
|
"support/sampled_token_added_rate": 0.03776927734725177,
|
|
"support/sampled_winner_rank": 0.6288959942758083,
|
|
"support/selected_width": 42.882601737976074,
|
|
"support/stored_width": 42.882601737976074
|
|
},
|
|
{
|
|
"epoch": 0.703125,
|
|
"grad_norm": 263.4734191894531,
|
|
"kl/sequence_policy_ref": -398.33577728271484,
|
|
"kl/vocab_forward": 362.0641288757324,
|
|
"kl/vocab_js": 40.054423570632935,
|
|
"kl/vocab_reverse": 163.50957679748535,
|
|
"kl/vocab_symmetric": 525.5739707946777,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -861.9217529296875,
|
|
"logps/rejected": -1947.7921600341797,
|
|
"loss": 2.821,
|
|
"loss/dpo": 0.805006888877976,
|
|
"misalign/J": 201.59605598449707,
|
|
"misalign/J_aux_loss": 2.0159604400396347,
|
|
"misalign/J_aux_loss_raw": 201.59605598449707,
|
|
"misalign/J_over_reverse_kl": 1.4087589755654335,
|
|
"misalign/J_per_token": 0.23312609270215034,
|
|
"misalign/compressed_reward_absmax": 4261.386077880859,
|
|
"misalign/compressed_reward_range": 5968.6805419921875,
|
|
"misalign/entropy_a": 787.204231262207,
|
|
"misalign/entropy_b": 1236.5688171386719,
|
|
"misalign/forward_kl_divergence": 362.0641288757324,
|
|
"misalign/forward_kl_divergence_per_token": 0.5964512750506401,
|
|
"misalign/gamma_abs_times_reward_std": 58279984.25,
|
|
"misalign/gamma_bracketed_rate": 0.9880961626768112,
|
|
"misalign/gamma_reward_residual": 0.0008914316013033385,
|
|
"misalign/gamma_star": 100481896.0,
|
|
"misalign/js_divergence": 40.054423570632935,
|
|
"misalign/reverse_kl_divergence": 163.50957679748535,
|
|
"misalign/reverse_kl_divergence_per_token": 0.22092271596193314,
|
|
"misalign/reward_a": 105.81989192962646,
|
|
"misalign/reward_b": -13.749351739883423,
|
|
"misalign/reward_improvement": 119.56921005249023,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.6199643798172474,
|
|
"misalign/reward_improvement_per_token": 0.08543467940762639,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -543.9455938339233,
|
|
"misalign/reward_vocab_std": 719.025936126709,
|
|
"misalign/symmetric_kl": 525.5739707946777,
|
|
"misalign/tv_distance": 128.89692497253418,
|
|
"num_tokens": 7641400.0,
|
|
"rewards/accuracies": 0.953125,
|
|
"rewards/chosen": -18.768484711647034,
|
|
"rewards/margins": 42.13018798828125,
|
|
"rewards/rejected": -60.89867305755615,
|
|
"step": 45,
|
|
"support/residual_count": 151893.509765625,
|
|
"support/residual_mass_policy": 0.023412443813867867,
|
|
"support/residual_mass_reference": 0.03834630874916911,
|
|
"support/residual_reward": -0.5029764696955681,
|
|
"support/runtime_width": 42.48880910873413,
|
|
"support/sampled_loser_rank": 0.6054023541510105,
|
|
"support/sampled_reward_rank": -0.30370173789560795,
|
|
"support/sampled_token_added_rate": 0.03430053312331438,
|
|
"support/sampled_winner_rank": 0.6158883348107338,
|
|
"support/selected_width": 42.48880910873413,
|
|
"support/stored_width": 42.48880910873413
|
|
},
|
|
{
|
|
"epoch": 0.71875,
|
|
"grad_norm": 99.2862548828125,
|
|
"kl/sequence_policy_ref": -352.9979362487793,
|
|
"kl/vocab_forward": 320.0415687561035,
|
|
"kl/vocab_js": 35.314189434051514,
|
|
"kl/vocab_reverse": 143.37086391448975,
|
|
"kl/vocab_symmetric": 463.41268157958984,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -780.6201934814453,
|
|
"logps/rejected": -1745.756332397461,
|
|
"loss": 1.967,
|
|
"loss/dpo": 0.3397530964894031,
|
|
"misalign/J": 162.72915935516357,
|
|
"misalign/J_aux_loss": 1.6272915750741959,
|
|
"misalign/J_aux_loss_raw": 162.72915935516357,
|
|
"misalign/J_over_reverse_kl": 1.814868986606598,
|
|
"misalign/J_per_token": 0.19831308163702488,
|
|
"misalign/compressed_reward_absmax": 3964.359344482422,
|
|
"misalign/compressed_reward_range": 5592.5999755859375,
|
|
"misalign/entropy_a": 701.7979049682617,
|
|
"misalign/entropy_b": 1128.1244354248047,
|
|
"misalign/forward_kl_divergence": 320.0415687561035,
|
|
"misalign/forward_kl_divergence_per_token": 0.5070049501955509,
|
|
"misalign/gamma_abs_times_reward_std": 43917063.0,
|
|
"misalign/gamma_bracketed_rate": 0.9886893406510353,
|
|
"misalign/gamma_reward_residual": 0.00017012334137689322,
|
|
"misalign/gamma_star": 66023362.0,
|
|
"misalign/js_divergence": 35.314189434051514,
|
|
"misalign/reverse_kl_divergence": 143.37086391448975,
|
|
"misalign/reverse_kl_divergence_per_token": 0.18735219724476337,
|
|
"misalign/reward_a": 90.78490257263184,
|
|
"misalign/reward_b": -14.648303270339966,
|
|
"misalign/reward_improvement": 105.43315505981445,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.6790562570095062,
|
|
"misalign/reward_improvement_per_token": 0.09955848660320044,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -502.35912132263184,
|
|
"misalign/reward_vocab_std": 681.5238342285156,
|
|
"misalign/symmetric_kl": 463.41268157958984,
|
|
"misalign/tv_distance": 114.9539966583252,
|
|
"num_tokens": 7808320.0,
|
|
"rewards/accuracies": 0.9375,
|
|
"rewards/chosen": -15.536956906318665,
|
|
"rewards/margins": 39.525673627853394,
|
|
"rewards/rejected": -55.06262969970703,
|
|
"step": 46,
|
|
"support/residual_count": 151893.275390625,
|
|
"support/residual_mass_policy": 0.027146896114572883,
|
|
"support/residual_mass_reference": 0.04411950474604964,
|
|
"support/residual_reward": -0.5129662416875362,
|
|
"support/runtime_width": 42.72548723220825,
|
|
"support/sampled_loser_rank": 0.5832961872220039,
|
|
"support/sampled_reward_rank": -0.366660688072443,
|
|
"support/sampled_token_added_rate": 0.03585506067611277,
|
|
"support/sampled_winner_rank": 0.6062828227877617,
|
|
"support/selected_width": 42.72548723220825,
|
|
"support/stored_width": 42.72548723220825
|
|
},
|
|
{
|
|
"epoch": 0.734375,
|
|
"grad_norm": 95.33216857910156,
|
|
"kl/sequence_policy_ref": -339.36829376220703,
|
|
"kl/vocab_forward": 309.0083351135254,
|
|
"kl/vocab_js": 34.39964842796326,
|
|
"kl/vocab_reverse": 138.5347490310669,
|
|
"kl/vocab_symmetric": 447.5432891845703,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -834.6276512145996,
|
|
"logps/rejected": -1626.2992553710938,
|
|
"loss": 2.1621,
|
|
"loss/dpo": 0.5789944041461155,
|
|
"misalign/J": 158.3114309310913,
|
|
"misalign/J_aux_loss": 1.5831142514944077,
|
|
"misalign/J_aux_loss_raw": 158.3114309310913,
|
|
"misalign/J_over_reverse_kl": 1.3874521106481552,
|
|
"misalign/J_per_token": 0.2954816836863756,
|
|
"misalign/compressed_reward_absmax": 3782.6796264648438,
|
|
"misalign/compressed_reward_range": 5458.201934814453,
|
|
"misalign/entropy_a": 683.5626831054688,
|
|
"misalign/entropy_b": 1110.1263809204102,
|
|
"misalign/forward_kl_divergence": 309.0083351135254,
|
|
"misalign/forward_kl_divergence_per_token": 0.7647044509649277,
|
|
"misalign/gamma_abs_times_reward_std": 42300354.5,
|
|
"misalign/gamma_bracketed_rate": 0.9892060980200768,
|
|
"misalign/gamma_reward_residual": 0.00012735200425595394,
|
|
"misalign/gamma_star": 70152250.0,
|
|
"misalign/js_divergence": 34.39964842796326,
|
|
"misalign/reverse_kl_divergence": 138.5347490310669,
|
|
"misalign/reverse_kl_divergence_per_token": 0.3200199883431196,
|
|
"misalign/reward_a": 96.08270931243896,
|
|
"misalign/reward_b": -11.616803467273712,
|
|
"misalign/reward_improvement": 107.69949054718018,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.6338916420936584,
|
|
"misalign/reward_improvement_per_token": 0.0737753571011126,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -587.4529418945312,
|
|
"misalign/reward_vocab_std": 666.2506561279297,
|
|
"misalign/symmetric_kl": 447.5432891845703,
|
|
"misalign/tv_distance": 114.28812599182129,
|
|
"num_tokens": 7963180.0,
|
|
"rewards/accuracies": 0.953125,
|
|
"rewards/chosen": -15.032280445098877,
|
|
"rewards/margins": 37.809099435806274,
|
|
"rewards/rejected": -52.84138059616089,
|
|
"step": 47,
|
|
"support/residual_count": 151893.236328125,
|
|
"support/residual_mass_policy": 0.02375667286105454,
|
|
"support/residual_mass_reference": 0.04523510206490755,
|
|
"support/residual_reward": -0.6332408636808395,
|
|
"support/runtime_width": 42.76191568374634,
|
|
"support/sampled_loser_rank": 0.6728832796216011,
|
|
"support/sampled_reward_rank": -0.326838955283165,
|
|
"support/sampled_token_added_rate": 0.029435024363920093,
|
|
"support/sampled_winner_rank": 0.7073031365871429,
|
|
"support/selected_width": 42.76191568374634,
|
|
"support/stored_width": 42.76191568374634
|
|
},
|
|
{
|
|
"epoch": 0.75,
|
|
"grad_norm": 92.29680633544922,
|
|
"kl/sequence_policy_ref": -349.18139266967773,
|
|
"kl/vocab_forward": 318.0182914733887,
|
|
"kl/vocab_js": 36.353710412979126,
|
|
"kl/vocab_reverse": 146.12834548950195,
|
|
"kl/vocab_symmetric": 464.14682960510254,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -845.9873504638672,
|
|
"logps/rejected": -1924.1956024169922,
|
|
"loss": 1.6528,
|
|
"loss/dpo": 0.049382371892768256,
|
|
"misalign/J": 160.33691692352295,
|
|
"misalign/J_aux_loss": 1.6033690869808197,
|
|
"misalign/J_aux_loss_raw": 160.33691692352295,
|
|
"misalign/J_over_reverse_kl": 1.2529658675193787,
|
|
"misalign/J_per_token": 0.20762860495597124,
|
|
"misalign/compressed_reward_absmax": 4354.188568115234,
|
|
"misalign/compressed_reward_range": 6215.323059082031,
|
|
"misalign/entropy_a": 822.5513381958008,
|
|
"misalign/entropy_b": 1289.558578491211,
|
|
"misalign/forward_kl_divergence": 318.0182914733887,
|
|
"misalign/forward_kl_divergence_per_token": 0.440604854375124,
|
|
"misalign/gamma_abs_times_reward_std": 41074954.5,
|
|
"misalign/gamma_bracketed_rate": 0.9895801991224289,
|
|
"misalign/gamma_reward_residual": 0.00018931449562842317,
|
|
"misalign/gamma_star": 57840401.875,
|
|
"misalign/js_divergence": 36.353710412979126,
|
|
"misalign/reverse_kl_divergence": 146.12834548950195,
|
|
"misalign/reverse_kl_divergence_per_token": 0.18436118587851524,
|
|
"misalign/reward_a": 101.43512630462646,
|
|
"misalign/reward_b": -11.868624448776245,
|
|
"misalign/reward_improvement": 113.30374765396118,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.7551356106996536,
|
|
"misalign/reward_improvement_per_token": 0.1261532404460013,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -493.02664852142334,
|
|
"misalign/reward_vocab_std": 759.0527877807617,
|
|
"misalign/symmetric_kl": 464.14682960510254,
|
|
"misalign/tv_distance": 123.64643812179565,
|
|
"num_tokens": 8135261.0,
|
|
"rewards/accuracies": 0.984375,
|
|
"rewards/chosen": -14.46340036392212,
|
|
"rewards/margins": 40.90948009490967,
|
|
"rewards/rejected": -55.37287950515747,
|
|
"step": 48,
|
|
"support/residual_count": 151893.193359375,
|
|
"support/residual_mass_policy": 0.026260258397087455,
|
|
"support/residual_mass_reference": 0.042555712163448334,
|
|
"support/residual_reward": -0.47505020070821047,
|
|
"support/runtime_width": 42.80619525909424,
|
|
"support/sampled_loser_rank": 0.625778254121542,
|
|
"support/sampled_reward_rank": -0.3972127176821232,
|
|
"support/sampled_token_added_rate": 0.03378989826887846,
|
|
"support/sampled_winner_rank": 0.6260612569749355,
|
|
"support/selected_width": 42.80619525909424,
|
|
"support/stored_width": 42.80619525909424
|
|
},
|
|
{
|
|
"epoch": 0.75,
|
|
"eval_kl/sequence_policy_ref": -364.4170129299164,
|
|
"eval_kl/vocab_forward": 328.8754801750183,
|
|
"eval_kl/vocab_js": 37.498633831739426,
|
|
"eval_kl/vocab_reverse": 150.4579164981842,
|
|
"eval_kl/vocab_symmetric": 479.3336043357849,
|
|
"eval_logps/chosen": -882.4125304222107,
|
|
"eval_logps/rejected": -1946.6873836517334,
|
|
"eval_loss": 1.7490763664245605,
|
|
"eval_loss/dpo": 0.06740551616172231,
|
|
"eval_misalign/J": 168.167093873024,
|
|
"eval_misalign/J_aux_loss": 1.6816708873957396,
|
|
"eval_misalign/J_aux_loss_raw": 168.167093873024,
|
|
"eval_misalign/J_over_reverse_kl": 1.7788694016635418,
|
|
"eval_misalign/J_per_token": 0.22979121375828981,
|
|
"eval_misalign/compressed_reward_absmax": 4236.706272125244,
|
|
"eval_misalign/compressed_reward_range": 6027.056537628174,
|
|
"eval_misalign/entropy_a": 823.1465611457825,
|
|
"eval_misalign/entropy_b": 1298.0544729232788,
|
|
"eval_misalign/forward_kl_divergence": 328.8754801750183,
|
|
"eval_misalign/forward_kl_divergence_per_token": 0.4856905459892005,
|
|
"eval_misalign/gamma_abs_times_reward_std": 43424142.421875,
|
|
"eval_misalign/gamma_bracketed_rate": 0.9878035467118025,
|
|
"eval_misalign/gamma_reward_residual": 0.0007769855760599853,
|
|
"eval_misalign/gamma_star": 61287659.90625,
|
|
"eval_misalign/js_divergence": 37.498633831739426,
|
|
"eval_misalign/reverse_kl_divergence": 150.4579164981842,
|
|
"eval_misalign/reverse_kl_divergence_per_token": 0.19300507346633822,
|
|
"eval_misalign/reward_a": 101.7501335144043,
|
|
"eval_misalign/reward_b": -14.307281976565719,
|
|
"eval_misalign/reward_improvement": 116.05739098787308,
|
|
"eval_misalign/reward_improvement_over_reverse_kl": 0.7271916684694588,
|
|
"eval_misalign/reward_improvement_per_token": 0.09193722825148143,
|
|
"eval_misalign/reward_signal_low_rate": 0.0,
|
|
"eval_misalign/reward_vocab_mean": -490.11554992198944,
|
|
"eval_misalign/reward_vocab_std": 731.32026720047,
|
|
"eval_misalign/symmetric_kl": 479.3336043357849,
|
|
"eval_misalign/tv_distance": 127.07569408416748,
|
|
"eval_rewards/accuracies": 0.984375,
|
|
"eval_rewards/chosen": -15.323657296597958,
|
|
"eval_rewards/margins": 42.23608899116516,
|
|
"eval_rewards/rejected": -57.55974632501602,
|
|
"eval_runtime": 100.8337,
|
|
"eval_samples_per_second": 5.078,
|
|
"eval_steps_per_second": 0.635,
|
|
"eval_support/residual_count": 151893.29125976562,
|
|
"eval_support/residual_mass_policy": 0.025306705734692514,
|
|
"eval_support/residual_mass_reference": 0.04279232310364023,
|
|
"eval_support/residual_reward": -0.4695481152739376,
|
|
"eval_support/runtime_width": 42.70963191986084,
|
|
"eval_support/sampled_loser_rank": 0.6486562248319387,
|
|
"eval_support/sampled_reward_rank": -0.37071577250026166,
|
|
"eval_support/sampled_token_added_rate": 0.037317203474231064,
|
|
"eval_support/sampled_winner_rank": 0.6506854901090264,
|
|
"eval_support/selected_width": 42.70963191986084,
|
|
"eval_support/stored_width": 42.70963191986084,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.765625,
|
|
"grad_norm": 84.55875396728516,
|
|
"kl/sequence_policy_ref": -355.52197265625,
|
|
"kl/vocab_forward": 323.58666229248047,
|
|
"kl/vocab_js": 38.03043556213379,
|
|
"kl/vocab_reverse": 151.7306032180786,
|
|
"kl/vocab_symmetric": 475.31740951538086,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -917.9943237304688,
|
|
"logps/rejected": -1895.99658203125,
|
|
"loss": 1.6704,
|
|
"loss/dpo": 0.04557584371703283,
|
|
"misalign/J": 162.48288917541504,
|
|
"misalign/J_aux_loss": 1.624828889966011,
|
|
"misalign/J_aux_loss_raw": 162.48288917541504,
|
|
"misalign/J_over_reverse_kl": 1.4654640778899193,
|
|
"misalign/J_per_token": 0.20114040188491344,
|
|
"misalign/compressed_reward_absmax": 4219.985290527344,
|
|
"misalign/compressed_reward_range": 5980.884826660156,
|
|
"misalign/entropy_a": 811.0443420410156,
|
|
"misalign/entropy_b": 1299.4309158325195,
|
|
"misalign/forward_kl_divergence": 323.58666229248047,
|
|
"misalign/forward_kl_divergence_per_token": 0.399110134691,
|
|
"misalign/gamma_abs_times_reward_std": 53172764.0,
|
|
"misalign/gamma_bracketed_rate": 0.9869352579116821,
|
|
"misalign/gamma_reward_residual": 3.9546100197185297e-05,
|
|
"misalign/gamma_star": 60015785.75,
|
|
"misalign/js_divergence": 38.03043556213379,
|
|
"misalign/reverse_kl_divergence": 151.7306032180786,
|
|
"misalign/reverse_kl_divergence_per_token": 0.16637993790209293,
|
|
"misalign/reward_a": 105.46696472167969,
|
|
"misalign/reward_b": -16.47221863269806,
|
|
"misalign/reward_improvement": 121.9391622543335,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.7262164875864983,
|
|
"misalign/reward_improvement_per_token": 0.10510652232915163,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -500.18709564208984,
|
|
"misalign/reward_vocab_std": 730.5291976928711,
|
|
"misalign/symmetric_kl": 475.31740951538086,
|
|
"misalign/tv_distance": 129.61585903167725,
|
|
"num_tokens": 8305232.0,
|
|
"rewards/accuracies": 0.984375,
|
|
"rewards/chosen": -15.759445548057556,
|
|
"rewards/margins": 39.58550405502319,
|
|
"rewards/rejected": -55.34494924545288,
|
|
"step": 49,
|
|
"support/residual_count": 151893.353515625,
|
|
"support/residual_mass_policy": 0.022411803365685046,
|
|
"support/residual_mass_reference": 0.03780948673374951,
|
|
"support/residual_reward": -0.5111432895064354,
|
|
"support/runtime_width": 42.64701747894287,
|
|
"support/sampled_loser_rank": 0.6062813438475132,
|
|
"support/sampled_reward_rank": -0.361979590728879,
|
|
"support/sampled_token_added_rate": 0.03365356335416436,
|
|
"support/sampled_winner_rank": 0.626273512840271,
|
|
"support/selected_width": 42.64701747894287,
|
|
"support/stored_width": 42.64701747894287
|
|
},
|
|
{
|
|
"epoch": 0.78125,
|
|
"grad_norm": 120.11223602294922,
|
|
"kl/sequence_policy_ref": -378.5549774169922,
|
|
"kl/vocab_forward": 352.6853256225586,
|
|
"kl/vocab_js": 37.74899077415466,
|
|
"kl/vocab_reverse": 148.82802867889404,
|
|
"kl/vocab_symmetric": 501.5135078430176,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -884.3883514404297,
|
|
"logps/rejected": -1912.2733612060547,
|
|
"loss": 2.1813,
|
|
"loss/dpo": 0.1028611735179559,
|
|
"misalign/J": 207.84398746490479,
|
|
"misalign/J_aux_loss": 2.078439861536026,
|
|
"misalign/J_aux_loss_raw": 207.84398746490479,
|
|
"misalign/J_over_reverse_kl": 1.7021770626306534,
|
|
"misalign/J_per_token": 0.2538851350545883,
|
|
"misalign/compressed_reward_absmax": 4287.376739501953,
|
|
"misalign/compressed_reward_range": 6082.450378417969,
|
|
"misalign/entropy_a": 770.8007431030273,
|
|
"misalign/entropy_b": 1264.5378875732422,
|
|
"misalign/forward_kl_divergence": 352.6853256225586,
|
|
"misalign/forward_kl_divergence_per_token": 0.5054213367402554,
|
|
"misalign/gamma_abs_times_reward_std": 62323988.25,
|
|
"misalign/gamma_bracketed_rate": 0.985651396214962,
|
|
"misalign/gamma_reward_residual": 0.002676691350643523,
|
|
"misalign/gamma_star": 81304675.0,
|
|
"misalign/js_divergence": 37.74899077415466,
|
|
"misalign/reverse_kl_divergence": 148.82802867889404,
|
|
"misalign/reverse_kl_divergence_per_token": 0.19959929399192333,
|
|
"misalign/reward_a": 91.68502187728882,
|
|
"misalign/reward_b": -13.228169560432434,
|
|
"misalign/reward_improvement": 104.91317367553711,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.6395450867712498,
|
|
"misalign/reward_improvement_per_token": 0.07659031543880701,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -545.8368339538574,
|
|
"misalign/reward_vocab_std": 735.2048110961914,
|
|
"misalign/symmetric_kl": 501.5135078430176,
|
|
"misalign/tv_distance": 127.10494995117188,
|
|
"num_tokens": 8481546.0,
|
|
"rewards/accuracies": 0.984375,
|
|
"rewards/chosen": -16.9174667596817,
|
|
"rewards/margins": 41.87606382369995,
|
|
"rewards/rejected": -58.793529987335205,
|
|
"step": 50,
|
|
"support/residual_count": 151893.240234375,
|
|
"support/residual_mass_policy": 0.028523427667096257,
|
|
"support/residual_mass_reference": 0.04290076903998852,
|
|
"support/residual_reward": -0.4372365176677704,
|
|
"support/runtime_width": 42.76475811004639,
|
|
"support/sampled_loser_rank": 0.5924237333238125,
|
|
"support/sampled_reward_rank": -0.40600746124982834,
|
|
"support/sampled_token_added_rate": 0.03576831449754536,
|
|
"support/sampled_winner_rank": 0.618581123650074,
|
|
"support/selected_width": 42.76475811004639,
|
|
"support/stored_width": 42.76475811004639
|
|
},
|
|
{
|
|
"epoch": 0.796875,
|
|
"grad_norm": 74.84430694580078,
|
|
"kl/sequence_policy_ref": -348.85993576049805,
|
|
"kl/vocab_forward": 322.82544136047363,
|
|
"kl/vocab_js": 34.94163501262665,
|
|
"kl/vocab_reverse": 137.33496236801147,
|
|
"kl/vocab_symmetric": 460.160680770874,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -807.7403802871704,
|
|
"logps/rejected": -1745.3731842041016,
|
|
"loss": 2.0267,
|
|
"loss/dpo": 0.47453043650421023,
|
|
"misalign/J": 155.21207237243652,
|
|
"misalign/J_aux_loss": 1.5521207079291344,
|
|
"misalign/J_aux_loss_raw": 155.21207237243652,
|
|
"misalign/J_over_reverse_kl": 1.433588370680809,
|
|
"misalign/J_per_token": 0.2532362565398216,
|
|
"misalign/compressed_reward_absmax": 3888.342071533203,
|
|
"misalign/compressed_reward_range": 5480.323181152344,
|
|
"misalign/entropy_a": 695.3439636230469,
|
|
"misalign/entropy_b": 1156.5927391052246,
|
|
"misalign/forward_kl_divergence": 322.82544136047363,
|
|
"misalign/forward_kl_divergence_per_token": 0.725763525813818,
|
|
"misalign/gamma_abs_times_reward_std": 41670287.75,
|
|
"misalign/gamma_bracketed_rate": 0.9877297207713127,
|
|
"misalign/gamma_reward_residual": -6.590514796300795e-05,
|
|
"misalign/gamma_star": 57525834.875,
|
|
"misalign/js_divergence": 34.94163501262665,
|
|
"misalign/reverse_kl_divergence": 137.33496236801147,
|
|
"misalign/reverse_kl_divergence_per_token": 0.2455148883163929,
|
|
"misalign/reward_a": 83.87791728973389,
|
|
"misalign/reward_b": -14.364118754863739,
|
|
"misalign/reward_improvement": 98.24201107025146,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.5628711394965649,
|
|
"misalign/reward_improvement_per_token": 0.03299418743699789,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -404.0699119567871,
|
|
"misalign/reward_vocab_std": 662.7630424499512,
|
|
"misalign/symmetric_kl": 460.160680770874,
|
|
"misalign/tv_distance": 116.86641025543213,
|
|
"num_tokens": 8645585.0,
|
|
"rewards/accuracies": 0.96875,
|
|
"rewards/chosen": -16.726518213748932,
|
|
"rewards/margins": 36.31895399093628,
|
|
"rewards/rejected": -53.045472145080566,
|
|
"step": 51,
|
|
"support/residual_count": 151893.197265625,
|
|
"support/residual_mass_policy": 0.024921999080106616,
|
|
"support/residual_mass_reference": 0.04474550345912576,
|
|
"support/residual_reward": -0.4795303028076887,
|
|
"support/runtime_width": 42.801236152648926,
|
|
"support/sampled_loser_rank": 0.624417282640934,
|
|
"support/sampled_reward_rank": -0.37105782236903906,
|
|
"support/sampled_token_added_rate": 0.032293472439050674,
|
|
"support/sampled_winner_rank": 0.6685933172702789,
|
|
"support/selected_width": 42.801236152648926,
|
|
"support/stored_width": 42.801236152648926
|
|
},
|
|
{
|
|
"epoch": 0.8125,
|
|
"grad_norm": 86.25784301757812,
|
|
"kl/sequence_policy_ref": -477.46276092529297,
|
|
"kl/vocab_forward": 436.0279541015625,
|
|
"kl/vocab_js": 44.9802827835083,
|
|
"kl/vocab_reverse": 177.54276657104492,
|
|
"kl/vocab_symmetric": 613.5709609985352,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -881.7166900634766,
|
|
"logps/rejected": -2414.805862426758,
|
|
"loss": 2.5689,
|
|
"loss/dpo": 0.6295568409377225,
|
|
"misalign/J": 193.93076133728027,
|
|
"misalign/J_aux_loss": 1.939307525753975,
|
|
"misalign/J_aux_loss_raw": 193.93076133728027,
|
|
"misalign/J_over_reverse_kl": 1.406748965382576,
|
|
"misalign/J_per_token": 0.2848157715052366,
|
|
"misalign/compressed_reward_absmax": 4759.470764160156,
|
|
"misalign/compressed_reward_range": 6730.137023925781,
|
|
"misalign/entropy_a": 874.9066314697266,
|
|
"misalign/entropy_b": 1443.2865905761719,
|
|
"misalign/forward_kl_divergence": 436.0279541015625,
|
|
"misalign/forward_kl_divergence_per_token": 0.5377647392451763,
|
|
"misalign/gamma_abs_times_reward_std": 52484937.5,
|
|
"misalign/gamma_bracketed_rate": 0.9844366908073425,
|
|
"misalign/gamma_reward_residual": 0.0008822159904866567,
|
|
"misalign/gamma_star": 86450198.0,
|
|
"misalign/js_divergence": 44.9802827835083,
|
|
"misalign/reverse_kl_divergence": 177.54276657104492,
|
|
"misalign/reverse_kl_divergence_per_token": 0.1856713891029358,
|
|
"misalign/reward_a": 107.41517734527588,
|
|
"misalign/reward_b": -19.218833327293396,
|
|
"misalign/reward_improvement": 126.63399696350098,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.6467054821550846,
|
|
"misalign/reward_improvement_per_token": 0.07611742825247347,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -570.9265441894531,
|
|
"misalign/reward_vocab_std": 818.5440444946289,
|
|
"misalign/symmetric_kl": 613.5709609985352,
|
|
"misalign/tv_distance": 146.72683811187744,
|
|
"num_tokens": 8830616.0,
|
|
"rewards/accuracies": 0.96875,
|
|
"rewards/chosen": -20.915815234184265,
|
|
"rewards/margins": 53.66092252731323,
|
|
"rewards/rejected": -74.57673835754395,
|
|
"step": 52,
|
|
"support/residual_count": 151893.328125,
|
|
"support/residual_mass_policy": 0.02075903001241386,
|
|
"support/residual_mass_reference": 0.04033026983961463,
|
|
"support/residual_reward": -0.39157247683033347,
|
|
"support/runtime_width": 42.66762590408325,
|
|
"support/sampled_loser_rank": 0.6271255537867546,
|
|
"support/sampled_reward_rank": -0.3960692873224616,
|
|
"support/sampled_token_added_rate": 0.0356892254203558,
|
|
"support/sampled_winner_rank": 0.6444137506186962,
|
|
"support/selected_width": 42.66762590408325,
|
|
"support/stored_width": 42.66762590408325
|
|
},
|
|
{
|
|
"epoch": 0.828125,
|
|
"grad_norm": 101.57430267333984,
|
|
"kl/sequence_policy_ref": -465.77392578125,
|
|
"kl/vocab_forward": 446.5703926086426,
|
|
"kl/vocab_js": 47.15303373336792,
|
|
"kl/vocab_reverse": 184.80498790740967,
|
|
"kl/vocab_symmetric": 631.3756484985352,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -1251.7670364379883,
|
|
"logps/rejected": -2130.667724609375,
|
|
"loss": 2.2582,
|
|
"loss/dpo": 0.29861677209665966,
|
|
"misalign/J": 195.95514106750488,
|
|
"misalign/J_aux_loss": 1.959551364183426,
|
|
"misalign/J_aux_loss_raw": 195.95514106750488,
|
|
"misalign/J_over_reverse_kl": 1.1659726202487946,
|
|
"misalign/J_per_token": 0.2168925404548645,
|
|
"misalign/compressed_reward_absmax": 4992.182373046875,
|
|
"misalign/compressed_reward_range": 7108.8575439453125,
|
|
"misalign/entropy_a": 898.7776031494141,
|
|
"misalign/entropy_b": 1514.9154586791992,
|
|
"misalign/forward_kl_divergence": 446.5703926086426,
|
|
"misalign/forward_kl_divergence_per_token": 0.5860045477747917,
|
|
"misalign/gamma_abs_times_reward_std": 51011083.5,
|
|
"misalign/gamma_bracketed_rate": 0.9903441444039345,
|
|
"misalign/gamma_reward_residual": -0.0019245314256295387,
|
|
"misalign/gamma_star": 85517778.5,
|
|
"misalign/js_divergence": 47.15303373336792,
|
|
"misalign/reverse_kl_divergence": 184.80498790740967,
|
|
"misalign/reverse_kl_divergence_per_token": 0.21940777078270912,
|
|
"misalign/reward_a": 114.58473777770996,
|
|
"misalign/reward_b": -14.659283697605133,
|
|
"misalign/reward_improvement": 129.2440366744995,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.6434826478362083,
|
|
"misalign/reward_improvement_per_token": 0.05605245754122734,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -762.099235534668,
|
|
"misalign/reward_vocab_std": 874.5540542602539,
|
|
"misalign/symmetric_kl": 631.3756484985352,
|
|
"misalign/tv_distance": 157.77043342590332,
|
|
"num_tokens": 9025743.0,
|
|
"rewards/accuracies": 0.953125,
|
|
"rewards/chosen": -26.162310361862183,
|
|
"rewards/margins": 40.83016753196716,
|
|
"rewards/rejected": -66.99247980117798,
|
|
"step": 53,
|
|
"support/residual_count": 151893.298828125,
|
|
"support/residual_mass_policy": 0.020821302896365523,
|
|
"support/residual_mass_reference": 0.039504863787442446,
|
|
"support/residual_reward": -0.5619673319160938,
|
|
"support/runtime_width": 42.6985387802124,
|
|
"support/sampled_loser_rank": 0.6461238414049149,
|
|
"support/sampled_reward_rank": -0.34166772849857807,
|
|
"support/sampled_token_added_rate": 0.03369109332561493,
|
|
"support/sampled_winner_rank": 0.6555347442626953,
|
|
"support/selected_width": 42.6985387802124,
|
|
"support/stored_width": 42.6985387802124
|
|
},
|
|
{
|
|
"epoch": 0.84375,
|
|
"grad_norm": 298.0096435546875,
|
|
"kl/sequence_policy_ref": -386.71077728271484,
|
|
"kl/vocab_forward": 365.6785316467285,
|
|
"kl/vocab_js": 35.59814095497131,
|
|
"kl/vocab_reverse": 139.05288410186768,
|
|
"kl/vocab_symmetric": 504.7316131591797,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -760.6196022033691,
|
|
"logps/rejected": -1875.7993927001953,
|
|
"loss": 2.171,
|
|
"loss/dpo": 0.14888411300079252,
|
|
"misalign/J": 202.2076416015625,
|
|
"misalign/J_aux_loss": 2.022076301276684,
|
|
"misalign/J_aux_loss_raw": 202.2076416015625,
|
|
"misalign/J_over_reverse_kl": 2.347415864467621,
|
|
"misalign/J_per_token": 0.2071497868746519,
|
|
"misalign/compressed_reward_absmax": 4018.2659912109375,
|
|
"misalign/compressed_reward_range": 5681.7738037109375,
|
|
"misalign/entropy_a": 716.9813385009766,
|
|
"misalign/entropy_b": 1167.6287994384766,
|
|
"misalign/forward_kl_divergence": 365.6785316467285,
|
|
"misalign/forward_kl_divergence_per_token": 0.49891503155231476,
|
|
"misalign/gamma_abs_times_reward_std": 61696461.5,
|
|
"misalign/gamma_bracketed_rate": 0.9874565973877907,
|
|
"misalign/gamma_reward_residual": -0.004232324329905168,
|
|
"misalign/gamma_star": 28188861.5,
|
|
"misalign/js_divergence": 35.59814095497131,
|
|
"misalign/reverse_kl_divergence": 139.05288410186768,
|
|
"misalign/reverse_kl_divergence_per_token": 0.17908263765275478,
|
|
"misalign/reward_a": 83.29793310165405,
|
|
"misalign/reward_b": -13.053004205226898,
|
|
"misalign/reward_improvement": 96.35093402862549,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.588286180049181,
|
|
"misalign/reward_improvement_per_token": 0.07466917904093862,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -496.0263900756836,
|
|
"misalign/reward_vocab_std": 698.2202835083008,
|
|
"misalign/symmetric_kl": 504.7316131591797,
|
|
"misalign/tv_distance": 115.95970249176025,
|
|
"num_tokens": 9183237.0,
|
|
"rewards/accuracies": 0.953125,
|
|
"rewards/chosen": -18.112544775009155,
|
|
"rewards/margins": 41.11706781387329,
|
|
"rewards/rejected": -59.229612827301025,
|
|
"step": 54,
|
|
"support/residual_count": 151893.44140625,
|
|
"support/residual_mass_policy": 0.02262613782659173,
|
|
"support/residual_mass_reference": 0.03800426935777068,
|
|
"support/residual_reward": -0.5341151673346758,
|
|
"support/runtime_width": 42.55538368225098,
|
|
"support/sampled_loser_rank": 0.5614169128239155,
|
|
"support/sampled_reward_rank": -0.29479603469371796,
|
|
"support/sampled_token_added_rate": 0.03493571188300848,
|
|
"support/sampled_winner_rank": 0.5692789405584335,
|
|
"support/selected_width": 42.55538368225098,
|
|
"support/stored_width": 42.55538368225098
|
|
},
|
|
{
|
|
"epoch": 0.859375,
|
|
"grad_norm": 99.84505462646484,
|
|
"kl/sequence_policy_ref": -418.7293930053711,
|
|
"kl/vocab_forward": 371.4789810180664,
|
|
"kl/vocab_js": 35.276673793792725,
|
|
"kl/vocab_reverse": 138.0893669128418,
|
|
"kl/vocab_symmetric": 509.5685806274414,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -689.4199485778809,
|
|
"logps/rejected": -1892.0952606201172,
|
|
"loss": 1.7759,
|
|
"loss/dpo": 0.14481948580403764,
|
|
"misalign/J": 163.1080617904663,
|
|
"misalign/J_aux_loss": 1.6310805529356003,
|
|
"misalign/J_aux_loss_raw": 163.1080617904663,
|
|
"misalign/J_over_reverse_kl": 1.3716232553124428,
|
|
"misalign/J_per_token": 0.2615004684776068,
|
|
"misalign/compressed_reward_absmax": 3880.816650390625,
|
|
"misalign/compressed_reward_range": 5555.395263671875,
|
|
"misalign/entropy_a": 626.7629547119141,
|
|
"misalign/entropy_b": 1067.2015533447266,
|
|
"misalign/forward_kl_divergence": 371.4789810180664,
|
|
"misalign/forward_kl_divergence_per_token": 0.8582677766680717,
|
|
"misalign/gamma_abs_times_reward_std": 44081586.0,
|
|
"misalign/gamma_bracketed_rate": 0.9872501865029335,
|
|
"misalign/gamma_reward_residual": 0.007101273212128945,
|
|
"misalign/gamma_star": 65061232.25,
|
|
"misalign/js_divergence": 35.276673793792725,
|
|
"misalign/reverse_kl_divergence": 138.0893669128418,
|
|
"misalign/reverse_kl_divergence_per_token": 0.27060581743717194,
|
|
"misalign/reward_a": 76.1998405456543,
|
|
"misalign/reward_b": -13.09484925866127,
|
|
"misalign/reward_improvement": 89.29467296600342,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.5231703817844391,
|
|
"misalign/reward_improvement_per_token": 0.035240316297858953,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -446.2036018371582,
|
|
"misalign/reward_vocab_std": 662.775749206543,
|
|
"misalign/symmetric_kl": 509.5685806274414,
|
|
"misalign/tv_distance": 113.11602115631104,
|
|
"num_tokens": 9339840.0,
|
|
"rewards/accuracies": 0.984375,
|
|
"rewards/chosen": -17.44824308156967,
|
|
"rewards/margins": 48.849395751953125,
|
|
"rewards/rejected": -66.29763889312744,
|
|
"step": 55,
|
|
"support/residual_count": 151893.3671875,
|
|
"support/residual_mass_policy": 0.019164926023222506,
|
|
"support/residual_mass_reference": 0.03859002981334925,
|
|
"support/residual_reward": -0.47019451297819614,
|
|
"support/runtime_width": 42.63320541381836,
|
|
"support/sampled_loser_rank": 0.6031154692173004,
|
|
"support/sampled_reward_rank": -0.38968720100820065,
|
|
"support/sampled_token_added_rate": 0.0316556547768414,
|
|
"support/sampled_winner_rank": 0.6063822247087955,
|
|
"support/selected_width": 42.63320541381836,
|
|
"support/stored_width": 42.63320541381836
|
|
},
|
|
{
|
|
"epoch": 0.875,
|
|
"grad_norm": 68.76524353027344,
|
|
"kl/sequence_policy_ref": -402.62422370910645,
|
|
"kl/vocab_forward": 360.61656951904297,
|
|
"kl/vocab_js": 36.79127216339111,
|
|
"kl/vocab_reverse": 143.1292266845703,
|
|
"kl/vocab_symmetric": 503.7460651397705,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -796.70947265625,
|
|
"logps/rejected": -1951.9369506835938,
|
|
"loss": 1.9268,
|
|
"loss/dpo": 0.31827243250995924,
|
|
"misalign/J": 160.8573293685913,
|
|
"misalign/J_aux_loss": 1.608573243021965,
|
|
"misalign/J_aux_loss_raw": 160.8573293685913,
|
|
"misalign/J_over_reverse_kl": 1.212674729526043,
|
|
"misalign/J_per_token": 0.23724722862243652,
|
|
"misalign/compressed_reward_absmax": 3996.8917541503906,
|
|
"misalign/compressed_reward_range": 5663.479064941406,
|
|
"misalign/entropy_a": 717.5296401977539,
|
|
"misalign/entropy_b": 1203.4962768554688,
|
|
"misalign/forward_kl_divergence": 360.61656951904297,
|
|
"misalign/forward_kl_divergence_per_token": 0.7455775737762451,
|
|
"misalign/gamma_abs_times_reward_std": 49514251.0,
|
|
"misalign/gamma_bracketed_rate": 0.9896730110049248,
|
|
"misalign/gamma_reward_residual": 9.679878201040992e-05,
|
|
"misalign/gamma_star": 57219496.125,
|
|
"misalign/js_divergence": 36.79127216339111,
|
|
"misalign/reverse_kl_divergence": 143.1292266845703,
|
|
"misalign/reverse_kl_divergence_per_token": 0.26022260822355747,
|
|
"misalign/reward_a": 82.29156970977783,
|
|
"misalign/reward_b": -11.702201634645462,
|
|
"misalign/reward_improvement": 93.99376678466797,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.5378214567899704,
|
|
"misalign/reward_improvement_per_token": 0.04054644517600536,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -461.6409397125244,
|
|
"misalign/reward_vocab_std": 698.131046295166,
|
|
"misalign/symmetric_kl": 503.7460651397705,
|
|
"misalign/tv_distance": 122.93070888519287,
|
|
"num_tokens": 9506436.0,
|
|
"rewards/accuracies": 0.953125,
|
|
"rewards/chosen": -17.97724825143814,
|
|
"rewards/margins": 44.570350646972656,
|
|
"rewards/rejected": -62.54759979248047,
|
|
"step": 56,
|
|
"support/residual_count": 151893.306640625,
|
|
"support/residual_mass_policy": 0.020866328850388527,
|
|
"support/residual_mass_reference": 0.039209546288475394,
|
|
"support/residual_reward": -0.41471402533352375,
|
|
"support/runtime_width": 42.69208335876465,
|
|
"support/sampled_loser_rank": 0.6182565689086914,
|
|
"support/sampled_reward_rank": -0.4434542544186115,
|
|
"support/sampled_token_added_rate": 0.03403148171491921,
|
|
"support/sampled_winner_rank": 0.6329206973314285,
|
|
"support/selected_width": 42.69208335876465,
|
|
"support/stored_width": 42.69208335876465
|
|
},
|
|
{
|
|
"epoch": 0.890625,
|
|
"grad_norm": 94.79530334472656,
|
|
"kl/sequence_policy_ref": -375.49475288391113,
|
|
"kl/vocab_forward": 340.9373073577881,
|
|
"kl/vocab_js": 33.93104815483093,
|
|
"kl/vocab_reverse": 132.34368515014648,
|
|
"kl/vocab_symmetric": 473.28116607666016,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -666.1807670593262,
|
|
"logps/rejected": -1875.8219909667969,
|
|
"loss": 1.7866,
|
|
"loss/dpo": 0.24838248359601162,
|
|
"misalign/J": 153.82119750976562,
|
|
"misalign/J_aux_loss": 1.5382119417190552,
|
|
"misalign/J_aux_loss_raw": 153.82119750976562,
|
|
"misalign/J_over_reverse_kl": 1.4413592517375946,
|
|
"misalign/J_per_token": 0.2751000728458166,
|
|
"misalign/compressed_reward_absmax": 3810.44873046875,
|
|
"misalign/compressed_reward_range": 5403.443389892578,
|
|
"misalign/entropy_a": 678.8246765136719,
|
|
"misalign/entropy_b": 1104.3003616333008,
|
|
"misalign/forward_kl_divergence": 340.9373073577881,
|
|
"misalign/forward_kl_divergence_per_token": 0.9033116102218628,
|
|
"misalign/gamma_abs_times_reward_std": 43435399.0,
|
|
"misalign/gamma_bracketed_rate": 0.9872131571173668,
|
|
"misalign/gamma_reward_residual": 0.00028186071233449184,
|
|
"misalign/gamma_star": 71246069.25,
|
|
"misalign/js_divergence": 33.93104815483093,
|
|
"misalign/reverse_kl_divergence": 132.34368515014648,
|
|
"misalign/reverse_kl_divergence_per_token": 0.27233118936419487,
|
|
"misalign/reward_a": 77.04429864883423,
|
|
"misalign/reward_b": -15.656381011009216,
|
|
"misalign/reward_improvement": 92.70064735412598,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.46722693368792534,
|
|
"misalign/reward_improvement_per_token": -0.003935309126973152,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -453.2850399017334,
|
|
"misalign/reward_vocab_std": 652.1301879882812,
|
|
"misalign/symmetric_kl": 473.28116607666016,
|
|
"misalign/tv_distance": 112.50246047973633,
|
|
"num_tokens": 9663583.0,
|
|
"rewards/accuracies": 0.953125,
|
|
"rewards/chosen": -16.42910087108612,
|
|
"rewards/margins": 42.240750551223755,
|
|
"rewards/rejected": -58.6698522567749,
|
|
"step": 57,
|
|
"support/residual_count": 151893.123046875,
|
|
"support/residual_mass_policy": 0.025635873898863792,
|
|
"support/residual_mass_reference": 0.051683404948562384,
|
|
"support/residual_reward": -0.43221110105514526,
|
|
"support/runtime_width": 42.8780837059021,
|
|
"support/sampled_loser_rank": 0.6225372664630413,
|
|
"support/sampled_reward_rank": -0.4814031980931759,
|
|
"support/sampled_token_added_rate": 0.044704005820676684,
|
|
"support/sampled_winner_rank": 0.6614899709820747,
|
|
"support/selected_width": 42.8780837059021,
|
|
"support/stored_width": 42.8780837059021
|
|
},
|
|
{
|
|
"epoch": 0.90625,
|
|
"grad_norm": 63.19841003417969,
|
|
"kl/sequence_policy_ref": -429.0251274108887,
|
|
"kl/vocab_forward": 393.4870414733887,
|
|
"kl/vocab_js": 40.02447009086609,
|
|
"kl/vocab_reverse": 155.88319969177246,
|
|
"kl/vocab_symmetric": 549.3705825805664,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -829.9505310058594,
|
|
"logps/rejected": -2197.3387451171875,
|
|
"loss": 2.1333,
|
|
"loss/dpo": 0.4092487035359162,
|
|
"misalign/J": 172.40936088562012,
|
|
"misalign/J_aux_loss": 1.724093571305275,
|
|
"misalign/J_aux_loss_raw": 172.40936088562012,
|
|
"misalign/J_over_reverse_kl": 2.0524225011467934,
|
|
"misalign/J_per_token": 0.27197333984076977,
|
|
"misalign/compressed_reward_absmax": 4601.915740966797,
|
|
"misalign/compressed_reward_range": 6508.782531738281,
|
|
"misalign/entropy_a": 811.3467330932617,
|
|
"misalign/entropy_b": 1340.6858673095703,
|
|
"misalign/forward_kl_divergence": 393.4870414733887,
|
|
"misalign/forward_kl_divergence_per_token": 0.5244201272726059,
|
|
"misalign/gamma_abs_times_reward_std": 48018970.5,
|
|
"misalign/gamma_bracketed_rate": 0.9816265851259232,
|
|
"misalign/gamma_reward_residual": 0.00013214930413596448,
|
|
"misalign/gamma_star": 68324594.0,
|
|
"misalign/js_divergence": 40.02447009086609,
|
|
"misalign/reverse_kl_divergence": 155.88319969177246,
|
|
"misalign/reverse_kl_divergence_per_token": 0.17863536719232798,
|
|
"misalign/reward_a": 84.13565301895142,
|
|
"misalign/reward_b": -20.68537664413452,
|
|
"misalign/reward_improvement": 104.8209924697876,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.602836437523365,
|
|
"misalign/reward_improvement_per_token": 0.056509769055992365,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -554.3869686126709,
|
|
"misalign/reward_vocab_std": 789.2750701904297,
|
|
"misalign/symmetric_kl": 549.3705825805664,
|
|
"misalign/tv_distance": 134.0800666809082,
|
|
"num_tokens": 9847213.0,
|
|
"rewards/accuracies": 0.984375,
|
|
"rewards/chosen": -18.04733145236969,
|
|
"rewards/margins": 49.71036195755005,
|
|
"rewards/rejected": -67.75769662857056,
|
|
"step": 58,
|
|
"support/residual_count": 151893.20703125,
|
|
"support/residual_mass_policy": 0.02004858397413045,
|
|
"support/residual_mass_reference": 0.043993874453008175,
|
|
"support/residual_reward": -0.44899558275938034,
|
|
"support/runtime_width": 42.797929763793945,
|
|
"support/sampled_loser_rank": 0.6209223605692387,
|
|
"support/sampled_reward_rank": -0.39453774876892567,
|
|
"support/sampled_token_added_rate": 0.03281214344315231,
|
|
"support/sampled_winner_rank": 0.663671188056469,
|
|
"support/selected_width": 42.797929763793945,
|
|
"support/stored_width": 42.797929763793945
|
|
},
|
|
{
|
|
"epoch": 0.921875,
|
|
"grad_norm": 117.94857788085938,
|
|
"kl/sequence_policy_ref": -443.7041206359863,
|
|
"kl/vocab_forward": 409.88890075683594,
|
|
"kl/vocab_js": 40.246328592300415,
|
|
"kl/vocab_reverse": 156.80553817749023,
|
|
"kl/vocab_symmetric": 566.6947135925293,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -869.2680358886719,
|
|
"logps/rejected": -2070.011489868164,
|
|
"loss": 3.0679,
|
|
"loss/dpo": 1.1368572648187203,
|
|
"misalign/J": 193.10802459716797,
|
|
"misalign/J_aux_loss": 1.931080162525177,
|
|
"misalign/J_aux_loss_raw": 193.10802459716797,
|
|
"misalign/J_over_reverse_kl": 1.537727639079094,
|
|
"misalign/J_per_token": 0.22832004725933075,
|
|
"misalign/compressed_reward_absmax": 4309.089996337891,
|
|
"misalign/compressed_reward_range": 6035.041198730469,
|
|
"misalign/entropy_a": 770.8880767822266,
|
|
"misalign/entropy_b": 1271.0429077148438,
|
|
"misalign/forward_kl_divergence": 409.88890075683594,
|
|
"misalign/forward_kl_divergence_per_token": 0.548376951366663,
|
|
"misalign/gamma_abs_times_reward_std": 57010088.0,
|
|
"misalign/gamma_bracketed_rate": 0.9858352392911911,
|
|
"misalign/gamma_reward_residual": 0.00474659322799198,
|
|
"misalign/gamma_star": 87458212.0,
|
|
"misalign/js_divergence": 40.246328592300415,
|
|
"misalign/reverse_kl_divergence": 156.80553817749023,
|
|
"misalign/reverse_kl_divergence_per_token": 0.1863451935350895,
|
|
"misalign/reward_a": 87.77897262573242,
|
|
"misalign/reward_b": -18.043125957250595,
|
|
"misalign/reward_improvement": 105.82207298278809,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.6192803308367729,
|
|
"misalign/reward_improvement_per_token": 0.07612488837912679,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -637.8235740661621,
|
|
"misalign/reward_vocab_std": 722.6581039428711,
|
|
"misalign/symmetric_kl": 566.6947135925293,
|
|
"misalign/tv_distance": 131.4430446624756,
|
|
"num_tokens": 10021862.0,
|
|
"rewards/accuracies": 0.953125,
|
|
"rewards/chosen": -21.588929533958435,
|
|
"rewards/margins": 45.56296682357788,
|
|
"rewards/rejected": -67.15189743041992,
|
|
"step": 59,
|
|
"support/residual_count": 151893.498046875,
|
|
"support/residual_mass_policy": 0.022956559900194407,
|
|
"support/residual_mass_reference": 0.03896482312120497,
|
|
"support/residual_reward": -0.552052453160286,
|
|
"support/runtime_width": 42.50350904464722,
|
|
"support/sampled_loser_rank": 0.5951877385377884,
|
|
"support/sampled_reward_rank": -0.2748332447372377,
|
|
"support/sampled_token_added_rate": 0.03523444454185665,
|
|
"support/sampled_winner_rank": 0.6131882853806019,
|
|
"support/selected_width": 42.50350904464722,
|
|
"support/stored_width": 42.50350904464722
|
|
},
|
|
{
|
|
"epoch": 0.9375,
|
|
"grad_norm": 52.73037338256836,
|
|
"kl/sequence_policy_ref": -394.9115695953369,
|
|
"kl/vocab_forward": 363.2713165283203,
|
|
"kl/vocab_js": 35.156471252441406,
|
|
"kl/vocab_reverse": 136.1243805885315,
|
|
"kl/vocab_symmetric": 499.39588928222656,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -676.7391777038574,
|
|
"logps/rejected": -1959.0352096557617,
|
|
"loss": 1.6395,
|
|
"loss/dpo": 0.1764393468254366,
|
|
"misalign/J": 146.30127716064453,
|
|
"misalign/J_aux_loss": 1.4630127176642418,
|
|
"misalign/J_aux_loss_raw": 146.30127716064453,
|
|
"misalign/J_over_reverse_kl": 1.2657062262296677,
|
|
"misalign/J_per_token": 0.22581798769533634,
|
|
"misalign/compressed_reward_absmax": 4125.538238525391,
|
|
"misalign/compressed_reward_range": 5841.7431640625,
|
|
"misalign/entropy_a": 674.7129745483398,
|
|
"misalign/entropy_b": 1137.8532485961914,
|
|
"misalign/forward_kl_divergence": 363.2713165283203,
|
|
"misalign/forward_kl_divergence_per_token": 0.5696188099682331,
|
|
"misalign/gamma_abs_times_reward_std": 40711104.25,
|
|
"misalign/gamma_bracketed_rate": 0.9862173870205879,
|
|
"misalign/gamma_reward_residual": 0.0012141035936110711,
|
|
"misalign/gamma_star": 54984167.75,
|
|
"misalign/js_divergence": 35.156471252441406,
|
|
"misalign/reverse_kl_divergence": 136.1243805885315,
|
|
"misalign/reverse_kl_divergence_per_token": 0.18038787879049778,
|
|
"misalign/reward_a": 68.60513997077942,
|
|
"misalign/reward_b": -15.08642065525055,
|
|
"misalign/reward_improvement": 83.69155693054199,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.6113412380218506,
|
|
"misalign/reward_improvement_per_token": 0.08037259662523866,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -327.37775897979736,
|
|
"misalign/reward_vocab_std": 704.0034561157227,
|
|
"misalign/symmetric_kl": 499.39588928222656,
|
|
"misalign/tv_distance": 114.7151231765747,
|
|
"num_tokens": 10193216.0,
|
|
"rewards/accuracies": 0.984375,
|
|
"rewards/chosen": -16.442305088043213,
|
|
"rewards/margins": 46.09770584106445,
|
|
"rewards/rejected": -62.540010929107666,
|
|
"step": 60,
|
|
"support/residual_count": 151893.306640625,
|
|
"support/residual_mass_policy": 0.020310348831117153,
|
|
"support/residual_mass_reference": 0.03744203574024141,
|
|
"support/residual_reward": -0.35806123074144125,
|
|
"support/runtime_width": 42.69117593765259,
|
|
"support/sampled_loser_rank": 0.5424250774085522,
|
|
"support/sampled_reward_rank": -0.44320493191480637,
|
|
"support/sampled_token_added_rate": 0.03534874925389886,
|
|
"support/sampled_winner_rank": 0.5376182310283184,
|
|
"support/selected_width": 42.69117593765259,
|
|
"support/stored_width": 42.69117593765259
|
|
},
|
|
{
|
|
"epoch": 0.9375,
|
|
"eval_kl/sequence_policy_ref": -443.8763482570648,
|
|
"eval_kl/vocab_forward": 408.00172185897827,
|
|
"eval_kl/vocab_js": 39.08841371536255,
|
|
"eval_kl/vocab_reverse": 151.38377118110657,
|
|
"eval_kl/vocab_symmetric": 559.3856892585754,
|
|
"eval_logps/chosen": -939.2870273590088,
|
|
"eval_logps/rejected": -2048.731554031372,
|
|
"eval_loss": 1.7673890590667725,
|
|
"eval_loss/dpo": 0.14154351732228904,
|
|
"eval_misalign/J": 162.58456230163574,
|
|
"eval_misalign/J_aux_loss": 1.625845598988235,
|
|
"eval_misalign/J_aux_loss_raw": 162.58456230163574,
|
|
"eval_misalign/J_over_reverse_kl": 1.5481905555352569,
|
|
"eval_misalign/J_per_token": 0.22693954594433308,
|
|
"eval_misalign/compressed_reward_absmax": 4236.706272125244,
|
|
"eval_misalign/compressed_reward_range": 6027.056491851807,
|
|
"eval_misalign/entropy_a": 786.7468104362488,
|
|
"eval_misalign/entropy_b": 1298.0544500350952,
|
|
"eval_misalign/forward_kl_divergence": 408.00172185897827,
|
|
"eval_misalign/forward_kl_divergence_per_token": 0.6159884915687144,
|
|
"eval_misalign/gamma_abs_times_reward_std": 42470128.234375,
|
|
"eval_misalign/gamma_bracketed_rate": 0.9880082719027996,
|
|
"eval_misalign/gamma_reward_residual": 0.0014271657525881665,
|
|
"eval_misalign/gamma_star": 53453969.375,
|
|
"eval_misalign/js_divergence": 39.08841371536255,
|
|
"eval_misalign/reverse_kl_divergence": 151.38377118110657,
|
|
"eval_misalign/reverse_kl_divergence_per_token": 0.1974795301211998,
|
|
"eval_misalign/reward_a": 88.27734404802322,
|
|
"eval_misalign/reward_b": -14.307282019406557,
|
|
"eval_misalign/reward_improvement": 102.58461898565292,
|
|
"eval_misalign/reward_improvement_over_reverse_kl": 0.6177075733430684,
|
|
"eval_misalign/reward_improvement_per_token": 0.07934931269846857,
|
|
"eval_misalign/reward_signal_low_rate": 0.0,
|
|
"eval_misalign/reward_vocab_mean": -490.11556017398834,
|
|
"eval_misalign/reward_vocab_std": 731.320264339447,
|
|
"eval_misalign/symmetric_kl": 559.3856892585754,
|
|
"eval_misalign/tv_distance": 129.75177884101868,
|
|
"eval_rewards/accuracies": 0.982421875,
|
|
"eval_rewards/chosen": -21.011107921600342,
|
|
"eval_rewards/margins": 46.75305512547493,
|
|
"eval_rewards/rejected": -67.76416301727295,
|
|
"eval_runtime": 101.586,
|
|
"eval_samples_per_second": 5.04,
|
|
"eval_steps_per_second": 0.63,
|
|
"eval_support/residual_count": 151893.29125976562,
|
|
"eval_support/residual_mass_policy": 0.023736586881568655,
|
|
"eval_support/residual_mass_reference": 0.04279232310364023,
|
|
"eval_support/residual_reward": -0.4695481152739376,
|
|
"eval_support/runtime_width": 42.70963191986084,
|
|
"eval_support/sampled_loser_rank": 0.6486562248319387,
|
|
"eval_support/sampled_reward_rank": -0.37071577250026166,
|
|
"eval_support/sampled_token_added_rate": 0.037317203474231064,
|
|
"eval_support/sampled_winner_rank": 0.6506854901090264,
|
|
"eval_support/selected_width": 42.70963191986084,
|
|
"eval_support/stored_width": 42.70963191986084,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.953125,
|
|
"grad_norm": 66.27374267578125,
|
|
"kl/sequence_policy_ref": -495.910831451416,
|
|
"kl/vocab_forward": 454.87990951538086,
|
|
"kl/vocab_js": 42.96171951293945,
|
|
"kl/vocab_reverse": 166.5788345336914,
|
|
"kl/vocab_symmetric": 621.4590148925781,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -882.9071960449219,
|
|
"logps/rejected": -2324.2291259765625,
|
|
"loss": 1.9092,
|
|
"loss/dpo": 0.00656887573249243,
|
|
"misalign/J": 190.26472282409668,
|
|
"misalign/J_aux_loss": 1.9026471227407455,
|
|
"misalign/J_aux_loss_raw": 190.26472282409668,
|
|
"misalign/J_over_reverse_kl": 1.2592844367027283,
|
|
"misalign/J_per_token": 0.22797120176255703,
|
|
"misalign/compressed_reward_absmax": 4472.839447021484,
|
|
"misalign/compressed_reward_range": 6308.651184082031,
|
|
"misalign/entropy_a": 808.8803405761719,
|
|
"misalign/entropy_b": 1358.7424087524414,
|
|
"misalign/forward_kl_divergence": 454.87990951538086,
|
|
"misalign/forward_kl_divergence_per_token": 0.6771250255405903,
|
|
"misalign/gamma_abs_times_reward_std": 54845561.25,
|
|
"misalign/gamma_bracketed_rate": 0.9894062727689743,
|
|
"misalign/gamma_reward_residual": 0.0002826992104019155,
|
|
"misalign/gamma_star": 55459094.75,
|
|
"misalign/js_divergence": 42.96171951293945,
|
|
"misalign/reverse_kl_divergence": 166.5788345336914,
|
|
"misalign/reverse_kl_divergence_per_token": 0.2539278268814087,
|
|
"misalign/reward_a": 87.17640686035156,
|
|
"misalign/reward_b": -18.71793007850647,
|
|
"misalign/reward_improvement": 105.89432716369629,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.5326569080352783,
|
|
"misalign/reward_improvement_per_token": 0.01839596056379378,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -465.77495288848877,
|
|
"misalign/reward_vocab_std": 756.1640853881836,
|
|
"misalign/symmetric_kl": 621.4590148925781,
|
|
"misalign/tv_distance": 140.44063568115234,
|
|
"num_tokens": 10376767.0,
|
|
"rewards/accuracies": 1.0,
|
|
"rewards/chosen": -23.160260915756226,
|
|
"rewards/margins": 52.86164569854736,
|
|
"rewards/rejected": -76.02190685272217,
|
|
"step": 61,
|
|
"support/residual_count": 151893.14453125,
|
|
"support/residual_mass_policy": 0.020877071423456073,
|
|
"support/residual_mass_reference": 0.04333586525171995,
|
|
"support/residual_reward": -0.3012515353038907,
|
|
"support/runtime_width": 42.852439880371094,
|
|
"support/sampled_loser_rank": 0.6411669962108135,
|
|
"support/sampled_reward_rank": -0.4843590557575226,
|
|
"support/sampled_token_added_rate": 0.03510869154706597,
|
|
"support/sampled_winner_rank": 0.7503333985805511,
|
|
"support/selected_width": 42.852439880371094,
|
|
"support/stored_width": 42.852439880371094
|
|
},
|
|
{
|
|
"epoch": 0.96875,
|
|
"grad_norm": 135.78811645507812,
|
|
"kl/sequence_policy_ref": -521.3121376037598,
|
|
"kl/vocab_forward": 479.5559272766113,
|
|
"kl/vocab_js": 44.95201635360718,
|
|
"kl/vocab_reverse": 172.62139701843262,
|
|
"kl/vocab_symmetric": 652.1775360107422,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -1089.0823440551758,
|
|
"logps/rejected": -2275.8909606933594,
|
|
"loss": 2.0802,
|
|
"loss/dpo": 0.21537577021728538,
|
|
"misalign/J": 186.48083686828613,
|
|
"misalign/J_aux_loss": 1.8648083209991455,
|
|
"misalign/J_aux_loss_raw": 186.48083686828613,
|
|
"misalign/J_over_reverse_kl": 1.9623293429613113,
|
|
"misalign/J_per_token": 0.2417179737240076,
|
|
"misalign/compressed_reward_absmax": 4746.217712402344,
|
|
"misalign/compressed_reward_range": 6676.980529785156,
|
|
"misalign/entropy_a": 843.7399063110352,
|
|
"misalign/entropy_b": 1419.2857818603516,
|
|
"misalign/forward_kl_divergence": 479.5559272766113,
|
|
"misalign/forward_kl_divergence_per_token": 0.5206913501024246,
|
|
"misalign/gamma_abs_times_reward_std": 52206773.75,
|
|
"misalign/gamma_bracketed_rate": 0.9809182211756706,
|
|
"misalign/gamma_reward_residual": 0.0015386186018986336,
|
|
"misalign/gamma_star": 70413638.5,
|
|
"misalign/js_divergence": 44.95201635360718,
|
|
"misalign/reverse_kl_divergence": 172.62139701843262,
|
|
"misalign/reverse_kl_divergence_per_token": 0.17035234905779362,
|
|
"misalign/reward_a": 94.845046043396,
|
|
"misalign/reward_b": -16.313238620758057,
|
|
"misalign/reward_improvement": 111.15826416015625,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.5862268581986427,
|
|
"misalign/reward_improvement_per_token": 0.0937041062861681,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -627.9793968200684,
|
|
"misalign/reward_vocab_std": 811.575309753418,
|
|
"misalign/symmetric_kl": 652.1775360107422,
|
|
"misalign/tv_distance": 147.92323780059814,
|
|
"num_tokens": 10568093.0,
|
|
"rewards/accuracies": 0.953125,
|
|
"rewards/chosen": -28.54444169998169,
|
|
"rewards/margins": 47.17354154586792,
|
|
"rewards/rejected": -75.7179822921753,
|
|
"step": 62,
|
|
"support/residual_count": 151893.359375,
|
|
"support/residual_mass_policy": 0.022450818214565516,
|
|
"support/residual_mass_reference": 0.04165853979066014,
|
|
"support/residual_reward": -0.45337859727442265,
|
|
"support/runtime_width": 42.641594886779785,
|
|
"support/sampled_loser_rank": 0.6541027873754501,
|
|
"support/sampled_reward_rank": -0.38643040135502815,
|
|
"support/sampled_token_added_rate": 0.037647833582013845,
|
|
"support/sampled_winner_rank": 0.6750443577766418,
|
|
"support/selected_width": 42.641594886779785,
|
|
"support/stored_width": 42.641594886779785
|
|
},
|
|
{
|
|
"epoch": 0.984375,
|
|
"grad_norm": 157.7623748779297,
|
|
"kl/sequence_policy_ref": -430.5328941345215,
|
|
"kl/vocab_forward": 418.2077407836914,
|
|
"kl/vocab_js": 37.59886360168457,
|
|
"kl/vocab_reverse": 146.4429168701172,
|
|
"kl/vocab_symmetric": 564.6509666442871,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -991.5014877319336,
|
|
"logps/rejected": -1757.1116333007812,
|
|
"loss": 2.0983,
|
|
"loss/dpo": 0.4923999092846994,
|
|
"misalign/J": 160.58584594726562,
|
|
"misalign/J_aux_loss": 1.6058583855628967,
|
|
"misalign/J_aux_loss_raw": 160.58584594726562,
|
|
"misalign/J_over_reverse_kl": 1.1775383204221725,
|
|
"misalign/J_per_token": 0.2121292594820261,
|
|
"misalign/compressed_reward_absmax": 4109.173980712891,
|
|
"misalign/compressed_reward_range": 5852.2374267578125,
|
|
"misalign/entropy_a": 698.0654525756836,
|
|
"misalign/entropy_b": 1176.365493774414,
|
|
"misalign/forward_kl_divergence": 418.2077407836914,
|
|
"misalign/forward_kl_divergence_per_token": 0.7069764323532581,
|
|
"misalign/gamma_abs_times_reward_std": 45094587.0,
|
|
"misalign/gamma_bracketed_rate": 0.9876261055469513,
|
|
"misalign/gamma_reward_residual": 0.00018861131684388965,
|
|
"misalign/gamma_star": 68429815.0,
|
|
"misalign/js_divergence": 37.59886360168457,
|
|
"misalign/reverse_kl_divergence": 146.4429168701172,
|
|
"misalign/reverse_kl_divergence_per_token": 0.1941323447972536,
|
|
"misalign/reward_a": 87.26894760131836,
|
|
"misalign/reward_b": -12.587202161550522,
|
|
"misalign/reward_improvement": 99.85614490509033,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.5914403721690178,
|
|
"misalign/reward_improvement_per_token": 0.09591837041079998,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -588.2237000465393,
|
|
"misalign/reward_vocab_std": 723.0128402709961,
|
|
"misalign/symmetric_kl": 564.6509666442871,
|
|
"misalign/tv_distance": 122.21442317962646,
|
|
"num_tokens": 10736658.0,
|
|
"rewards/accuracies": 0.9375,
|
|
"rewards/chosen": -24.446208238601685,
|
|
"rewards/margins": 37.21416687965393,
|
|
"rewards/rejected": -61.6603741645813,
|
|
"step": 63,
|
|
"support/residual_count": 151893.123046875,
|
|
"support/residual_mass_policy": 0.026434314902871847,
|
|
"support/residual_mass_reference": 0.04181864345446229,
|
|
"support/residual_reward": -0.5436067841947079,
|
|
"support/runtime_width": 42.87605428695679,
|
|
"support/sampled_loser_rank": 0.5825388208031654,
|
|
"support/sampled_reward_rank": -0.3311811648309231,
|
|
"support/sampled_token_added_rate": 0.032393347937613726,
|
|
"support/sampled_winner_rank": 0.6503275334835052,
|
|
"support/selected_width": 42.87605428695679,
|
|
"support/stored_width": 42.87605428695679
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 53.39131164550781,
|
|
"kl/sequence_policy_ref": -518.7998046875,
|
|
"kl/vocab_forward": 483.2027702331543,
|
|
"kl/vocab_js": 43.7255322933197,
|
|
"kl/vocab_reverse": 169.42027759552002,
|
|
"kl/vocab_symmetric": 652.6233749389648,
|
|
"learning_rate": 2e-06,
|
|
"logps/chosen": -1134.265338897705,
|
|
"logps/rejected": -2165.7147216796875,
|
|
"loss": 1.7421,
|
|
"loss/dpo": 0.08747747553955643,
|
|
"misalign/J": 165.4585018157959,
|
|
"misalign/J_aux_loss": 1.6545849740505219,
|
|
"misalign/J_aux_loss_raw": 165.4585018157959,
|
|
"misalign/J_over_reverse_kl": 1.0515589267015457,
|
|
"misalign/J_per_token": 0.1934837531298399,
|
|
"misalign/compressed_reward_absmax": 4486.107025146484,
|
|
"misalign/compressed_reward_range": 6402.580139160156,
|
|
"misalign/entropy_a": 834.7258224487305,
|
|
"misalign/entropy_b": 1397.0357360839844,
|
|
"misalign/forward_kl_divergence": 483.2027702331543,
|
|
"misalign/forward_kl_divergence_per_token": 0.6137732639908791,
|
|
"misalign/gamma_abs_times_reward_std": 49280889.75,
|
|
"misalign/gamma_bracketed_rate": 0.9895988926291466,
|
|
"misalign/gamma_reward_residual": 0.00024569173842792225,
|
|
"misalign/gamma_star": 40878552.0,
|
|
"misalign/js_divergence": 43.7255322933197,
|
|
"misalign/reverse_kl_divergence": 169.42027759552002,
|
|
"misalign/reverse_kl_divergence_per_token": 0.1976974420249462,
|
|
"misalign/reward_a": 98.72580575942993,
|
|
"misalign/reward_b": -13.32999886572361,
|
|
"misalign/reward_improvement": 112.05579376220703,
|
|
"misalign/reward_improvement_over_reverse_kl": 0.6203483864665031,
|
|
"misalign/reward_improvement_per_token": 0.10607674531638622,
|
|
"misalign/reward_signal_low_rate": 0.0,
|
|
"misalign/reward_vocab_mean": -555.1519546508789,
|
|
"misalign/reward_vocab_std": 773.9923553466797,
|
|
"misalign/symmetric_kl": 652.6233749389648,
|
|
"misalign/tv_distance": 142.4766387939453,
|
|
"num_tokens": 10915114.0,
|
|
"rewards/accuracies": 0.984375,
|
|
"rewards/chosen": -29.072665452957153,
|
|
"rewards/margins": 45.61463260650635,
|
|
"rewards/rejected": -74.68729877471924,
|
|
"step": 64,
|
|
"support/residual_count": 151893.263671875,
|
|
"support/residual_mass_policy": 0.024331093532964587,
|
|
"support/residual_mass_reference": 0.04252262390218675,
|
|
"support/residual_reward": -0.4992054486647248,
|
|
"support/runtime_width": 42.739386558532715,
|
|
"support/sampled_loser_rank": 0.6560819111764431,
|
|
"support/sampled_reward_rank": -0.35881325230002403,
|
|
"support/sampled_token_added_rate": 0.03401010250672698,
|
|
"support/sampled_winner_rank": 0.6759711802005768,
|
|
"support/selected_width": 42.739386558532715,
|
|
"support/stored_width": 42.739386558532715
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"step": 64,
|
|
"total_flos": 4.111435733296742e+16,
|
|
"train_loss": 1.8484279848635197,
|
|
"train_runtime": 2097.8551,
|
|
"train_samples_per_second": 1.952,
|
|
"train_steps_per_second": 0.031
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"eval_kl/sequence_policy_ref": -484.36079502105713,
|
|
"eval_kl/vocab_forward": 447.0612106323242,
|
|
"eval_kl/vocab_js": 40.982308477163315,
|
|
"eval_kl/vocab_reverse": 159.29439425468445,
|
|
"eval_kl/vocab_symmetric": 606.3558564186096,
|
|
"eval_logps/chosen": -960.8710179328918,
|
|
"eval_logps/rejected": -2108.1164512634277,
|
|
"eval_loss": 1.7901853322982788,
|
|
"eval_loss/dpo": 0.1378665120205333,
|
|
"eval_misalign/J": 165.23188638687134,
|
|
"eval_misalign/J_aux_loss": 1.6523188361898065,
|
|
"eval_misalign/J_aux_loss_raw": 165.23188638687134,
|
|
"eval_misalign/J_over_reverse_kl": 1.4319368861615658,
|
|
"eval_misalign/J_per_token": 0.2249188085552305,
|
|
"eval_misalign/compressed_reward_absmax": 4236.7062911987305,
|
|
"eval_misalign/compressed_reward_range": 6027.056602478027,
|
|
"eval_misalign/entropy_a": 773.5548601150513,
|
|
"eval_misalign/entropy_b": 1298.0544300079346,
|
|
"eval_misalign/forward_kl_divergence": 447.0612106323242,
|
|
"eval_misalign/forward_kl_divergence_per_token": 0.6650652508251369,
|
|
"eval_misalign/gamma_abs_times_reward_std": 42179871.046875,
|
|
"eval_misalign/gamma_bracketed_rate": 0.9879104141145945,
|
|
"eval_misalign/gamma_reward_residual": 0.0007516551395241322,
|
|
"eval_misalign/gamma_star": 54525594.609375,
|
|
"eval_misalign/js_divergence": 40.982308477163315,
|
|
"eval_misalign/reverse_kl_divergence": 159.29439425468445,
|
|
"eval_misalign/reverse_kl_divergence_per_token": 0.20468837535008788,
|
|
"eval_misalign/reward_a": 88.6484876871109,
|
|
"eval_misalign/reward_b": -14.307281013578176,
|
|
"eval_misalign/reward_improvement": 102.95576465129852,
|
|
"eval_misalign/reward_improvement_over_reverse_kl": 0.5992142450995743,
|
|
"eval_misalign/reward_improvement_per_token": 0.0850256277481094,
|
|
"eval_misalign/reward_signal_low_rate": 0.0,
|
|
"eval_misalign/reward_vocab_mean": -490.11554074287415,
|
|
"eval_misalign/reward_vocab_std": 731.3202610015869,
|
|
"eval_misalign/symmetric_kl": 606.3558564186096,
|
|
"eval_misalign/tv_distance": 133.03198266029358,
|
|
"eval_rewards/accuracies": 0.984375,
|
|
"eval_rewards/chosen": -23.169506430625916,
|
|
"eval_rewards/margins": 50.53314882516861,
|
|
"eval_rewards/rejected": -73.70265424251556,
|
|
"eval_runtime": 101.2427,
|
|
"eval_samples_per_second": 5.057,
|
|
"eval_steps_per_second": 0.632,
|
|
"eval_support/residual_count": 151893.29125976562,
|
|
"eval_support/residual_mass_policy": 0.023531361599452794,
|
|
"eval_support/residual_mass_reference": 0.04279232310364023,
|
|
"eval_support/residual_reward": -0.4695481152739376,
|
|
"eval_support/runtime_width": 42.70963191986084,
|
|
"eval_support/sampled_loser_rank": 0.6486562248319387,
|
|
"eval_support/sampled_reward_rank": -0.37071577250026166,
|
|
"eval_support/sampled_token_added_rate": 0.037317203474231064,
|
|
"eval_support/sampled_winner_rank": 0.6506854901090264,
|
|
"eval_support/selected_width": 42.70963191986084,
|
|
"eval_support/stored_width": 42.70963191986084,
|
|
"step": 64
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 64,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 6,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 4.111435733296742e+16,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|