Files
ubq30i_qwen4b_dpo_topk20_ba…/trainer_state.json
ModelHub XC 46586558a4 初始化项目,由ModelHub XC社区提供模型
Model: yunjae-won/ubq30i_qwen4b_dpo_topk20_backprop_j001
Source: Original Platform
2026-05-22 18:14:55 +08:00

4174 lines
198 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 12,
"global_step": 64,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.015625,
"grad_norm": 99.72795104980469,
"kl/sequence_policy_ref": 0.0,
"kl/vocab_forward": 0.0,
"kl/vocab_js": 1.071127120333415e-08,
"kl/vocab_reverse": 0.0,
"kl/vocab_symmetric": 0.0,
"learning_rate": 0.0,
"logps/chosen": -742.8982963562012,
"logps/rejected": -1508.9743347167969,
"loss": 0.8464,
"loss/dpo": 0.6931471824645996,
"misalign/J": 15.325548589229584,
"misalign/J_aux_loss": 0.15325548127293587,
"misalign/J_aux_loss_raw": 15.325548589229584,
"misalign/J_over_reverse_kl": 128560019.5,
"misalign/J_per_token": 0.019860354135744274,
"misalign/compressed_reward_absmax": 4573.125,
"misalign/compressed_reward_range": 6496.04150390625,
"misalign/entropy_a": 1394.3690948486328,
"misalign/entropy_b": 1394.3689727783203,
"misalign/forward_kl_divergence": 0.0,
"misalign/forward_kl_divergence_per_token": 0.0,
"misalign/gamma_abs_times_reward_std": 3358948.609375,
"misalign/gamma_bracketed_rate": 0.9989435374736786,
"misalign/gamma_reward_residual": -3.5030208209718694e-05,
"misalign/gamma_star": 6581222.96875,
"misalign/js_divergence": 1.071127120333415e-08,
"misalign/reverse_kl_divergence": 0.0,
"misalign/reverse_kl_divergence_per_token": 0.0,
"misalign/reward_a": -14.673600375652313,
"misalign/reward_b": -14.673597425222397,
"misalign/reward_improvement": 0.0,
"misalign/reward_improvement_over_reverse_kl": 0.0,
"misalign/reward_improvement_per_token": 0.0,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -492.57253074645996,
"misalign/reward_vocab_std": 775.2254867553711,
"misalign/symmetric_kl": 0.0,
"misalign/tv_distance": 0.0,
"num_tokens": 178419.0,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1,
"support/residual_count": 151893.28125,
"support/residual_mass_policy": 0.043290185276418924,
"support/residual_mass_reference": 0.043290185276418924,
"support/residual_reward": -0.42998734675347805,
"support/runtime_width": 42.71537160873413,
"support/sampled_loser_rank": 0.6159256920218468,
"support/sampled_reward_rank": -0.34261330030858517,
"support/sampled_token_added_rate": 0.03961701481603086,
"support/sampled_winner_rank": 0.6312572248280048,
"support/selected_width": 42.71537160873413,
"support/stored_width": 42.71537160873413
},
{
"epoch": 0.03125,
"grad_norm": 90.65897369384766,
"kl/sequence_policy_ref": 0.0,
"kl/vocab_forward": 0.0,
"kl/vocab_js": 1.1555954915198896e-08,
"kl/vocab_reverse": 0.0,
"kl/vocab_symmetric": 0.0,
"learning_rate": 2.857142857142857e-07,
"logps/chosen": -639.7089538574219,
"logps/rejected": -1256.9280319213867,
"loss": 0.8522,
"loss/dpo": 0.6931471824645996,
"misalign/J": 15.909758105874062,
"misalign/J_aux_loss": 0.15909757697954774,
"misalign/J_aux_loss_raw": 15.909758105874062,
"misalign/J_over_reverse_kl": 133460724.125,
"misalign/J_per_token": 0.022011274049873464,
"misalign/compressed_reward_absmax": 4128.2406005859375,
"misalign/compressed_reward_range": 5892.38916015625,
"misalign/entropy_a": 1163.6351776123047,
"misalign/entropy_b": 1163.6351852416992,
"misalign/forward_kl_divergence": 0.0,
"misalign/forward_kl_divergence_per_token": 0.0,
"misalign/gamma_abs_times_reward_std": 3731882.666015625,
"misalign/gamma_bracketed_rate": 0.99904465675354,
"misalign/gamma_reward_residual": -2.5868400712170114e-05,
"misalign/gamma_star": 6485823.03515625,
"misalign/js_divergence": 1.1555954915198896e-08,
"misalign/reverse_kl_divergence": 0.0,
"misalign/reverse_kl_divergence_per_token": 0.0,
"misalign/reward_a": -14.091559052467346,
"misalign/reward_b": -14.091563642024994,
"misalign/reward_improvement": 0.0,
"misalign/reward_improvement_over_reverse_kl": 0.0,
"misalign/reward_improvement_per_token": 0.0,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -373.4254665374756,
"misalign/reward_vocab_std": 714.1293716430664,
"misalign/symmetric_kl": 0.0,
"misalign/tv_distance": 0.0,
"num_tokens": 353406.0,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2,
"support/residual_count": 151893.015625,
"support/residual_mass_policy": 0.04986313637346029,
"support/residual_mass_reference": 0.04986313637346029,
"support/residual_reward": -0.33420879393815994,
"support/runtime_width": 42.98376941680908,
"support/sampled_loser_rank": 0.5725675709545612,
"support/sampled_reward_rank": -0.46486284397542477,
"support/sampled_token_added_rate": 0.039943449199199677,
"support/sampled_winner_rank": 0.6429276466369629,
"support/selected_width": 42.98376941680908,
"support/stored_width": 42.98376941680908
},
{
"epoch": 0.046875,
"grad_norm": 93.3929672241211,
"kl/sequence_policy_ref": 0.012497343122959137,
"kl/vocab_forward": 0.583718778565526,
"kl/vocab_js": 0.14564759889617562,
"kl/vocab_reverse": 0.5832288525998592,
"kl/vocab_symmetric": 1.1669474430382252,
"learning_rate": 5.714285714285714e-07,
"logps/chosen": -682.0669765472412,
"logps/rejected": -1253.8106155395508,
"loss": 0.8984,
"loss/dpo": 0.7069794088602066,
"misalign/J": 19.14525681734085,
"misalign/J_aux_loss": 0.1914525495376438,
"misalign/J_aux_loss_raw": 19.14525681734085,
"misalign/J_over_reverse_kl": 181.960098862648,
"misalign/J_per_token": 0.11238381525618024,
"misalign/compressed_reward_absmax": 4042.6752319335938,
"misalign/compressed_reward_range": 5874.695739746094,
"misalign/entropy_a": 1187.622413635254,
"misalign/entropy_b": 1187.9832382202148,
"misalign/forward_kl_divergence": 0.583718778565526,
"misalign/forward_kl_divergence_per_token": 0.0007106075063347816,
"misalign/gamma_abs_times_reward_std": 4061511.5234375,
"misalign/gamma_bracketed_rate": 0.9965383857488632,
"misalign/gamma_reward_residual": -1.116046053084574e-05,
"misalign/gamma_star": 6953297.8359375,
"misalign/js_divergence": 0.14564759889617562,
"misalign/reverse_kl_divergence": 0.5832288525998592,
"misalign/reverse_kl_divergence_per_token": 0.0007096838744473644,
"misalign/reward_a": -14.631245076656342,
"misalign/reward_b": -14.68013870716095,
"misalign/reward_improvement": 0.04889671457931399,
"misalign/reward_improvement_over_reverse_kl": -0.01731939986348152,
"misalign/reward_improvement_per_token": 0.0011197524540875747,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -477.72042179107666,
"misalign/reward_vocab_std": 708.0760498046875,
"misalign/symmetric_kl": 1.1669474430382252,
"misalign/tv_distance": 7.252490729093552,
"num_tokens": 522739.0,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.009032575355377048,
"rewards/margins": -0.020564619218930602,
"rewards/rejected": 0.011532044620253146,
"step": 3,
"support/residual_count": 151893.173828125,
"support/residual_mass_policy": 0.041550057008862495,
"support/residual_mass_reference": 0.04161923169158399,
"support/residual_reward": -0.48134796414524317,
"support/runtime_width": 42.824374198913574,
"support/sampled_loser_rank": 0.6047959439456463,
"support/sampled_reward_rank": -0.43707933463156223,
"support/sampled_token_added_rate": 0.03209112957119942,
"support/sampled_winner_rank": 0.6245042234659195,
"support/selected_width": 42.824374198913574,
"support/stored_width": 42.824374198913574
},
{
"epoch": 0.0625,
"grad_norm": 92.23185729980469,
"kl/sequence_policy_ref": 0.12927092611789703,
"kl/vocab_forward": 0.5669154338538647,
"kl/vocab_js": 0.14143066108226776,
"kl/vocab_reverse": 0.5671258866786957,
"kl/vocab_symmetric": 1.1340412348508835,
"learning_rate": 8.57142857142857e-07,
"logps/chosen": -605.7631340026855,
"logps/rejected": -1292.5442504882812,
"loss": 0.8402,
"loss/dpo": 0.6932987496256828,
"misalign/J": 14.686549186706543,
"misalign/J_aux_loss": 0.14686548942700028,
"misalign/J_aux_loss_raw": 14.686549186706543,
"misalign/J_over_reverse_kl": 23.49239319562912,
"misalign/J_per_token": 0.015179012378212065,
"misalign/compressed_reward_absmax": 4190.518157958984,
"misalign/compressed_reward_range": 6010.22119140625,
"misalign/entropy_a": 1174.9087295532227,
"misalign/entropy_b": 1174.8447265625,
"misalign/forward_kl_divergence": 0.5669154338538647,
"misalign/forward_kl_divergence_per_token": 0.0007894696027506143,
"misalign/gamma_abs_times_reward_std": 3162996.953125,
"misalign/gamma_bracketed_rate": 0.9992709010839462,
"misalign/gamma_reward_residual": -2.8679768774964032e-05,
"misalign/gamma_star": 6157349.4375,
"misalign/js_divergence": 0.14143066108226776,
"misalign/reverse_kl_divergence": 0.5671258866786957,
"misalign/reverse_kl_divergence_per_token": 0.000786922340921592,
"misalign/reward_a": -18.330436378717422,
"misalign/reward_b": -18.468554601073265,
"misalign/reward_improvement": 0.1381110306829214,
"misalign/reward_improvement_over_reverse_kl": 1.340573588386178,
"misalign/reward_improvement_per_token": 0.0015522810608672444,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -531.1860427856445,
"misalign/reward_vocab_std": 741.7677001953125,
"misalign/symmetric_kl": 1.1340412348508835,
"misalign/tv_distance": 6.983255743980408,
"num_tokens": 693958.0,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.01616873685270548,
"rewards/margins": 0.006483289762400091,
"rewards/rejected": 0.009685446973890066,
"step": 4,
"support/residual_count": 151893.24609375,
"support/residual_mass_policy": 0.03730555484071374,
"support/residual_mass_reference": 0.03720196895301342,
"support/residual_reward": -0.5474085882306099,
"support/runtime_width": 42.75570487976074,
"support/sampled_loser_rank": 0.5622387044131756,
"support/sampled_reward_rank": -0.3531896872445941,
"support/sampled_token_added_rate": 0.03257534769363701,
"support/sampled_winner_rank": 0.5690335519611835,
"support/selected_width": 42.75570487976074,
"support/stored_width": 42.75570487976074
},
{
"epoch": 0.078125,
"grad_norm": 117.50943756103516,
"kl/sequence_policy_ref": -0.02329159528017044,
"kl/vocab_forward": 0.7679473981261253,
"kl/vocab_js": 0.1914622224867344,
"kl/vocab_reverse": 0.7658381760120392,
"kl/vocab_symmetric": 1.5337853729724884,
"learning_rate": 1.1428571428571428e-06,
"logps/chosen": -888.985481262207,
"logps/rejected": -1439.7103576660156,
"loss": 0.9771,
"loss/dpo": 0.6643600016832352,
"misalign/J": 31.272010684013367,
"misalign/J_aux_loss": 0.3127200985327363,
"misalign/J_aux_loss_raw": 31.272010684013367,
"misalign/J_over_reverse_kl": 76.80600309371948,
"misalign/J_per_token": 0.034490690915845335,
"misalign/compressed_reward_absmax": 4926.2969970703125,
"misalign/compressed_reward_range": 7034.768737792969,
"misalign/entropy_a": 1437.912109375,
"misalign/entropy_b": 1440.320785522461,
"misalign/forward_kl_divergence": 0.7679473981261253,
"misalign/forward_kl_divergence_per_token": 0.0006905792761244811,
"misalign/gamma_abs_times_reward_std": 6929239.875,
"misalign/gamma_bracketed_rate": 0.9981078654527664,
"misalign/gamma_reward_residual": -2.4677944281847886e-05,
"misalign/gamma_star": 13895793.9375,
"misalign/js_divergence": 0.1914622224867344,
"misalign/reverse_kl_divergence": 0.7658381760120392,
"misalign/reverse_kl_divergence_per_token": 0.0006879112333990633,
"misalign/reward_a": -16.945995092391968,
"misalign/reward_b": -18.123961448669434,
"misalign/reward_improvement": 1.1779705435037613,
"misalign/reward_improvement_over_reverse_kl": 1.8247669339179993,
"misalign/reward_improvement_per_token": 0.0014292692139861174,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -799.9156303405762,
"misalign/reward_vocab_std": 839.3828201293945,
"misalign/symmetric_kl": 1.5337853729724884,
"misalign/tv_distance": 9.24750828742981,
"num_tokens": 885705.0,
"rewards/accuracies": 0.703125,
"rewards/chosen": 0.03075969440396875,
"rewards/margins": 0.06617770798038691,
"rewards/rejected": -0.035418014391325414,
"step": 5,
"support/residual_count": 151893.498046875,
"support/residual_mass_policy": 0.0339660148601979,
"support/residual_mass_reference": 0.03398727998137474,
"support/residual_reward": -0.5825221054255962,
"support/runtime_width": 42.503896713256836,
"support/sampled_loser_rank": 0.6222354024648666,
"support/sampled_reward_rank": -0.31187077052891254,
"support/sampled_token_added_rate": 0.03087039408273995,
"support/sampled_winner_rank": 0.6193482503294945,
"support/selected_width": 42.503896713256836,
"support/stored_width": 42.503896713256836
},
{
"epoch": 0.09375,
"grad_norm": 85.93820190429688,
"kl/sequence_policy_ref": -0.15299414843320847,
"kl/vocab_forward": 0.7101510316133499,
"kl/vocab_js": 0.1767149232327938,
"kl/vocab_reverse": 0.7063371688127518,
"kl/vocab_symmetric": 1.4164880961179733,
"learning_rate": 1.4285714285714286e-06,
"logps/chosen": -632.6824798583984,
"logps/rejected": -1370.8635635375977,
"loss": 0.7987,
"loss/dpo": 0.6448436826467514,
"misalign/J": 15.385070085525513,
"misalign/J_aux_loss": 0.15385069977492094,
"misalign/J_aux_loss_raw": 15.385070085525513,
"misalign/J_over_reverse_kl": 83.44263046979904,
"misalign/J_per_token": 0.0304049692931585,
"misalign/compressed_reward_absmax": 4116.264617919922,
"misalign/compressed_reward_range": 5825.951171875,
"misalign/entropy_a": 1233.836441040039,
"misalign/entropy_b": 1237.5170669555664,
"misalign/forward_kl_divergence": 0.7101510316133499,
"misalign/forward_kl_divergence_per_token": 0.0011770288765546866,
"misalign/gamma_abs_times_reward_std": 3832625.03125,
"misalign/gamma_bracketed_rate": 0.9984977394342422,
"misalign/gamma_reward_residual": -2.866266277123941e-05,
"misalign/gamma_star": 7119063.15625,
"misalign/js_divergence": 0.1767149232327938,
"misalign/reverse_kl_divergence": 0.7063371688127518,
"misalign/reverse_kl_divergence_per_token": 0.001158414474048186,
"misalign/reward_a": -12.991081476211548,
"misalign/reward_b": -15.306684225797653,
"misalign/reward_improvement": 2.315602958202362,
"misalign/reward_improvement_over_reverse_kl": 4.326970279216766,
"misalign/reward_improvement_per_token": 0.00793102516036015,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -391.66842794418335,
"misalign/reward_vocab_std": 709.3101501464844,
"misalign/symmetric_kl": 1.4164880961179733,
"misalign/tv_distance": 8.171427369117737,
"num_tokens": 1051197.0,
"rewards/accuracies": 0.734375,
"rewards/chosen": 0.03977742395363748,
"rewards/margins": 0.11015367973595858,
"rewards/rejected": -0.07037625554949045,
"step": 6,
"support/residual_count": 151893.294921875,
"support/residual_mass_policy": 0.04003174742683768,
"support/residual_mass_reference": 0.039974321611225605,
"support/residual_reward": -0.44555533304810524,
"support/runtime_width": 42.70325422286987,
"support/sampled_loser_rank": 0.5838432982563972,
"support/sampled_reward_rank": -0.33837335743010044,
"support/sampled_token_added_rate": 0.035395737970247865,
"support/sampled_winner_rank": 0.5852662436664104,
"support/selected_width": 42.70325422286987,
"support/stored_width": 42.70325422286987
},
{
"epoch": 0.109375,
"grad_norm": 88.38682556152344,
"kl/sequence_policy_ref": -0.3988112509250641,
"kl/vocab_forward": 0.9821137934923172,
"kl/vocab_js": 0.2425236813724041,
"kl/vocab_reverse": 0.9639514982700348,
"kl/vocab_symmetric": 1.9460650980472565,
"learning_rate": 1.714285714285714e-06,
"logps/chosen": -645.1111679077148,
"logps/rejected": -1122.6695709228516,
"loss": 0.8321,
"loss/dpo": 0.5667471960186958,
"misalign/J": 26.53605580329895,
"misalign/J_aux_loss": 0.26536055374890566,
"misalign/J_aux_loss_raw": 26.53605580329895,
"misalign/J_over_reverse_kl": 24.359424114227295,
"misalign/J_per_token": 0.027136333868838847,
"misalign/compressed_reward_absmax": 4269.828765869141,
"misalign/compressed_reward_range": 6126.886657714844,
"misalign/entropy_a": 1084.383560180664,
"misalign/entropy_b": 1096.2449645996094,
"misalign/forward_kl_divergence": 0.9821137934923172,
"misalign/forward_kl_divergence_per_token": 0.0023296478029806167,
"misalign/gamma_abs_times_reward_std": 6525904.78125,
"misalign/gamma_bracketed_rate": 0.9985537081956863,
"misalign/gamma_reward_residual": -9.58655687099963e-06,
"misalign/gamma_star": 13095489.4375,
"misalign/js_divergence": 0.2425236813724041,
"misalign/reverse_kl_divergence": 0.9639514982700348,
"misalign/reverse_kl_divergence_per_token": 0.0022518262558151037,
"misalign/reward_a": -4.632188588380814,
"misalign/reward_b": -9.89891766011715,
"misalign/reward_improvement": 5.2667356133461,
"misalign/reward_improvement_over_reverse_kl": 5.781014442443848,
"misalign/reward_improvement_per_token": 0.021993891743477434,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -737.9532012939453,
"misalign/reward_vocab_std": 779.6220245361328,
"misalign/symmetric_kl": 1.9460650980472565,
"misalign/tv_distance": 9.269986510276794,
"num_tokens": 1226725.0,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.10787314153276384,
"rewards/margins": 0.29550853557884693,
"rewards/rejected": -0.18763539008796215,
"step": 7,
"support/residual_count": 151893.15234375,
"support/residual_mass_policy": 0.036638311110436916,
"support/residual_mass_reference": 0.03580877371132374,
"support/residual_reward": -0.6131154783070087,
"support/runtime_width": 42.84639596939087,
"support/sampled_loser_rank": 0.5267594642937183,
"support/sampled_reward_rank": -0.37450613733381033,
"support/sampled_token_added_rate": 0.02907218923792243,
"support/sampled_winner_rank": 0.5602664463222027,
"support/selected_width": 42.84639596939087,
"support/stored_width": 42.84639596939087
},
{
"epoch": 0.125,
"grad_norm": 69.52674102783203,
"kl/sequence_policy_ref": -1.1963431239128113,
"kl/vocab_forward": 1.3841880485415459,
"kl/vocab_js": 0.33685372211039066,
"kl/vocab_reverse": 1.3344588950276375,
"kl/vocab_symmetric": 2.7186470329761505,
"learning_rate": 2e-06,
"logps/chosen": -520.5281848907471,
"logps/rejected": -1275.8439178466797,
"loss": 0.6933,
"loss/dpo": 0.5151687189936638,
"misalign/J": 17.81585144996643,
"misalign/J_aux_loss": 0.17815851839259267,
"misalign/J_aux_loss_raw": 17.81585144996643,
"misalign/J_over_reverse_kl": 10.14825189113617,
"misalign/J_per_token": 0.029440748097840697,
"misalign/compressed_reward_absmax": 3982.9888916015625,
"misalign/compressed_reward_range": 5621.152526855469,
"misalign/entropy_a": 1110.4090042114258,
"misalign/entropy_b": 1124.430030822754,
"misalign/forward_kl_divergence": 1.3841880485415459,
"misalign/forward_kl_divergence_per_token": 0.005764905363321304,
"misalign/gamma_abs_times_reward_std": 3656771.703125,
"misalign/gamma_bracketed_rate": 0.9985413998365402,
"misalign/gamma_reward_residual": -2.2492993480227597e-05,
"misalign/gamma_star": 6422516.171875,
"misalign/js_divergence": 0.33685372211039066,
"misalign/reverse_kl_divergence": 1.3344588950276375,
"misalign/reverse_kl_divergence_per_token": 0.005282549886032939,
"misalign/reward_a": -9.868786454200745,
"misalign/reward_b": -17.78406047821045,
"misalign/reward_improvement": 7.915277659893036,
"misalign/reward_improvement_over_reverse_kl": 5.6070281863212585,
"misalign/reward_improvement_per_token": 0.02902397490106523,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -359.259729385376,
"misalign/reward_vocab_std": 703.0287933349609,
"misalign/symmetric_kl": 2.7186470329761505,
"misalign/tv_distance": 10.146484673023224,
"num_tokens": 1388327.0,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.0993000838207081,
"rewards/margins": 0.43786880001425743,
"rewards/rejected": -0.3385687116533518,
"step": 8,
"support/residual_count": 151893.01171875,
"support/residual_mass_policy": 0.039598725736141205,
"support/residual_mass_reference": 0.03887052624486387,
"support/residual_reward": -0.35539715457707644,
"support/runtime_width": 42.98911476135254,
"support/sampled_loser_rank": 0.5893515609204769,
"support/sampled_reward_rank": -0.47250746935606003,
"support/sampled_token_added_rate": 0.03269299981184304,
"support/sampled_winner_rank": 0.5859329588711262,
"support/selected_width": 42.98911476135254,
"support/stored_width": 42.98911476135254
},
{
"epoch": 0.140625,
"grad_norm": 67.0599136352539,
"kl/sequence_policy_ref": -2.3771479576826096,
"kl/vocab_forward": 2.6128047704696655,
"kl/vocab_js": 0.6251252107322216,
"kl/vocab_reverse": 2.4535476565361023,
"kl/vocab_symmetric": 5.06635195016861,
"learning_rate": 2e-06,
"logps/chosen": -615.9590682983398,
"logps/rejected": -1274.1997985839844,
"loss": 0.6781,
"loss/dpo": 0.4201922379434109,
"misalign/J": 25.791358947753906,
"misalign/J_aux_loss": 0.2579135838896036,
"misalign/J_aux_loss_raw": 25.791358947753906,
"misalign/J_over_reverse_kl": 12.780362248420715,
"misalign/J_per_token": 0.05547305219806731,
"misalign/compressed_reward_absmax": 3792.750274658203,
"misalign/compressed_reward_range": 5447.807708740234,
"misalign/entropy_a": 1148.869960784912,
"misalign/entropy_b": 1178.2729797363281,
"misalign/forward_kl_divergence": 2.6128047704696655,
"misalign/forward_kl_divergence_per_token": 0.006875867606140673,
"misalign/gamma_abs_times_reward_std": 5016933.359375,
"misalign/gamma_bracketed_rate": 0.9971183687448502,
"misalign/gamma_reward_residual": -1.4140535995466053e-05,
"misalign/gamma_star": 9540094.09375,
"misalign/js_divergence": 0.6251252107322216,
"misalign/reverse_kl_divergence": 2.4535476565361023,
"misalign/reverse_kl_divergence_per_token": 0.006300629815086722,
"misalign/reward_a": -4.234024614095688,
"misalign/reward_b": -17.268560528755188,
"misalign/reward_improvement": 13.034542560577393,
"misalign/reward_improvement_over_reverse_kl": 4.815744161605835,
"misalign/reward_improvement_per_token": 0.030366417719051242,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -434.87897872924805,
"misalign/reward_vocab_std": 651.9961624145508,
"misalign/symmetric_kl": 5.06635195016861,
"misalign/tv_distance": 14.232259213924408,
"num_tokens": 1550382.0,
"rewards/accuracies": 0.953125,
"rewards/chosen": 0.12840933166444302,
"rewards/margins": 0.7322482690215111,
"rewards/rejected": -0.6038389429450035,
"step": 9,
"support/residual_count": 151893.478515625,
"support/residual_mass_policy": 0.03697363403625786,
"support/residual_mass_reference": 0.037607218604534864,
"support/residual_reward": -0.5447902157902718,
"support/runtime_width": 42.5230073928833,
"support/sampled_loser_rank": 0.6056095100939274,
"support/sampled_reward_rank": -0.31354507617652416,
"support/sampled_token_added_rate": 0.03456789907068014,
"support/sampled_winner_rank": 0.6345465183258057,
"support/selected_width": 42.5230073928833,
"support/stored_width": 42.5230073928833
},
{
"epoch": 0.15625,
"grad_norm": 87.10169982910156,
"kl/sequence_policy_ref": -6.291569083929062,
"kl/vocab_forward": 5.60641685128212,
"kl/vocab_js": 1.291469193994999,
"kl/vocab_reverse": 5.0420292019844055,
"kl/vocab_symmetric": 10.648443281650543,
"learning_rate": 2e-06,
"logps/chosen": -468.0983257293701,
"logps/rejected": -1285.5955200195312,
"loss": 0.9025,
"loss/dpo": 0.3525316398590803,
"misalign/J": 54.998396158218384,
"misalign/J_aux_loss": 0.5499839466065168,
"misalign/J_aux_loss_raw": 54.998396158218384,
"misalign/J_over_reverse_kl": 9.880838811397552,
"misalign/J_per_token": 0.07741667260415852,
"misalign/compressed_reward_absmax": 3692.7807006835938,
"misalign/compressed_reward_range": 5182.304748535156,
"misalign/entropy_a": 1031.7844772338867,
"misalign/entropy_b": 1076.0891571044922,
"misalign/forward_kl_divergence": 5.60641685128212,
"misalign/forward_kl_divergence_per_token": 0.01369796262588352,
"misalign/gamma_abs_times_reward_std": 11830781.9375,
"misalign/gamma_bracketed_rate": 0.9962232336401939,
"misalign/gamma_reward_residual": 9.550651896006457e-06,
"misalign/gamma_star": 24339454.65625,
"misalign/js_divergence": 1.291469193994999,
"misalign/reverse_kl_divergence": 5.0420292019844055,
"misalign/reverse_kl_divergence_per_token": 0.011788319039624184,
"misalign/reward_a": -0.19761592149734497,
"misalign/reward_b": -18.621128231287003,
"misalign/reward_improvement": 18.42351984977722,
"misalign/reward_improvement_over_reverse_kl": 3.5281217098236084,
"misalign/reward_improvement_per_token": 0.044580711517482996,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -340.7347640991211,
"misalign/reward_vocab_std": 628.4291152954102,
"misalign/symmetric_kl": 10.648443281650543,
"misalign/tv_distance": 18.700318098068237,
"num_tokens": 1705187.0,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.055842478293925524,
"rewards/margins": 1.1466289013624191,
"rewards/rejected": -1.2024713829159737,
"step": 10,
"support/residual_count": 151893.390625,
"support/residual_mass_policy": 0.03804836701601744,
"support/residual_mass_reference": 0.03930599056184292,
"support/residual_reward": -0.42244877200573683,
"support/runtime_width": 42.61000061035156,
"support/sampled_loser_rank": 0.6178636997938156,
"support/sampled_reward_rank": -0.380008390173316,
"support/sampled_token_added_rate": 0.03352847881615162,
"support/sampled_winner_rank": 0.6379577368497849,
"support/selected_width": 42.61000061035156,
"support/stored_width": 42.61000061035156
},
{
"epoch": 0.171875,
"grad_norm": 105.65762329101562,
"kl/sequence_policy_ref": -11.886906266212463,
"kl/vocab_forward": 10.033158540725708,
"kl/vocab_js": 2.2728197276592255,
"kl/vocab_reverse": 8.8297780752182,
"kl/vocab_symmetric": 18.862935781478882,
"learning_rate": 2e-06,
"logps/chosen": -864.007137298584,
"logps/rejected": -1354.3248596191406,
"loss": 0.957,
"loss/dpo": 0.25358179584145546,
"misalign/J": 70.34657621383667,
"misalign/J_aux_loss": 0.7034657262265682,
"misalign/J_aux_loss_raw": 70.34657621383667,
"misalign/J_over_reverse_kl": 7.123360276222229,
"misalign/J_per_token": 0.07085963152348995,
"misalign/compressed_reward_absmax": 4522.3524169921875,
"misalign/compressed_reward_range": 6510.963439941406,
"misalign/entropy_a": 1291.0633544921875,
"misalign/entropy_b": 1368.3875427246094,
"misalign/forward_kl_divergence": 10.033158540725708,
"misalign/forward_kl_divergence_per_token": 0.015985821490176022,
"misalign/gamma_abs_times_reward_std": 16111157.5625,
"misalign/gamma_bracketed_rate": 0.9964210242033005,
"misalign/gamma_reward_residual": 2.2860098624732927e-05,
"misalign/gamma_star": 28422147.375,
"misalign/js_divergence": 2.2728197276592255,
"misalign/reverse_kl_divergence": 8.8297780752182,
"misalign/reverse_kl_divergence_per_token": 0.013298386707901955,
"misalign/reward_a": 11.391705840826035,
"misalign/reward_b": -19.758893489837646,
"misalign/reward_improvement": 31.150599718093872,
"misalign/reward_improvement_over_reverse_kl": 3.3264004588127136,
"misalign/reward_improvement_per_token": 0.04476729570887983,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -656.2439975738525,
"misalign/reward_vocab_std": 782.3782577514648,
"misalign/symmetric_kl": 18.862935781478882,
"misalign/tv_distance": 28.730425596237183,
"num_tokens": 1887093.0,
"rewards/accuracies": 0.96875,
"rewards/chosen": -0.3179726116359234,
"rewards/margins": 1.741436019539833,
"rewards/rejected": -2.0594086199998856,
"step": 11,
"support/residual_count": 151893.275390625,
"support/residual_mass_policy": 0.035074356477707624,
"support/residual_mass_reference": 0.037040101597085595,
"support/residual_reward": -0.4819745300337672,
"support/runtime_width": 42.72910785675049,
"support/sampled_loser_rank": 0.6259545609354973,
"support/sampled_reward_rank": -0.3720765591133386,
"support/sampled_token_added_rate": 0.03333452111110091,
"support/sampled_winner_rank": 0.6261583790183067,
"support/selected_width": 42.72910785675049,
"support/stored_width": 42.72910785675049
},
{
"epoch": 0.1875,
"grad_norm": 113.49061584472656,
"kl/sequence_policy_ref": -10.061190009117126,
"kl/vocab_forward": 11.224669754505157,
"kl/vocab_js": 2.5259178578853607,
"kl/vocab_reverse": 9.792702317237854,
"kl/vocab_symmetric": 21.017370462417603,
"learning_rate": 2e-06,
"logps/chosen": -895.415397644043,
"logps/rejected": -1483.4818572998047,
"loss": 1.1275,
"loss/dpo": 0.3059763703495264,
"misalign/J": 82.15509986877441,
"misalign/J_aux_loss": 0.8215509578585625,
"misalign/J_aux_loss_raw": 82.15509986877441,
"misalign/J_over_reverse_kl": 9.914618968963623,
"misalign/J_per_token": 0.1017089462839067,
"misalign/compressed_reward_absmax": 4446.288055419922,
"misalign/compressed_reward_range": 6255.252502441406,
"misalign/entropy_a": 1389.9479522705078,
"misalign/entropy_b": 1475.2511596679688,
"misalign/forward_kl_divergence": 11.224669754505157,
"misalign/forward_kl_divergence_per_token": 0.017279054620303214,
"misalign/gamma_abs_times_reward_std": 19089820.0,
"misalign/gamma_bracketed_rate": 0.994603119790554,
"misalign/gamma_reward_residual": 1.673686114145312e-05,
"misalign/gamma_star": 34029449.5,
"misalign/js_divergence": 2.5259178578853607,
"misalign/reverse_kl_divergence": 9.792702317237854,
"misalign/reverse_kl_divergence_per_token": 0.015302568324841559,
"misalign/reward_a": 18.606368482112885,
"misalign/reward_b": -13.686601161956787,
"misalign/reward_improvement": 32.2929527759552,
"misalign/reward_improvement_over_reverse_kl": 3.0961980521678925,
"misalign/reward_improvement_per_token": 0.04442322696559131,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -424.1544990539551,
"misalign/reward_vocab_std": 740.4278564453125,
"misalign/symmetric_kl": 21.017370462417603,
"misalign/tv_distance": 30.929341316223145,
"num_tokens": 2067343.0,
"rewards/accuracies": 0.921875,
"rewards/chosen": -0.20568968541920185,
"rewards/margins": 1.6008586585521698,
"rewards/rejected": -1.8065483272075653,
"step": 12,
"support/residual_count": 151893.384765625,
"support/residual_mass_policy": 0.04051102139055729,
"support/residual_mass_reference": 0.042460997588932514,
"support/residual_reward": -0.3618390057235956,
"support/runtime_width": 42.61893367767334,
"support/sampled_loser_rank": 0.6536427140235901,
"support/sampled_reward_rank": -0.37369205243885517,
"support/sampled_token_added_rate": 0.037714328384026885,
"support/sampled_winner_rank": 0.6573853716254234,
"support/selected_width": 42.61893367767334,
"support/stored_width": 42.61893367767334
},
{
"epoch": 0.1875,
"eval_kl/sequence_policy_ref": -17.01151405274868,
"eval_kl/vocab_forward": 15.591387048363686,
"eval_kl/vocab_js": 3.397782253101468,
"eval_kl/vocab_reverse": 13.213865287601948,
"eval_kl/vocab_symmetric": 28.80525030195713,
"eval_logps/chosen": -734.3654553890228,
"eval_logps/rejected": -1399.9234561920166,
"eval_loss": 1.0719456672668457,
"eval_loss/dpo": 0.21070287289330736,
"eval_misalign/J": 86.12427139282227,
"eval_misalign/J_aux_loss": 0.8612426882609725,
"eval_misalign/J_aux_loss_raw": 86.12427139282227,
"eval_misalign/J_over_reverse_kl": 7.439761482179165,
"eval_misalign/J_per_token": 0.11331278597936034,
"eval_misalign/compressed_reward_absmax": 4236.706287384033,
"eval_misalign/compressed_reward_range": 6027.056529998779,
"eval_misalign/entropy_a": 1207.335482597351,
"eval_misalign/entropy_b": 1298.0544576644897,
"eval_misalign/forward_kl_divergence": 15.591387048363686,
"eval_misalign/forward_kl_divergence_per_token": 0.027479787677293643,
"eval_misalign/gamma_abs_times_reward_std": 20802586.9921875,
"eval_misalign/gamma_bracketed_rate": 0.9936437727883458,
"eval_misalign/gamma_reward_residual": 2.4353206566019026e-05,
"eval_misalign/gamma_star": 37987713.1640625,
"eval_misalign/js_divergence": 3.397782253101468,
"eval_misalign/reverse_kl_divergence": 13.213865287601948,
"eval_misalign/reverse_kl_divergence_per_token": 0.022622147007496096,
"eval_misalign/reward_a": 23.110429362626746,
"eval_misalign/reward_b": -14.30728217586875,
"eval_misalign/reward_improvement": 37.41771391034126,
"eval_misalign/reward_improvement_over_reverse_kl": 2.6861571483314037,
"eval_misalign/reward_improvement_per_token": 0.05757526887464337,
"eval_misalign/reward_signal_low_rate": 0.0,
"eval_misalign/reward_vocab_mean": -490.11555767059326,
"eval_misalign/reward_vocab_std": 731.3202633857727,
"eval_misalign/symmetric_kl": 28.80525030195713,
"eval_misalign/tv_distance": 33.611202627420425,
"eval_rewards/accuracies": 0.931640625,
"eval_rewards/chosen": -0.5189501565182582,
"eval_rewards/margins": 2.3644025400280952,
"eval_rewards/rejected": -2.883352691307664,
"eval_runtime": 100.9771,
"eval_samples_per_second": 5.07,
"eval_steps_per_second": 0.634,
"eval_support/residual_count": 151893.29125976562,
"eval_support/residual_mass_policy": 0.04048109907307662,
"eval_support/residual_mass_reference": 0.04279232310364023,
"eval_support/residual_reward": -0.4695481152739376,
"eval_support/runtime_width": 42.70963191986084,
"eval_support/sampled_loser_rank": 0.6486562248319387,
"eval_support/sampled_reward_rank": -0.37071577250026166,
"eval_support/sampled_token_added_rate": 0.037317203474231064,
"eval_support/sampled_winner_rank": 0.6506854901090264,
"eval_support/selected_width": 42.70963191986084,
"eval_support/stored_width": 42.70963191986084,
"step": 12
},
{
"epoch": 0.203125,
"grad_norm": 143.4099884033203,
"kl/sequence_policy_ref": -17.67092001438141,
"kl/vocab_forward": 16.044883847236633,
"kl/vocab_js": 3.4656281918287277,
"kl/vocab_reverse": 13.393466770648956,
"kl/vocab_symmetric": 29.438353061676025,
"learning_rate": 2e-06,
"logps/chosen": -670.5424919128418,
"logps/rejected": -1345.9356536865234,
"loss": 1.3177,
"loss/dpo": 0.21267282590270042,
"misalign/J": 110.50714302062988,
"misalign/J_aux_loss": 1.1050714254379272,
"misalign/J_aux_loss_raw": 110.50714302062988,
"misalign/J_over_reverse_kl": 8.267582476139069,
"misalign/J_per_token": 0.11648859549313784,
"misalign/compressed_reward_absmax": 4362.797546386719,
"misalign/compressed_reward_range": 6223.5970458984375,
"misalign/entropy_a": 1131.1556549072266,
"misalign/entropy_b": 1221.6925888061523,
"misalign/forward_kl_divergence": 16.044883847236633,
"misalign/forward_kl_divergence_per_token": 0.02564867539331317,
"misalign/gamma_abs_times_reward_std": 24747786.5,
"misalign/gamma_bracketed_rate": 0.9937806725502014,
"misalign/gamma_reward_residual": 4.918595743674814e-05,
"misalign/gamma_star": 46901569.25,
"misalign/js_divergence": 3.4656281918287277,
"misalign/reverse_kl_divergence": 13.393466770648956,
"misalign/reverse_kl_divergence_per_token": 0.02141062926966697,
"misalign/reward_a": 23.634789615869522,
"misalign/reward_b": -12.488820567727089,
"misalign/reward_improvement": 36.12361431121826,
"misalign/reward_improvement_over_reverse_kl": 2.6490939259529114,
"misalign/reward_improvement_per_token": 0.05916513060219586,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -522.8331413269043,
"misalign/reward_vocab_std": 745.3072738647461,
"misalign/symmetric_kl": 29.438353061676025,
"misalign/tv_distance": 33.53639495372772,
"num_tokens": 2250359.0,
"rewards/accuracies": 0.953125,
"rewards/chosen": -0.6528808567672968,
"rewards/margins": 2.228422373533249,
"rewards/rejected": -2.8813031911849976,
"step": 13,
"support/residual_count": 151893.263671875,
"support/residual_mass_policy": 0.03615651559084654,
"support/residual_mass_reference": 0.03887659031897783,
"support/residual_reward": -0.4554095212370157,
"support/runtime_width": 42.7385835647583,
"support/sampled_loser_rank": 0.6245415061712265,
"support/sampled_reward_rank": -0.40703338757157326,
"support/sampled_token_added_rate": 0.032946799183264375,
"support/sampled_winner_rank": 0.6250991076231003,
"support/selected_width": 42.7385835647583,
"support/stored_width": 42.7385835647583
},
{
"epoch": 0.21875,
"grad_norm": 129.94761657714844,
"kl/sequence_policy_ref": -28.61777091026306,
"kl/vocab_forward": 24.432228088378906,
"kl/vocab_js": 5.003187119960785,
"kl/vocab_reverse": 19.63451886177063,
"kl/vocab_symmetric": 44.0667519569397,
"learning_rate": 2e-06,
"logps/chosen": -519.6495475769043,
"logps/rejected": -1307.973617553711,
"loss": 1.2097,
"loss/dpo": 0.13245126977562904,
"misalign/J": 107.72635746002197,
"misalign/J_aux_loss": 1.0772635713219643,
"misalign/J_aux_loss_raw": 107.72635746002197,
"misalign/J_over_reverse_kl": 6.927842974662781,
"misalign/J_per_token": 0.1508565410040319,
"misalign/compressed_reward_absmax": 3698.0084838867188,
"misalign/compressed_reward_range": 5319.326965332031,
"misalign/entropy_a": 1003.0962753295898,
"misalign/entropy_b": 1097.5271453857422,
"misalign/forward_kl_divergence": 24.432228088378906,
"misalign/forward_kl_divergence_per_token": 0.05230529000982642,
"misalign/gamma_abs_times_reward_std": 28536720.375,
"misalign/gamma_bracketed_rate": 0.9909562915563583,
"misalign/gamma_reward_residual": 4.479655626710155e-05,
"misalign/gamma_star": 52695571.75,
"misalign/js_divergence": 5.003187119960785,
"misalign/reverse_kl_divergence": 19.63451886177063,
"misalign/reverse_kl_divergence_per_token": 0.04139284580014646,
"misalign/reward_a": 25.409843683242798,
"misalign/reward_b": -16.79699671268463,
"misalign/reward_improvement": 42.20684003829956,
"misalign/reward_improvement_over_reverse_kl": 2.0897003561258316,
"misalign/reward_improvement_per_token": 0.08610636787489057,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -519.6820330619812,
"misalign/reward_vocab_std": 664.3161697387695,
"misalign/symmetric_kl": 44.0667519569397,
"misalign/tv_distance": 37.26307129859924,
"num_tokens": 2405248.0,
"rewards/accuracies": 0.984375,
"rewards/chosen": -0.8268119320273399,
"rewards/margins": 4.069930404424667,
"rewards/rejected": -4.896742224693298,
"step": 14,
"support/residual_count": 151893.26953125,
"support/residual_mass_policy": 0.03416757704690099,
"support/residual_mass_reference": 0.03690562699921429,
"support/residual_reward": -0.5111633716151118,
"support/runtime_width": 42.731074810028076,
"support/sampled_loser_rank": 0.5858863964676857,
"support/sampled_reward_rank": -0.4269859306514263,
"support/sampled_token_added_rate": 0.03297502198256552,
"support/sampled_winner_rank": 0.5992331206798553,
"support/selected_width": 42.731074810028076,
"support/stored_width": 42.731074810028076
},
{
"epoch": 0.234375,
"grad_norm": 181.91363525390625,
"kl/sequence_policy_ref": -37.09248995780945,
"kl/vocab_forward": 36.39622640609741,
"kl/vocab_js": 7.3817285895347595,
"kl/vocab_reverse": 28.830130338668823,
"kl/vocab_symmetric": 65.22635078430176,
"learning_rate": 2e-06,
"logps/chosen": -861.3702850341797,
"logps/rejected": -1442.107177734375,
"loss": 1.6622,
"loss/dpo": 0.07950026832986623,
"misalign/J": 158.2665023803711,
"misalign/J_aux_loss": 1.582665003836155,
"misalign/J_aux_loss_raw": 158.2665023803711,
"misalign/J_over_reverse_kl": 6.344271242618561,
"misalign/J_per_token": 0.14481874648481607,
"misalign/compressed_reward_absmax": 4787.8853759765625,
"misalign/compressed_reward_range": 6777.99462890625,
"misalign/entropy_a": 1239.900390625,
"misalign/entropy_b": 1391.7532577514648,
"misalign/forward_kl_divergence": 36.39622640609741,
"misalign/forward_kl_divergence_per_token": 0.046229132916778326,
"misalign/gamma_abs_times_reward_std": 40674196.875,
"misalign/gamma_bracketed_rate": 0.991030216217041,
"misalign/gamma_reward_residual": 7.2218020022774e-05,
"misalign/gamma_star": 71395850.0,
"misalign/js_divergence": 7.3817285895347595,
"misalign/reverse_kl_divergence": 28.830130338668823,
"misalign/reverse_kl_divergence_per_token": 0.034299688413739204,
"misalign/reward_a": 44.823195934295654,
"misalign/reward_b": -15.68426263332367,
"misalign/reward_improvement": 60.507455825805664,
"misalign/reward_improvement_over_reverse_kl": 2.0816327780485153,
"misalign/reward_improvement_per_token": 0.07316383346915245,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -574.4606781005859,
"misalign/reward_vocab_std": 834.6710662841797,
"misalign/symmetric_kl": 65.22635078430176,
"misalign/tv_distance": 52.97303628921509,
"num_tokens": 2598130.0,
"rewards/accuracies": 0.984375,
"rewards/chosen": -1.3571155220270157,
"rewards/margins": 4.704267263412476,
"rewards/rejected": -6.06138277053833,
"step": 15,
"support/residual_count": 151893.16015625,
"support/residual_mass_policy": 0.0329043276142329,
"support/residual_mass_reference": 0.0375856957398355,
"support/residual_reward": -0.433580182492733,
"support/runtime_width": 42.837427616119385,
"support/sampled_loser_rank": 0.6224480830132961,
"support/sampled_reward_rank": -0.4383445642888546,
"support/sampled_token_added_rate": 0.03260477026924491,
"support/sampled_winner_rank": 0.6216517090797424,
"support/selected_width": 42.837427616119385,
"support/stored_width": 42.837427616119385
},
{
"epoch": 0.25,
"grad_norm": 197.3428497314453,
"kl/sequence_policy_ref": -48.6728458404541,
"kl/vocab_forward": 41.45539164543152,
"kl/vocab_js": 8.142653048038483,
"kl/vocab_reverse": 31.82970356941223,
"kl/vocab_symmetric": 73.28509140014648,
"learning_rate": 2e-06,
"logps/chosen": -746.3570137023926,
"logps/rejected": -1378.0999755859375,
"loss": 1.7196,
"loss/dpo": 0.21162739349529147,
"misalign/J": 150.7994818687439,
"misalign/J_aux_loss": 1.5079948231577873,
"misalign/J_aux_loss_raw": 150.7994818687439,
"misalign/J_over_reverse_kl": 6.891877442598343,
"misalign/J_per_token": 0.1977673191577196,
"misalign/compressed_reward_absmax": 4197.204010009766,
"misalign/compressed_reward_range": 6020.658874511719,
"misalign/entropy_a": 1102.5961456298828,
"misalign/entropy_b": 1252.7644424438477,
"misalign/forward_kl_divergence": 41.45539164543152,
"misalign/forward_kl_divergence_per_token": 0.07184931915253401,
"misalign/gamma_abs_times_reward_std": 42797047.125,
"misalign/gamma_bracketed_rate": 0.9879247918725014,
"misalign/gamma_reward_residual": 7.625430021107604e-05,
"misalign/gamma_star": 70593251.625,
"misalign/js_divergence": 8.142653048038483,
"misalign/reverse_kl_divergence": 31.82970356941223,
"misalign/reverse_kl_divergence_per_token": 0.04737356537953019,
"misalign/reward_a": 40.68007683753967,
"misalign/reward_b": -19.660471826791763,
"misalign/reward_improvement": 60.34055471420288,
"misalign/reward_improvement_over_reverse_kl": 1.6718981862068176,
"misalign/reward_improvement_per_token": 0.06326864054426551,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -500.27739334106445,
"misalign/reward_vocab_std": 718.7537536621094,
"misalign/symmetric_kl": 73.28509140014648,
"misalign/tv_distance": 52.97745728492737,
"num_tokens": 2767079.0,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.074222356081009,
"rewards/margins": 5.586124628782272,
"rewards/rejected": -7.660347044467926,
"step": 16,
"support/residual_count": 151892.984375,
"support/residual_mass_policy": 0.04499144037254155,
"support/residual_mass_reference": 0.0501289798412472,
"support/residual_reward": -0.45880454778671265,
"support/runtime_width": 43.018609046936035,
"support/sampled_loser_rank": 0.5783994421362877,
"support/sampled_reward_rank": -0.40191334672272205,
"support/sampled_token_added_rate": 0.035224413964897394,
"support/sampled_winner_rank": 0.6004925258457661,
"support/selected_width": 43.018609046936035,
"support/stored_width": 43.018609046936035
},
{
"epoch": 0.265625,
"grad_norm": 180.9165802001953,
"kl/sequence_policy_ref": -57.82066249847412,
"kl/vocab_forward": 49.91004800796509,
"kl/vocab_js": 9.40449070930481,
"kl/vocab_reverse": 36.53464651107788,
"kl/vocab_symmetric": 86.44468975067139,
"learning_rate": 2e-06,
"logps/chosen": -609.6646957397461,
"logps/rejected": -1610.3834075927734,
"loss": 1.7231,
"loss/dpo": 0.09814724331954494,
"misalign/J": 162.49910640716553,
"misalign/J_aux_loss": 1.624990999698639,
"misalign/J_aux_loss_raw": 162.49910640716553,
"misalign/J_over_reverse_kl": 5.963241904973984,
"misalign/J_per_token": 0.25427408143877983,
"misalign/compressed_reward_absmax": 4177.482116699219,
"misalign/compressed_reward_range": 5814.5203857421875,
"misalign/entropy_a": 1132.220069885254,
"misalign/entropy_b": 1293.5945892333984,
"misalign/forward_kl_divergence": 49.91004800796509,
"misalign/forward_kl_divergence_per_token": 0.08865668019279838,
"misalign/gamma_abs_times_reward_std": 45817248.5,
"misalign/gamma_bracketed_rate": 0.9841953068971634,
"misalign/gamma_reward_residual": 6.89706002958701e-05,
"misalign/gamma_star": 86042709.0,
"misalign/js_divergence": 9.40449070930481,
"misalign/reverse_kl_divergence": 36.53464651107788,
"misalign/reverse_kl_divergence_per_token": 0.05950516927987337,
"misalign/reward_a": 46.90000104904175,
"misalign/reward_b": -18.144596874713898,
"misalign/reward_improvement": 65.04460048675537,
"misalign/reward_improvement_over_reverse_kl": 1.706669107079506,
"misalign/reward_improvement_per_token": 0.0920080472715199,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -442.9830160140991,
"misalign/reward_vocab_std": 696.7616271972656,
"misalign/symmetric_kl": 86.44468975067139,
"misalign/tv_distance": 57.59225845336914,
"num_tokens": 2937506.0,
"rewards/accuracies": 0.984375,
"rewards/chosen": -1.9826722741127014,
"rewards/margins": 7.598788321018219,
"rewards/rejected": -9.581460356712341,
"step": 17,
"support/residual_count": 151893.49609375,
"support/residual_mass_policy": 0.0333835429046303,
"support/residual_mass_reference": 0.03812613524496555,
"support/residual_reward": -0.4731739591807127,
"support/runtime_width": 42.50527763366699,
"support/sampled_loser_rank": 0.6437485739588737,
"support/sampled_reward_rank": -0.3562327502295375,
"support/sampled_token_added_rate": 0.035286844009533525,
"support/sampled_winner_rank": 0.6677471101284027,
"support/selected_width": 42.50527763366699,
"support/stored_width": 42.50527763366699
},
{
"epoch": 0.28125,
"grad_norm": 149.1407928466797,
"kl/sequence_policy_ref": -51.036746978759766,
"kl/vocab_forward": 44.88189697265625,
"kl/vocab_js": 8.306392669677734,
"kl/vocab_reverse": 32.83620524406433,
"kl/vocab_symmetric": 77.71810150146484,
"learning_rate": 2e-06,
"logps/chosen": -560.9642581939697,
"logps/rejected": -1253.7114028930664,
"loss": 1.5028,
"loss/dpo": 0.17861688579432666,
"misalign/J": 132.42161083221436,
"misalign/J_aux_loss": 1.324216105043888,
"misalign/J_aux_loss_raw": 132.42161083221436,
"misalign/J_over_reverse_kl": 8.582664713263512,
"misalign/J_per_token": 0.27548689767718315,
"misalign/compressed_reward_absmax": 3723.6834411621094,
"misalign/compressed_reward_range": 5332.2451171875,
"misalign/entropy_a": 928.9597320556641,
"misalign/entropy_b": 1068.3447341918945,
"misalign/forward_kl_divergence": 44.88189697265625,
"misalign/forward_kl_divergence_per_token": 0.11330410279333591,
"misalign/gamma_abs_times_reward_std": 38645046.25,
"misalign/gamma_bracketed_rate": 0.9823459088802338,
"misalign/gamma_reward_residual": 7.946674531922326e-05,
"misalign/gamma_star": 71871948.75,
"misalign/js_divergence": 8.306392669677734,
"misalign/reverse_kl_divergence": 32.83620524406433,
"misalign/reverse_kl_divergence_per_token": 0.08078084606677294,
"misalign/reward_a": 35.60689043998718,
"misalign/reward_b": -16.05292272567749,
"misalign/reward_improvement": 51.65981483459473,
"misalign/reward_improvement_over_reverse_kl": 1.344532385468483,
"misalign/reward_improvement_per_token": 0.06508979946374893,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -361.802695274353,
"misalign/reward_vocab_std": 639.2480430603027,
"misalign/symmetric_kl": 77.71810150146484,
"misalign/tv_distance": 48.04610013961792,
"num_tokens": 3089181.0,
"rewards/accuracies": 0.953125,
"rewards/chosen": -2.0581542253494263,
"rewards/margins": 6.091041147708893,
"rewards/rejected": -8.149195373058319,
"step": 18,
"support/residual_count": 151893.052734375,
"support/residual_mass_policy": 0.039642924442887306,
"support/residual_mass_reference": 0.049897957127541304,
"support/residual_reward": -0.4336713273078203,
"support/runtime_width": 42.948453426361084,
"support/sampled_loser_rank": 0.6003664061427116,
"support/sampled_reward_rank": -0.5117907077074051,
"support/sampled_token_added_rate": 0.03761800215579569,
"support/sampled_winner_rank": 0.6974114552140236,
"support/selected_width": 42.948453426361084,
"support/stored_width": 42.948453426361084
},
{
"epoch": 0.296875,
"grad_norm": 236.4145965576172,
"kl/sequence_policy_ref": -70.85580825805664,
"kl/vocab_forward": 61.658048152923584,
"kl/vocab_js": 11.049875855445862,
"kl/vocab_reverse": 42.84557771682739,
"kl/vocab_symmetric": 104.50362396240234,
"learning_rate": 2e-06,
"logps/chosen": -680.9906196594238,
"logps/rejected": -1526.8777770996094,
"loss": 2.3197,
"loss/dpo": 0.15028794163845305,
"misalign/J": 216.9369659423828,
"misalign/J_aux_loss": 2.169369585812092,
"misalign/J_aux_loss_raw": 216.9369659423828,
"misalign/J_over_reverse_kl": 6.0987227857112885,
"misalign/J_per_token": 0.24055337626487017,
"misalign/compressed_reward_absmax": 4234.713134765625,
"misalign/compressed_reward_range": 5953.008850097656,
"misalign/entropy_a": 1090.484504699707,
"misalign/entropy_b": 1275.7961730957031,
"misalign/forward_kl_divergence": 61.658048152923584,
"misalign/forward_kl_divergence_per_token": 0.0996482465416193,
"misalign/gamma_abs_times_reward_std": 67395967.375,
"misalign/gamma_bracketed_rate": 0.9844093844294548,
"misalign/gamma_reward_residual": 0.00012266499561519595,
"misalign/gamma_star": 125619402.0,
"misalign/js_divergence": 11.049875855445862,
"misalign/reverse_kl_divergence": 42.84557771682739,
"misalign/reverse_kl_divergence_per_token": 0.06486277049407363,
"misalign/reward_a": 51.76768445968628,
"misalign/reward_b": -14.660561382770538,
"misalign/reward_improvement": 66.42824363708496,
"misalign/reward_improvement_over_reverse_kl": 1.57049061357975,
"misalign/reward_improvement_per_token": 0.11498506926000118,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -568.4373540878296,
"misalign/reward_vocab_std": 725.4858703613281,
"misalign/symmetric_kl": 104.50362396240234,
"misalign/tv_distance": 62.00803565979004,
"num_tokens": 3255573.0,
"rewards/accuracies": 0.953125,
"rewards/chosen": -3.0821579098701477,
"rewards/margins": 8.006846249103546,
"rewards/rejected": -11.089004278182983,
"step": 19,
"support/residual_count": 151893.365234375,
"support/residual_mass_policy": 0.04206296429038048,
"support/residual_mass_reference": 0.046698169549927115,
"support/residual_reward": -0.47489158436656,
"support/runtime_width": 42.64014720916748,
"support/sampled_loser_rank": 0.605025552213192,
"support/sampled_reward_rank": -0.3768458142876625,
"support/sampled_token_added_rate": 0.045746787916868925,
"support/sampled_winner_rank": 0.6195746287703514,
"support/selected_width": 42.64014720916748,
"support/stored_width": 42.64014720916748
},
{
"epoch": 0.3125,
"grad_norm": 116.85738372802734,
"kl/sequence_policy_ref": -65.05696249008179,
"kl/vocab_forward": 57.7905068397522,
"kl/vocab_js": 10.413756370544434,
"kl/vocab_reverse": 40.30951166152954,
"kl/vocab_symmetric": 98.10001516342163,
"learning_rate": 2e-06,
"logps/chosen": -673.3448905944824,
"logps/rejected": -1367.0256805419922,
"loss": 1.323,
"loss/dpo": 0.1641167537018191,
"misalign/J": 115.88617134094238,
"misalign/J_aux_loss": 1.1588616967201233,
"misalign/J_aux_loss_raw": 115.88617134094238,
"misalign/J_over_reverse_kl": 3.0085965991020203,
"misalign/J_per_token": 0.16471682861447334,
"misalign/compressed_reward_absmax": 4075.5525817871094,
"misalign/compressed_reward_range": 5858.878479003906,
"misalign/entropy_a": 1006.4973907470703,
"misalign/entropy_b": 1188.8379135131836,
"misalign/forward_kl_divergence": 57.7905068397522,
"misalign/forward_kl_divergence_per_token": 0.10363293252885342,
"misalign/gamma_abs_times_reward_std": 35261591.5,
"misalign/gamma_bracketed_rate": 0.9897296130657196,
"misalign/gamma_reward_residual": 6.685065909550758e-05,
"misalign/gamma_star": 56798719.5,
"misalign/js_divergence": 10.413756370544434,
"misalign/reverse_kl_divergence": 40.30951166152954,
"misalign/reverse_kl_divergence_per_token": 0.07322107395157218,
"misalign/reward_a": 52.47555136680603,
"misalign/reward_b": -10.230955243110657,
"misalign/reward_improvement": 62.70651149749756,
"misalign/reward_improvement_over_reverse_kl": 1.4691433906555176,
"misalign/reward_improvement_per_token": 0.10327118635177612,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -398.2197914123535,
"misalign/reward_vocab_std": 702.5999450683594,
"misalign/symmetric_kl": 98.10001516342163,
"misalign/tv_distance": 58.26440095901489,
"num_tokens": 3415287.0,
"rewards/accuracies": 0.96875,
"rewards/chosen": -3.106875829398632,
"rewards/margins": 6.797641098499298,
"rewards/rejected": -9.904516816139221,
"step": 20,
"support/residual_count": 151892.931640625,
"support/residual_mass_policy": 0.03865605127066374,
"support/residual_mass_reference": 0.04426591098308563,
"support/residual_reward": -0.37147790379822254,
"support/runtime_width": 43.06875991821289,
"support/sampled_loser_rank": 0.6501528844237328,
"support/sampled_reward_rank": -0.459061823785305,
"support/sampled_token_added_rate": 0.037211825139820576,
"support/sampled_winner_rank": 0.6419277414679527,
"support/selected_width": 43.06875991821289,
"support/stored_width": 43.06875991821289
},
{
"epoch": 0.328125,
"grad_norm": 167.24000549316406,
"kl/sequence_policy_ref": -87.30181169509888,
"kl/vocab_forward": 80.11595010757446,
"kl/vocab_js": 13.753250360488892,
"kl/vocab_reverse": 52.890267848968506,
"kl/vocab_symmetric": 133.00623035430908,
"learning_rate": 2e-06,
"logps/chosen": -860.2458534240723,
"logps/rejected": -1500.8129959106445,
"loss": 1.9906,
"loss/dpo": 0.29799531144089997,
"misalign/J": 169.26398181915283,
"misalign/J_aux_loss": 1.6926398500800133,
"misalign/J_aux_loss_raw": 169.26398181915283,
"misalign/J_over_reverse_kl": 4.501902684569359,
"misalign/J_per_token": 0.20157606061547995,
"misalign/compressed_reward_absmax": 4415.365386962891,
"misalign/compressed_reward_range": 6252.089111328125,
"misalign/entropy_a": 1110.4772415161133,
"misalign/entropy_b": 1350.9320755004883,
"misalign/forward_kl_divergence": 80.11595010757446,
"misalign/forward_kl_divergence_per_token": 0.10054660588502884,
"misalign/gamma_abs_times_reward_std": 54722816.875,
"misalign/gamma_bracketed_rate": 0.9853794500231743,
"misalign/gamma_reward_residual": 0.00010197218091434479,
"misalign/gamma_star": 109237338.75,
"misalign/js_divergence": 13.753250360488892,
"misalign/reverse_kl_divergence": 52.890267848968506,
"misalign/reverse_kl_divergence_per_token": 0.06223124684765935,
"misalign/reward_a": 63.141523361206055,
"misalign/reward_b": -12.613204658031464,
"misalign/reward_improvement": 75.75473356246948,
"misalign/reward_improvement_over_reverse_kl": 1.376510500907898,
"misalign/reward_improvement_per_token": 0.08627395983785391,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -474.14845275878906,
"misalign/reward_vocab_std": 756.6061401367188,
"misalign/symmetric_kl": 133.00623035430908,
"misalign/tv_distance": 71.77893543243408,
"num_tokens": 3596201.0,
"rewards/accuracies": 0.90625,
"rewards/chosen": -5.184990763664246,
"rewards/margins": 7.090380907058716,
"rewards/rejected": -12.275371551513672,
"step": 21,
"support/residual_count": 151893.302734375,
"support/residual_mass_policy": 0.03361299750395119,
"support/residual_mass_reference": 0.03923962963744998,
"support/residual_reward": -0.44669216219335794,
"support/runtime_width": 42.699111461639404,
"support/sampled_loser_rank": 0.5900973714888096,
"support/sampled_reward_rank": -0.3974966434761882,
"support/sampled_token_added_rate": 0.034115204587578773,
"support/sampled_winner_rank": 0.6001664698123932,
"support/selected_width": 42.699111461639404,
"support/stored_width": 42.699111461639404
},
{
"epoch": 0.34375,
"grad_norm": 182.3348846435547,
"kl/sequence_policy_ref": -98.32647848129272,
"kl/vocab_forward": 87.50644826889038,
"kl/vocab_js": 14.100131571292877,
"kl/vocab_reverse": 54.41349792480469,
"kl/vocab_symmetric": 141.9199457168579,
"learning_rate": 2e-06,
"logps/chosen": -720.9393196105957,
"logps/rejected": -1401.5432891845703,
"loss": 1.9288,
"loss/dpo": 0.18852760957088321,
"misalign/J": 174.02261638641357,
"misalign/J_aux_loss": 1.7402261197566986,
"misalign/J_aux_loss_raw": 174.02261638641357,
"misalign/J_over_reverse_kl": 3.6353148818016052,
"misalign/J_per_token": 0.21223169937729836,
"misalign/compressed_reward_absmax": 3982.8233032226562,
"misalign/compressed_reward_range": 5637.1624755859375,
"misalign/entropy_a": 958.0937767028809,
"misalign/entropy_b": 1190.8169174194336,
"misalign/forward_kl_divergence": 87.50644826889038,
"misalign/forward_kl_divergence_per_token": 0.14662323985248804,
"misalign/gamma_abs_times_reward_std": 59248922.75,
"misalign/gamma_bracketed_rate": 0.9862400367856026,
"misalign/gamma_reward_residual": 0.0001503152491295623,
"misalign/gamma_star": 115726537.75,
"misalign/js_divergence": 14.100131571292877,
"misalign/reverse_kl_divergence": 54.41349792480469,
"misalign/reverse_kl_divergence_per_token": 0.08750392450019717,
"misalign/reward_a": 56.57697582244873,
"misalign/reward_b": -11.629800856113434,
"misalign/reward_improvement": 68.20676565170288,
"misalign/reward_improvement_over_reverse_kl": 1.19433955848217,
"misalign/reward_improvement_per_token": 0.09249244816601276,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -674.6355495452881,
"misalign/reward_vocab_std": 699.2789764404297,
"misalign/symmetric_kl": 141.9199457168579,
"misalign/tv_distance": 68.87815427780151,
"num_tokens": 3758885.0,
"rewards/accuracies": 0.96875,
"rewards/chosen": -4.974303662776947,
"rewards/margins": 9.71668916940689,
"rewards/rejected": -14.69099223613739,
"step": 22,
"support/residual_count": 151893.119140625,
"support/residual_mass_policy": 0.033487192122265697,
"support/residual_mass_reference": 0.041745478520169854,
"support/residual_reward": -0.5358738675713539,
"support/runtime_width": 42.88250732421875,
"support/sampled_loser_rank": 0.6002072133123875,
"support/sampled_reward_rank": -0.3628186024725437,
"support/sampled_token_added_rate": 0.036499075358733535,
"support/sampled_winner_rank": 0.6214041896164417,
"support/selected_width": 42.88250732421875,
"support/stored_width": 42.88250732421875
},
{
"epoch": 0.359375,
"grad_norm": 169.51712036132812,
"kl/sequence_policy_ref": -149.24591064453125,
"kl/vocab_forward": 133.08092784881592,
"kl/vocab_js": 19.9760000705719,
"kl/vocab_reverse": 77.37433004379272,
"kl/vocab_symmetric": 210.455228805542,
"learning_rate": 2e-06,
"logps/chosen": -737.869384765625,
"logps/rejected": -1762.2871856689453,
"loss": 1.9918,
"loss/dpo": 0.2823996262759465,
"misalign/J": 170.94224166870117,
"misalign/J_aux_loss": 1.7094224244356155,
"misalign/J_aux_loss_raw": 170.94224166870117,
"misalign/J_over_reverse_kl": 3.2799622118473053,
"misalign/J_per_token": 0.24247757904231548,
"misalign/compressed_reward_absmax": 4777.976379394531,
"misalign/compressed_reward_range": 6731.566955566406,
"misalign/entropy_a": 1061.749008178711,
"misalign/entropy_b": 1356.131118774414,
"misalign/forward_kl_divergence": 133.08092784881592,
"misalign/forward_kl_divergence_per_token": 0.1782828439027071,
"misalign/gamma_abs_times_reward_std": 53340090.0,
"misalign/gamma_bracketed_rate": 0.9852898493409157,
"misalign/gamma_reward_residual": 5.0128826615036814e-05,
"misalign/gamma_star": 100226211.0,
"misalign/js_divergence": 19.9760000705719,
"misalign/reverse_kl_divergence": 77.37433004379272,
"misalign/reverse_kl_divergence_per_token": 0.09528437815606594,
"misalign/reward_a": 69.03681755065918,
"misalign/reward_b": -18.53004103899002,
"misalign/reward_improvement": 87.56686687469482,
"misalign/reward_improvement_over_reverse_kl": 1.0574140399694443,
"misalign/reward_improvement_per_token": 0.08220357168465853,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -386.6137237548828,
"misalign/reward_vocab_std": 798.2211227416992,
"misalign/symmetric_kl": 210.455228805542,
"misalign/tv_distance": 88.67370319366455,
"num_tokens": 3948274.0,
"rewards/accuracies": 0.953125,
"rewards/chosen": -6.972290515899658,
"rewards/margins": 15.904601573944092,
"rewards/rejected": -22.876891613006592,
"step": 23,
"support/residual_count": 151893.173828125,
"support/residual_mass_policy": 0.027609078446403146,
"support/residual_mass_reference": 0.03574479790404439,
"support/residual_reward": -0.29973831586539745,
"support/runtime_width": 42.827510833740234,
"support/sampled_loser_rank": 0.6080890074372292,
"support/sampled_reward_rank": -0.44763438403606415,
"support/sampled_token_added_rate": 0.03303293650969863,
"support/sampled_winner_rank": 0.6279079839587212,
"support/selected_width": 42.827510833740234,
"support/stored_width": 42.827510833740234
},
{
"epoch": 0.375,
"grad_norm": 140.47096252441406,
"kl/sequence_policy_ref": -144.5389518737793,
"kl/vocab_forward": 128.69008922576904,
"kl/vocab_js": 18.313786387443542,
"kl/vocab_reverse": 71.02430152893066,
"kl/vocab_symmetric": 199.71442413330078,
"learning_rate": 2e-06,
"logps/chosen": -572.0278053283691,
"logps/rejected": -1404.0182037353516,
"loss": 2.6061,
"loss/dpo": 1.0913660326041281,
"misalign/J": 151.477144241333,
"misalign/J_aux_loss": 1.5147713869810104,
"misalign/J_aux_loss_raw": 151.477144241333,
"misalign/J_over_reverse_kl": 3.1627804189920425,
"misalign/J_per_token": 0.24758470617234707,
"misalign/compressed_reward_absmax": 3778.4814453125,
"misalign/compressed_reward_range": 5379.040588378906,
"misalign/entropy_a": 782.0902366638184,
"misalign/entropy_b": 1035.1530227661133,
"misalign/forward_kl_divergence": 128.69008922576904,
"misalign/forward_kl_divergence_per_token": 0.2601375840604305,
"misalign/gamma_abs_times_reward_std": 47515251.5,
"misalign/gamma_bracketed_rate": 0.9839186295866966,
"misalign/gamma_reward_residual": 0.00012259059758434887,
"misalign/gamma_star": 87554911.0,
"misalign/js_divergence": 18.313786387443542,
"misalign/reverse_kl_divergence": 71.02430152893066,
"misalign/reverse_kl_divergence_per_token": 0.13188489899039268,
"misalign/reward_a": 56.8120379447937,
"misalign/reward_b": -14.587293282151222,
"misalign/reward_improvement": 71.39935445785522,
"misalign/reward_improvement_over_reverse_kl": 0.8568570390343666,
"misalign/reward_improvement_per_token": 0.08758416399359703,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -452.94491720199585,
"misalign/reward_vocab_std": 656.8603706359863,
"misalign/symmetric_kl": 199.71442413330078,
"misalign/tv_distance": 75.44480800628662,
"num_tokens": 4099141.0,
"rewards/accuracies": 0.890625,
"rewards/chosen": -7.403442680835724,
"rewards/margins": 14.100905060768127,
"rewards/rejected": -21.504348874092102,
"step": 24,
"support/residual_count": 151893.353515625,
"support/residual_mass_policy": 0.03428218117915094,
"support/residual_mass_reference": 0.04287252272479236,
"support/residual_reward": -0.5751103330403566,
"support/runtime_width": 42.65069341659546,
"support/sampled_loser_rank": 0.573052179068327,
"support/sampled_reward_rank": -0.40498005226254463,
"support/sampled_token_added_rate": 0.03546261019073427,
"support/sampled_winner_rank": 0.5977285951375961,
"support/selected_width": 42.65069341659546,
"support/stored_width": 42.65069341659546
},
{
"epoch": 0.375,
"eval_kl/sequence_policy_ref": -176.45206832885742,
"eval_kl/vocab_forward": 159.24169623851776,
"eval_kl/vocab_js": 22.288581863045692,
"eval_kl/vocab_reverse": 86.69451874494553,
"eval_kl/vocab_symmetric": 245.93626713752747,
"eval_logps/chosen": -822.1732840538025,
"eval_logps/rejected": -1630.9967403411865,
"eval_loss": 2.032864570617676,
"eval_loss/dpo": 0.513789746529512,
"eval_misalign/J": 151.90749096870422,
"eval_misalign/J_aux_loss": 1.519074865616858,
"eval_misalign/J_aux_loss_raw": 151.90749096870422,
"eval_misalign/J_over_reverse_kl": 2.979654673486948,
"eval_misalign/J_per_token": 0.2187155862338841,
"eval_misalign/compressed_reward_absmax": 4236.706275939941,
"eval_misalign/compressed_reward_range": 6027.056526184082,
"eval_misalign/entropy_a": 972.2038908004761,
"eval_misalign/entropy_b": 1298.0544624328613,
"eval_misalign/forward_kl_divergence": 159.24169623851776,
"eval_misalign/forward_kl_divergence_per_token": 0.28451165836304426,
"eval_misalign/gamma_abs_times_reward_std": 44413324.9375,
"eval_misalign/gamma_bracketed_rate": 0.9879434006288648,
"eval_misalign/gamma_reward_residual": 9.759679009846423e-05,
"eval_misalign/gamma_star": 76999057.8125,
"eval_misalign/js_divergence": 22.288581863045692,
"eval_misalign/reverse_kl_divergence": 86.69451874494553,
"eval_misalign/reverse_kl_divergence_per_token": 0.14481490349862725,
"eval_misalign/reward_a": 73.34608280658722,
"eval_misalign/reward_b": -14.307282455265522,
"eval_misalign/reward_improvement": 87.65336620807648,
"eval_misalign/reward_improvement_over_reverse_kl": 0.9149683965370059,
"eval_misalign/reward_improvement_per_token": 0.08359824417857453,
"eval_misalign/reward_signal_low_rate": 0.0,
"eval_misalign/reward_vocab_mean": -490.11556124687195,
"eval_misalign/reward_vocab_std": 731.3202571868896,
"eval_misalign/symmetric_kl": 245.93626713752747,
"eval_misalign/tv_distance": 92.47358250617981,
"eval_rewards/accuracies": 0.91796875,
"eval_rewards/chosen": -9.299732282757759,
"eval_rewards/margins": 16.690949447453022,
"eval_rewards/rejected": -25.99068196117878,
"eval_runtime": 101.3791,
"eval_samples_per_second": 5.05,
"eval_steps_per_second": 0.631,
"eval_support/residual_count": 151893.29125976562,
"eval_support/residual_mass_policy": 0.031916850464767776,
"eval_support/residual_mass_reference": 0.04279232310364023,
"eval_support/residual_reward": -0.4695481152739376,
"eval_support/runtime_width": 42.70963191986084,
"eval_support/sampled_loser_rank": 0.6486562248319387,
"eval_support/sampled_reward_rank": -0.37071577250026166,
"eval_support/sampled_token_added_rate": 0.037317203474231064,
"eval_support/sampled_winner_rank": 0.6506854901090264,
"eval_support/selected_width": 42.70963191986084,
"eval_support/stored_width": 42.70963191986084,
"step": 24
},
{
"epoch": 0.390625,
"grad_norm": 103.1752700805664,
"kl/sequence_policy_ref": -175.5179786682129,
"kl/vocab_forward": 165.2755527496338,
"kl/vocab_js": 22.945865869522095,
"kl/vocab_reverse": 88.68728542327881,
"kl/vocab_symmetric": 253.96291160583496,
"learning_rate": 2e-06,
"logps/chosen": -823.9939117431641,
"logps/rejected": -1613.8134307861328,
"loss": 1.7244,
"loss/dpo": 0.30688550411116466,
"misalign/J": 141.75555324554443,
"misalign/J_aux_loss": 1.4175555855035782,
"misalign/J_aux_loss_raw": 141.75555324554443,
"misalign/J_over_reverse_kl": 1.7237665206193924,
"misalign/J_per_token": 0.20317152328789234,
"misalign/compressed_reward_absmax": 4355.498870849609,
"misalign/compressed_reward_range": 6246.024597167969,
"misalign/entropy_a": 947.64599609375,
"misalign/entropy_b": 1290.7304077148438,
"misalign/forward_kl_divergence": 165.2755527496338,
"misalign/forward_kl_divergence_per_token": 0.3030826188623905,
"misalign/gamma_abs_times_reward_std": 42652071.0,
"misalign/gamma_bracketed_rate": 0.9883464574813843,
"misalign/gamma_reward_residual": 5.872455130884191e-05,
"misalign/gamma_star": 72449817.125,
"misalign/js_divergence": 22.945865869522095,
"misalign/reverse_kl_divergence": 88.68728542327881,
"misalign/reverse_kl_divergence_per_token": 0.13877300918102264,
"misalign/reward_a": 74.63873767852783,
"misalign/reward_b": -13.649469316005707,
"misalign/reward_improvement": 88.28820371627808,
"misalign/reward_improvement_over_reverse_kl": 0.9083794951438904,
"misalign/reward_improvement_per_token": 0.09202832682058215,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -502.7140769958496,
"misalign/reward_vocab_std": 763.2846450805664,
"misalign/symmetric_kl": 253.96291160583496,
"misalign/tv_distance": 94.16235828399658,
"num_tokens": 4267525.0,
"rewards/accuracies": 0.96875,
"rewards/chosen": -9.091135799884796,
"rewards/margins": 16.92132532596588,
"rewards/rejected": -26.012461185455322,
"step": 25,
"support/residual_count": 151893.228515625,
"support/residual_mass_policy": 0.02821849100291729,
"support/residual_mass_reference": 0.03902764664962888,
"support/residual_reward": -0.4604416638612747,
"support/runtime_width": 42.77067279815674,
"support/sampled_loser_rank": 0.6332324221730232,
"support/sampled_reward_rank": -0.385429447516799,
"support/sampled_token_added_rate": 0.03589798975735903,
"support/sampled_winner_rank": 0.6475523337721825,
"support/selected_width": 42.77067279815674,
"support/stored_width": 42.77067279815674
},
{
"epoch": 0.40625,
"grad_norm": 174.2784423828125,
"kl/sequence_policy_ref": -163.66453552246094,
"kl/vocab_forward": 142.52412605285645,
"kl/vocab_js": 19.224056720733643,
"kl/vocab_reverse": 76.05754041671753,
"kl/vocab_symmetric": 218.58167839050293,
"learning_rate": 2e-06,
"logps/chosen": -591.3704357147217,
"logps/rejected": -1515.6297912597656,
"loss": 1.9078,
"loss/dpo": 0.4340968047727074,
"misalign/J": 147.37513256072998,
"misalign/J_aux_loss": 1.473751276731491,
"misalign/J_aux_loss_raw": 147.37513256072998,
"misalign/J_over_reverse_kl": 2.757804274559021,
"misalign/J_per_token": 0.28767452389001846,
"misalign/compressed_reward_absmax": 4011.147979736328,
"misalign/compressed_reward_range": 5775.938781738281,
"misalign/entropy_a": 838.7755889892578,
"misalign/entropy_b": 1110.4411010742188,
"misalign/forward_kl_divergence": 142.52412605285645,
"misalign/forward_kl_divergence_per_token": 0.36550967395305634,
"misalign/gamma_abs_times_reward_std": 50544949.5,
"misalign/gamma_bracketed_rate": 0.9852296411991119,
"misalign/gamma_reward_residual": 4.505999822868034e-05,
"misalign/gamma_star": 83547619.75,
"misalign/js_divergence": 19.224056720733643,
"misalign/reverse_kl_divergence": 76.05754041671753,
"misalign/reverse_kl_divergence_per_token": 0.18261760007590055,
"misalign/reward_a": 57.265894651412964,
"misalign/reward_b": -16.07930701971054,
"misalign/reward_improvement": 73.3451886177063,
"misalign/reward_improvement_over_reverse_kl": 0.8678321242332458,
"misalign/reward_improvement_per_token": 0.11522631160914898,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -398.8517951965332,
"misalign/reward_vocab_std": 701.5557250976562,
"misalign/symmetric_kl": 218.58167839050293,
"misalign/tv_distance": 77.64586639404297,
"num_tokens": 4432914.0,
"rewards/accuracies": 0.984375,
"rewards/chosen": -6.874026566743851,
"rewards/margins": 18.98485553264618,
"rewards/rejected": -25.85888135433197,
"step": 26,
"support/residual_count": 151893.0546875,
"support/residual_mass_policy": 0.034172143787145615,
"support/residual_mass_reference": 0.045072893146425486,
"support/residual_reward": -0.4141153208911419,
"support/runtime_width": 42.94584655761719,
"support/sampled_loser_rank": 0.5826500616967678,
"support/sampled_reward_rank": -0.4691601078957319,
"support/sampled_token_added_rate": 0.04121970618143678,
"support/sampled_winner_rank": 0.5887424424290657,
"support/selected_width": 42.94584655761719,
"support/stored_width": 42.94584655761719
},
{
"epoch": 0.421875,
"grad_norm": 136.3807830810547,
"kl/sequence_policy_ref": -203.77911186218262,
"kl/vocab_forward": 183.4600429534912,
"kl/vocab_js": 24.132691860198975,
"kl/vocab_reverse": 93.94395637512207,
"kl/vocab_symmetric": 277.40405654907227,
"learning_rate": 2e-06,
"logps/chosen": -713.9009399414062,
"logps/rejected": -1687.3765106201172,
"loss": 2.0315,
"loss/dpo": 0.32593174448902573,
"misalign/J": 170.55465126037598,
"misalign/J_aux_loss": 1.7055464833974838,
"misalign/J_aux_loss_raw": 170.55465126037598,
"misalign/J_over_reverse_kl": 2.438810557126999,
"misalign/J_per_token": 0.20938482321798801,
"misalign/compressed_reward_absmax": 4179.306243896484,
"misalign/compressed_reward_range": 5945.363037109375,
"misalign/entropy_a": 905.4970626831055,
"misalign/entropy_b": 1233.0114974975586,
"misalign/forward_kl_divergence": 183.4600429534912,
"misalign/forward_kl_divergence_per_token": 0.35007214546203613,
"misalign/gamma_abs_times_reward_std": 59701588.25,
"misalign/gamma_bracketed_rate": 0.9896951243281364,
"misalign/gamma_reward_residual": 4.522019105479558e-05,
"misalign/gamma_star": 83829036.25,
"misalign/js_divergence": 24.132691860198975,
"misalign/reverse_kl_divergence": 93.94395637512207,
"misalign/reverse_kl_divergence_per_token": 0.1601700335741043,
"misalign/reward_a": 70.40993356704712,
"misalign/reward_b": -15.953831195831299,
"misalign/reward_improvement": 86.36374855041504,
"misalign/reward_improvement_over_reverse_kl": 0.9155527576804161,
"misalign/reward_improvement_per_token": 0.10616821236908436,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -455.97426986694336,
"misalign/reward_vocab_std": 724.7982482910156,
"misalign/symmetric_kl": 277.40405654907227,
"misalign/tv_distance": 94.7668981552124,
"num_tokens": 4606090.0,
"rewards/accuracies": 0.953125,
"rewards/chosen": -9.796147882938385,
"rewards/margins": 21.163527250289917,
"rewards/rejected": -30.959676027297974,
"step": 27,
"support/residual_count": 151893.162109375,
"support/residual_mass_policy": 0.03356131399050355,
"support/residual_mass_reference": 0.041941048577427864,
"support/residual_reward": -0.38252438232302666,
"support/runtime_width": 42.837002754211426,
"support/sampled_loser_rank": 0.6207218393683434,
"support/sampled_reward_rank": -0.4833753891289234,
"support/sampled_token_added_rate": 0.031088492134585977,
"support/sampled_winner_rank": 0.6683962419629097,
"support/selected_width": 42.837002754211426,
"support/stored_width": 42.837002754211426
},
{
"epoch": 0.4375,
"grad_norm": 196.56495666503906,
"kl/sequence_policy_ref": -229.19044494628906,
"kl/vocab_forward": 210.6693572998047,
"kl/vocab_js": 27.503621578216553,
"kl/vocab_reverse": 107.11871337890625,
"kl/vocab_symmetric": 317.7881450653076,
"learning_rate": 2e-06,
"logps/chosen": -1008.6218376159668,
"logps/rejected": -1565.5734558105469,
"loss": 2.9609,
"loss/dpo": 1.097994428826496,
"misalign/J": 186.2927417755127,
"misalign/J_aux_loss": 1.8629273921251297,
"misalign/J_aux_loss_raw": 186.2927417755127,
"misalign/J_over_reverse_kl": 1.8979013413190842,
"misalign/J_per_token": 0.27667875960469246,
"misalign/compressed_reward_absmax": 4361.166076660156,
"misalign/compressed_reward_range": 6270.502258300781,
"misalign/entropy_a": 908.857048034668,
"misalign/entropy_b": 1304.3608856201172,
"misalign/forward_kl_divergence": 210.6693572998047,
"misalign/forward_kl_divergence_per_token": 0.3658239506185055,
"misalign/gamma_abs_times_reward_std": 59128653.0,
"misalign/gamma_bracketed_rate": 0.981993056833744,
"misalign/gamma_reward_residual": 0.0002913941991664615,
"misalign/gamma_star": 100789119.5,
"misalign/js_divergence": 27.503621578216553,
"misalign/reverse_kl_divergence": 107.11871337890625,
"misalign/reverse_kl_divergence_per_token": 0.1738772690296173,
"misalign/reward_a": 89.05416059494019,
"misalign/reward_b": -10.882870197296143,
"misalign/reward_improvement": 99.93703746795654,
"misalign/reward_improvement_over_reverse_kl": 0.8969720676541328,
"misalign/reward_improvement_per_token": 0.14762359578162432,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -650.0788879394531,
"misalign/reward_vocab_std": 765.3363800048828,
"misalign/symmetric_kl": 317.7881450653076,
"misalign/tv_distance": 106.59637641906738,
"num_tokens": 4786127.0,
"rewards/accuracies": 0.859375,
"rewards/chosen": -13.720831990242004,
"rewards/margins": 18.396424293518066,
"rewards/rejected": -32.1172571182251,
"step": 28,
"support/residual_count": 151893.193359375,
"support/residual_mass_policy": 0.03176172194071114,
"support/residual_mass_reference": 0.041156242368742824,
"support/residual_reward": -0.6319293715059757,
"support/runtime_width": 42.804439544677734,
"support/sampled_loser_rank": 0.6620542109012604,
"support/sampled_reward_rank": -0.30558538623154163,
"support/sampled_token_added_rate": 0.03132295864634216,
"support/sampled_winner_rank": 0.7198682501912117,
"support/selected_width": 42.804439544677734,
"support/stored_width": 42.804439544677734
},
{
"epoch": 0.453125,
"grad_norm": 121.38319396972656,
"kl/sequence_policy_ref": -252.93916511535645,
"kl/vocab_forward": 226.2414608001709,
"kl/vocab_js": 28.960803031921387,
"kl/vocab_reverse": 114.66798114776611,
"kl/vocab_symmetric": 340.90957260131836,
"learning_rate": 2e-06,
"logps/chosen": -779.5240745544434,
"logps/rejected": -1872.5811767578125,
"loss": 2.4603,
"loss/dpo": 0.9713822825047167,
"misalign/J": 148.88968753814697,
"misalign/J_aux_loss": 1.4888968467712402,
"misalign/J_aux_loss_raw": 148.88968753814697,
"misalign/J_over_reverse_kl": 2.155982196331024,
"misalign/J_per_token": 0.26285428553819656,
"misalign/compressed_reward_absmax": 4548.9989013671875,
"misalign/compressed_reward_range": 6488.315979003906,
"misalign/entropy_a": 937.3698120117188,
"misalign/entropy_b": 1328.4651641845703,
"misalign/forward_kl_divergence": 226.2414608001709,
"misalign/forward_kl_divergence_per_token": 0.329929880797863,
"misalign/gamma_abs_times_reward_std": 42800947.75,
"misalign/gamma_bracketed_rate": 0.9842586368322372,
"misalign/gamma_reward_residual": 7.247616258609924e-05,
"misalign/gamma_star": 66069326.0,
"misalign/js_divergence": 28.960803031921387,
"misalign/reverse_kl_divergence": 114.66798114776611,
"misalign/reverse_kl_divergence_per_token": 0.15985783841460943,
"misalign/reward_a": 79.22361898422241,
"misalign/reward_b": -19.311943411827087,
"misalign/reward_improvement": 98.53554153442383,
"misalign/reward_improvement_over_reverse_kl": 0.820253424346447,
"misalign/reward_improvement_per_token": 0.09689361555501819,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -195.40936851501465,
"misalign/reward_vocab_std": 769.0406875610352,
"misalign/symmetric_kl": 340.90957260131836,
"misalign/tv_distance": 107.74736499786377,
"num_tokens": 4960206.0,
"rewards/accuracies": 0.9375,
"rewards/chosen": -11.959493935108185,
"rewards/margins": 26.66884672641754,
"rewards/rejected": -38.62834072113037,
"step": 29,
"support/residual_count": 151892.96875,
"support/residual_mass_policy": 0.033485232619568706,
"support/residual_mass_reference": 0.0447953250259161,
"support/residual_reward": -0.18092468939721584,
"support/runtime_width": 43.02750873565674,
"support/sampled_loser_rank": 0.6013398505747318,
"support/sampled_reward_rank": -0.48561038076877594,
"support/sampled_token_added_rate": 0.042466682847589254,
"support/sampled_winner_rank": 0.5962688289582729,
"support/selected_width": 43.02750873565674,
"support/stored_width": 43.02750873565674
},
{
"epoch": 0.46875,
"grad_norm": 191.35165405273438,
"kl/sequence_policy_ref": -265.8605842590332,
"kl/vocab_forward": 241.83115577697754,
"kl/vocab_js": 29.952810764312744,
"kl/vocab_reverse": 117.4506607055664,
"kl/vocab_symmetric": 359.28198051452637,
"learning_rate": 2e-06,
"logps/chosen": -902.8605346679688,
"logps/rejected": -1808.0689544677734,
"loss": 2.3052,
"loss/dpo": 0.2984987065605812,
"misalign/J": 200.67014503479004,
"misalign/J_aux_loss": 2.0067013800144196,
"misalign/J_aux_loss_raw": 200.67014503479004,
"misalign/J_over_reverse_kl": 1.9287290573120117,
"misalign/J_per_token": 0.2536418605595827,
"misalign/compressed_reward_absmax": 4862.0013427734375,
"misalign/compressed_reward_range": 6898.528076171875,
"misalign/entropy_a": 945.1607437133789,
"misalign/entropy_b": 1346.2866134643555,
"misalign/forward_kl_divergence": 241.83115577697754,
"misalign/forward_kl_divergence_per_token": 0.34030735678970814,
"misalign/gamma_abs_times_reward_std": 66157058.0,
"misalign/gamma_bracketed_rate": 0.9856267645955086,
"misalign/gamma_reward_residual": 0.000264001724190166,
"misalign/gamma_star": 109703980.5,
"misalign/js_divergence": 29.952810764312744,
"misalign/reverse_kl_divergence": 117.4506607055664,
"misalign/reverse_kl_divergence_per_token": 0.1695484183728695,
"misalign/reward_a": 87.51940584182739,
"misalign/reward_b": -12.946220338344574,
"misalign/reward_improvement": 100.46563053131104,
"misalign/reward_improvement_over_reverse_kl": 0.804968811571598,
"misalign/reward_improvement_per_token": 0.11112680193036795,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -619.1823959350586,
"misalign/reward_vocab_std": 829.3263244628906,
"misalign/symmetric_kl": 359.28198051452637,
"misalign/tv_distance": 112.75522899627686,
"num_tokens": 5145509.0,
"rewards/accuracies": 0.953125,
"rewards/chosen": -15.322714567184448,
"rewards/margins": 22.526688814163208,
"rewards/rejected": -37.84940314292908,
"step": 30,
"support/residual_count": 151893.208984375,
"support/residual_mass_policy": 0.034048222471028566,
"support/residual_mass_reference": 0.04346996312960982,
"support/residual_reward": -0.5197547674179077,
"support/runtime_width": 42.78953218460083,
"support/sampled_loser_rank": 0.5563570559024811,
"support/sampled_reward_rank": -0.39020144287496805,
"support/sampled_token_added_rate": 0.03398139285854995,
"support/sampled_winner_rank": 0.5739484503865242,
"support/selected_width": 42.78953218460083,
"support/stored_width": 42.78953218460083
},
{
"epoch": 0.484375,
"grad_norm": 157.51846313476562,
"kl/sequence_policy_ref": -242.7173252105713,
"kl/vocab_forward": 220.59081268310547,
"kl/vocab_js": 26.569517850875854,
"kl/vocab_reverse": 104.70345973968506,
"kl/vocab_symmetric": 325.2943916320801,
"learning_rate": 2e-06,
"logps/chosen": -677.5861015319824,
"logps/rejected": -1586.1520690917969,
"loss": 1.9096,
"loss/dpo": 0.26202132055277616,
"misalign/J": 164.75636100769043,
"misalign/J_aux_loss": 1.6475635841488838,
"misalign/J_aux_loss_raw": 164.75636100769043,
"misalign/J_over_reverse_kl": 1.940863698720932,
"misalign/J_per_token": 0.33494884334504604,
"misalign/compressed_reward_absmax": 3763.813751220703,
"misalign/compressed_reward_range": 5305.598388671875,
"misalign/entropy_a": 752.1155014038086,
"misalign/entropy_b": 1103.1325607299805,
"misalign/forward_kl_divergence": 220.59081268310547,
"misalign/forward_kl_divergence_per_token": 0.5400971993803978,
"misalign/gamma_abs_times_reward_std": 49524192.5,
"misalign/gamma_bracketed_rate": 0.9842683598399162,
"misalign/gamma_reward_residual": 0.00013795335132726905,
"misalign/gamma_star": 76385996.75,
"misalign/js_divergence": 26.569517850875854,
"misalign/reverse_kl_divergence": 104.70345973968506,
"misalign/reverse_kl_divergence_per_token": 0.2469406109303236,
"misalign/reward_a": 72.4141092300415,
"misalign/reward_b": -14.692016035318375,
"misalign/reward_improvement": 87.10610628128052,
"misalign/reward_improvement_over_reverse_kl": 0.7758874297142029,
"misalign/reward_improvement_per_token": 0.11436527967453003,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -336.6119108200073,
"misalign/reward_vocab_std": 635.3649368286133,
"misalign/symmetric_kl": 325.2943916320801,
"misalign/tv_distance": 95.78610897064209,
"num_tokens": 5302053.0,
"rewards/accuracies": 0.9375,
"rewards/chosen": -11.388235569000244,
"rewards/margins": 25.766995549201965,
"rewards/rejected": -37.155229806900024,
"step": 31,
"support/residual_count": 151893.30859375,
"support/residual_mass_policy": 0.026038944022729993,
"support/residual_mass_reference": 0.04330639448016882,
"support/residual_reward": -0.4683221112936735,
"support/runtime_width": 42.69255495071411,
"support/sampled_loser_rank": 0.6677984669804573,
"support/sampled_reward_rank": -0.38564055040478706,
"support/sampled_token_added_rate": 0.03589020320214331,
"support/sampled_winner_rank": 0.6767471358180046,
"support/selected_width": 42.69255495071411,
"support/stored_width": 42.69255495071411
},
{
"epoch": 0.5,
"grad_norm": 115.89986419677734,
"kl/sequence_policy_ref": -291.1677303314209,
"kl/vocab_forward": 264.76793098449707,
"kl/vocab_js": 31.6591854095459,
"kl/vocab_reverse": 126.53115463256836,
"kl/vocab_symmetric": 391.29920196533203,
"learning_rate": 2e-06,
"logps/chosen": -758.1873931884766,
"logps/rejected": -1748.2294845581055,
"loss": 1.9955,
"loss/dpo": 0.4298266823877448,
"misalign/J": 156.56506061553955,
"misalign/J_aux_loss": 1.5656505972146988,
"misalign/J_aux_loss_raw": 156.56506061553955,
"misalign/J_over_reverse_kl": 1.3217194080352783,
"misalign/J_per_token": 0.21796293556690216,
"misalign/compressed_reward_absmax": 4141.018249511719,
"misalign/compressed_reward_range": 5932.4368896484375,
"misalign/entropy_a": 796.6573028564453,
"misalign/entropy_b": 1197.2570190429688,
"misalign/forward_kl_divergence": 264.76793098449707,
"misalign/forward_kl_divergence_per_token": 0.42143452540040016,
"misalign/gamma_abs_times_reward_std": 42270098.25,
"misalign/gamma_bracketed_rate": 0.9892738536000252,
"misalign/gamma_reward_residual": 0.0006614696701490175,
"misalign/gamma_star": 77172042.5,
"misalign/js_divergence": 31.6591854095459,
"misalign/reverse_kl_divergence": 126.53115463256836,
"misalign/reverse_kl_divergence_per_token": 0.20403443090617657,
"misalign/reward_a": 84.93575382232666,
"misalign/reward_b": -16.301965177059174,
"misalign/reward_improvement": 101.23770046234131,
"misalign/reward_improvement_over_reverse_kl": 0.6962632201611996,
"misalign/reward_improvement_per_token": 0.06460105488076806,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -461.3788146972656,
"misalign/reward_vocab_std": 712.9334030151367,
"misalign/symmetric_kl": 391.29920196533203,
"misalign/tv_distance": 110.64313316345215,
"num_tokens": 5464702.0,
"rewards/accuracies": 0.890625,
"rewards/chosen": -13.653896808624268,
"rewards/margins": 30.925754070281982,
"rewards/rejected": -44.57965087890625,
"step": 32,
"support/residual_count": 151893.287109375,
"support/residual_mass_policy": 0.026596042443998158,
"support/residual_mass_reference": 0.044471810571849346,
"support/residual_reward": -0.42068428732454777,
"support/runtime_width": 42.70789432525635,
"support/sampled_loser_rank": 0.6153440810739994,
"support/sampled_reward_rank": -0.4130892716348171,
"support/sampled_token_added_rate": 0.034831034019589424,
"support/sampled_winner_rank": 0.6262499615550041,
"support/selected_width": 42.70789432525635,
"support/stored_width": 42.70789432525635
},
{
"epoch": 0.515625,
"grad_norm": 126.86652374267578,
"kl/sequence_policy_ref": -330.00819396972656,
"kl/vocab_forward": 300.4244632720947,
"kl/vocab_js": 34.439491748809814,
"kl/vocab_reverse": 136.73623180389404,
"kl/vocab_symmetric": 437.1608543395996,
"learning_rate": 2e-06,
"logps/chosen": -748.5886764526367,
"logps/rejected": -1879.9018249511719,
"loss": 1.9269,
"loss/dpo": 0.15133077676370377,
"misalign/J": 177.56014442443848,
"misalign/J_aux_loss": 1.77560143917799,
"misalign/J_aux_loss_raw": 177.56014442443848,
"misalign/J_over_reverse_kl": 1.510396808385849,
"misalign/J_per_token": 0.25404511764645576,
"misalign/compressed_reward_absmax": 4184.280670166016,
"misalign/compressed_reward_range": 5938.451599121094,
"misalign/entropy_a": 794.0090255737305,
"misalign/entropy_b": 1205.6077117919922,
"misalign/forward_kl_divergence": 300.4244632720947,
"misalign/forward_kl_divergence_per_token": 0.5809952989220619,
"misalign/gamma_abs_times_reward_std": 47938589.5,
"misalign/gamma_bracketed_rate": 0.9874916970729828,
"misalign/gamma_reward_residual": 0.0018580270816528355,
"misalign/gamma_star": 84848901.75,
"misalign/js_divergence": 34.439491748809814,
"misalign/reverse_kl_divergence": 136.73623180389404,
"misalign/reverse_kl_divergence_per_token": 0.22614295408129692,
"misalign/reward_a": 93.50389242172241,
"misalign/reward_b": -11.571515798568726,
"misalign/reward_improvement": 105.07538223266602,
"misalign/reward_improvement_over_reverse_kl": 0.7212403789162636,
"misalign/reward_improvement_per_token": 0.08620550157502294,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -539.4033613204956,
"misalign/reward_vocab_std": 717.4302520751953,
"misalign/symmetric_kl": 437.1608543395996,
"misalign/tv_distance": 118.05486106872559,
"num_tokens": 5633244.0,
"rewards/accuracies": 0.96875,
"rewards/chosen": -14.54743093252182,
"rewards/margins": 36.90677738189697,
"rewards/rejected": -51.454208850860596,
"step": 33,
"support/residual_count": 151893.3203125,
"support/residual_mass_policy": 0.029681737767532468,
"support/residual_mass_reference": 0.04252100153826177,
"support/residual_reward": -0.4979167296551168,
"support/runtime_width": 42.67843770980835,
"support/sampled_loser_rank": 0.6283881887793541,
"support/sampled_reward_rank": -0.38332303427159786,
"support/sampled_token_added_rate": 0.03253701771609485,
"support/sampled_winner_rank": 0.6316058188676834,
"support/selected_width": 42.67843770980835,
"support/stored_width": 42.67843770980835
},
{
"epoch": 0.53125,
"grad_norm": 189.66943359375,
"kl/sequence_policy_ref": -295.09754180908203,
"kl/vocab_forward": 265.09803009033203,
"kl/vocab_js": 31.023924469947815,
"kl/vocab_reverse": 124.39066219329834,
"kl/vocab_symmetric": 389.488920211792,
"learning_rate": 2e-06,
"logps/chosen": -869.112964630127,
"logps/rejected": -1561.075454711914,
"loss": 2.3314,
"loss/dpo": 0.4870968231589359,
"misalign/J": 184.4345703125,
"misalign/J_aux_loss": 1.8443456441164017,
"misalign/J_aux_loss_raw": 184.4345703125,
"misalign/J_over_reverse_kl": 2.160892277956009,
"misalign/J_per_token": 0.36055343225598335,
"misalign/compressed_reward_absmax": 4057.2051391601562,
"misalign/compressed_reward_range": 5788.110046386719,
"misalign/entropy_a": 733.9059944152832,
"misalign/entropy_b": 1137.9230575561523,
"misalign/forward_kl_divergence": 265.09803009033203,
"misalign/forward_kl_divergence_per_token": 0.5678062625229359,
"misalign/gamma_abs_times_reward_std": 71265453.5,
"misalign/gamma_bracketed_rate": 0.9808945804834366,
"misalign/gamma_reward_residual": 4.436415292730089e-05,
"misalign/gamma_star": 82635977.5,
"misalign/js_divergence": 31.023924469947815,
"misalign/reverse_kl_divergence": 124.39066219329834,
"misalign/reverse_kl_divergence_per_token": 0.22786439768970013,
"misalign/reward_a": 93.78890228271484,
"misalign/reward_b": -7.1546797305345535,
"misalign/reward_improvement": 100.94355964660645,
"misalign/reward_improvement_over_reverse_kl": 0.6350699551403522,
"misalign/reward_improvement_per_token": 0.05376583803445101,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -452.1529130935669,
"misalign/reward_vocab_std": 700.5487747192383,
"misalign/symmetric_kl": 389.488920211792,
"misalign/tv_distance": 107.56306266784668,
"num_tokens": 5794166.0,
"rewards/accuracies": 0.90625,
"rewards/chosen": -14.67345118522644,
"rewards/margins": 29.672606945037842,
"rewards/rejected": -44.34605646133423,
"step": 34,
"support/residual_count": 151893.341796875,
"support/residual_mass_policy": 0.023359368089586496,
"support/residual_mass_reference": 0.03612355049699545,
"support/residual_reward": -0.4855753555893898,
"support/runtime_width": 42.65322256088257,
"support/sampled_loser_rank": 0.5644064396619797,
"support/sampled_reward_rank": -0.3794688871130347,
"support/sampled_token_added_rate": 0.030243139481171966,
"support/sampled_winner_rank": 0.5993468686938286,
"support/selected_width": 42.65322256088257,
"support/stored_width": 42.65322256088257
},
{
"epoch": 0.546875,
"grad_norm": 255.15330505371094,
"kl/sequence_policy_ref": -334.76751708984375,
"kl/vocab_forward": 300.60124015808105,
"kl/vocab_js": 34.761489152908325,
"kl/vocab_reverse": 139.5803165435791,
"kl/vocab_symmetric": 440.1817283630371,
"learning_rate": 2e-06,
"logps/chosen": -799.9229431152344,
"logps/rejected": -1850.2046356201172,
"loss": 2.7865,
"loss/dpo": 0.628988600539742,
"misalign/J": 215.75445175170898,
"misalign/J_aux_loss": 2.1575444042682648,
"misalign/J_aux_loss_raw": 215.75445175170898,
"misalign/J_over_reverse_kl": 1.6279902905225754,
"misalign/J_per_token": 0.23844042047858238,
"misalign/compressed_reward_absmax": 4261.061584472656,
"misalign/compressed_reward_range": 6006.7581787109375,
"misalign/entropy_a": 801.278564453125,
"misalign/entropy_b": 1216.669448852539,
"misalign/forward_kl_divergence": 300.60124015808105,
"misalign/forward_kl_divergence_per_token": 0.4489123970270157,
"misalign/gamma_abs_times_reward_std": 66620462.0,
"misalign/gamma_bracketed_rate": 0.9877287149429321,
"misalign/gamma_reward_residual": 0.00013351680354389828,
"misalign/gamma_star": 108750856.5,
"misalign/js_divergence": 34.761489152908325,
"misalign/reverse_kl_divergence": 139.5803165435791,
"misalign/reverse_kl_divergence_per_token": 0.20873420871794224,
"misalign/reward_a": 96.30846405029297,
"misalign/reward_b": -10.182962775230408,
"misalign/reward_improvement": 106.4914083480835,
"misalign/reward_improvement_over_reverse_kl": 0.7169731482863426,
"misalign/reward_improvement_per_token": 0.1033505480736494,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -652.5512466430664,
"misalign/reward_vocab_std": 728.8263397216797,
"misalign/symmetric_kl": 440.1817283630371,
"misalign/tv_distance": 117.93358135223389,
"num_tokens": 5967652.0,
"rewards/accuracies": 0.9375,
"rewards/chosen": -16.057342648506165,
"rewards/margins": 34.83882117271423,
"rewards/rejected": -50.89616346359253,
"step": 35,
"support/residual_count": 151893.296875,
"support/residual_mass_policy": 0.025821004761382937,
"support/residual_mass_reference": 0.039951348677277565,
"support/residual_reward": -0.5654018372297287,
"support/runtime_width": 42.70614957809448,
"support/sampled_loser_rank": 0.6046793200075626,
"support/sampled_reward_rank": -0.2875976013019681,
"support/sampled_token_added_rate": 0.03660787723492831,
"support/sampled_winner_rank": 0.5936227701604366,
"support/selected_width": 42.70614957809448,
"support/stored_width": 42.70614957809448
},
{
"epoch": 0.5625,
"grad_norm": 372.3753967285156,
"kl/sequence_policy_ref": -301.4326972961426,
"kl/vocab_forward": 272.3033618927002,
"kl/vocab_js": 31.47647452354431,
"kl/vocab_reverse": 129.38499641418457,
"kl/vocab_symmetric": 401.688533782959,
"learning_rate": 2e-06,
"logps/chosen": -617.8253440856934,
"logps/rejected": -1656.3383178710938,
"loss": 2.0888,
"loss/dpo": 0.20348351792887343,
"misalign/J": 188.5364990234375,
"misalign/J_aux_loss": 1.8853649497032166,
"misalign/J_aux_loss_raw": 188.5364990234375,
"misalign/J_over_reverse_kl": 1.918866515159607,
"misalign/J_per_token": 0.2657326404005289,
"misalign/compressed_reward_absmax": 3813.5306091308594,
"misalign/compressed_reward_range": 5372.776062011719,
"misalign/entropy_a": 678.8123512268066,
"misalign/entropy_b": 1029.1281280517578,
"misalign/forward_kl_divergence": 272.3033618927002,
"misalign/forward_kl_divergence_per_token": 0.5031169354915619,
"misalign/gamma_abs_times_reward_std": 54824950.5,
"misalign/gamma_bracketed_rate": 0.984458789229393,
"misalign/gamma_reward_residual": 9.551036919219769e-05,
"misalign/gamma_star": 76879923.5,
"misalign/js_divergence": 31.47647452354431,
"misalign/reverse_kl_divergence": 129.38499641418457,
"misalign/reverse_kl_divergence_per_token": 0.20918168872594833,
"misalign/reward_a": 85.22445583343506,
"misalign/reward_b": -13.628453433513641,
"misalign/reward_improvement": 98.85287952423096,
"misalign/reward_improvement_over_reverse_kl": 0.6402187570929527,
"misalign/reward_improvement_per_token": 0.06344311079010367,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -478.2143135070801,
"misalign/reward_vocab_std": 661.5251770019531,
"misalign/symmetric_kl": 401.688533782959,
"misalign/tv_distance": 104.53119087219238,
"num_tokens": 6125949.0,
"rewards/accuracies": 0.953125,
"rewards/chosen": -12.255744695663452,
"rewards/margins": 35.775049448013306,
"rewards/rejected": -48.03079414367676,
"step": 36,
"support/residual_count": 151893.57421875,
"support/residual_mass_policy": 0.0228013499872759,
"support/residual_mass_reference": 0.03604377689771354,
"support/residual_reward": -0.5941759124398232,
"support/runtime_width": 42.42575168609619,
"support/sampled_loser_rank": 0.579292468726635,
"support/sampled_reward_rank": -0.31505878642201424,
"support/sampled_token_added_rate": 0.027777738636359572,
"support/sampled_winner_rank": 0.6083027757704258,
"support/selected_width": 42.42575168609619,
"support/stored_width": 42.42575168609619
},
{
"epoch": 0.5625,
"eval_kl/sequence_policy_ref": -370.0239589214325,
"eval_kl/vocab_forward": 326.3884401321411,
"eval_kl/vocab_js": 39.170413970947266,
"eval_kl/vocab_reverse": 163.1976842880249,
"eval_kl/vocab_symmetric": 489.5863370895386,
"eval_logps/chosen": -878.7963104248047,
"eval_logps/rejected": -1961.517490386963,
"eval_loss": 1.9275543689727783,
"eval_loss/dpo": 0.07911084008669506,
"eval_misalign/J": 184.84435880184174,
"eval_misalign/J_aux_loss": 1.8484435249119997,
"eval_misalign/J_aux_loss_raw": 184.84435880184174,
"eval_misalign/J_over_reverse_kl": 1.7581309108063579,
"eval_misalign/J_per_token": 0.26096369861625135,
"eval_misalign/compressed_reward_absmax": 4236.706245422363,
"eval_misalign/compressed_reward_range": 6027.056587219238,
"eval_misalign/entropy_a": 859.5940890312195,
"eval_misalign/entropy_b": 1298.0544710159302,
"eval_misalign/forward_kl_divergence": 326.3884401321411,
"eval_misalign/forward_kl_divergence_per_token": 0.5667336815968156,
"eval_misalign/gamma_abs_times_reward_std": 44952110.546875,
"eval_misalign/gamma_bracketed_rate": 0.9881090503185987,
"eval_misalign/gamma_reward_residual": 0.0003100246618572555,
"eval_misalign/gamma_star": 68325932.625,
"eval_misalign/js_divergence": 39.170413970947266,
"eval_misalign/reverse_kl_divergence": 163.1976842880249,
"eval_misalign/reverse_kl_divergence_per_token": 0.2310976292937994,
"eval_misalign/reward_a": 111.55592322349548,
"eval_misalign/reward_b": -14.307281229645014,
"eval_misalign/reward_improvement": 125.86316466331482,
"eval_misalign/reward_improvement_over_reverse_kl": 0.6893182648345828,
"eval_misalign/reward_improvement_per_token": 0.08624049881473184,
"eval_misalign/reward_signal_low_rate": 0.0,
"eval_misalign/reward_vocab_mean": -490.1155492067337,
"eval_misalign/reward_vocab_std": 731.3202810287476,
"eval_misalign/symmetric_kl": 489.5863370895386,
"eval_misalign/tv_distance": 129.546555519104,
"eval_rewards/accuracies": 0.98046875,
"eval_rewards/chosen": -14.962035872042179,
"eval_rewards/margins": 44.080720245838165,
"eval_rewards/rejected": -59.04275727272034,
"eval_runtime": 100.6796,
"eval_samples_per_second": 5.085,
"eval_steps_per_second": 0.636,
"eval_support/residual_count": 151893.29125976562,
"eval_support/residual_mass_policy": 0.02652598696295172,
"eval_support/residual_mass_reference": 0.04279232310364023,
"eval_support/residual_reward": -0.4695481152739376,
"eval_support/runtime_width": 42.70963191986084,
"eval_support/sampled_loser_rank": 0.6486562248319387,
"eval_support/sampled_reward_rank": -0.37071577250026166,
"eval_support/sampled_token_added_rate": 0.037317203474231064,
"eval_support/sampled_winner_rank": 0.6506854901090264,
"eval_support/selected_width": 42.70963191986084,
"eval_support/stored_width": 42.70963191986084,
"step": 36
},
{
"epoch": 0.578125,
"grad_norm": 538.4682006835938,
"kl/sequence_policy_ref": -340.5377769470215,
"kl/vocab_forward": 290.87598991394043,
"kl/vocab_js": 34.37863755226135,
"kl/vocab_reverse": 144.22690105438232,
"kl/vocab_symmetric": 435.10305404663086,
"learning_rate": 2e-06,
"logps/chosen": -651.7600479125977,
"logps/rejected": -1730.038101196289,
"loss": 2.6095,
"loss/dpo": 0.025369518539697822,
"misalign/J": 258.41460514068604,
"misalign/J_aux_loss": 2.58414613455534,
"misalign/J_aux_loss_raw": 258.41460514068604,
"misalign/J_over_reverse_kl": 3.293791249394417,
"misalign/J_per_token": 0.3237530868500471,
"misalign/compressed_reward_absmax": 3946.171417236328,
"misalign/compressed_reward_range": 5592.2857666015625,
"misalign/entropy_a": 679.9681549072266,
"misalign/entropy_b": 1041.4010391235352,
"misalign/forward_kl_divergence": 290.87598991394043,
"misalign/forward_kl_divergence_per_token": 0.6766270510852337,
"misalign/gamma_abs_times_reward_std": 77732721.625,
"misalign/gamma_bracketed_rate": 0.9842484146356583,
"misalign/gamma_reward_residual": 2.0939698629263148e-05,
"misalign/gamma_star": 120903140.125,
"misalign/js_divergence": 34.37863755226135,
"misalign/reverse_kl_divergence": 144.22690105438232,
"misalign/reverse_kl_divergence_per_token": 0.24700743332505226,
"misalign/reward_a": 94.92676162719727,
"misalign/reward_b": -12.971765249967575,
"misalign/reward_improvement": 107.89847755432129,
"misalign/reward_improvement_over_reverse_kl": 0.6338806599378586,
"misalign/reward_improvement_per_token": 0.060572607442736626,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -481.5314302444458,
"misalign/reward_vocab_std": 672.1194839477539,
"misalign/symmetric_kl": 435.10305404663086,
"misalign/tv_distance": 110.60170650482178,
"num_tokens": 6285208.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.246494770050049,
"rewards/margins": 43.61456775665283,
"rewards/rejected": -55.861063957214355,
"step": 37,
"support/residual_count": 151893.3203125,
"support/residual_mass_policy": 0.023932685144245625,
"support/residual_mass_reference": 0.041495030745863914,
"support/residual_reward": -0.5419074520468712,
"support/runtime_width": 42.680593967437744,
"support/sampled_loser_rank": 0.5905993320047855,
"support/sampled_reward_rank": -0.3638652637600899,
"support/sampled_token_added_rate": 0.030867979861795902,
"support/sampled_winner_rank": 0.6219254210591316,
"support/selected_width": 42.680593967437744,
"support/stored_width": 42.680593967437744
},
{
"epoch": 0.59375,
"grad_norm": 103.29216766357422,
"kl/sequence_policy_ref": -322.09266471862793,
"kl/vocab_forward": 284.572021484375,
"kl/vocab_js": 32.71440887451172,
"kl/vocab_reverse": 135.91109657287598,
"kl/vocab_symmetric": 420.4832000732422,
"learning_rate": 2e-06,
"logps/chosen": -666.5978660583496,
"logps/rejected": -1681.1878814697266,
"loss": 1.7536,
"loss/dpo": 0.07719759906125201,
"misalign/J": 167.64227294921875,
"misalign/J_aux_loss": 1.6764226853847504,
"misalign/J_aux_loss_raw": 167.64227294921875,
"misalign/J_over_reverse_kl": 1.7161841690540314,
"misalign/J_per_token": 0.27796192467212677,
"misalign/compressed_reward_absmax": 3628.3260498046875,
"misalign/compressed_reward_range": 5132.099914550781,
"misalign/entropy_a": 690.4412384033203,
"misalign/entropy_b": 1045.768310546875,
"misalign/forward_kl_divergence": 284.572021484375,
"misalign/forward_kl_divergence_per_token": 0.6853830218315125,
"misalign/gamma_abs_times_reward_std": 51078868.0,
"misalign/gamma_bracketed_rate": 0.9857224076986313,
"misalign/gamma_reward_residual": 0.00023307789706450421,
"misalign/gamma_star": 63338430.75,
"misalign/js_divergence": 32.71440887451172,
"misalign/reverse_kl_divergence": 135.91109657287598,
"misalign/reverse_kl_divergence_per_token": 0.26742945425212383,
"misalign/reward_a": 82.06362342834473,
"misalign/reward_b": -14.572960376739502,
"misalign/reward_improvement": 96.63658332824707,
"misalign/reward_improvement_over_reverse_kl": 0.6172222569584846,
"misalign/reward_improvement_per_token": 0.09521574154496193,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -462.82337760925293,
"misalign/reward_vocab_std": 629.3218460083008,
"misalign/symmetric_kl": 420.4832000732422,
"misalign/tv_distance": 107.11642932891846,
"num_tokens": 6439380.0,
"rewards/accuracies": 0.96875,
"rewards/chosen": -13.20527732372284,
"rewards/margins": 38.00797891616821,
"rewards/rejected": -51.21325659751892,
"step": 38,
"support/residual_count": 151893.580078125,
"support/residual_mass_policy": 0.02642570063471794,
"support/residual_mass_reference": 0.0395309254527092,
"support/residual_reward": -0.619575060904026,
"support/runtime_width": 42.41612482070923,
"support/sampled_loser_rank": 0.6446574702858925,
"support/sampled_reward_rank": -0.3532958813011646,
"support/sampled_token_added_rate": 0.029658236424438655,
"support/sampled_winner_rank": 0.6769233047962189,
"support/selected_width": 42.41612482070923,
"support/stored_width": 42.41612482070923
},
{
"epoch": 0.609375,
"grad_norm": 1020.2838745117188,
"kl/sequence_policy_ref": -395.0605163574219,
"kl/vocab_forward": 347.51882553100586,
"kl/vocab_js": 42.35039806365967,
"kl/vocab_reverse": 184.1147804260254,
"kl/vocab_symmetric": 531.6339111328125,
"learning_rate": 2e-06,
"logps/chosen": -740.7064399719238,
"logps/rejected": -2187.0342712402344,
"loss": 3.4136,
"loss/dpo": 0.032895612518908,
"misalign/J": 338.06642150878906,
"misalign/J_aux_loss": 3.3806639164686203,
"misalign/J_aux_loss_raw": 338.06642150878906,
"misalign/J_over_reverse_kl": 2.6226917803287506,
"misalign/J_per_token": 0.3244504798203707,
"misalign/compressed_reward_absmax": 4427.0225830078125,
"misalign/compressed_reward_range": 6178.325927734375,
"misalign/entropy_a": 868.1625289916992,
"misalign/entropy_b": 1331.6126098632812,
"misalign/forward_kl_divergence": 347.51882553100586,
"misalign/forward_kl_divergence_per_token": 0.5268885493278503,
"misalign/gamma_abs_times_reward_std": 78415596.0,
"misalign/gamma_bracketed_rate": 0.9849846512079239,
"misalign/gamma_reward_residual": 0.00048569267073617084,
"misalign/gamma_star": 86910211.0,
"misalign/js_divergence": 42.35039806365967,
"misalign/reverse_kl_divergence": 184.1147804260254,
"misalign/reverse_kl_divergence_per_token": 0.2407812997698784,
"misalign/reward_a": 94.71883296966553,
"misalign/reward_b": -19.811948537826538,
"misalign/reward_improvement": 114.53073215484619,
"misalign/reward_improvement_over_reverse_kl": 0.6183451376855373,
"misalign/reward_improvement_per_token": 0.07184543320909142,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -322.06157875061035,
"misalign/reward_vocab_std": 766.7786178588867,
"misalign/symmetric_kl": 531.6339111328125,
"misalign/tv_distance": 135.00493335723877,
"num_tokens": 6618112.0,
"rewards/accuracies": 0.984375,
"rewards/chosen": -14.374651193618774,
"rewards/margins": 50.26280069351196,
"rewards/rejected": -64.63745069503784,
"step": 39,
"support/residual_count": 151892.978515625,
"support/residual_mass_policy": 0.030612861970439553,
"support/residual_mass_reference": 0.04724447149783373,
"support/residual_reward": -0.27408459782600403,
"support/runtime_width": 43.020020484924316,
"support/sampled_loser_rank": 0.6019720807671547,
"support/sampled_reward_rank": -0.4871169701218605,
"support/sampled_token_added_rate": 0.038056216202676296,
"support/sampled_winner_rank": 0.6120708398520947,
"support/selected_width": 43.020020484924316,
"support/stored_width": 43.020020484924316
},
{
"epoch": 0.625,
"grad_norm": 1029.6490478515625,
"kl/sequence_policy_ref": -391.72465896606445,
"kl/vocab_forward": 347.66312408447266,
"kl/vocab_js": 43.16923809051514,
"kl/vocab_reverse": 182.91918754577637,
"kl/vocab_symmetric": 530.5825958251953,
"learning_rate": 2e-06,
"logps/chosen": -836.8339767456055,
"logps/rejected": -2054.7322845458984,
"loss": 2.4484,
"loss/dpo": 0.08931858758296585,
"misalign/J": 235.91084098815918,
"misalign/J_aux_loss": 2.3591084629297256,
"misalign/J_aux_loss_raw": 235.91084098815918,
"misalign/J_over_reverse_kl": 2.021958939731121,
"misalign/J_per_token": 0.2609993116930127,
"misalign/compressed_reward_absmax": 4498.334259033203,
"misalign/compressed_reward_range": 6353.229675292969,
"misalign/entropy_a": 859.5539245605469,
"misalign/entropy_b": 1316.3250732421875,
"misalign/forward_kl_divergence": 347.66312408447266,
"misalign/forward_kl_divergence_per_token": 0.4745451509952545,
"misalign/gamma_abs_times_reward_std": 67711939.75,
"misalign/gamma_bracketed_rate": 0.9849048256874084,
"misalign/gamma_reward_residual": 6.0246247358008986e-05,
"misalign/gamma_star": 116100731.0,
"misalign/js_divergence": 43.16923809051514,
"misalign/reverse_kl_divergence": 182.91918754577637,
"misalign/reverse_kl_divergence_per_token": 0.22102734073996544,
"misalign/reward_a": 119.16459369659424,
"misalign/reward_b": -17.058857798576355,
"misalign/reward_improvement": 136.22340965270996,
"misalign/reward_improvement_over_reverse_kl": 0.6773070022463799,
"misalign/reward_improvement_per_token": 0.09231107356026769,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -541.3807773590088,
"misalign/reward_vocab_std": 752.7018508911133,
"misalign/symmetric_kl": 530.5825958251953,
"misalign/tv_distance": 137.86173248291016,
"num_tokens": 6793032.0,
"rewards/accuracies": 0.96875,
"rewards/chosen": -15.797016501426697,
"rewards/margins": 46.75089979171753,
"rewards/rejected": -62.547916412353516,
"step": 40,
"support/residual_count": 151893.34375,
"support/residual_mass_policy": 0.02554402849636972,
"support/residual_mass_reference": 0.04131174925714731,
"support/residual_reward": -0.43343046586960554,
"support/runtime_width": 42.65411186218262,
"support/sampled_loser_rank": 0.635368824005127,
"support/sampled_reward_rank": -0.3783828802406788,
"support/sampled_token_added_rate": 0.03763708798214793,
"support/sampled_winner_rank": 0.632044330239296,
"support/selected_width": 42.65411186218262,
"support/stored_width": 42.65411186218262
},
{
"epoch": 0.640625,
"grad_norm": 123.83010864257812,
"kl/sequence_policy_ref": -379.3154487609863,
"kl/vocab_forward": 331.34838104248047,
"kl/vocab_js": 39.656344413757324,
"kl/vocab_reverse": 165.32555389404297,
"kl/vocab_symmetric": 496.6741180419922,
"learning_rate": 2e-06,
"logps/chosen": -821.5552520751953,
"logps/rejected": -2054.2618560791016,
"loss": 2.1358,
"loss/dpo": 0.3257599932614975,
"misalign/J": 181.00886344909668,
"misalign/J_aux_loss": 1.8100886344909668,
"misalign/J_aux_loss_raw": 181.00886344909668,
"misalign/J_over_reverse_kl": 1.6241952329874039,
"misalign/J_per_token": 0.2575971782207489,
"misalign/compressed_reward_absmax": 4115.048797607422,
"misalign/compressed_reward_range": 5863.2059326171875,
"misalign/entropy_a": 867.4101486206055,
"misalign/entropy_b": 1317.5307006835938,
"misalign/forward_kl_divergence": 331.34838104248047,
"misalign/forward_kl_divergence_per_token": 0.5753209926187992,
"misalign/gamma_abs_times_reward_std": 46303981.5,
"misalign/gamma_bracketed_rate": 0.9869667664170265,
"misalign/gamma_reward_residual": 8.94081304068095e-05,
"misalign/gamma_star": 55120054.25,
"misalign/js_divergence": 39.656344413757324,
"misalign/reverse_kl_divergence": 165.32555389404297,
"misalign/reverse_kl_divergence_per_token": 0.2224746011197567,
"misalign/reward_a": 104.61750793457031,
"misalign/reward_b": -15.681229546666145,
"misalign/reward_improvement": 120.29868698120117,
"misalign/reward_improvement_over_reverse_kl": 0.6759593263268471,
"misalign/reward_improvement_per_token": 0.08714451128616929,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -421.68537425994873,
"misalign/reward_vocab_std": 710.1155014038086,
"misalign/symmetric_kl": 496.6741180419922,
"misalign/tv_distance": 129.80499076843262,
"num_tokens": 6964060.0,
"rewards/accuracies": 0.96875,
"rewards/chosen": -13.713181614875793,
"rewards/margins": 48.436728954315186,
"rewards/rejected": -62.14991092681885,
"step": 41,
"support/residual_count": 151893.20703125,
"support/residual_mass_policy": 0.031454769195988774,
"support/residual_mass_reference": 0.04673444852232933,
"support/residual_reward": -0.3744997123721987,
"support/runtime_width": 42.79198360443115,
"support/sampled_loser_rank": 0.6497581750154495,
"support/sampled_reward_rank": -0.433025848120451,
"support/sampled_token_added_rate": 0.039118685061112046,
"support/sampled_winner_rank": 0.6626867726445198,
"support/selected_width": 42.79198360443115,
"support/stored_width": 42.79198360443115
},
{
"epoch": 0.65625,
"grad_norm": 150.85992431640625,
"kl/sequence_policy_ref": -384.6586265563965,
"kl/vocab_forward": 345.101411819458,
"kl/vocab_js": 41.772791624069214,
"kl/vocab_reverse": 176.38330841064453,
"kl/vocab_symmetric": 521.48486328125,
"learning_rate": 2e-06,
"logps/chosen": -863.8736877441406,
"logps/rejected": -1970.995620727539,
"loss": 1.7582,
"loss/dpo": 0.017987981137206566,
"misalign/J": 174.01841640472412,
"misalign/J_aux_loss": 1.7401841282844543,
"misalign/J_aux_loss_raw": 174.01841640472412,
"misalign/J_over_reverse_kl": 1.5599696189165115,
"misalign/J_per_token": 0.22202685475349426,
"misalign/compressed_reward_absmax": 4183.230377197266,
"misalign/compressed_reward_range": 5966.482116699219,
"misalign/entropy_a": 820.5932464599609,
"misalign/entropy_b": 1289.2757186889648,
"misalign/forward_kl_divergence": 345.101411819458,
"misalign/forward_kl_divergence_per_token": 0.5078705288469791,
"misalign/gamma_abs_times_reward_std": 39975210.5,
"misalign/gamma_bracketed_rate": 0.9893263578414917,
"misalign/gamma_reward_residual": 7.978630480920401e-05,
"misalign/gamma_star": 61503279.625,
"misalign/js_divergence": 41.772791624069214,
"misalign/reverse_kl_divergence": 176.38330841064453,
"misalign/reverse_kl_divergence_per_token": 0.2160295583307743,
"misalign/reward_a": 117.82565593719482,
"misalign/reward_b": -13.596548825502396,
"misalign/reward_improvement": 131.42217826843262,
"misalign/reward_improvement_over_reverse_kl": 0.661291316151619,
"misalign/reward_improvement_per_token": 0.08064441289752722,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -524.60085105896,
"misalign/reward_vocab_std": 725.3326187133789,
"misalign/symmetric_kl": 521.48486328125,
"misalign/tv_distance": 134.1311206817627,
"num_tokens": 7130526.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -14.464305222034454,
"rewards/margins": 48.003116607666016,
"rewards/rejected": -62.467421531677246,
"step": 42,
"support/residual_count": 151893.248046875,
"support/residual_mass_policy": 0.023710966343060136,
"support/residual_mass_reference": 0.03921722201630473,
"support/residual_reward": -0.5094128809869289,
"support/runtime_width": 42.75533056259155,
"support/sampled_loser_rank": 0.6201684325933456,
"support/sampled_reward_rank": -0.38710433803498745,
"support/sampled_token_added_rate": 0.034145432990044355,
"support/sampled_winner_rank": 0.6373011693358421,
"support/selected_width": 42.75533056259155,
"support/stored_width": 42.75533056259155
},
{
"epoch": 0.671875,
"grad_norm": 80.83043670654297,
"kl/sequence_policy_ref": -430.89680099487305,
"kl/vocab_forward": 383.08553886413574,
"kl/vocab_js": 42.64789915084839,
"kl/vocab_reverse": 176.26643562316895,
"kl/vocab_symmetric": 559.3521347045898,
"learning_rate": 2e-06,
"logps/chosen": -795.0814590454102,
"logps/rejected": -2073.403335571289,
"loss": 1.947,
"loss/dpo": 0.20042237156056064,
"misalign/J": 174.65477752685547,
"misalign/J_aux_loss": 1.7465477734804153,
"misalign/J_aux_loss_raw": 174.65477752685547,
"misalign/J_over_reverse_kl": 1.2193461656570435,
"misalign/J_per_token": 0.21656284667551517,
"misalign/compressed_reward_absmax": 4145.209014892578,
"misalign/compressed_reward_range": 5894.319396972656,
"misalign/entropy_a": 773.8354606628418,
"misalign/entropy_b": 1241.6957702636719,
"misalign/forward_kl_divergence": 383.08553886413574,
"misalign/forward_kl_divergence_per_token": 0.6106686592102051,
"misalign/gamma_abs_times_reward_std": 42811992.0,
"misalign/gamma_bracketed_rate": 0.9881041571497917,
"misalign/gamma_reward_residual": 0.0022615561604197865,
"misalign/gamma_star": 57841907.5,
"misalign/js_divergence": 42.64789915084839,
"misalign/reverse_kl_divergence": 176.26643562316895,
"misalign/reverse_kl_divergence_per_token": 0.20993488654494286,
"misalign/reward_a": 112.22536277770996,
"misalign/reward_b": -13.715306758880615,
"misalign/reward_improvement": 125.94063758850098,
"misalign/reward_improvement_over_reverse_kl": 0.7269639819860458,
"misalign/reward_improvement_per_token": 0.13059457764029503,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -529.42356300354,
"misalign/reward_vocab_std": 729.1826972961426,
"misalign/symmetric_kl": 559.3521347045898,
"misalign/tv_distance": 134.69511699676514,
"num_tokens": 7295429.0,
"rewards/accuracies": 0.96875,
"rewards/chosen": -16.980572760105133,
"rewards/margins": 52.21821975708008,
"rewards/rejected": -69.19879198074341,
"step": 43,
"support/residual_count": 151893.271484375,
"support/residual_mass_policy": 0.020681084133684635,
"support/residual_mass_reference": 0.03672680747695267,
"support/residual_reward": -0.5217800214886665,
"support/runtime_width": 42.73170518875122,
"support/sampled_loser_rank": 0.6487637758255005,
"support/sampled_reward_rank": -0.33589007146656513,
"support/sampled_token_added_rate": 0.0326521759852767,
"support/sampled_winner_rank": 0.6681758984923363,
"support/selected_width": 42.73170518875122,
"support/stored_width": 42.73170518875122
},
{
"epoch": 0.6875,
"grad_norm": 126.96430206298828,
"kl/sequence_policy_ref": -369.60080337524414,
"kl/vocab_forward": 333.5518455505371,
"kl/vocab_js": 38.65760946273804,
"kl/vocab_reverse": 158.4586296081543,
"kl/vocab_symmetric": 492.01073837280273,
"learning_rate": 2e-06,
"logps/chosen": -845.3204116821289,
"logps/rejected": -1843.438232421875,
"loss": 2.4664,
"loss/dpo": 0.5317850863039725,
"misalign/J": 193.46406745910645,
"misalign/J_aux_loss": 1.9346406310796738,
"misalign/J_aux_loss_raw": 193.46406745910645,
"misalign/J_over_reverse_kl": 1.8410249948501587,
"misalign/J_per_token": 0.21784362383186817,
"misalign/compressed_reward_absmax": 4217.687438964844,
"misalign/compressed_reward_range": 6031.9879150390625,
"misalign/entropy_a": 751.3342437744141,
"misalign/entropy_b": 1211.2969436645508,
"misalign/forward_kl_divergence": 333.5518455505371,
"misalign/forward_kl_divergence_per_token": 0.4570343755185604,
"misalign/gamma_abs_times_reward_std": 54839250.375,
"misalign/gamma_bracketed_rate": 0.9886480942368507,
"misalign/gamma_reward_residual": 0.0009870923743164894,
"misalign/gamma_star": 66258410.125,
"misalign/js_divergence": 38.65760946273804,
"misalign/reverse_kl_divergence": 158.4586296081543,
"misalign/reverse_kl_divergence_per_token": 0.20796416513621807,
"misalign/reward_a": 101.46306848526001,
"misalign/reward_b": -14.63782051205635,
"misalign/reward_improvement": 116.10085487365723,
"misalign/reward_improvement_over_reverse_kl": 0.6505585312843323,
"misalign/reward_improvement_per_token": 0.08616658858954906,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -558.583703994751,
"misalign/reward_vocab_std": 734.9819030761719,
"misalign/symmetric_kl": 492.01073837280273,
"misalign/tv_distance": 125.39980697631836,
"num_tokens": 7467937.0,
"rewards/accuracies": 0.984375,
"rewards/chosen": -16.166881680488586,
"rewards/margins": 41.5863995552063,
"rewards/rejected": -57.75328016281128,
"step": 44,
"support/residual_count": 151893.1171875,
"support/residual_mass_policy": 0.02624459331855178,
"support/residual_mass_reference": 0.04203576873987913,
"support/residual_reward": -0.41040224581956863,
"support/runtime_width": 42.882601737976074,
"support/sampled_loser_rank": 0.597820907831192,
"support/sampled_reward_rank": -0.4271346926689148,
"support/sampled_token_added_rate": 0.03776927734725177,
"support/sampled_winner_rank": 0.6288959942758083,
"support/selected_width": 42.882601737976074,
"support/stored_width": 42.882601737976074
},
{
"epoch": 0.703125,
"grad_norm": 263.4734191894531,
"kl/sequence_policy_ref": -398.33577728271484,
"kl/vocab_forward": 362.0641288757324,
"kl/vocab_js": 40.054423570632935,
"kl/vocab_reverse": 163.50957679748535,
"kl/vocab_symmetric": 525.5739707946777,
"learning_rate": 2e-06,
"logps/chosen": -861.9217529296875,
"logps/rejected": -1947.7921600341797,
"loss": 2.821,
"loss/dpo": 0.805006888877976,
"misalign/J": 201.59605598449707,
"misalign/J_aux_loss": 2.0159604400396347,
"misalign/J_aux_loss_raw": 201.59605598449707,
"misalign/J_over_reverse_kl": 1.4087589755654335,
"misalign/J_per_token": 0.23312609270215034,
"misalign/compressed_reward_absmax": 4261.386077880859,
"misalign/compressed_reward_range": 5968.6805419921875,
"misalign/entropy_a": 787.204231262207,
"misalign/entropy_b": 1236.5688171386719,
"misalign/forward_kl_divergence": 362.0641288757324,
"misalign/forward_kl_divergence_per_token": 0.5964512750506401,
"misalign/gamma_abs_times_reward_std": 58279984.25,
"misalign/gamma_bracketed_rate": 0.9880961626768112,
"misalign/gamma_reward_residual": 0.0008914316013033385,
"misalign/gamma_star": 100481896.0,
"misalign/js_divergence": 40.054423570632935,
"misalign/reverse_kl_divergence": 163.50957679748535,
"misalign/reverse_kl_divergence_per_token": 0.22092271596193314,
"misalign/reward_a": 105.81989192962646,
"misalign/reward_b": -13.749351739883423,
"misalign/reward_improvement": 119.56921005249023,
"misalign/reward_improvement_over_reverse_kl": 0.6199643798172474,
"misalign/reward_improvement_per_token": 0.08543467940762639,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -543.9455938339233,
"misalign/reward_vocab_std": 719.025936126709,
"misalign/symmetric_kl": 525.5739707946777,
"misalign/tv_distance": 128.89692497253418,
"num_tokens": 7641400.0,
"rewards/accuracies": 0.953125,
"rewards/chosen": -18.768484711647034,
"rewards/margins": 42.13018798828125,
"rewards/rejected": -60.89867305755615,
"step": 45,
"support/residual_count": 151893.509765625,
"support/residual_mass_policy": 0.023412443813867867,
"support/residual_mass_reference": 0.03834630874916911,
"support/residual_reward": -0.5029764696955681,
"support/runtime_width": 42.48880910873413,
"support/sampled_loser_rank": 0.6054023541510105,
"support/sampled_reward_rank": -0.30370173789560795,
"support/sampled_token_added_rate": 0.03430053312331438,
"support/sampled_winner_rank": 0.6158883348107338,
"support/selected_width": 42.48880910873413,
"support/stored_width": 42.48880910873413
},
{
"epoch": 0.71875,
"grad_norm": 99.2862548828125,
"kl/sequence_policy_ref": -352.9979362487793,
"kl/vocab_forward": 320.0415687561035,
"kl/vocab_js": 35.314189434051514,
"kl/vocab_reverse": 143.37086391448975,
"kl/vocab_symmetric": 463.41268157958984,
"learning_rate": 2e-06,
"logps/chosen": -780.6201934814453,
"logps/rejected": -1745.756332397461,
"loss": 1.967,
"loss/dpo": 0.3397530964894031,
"misalign/J": 162.72915935516357,
"misalign/J_aux_loss": 1.6272915750741959,
"misalign/J_aux_loss_raw": 162.72915935516357,
"misalign/J_over_reverse_kl": 1.814868986606598,
"misalign/J_per_token": 0.19831308163702488,
"misalign/compressed_reward_absmax": 3964.359344482422,
"misalign/compressed_reward_range": 5592.5999755859375,
"misalign/entropy_a": 701.7979049682617,
"misalign/entropy_b": 1128.1244354248047,
"misalign/forward_kl_divergence": 320.0415687561035,
"misalign/forward_kl_divergence_per_token": 0.5070049501955509,
"misalign/gamma_abs_times_reward_std": 43917063.0,
"misalign/gamma_bracketed_rate": 0.9886893406510353,
"misalign/gamma_reward_residual": 0.00017012334137689322,
"misalign/gamma_star": 66023362.0,
"misalign/js_divergence": 35.314189434051514,
"misalign/reverse_kl_divergence": 143.37086391448975,
"misalign/reverse_kl_divergence_per_token": 0.18735219724476337,
"misalign/reward_a": 90.78490257263184,
"misalign/reward_b": -14.648303270339966,
"misalign/reward_improvement": 105.43315505981445,
"misalign/reward_improvement_over_reverse_kl": 0.6790562570095062,
"misalign/reward_improvement_per_token": 0.09955848660320044,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -502.35912132263184,
"misalign/reward_vocab_std": 681.5238342285156,
"misalign/symmetric_kl": 463.41268157958984,
"misalign/tv_distance": 114.9539966583252,
"num_tokens": 7808320.0,
"rewards/accuracies": 0.9375,
"rewards/chosen": -15.536956906318665,
"rewards/margins": 39.525673627853394,
"rewards/rejected": -55.06262969970703,
"step": 46,
"support/residual_count": 151893.275390625,
"support/residual_mass_policy": 0.027146896114572883,
"support/residual_mass_reference": 0.04411950474604964,
"support/residual_reward": -0.5129662416875362,
"support/runtime_width": 42.72548723220825,
"support/sampled_loser_rank": 0.5832961872220039,
"support/sampled_reward_rank": -0.366660688072443,
"support/sampled_token_added_rate": 0.03585506067611277,
"support/sampled_winner_rank": 0.6062828227877617,
"support/selected_width": 42.72548723220825,
"support/stored_width": 42.72548723220825
},
{
"epoch": 0.734375,
"grad_norm": 95.33216857910156,
"kl/sequence_policy_ref": -339.36829376220703,
"kl/vocab_forward": 309.0083351135254,
"kl/vocab_js": 34.39964842796326,
"kl/vocab_reverse": 138.5347490310669,
"kl/vocab_symmetric": 447.5432891845703,
"learning_rate": 2e-06,
"logps/chosen": -834.6276512145996,
"logps/rejected": -1626.2992553710938,
"loss": 2.1621,
"loss/dpo": 0.5789944041461155,
"misalign/J": 158.3114309310913,
"misalign/J_aux_loss": 1.5831142514944077,
"misalign/J_aux_loss_raw": 158.3114309310913,
"misalign/J_over_reverse_kl": 1.3874521106481552,
"misalign/J_per_token": 0.2954816836863756,
"misalign/compressed_reward_absmax": 3782.6796264648438,
"misalign/compressed_reward_range": 5458.201934814453,
"misalign/entropy_a": 683.5626831054688,
"misalign/entropy_b": 1110.1263809204102,
"misalign/forward_kl_divergence": 309.0083351135254,
"misalign/forward_kl_divergence_per_token": 0.7647044509649277,
"misalign/gamma_abs_times_reward_std": 42300354.5,
"misalign/gamma_bracketed_rate": 0.9892060980200768,
"misalign/gamma_reward_residual": 0.00012735200425595394,
"misalign/gamma_star": 70152250.0,
"misalign/js_divergence": 34.39964842796326,
"misalign/reverse_kl_divergence": 138.5347490310669,
"misalign/reverse_kl_divergence_per_token": 0.3200199883431196,
"misalign/reward_a": 96.08270931243896,
"misalign/reward_b": -11.616803467273712,
"misalign/reward_improvement": 107.69949054718018,
"misalign/reward_improvement_over_reverse_kl": 0.6338916420936584,
"misalign/reward_improvement_per_token": 0.0737753571011126,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -587.4529418945312,
"misalign/reward_vocab_std": 666.2506561279297,
"misalign/symmetric_kl": 447.5432891845703,
"misalign/tv_distance": 114.28812599182129,
"num_tokens": 7963180.0,
"rewards/accuracies": 0.953125,
"rewards/chosen": -15.032280445098877,
"rewards/margins": 37.809099435806274,
"rewards/rejected": -52.84138059616089,
"step": 47,
"support/residual_count": 151893.236328125,
"support/residual_mass_policy": 0.02375667286105454,
"support/residual_mass_reference": 0.04523510206490755,
"support/residual_reward": -0.6332408636808395,
"support/runtime_width": 42.76191568374634,
"support/sampled_loser_rank": 0.6728832796216011,
"support/sampled_reward_rank": -0.326838955283165,
"support/sampled_token_added_rate": 0.029435024363920093,
"support/sampled_winner_rank": 0.7073031365871429,
"support/selected_width": 42.76191568374634,
"support/stored_width": 42.76191568374634
},
{
"epoch": 0.75,
"grad_norm": 92.29680633544922,
"kl/sequence_policy_ref": -349.18139266967773,
"kl/vocab_forward": 318.0182914733887,
"kl/vocab_js": 36.353710412979126,
"kl/vocab_reverse": 146.12834548950195,
"kl/vocab_symmetric": 464.14682960510254,
"learning_rate": 2e-06,
"logps/chosen": -845.9873504638672,
"logps/rejected": -1924.1956024169922,
"loss": 1.6528,
"loss/dpo": 0.049382371892768256,
"misalign/J": 160.33691692352295,
"misalign/J_aux_loss": 1.6033690869808197,
"misalign/J_aux_loss_raw": 160.33691692352295,
"misalign/J_over_reverse_kl": 1.2529658675193787,
"misalign/J_per_token": 0.20762860495597124,
"misalign/compressed_reward_absmax": 4354.188568115234,
"misalign/compressed_reward_range": 6215.323059082031,
"misalign/entropy_a": 822.5513381958008,
"misalign/entropy_b": 1289.558578491211,
"misalign/forward_kl_divergence": 318.0182914733887,
"misalign/forward_kl_divergence_per_token": 0.440604854375124,
"misalign/gamma_abs_times_reward_std": 41074954.5,
"misalign/gamma_bracketed_rate": 0.9895801991224289,
"misalign/gamma_reward_residual": 0.00018931449562842317,
"misalign/gamma_star": 57840401.875,
"misalign/js_divergence": 36.353710412979126,
"misalign/reverse_kl_divergence": 146.12834548950195,
"misalign/reverse_kl_divergence_per_token": 0.18436118587851524,
"misalign/reward_a": 101.43512630462646,
"misalign/reward_b": -11.868624448776245,
"misalign/reward_improvement": 113.30374765396118,
"misalign/reward_improvement_over_reverse_kl": 0.7551356106996536,
"misalign/reward_improvement_per_token": 0.1261532404460013,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -493.02664852142334,
"misalign/reward_vocab_std": 759.0527877807617,
"misalign/symmetric_kl": 464.14682960510254,
"misalign/tv_distance": 123.64643812179565,
"num_tokens": 8135261.0,
"rewards/accuracies": 0.984375,
"rewards/chosen": -14.46340036392212,
"rewards/margins": 40.90948009490967,
"rewards/rejected": -55.37287950515747,
"step": 48,
"support/residual_count": 151893.193359375,
"support/residual_mass_policy": 0.026260258397087455,
"support/residual_mass_reference": 0.042555712163448334,
"support/residual_reward": -0.47505020070821047,
"support/runtime_width": 42.80619525909424,
"support/sampled_loser_rank": 0.625778254121542,
"support/sampled_reward_rank": -0.3972127176821232,
"support/sampled_token_added_rate": 0.03378989826887846,
"support/sampled_winner_rank": 0.6260612569749355,
"support/selected_width": 42.80619525909424,
"support/stored_width": 42.80619525909424
},
{
"epoch": 0.75,
"eval_kl/sequence_policy_ref": -364.4170129299164,
"eval_kl/vocab_forward": 328.8754801750183,
"eval_kl/vocab_js": 37.498633831739426,
"eval_kl/vocab_reverse": 150.4579164981842,
"eval_kl/vocab_symmetric": 479.3336043357849,
"eval_logps/chosen": -882.4125304222107,
"eval_logps/rejected": -1946.6873836517334,
"eval_loss": 1.7490763664245605,
"eval_loss/dpo": 0.06740551616172231,
"eval_misalign/J": 168.167093873024,
"eval_misalign/J_aux_loss": 1.6816708873957396,
"eval_misalign/J_aux_loss_raw": 168.167093873024,
"eval_misalign/J_over_reverse_kl": 1.7788694016635418,
"eval_misalign/J_per_token": 0.22979121375828981,
"eval_misalign/compressed_reward_absmax": 4236.706272125244,
"eval_misalign/compressed_reward_range": 6027.056537628174,
"eval_misalign/entropy_a": 823.1465611457825,
"eval_misalign/entropy_b": 1298.0544729232788,
"eval_misalign/forward_kl_divergence": 328.8754801750183,
"eval_misalign/forward_kl_divergence_per_token": 0.4856905459892005,
"eval_misalign/gamma_abs_times_reward_std": 43424142.421875,
"eval_misalign/gamma_bracketed_rate": 0.9878035467118025,
"eval_misalign/gamma_reward_residual": 0.0007769855760599853,
"eval_misalign/gamma_star": 61287659.90625,
"eval_misalign/js_divergence": 37.498633831739426,
"eval_misalign/reverse_kl_divergence": 150.4579164981842,
"eval_misalign/reverse_kl_divergence_per_token": 0.19300507346633822,
"eval_misalign/reward_a": 101.7501335144043,
"eval_misalign/reward_b": -14.307281976565719,
"eval_misalign/reward_improvement": 116.05739098787308,
"eval_misalign/reward_improvement_over_reverse_kl": 0.7271916684694588,
"eval_misalign/reward_improvement_per_token": 0.09193722825148143,
"eval_misalign/reward_signal_low_rate": 0.0,
"eval_misalign/reward_vocab_mean": -490.11554992198944,
"eval_misalign/reward_vocab_std": 731.32026720047,
"eval_misalign/symmetric_kl": 479.3336043357849,
"eval_misalign/tv_distance": 127.07569408416748,
"eval_rewards/accuracies": 0.984375,
"eval_rewards/chosen": -15.323657296597958,
"eval_rewards/margins": 42.23608899116516,
"eval_rewards/rejected": -57.55974632501602,
"eval_runtime": 100.8337,
"eval_samples_per_second": 5.078,
"eval_steps_per_second": 0.635,
"eval_support/residual_count": 151893.29125976562,
"eval_support/residual_mass_policy": 0.025306705734692514,
"eval_support/residual_mass_reference": 0.04279232310364023,
"eval_support/residual_reward": -0.4695481152739376,
"eval_support/runtime_width": 42.70963191986084,
"eval_support/sampled_loser_rank": 0.6486562248319387,
"eval_support/sampled_reward_rank": -0.37071577250026166,
"eval_support/sampled_token_added_rate": 0.037317203474231064,
"eval_support/sampled_winner_rank": 0.6506854901090264,
"eval_support/selected_width": 42.70963191986084,
"eval_support/stored_width": 42.70963191986084,
"step": 48
},
{
"epoch": 0.765625,
"grad_norm": 84.55875396728516,
"kl/sequence_policy_ref": -355.52197265625,
"kl/vocab_forward": 323.58666229248047,
"kl/vocab_js": 38.03043556213379,
"kl/vocab_reverse": 151.7306032180786,
"kl/vocab_symmetric": 475.31740951538086,
"learning_rate": 2e-06,
"logps/chosen": -917.9943237304688,
"logps/rejected": -1895.99658203125,
"loss": 1.6704,
"loss/dpo": 0.04557584371703283,
"misalign/J": 162.48288917541504,
"misalign/J_aux_loss": 1.624828889966011,
"misalign/J_aux_loss_raw": 162.48288917541504,
"misalign/J_over_reverse_kl": 1.4654640778899193,
"misalign/J_per_token": 0.20114040188491344,
"misalign/compressed_reward_absmax": 4219.985290527344,
"misalign/compressed_reward_range": 5980.884826660156,
"misalign/entropy_a": 811.0443420410156,
"misalign/entropy_b": 1299.4309158325195,
"misalign/forward_kl_divergence": 323.58666229248047,
"misalign/forward_kl_divergence_per_token": 0.399110134691,
"misalign/gamma_abs_times_reward_std": 53172764.0,
"misalign/gamma_bracketed_rate": 0.9869352579116821,
"misalign/gamma_reward_residual": 3.9546100197185297e-05,
"misalign/gamma_star": 60015785.75,
"misalign/js_divergence": 38.03043556213379,
"misalign/reverse_kl_divergence": 151.7306032180786,
"misalign/reverse_kl_divergence_per_token": 0.16637993790209293,
"misalign/reward_a": 105.46696472167969,
"misalign/reward_b": -16.47221863269806,
"misalign/reward_improvement": 121.9391622543335,
"misalign/reward_improvement_over_reverse_kl": 0.7262164875864983,
"misalign/reward_improvement_per_token": 0.10510652232915163,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -500.18709564208984,
"misalign/reward_vocab_std": 730.5291976928711,
"misalign/symmetric_kl": 475.31740951538086,
"misalign/tv_distance": 129.61585903167725,
"num_tokens": 8305232.0,
"rewards/accuracies": 0.984375,
"rewards/chosen": -15.759445548057556,
"rewards/margins": 39.58550405502319,
"rewards/rejected": -55.34494924545288,
"step": 49,
"support/residual_count": 151893.353515625,
"support/residual_mass_policy": 0.022411803365685046,
"support/residual_mass_reference": 0.03780948673374951,
"support/residual_reward": -0.5111432895064354,
"support/runtime_width": 42.64701747894287,
"support/sampled_loser_rank": 0.6062813438475132,
"support/sampled_reward_rank": -0.361979590728879,
"support/sampled_token_added_rate": 0.03365356335416436,
"support/sampled_winner_rank": 0.626273512840271,
"support/selected_width": 42.64701747894287,
"support/stored_width": 42.64701747894287
},
{
"epoch": 0.78125,
"grad_norm": 120.11223602294922,
"kl/sequence_policy_ref": -378.5549774169922,
"kl/vocab_forward": 352.6853256225586,
"kl/vocab_js": 37.74899077415466,
"kl/vocab_reverse": 148.82802867889404,
"kl/vocab_symmetric": 501.5135078430176,
"learning_rate": 2e-06,
"logps/chosen": -884.3883514404297,
"logps/rejected": -1912.2733612060547,
"loss": 2.1813,
"loss/dpo": 0.1028611735179559,
"misalign/J": 207.84398746490479,
"misalign/J_aux_loss": 2.078439861536026,
"misalign/J_aux_loss_raw": 207.84398746490479,
"misalign/J_over_reverse_kl": 1.7021770626306534,
"misalign/J_per_token": 0.2538851350545883,
"misalign/compressed_reward_absmax": 4287.376739501953,
"misalign/compressed_reward_range": 6082.450378417969,
"misalign/entropy_a": 770.8007431030273,
"misalign/entropy_b": 1264.5378875732422,
"misalign/forward_kl_divergence": 352.6853256225586,
"misalign/forward_kl_divergence_per_token": 0.5054213367402554,
"misalign/gamma_abs_times_reward_std": 62323988.25,
"misalign/gamma_bracketed_rate": 0.985651396214962,
"misalign/gamma_reward_residual": 0.002676691350643523,
"misalign/gamma_star": 81304675.0,
"misalign/js_divergence": 37.74899077415466,
"misalign/reverse_kl_divergence": 148.82802867889404,
"misalign/reverse_kl_divergence_per_token": 0.19959929399192333,
"misalign/reward_a": 91.68502187728882,
"misalign/reward_b": -13.228169560432434,
"misalign/reward_improvement": 104.91317367553711,
"misalign/reward_improvement_over_reverse_kl": 0.6395450867712498,
"misalign/reward_improvement_per_token": 0.07659031543880701,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -545.8368339538574,
"misalign/reward_vocab_std": 735.2048110961914,
"misalign/symmetric_kl": 501.5135078430176,
"misalign/tv_distance": 127.10494995117188,
"num_tokens": 8481546.0,
"rewards/accuracies": 0.984375,
"rewards/chosen": -16.9174667596817,
"rewards/margins": 41.87606382369995,
"rewards/rejected": -58.793529987335205,
"step": 50,
"support/residual_count": 151893.240234375,
"support/residual_mass_policy": 0.028523427667096257,
"support/residual_mass_reference": 0.04290076903998852,
"support/residual_reward": -0.4372365176677704,
"support/runtime_width": 42.76475811004639,
"support/sampled_loser_rank": 0.5924237333238125,
"support/sampled_reward_rank": -0.40600746124982834,
"support/sampled_token_added_rate": 0.03576831449754536,
"support/sampled_winner_rank": 0.618581123650074,
"support/selected_width": 42.76475811004639,
"support/stored_width": 42.76475811004639
},
{
"epoch": 0.796875,
"grad_norm": 74.84430694580078,
"kl/sequence_policy_ref": -348.85993576049805,
"kl/vocab_forward": 322.82544136047363,
"kl/vocab_js": 34.94163501262665,
"kl/vocab_reverse": 137.33496236801147,
"kl/vocab_symmetric": 460.160680770874,
"learning_rate": 2e-06,
"logps/chosen": -807.7403802871704,
"logps/rejected": -1745.3731842041016,
"loss": 2.0267,
"loss/dpo": 0.47453043650421023,
"misalign/J": 155.21207237243652,
"misalign/J_aux_loss": 1.5521207079291344,
"misalign/J_aux_loss_raw": 155.21207237243652,
"misalign/J_over_reverse_kl": 1.433588370680809,
"misalign/J_per_token": 0.2532362565398216,
"misalign/compressed_reward_absmax": 3888.342071533203,
"misalign/compressed_reward_range": 5480.323181152344,
"misalign/entropy_a": 695.3439636230469,
"misalign/entropy_b": 1156.5927391052246,
"misalign/forward_kl_divergence": 322.82544136047363,
"misalign/forward_kl_divergence_per_token": 0.725763525813818,
"misalign/gamma_abs_times_reward_std": 41670287.75,
"misalign/gamma_bracketed_rate": 0.9877297207713127,
"misalign/gamma_reward_residual": -6.590514796300795e-05,
"misalign/gamma_star": 57525834.875,
"misalign/js_divergence": 34.94163501262665,
"misalign/reverse_kl_divergence": 137.33496236801147,
"misalign/reverse_kl_divergence_per_token": 0.2455148883163929,
"misalign/reward_a": 83.87791728973389,
"misalign/reward_b": -14.364118754863739,
"misalign/reward_improvement": 98.24201107025146,
"misalign/reward_improvement_over_reverse_kl": 0.5628711394965649,
"misalign/reward_improvement_per_token": 0.03299418743699789,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -404.0699119567871,
"misalign/reward_vocab_std": 662.7630424499512,
"misalign/symmetric_kl": 460.160680770874,
"misalign/tv_distance": 116.86641025543213,
"num_tokens": 8645585.0,
"rewards/accuracies": 0.96875,
"rewards/chosen": -16.726518213748932,
"rewards/margins": 36.31895399093628,
"rewards/rejected": -53.045472145080566,
"step": 51,
"support/residual_count": 151893.197265625,
"support/residual_mass_policy": 0.024921999080106616,
"support/residual_mass_reference": 0.04474550345912576,
"support/residual_reward": -0.4795303028076887,
"support/runtime_width": 42.801236152648926,
"support/sampled_loser_rank": 0.624417282640934,
"support/sampled_reward_rank": -0.37105782236903906,
"support/sampled_token_added_rate": 0.032293472439050674,
"support/sampled_winner_rank": 0.6685933172702789,
"support/selected_width": 42.801236152648926,
"support/stored_width": 42.801236152648926
},
{
"epoch": 0.8125,
"grad_norm": 86.25784301757812,
"kl/sequence_policy_ref": -477.46276092529297,
"kl/vocab_forward": 436.0279541015625,
"kl/vocab_js": 44.9802827835083,
"kl/vocab_reverse": 177.54276657104492,
"kl/vocab_symmetric": 613.5709609985352,
"learning_rate": 2e-06,
"logps/chosen": -881.7166900634766,
"logps/rejected": -2414.805862426758,
"loss": 2.5689,
"loss/dpo": 0.6295568409377225,
"misalign/J": 193.93076133728027,
"misalign/J_aux_loss": 1.939307525753975,
"misalign/J_aux_loss_raw": 193.93076133728027,
"misalign/J_over_reverse_kl": 1.406748965382576,
"misalign/J_per_token": 0.2848157715052366,
"misalign/compressed_reward_absmax": 4759.470764160156,
"misalign/compressed_reward_range": 6730.137023925781,
"misalign/entropy_a": 874.9066314697266,
"misalign/entropy_b": 1443.2865905761719,
"misalign/forward_kl_divergence": 436.0279541015625,
"misalign/forward_kl_divergence_per_token": 0.5377647392451763,
"misalign/gamma_abs_times_reward_std": 52484937.5,
"misalign/gamma_bracketed_rate": 0.9844366908073425,
"misalign/gamma_reward_residual": 0.0008822159904866567,
"misalign/gamma_star": 86450198.0,
"misalign/js_divergence": 44.9802827835083,
"misalign/reverse_kl_divergence": 177.54276657104492,
"misalign/reverse_kl_divergence_per_token": 0.1856713891029358,
"misalign/reward_a": 107.41517734527588,
"misalign/reward_b": -19.218833327293396,
"misalign/reward_improvement": 126.63399696350098,
"misalign/reward_improvement_over_reverse_kl": 0.6467054821550846,
"misalign/reward_improvement_per_token": 0.07611742825247347,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -570.9265441894531,
"misalign/reward_vocab_std": 818.5440444946289,
"misalign/symmetric_kl": 613.5709609985352,
"misalign/tv_distance": 146.72683811187744,
"num_tokens": 8830616.0,
"rewards/accuracies": 0.96875,
"rewards/chosen": -20.915815234184265,
"rewards/margins": 53.66092252731323,
"rewards/rejected": -74.57673835754395,
"step": 52,
"support/residual_count": 151893.328125,
"support/residual_mass_policy": 0.02075903001241386,
"support/residual_mass_reference": 0.04033026983961463,
"support/residual_reward": -0.39157247683033347,
"support/runtime_width": 42.66762590408325,
"support/sampled_loser_rank": 0.6271255537867546,
"support/sampled_reward_rank": -0.3960692873224616,
"support/sampled_token_added_rate": 0.0356892254203558,
"support/sampled_winner_rank": 0.6444137506186962,
"support/selected_width": 42.66762590408325,
"support/stored_width": 42.66762590408325
},
{
"epoch": 0.828125,
"grad_norm": 101.57430267333984,
"kl/sequence_policy_ref": -465.77392578125,
"kl/vocab_forward": 446.5703926086426,
"kl/vocab_js": 47.15303373336792,
"kl/vocab_reverse": 184.80498790740967,
"kl/vocab_symmetric": 631.3756484985352,
"learning_rate": 2e-06,
"logps/chosen": -1251.7670364379883,
"logps/rejected": -2130.667724609375,
"loss": 2.2582,
"loss/dpo": 0.29861677209665966,
"misalign/J": 195.95514106750488,
"misalign/J_aux_loss": 1.959551364183426,
"misalign/J_aux_loss_raw": 195.95514106750488,
"misalign/J_over_reverse_kl": 1.1659726202487946,
"misalign/J_per_token": 0.2168925404548645,
"misalign/compressed_reward_absmax": 4992.182373046875,
"misalign/compressed_reward_range": 7108.8575439453125,
"misalign/entropy_a": 898.7776031494141,
"misalign/entropy_b": 1514.9154586791992,
"misalign/forward_kl_divergence": 446.5703926086426,
"misalign/forward_kl_divergence_per_token": 0.5860045477747917,
"misalign/gamma_abs_times_reward_std": 51011083.5,
"misalign/gamma_bracketed_rate": 0.9903441444039345,
"misalign/gamma_reward_residual": -0.0019245314256295387,
"misalign/gamma_star": 85517778.5,
"misalign/js_divergence": 47.15303373336792,
"misalign/reverse_kl_divergence": 184.80498790740967,
"misalign/reverse_kl_divergence_per_token": 0.21940777078270912,
"misalign/reward_a": 114.58473777770996,
"misalign/reward_b": -14.659283697605133,
"misalign/reward_improvement": 129.2440366744995,
"misalign/reward_improvement_over_reverse_kl": 0.6434826478362083,
"misalign/reward_improvement_per_token": 0.05605245754122734,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -762.099235534668,
"misalign/reward_vocab_std": 874.5540542602539,
"misalign/symmetric_kl": 631.3756484985352,
"misalign/tv_distance": 157.77043342590332,
"num_tokens": 9025743.0,
"rewards/accuracies": 0.953125,
"rewards/chosen": -26.162310361862183,
"rewards/margins": 40.83016753196716,
"rewards/rejected": -66.99247980117798,
"step": 53,
"support/residual_count": 151893.298828125,
"support/residual_mass_policy": 0.020821302896365523,
"support/residual_mass_reference": 0.039504863787442446,
"support/residual_reward": -0.5619673319160938,
"support/runtime_width": 42.6985387802124,
"support/sampled_loser_rank": 0.6461238414049149,
"support/sampled_reward_rank": -0.34166772849857807,
"support/sampled_token_added_rate": 0.03369109332561493,
"support/sampled_winner_rank": 0.6555347442626953,
"support/selected_width": 42.6985387802124,
"support/stored_width": 42.6985387802124
},
{
"epoch": 0.84375,
"grad_norm": 298.0096435546875,
"kl/sequence_policy_ref": -386.71077728271484,
"kl/vocab_forward": 365.6785316467285,
"kl/vocab_js": 35.59814095497131,
"kl/vocab_reverse": 139.05288410186768,
"kl/vocab_symmetric": 504.7316131591797,
"learning_rate": 2e-06,
"logps/chosen": -760.6196022033691,
"logps/rejected": -1875.7993927001953,
"loss": 2.171,
"loss/dpo": 0.14888411300079252,
"misalign/J": 202.2076416015625,
"misalign/J_aux_loss": 2.022076301276684,
"misalign/J_aux_loss_raw": 202.2076416015625,
"misalign/J_over_reverse_kl": 2.347415864467621,
"misalign/J_per_token": 0.2071497868746519,
"misalign/compressed_reward_absmax": 4018.2659912109375,
"misalign/compressed_reward_range": 5681.7738037109375,
"misalign/entropy_a": 716.9813385009766,
"misalign/entropy_b": 1167.6287994384766,
"misalign/forward_kl_divergence": 365.6785316467285,
"misalign/forward_kl_divergence_per_token": 0.49891503155231476,
"misalign/gamma_abs_times_reward_std": 61696461.5,
"misalign/gamma_bracketed_rate": 0.9874565973877907,
"misalign/gamma_reward_residual": -0.004232324329905168,
"misalign/gamma_star": 28188861.5,
"misalign/js_divergence": 35.59814095497131,
"misalign/reverse_kl_divergence": 139.05288410186768,
"misalign/reverse_kl_divergence_per_token": 0.17908263765275478,
"misalign/reward_a": 83.29793310165405,
"misalign/reward_b": -13.053004205226898,
"misalign/reward_improvement": 96.35093402862549,
"misalign/reward_improvement_over_reverse_kl": 0.588286180049181,
"misalign/reward_improvement_per_token": 0.07466917904093862,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -496.0263900756836,
"misalign/reward_vocab_std": 698.2202835083008,
"misalign/symmetric_kl": 504.7316131591797,
"misalign/tv_distance": 115.95970249176025,
"num_tokens": 9183237.0,
"rewards/accuracies": 0.953125,
"rewards/chosen": -18.112544775009155,
"rewards/margins": 41.11706781387329,
"rewards/rejected": -59.229612827301025,
"step": 54,
"support/residual_count": 151893.44140625,
"support/residual_mass_policy": 0.02262613782659173,
"support/residual_mass_reference": 0.03800426935777068,
"support/residual_reward": -0.5341151673346758,
"support/runtime_width": 42.55538368225098,
"support/sampled_loser_rank": 0.5614169128239155,
"support/sampled_reward_rank": -0.29479603469371796,
"support/sampled_token_added_rate": 0.03493571188300848,
"support/sampled_winner_rank": 0.5692789405584335,
"support/selected_width": 42.55538368225098,
"support/stored_width": 42.55538368225098
},
{
"epoch": 0.859375,
"grad_norm": 99.84505462646484,
"kl/sequence_policy_ref": -418.7293930053711,
"kl/vocab_forward": 371.4789810180664,
"kl/vocab_js": 35.276673793792725,
"kl/vocab_reverse": 138.0893669128418,
"kl/vocab_symmetric": 509.5685806274414,
"learning_rate": 2e-06,
"logps/chosen": -689.4199485778809,
"logps/rejected": -1892.0952606201172,
"loss": 1.7759,
"loss/dpo": 0.14481948580403764,
"misalign/J": 163.1080617904663,
"misalign/J_aux_loss": 1.6310805529356003,
"misalign/J_aux_loss_raw": 163.1080617904663,
"misalign/J_over_reverse_kl": 1.3716232553124428,
"misalign/J_per_token": 0.2615004684776068,
"misalign/compressed_reward_absmax": 3880.816650390625,
"misalign/compressed_reward_range": 5555.395263671875,
"misalign/entropy_a": 626.7629547119141,
"misalign/entropy_b": 1067.2015533447266,
"misalign/forward_kl_divergence": 371.4789810180664,
"misalign/forward_kl_divergence_per_token": 0.8582677766680717,
"misalign/gamma_abs_times_reward_std": 44081586.0,
"misalign/gamma_bracketed_rate": 0.9872501865029335,
"misalign/gamma_reward_residual": 0.007101273212128945,
"misalign/gamma_star": 65061232.25,
"misalign/js_divergence": 35.276673793792725,
"misalign/reverse_kl_divergence": 138.0893669128418,
"misalign/reverse_kl_divergence_per_token": 0.27060581743717194,
"misalign/reward_a": 76.1998405456543,
"misalign/reward_b": -13.09484925866127,
"misalign/reward_improvement": 89.29467296600342,
"misalign/reward_improvement_over_reverse_kl": 0.5231703817844391,
"misalign/reward_improvement_per_token": 0.035240316297858953,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -446.2036018371582,
"misalign/reward_vocab_std": 662.775749206543,
"misalign/symmetric_kl": 509.5685806274414,
"misalign/tv_distance": 113.11602115631104,
"num_tokens": 9339840.0,
"rewards/accuracies": 0.984375,
"rewards/chosen": -17.44824308156967,
"rewards/margins": 48.849395751953125,
"rewards/rejected": -66.29763889312744,
"step": 55,
"support/residual_count": 151893.3671875,
"support/residual_mass_policy": 0.019164926023222506,
"support/residual_mass_reference": 0.03859002981334925,
"support/residual_reward": -0.47019451297819614,
"support/runtime_width": 42.63320541381836,
"support/sampled_loser_rank": 0.6031154692173004,
"support/sampled_reward_rank": -0.38968720100820065,
"support/sampled_token_added_rate": 0.0316556547768414,
"support/sampled_winner_rank": 0.6063822247087955,
"support/selected_width": 42.63320541381836,
"support/stored_width": 42.63320541381836
},
{
"epoch": 0.875,
"grad_norm": 68.76524353027344,
"kl/sequence_policy_ref": -402.62422370910645,
"kl/vocab_forward": 360.61656951904297,
"kl/vocab_js": 36.79127216339111,
"kl/vocab_reverse": 143.1292266845703,
"kl/vocab_symmetric": 503.7460651397705,
"learning_rate": 2e-06,
"logps/chosen": -796.70947265625,
"logps/rejected": -1951.9369506835938,
"loss": 1.9268,
"loss/dpo": 0.31827243250995924,
"misalign/J": 160.8573293685913,
"misalign/J_aux_loss": 1.608573243021965,
"misalign/J_aux_loss_raw": 160.8573293685913,
"misalign/J_over_reverse_kl": 1.212674729526043,
"misalign/J_per_token": 0.23724722862243652,
"misalign/compressed_reward_absmax": 3996.8917541503906,
"misalign/compressed_reward_range": 5663.479064941406,
"misalign/entropy_a": 717.5296401977539,
"misalign/entropy_b": 1203.4962768554688,
"misalign/forward_kl_divergence": 360.61656951904297,
"misalign/forward_kl_divergence_per_token": 0.7455775737762451,
"misalign/gamma_abs_times_reward_std": 49514251.0,
"misalign/gamma_bracketed_rate": 0.9896730110049248,
"misalign/gamma_reward_residual": 9.679878201040992e-05,
"misalign/gamma_star": 57219496.125,
"misalign/js_divergence": 36.79127216339111,
"misalign/reverse_kl_divergence": 143.1292266845703,
"misalign/reverse_kl_divergence_per_token": 0.26022260822355747,
"misalign/reward_a": 82.29156970977783,
"misalign/reward_b": -11.702201634645462,
"misalign/reward_improvement": 93.99376678466797,
"misalign/reward_improvement_over_reverse_kl": 0.5378214567899704,
"misalign/reward_improvement_per_token": 0.04054644517600536,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -461.6409397125244,
"misalign/reward_vocab_std": 698.131046295166,
"misalign/symmetric_kl": 503.7460651397705,
"misalign/tv_distance": 122.93070888519287,
"num_tokens": 9506436.0,
"rewards/accuracies": 0.953125,
"rewards/chosen": -17.97724825143814,
"rewards/margins": 44.570350646972656,
"rewards/rejected": -62.54759979248047,
"step": 56,
"support/residual_count": 151893.306640625,
"support/residual_mass_policy": 0.020866328850388527,
"support/residual_mass_reference": 0.039209546288475394,
"support/residual_reward": -0.41471402533352375,
"support/runtime_width": 42.69208335876465,
"support/sampled_loser_rank": 0.6182565689086914,
"support/sampled_reward_rank": -0.4434542544186115,
"support/sampled_token_added_rate": 0.03403148171491921,
"support/sampled_winner_rank": 0.6329206973314285,
"support/selected_width": 42.69208335876465,
"support/stored_width": 42.69208335876465
},
{
"epoch": 0.890625,
"grad_norm": 94.79530334472656,
"kl/sequence_policy_ref": -375.49475288391113,
"kl/vocab_forward": 340.9373073577881,
"kl/vocab_js": 33.93104815483093,
"kl/vocab_reverse": 132.34368515014648,
"kl/vocab_symmetric": 473.28116607666016,
"learning_rate": 2e-06,
"logps/chosen": -666.1807670593262,
"logps/rejected": -1875.8219909667969,
"loss": 1.7866,
"loss/dpo": 0.24838248359601162,
"misalign/J": 153.82119750976562,
"misalign/J_aux_loss": 1.5382119417190552,
"misalign/J_aux_loss_raw": 153.82119750976562,
"misalign/J_over_reverse_kl": 1.4413592517375946,
"misalign/J_per_token": 0.2751000728458166,
"misalign/compressed_reward_absmax": 3810.44873046875,
"misalign/compressed_reward_range": 5403.443389892578,
"misalign/entropy_a": 678.8246765136719,
"misalign/entropy_b": 1104.3003616333008,
"misalign/forward_kl_divergence": 340.9373073577881,
"misalign/forward_kl_divergence_per_token": 0.9033116102218628,
"misalign/gamma_abs_times_reward_std": 43435399.0,
"misalign/gamma_bracketed_rate": 0.9872131571173668,
"misalign/gamma_reward_residual": 0.00028186071233449184,
"misalign/gamma_star": 71246069.25,
"misalign/js_divergence": 33.93104815483093,
"misalign/reverse_kl_divergence": 132.34368515014648,
"misalign/reverse_kl_divergence_per_token": 0.27233118936419487,
"misalign/reward_a": 77.04429864883423,
"misalign/reward_b": -15.656381011009216,
"misalign/reward_improvement": 92.70064735412598,
"misalign/reward_improvement_over_reverse_kl": 0.46722693368792534,
"misalign/reward_improvement_per_token": -0.003935309126973152,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -453.2850399017334,
"misalign/reward_vocab_std": 652.1301879882812,
"misalign/symmetric_kl": 473.28116607666016,
"misalign/tv_distance": 112.50246047973633,
"num_tokens": 9663583.0,
"rewards/accuracies": 0.953125,
"rewards/chosen": -16.42910087108612,
"rewards/margins": 42.240750551223755,
"rewards/rejected": -58.6698522567749,
"step": 57,
"support/residual_count": 151893.123046875,
"support/residual_mass_policy": 0.025635873898863792,
"support/residual_mass_reference": 0.051683404948562384,
"support/residual_reward": -0.43221110105514526,
"support/runtime_width": 42.8780837059021,
"support/sampled_loser_rank": 0.6225372664630413,
"support/sampled_reward_rank": -0.4814031980931759,
"support/sampled_token_added_rate": 0.044704005820676684,
"support/sampled_winner_rank": 0.6614899709820747,
"support/selected_width": 42.8780837059021,
"support/stored_width": 42.8780837059021
},
{
"epoch": 0.90625,
"grad_norm": 63.19841003417969,
"kl/sequence_policy_ref": -429.0251274108887,
"kl/vocab_forward": 393.4870414733887,
"kl/vocab_js": 40.02447009086609,
"kl/vocab_reverse": 155.88319969177246,
"kl/vocab_symmetric": 549.3705825805664,
"learning_rate": 2e-06,
"logps/chosen": -829.9505310058594,
"logps/rejected": -2197.3387451171875,
"loss": 2.1333,
"loss/dpo": 0.4092487035359162,
"misalign/J": 172.40936088562012,
"misalign/J_aux_loss": 1.724093571305275,
"misalign/J_aux_loss_raw": 172.40936088562012,
"misalign/J_over_reverse_kl": 2.0524225011467934,
"misalign/J_per_token": 0.27197333984076977,
"misalign/compressed_reward_absmax": 4601.915740966797,
"misalign/compressed_reward_range": 6508.782531738281,
"misalign/entropy_a": 811.3467330932617,
"misalign/entropy_b": 1340.6858673095703,
"misalign/forward_kl_divergence": 393.4870414733887,
"misalign/forward_kl_divergence_per_token": 0.5244201272726059,
"misalign/gamma_abs_times_reward_std": 48018970.5,
"misalign/gamma_bracketed_rate": 0.9816265851259232,
"misalign/gamma_reward_residual": 0.00013214930413596448,
"misalign/gamma_star": 68324594.0,
"misalign/js_divergence": 40.02447009086609,
"misalign/reverse_kl_divergence": 155.88319969177246,
"misalign/reverse_kl_divergence_per_token": 0.17863536719232798,
"misalign/reward_a": 84.13565301895142,
"misalign/reward_b": -20.68537664413452,
"misalign/reward_improvement": 104.8209924697876,
"misalign/reward_improvement_over_reverse_kl": 0.602836437523365,
"misalign/reward_improvement_per_token": 0.056509769055992365,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -554.3869686126709,
"misalign/reward_vocab_std": 789.2750701904297,
"misalign/symmetric_kl": 549.3705825805664,
"misalign/tv_distance": 134.0800666809082,
"num_tokens": 9847213.0,
"rewards/accuracies": 0.984375,
"rewards/chosen": -18.04733145236969,
"rewards/margins": 49.71036195755005,
"rewards/rejected": -67.75769662857056,
"step": 58,
"support/residual_count": 151893.20703125,
"support/residual_mass_policy": 0.02004858397413045,
"support/residual_mass_reference": 0.043993874453008175,
"support/residual_reward": -0.44899558275938034,
"support/runtime_width": 42.797929763793945,
"support/sampled_loser_rank": 0.6209223605692387,
"support/sampled_reward_rank": -0.39453774876892567,
"support/sampled_token_added_rate": 0.03281214344315231,
"support/sampled_winner_rank": 0.663671188056469,
"support/selected_width": 42.797929763793945,
"support/stored_width": 42.797929763793945
},
{
"epoch": 0.921875,
"grad_norm": 117.94857788085938,
"kl/sequence_policy_ref": -443.7041206359863,
"kl/vocab_forward": 409.88890075683594,
"kl/vocab_js": 40.246328592300415,
"kl/vocab_reverse": 156.80553817749023,
"kl/vocab_symmetric": 566.6947135925293,
"learning_rate": 2e-06,
"logps/chosen": -869.2680358886719,
"logps/rejected": -2070.011489868164,
"loss": 3.0679,
"loss/dpo": 1.1368572648187203,
"misalign/J": 193.10802459716797,
"misalign/J_aux_loss": 1.931080162525177,
"misalign/J_aux_loss_raw": 193.10802459716797,
"misalign/J_over_reverse_kl": 1.537727639079094,
"misalign/J_per_token": 0.22832004725933075,
"misalign/compressed_reward_absmax": 4309.089996337891,
"misalign/compressed_reward_range": 6035.041198730469,
"misalign/entropy_a": 770.8880767822266,
"misalign/entropy_b": 1271.0429077148438,
"misalign/forward_kl_divergence": 409.88890075683594,
"misalign/forward_kl_divergence_per_token": 0.548376951366663,
"misalign/gamma_abs_times_reward_std": 57010088.0,
"misalign/gamma_bracketed_rate": 0.9858352392911911,
"misalign/gamma_reward_residual": 0.00474659322799198,
"misalign/gamma_star": 87458212.0,
"misalign/js_divergence": 40.246328592300415,
"misalign/reverse_kl_divergence": 156.80553817749023,
"misalign/reverse_kl_divergence_per_token": 0.1863451935350895,
"misalign/reward_a": 87.77897262573242,
"misalign/reward_b": -18.043125957250595,
"misalign/reward_improvement": 105.82207298278809,
"misalign/reward_improvement_over_reverse_kl": 0.6192803308367729,
"misalign/reward_improvement_per_token": 0.07612488837912679,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -637.8235740661621,
"misalign/reward_vocab_std": 722.6581039428711,
"misalign/symmetric_kl": 566.6947135925293,
"misalign/tv_distance": 131.4430446624756,
"num_tokens": 10021862.0,
"rewards/accuracies": 0.953125,
"rewards/chosen": -21.588929533958435,
"rewards/margins": 45.56296682357788,
"rewards/rejected": -67.15189743041992,
"step": 59,
"support/residual_count": 151893.498046875,
"support/residual_mass_policy": 0.022956559900194407,
"support/residual_mass_reference": 0.03896482312120497,
"support/residual_reward": -0.552052453160286,
"support/runtime_width": 42.50350904464722,
"support/sampled_loser_rank": 0.5951877385377884,
"support/sampled_reward_rank": -0.2748332447372377,
"support/sampled_token_added_rate": 0.03523444454185665,
"support/sampled_winner_rank": 0.6131882853806019,
"support/selected_width": 42.50350904464722,
"support/stored_width": 42.50350904464722
},
{
"epoch": 0.9375,
"grad_norm": 52.73037338256836,
"kl/sequence_policy_ref": -394.9115695953369,
"kl/vocab_forward": 363.2713165283203,
"kl/vocab_js": 35.156471252441406,
"kl/vocab_reverse": 136.1243805885315,
"kl/vocab_symmetric": 499.39588928222656,
"learning_rate": 2e-06,
"logps/chosen": -676.7391777038574,
"logps/rejected": -1959.0352096557617,
"loss": 1.6395,
"loss/dpo": 0.1764393468254366,
"misalign/J": 146.30127716064453,
"misalign/J_aux_loss": 1.4630127176642418,
"misalign/J_aux_loss_raw": 146.30127716064453,
"misalign/J_over_reverse_kl": 1.2657062262296677,
"misalign/J_per_token": 0.22581798769533634,
"misalign/compressed_reward_absmax": 4125.538238525391,
"misalign/compressed_reward_range": 5841.7431640625,
"misalign/entropy_a": 674.7129745483398,
"misalign/entropy_b": 1137.8532485961914,
"misalign/forward_kl_divergence": 363.2713165283203,
"misalign/forward_kl_divergence_per_token": 0.5696188099682331,
"misalign/gamma_abs_times_reward_std": 40711104.25,
"misalign/gamma_bracketed_rate": 0.9862173870205879,
"misalign/gamma_reward_residual": 0.0012141035936110711,
"misalign/gamma_star": 54984167.75,
"misalign/js_divergence": 35.156471252441406,
"misalign/reverse_kl_divergence": 136.1243805885315,
"misalign/reverse_kl_divergence_per_token": 0.18038787879049778,
"misalign/reward_a": 68.60513997077942,
"misalign/reward_b": -15.08642065525055,
"misalign/reward_improvement": 83.69155693054199,
"misalign/reward_improvement_over_reverse_kl": 0.6113412380218506,
"misalign/reward_improvement_per_token": 0.08037259662523866,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -327.37775897979736,
"misalign/reward_vocab_std": 704.0034561157227,
"misalign/symmetric_kl": 499.39588928222656,
"misalign/tv_distance": 114.7151231765747,
"num_tokens": 10193216.0,
"rewards/accuracies": 0.984375,
"rewards/chosen": -16.442305088043213,
"rewards/margins": 46.09770584106445,
"rewards/rejected": -62.540010929107666,
"step": 60,
"support/residual_count": 151893.306640625,
"support/residual_mass_policy": 0.020310348831117153,
"support/residual_mass_reference": 0.03744203574024141,
"support/residual_reward": -0.35806123074144125,
"support/runtime_width": 42.69117593765259,
"support/sampled_loser_rank": 0.5424250774085522,
"support/sampled_reward_rank": -0.44320493191480637,
"support/sampled_token_added_rate": 0.03534874925389886,
"support/sampled_winner_rank": 0.5376182310283184,
"support/selected_width": 42.69117593765259,
"support/stored_width": 42.69117593765259
},
{
"epoch": 0.9375,
"eval_kl/sequence_policy_ref": -443.8763482570648,
"eval_kl/vocab_forward": 408.00172185897827,
"eval_kl/vocab_js": 39.08841371536255,
"eval_kl/vocab_reverse": 151.38377118110657,
"eval_kl/vocab_symmetric": 559.3856892585754,
"eval_logps/chosen": -939.2870273590088,
"eval_logps/rejected": -2048.731554031372,
"eval_loss": 1.7673890590667725,
"eval_loss/dpo": 0.14154351732228904,
"eval_misalign/J": 162.58456230163574,
"eval_misalign/J_aux_loss": 1.625845598988235,
"eval_misalign/J_aux_loss_raw": 162.58456230163574,
"eval_misalign/J_over_reverse_kl": 1.5481905555352569,
"eval_misalign/J_per_token": 0.22693954594433308,
"eval_misalign/compressed_reward_absmax": 4236.706272125244,
"eval_misalign/compressed_reward_range": 6027.056491851807,
"eval_misalign/entropy_a": 786.7468104362488,
"eval_misalign/entropy_b": 1298.0544500350952,
"eval_misalign/forward_kl_divergence": 408.00172185897827,
"eval_misalign/forward_kl_divergence_per_token": 0.6159884915687144,
"eval_misalign/gamma_abs_times_reward_std": 42470128.234375,
"eval_misalign/gamma_bracketed_rate": 0.9880082719027996,
"eval_misalign/gamma_reward_residual": 0.0014271657525881665,
"eval_misalign/gamma_star": 53453969.375,
"eval_misalign/js_divergence": 39.08841371536255,
"eval_misalign/reverse_kl_divergence": 151.38377118110657,
"eval_misalign/reverse_kl_divergence_per_token": 0.1974795301211998,
"eval_misalign/reward_a": 88.27734404802322,
"eval_misalign/reward_b": -14.307282019406557,
"eval_misalign/reward_improvement": 102.58461898565292,
"eval_misalign/reward_improvement_over_reverse_kl": 0.6177075733430684,
"eval_misalign/reward_improvement_per_token": 0.07934931269846857,
"eval_misalign/reward_signal_low_rate": 0.0,
"eval_misalign/reward_vocab_mean": -490.11556017398834,
"eval_misalign/reward_vocab_std": 731.320264339447,
"eval_misalign/symmetric_kl": 559.3856892585754,
"eval_misalign/tv_distance": 129.75177884101868,
"eval_rewards/accuracies": 0.982421875,
"eval_rewards/chosen": -21.011107921600342,
"eval_rewards/margins": 46.75305512547493,
"eval_rewards/rejected": -67.76416301727295,
"eval_runtime": 101.586,
"eval_samples_per_second": 5.04,
"eval_steps_per_second": 0.63,
"eval_support/residual_count": 151893.29125976562,
"eval_support/residual_mass_policy": 0.023736586881568655,
"eval_support/residual_mass_reference": 0.04279232310364023,
"eval_support/residual_reward": -0.4695481152739376,
"eval_support/runtime_width": 42.70963191986084,
"eval_support/sampled_loser_rank": 0.6486562248319387,
"eval_support/sampled_reward_rank": -0.37071577250026166,
"eval_support/sampled_token_added_rate": 0.037317203474231064,
"eval_support/sampled_winner_rank": 0.6506854901090264,
"eval_support/selected_width": 42.70963191986084,
"eval_support/stored_width": 42.70963191986084,
"step": 60
},
{
"epoch": 0.953125,
"grad_norm": 66.27374267578125,
"kl/sequence_policy_ref": -495.910831451416,
"kl/vocab_forward": 454.87990951538086,
"kl/vocab_js": 42.96171951293945,
"kl/vocab_reverse": 166.5788345336914,
"kl/vocab_symmetric": 621.4590148925781,
"learning_rate": 2e-06,
"logps/chosen": -882.9071960449219,
"logps/rejected": -2324.2291259765625,
"loss": 1.9092,
"loss/dpo": 0.00656887573249243,
"misalign/J": 190.26472282409668,
"misalign/J_aux_loss": 1.9026471227407455,
"misalign/J_aux_loss_raw": 190.26472282409668,
"misalign/J_over_reverse_kl": 1.2592844367027283,
"misalign/J_per_token": 0.22797120176255703,
"misalign/compressed_reward_absmax": 4472.839447021484,
"misalign/compressed_reward_range": 6308.651184082031,
"misalign/entropy_a": 808.8803405761719,
"misalign/entropy_b": 1358.7424087524414,
"misalign/forward_kl_divergence": 454.87990951538086,
"misalign/forward_kl_divergence_per_token": 0.6771250255405903,
"misalign/gamma_abs_times_reward_std": 54845561.25,
"misalign/gamma_bracketed_rate": 0.9894062727689743,
"misalign/gamma_reward_residual": 0.0002826992104019155,
"misalign/gamma_star": 55459094.75,
"misalign/js_divergence": 42.96171951293945,
"misalign/reverse_kl_divergence": 166.5788345336914,
"misalign/reverse_kl_divergence_per_token": 0.2539278268814087,
"misalign/reward_a": 87.17640686035156,
"misalign/reward_b": -18.71793007850647,
"misalign/reward_improvement": 105.89432716369629,
"misalign/reward_improvement_over_reverse_kl": 0.5326569080352783,
"misalign/reward_improvement_per_token": 0.01839596056379378,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -465.77495288848877,
"misalign/reward_vocab_std": 756.1640853881836,
"misalign/symmetric_kl": 621.4590148925781,
"misalign/tv_distance": 140.44063568115234,
"num_tokens": 10376767.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -23.160260915756226,
"rewards/margins": 52.86164569854736,
"rewards/rejected": -76.02190685272217,
"step": 61,
"support/residual_count": 151893.14453125,
"support/residual_mass_policy": 0.020877071423456073,
"support/residual_mass_reference": 0.04333586525171995,
"support/residual_reward": -0.3012515353038907,
"support/runtime_width": 42.852439880371094,
"support/sampled_loser_rank": 0.6411669962108135,
"support/sampled_reward_rank": -0.4843590557575226,
"support/sampled_token_added_rate": 0.03510869154706597,
"support/sampled_winner_rank": 0.7503333985805511,
"support/selected_width": 42.852439880371094,
"support/stored_width": 42.852439880371094
},
{
"epoch": 0.96875,
"grad_norm": 135.78811645507812,
"kl/sequence_policy_ref": -521.3121376037598,
"kl/vocab_forward": 479.5559272766113,
"kl/vocab_js": 44.95201635360718,
"kl/vocab_reverse": 172.62139701843262,
"kl/vocab_symmetric": 652.1775360107422,
"learning_rate": 2e-06,
"logps/chosen": -1089.0823440551758,
"logps/rejected": -2275.8909606933594,
"loss": 2.0802,
"loss/dpo": 0.21537577021728538,
"misalign/J": 186.48083686828613,
"misalign/J_aux_loss": 1.8648083209991455,
"misalign/J_aux_loss_raw": 186.48083686828613,
"misalign/J_over_reverse_kl": 1.9623293429613113,
"misalign/J_per_token": 0.2417179737240076,
"misalign/compressed_reward_absmax": 4746.217712402344,
"misalign/compressed_reward_range": 6676.980529785156,
"misalign/entropy_a": 843.7399063110352,
"misalign/entropy_b": 1419.2857818603516,
"misalign/forward_kl_divergence": 479.5559272766113,
"misalign/forward_kl_divergence_per_token": 0.5206913501024246,
"misalign/gamma_abs_times_reward_std": 52206773.75,
"misalign/gamma_bracketed_rate": 0.9809182211756706,
"misalign/gamma_reward_residual": 0.0015386186018986336,
"misalign/gamma_star": 70413638.5,
"misalign/js_divergence": 44.95201635360718,
"misalign/reverse_kl_divergence": 172.62139701843262,
"misalign/reverse_kl_divergence_per_token": 0.17035234905779362,
"misalign/reward_a": 94.845046043396,
"misalign/reward_b": -16.313238620758057,
"misalign/reward_improvement": 111.15826416015625,
"misalign/reward_improvement_over_reverse_kl": 0.5862268581986427,
"misalign/reward_improvement_per_token": 0.0937041062861681,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -627.9793968200684,
"misalign/reward_vocab_std": 811.575309753418,
"misalign/symmetric_kl": 652.1775360107422,
"misalign/tv_distance": 147.92323780059814,
"num_tokens": 10568093.0,
"rewards/accuracies": 0.953125,
"rewards/chosen": -28.54444169998169,
"rewards/margins": 47.17354154586792,
"rewards/rejected": -75.7179822921753,
"step": 62,
"support/residual_count": 151893.359375,
"support/residual_mass_policy": 0.022450818214565516,
"support/residual_mass_reference": 0.04165853979066014,
"support/residual_reward": -0.45337859727442265,
"support/runtime_width": 42.641594886779785,
"support/sampled_loser_rank": 0.6541027873754501,
"support/sampled_reward_rank": -0.38643040135502815,
"support/sampled_token_added_rate": 0.037647833582013845,
"support/sampled_winner_rank": 0.6750443577766418,
"support/selected_width": 42.641594886779785,
"support/stored_width": 42.641594886779785
},
{
"epoch": 0.984375,
"grad_norm": 157.7623748779297,
"kl/sequence_policy_ref": -430.5328941345215,
"kl/vocab_forward": 418.2077407836914,
"kl/vocab_js": 37.59886360168457,
"kl/vocab_reverse": 146.4429168701172,
"kl/vocab_symmetric": 564.6509666442871,
"learning_rate": 2e-06,
"logps/chosen": -991.5014877319336,
"logps/rejected": -1757.1116333007812,
"loss": 2.0983,
"loss/dpo": 0.4923999092846994,
"misalign/J": 160.58584594726562,
"misalign/J_aux_loss": 1.6058583855628967,
"misalign/J_aux_loss_raw": 160.58584594726562,
"misalign/J_over_reverse_kl": 1.1775383204221725,
"misalign/J_per_token": 0.2121292594820261,
"misalign/compressed_reward_absmax": 4109.173980712891,
"misalign/compressed_reward_range": 5852.2374267578125,
"misalign/entropy_a": 698.0654525756836,
"misalign/entropy_b": 1176.365493774414,
"misalign/forward_kl_divergence": 418.2077407836914,
"misalign/forward_kl_divergence_per_token": 0.7069764323532581,
"misalign/gamma_abs_times_reward_std": 45094587.0,
"misalign/gamma_bracketed_rate": 0.9876261055469513,
"misalign/gamma_reward_residual": 0.00018861131684388965,
"misalign/gamma_star": 68429815.0,
"misalign/js_divergence": 37.59886360168457,
"misalign/reverse_kl_divergence": 146.4429168701172,
"misalign/reverse_kl_divergence_per_token": 0.1941323447972536,
"misalign/reward_a": 87.26894760131836,
"misalign/reward_b": -12.587202161550522,
"misalign/reward_improvement": 99.85614490509033,
"misalign/reward_improvement_over_reverse_kl": 0.5914403721690178,
"misalign/reward_improvement_per_token": 0.09591837041079998,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -588.2237000465393,
"misalign/reward_vocab_std": 723.0128402709961,
"misalign/symmetric_kl": 564.6509666442871,
"misalign/tv_distance": 122.21442317962646,
"num_tokens": 10736658.0,
"rewards/accuracies": 0.9375,
"rewards/chosen": -24.446208238601685,
"rewards/margins": 37.21416687965393,
"rewards/rejected": -61.6603741645813,
"step": 63,
"support/residual_count": 151893.123046875,
"support/residual_mass_policy": 0.026434314902871847,
"support/residual_mass_reference": 0.04181864345446229,
"support/residual_reward": -0.5436067841947079,
"support/runtime_width": 42.87605428695679,
"support/sampled_loser_rank": 0.5825388208031654,
"support/sampled_reward_rank": -0.3311811648309231,
"support/sampled_token_added_rate": 0.032393347937613726,
"support/sampled_winner_rank": 0.6503275334835052,
"support/selected_width": 42.87605428695679,
"support/stored_width": 42.87605428695679
},
{
"epoch": 1.0,
"grad_norm": 53.39131164550781,
"kl/sequence_policy_ref": -518.7998046875,
"kl/vocab_forward": 483.2027702331543,
"kl/vocab_js": 43.7255322933197,
"kl/vocab_reverse": 169.42027759552002,
"kl/vocab_symmetric": 652.6233749389648,
"learning_rate": 2e-06,
"logps/chosen": -1134.265338897705,
"logps/rejected": -2165.7147216796875,
"loss": 1.7421,
"loss/dpo": 0.08747747553955643,
"misalign/J": 165.4585018157959,
"misalign/J_aux_loss": 1.6545849740505219,
"misalign/J_aux_loss_raw": 165.4585018157959,
"misalign/J_over_reverse_kl": 1.0515589267015457,
"misalign/J_per_token": 0.1934837531298399,
"misalign/compressed_reward_absmax": 4486.107025146484,
"misalign/compressed_reward_range": 6402.580139160156,
"misalign/entropy_a": 834.7258224487305,
"misalign/entropy_b": 1397.0357360839844,
"misalign/forward_kl_divergence": 483.2027702331543,
"misalign/forward_kl_divergence_per_token": 0.6137732639908791,
"misalign/gamma_abs_times_reward_std": 49280889.75,
"misalign/gamma_bracketed_rate": 0.9895988926291466,
"misalign/gamma_reward_residual": 0.00024569173842792225,
"misalign/gamma_star": 40878552.0,
"misalign/js_divergence": 43.7255322933197,
"misalign/reverse_kl_divergence": 169.42027759552002,
"misalign/reverse_kl_divergence_per_token": 0.1976974420249462,
"misalign/reward_a": 98.72580575942993,
"misalign/reward_b": -13.32999886572361,
"misalign/reward_improvement": 112.05579376220703,
"misalign/reward_improvement_over_reverse_kl": 0.6203483864665031,
"misalign/reward_improvement_per_token": 0.10607674531638622,
"misalign/reward_signal_low_rate": 0.0,
"misalign/reward_vocab_mean": -555.1519546508789,
"misalign/reward_vocab_std": 773.9923553466797,
"misalign/symmetric_kl": 652.6233749389648,
"misalign/tv_distance": 142.4766387939453,
"num_tokens": 10915114.0,
"rewards/accuracies": 0.984375,
"rewards/chosen": -29.072665452957153,
"rewards/margins": 45.61463260650635,
"rewards/rejected": -74.68729877471924,
"step": 64,
"support/residual_count": 151893.263671875,
"support/residual_mass_policy": 0.024331093532964587,
"support/residual_mass_reference": 0.04252262390218675,
"support/residual_reward": -0.4992054486647248,
"support/runtime_width": 42.739386558532715,
"support/sampled_loser_rank": 0.6560819111764431,
"support/sampled_reward_rank": -0.35881325230002403,
"support/sampled_token_added_rate": 0.03401010250672698,
"support/sampled_winner_rank": 0.6759711802005768,
"support/selected_width": 42.739386558532715,
"support/stored_width": 42.739386558532715
},
{
"epoch": 1.0,
"step": 64,
"total_flos": 4.111435733296742e+16,
"train_loss": 1.8484279848635197,
"train_runtime": 2097.8551,
"train_samples_per_second": 1.952,
"train_steps_per_second": 0.031
},
{
"epoch": 1.0,
"eval_kl/sequence_policy_ref": -484.36079502105713,
"eval_kl/vocab_forward": 447.0612106323242,
"eval_kl/vocab_js": 40.982308477163315,
"eval_kl/vocab_reverse": 159.29439425468445,
"eval_kl/vocab_symmetric": 606.3558564186096,
"eval_logps/chosen": -960.8710179328918,
"eval_logps/rejected": -2108.1164512634277,
"eval_loss": 1.7901853322982788,
"eval_loss/dpo": 0.1378665120205333,
"eval_misalign/J": 165.23188638687134,
"eval_misalign/J_aux_loss": 1.6523188361898065,
"eval_misalign/J_aux_loss_raw": 165.23188638687134,
"eval_misalign/J_over_reverse_kl": 1.4319368861615658,
"eval_misalign/J_per_token": 0.2249188085552305,
"eval_misalign/compressed_reward_absmax": 4236.7062911987305,
"eval_misalign/compressed_reward_range": 6027.056602478027,
"eval_misalign/entropy_a": 773.5548601150513,
"eval_misalign/entropy_b": 1298.0544300079346,
"eval_misalign/forward_kl_divergence": 447.0612106323242,
"eval_misalign/forward_kl_divergence_per_token": 0.6650652508251369,
"eval_misalign/gamma_abs_times_reward_std": 42179871.046875,
"eval_misalign/gamma_bracketed_rate": 0.9879104141145945,
"eval_misalign/gamma_reward_residual": 0.0007516551395241322,
"eval_misalign/gamma_star": 54525594.609375,
"eval_misalign/js_divergence": 40.982308477163315,
"eval_misalign/reverse_kl_divergence": 159.29439425468445,
"eval_misalign/reverse_kl_divergence_per_token": 0.20468837535008788,
"eval_misalign/reward_a": 88.6484876871109,
"eval_misalign/reward_b": -14.307281013578176,
"eval_misalign/reward_improvement": 102.95576465129852,
"eval_misalign/reward_improvement_over_reverse_kl": 0.5992142450995743,
"eval_misalign/reward_improvement_per_token": 0.0850256277481094,
"eval_misalign/reward_signal_low_rate": 0.0,
"eval_misalign/reward_vocab_mean": -490.11554074287415,
"eval_misalign/reward_vocab_std": 731.3202610015869,
"eval_misalign/symmetric_kl": 606.3558564186096,
"eval_misalign/tv_distance": 133.03198266029358,
"eval_rewards/accuracies": 0.984375,
"eval_rewards/chosen": -23.169506430625916,
"eval_rewards/margins": 50.53314882516861,
"eval_rewards/rejected": -73.70265424251556,
"eval_runtime": 101.2427,
"eval_samples_per_second": 5.057,
"eval_steps_per_second": 0.632,
"eval_support/residual_count": 151893.29125976562,
"eval_support/residual_mass_policy": 0.023531361599452794,
"eval_support/residual_mass_reference": 0.04279232310364023,
"eval_support/residual_reward": -0.4695481152739376,
"eval_support/runtime_width": 42.70963191986084,
"eval_support/sampled_loser_rank": 0.6486562248319387,
"eval_support/sampled_reward_rank": -0.37071577250026166,
"eval_support/sampled_token_added_rate": 0.037317203474231064,
"eval_support/sampled_winner_rank": 0.6506854901090264,
"eval_support/selected_width": 42.70963191986084,
"eval_support/stored_width": 42.70963191986084,
"step": 64
}
],
"logging_steps": 1,
"max_steps": 64,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 6,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.111435733296742e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}