{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 12, "global_step": 64, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015625, "grad_norm": 99.72795104980469, "kl/sequence_policy_ref": 0.0, "kl/vocab_forward": 0.0, "kl/vocab_js": 1.071127120333415e-08, "kl/vocab_reverse": 0.0, "kl/vocab_symmetric": 0.0, "learning_rate": 0.0, "logps/chosen": -742.8982963562012, "logps/rejected": -1508.9743347167969, "loss": 0.8464, "loss/dpo": 0.6931471824645996, "misalign/J": 15.325548589229584, "misalign/J_aux_loss": 0.15325548127293587, "misalign/J_aux_loss_raw": 15.325548589229584, "misalign/J_over_reverse_kl": 128560019.5, "misalign/J_per_token": 0.019860354135744274, "misalign/compressed_reward_absmax": 4573.125, "misalign/compressed_reward_range": 6496.04150390625, "misalign/entropy_a": 1394.3690948486328, "misalign/entropy_b": 1394.3689727783203, "misalign/forward_kl_divergence": 0.0, "misalign/forward_kl_divergence_per_token": 0.0, "misalign/gamma_abs_times_reward_std": 3358948.609375, "misalign/gamma_bracketed_rate": 0.9989435374736786, "misalign/gamma_reward_residual": -3.5030208209718694e-05, "misalign/gamma_star": 6581222.96875, "misalign/js_divergence": 1.071127120333415e-08, "misalign/reverse_kl_divergence": 0.0, "misalign/reverse_kl_divergence_per_token": 0.0, "misalign/reward_a": -14.673600375652313, "misalign/reward_b": -14.673597425222397, "misalign/reward_improvement": 0.0, "misalign/reward_improvement_over_reverse_kl": 0.0, "misalign/reward_improvement_per_token": 0.0, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -492.57253074645996, "misalign/reward_vocab_std": 775.2254867553711, "misalign/symmetric_kl": 0.0, "misalign/tv_distance": 0.0, "num_tokens": 178419.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1, "support/residual_count": 151893.28125, "support/residual_mass_policy": 0.043290185276418924, "support/residual_mass_reference": 0.043290185276418924, "support/residual_reward": -0.42998734675347805, "support/runtime_width": 42.71537160873413, "support/sampled_loser_rank": 0.6159256920218468, "support/sampled_reward_rank": -0.34261330030858517, "support/sampled_token_added_rate": 0.03961701481603086, "support/sampled_winner_rank": 0.6312572248280048, "support/selected_width": 42.71537160873413, "support/stored_width": 42.71537160873413 }, { "epoch": 0.03125, "grad_norm": 90.65897369384766, "kl/sequence_policy_ref": 0.0, "kl/vocab_forward": 0.0, "kl/vocab_js": 1.1555954915198896e-08, "kl/vocab_reverse": 0.0, "kl/vocab_symmetric": 0.0, "learning_rate": 2.857142857142857e-07, "logps/chosen": -639.7089538574219, "logps/rejected": -1256.9280319213867, "loss": 0.8522, "loss/dpo": 0.6931471824645996, "misalign/J": 15.909758105874062, "misalign/J_aux_loss": 0.15909757697954774, "misalign/J_aux_loss_raw": 15.909758105874062, "misalign/J_over_reverse_kl": 133460724.125, "misalign/J_per_token": 0.022011274049873464, "misalign/compressed_reward_absmax": 4128.2406005859375, "misalign/compressed_reward_range": 5892.38916015625, "misalign/entropy_a": 1163.6351776123047, "misalign/entropy_b": 1163.6351852416992, "misalign/forward_kl_divergence": 0.0, "misalign/forward_kl_divergence_per_token": 0.0, "misalign/gamma_abs_times_reward_std": 3731882.666015625, "misalign/gamma_bracketed_rate": 0.99904465675354, "misalign/gamma_reward_residual": -2.5868400712170114e-05, "misalign/gamma_star": 6485823.03515625, "misalign/js_divergence": 1.1555954915198896e-08, "misalign/reverse_kl_divergence": 0.0, "misalign/reverse_kl_divergence_per_token": 0.0, "misalign/reward_a": -14.091559052467346, "misalign/reward_b": -14.091563642024994, "misalign/reward_improvement": 0.0, "misalign/reward_improvement_over_reverse_kl": 0.0, "misalign/reward_improvement_per_token": 0.0, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -373.4254665374756, "misalign/reward_vocab_std": 714.1293716430664, "misalign/symmetric_kl": 0.0, "misalign/tv_distance": 0.0, "num_tokens": 353406.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2, "support/residual_count": 151893.015625, "support/residual_mass_policy": 0.04986313637346029, "support/residual_mass_reference": 0.04986313637346029, "support/residual_reward": -0.33420879393815994, "support/runtime_width": 42.98376941680908, "support/sampled_loser_rank": 0.5725675709545612, "support/sampled_reward_rank": -0.46486284397542477, "support/sampled_token_added_rate": 0.039943449199199677, "support/sampled_winner_rank": 0.6429276466369629, "support/selected_width": 42.98376941680908, "support/stored_width": 42.98376941680908 }, { "epoch": 0.046875, "grad_norm": 93.3929672241211, "kl/sequence_policy_ref": 0.012497343122959137, "kl/vocab_forward": 0.583718778565526, "kl/vocab_js": 0.14564759889617562, "kl/vocab_reverse": 0.5832288525998592, "kl/vocab_symmetric": 1.1669474430382252, "learning_rate": 5.714285714285714e-07, "logps/chosen": -682.0669765472412, "logps/rejected": -1253.8106155395508, "loss": 0.8984, "loss/dpo": 0.7069794088602066, "misalign/J": 19.14525681734085, "misalign/J_aux_loss": 0.1914525495376438, "misalign/J_aux_loss_raw": 19.14525681734085, "misalign/J_over_reverse_kl": 181.960098862648, "misalign/J_per_token": 0.11238381525618024, "misalign/compressed_reward_absmax": 4042.6752319335938, "misalign/compressed_reward_range": 5874.695739746094, "misalign/entropy_a": 1187.622413635254, "misalign/entropy_b": 1187.9832382202148, "misalign/forward_kl_divergence": 0.583718778565526, "misalign/forward_kl_divergence_per_token": 0.0007106075063347816, "misalign/gamma_abs_times_reward_std": 4061511.5234375, "misalign/gamma_bracketed_rate": 0.9965383857488632, "misalign/gamma_reward_residual": -1.116046053084574e-05, "misalign/gamma_star": 6953297.8359375, "misalign/js_divergence": 0.14564759889617562, "misalign/reverse_kl_divergence": 0.5832288525998592, "misalign/reverse_kl_divergence_per_token": 0.0007096838744473644, "misalign/reward_a": -14.631245076656342, "misalign/reward_b": -14.68013870716095, "misalign/reward_improvement": 0.04889671457931399, "misalign/reward_improvement_over_reverse_kl": -0.01731939986348152, "misalign/reward_improvement_per_token": 0.0011197524540875747, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -477.72042179107666, "misalign/reward_vocab_std": 708.0760498046875, "misalign/symmetric_kl": 1.1669474430382252, "misalign/tv_distance": 7.252490729093552, "num_tokens": 522739.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.009032575355377048, "rewards/margins": -0.020564619218930602, "rewards/rejected": 0.011532044620253146, "step": 3, "support/residual_count": 151893.173828125, "support/residual_mass_policy": 0.041550057008862495, "support/residual_mass_reference": 0.04161923169158399, "support/residual_reward": -0.48134796414524317, "support/runtime_width": 42.824374198913574, "support/sampled_loser_rank": 0.6047959439456463, "support/sampled_reward_rank": -0.43707933463156223, "support/sampled_token_added_rate": 0.03209112957119942, "support/sampled_winner_rank": 0.6245042234659195, "support/selected_width": 42.824374198913574, "support/stored_width": 42.824374198913574 }, { "epoch": 0.0625, "grad_norm": 92.23185729980469, "kl/sequence_policy_ref": 0.12927092611789703, "kl/vocab_forward": 0.5669154338538647, "kl/vocab_js": 0.14143066108226776, "kl/vocab_reverse": 0.5671258866786957, "kl/vocab_symmetric": 1.1340412348508835, "learning_rate": 8.57142857142857e-07, "logps/chosen": -605.7631340026855, "logps/rejected": -1292.5442504882812, "loss": 0.8402, "loss/dpo": 0.6932987496256828, "misalign/J": 14.686549186706543, "misalign/J_aux_loss": 0.14686548942700028, "misalign/J_aux_loss_raw": 14.686549186706543, "misalign/J_over_reverse_kl": 23.49239319562912, "misalign/J_per_token": 0.015179012378212065, "misalign/compressed_reward_absmax": 4190.518157958984, "misalign/compressed_reward_range": 6010.22119140625, "misalign/entropy_a": 1174.9087295532227, "misalign/entropy_b": 1174.8447265625, "misalign/forward_kl_divergence": 0.5669154338538647, "misalign/forward_kl_divergence_per_token": 0.0007894696027506143, "misalign/gamma_abs_times_reward_std": 3162996.953125, "misalign/gamma_bracketed_rate": 0.9992709010839462, "misalign/gamma_reward_residual": -2.8679768774964032e-05, "misalign/gamma_star": 6157349.4375, "misalign/js_divergence": 0.14143066108226776, "misalign/reverse_kl_divergence": 0.5671258866786957, "misalign/reverse_kl_divergence_per_token": 0.000786922340921592, "misalign/reward_a": -18.330436378717422, "misalign/reward_b": -18.468554601073265, "misalign/reward_improvement": 0.1381110306829214, "misalign/reward_improvement_over_reverse_kl": 1.340573588386178, "misalign/reward_improvement_per_token": 0.0015522810608672444, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -531.1860427856445, "misalign/reward_vocab_std": 741.7677001953125, "misalign/symmetric_kl": 1.1340412348508835, "misalign/tv_distance": 6.983255743980408, "num_tokens": 693958.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.01616873685270548, "rewards/margins": 0.006483289762400091, "rewards/rejected": 0.009685446973890066, "step": 4, "support/residual_count": 151893.24609375, "support/residual_mass_policy": 0.03730555484071374, "support/residual_mass_reference": 0.03720196895301342, "support/residual_reward": -0.5474085882306099, "support/runtime_width": 42.75570487976074, "support/sampled_loser_rank": 0.5622387044131756, "support/sampled_reward_rank": -0.3531896872445941, "support/sampled_token_added_rate": 0.03257534769363701, "support/sampled_winner_rank": 0.5690335519611835, "support/selected_width": 42.75570487976074, "support/stored_width": 42.75570487976074 }, { "epoch": 0.078125, "grad_norm": 117.50943756103516, "kl/sequence_policy_ref": -0.02329159528017044, "kl/vocab_forward": 0.7679473981261253, "kl/vocab_js": 0.1914622224867344, "kl/vocab_reverse": 0.7658381760120392, "kl/vocab_symmetric": 1.5337853729724884, "learning_rate": 1.1428571428571428e-06, "logps/chosen": -888.985481262207, "logps/rejected": -1439.7103576660156, "loss": 0.9771, "loss/dpo": 0.6643600016832352, "misalign/J": 31.272010684013367, "misalign/J_aux_loss": 0.3127200985327363, "misalign/J_aux_loss_raw": 31.272010684013367, "misalign/J_over_reverse_kl": 76.80600309371948, "misalign/J_per_token": 0.034490690915845335, "misalign/compressed_reward_absmax": 4926.2969970703125, "misalign/compressed_reward_range": 7034.768737792969, "misalign/entropy_a": 1437.912109375, "misalign/entropy_b": 1440.320785522461, "misalign/forward_kl_divergence": 0.7679473981261253, "misalign/forward_kl_divergence_per_token": 0.0006905792761244811, "misalign/gamma_abs_times_reward_std": 6929239.875, "misalign/gamma_bracketed_rate": 0.9981078654527664, "misalign/gamma_reward_residual": -2.4677944281847886e-05, "misalign/gamma_star": 13895793.9375, "misalign/js_divergence": 0.1914622224867344, "misalign/reverse_kl_divergence": 0.7658381760120392, "misalign/reverse_kl_divergence_per_token": 0.0006879112333990633, "misalign/reward_a": -16.945995092391968, "misalign/reward_b": -18.123961448669434, "misalign/reward_improvement": 1.1779705435037613, "misalign/reward_improvement_over_reverse_kl": 1.8247669339179993, "misalign/reward_improvement_per_token": 0.0014292692139861174, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -799.9156303405762, "misalign/reward_vocab_std": 839.3828201293945, "misalign/symmetric_kl": 1.5337853729724884, "misalign/tv_distance": 9.24750828742981, "num_tokens": 885705.0, "rewards/accuracies": 0.703125, "rewards/chosen": 0.03075969440396875, "rewards/margins": 0.06617770798038691, "rewards/rejected": -0.035418014391325414, "step": 5, "support/residual_count": 151893.498046875, "support/residual_mass_policy": 0.0339660148601979, "support/residual_mass_reference": 0.03398727998137474, "support/residual_reward": -0.5825221054255962, "support/runtime_width": 42.503896713256836, "support/sampled_loser_rank": 0.6222354024648666, "support/sampled_reward_rank": -0.31187077052891254, "support/sampled_token_added_rate": 0.03087039408273995, "support/sampled_winner_rank": 0.6193482503294945, "support/selected_width": 42.503896713256836, "support/stored_width": 42.503896713256836 }, { "epoch": 0.09375, "grad_norm": 85.93820190429688, "kl/sequence_policy_ref": -0.15299414843320847, "kl/vocab_forward": 0.7101510316133499, "kl/vocab_js": 0.1767149232327938, "kl/vocab_reverse": 0.7063371688127518, "kl/vocab_symmetric": 1.4164880961179733, "learning_rate": 1.4285714285714286e-06, "logps/chosen": -632.6824798583984, "logps/rejected": -1370.8635635375977, "loss": 0.7987, "loss/dpo": 0.6448436826467514, "misalign/J": 15.385070085525513, "misalign/J_aux_loss": 0.15385069977492094, "misalign/J_aux_loss_raw": 15.385070085525513, "misalign/J_over_reverse_kl": 83.44263046979904, "misalign/J_per_token": 0.0304049692931585, "misalign/compressed_reward_absmax": 4116.264617919922, "misalign/compressed_reward_range": 5825.951171875, "misalign/entropy_a": 1233.836441040039, "misalign/entropy_b": 1237.5170669555664, "misalign/forward_kl_divergence": 0.7101510316133499, "misalign/forward_kl_divergence_per_token": 0.0011770288765546866, "misalign/gamma_abs_times_reward_std": 3832625.03125, "misalign/gamma_bracketed_rate": 0.9984977394342422, "misalign/gamma_reward_residual": -2.866266277123941e-05, "misalign/gamma_star": 7119063.15625, "misalign/js_divergence": 0.1767149232327938, "misalign/reverse_kl_divergence": 0.7063371688127518, "misalign/reverse_kl_divergence_per_token": 0.001158414474048186, "misalign/reward_a": -12.991081476211548, "misalign/reward_b": -15.306684225797653, "misalign/reward_improvement": 2.315602958202362, "misalign/reward_improvement_over_reverse_kl": 4.326970279216766, "misalign/reward_improvement_per_token": 0.00793102516036015, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -391.66842794418335, "misalign/reward_vocab_std": 709.3101501464844, "misalign/symmetric_kl": 1.4164880961179733, "misalign/tv_distance": 8.171427369117737, "num_tokens": 1051197.0, "rewards/accuracies": 0.734375, "rewards/chosen": 0.03977742395363748, "rewards/margins": 0.11015367973595858, "rewards/rejected": -0.07037625554949045, "step": 6, "support/residual_count": 151893.294921875, "support/residual_mass_policy": 0.04003174742683768, "support/residual_mass_reference": 0.039974321611225605, "support/residual_reward": -0.44555533304810524, "support/runtime_width": 42.70325422286987, "support/sampled_loser_rank": 0.5838432982563972, "support/sampled_reward_rank": -0.33837335743010044, "support/sampled_token_added_rate": 0.035395737970247865, "support/sampled_winner_rank": 0.5852662436664104, "support/selected_width": 42.70325422286987, "support/stored_width": 42.70325422286987 }, { "epoch": 0.109375, "grad_norm": 88.38682556152344, "kl/sequence_policy_ref": -0.3988112509250641, "kl/vocab_forward": 0.9821137934923172, "kl/vocab_js": 0.2425236813724041, "kl/vocab_reverse": 0.9639514982700348, "kl/vocab_symmetric": 1.9460650980472565, "learning_rate": 1.714285714285714e-06, "logps/chosen": -645.1111679077148, "logps/rejected": -1122.6695709228516, "loss": 0.8321, "loss/dpo": 0.5667471960186958, "misalign/J": 26.53605580329895, "misalign/J_aux_loss": 0.26536055374890566, "misalign/J_aux_loss_raw": 26.53605580329895, "misalign/J_over_reverse_kl": 24.359424114227295, "misalign/J_per_token": 0.027136333868838847, "misalign/compressed_reward_absmax": 4269.828765869141, "misalign/compressed_reward_range": 6126.886657714844, "misalign/entropy_a": 1084.383560180664, "misalign/entropy_b": 1096.2449645996094, "misalign/forward_kl_divergence": 0.9821137934923172, "misalign/forward_kl_divergence_per_token": 0.0023296478029806167, "misalign/gamma_abs_times_reward_std": 6525904.78125, "misalign/gamma_bracketed_rate": 0.9985537081956863, "misalign/gamma_reward_residual": -9.58655687099963e-06, "misalign/gamma_star": 13095489.4375, "misalign/js_divergence": 0.2425236813724041, "misalign/reverse_kl_divergence": 0.9639514982700348, "misalign/reverse_kl_divergence_per_token": 0.0022518262558151037, "misalign/reward_a": -4.632188588380814, "misalign/reward_b": -9.89891766011715, "misalign/reward_improvement": 5.2667356133461, "misalign/reward_improvement_over_reverse_kl": 5.781014442443848, "misalign/reward_improvement_per_token": 0.021993891743477434, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -737.9532012939453, "misalign/reward_vocab_std": 779.6220245361328, "misalign/symmetric_kl": 1.9460650980472565, "misalign/tv_distance": 9.269986510276794, "num_tokens": 1226725.0, "rewards/accuracies": 0.859375, "rewards/chosen": 0.10787314153276384, "rewards/margins": 0.29550853557884693, "rewards/rejected": -0.18763539008796215, "step": 7, "support/residual_count": 151893.15234375, "support/residual_mass_policy": 0.036638311110436916, "support/residual_mass_reference": 0.03580877371132374, "support/residual_reward": -0.6131154783070087, "support/runtime_width": 42.84639596939087, "support/sampled_loser_rank": 0.5267594642937183, "support/sampled_reward_rank": -0.37450613733381033, "support/sampled_token_added_rate": 0.02907218923792243, "support/sampled_winner_rank": 0.5602664463222027, "support/selected_width": 42.84639596939087, "support/stored_width": 42.84639596939087 }, { "epoch": 0.125, "grad_norm": 69.52674102783203, "kl/sequence_policy_ref": -1.1963431239128113, "kl/vocab_forward": 1.3841880485415459, "kl/vocab_js": 0.33685372211039066, "kl/vocab_reverse": 1.3344588950276375, "kl/vocab_symmetric": 2.7186470329761505, "learning_rate": 2e-06, "logps/chosen": -520.5281848907471, "logps/rejected": -1275.8439178466797, "loss": 0.6933, "loss/dpo": 0.5151687189936638, "misalign/J": 17.81585144996643, "misalign/J_aux_loss": 0.17815851839259267, "misalign/J_aux_loss_raw": 17.81585144996643, "misalign/J_over_reverse_kl": 10.14825189113617, "misalign/J_per_token": 0.029440748097840697, "misalign/compressed_reward_absmax": 3982.9888916015625, "misalign/compressed_reward_range": 5621.152526855469, "misalign/entropy_a": 1110.4090042114258, "misalign/entropy_b": 1124.430030822754, "misalign/forward_kl_divergence": 1.3841880485415459, "misalign/forward_kl_divergence_per_token": 0.005764905363321304, "misalign/gamma_abs_times_reward_std": 3656771.703125, "misalign/gamma_bracketed_rate": 0.9985413998365402, "misalign/gamma_reward_residual": -2.2492993480227597e-05, "misalign/gamma_star": 6422516.171875, "misalign/js_divergence": 0.33685372211039066, "misalign/reverse_kl_divergence": 1.3344588950276375, "misalign/reverse_kl_divergence_per_token": 0.005282549886032939, "misalign/reward_a": -9.868786454200745, "misalign/reward_b": -17.78406047821045, "misalign/reward_improvement": 7.915277659893036, "misalign/reward_improvement_over_reverse_kl": 5.6070281863212585, "misalign/reward_improvement_per_token": 0.02902397490106523, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -359.259729385376, "misalign/reward_vocab_std": 703.0287933349609, "misalign/symmetric_kl": 2.7186470329761505, "misalign/tv_distance": 10.146484673023224, "num_tokens": 1388327.0, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0993000838207081, "rewards/margins": 0.43786880001425743, "rewards/rejected": -0.3385687116533518, "step": 8, "support/residual_count": 151893.01171875, "support/residual_mass_policy": 0.039598725736141205, "support/residual_mass_reference": 0.03887052624486387, "support/residual_reward": -0.35539715457707644, "support/runtime_width": 42.98911476135254, "support/sampled_loser_rank": 0.5893515609204769, "support/sampled_reward_rank": -0.47250746935606003, "support/sampled_token_added_rate": 0.03269299981184304, "support/sampled_winner_rank": 0.5859329588711262, "support/selected_width": 42.98911476135254, "support/stored_width": 42.98911476135254 }, { "epoch": 0.140625, "grad_norm": 67.0599136352539, "kl/sequence_policy_ref": -2.3771479576826096, "kl/vocab_forward": 2.6128047704696655, "kl/vocab_js": 0.6251252107322216, "kl/vocab_reverse": 2.4535476565361023, "kl/vocab_symmetric": 5.06635195016861, "learning_rate": 2e-06, "logps/chosen": -615.9590682983398, "logps/rejected": -1274.1997985839844, "loss": 0.6781, "loss/dpo": 0.4201922379434109, "misalign/J": 25.791358947753906, "misalign/J_aux_loss": 0.2579135838896036, "misalign/J_aux_loss_raw": 25.791358947753906, "misalign/J_over_reverse_kl": 12.780362248420715, "misalign/J_per_token": 0.05547305219806731, "misalign/compressed_reward_absmax": 3792.750274658203, "misalign/compressed_reward_range": 5447.807708740234, "misalign/entropy_a": 1148.869960784912, "misalign/entropy_b": 1178.2729797363281, "misalign/forward_kl_divergence": 2.6128047704696655, "misalign/forward_kl_divergence_per_token": 0.006875867606140673, "misalign/gamma_abs_times_reward_std": 5016933.359375, "misalign/gamma_bracketed_rate": 0.9971183687448502, "misalign/gamma_reward_residual": -1.4140535995466053e-05, "misalign/gamma_star": 9540094.09375, "misalign/js_divergence": 0.6251252107322216, "misalign/reverse_kl_divergence": 2.4535476565361023, "misalign/reverse_kl_divergence_per_token": 0.006300629815086722, "misalign/reward_a": -4.234024614095688, "misalign/reward_b": -17.268560528755188, "misalign/reward_improvement": 13.034542560577393, "misalign/reward_improvement_over_reverse_kl": 4.815744161605835, "misalign/reward_improvement_per_token": 0.030366417719051242, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -434.87897872924805, "misalign/reward_vocab_std": 651.9961624145508, "misalign/symmetric_kl": 5.06635195016861, "misalign/tv_distance": 14.232259213924408, "num_tokens": 1550382.0, "rewards/accuracies": 0.953125, "rewards/chosen": 0.12840933166444302, "rewards/margins": 0.7322482690215111, "rewards/rejected": -0.6038389429450035, "step": 9, "support/residual_count": 151893.478515625, "support/residual_mass_policy": 0.03697363403625786, "support/residual_mass_reference": 0.037607218604534864, "support/residual_reward": -0.5447902157902718, "support/runtime_width": 42.5230073928833, "support/sampled_loser_rank": 0.6056095100939274, "support/sampled_reward_rank": -0.31354507617652416, "support/sampled_token_added_rate": 0.03456789907068014, "support/sampled_winner_rank": 0.6345465183258057, "support/selected_width": 42.5230073928833, "support/stored_width": 42.5230073928833 }, { "epoch": 0.15625, "grad_norm": 87.10169982910156, "kl/sequence_policy_ref": -6.291569083929062, "kl/vocab_forward": 5.60641685128212, "kl/vocab_js": 1.291469193994999, "kl/vocab_reverse": 5.0420292019844055, "kl/vocab_symmetric": 10.648443281650543, "learning_rate": 2e-06, "logps/chosen": -468.0983257293701, "logps/rejected": -1285.5955200195312, "loss": 0.9025, "loss/dpo": 0.3525316398590803, "misalign/J": 54.998396158218384, "misalign/J_aux_loss": 0.5499839466065168, "misalign/J_aux_loss_raw": 54.998396158218384, "misalign/J_over_reverse_kl": 9.880838811397552, "misalign/J_per_token": 0.07741667260415852, "misalign/compressed_reward_absmax": 3692.7807006835938, "misalign/compressed_reward_range": 5182.304748535156, "misalign/entropy_a": 1031.7844772338867, "misalign/entropy_b": 1076.0891571044922, "misalign/forward_kl_divergence": 5.60641685128212, "misalign/forward_kl_divergence_per_token": 0.01369796262588352, "misalign/gamma_abs_times_reward_std": 11830781.9375, "misalign/gamma_bracketed_rate": 0.9962232336401939, "misalign/gamma_reward_residual": 9.550651896006457e-06, "misalign/gamma_star": 24339454.65625, "misalign/js_divergence": 1.291469193994999, "misalign/reverse_kl_divergence": 5.0420292019844055, "misalign/reverse_kl_divergence_per_token": 0.011788319039624184, "misalign/reward_a": -0.19761592149734497, "misalign/reward_b": -18.621128231287003, "misalign/reward_improvement": 18.42351984977722, "misalign/reward_improvement_over_reverse_kl": 3.5281217098236084, "misalign/reward_improvement_per_token": 0.044580711517482996, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -340.7347640991211, "misalign/reward_vocab_std": 628.4291152954102, "misalign/symmetric_kl": 10.648443281650543, "misalign/tv_distance": 18.700318098068237, "num_tokens": 1705187.0, "rewards/accuracies": 0.9375, "rewards/chosen": -0.055842478293925524, "rewards/margins": 1.1466289013624191, "rewards/rejected": -1.2024713829159737, "step": 10, "support/residual_count": 151893.390625, "support/residual_mass_policy": 0.03804836701601744, "support/residual_mass_reference": 0.03930599056184292, "support/residual_reward": -0.42244877200573683, "support/runtime_width": 42.61000061035156, "support/sampled_loser_rank": 0.6178636997938156, "support/sampled_reward_rank": -0.380008390173316, "support/sampled_token_added_rate": 0.03352847881615162, "support/sampled_winner_rank": 0.6379577368497849, "support/selected_width": 42.61000061035156, "support/stored_width": 42.61000061035156 }, { "epoch": 0.171875, "grad_norm": 105.65762329101562, "kl/sequence_policy_ref": -11.886906266212463, "kl/vocab_forward": 10.033158540725708, "kl/vocab_js": 2.2728197276592255, "kl/vocab_reverse": 8.8297780752182, "kl/vocab_symmetric": 18.862935781478882, "learning_rate": 2e-06, "logps/chosen": -864.007137298584, "logps/rejected": -1354.3248596191406, "loss": 0.957, "loss/dpo": 0.25358179584145546, "misalign/J": 70.34657621383667, "misalign/J_aux_loss": 0.7034657262265682, "misalign/J_aux_loss_raw": 70.34657621383667, "misalign/J_over_reverse_kl": 7.123360276222229, "misalign/J_per_token": 0.07085963152348995, "misalign/compressed_reward_absmax": 4522.3524169921875, "misalign/compressed_reward_range": 6510.963439941406, "misalign/entropy_a": 1291.0633544921875, "misalign/entropy_b": 1368.3875427246094, "misalign/forward_kl_divergence": 10.033158540725708, "misalign/forward_kl_divergence_per_token": 0.015985821490176022, "misalign/gamma_abs_times_reward_std": 16111157.5625, "misalign/gamma_bracketed_rate": 0.9964210242033005, "misalign/gamma_reward_residual": 2.2860098624732927e-05, "misalign/gamma_star": 28422147.375, "misalign/js_divergence": 2.2728197276592255, "misalign/reverse_kl_divergence": 8.8297780752182, "misalign/reverse_kl_divergence_per_token": 0.013298386707901955, "misalign/reward_a": 11.391705840826035, "misalign/reward_b": -19.758893489837646, "misalign/reward_improvement": 31.150599718093872, "misalign/reward_improvement_over_reverse_kl": 3.3264004588127136, "misalign/reward_improvement_per_token": 0.04476729570887983, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -656.2439975738525, "misalign/reward_vocab_std": 782.3782577514648, "misalign/symmetric_kl": 18.862935781478882, "misalign/tv_distance": 28.730425596237183, "num_tokens": 1887093.0, "rewards/accuracies": 0.96875, "rewards/chosen": -0.3179726116359234, "rewards/margins": 1.741436019539833, "rewards/rejected": -2.0594086199998856, "step": 11, "support/residual_count": 151893.275390625, "support/residual_mass_policy": 0.035074356477707624, "support/residual_mass_reference": 0.037040101597085595, "support/residual_reward": -0.4819745300337672, "support/runtime_width": 42.72910785675049, "support/sampled_loser_rank": 0.6259545609354973, "support/sampled_reward_rank": -0.3720765591133386, "support/sampled_token_added_rate": 0.03333452111110091, "support/sampled_winner_rank": 0.6261583790183067, "support/selected_width": 42.72910785675049, "support/stored_width": 42.72910785675049 }, { "epoch": 0.1875, "grad_norm": 113.49061584472656, "kl/sequence_policy_ref": -10.061190009117126, "kl/vocab_forward": 11.224669754505157, "kl/vocab_js": 2.5259178578853607, "kl/vocab_reverse": 9.792702317237854, "kl/vocab_symmetric": 21.017370462417603, "learning_rate": 2e-06, "logps/chosen": -895.415397644043, "logps/rejected": -1483.4818572998047, "loss": 1.1275, "loss/dpo": 0.3059763703495264, "misalign/J": 82.15509986877441, "misalign/J_aux_loss": 0.8215509578585625, "misalign/J_aux_loss_raw": 82.15509986877441, "misalign/J_over_reverse_kl": 9.914618968963623, "misalign/J_per_token": 0.1017089462839067, "misalign/compressed_reward_absmax": 4446.288055419922, "misalign/compressed_reward_range": 6255.252502441406, "misalign/entropy_a": 1389.9479522705078, "misalign/entropy_b": 1475.2511596679688, "misalign/forward_kl_divergence": 11.224669754505157, "misalign/forward_kl_divergence_per_token": 0.017279054620303214, "misalign/gamma_abs_times_reward_std": 19089820.0, "misalign/gamma_bracketed_rate": 0.994603119790554, "misalign/gamma_reward_residual": 1.673686114145312e-05, "misalign/gamma_star": 34029449.5, "misalign/js_divergence": 2.5259178578853607, "misalign/reverse_kl_divergence": 9.792702317237854, "misalign/reverse_kl_divergence_per_token": 0.015302568324841559, "misalign/reward_a": 18.606368482112885, "misalign/reward_b": -13.686601161956787, "misalign/reward_improvement": 32.2929527759552, "misalign/reward_improvement_over_reverse_kl": 3.0961980521678925, "misalign/reward_improvement_per_token": 0.04442322696559131, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -424.1544990539551, "misalign/reward_vocab_std": 740.4278564453125, "misalign/symmetric_kl": 21.017370462417603, "misalign/tv_distance": 30.929341316223145, "num_tokens": 2067343.0, "rewards/accuracies": 0.921875, "rewards/chosen": -0.20568968541920185, "rewards/margins": 1.6008586585521698, "rewards/rejected": -1.8065483272075653, "step": 12, "support/residual_count": 151893.384765625, "support/residual_mass_policy": 0.04051102139055729, "support/residual_mass_reference": 0.042460997588932514, "support/residual_reward": -0.3618390057235956, "support/runtime_width": 42.61893367767334, "support/sampled_loser_rank": 0.6536427140235901, "support/sampled_reward_rank": -0.37369205243885517, "support/sampled_token_added_rate": 0.037714328384026885, "support/sampled_winner_rank": 0.6573853716254234, "support/selected_width": 42.61893367767334, "support/stored_width": 42.61893367767334 }, { "epoch": 0.1875, "eval_kl/sequence_policy_ref": -17.01151405274868, "eval_kl/vocab_forward": 15.591387048363686, "eval_kl/vocab_js": 3.397782253101468, "eval_kl/vocab_reverse": 13.213865287601948, "eval_kl/vocab_symmetric": 28.80525030195713, "eval_logps/chosen": -734.3654553890228, "eval_logps/rejected": -1399.9234561920166, "eval_loss": 1.0719456672668457, "eval_loss/dpo": 0.21070287289330736, "eval_misalign/J": 86.12427139282227, "eval_misalign/J_aux_loss": 0.8612426882609725, "eval_misalign/J_aux_loss_raw": 86.12427139282227, "eval_misalign/J_over_reverse_kl": 7.439761482179165, "eval_misalign/J_per_token": 0.11331278597936034, "eval_misalign/compressed_reward_absmax": 4236.706287384033, "eval_misalign/compressed_reward_range": 6027.056529998779, "eval_misalign/entropy_a": 1207.335482597351, "eval_misalign/entropy_b": 1298.0544576644897, "eval_misalign/forward_kl_divergence": 15.591387048363686, "eval_misalign/forward_kl_divergence_per_token": 0.027479787677293643, "eval_misalign/gamma_abs_times_reward_std": 20802586.9921875, "eval_misalign/gamma_bracketed_rate": 0.9936437727883458, "eval_misalign/gamma_reward_residual": 2.4353206566019026e-05, "eval_misalign/gamma_star": 37987713.1640625, "eval_misalign/js_divergence": 3.397782253101468, "eval_misalign/reverse_kl_divergence": 13.213865287601948, "eval_misalign/reverse_kl_divergence_per_token": 0.022622147007496096, "eval_misalign/reward_a": 23.110429362626746, "eval_misalign/reward_b": -14.30728217586875, "eval_misalign/reward_improvement": 37.41771391034126, "eval_misalign/reward_improvement_over_reverse_kl": 2.6861571483314037, "eval_misalign/reward_improvement_per_token": 0.05757526887464337, "eval_misalign/reward_signal_low_rate": 0.0, "eval_misalign/reward_vocab_mean": -490.11555767059326, "eval_misalign/reward_vocab_std": 731.3202633857727, "eval_misalign/symmetric_kl": 28.80525030195713, "eval_misalign/tv_distance": 33.611202627420425, "eval_rewards/accuracies": 0.931640625, "eval_rewards/chosen": -0.5189501565182582, "eval_rewards/margins": 2.3644025400280952, "eval_rewards/rejected": -2.883352691307664, "eval_runtime": 100.9771, "eval_samples_per_second": 5.07, "eval_steps_per_second": 0.634, "eval_support/residual_count": 151893.29125976562, "eval_support/residual_mass_policy": 0.04048109907307662, "eval_support/residual_mass_reference": 0.04279232310364023, "eval_support/residual_reward": -0.4695481152739376, "eval_support/runtime_width": 42.70963191986084, "eval_support/sampled_loser_rank": 0.6486562248319387, "eval_support/sampled_reward_rank": -0.37071577250026166, "eval_support/sampled_token_added_rate": 0.037317203474231064, "eval_support/sampled_winner_rank": 0.6506854901090264, "eval_support/selected_width": 42.70963191986084, "eval_support/stored_width": 42.70963191986084, "step": 12 }, { "epoch": 0.203125, "grad_norm": 143.4099884033203, "kl/sequence_policy_ref": -17.67092001438141, "kl/vocab_forward": 16.044883847236633, "kl/vocab_js": 3.4656281918287277, "kl/vocab_reverse": 13.393466770648956, "kl/vocab_symmetric": 29.438353061676025, "learning_rate": 2e-06, "logps/chosen": -670.5424919128418, "logps/rejected": -1345.9356536865234, "loss": 1.3177, "loss/dpo": 0.21267282590270042, "misalign/J": 110.50714302062988, "misalign/J_aux_loss": 1.1050714254379272, "misalign/J_aux_loss_raw": 110.50714302062988, "misalign/J_over_reverse_kl": 8.267582476139069, "misalign/J_per_token": 0.11648859549313784, "misalign/compressed_reward_absmax": 4362.797546386719, "misalign/compressed_reward_range": 6223.5970458984375, "misalign/entropy_a": 1131.1556549072266, "misalign/entropy_b": 1221.6925888061523, "misalign/forward_kl_divergence": 16.044883847236633, "misalign/forward_kl_divergence_per_token": 0.02564867539331317, "misalign/gamma_abs_times_reward_std": 24747786.5, "misalign/gamma_bracketed_rate": 0.9937806725502014, "misalign/gamma_reward_residual": 4.918595743674814e-05, "misalign/gamma_star": 46901569.25, "misalign/js_divergence": 3.4656281918287277, "misalign/reverse_kl_divergence": 13.393466770648956, "misalign/reverse_kl_divergence_per_token": 0.02141062926966697, "misalign/reward_a": 23.634789615869522, "misalign/reward_b": -12.488820567727089, "misalign/reward_improvement": 36.12361431121826, "misalign/reward_improvement_over_reverse_kl": 2.6490939259529114, "misalign/reward_improvement_per_token": 0.05916513060219586, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -522.8331413269043, "misalign/reward_vocab_std": 745.3072738647461, "misalign/symmetric_kl": 29.438353061676025, "misalign/tv_distance": 33.53639495372772, "num_tokens": 2250359.0, "rewards/accuracies": 0.953125, "rewards/chosen": -0.6528808567672968, "rewards/margins": 2.228422373533249, "rewards/rejected": -2.8813031911849976, "step": 13, "support/residual_count": 151893.263671875, "support/residual_mass_policy": 0.03615651559084654, "support/residual_mass_reference": 0.03887659031897783, "support/residual_reward": -0.4554095212370157, "support/runtime_width": 42.7385835647583, "support/sampled_loser_rank": 0.6245415061712265, "support/sampled_reward_rank": -0.40703338757157326, "support/sampled_token_added_rate": 0.032946799183264375, "support/sampled_winner_rank": 0.6250991076231003, "support/selected_width": 42.7385835647583, "support/stored_width": 42.7385835647583 }, { "epoch": 0.21875, "grad_norm": 129.94761657714844, "kl/sequence_policy_ref": -28.61777091026306, "kl/vocab_forward": 24.432228088378906, "kl/vocab_js": 5.003187119960785, "kl/vocab_reverse": 19.63451886177063, "kl/vocab_symmetric": 44.0667519569397, "learning_rate": 2e-06, "logps/chosen": -519.6495475769043, "logps/rejected": -1307.973617553711, "loss": 1.2097, "loss/dpo": 0.13245126977562904, "misalign/J": 107.72635746002197, "misalign/J_aux_loss": 1.0772635713219643, "misalign/J_aux_loss_raw": 107.72635746002197, "misalign/J_over_reverse_kl": 6.927842974662781, "misalign/J_per_token": 0.1508565410040319, "misalign/compressed_reward_absmax": 3698.0084838867188, "misalign/compressed_reward_range": 5319.326965332031, "misalign/entropy_a": 1003.0962753295898, "misalign/entropy_b": 1097.5271453857422, "misalign/forward_kl_divergence": 24.432228088378906, "misalign/forward_kl_divergence_per_token": 0.05230529000982642, "misalign/gamma_abs_times_reward_std": 28536720.375, "misalign/gamma_bracketed_rate": 0.9909562915563583, "misalign/gamma_reward_residual": 4.479655626710155e-05, "misalign/gamma_star": 52695571.75, "misalign/js_divergence": 5.003187119960785, "misalign/reverse_kl_divergence": 19.63451886177063, "misalign/reverse_kl_divergence_per_token": 0.04139284580014646, "misalign/reward_a": 25.409843683242798, "misalign/reward_b": -16.79699671268463, "misalign/reward_improvement": 42.20684003829956, "misalign/reward_improvement_over_reverse_kl": 2.0897003561258316, "misalign/reward_improvement_per_token": 0.08610636787489057, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -519.6820330619812, "misalign/reward_vocab_std": 664.3161697387695, "misalign/symmetric_kl": 44.0667519569397, "misalign/tv_distance": 37.26307129859924, "num_tokens": 2405248.0, "rewards/accuracies": 0.984375, "rewards/chosen": -0.8268119320273399, "rewards/margins": 4.069930404424667, "rewards/rejected": -4.896742224693298, "step": 14, "support/residual_count": 151893.26953125, "support/residual_mass_policy": 0.03416757704690099, "support/residual_mass_reference": 0.03690562699921429, "support/residual_reward": -0.5111633716151118, "support/runtime_width": 42.731074810028076, "support/sampled_loser_rank": 0.5858863964676857, "support/sampled_reward_rank": -0.4269859306514263, "support/sampled_token_added_rate": 0.03297502198256552, "support/sampled_winner_rank": 0.5992331206798553, "support/selected_width": 42.731074810028076, "support/stored_width": 42.731074810028076 }, { "epoch": 0.234375, "grad_norm": 181.91363525390625, "kl/sequence_policy_ref": -37.09248995780945, "kl/vocab_forward": 36.39622640609741, "kl/vocab_js": 7.3817285895347595, "kl/vocab_reverse": 28.830130338668823, "kl/vocab_symmetric": 65.22635078430176, "learning_rate": 2e-06, "logps/chosen": -861.3702850341797, "logps/rejected": -1442.107177734375, "loss": 1.6622, "loss/dpo": 0.07950026832986623, "misalign/J": 158.2665023803711, "misalign/J_aux_loss": 1.582665003836155, "misalign/J_aux_loss_raw": 158.2665023803711, "misalign/J_over_reverse_kl": 6.344271242618561, "misalign/J_per_token": 0.14481874648481607, "misalign/compressed_reward_absmax": 4787.8853759765625, "misalign/compressed_reward_range": 6777.99462890625, "misalign/entropy_a": 1239.900390625, "misalign/entropy_b": 1391.7532577514648, "misalign/forward_kl_divergence": 36.39622640609741, "misalign/forward_kl_divergence_per_token": 0.046229132916778326, "misalign/gamma_abs_times_reward_std": 40674196.875, "misalign/gamma_bracketed_rate": 0.991030216217041, "misalign/gamma_reward_residual": 7.2218020022774e-05, "misalign/gamma_star": 71395850.0, "misalign/js_divergence": 7.3817285895347595, "misalign/reverse_kl_divergence": 28.830130338668823, "misalign/reverse_kl_divergence_per_token": 0.034299688413739204, "misalign/reward_a": 44.823195934295654, "misalign/reward_b": -15.68426263332367, "misalign/reward_improvement": 60.507455825805664, "misalign/reward_improvement_over_reverse_kl": 2.0816327780485153, "misalign/reward_improvement_per_token": 0.07316383346915245, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -574.4606781005859, "misalign/reward_vocab_std": 834.6710662841797, "misalign/symmetric_kl": 65.22635078430176, "misalign/tv_distance": 52.97303628921509, "num_tokens": 2598130.0, "rewards/accuracies": 0.984375, "rewards/chosen": -1.3571155220270157, "rewards/margins": 4.704267263412476, "rewards/rejected": -6.06138277053833, "step": 15, "support/residual_count": 151893.16015625, "support/residual_mass_policy": 0.0329043276142329, "support/residual_mass_reference": 0.0375856957398355, "support/residual_reward": -0.433580182492733, "support/runtime_width": 42.837427616119385, "support/sampled_loser_rank": 0.6224480830132961, "support/sampled_reward_rank": -0.4383445642888546, "support/sampled_token_added_rate": 0.03260477026924491, "support/sampled_winner_rank": 0.6216517090797424, "support/selected_width": 42.837427616119385, "support/stored_width": 42.837427616119385 }, { "epoch": 0.25, "grad_norm": 197.3428497314453, "kl/sequence_policy_ref": -48.6728458404541, "kl/vocab_forward": 41.45539164543152, "kl/vocab_js": 8.142653048038483, "kl/vocab_reverse": 31.82970356941223, "kl/vocab_symmetric": 73.28509140014648, "learning_rate": 2e-06, "logps/chosen": -746.3570137023926, "logps/rejected": -1378.0999755859375, "loss": 1.7196, "loss/dpo": 0.21162739349529147, "misalign/J": 150.7994818687439, "misalign/J_aux_loss": 1.5079948231577873, "misalign/J_aux_loss_raw": 150.7994818687439, "misalign/J_over_reverse_kl": 6.891877442598343, "misalign/J_per_token": 0.1977673191577196, "misalign/compressed_reward_absmax": 4197.204010009766, "misalign/compressed_reward_range": 6020.658874511719, "misalign/entropy_a": 1102.5961456298828, "misalign/entropy_b": 1252.7644424438477, "misalign/forward_kl_divergence": 41.45539164543152, "misalign/forward_kl_divergence_per_token": 0.07184931915253401, "misalign/gamma_abs_times_reward_std": 42797047.125, "misalign/gamma_bracketed_rate": 0.9879247918725014, "misalign/gamma_reward_residual": 7.625430021107604e-05, "misalign/gamma_star": 70593251.625, "misalign/js_divergence": 8.142653048038483, "misalign/reverse_kl_divergence": 31.82970356941223, "misalign/reverse_kl_divergence_per_token": 0.04737356537953019, "misalign/reward_a": 40.68007683753967, "misalign/reward_b": -19.660471826791763, "misalign/reward_improvement": 60.34055471420288, "misalign/reward_improvement_over_reverse_kl": 1.6718981862068176, "misalign/reward_improvement_per_token": 0.06326864054426551, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -500.27739334106445, "misalign/reward_vocab_std": 718.7537536621094, "misalign/symmetric_kl": 73.28509140014648, "misalign/tv_distance": 52.97745728492737, "num_tokens": 2767079.0, "rewards/accuracies": 0.9375, "rewards/chosen": -2.074222356081009, "rewards/margins": 5.586124628782272, "rewards/rejected": -7.660347044467926, "step": 16, "support/residual_count": 151892.984375, "support/residual_mass_policy": 0.04499144037254155, "support/residual_mass_reference": 0.0501289798412472, "support/residual_reward": -0.45880454778671265, "support/runtime_width": 43.018609046936035, "support/sampled_loser_rank": 0.5783994421362877, "support/sampled_reward_rank": -0.40191334672272205, "support/sampled_token_added_rate": 0.035224413964897394, "support/sampled_winner_rank": 0.6004925258457661, "support/selected_width": 43.018609046936035, "support/stored_width": 43.018609046936035 }, { "epoch": 0.265625, "grad_norm": 180.9165802001953, "kl/sequence_policy_ref": -57.82066249847412, "kl/vocab_forward": 49.91004800796509, "kl/vocab_js": 9.40449070930481, "kl/vocab_reverse": 36.53464651107788, "kl/vocab_symmetric": 86.44468975067139, "learning_rate": 2e-06, "logps/chosen": -609.6646957397461, "logps/rejected": -1610.3834075927734, "loss": 1.7231, "loss/dpo": 0.09814724331954494, "misalign/J": 162.49910640716553, "misalign/J_aux_loss": 1.624990999698639, "misalign/J_aux_loss_raw": 162.49910640716553, "misalign/J_over_reverse_kl": 5.963241904973984, "misalign/J_per_token": 0.25427408143877983, "misalign/compressed_reward_absmax": 4177.482116699219, "misalign/compressed_reward_range": 5814.5203857421875, "misalign/entropy_a": 1132.220069885254, "misalign/entropy_b": 1293.5945892333984, "misalign/forward_kl_divergence": 49.91004800796509, "misalign/forward_kl_divergence_per_token": 0.08865668019279838, "misalign/gamma_abs_times_reward_std": 45817248.5, "misalign/gamma_bracketed_rate": 0.9841953068971634, "misalign/gamma_reward_residual": 6.89706002958701e-05, "misalign/gamma_star": 86042709.0, "misalign/js_divergence": 9.40449070930481, "misalign/reverse_kl_divergence": 36.53464651107788, "misalign/reverse_kl_divergence_per_token": 0.05950516927987337, "misalign/reward_a": 46.90000104904175, "misalign/reward_b": -18.144596874713898, "misalign/reward_improvement": 65.04460048675537, "misalign/reward_improvement_over_reverse_kl": 1.706669107079506, "misalign/reward_improvement_per_token": 0.0920080472715199, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -442.9830160140991, "misalign/reward_vocab_std": 696.7616271972656, "misalign/symmetric_kl": 86.44468975067139, "misalign/tv_distance": 57.59225845336914, "num_tokens": 2937506.0, "rewards/accuracies": 0.984375, "rewards/chosen": -1.9826722741127014, "rewards/margins": 7.598788321018219, "rewards/rejected": -9.581460356712341, "step": 17, "support/residual_count": 151893.49609375, "support/residual_mass_policy": 0.0333835429046303, "support/residual_mass_reference": 0.03812613524496555, "support/residual_reward": -0.4731739591807127, "support/runtime_width": 42.50527763366699, "support/sampled_loser_rank": 0.6437485739588737, "support/sampled_reward_rank": -0.3562327502295375, "support/sampled_token_added_rate": 0.035286844009533525, "support/sampled_winner_rank": 0.6677471101284027, "support/selected_width": 42.50527763366699, "support/stored_width": 42.50527763366699 }, { "epoch": 0.28125, "grad_norm": 149.1407928466797, "kl/sequence_policy_ref": -51.036746978759766, "kl/vocab_forward": 44.88189697265625, "kl/vocab_js": 8.306392669677734, "kl/vocab_reverse": 32.83620524406433, "kl/vocab_symmetric": 77.71810150146484, "learning_rate": 2e-06, "logps/chosen": -560.9642581939697, "logps/rejected": -1253.7114028930664, "loss": 1.5028, "loss/dpo": 0.17861688579432666, "misalign/J": 132.42161083221436, "misalign/J_aux_loss": 1.324216105043888, "misalign/J_aux_loss_raw": 132.42161083221436, "misalign/J_over_reverse_kl": 8.582664713263512, "misalign/J_per_token": 0.27548689767718315, "misalign/compressed_reward_absmax": 3723.6834411621094, "misalign/compressed_reward_range": 5332.2451171875, "misalign/entropy_a": 928.9597320556641, "misalign/entropy_b": 1068.3447341918945, "misalign/forward_kl_divergence": 44.88189697265625, "misalign/forward_kl_divergence_per_token": 0.11330410279333591, "misalign/gamma_abs_times_reward_std": 38645046.25, "misalign/gamma_bracketed_rate": 0.9823459088802338, "misalign/gamma_reward_residual": 7.946674531922326e-05, "misalign/gamma_star": 71871948.75, "misalign/js_divergence": 8.306392669677734, "misalign/reverse_kl_divergence": 32.83620524406433, "misalign/reverse_kl_divergence_per_token": 0.08078084606677294, "misalign/reward_a": 35.60689043998718, "misalign/reward_b": -16.05292272567749, "misalign/reward_improvement": 51.65981483459473, "misalign/reward_improvement_over_reverse_kl": 1.344532385468483, "misalign/reward_improvement_per_token": 0.06508979946374893, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -361.802695274353, "misalign/reward_vocab_std": 639.2480430603027, "misalign/symmetric_kl": 77.71810150146484, "misalign/tv_distance": 48.04610013961792, "num_tokens": 3089181.0, "rewards/accuracies": 0.953125, "rewards/chosen": -2.0581542253494263, "rewards/margins": 6.091041147708893, "rewards/rejected": -8.149195373058319, "step": 18, "support/residual_count": 151893.052734375, "support/residual_mass_policy": 0.039642924442887306, "support/residual_mass_reference": 0.049897957127541304, "support/residual_reward": -0.4336713273078203, "support/runtime_width": 42.948453426361084, "support/sampled_loser_rank": 0.6003664061427116, "support/sampled_reward_rank": -0.5117907077074051, "support/sampled_token_added_rate": 0.03761800215579569, "support/sampled_winner_rank": 0.6974114552140236, "support/selected_width": 42.948453426361084, "support/stored_width": 42.948453426361084 }, { "epoch": 0.296875, "grad_norm": 236.4145965576172, "kl/sequence_policy_ref": -70.85580825805664, "kl/vocab_forward": 61.658048152923584, "kl/vocab_js": 11.049875855445862, "kl/vocab_reverse": 42.84557771682739, "kl/vocab_symmetric": 104.50362396240234, "learning_rate": 2e-06, "logps/chosen": -680.9906196594238, "logps/rejected": -1526.8777770996094, "loss": 2.3197, "loss/dpo": 0.15028794163845305, "misalign/J": 216.9369659423828, "misalign/J_aux_loss": 2.169369585812092, "misalign/J_aux_loss_raw": 216.9369659423828, "misalign/J_over_reverse_kl": 6.0987227857112885, "misalign/J_per_token": 0.24055337626487017, "misalign/compressed_reward_absmax": 4234.713134765625, "misalign/compressed_reward_range": 5953.008850097656, "misalign/entropy_a": 1090.484504699707, "misalign/entropy_b": 1275.7961730957031, "misalign/forward_kl_divergence": 61.658048152923584, "misalign/forward_kl_divergence_per_token": 0.0996482465416193, "misalign/gamma_abs_times_reward_std": 67395967.375, "misalign/gamma_bracketed_rate": 0.9844093844294548, "misalign/gamma_reward_residual": 0.00012266499561519595, "misalign/gamma_star": 125619402.0, "misalign/js_divergence": 11.049875855445862, "misalign/reverse_kl_divergence": 42.84557771682739, "misalign/reverse_kl_divergence_per_token": 0.06486277049407363, "misalign/reward_a": 51.76768445968628, "misalign/reward_b": -14.660561382770538, "misalign/reward_improvement": 66.42824363708496, "misalign/reward_improvement_over_reverse_kl": 1.57049061357975, "misalign/reward_improvement_per_token": 0.11498506926000118, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -568.4373540878296, "misalign/reward_vocab_std": 725.4858703613281, "misalign/symmetric_kl": 104.50362396240234, "misalign/tv_distance": 62.00803565979004, "num_tokens": 3255573.0, "rewards/accuracies": 0.953125, "rewards/chosen": -3.0821579098701477, "rewards/margins": 8.006846249103546, "rewards/rejected": -11.089004278182983, "step": 19, "support/residual_count": 151893.365234375, "support/residual_mass_policy": 0.04206296429038048, "support/residual_mass_reference": 0.046698169549927115, "support/residual_reward": -0.47489158436656, "support/runtime_width": 42.64014720916748, "support/sampled_loser_rank": 0.605025552213192, "support/sampled_reward_rank": -0.3768458142876625, "support/sampled_token_added_rate": 0.045746787916868925, "support/sampled_winner_rank": 0.6195746287703514, "support/selected_width": 42.64014720916748, "support/stored_width": 42.64014720916748 }, { "epoch": 0.3125, "grad_norm": 116.85738372802734, "kl/sequence_policy_ref": -65.05696249008179, "kl/vocab_forward": 57.7905068397522, "kl/vocab_js": 10.413756370544434, "kl/vocab_reverse": 40.30951166152954, "kl/vocab_symmetric": 98.10001516342163, "learning_rate": 2e-06, "logps/chosen": -673.3448905944824, "logps/rejected": -1367.0256805419922, "loss": 1.323, "loss/dpo": 0.1641167537018191, "misalign/J": 115.88617134094238, "misalign/J_aux_loss": 1.1588616967201233, "misalign/J_aux_loss_raw": 115.88617134094238, "misalign/J_over_reverse_kl": 3.0085965991020203, "misalign/J_per_token": 0.16471682861447334, "misalign/compressed_reward_absmax": 4075.5525817871094, "misalign/compressed_reward_range": 5858.878479003906, "misalign/entropy_a": 1006.4973907470703, "misalign/entropy_b": 1188.8379135131836, "misalign/forward_kl_divergence": 57.7905068397522, "misalign/forward_kl_divergence_per_token": 0.10363293252885342, "misalign/gamma_abs_times_reward_std": 35261591.5, "misalign/gamma_bracketed_rate": 0.9897296130657196, "misalign/gamma_reward_residual": 6.685065909550758e-05, "misalign/gamma_star": 56798719.5, "misalign/js_divergence": 10.413756370544434, "misalign/reverse_kl_divergence": 40.30951166152954, "misalign/reverse_kl_divergence_per_token": 0.07322107395157218, "misalign/reward_a": 52.47555136680603, "misalign/reward_b": -10.230955243110657, "misalign/reward_improvement": 62.70651149749756, "misalign/reward_improvement_over_reverse_kl": 1.4691433906555176, "misalign/reward_improvement_per_token": 0.10327118635177612, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -398.2197914123535, "misalign/reward_vocab_std": 702.5999450683594, "misalign/symmetric_kl": 98.10001516342163, "misalign/tv_distance": 58.26440095901489, "num_tokens": 3415287.0, "rewards/accuracies": 0.96875, "rewards/chosen": -3.106875829398632, "rewards/margins": 6.797641098499298, "rewards/rejected": -9.904516816139221, "step": 20, "support/residual_count": 151892.931640625, "support/residual_mass_policy": 0.03865605127066374, "support/residual_mass_reference": 0.04426591098308563, "support/residual_reward": -0.37147790379822254, "support/runtime_width": 43.06875991821289, "support/sampled_loser_rank": 0.6501528844237328, "support/sampled_reward_rank": -0.459061823785305, "support/sampled_token_added_rate": 0.037211825139820576, "support/sampled_winner_rank": 0.6419277414679527, "support/selected_width": 43.06875991821289, "support/stored_width": 43.06875991821289 }, { "epoch": 0.328125, "grad_norm": 167.24000549316406, "kl/sequence_policy_ref": -87.30181169509888, "kl/vocab_forward": 80.11595010757446, "kl/vocab_js": 13.753250360488892, "kl/vocab_reverse": 52.890267848968506, "kl/vocab_symmetric": 133.00623035430908, "learning_rate": 2e-06, "logps/chosen": -860.2458534240723, "logps/rejected": -1500.8129959106445, "loss": 1.9906, "loss/dpo": 0.29799531144089997, "misalign/J": 169.26398181915283, "misalign/J_aux_loss": 1.6926398500800133, "misalign/J_aux_loss_raw": 169.26398181915283, "misalign/J_over_reverse_kl": 4.501902684569359, "misalign/J_per_token": 0.20157606061547995, "misalign/compressed_reward_absmax": 4415.365386962891, "misalign/compressed_reward_range": 6252.089111328125, "misalign/entropy_a": 1110.4772415161133, "misalign/entropy_b": 1350.9320755004883, "misalign/forward_kl_divergence": 80.11595010757446, "misalign/forward_kl_divergence_per_token": 0.10054660588502884, "misalign/gamma_abs_times_reward_std": 54722816.875, "misalign/gamma_bracketed_rate": 0.9853794500231743, "misalign/gamma_reward_residual": 0.00010197218091434479, "misalign/gamma_star": 109237338.75, "misalign/js_divergence": 13.753250360488892, "misalign/reverse_kl_divergence": 52.890267848968506, "misalign/reverse_kl_divergence_per_token": 0.06223124684765935, "misalign/reward_a": 63.141523361206055, "misalign/reward_b": -12.613204658031464, "misalign/reward_improvement": 75.75473356246948, "misalign/reward_improvement_over_reverse_kl": 1.376510500907898, "misalign/reward_improvement_per_token": 0.08627395983785391, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -474.14845275878906, "misalign/reward_vocab_std": 756.6061401367188, "misalign/symmetric_kl": 133.00623035430908, "misalign/tv_distance": 71.77893543243408, "num_tokens": 3596201.0, "rewards/accuracies": 0.90625, "rewards/chosen": -5.184990763664246, "rewards/margins": 7.090380907058716, "rewards/rejected": -12.275371551513672, "step": 21, "support/residual_count": 151893.302734375, "support/residual_mass_policy": 0.03361299750395119, "support/residual_mass_reference": 0.03923962963744998, "support/residual_reward": -0.44669216219335794, "support/runtime_width": 42.699111461639404, "support/sampled_loser_rank": 0.5900973714888096, "support/sampled_reward_rank": -0.3974966434761882, "support/sampled_token_added_rate": 0.034115204587578773, "support/sampled_winner_rank": 0.6001664698123932, "support/selected_width": 42.699111461639404, "support/stored_width": 42.699111461639404 }, { "epoch": 0.34375, "grad_norm": 182.3348846435547, "kl/sequence_policy_ref": -98.32647848129272, "kl/vocab_forward": 87.50644826889038, "kl/vocab_js": 14.100131571292877, "kl/vocab_reverse": 54.41349792480469, "kl/vocab_symmetric": 141.9199457168579, "learning_rate": 2e-06, "logps/chosen": -720.9393196105957, "logps/rejected": -1401.5432891845703, "loss": 1.9288, "loss/dpo": 0.18852760957088321, "misalign/J": 174.02261638641357, "misalign/J_aux_loss": 1.7402261197566986, "misalign/J_aux_loss_raw": 174.02261638641357, "misalign/J_over_reverse_kl": 3.6353148818016052, "misalign/J_per_token": 0.21223169937729836, "misalign/compressed_reward_absmax": 3982.8233032226562, "misalign/compressed_reward_range": 5637.1624755859375, "misalign/entropy_a": 958.0937767028809, "misalign/entropy_b": 1190.8169174194336, "misalign/forward_kl_divergence": 87.50644826889038, "misalign/forward_kl_divergence_per_token": 0.14662323985248804, "misalign/gamma_abs_times_reward_std": 59248922.75, "misalign/gamma_bracketed_rate": 0.9862400367856026, "misalign/gamma_reward_residual": 0.0001503152491295623, "misalign/gamma_star": 115726537.75, "misalign/js_divergence": 14.100131571292877, "misalign/reverse_kl_divergence": 54.41349792480469, "misalign/reverse_kl_divergence_per_token": 0.08750392450019717, "misalign/reward_a": 56.57697582244873, "misalign/reward_b": -11.629800856113434, "misalign/reward_improvement": 68.20676565170288, "misalign/reward_improvement_over_reverse_kl": 1.19433955848217, "misalign/reward_improvement_per_token": 0.09249244816601276, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -674.6355495452881, "misalign/reward_vocab_std": 699.2789764404297, "misalign/symmetric_kl": 141.9199457168579, "misalign/tv_distance": 68.87815427780151, "num_tokens": 3758885.0, "rewards/accuracies": 0.96875, "rewards/chosen": -4.974303662776947, "rewards/margins": 9.71668916940689, "rewards/rejected": -14.69099223613739, "step": 22, "support/residual_count": 151893.119140625, "support/residual_mass_policy": 0.033487192122265697, "support/residual_mass_reference": 0.041745478520169854, "support/residual_reward": -0.5358738675713539, "support/runtime_width": 42.88250732421875, "support/sampled_loser_rank": 0.6002072133123875, "support/sampled_reward_rank": -0.3628186024725437, "support/sampled_token_added_rate": 0.036499075358733535, "support/sampled_winner_rank": 0.6214041896164417, "support/selected_width": 42.88250732421875, "support/stored_width": 42.88250732421875 }, { "epoch": 0.359375, "grad_norm": 169.51712036132812, "kl/sequence_policy_ref": -149.24591064453125, "kl/vocab_forward": 133.08092784881592, "kl/vocab_js": 19.9760000705719, "kl/vocab_reverse": 77.37433004379272, "kl/vocab_symmetric": 210.455228805542, "learning_rate": 2e-06, "logps/chosen": -737.869384765625, "logps/rejected": -1762.2871856689453, "loss": 1.9918, "loss/dpo": 0.2823996262759465, "misalign/J": 170.94224166870117, "misalign/J_aux_loss": 1.7094224244356155, "misalign/J_aux_loss_raw": 170.94224166870117, "misalign/J_over_reverse_kl": 3.2799622118473053, "misalign/J_per_token": 0.24247757904231548, "misalign/compressed_reward_absmax": 4777.976379394531, "misalign/compressed_reward_range": 6731.566955566406, "misalign/entropy_a": 1061.749008178711, "misalign/entropy_b": 1356.131118774414, "misalign/forward_kl_divergence": 133.08092784881592, "misalign/forward_kl_divergence_per_token": 0.1782828439027071, "misalign/gamma_abs_times_reward_std": 53340090.0, "misalign/gamma_bracketed_rate": 0.9852898493409157, "misalign/gamma_reward_residual": 5.0128826615036814e-05, "misalign/gamma_star": 100226211.0, "misalign/js_divergence": 19.9760000705719, "misalign/reverse_kl_divergence": 77.37433004379272, "misalign/reverse_kl_divergence_per_token": 0.09528437815606594, "misalign/reward_a": 69.03681755065918, "misalign/reward_b": -18.53004103899002, "misalign/reward_improvement": 87.56686687469482, "misalign/reward_improvement_over_reverse_kl": 1.0574140399694443, "misalign/reward_improvement_per_token": 0.08220357168465853, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -386.6137237548828, "misalign/reward_vocab_std": 798.2211227416992, "misalign/symmetric_kl": 210.455228805542, "misalign/tv_distance": 88.67370319366455, "num_tokens": 3948274.0, "rewards/accuracies": 0.953125, "rewards/chosen": -6.972290515899658, "rewards/margins": 15.904601573944092, "rewards/rejected": -22.876891613006592, "step": 23, "support/residual_count": 151893.173828125, "support/residual_mass_policy": 0.027609078446403146, "support/residual_mass_reference": 0.03574479790404439, "support/residual_reward": -0.29973831586539745, "support/runtime_width": 42.827510833740234, "support/sampled_loser_rank": 0.6080890074372292, "support/sampled_reward_rank": -0.44763438403606415, "support/sampled_token_added_rate": 0.03303293650969863, "support/sampled_winner_rank": 0.6279079839587212, "support/selected_width": 42.827510833740234, "support/stored_width": 42.827510833740234 }, { "epoch": 0.375, "grad_norm": 140.47096252441406, "kl/sequence_policy_ref": -144.5389518737793, "kl/vocab_forward": 128.69008922576904, "kl/vocab_js": 18.313786387443542, "kl/vocab_reverse": 71.02430152893066, "kl/vocab_symmetric": 199.71442413330078, "learning_rate": 2e-06, "logps/chosen": -572.0278053283691, "logps/rejected": -1404.0182037353516, "loss": 2.6061, "loss/dpo": 1.0913660326041281, "misalign/J": 151.477144241333, "misalign/J_aux_loss": 1.5147713869810104, "misalign/J_aux_loss_raw": 151.477144241333, "misalign/J_over_reverse_kl": 3.1627804189920425, "misalign/J_per_token": 0.24758470617234707, "misalign/compressed_reward_absmax": 3778.4814453125, "misalign/compressed_reward_range": 5379.040588378906, "misalign/entropy_a": 782.0902366638184, "misalign/entropy_b": 1035.1530227661133, "misalign/forward_kl_divergence": 128.69008922576904, "misalign/forward_kl_divergence_per_token": 0.2601375840604305, "misalign/gamma_abs_times_reward_std": 47515251.5, "misalign/gamma_bracketed_rate": 0.9839186295866966, "misalign/gamma_reward_residual": 0.00012259059758434887, "misalign/gamma_star": 87554911.0, "misalign/js_divergence": 18.313786387443542, "misalign/reverse_kl_divergence": 71.02430152893066, "misalign/reverse_kl_divergence_per_token": 0.13188489899039268, "misalign/reward_a": 56.8120379447937, "misalign/reward_b": -14.587293282151222, "misalign/reward_improvement": 71.39935445785522, "misalign/reward_improvement_over_reverse_kl": 0.8568570390343666, "misalign/reward_improvement_per_token": 0.08758416399359703, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -452.94491720199585, "misalign/reward_vocab_std": 656.8603706359863, "misalign/symmetric_kl": 199.71442413330078, "misalign/tv_distance": 75.44480800628662, "num_tokens": 4099141.0, "rewards/accuracies": 0.890625, "rewards/chosen": -7.403442680835724, "rewards/margins": 14.100905060768127, "rewards/rejected": -21.504348874092102, "step": 24, "support/residual_count": 151893.353515625, "support/residual_mass_policy": 0.03428218117915094, "support/residual_mass_reference": 0.04287252272479236, "support/residual_reward": -0.5751103330403566, "support/runtime_width": 42.65069341659546, "support/sampled_loser_rank": 0.573052179068327, "support/sampled_reward_rank": -0.40498005226254463, "support/sampled_token_added_rate": 0.03546261019073427, "support/sampled_winner_rank": 0.5977285951375961, "support/selected_width": 42.65069341659546, "support/stored_width": 42.65069341659546 }, { "epoch": 0.375, "eval_kl/sequence_policy_ref": -176.45206832885742, "eval_kl/vocab_forward": 159.24169623851776, "eval_kl/vocab_js": 22.288581863045692, "eval_kl/vocab_reverse": 86.69451874494553, "eval_kl/vocab_symmetric": 245.93626713752747, "eval_logps/chosen": -822.1732840538025, "eval_logps/rejected": -1630.9967403411865, "eval_loss": 2.032864570617676, "eval_loss/dpo": 0.513789746529512, "eval_misalign/J": 151.90749096870422, "eval_misalign/J_aux_loss": 1.519074865616858, "eval_misalign/J_aux_loss_raw": 151.90749096870422, "eval_misalign/J_over_reverse_kl": 2.979654673486948, "eval_misalign/J_per_token": 0.2187155862338841, "eval_misalign/compressed_reward_absmax": 4236.706275939941, "eval_misalign/compressed_reward_range": 6027.056526184082, "eval_misalign/entropy_a": 972.2038908004761, "eval_misalign/entropy_b": 1298.0544624328613, "eval_misalign/forward_kl_divergence": 159.24169623851776, "eval_misalign/forward_kl_divergence_per_token": 0.28451165836304426, "eval_misalign/gamma_abs_times_reward_std": 44413324.9375, "eval_misalign/gamma_bracketed_rate": 0.9879434006288648, "eval_misalign/gamma_reward_residual": 9.759679009846423e-05, "eval_misalign/gamma_star": 76999057.8125, "eval_misalign/js_divergence": 22.288581863045692, "eval_misalign/reverse_kl_divergence": 86.69451874494553, "eval_misalign/reverse_kl_divergence_per_token": 0.14481490349862725, "eval_misalign/reward_a": 73.34608280658722, "eval_misalign/reward_b": -14.307282455265522, "eval_misalign/reward_improvement": 87.65336620807648, "eval_misalign/reward_improvement_over_reverse_kl": 0.9149683965370059, "eval_misalign/reward_improvement_per_token": 0.08359824417857453, "eval_misalign/reward_signal_low_rate": 0.0, "eval_misalign/reward_vocab_mean": -490.11556124687195, "eval_misalign/reward_vocab_std": 731.3202571868896, "eval_misalign/symmetric_kl": 245.93626713752747, "eval_misalign/tv_distance": 92.47358250617981, "eval_rewards/accuracies": 0.91796875, "eval_rewards/chosen": -9.299732282757759, "eval_rewards/margins": 16.690949447453022, "eval_rewards/rejected": -25.99068196117878, "eval_runtime": 101.3791, "eval_samples_per_second": 5.05, "eval_steps_per_second": 0.631, "eval_support/residual_count": 151893.29125976562, "eval_support/residual_mass_policy": 0.031916850464767776, "eval_support/residual_mass_reference": 0.04279232310364023, "eval_support/residual_reward": -0.4695481152739376, "eval_support/runtime_width": 42.70963191986084, "eval_support/sampled_loser_rank": 0.6486562248319387, "eval_support/sampled_reward_rank": -0.37071577250026166, "eval_support/sampled_token_added_rate": 0.037317203474231064, "eval_support/sampled_winner_rank": 0.6506854901090264, "eval_support/selected_width": 42.70963191986084, "eval_support/stored_width": 42.70963191986084, "step": 24 }, { "epoch": 0.390625, "grad_norm": 103.1752700805664, "kl/sequence_policy_ref": -175.5179786682129, "kl/vocab_forward": 165.2755527496338, "kl/vocab_js": 22.945865869522095, "kl/vocab_reverse": 88.68728542327881, "kl/vocab_symmetric": 253.96291160583496, "learning_rate": 2e-06, "logps/chosen": -823.9939117431641, "logps/rejected": -1613.8134307861328, "loss": 1.7244, "loss/dpo": 0.30688550411116466, "misalign/J": 141.75555324554443, "misalign/J_aux_loss": 1.4175555855035782, "misalign/J_aux_loss_raw": 141.75555324554443, "misalign/J_over_reverse_kl": 1.7237665206193924, "misalign/J_per_token": 0.20317152328789234, "misalign/compressed_reward_absmax": 4355.498870849609, "misalign/compressed_reward_range": 6246.024597167969, "misalign/entropy_a": 947.64599609375, "misalign/entropy_b": 1290.7304077148438, "misalign/forward_kl_divergence": 165.2755527496338, "misalign/forward_kl_divergence_per_token": 0.3030826188623905, "misalign/gamma_abs_times_reward_std": 42652071.0, "misalign/gamma_bracketed_rate": 0.9883464574813843, "misalign/gamma_reward_residual": 5.872455130884191e-05, "misalign/gamma_star": 72449817.125, "misalign/js_divergence": 22.945865869522095, "misalign/reverse_kl_divergence": 88.68728542327881, "misalign/reverse_kl_divergence_per_token": 0.13877300918102264, "misalign/reward_a": 74.63873767852783, "misalign/reward_b": -13.649469316005707, "misalign/reward_improvement": 88.28820371627808, "misalign/reward_improvement_over_reverse_kl": 0.9083794951438904, "misalign/reward_improvement_per_token": 0.09202832682058215, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -502.7140769958496, "misalign/reward_vocab_std": 763.2846450805664, "misalign/symmetric_kl": 253.96291160583496, "misalign/tv_distance": 94.16235828399658, "num_tokens": 4267525.0, "rewards/accuracies": 0.96875, "rewards/chosen": -9.091135799884796, "rewards/margins": 16.92132532596588, "rewards/rejected": -26.012461185455322, "step": 25, "support/residual_count": 151893.228515625, "support/residual_mass_policy": 0.02821849100291729, "support/residual_mass_reference": 0.03902764664962888, "support/residual_reward": -0.4604416638612747, "support/runtime_width": 42.77067279815674, "support/sampled_loser_rank": 0.6332324221730232, "support/sampled_reward_rank": -0.385429447516799, "support/sampled_token_added_rate": 0.03589798975735903, "support/sampled_winner_rank": 0.6475523337721825, "support/selected_width": 42.77067279815674, "support/stored_width": 42.77067279815674 }, { "epoch": 0.40625, "grad_norm": 174.2784423828125, "kl/sequence_policy_ref": -163.66453552246094, "kl/vocab_forward": 142.52412605285645, "kl/vocab_js": 19.224056720733643, "kl/vocab_reverse": 76.05754041671753, "kl/vocab_symmetric": 218.58167839050293, "learning_rate": 2e-06, "logps/chosen": -591.3704357147217, "logps/rejected": -1515.6297912597656, "loss": 1.9078, "loss/dpo": 0.4340968047727074, "misalign/J": 147.37513256072998, "misalign/J_aux_loss": 1.473751276731491, "misalign/J_aux_loss_raw": 147.37513256072998, "misalign/J_over_reverse_kl": 2.757804274559021, "misalign/J_per_token": 0.28767452389001846, "misalign/compressed_reward_absmax": 4011.147979736328, "misalign/compressed_reward_range": 5775.938781738281, "misalign/entropy_a": 838.7755889892578, "misalign/entropy_b": 1110.4411010742188, "misalign/forward_kl_divergence": 142.52412605285645, "misalign/forward_kl_divergence_per_token": 0.36550967395305634, "misalign/gamma_abs_times_reward_std": 50544949.5, "misalign/gamma_bracketed_rate": 0.9852296411991119, "misalign/gamma_reward_residual": 4.505999822868034e-05, "misalign/gamma_star": 83547619.75, "misalign/js_divergence": 19.224056720733643, "misalign/reverse_kl_divergence": 76.05754041671753, "misalign/reverse_kl_divergence_per_token": 0.18261760007590055, "misalign/reward_a": 57.265894651412964, "misalign/reward_b": -16.07930701971054, "misalign/reward_improvement": 73.3451886177063, "misalign/reward_improvement_over_reverse_kl": 0.8678321242332458, "misalign/reward_improvement_per_token": 0.11522631160914898, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -398.8517951965332, "misalign/reward_vocab_std": 701.5557250976562, "misalign/symmetric_kl": 218.58167839050293, "misalign/tv_distance": 77.64586639404297, "num_tokens": 4432914.0, "rewards/accuracies": 0.984375, "rewards/chosen": -6.874026566743851, "rewards/margins": 18.98485553264618, "rewards/rejected": -25.85888135433197, "step": 26, "support/residual_count": 151893.0546875, "support/residual_mass_policy": 0.034172143787145615, "support/residual_mass_reference": 0.045072893146425486, "support/residual_reward": -0.4141153208911419, "support/runtime_width": 42.94584655761719, "support/sampled_loser_rank": 0.5826500616967678, "support/sampled_reward_rank": -0.4691601078957319, "support/sampled_token_added_rate": 0.04121970618143678, "support/sampled_winner_rank": 0.5887424424290657, "support/selected_width": 42.94584655761719, "support/stored_width": 42.94584655761719 }, { "epoch": 0.421875, "grad_norm": 136.3807830810547, "kl/sequence_policy_ref": -203.77911186218262, "kl/vocab_forward": 183.4600429534912, "kl/vocab_js": 24.132691860198975, "kl/vocab_reverse": 93.94395637512207, "kl/vocab_symmetric": 277.40405654907227, "learning_rate": 2e-06, "logps/chosen": -713.9009399414062, "logps/rejected": -1687.3765106201172, "loss": 2.0315, "loss/dpo": 0.32593174448902573, "misalign/J": 170.55465126037598, "misalign/J_aux_loss": 1.7055464833974838, "misalign/J_aux_loss_raw": 170.55465126037598, "misalign/J_over_reverse_kl": 2.438810557126999, "misalign/J_per_token": 0.20938482321798801, "misalign/compressed_reward_absmax": 4179.306243896484, "misalign/compressed_reward_range": 5945.363037109375, "misalign/entropy_a": 905.4970626831055, "misalign/entropy_b": 1233.0114974975586, "misalign/forward_kl_divergence": 183.4600429534912, "misalign/forward_kl_divergence_per_token": 0.35007214546203613, "misalign/gamma_abs_times_reward_std": 59701588.25, "misalign/gamma_bracketed_rate": 0.9896951243281364, "misalign/gamma_reward_residual": 4.522019105479558e-05, "misalign/gamma_star": 83829036.25, "misalign/js_divergence": 24.132691860198975, "misalign/reverse_kl_divergence": 93.94395637512207, "misalign/reverse_kl_divergence_per_token": 0.1601700335741043, "misalign/reward_a": 70.40993356704712, "misalign/reward_b": -15.953831195831299, "misalign/reward_improvement": 86.36374855041504, "misalign/reward_improvement_over_reverse_kl": 0.9155527576804161, "misalign/reward_improvement_per_token": 0.10616821236908436, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -455.97426986694336, "misalign/reward_vocab_std": 724.7982482910156, "misalign/symmetric_kl": 277.40405654907227, "misalign/tv_distance": 94.7668981552124, "num_tokens": 4606090.0, "rewards/accuracies": 0.953125, "rewards/chosen": -9.796147882938385, "rewards/margins": 21.163527250289917, "rewards/rejected": -30.959676027297974, "step": 27, "support/residual_count": 151893.162109375, "support/residual_mass_policy": 0.03356131399050355, "support/residual_mass_reference": 0.041941048577427864, "support/residual_reward": -0.38252438232302666, "support/runtime_width": 42.837002754211426, "support/sampled_loser_rank": 0.6207218393683434, "support/sampled_reward_rank": -0.4833753891289234, "support/sampled_token_added_rate": 0.031088492134585977, "support/sampled_winner_rank": 0.6683962419629097, "support/selected_width": 42.837002754211426, "support/stored_width": 42.837002754211426 }, { "epoch": 0.4375, "grad_norm": 196.56495666503906, "kl/sequence_policy_ref": -229.19044494628906, "kl/vocab_forward": 210.6693572998047, "kl/vocab_js": 27.503621578216553, "kl/vocab_reverse": 107.11871337890625, "kl/vocab_symmetric": 317.7881450653076, "learning_rate": 2e-06, "logps/chosen": -1008.6218376159668, "logps/rejected": -1565.5734558105469, "loss": 2.9609, "loss/dpo": 1.097994428826496, "misalign/J": 186.2927417755127, "misalign/J_aux_loss": 1.8629273921251297, "misalign/J_aux_loss_raw": 186.2927417755127, "misalign/J_over_reverse_kl": 1.8979013413190842, "misalign/J_per_token": 0.27667875960469246, "misalign/compressed_reward_absmax": 4361.166076660156, "misalign/compressed_reward_range": 6270.502258300781, "misalign/entropy_a": 908.857048034668, "misalign/entropy_b": 1304.3608856201172, "misalign/forward_kl_divergence": 210.6693572998047, "misalign/forward_kl_divergence_per_token": 0.3658239506185055, "misalign/gamma_abs_times_reward_std": 59128653.0, "misalign/gamma_bracketed_rate": 0.981993056833744, "misalign/gamma_reward_residual": 0.0002913941991664615, "misalign/gamma_star": 100789119.5, "misalign/js_divergence": 27.503621578216553, "misalign/reverse_kl_divergence": 107.11871337890625, "misalign/reverse_kl_divergence_per_token": 0.1738772690296173, "misalign/reward_a": 89.05416059494019, "misalign/reward_b": -10.882870197296143, "misalign/reward_improvement": 99.93703746795654, "misalign/reward_improvement_over_reverse_kl": 0.8969720676541328, "misalign/reward_improvement_per_token": 0.14762359578162432, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -650.0788879394531, "misalign/reward_vocab_std": 765.3363800048828, "misalign/symmetric_kl": 317.7881450653076, "misalign/tv_distance": 106.59637641906738, "num_tokens": 4786127.0, "rewards/accuracies": 0.859375, "rewards/chosen": -13.720831990242004, "rewards/margins": 18.396424293518066, "rewards/rejected": -32.1172571182251, "step": 28, "support/residual_count": 151893.193359375, "support/residual_mass_policy": 0.03176172194071114, "support/residual_mass_reference": 0.041156242368742824, "support/residual_reward": -0.6319293715059757, "support/runtime_width": 42.804439544677734, "support/sampled_loser_rank": 0.6620542109012604, "support/sampled_reward_rank": -0.30558538623154163, "support/sampled_token_added_rate": 0.03132295864634216, "support/sampled_winner_rank": 0.7198682501912117, "support/selected_width": 42.804439544677734, "support/stored_width": 42.804439544677734 }, { "epoch": 0.453125, "grad_norm": 121.38319396972656, "kl/sequence_policy_ref": -252.93916511535645, "kl/vocab_forward": 226.2414608001709, "kl/vocab_js": 28.960803031921387, "kl/vocab_reverse": 114.66798114776611, "kl/vocab_symmetric": 340.90957260131836, "learning_rate": 2e-06, "logps/chosen": -779.5240745544434, "logps/rejected": -1872.5811767578125, "loss": 2.4603, "loss/dpo": 0.9713822825047167, "misalign/J": 148.88968753814697, "misalign/J_aux_loss": 1.4888968467712402, "misalign/J_aux_loss_raw": 148.88968753814697, "misalign/J_over_reverse_kl": 2.155982196331024, "misalign/J_per_token": 0.26285428553819656, "misalign/compressed_reward_absmax": 4548.9989013671875, "misalign/compressed_reward_range": 6488.315979003906, "misalign/entropy_a": 937.3698120117188, "misalign/entropy_b": 1328.4651641845703, "misalign/forward_kl_divergence": 226.2414608001709, "misalign/forward_kl_divergence_per_token": 0.329929880797863, "misalign/gamma_abs_times_reward_std": 42800947.75, "misalign/gamma_bracketed_rate": 0.9842586368322372, "misalign/gamma_reward_residual": 7.247616258609924e-05, "misalign/gamma_star": 66069326.0, "misalign/js_divergence": 28.960803031921387, "misalign/reverse_kl_divergence": 114.66798114776611, "misalign/reverse_kl_divergence_per_token": 0.15985783841460943, "misalign/reward_a": 79.22361898422241, "misalign/reward_b": -19.311943411827087, "misalign/reward_improvement": 98.53554153442383, "misalign/reward_improvement_over_reverse_kl": 0.820253424346447, "misalign/reward_improvement_per_token": 0.09689361555501819, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -195.40936851501465, "misalign/reward_vocab_std": 769.0406875610352, "misalign/symmetric_kl": 340.90957260131836, "misalign/tv_distance": 107.74736499786377, "num_tokens": 4960206.0, "rewards/accuracies": 0.9375, "rewards/chosen": -11.959493935108185, "rewards/margins": 26.66884672641754, "rewards/rejected": -38.62834072113037, "step": 29, "support/residual_count": 151892.96875, "support/residual_mass_policy": 0.033485232619568706, "support/residual_mass_reference": 0.0447953250259161, "support/residual_reward": -0.18092468939721584, "support/runtime_width": 43.02750873565674, "support/sampled_loser_rank": 0.6013398505747318, "support/sampled_reward_rank": -0.48561038076877594, "support/sampled_token_added_rate": 0.042466682847589254, "support/sampled_winner_rank": 0.5962688289582729, "support/selected_width": 43.02750873565674, "support/stored_width": 43.02750873565674 }, { "epoch": 0.46875, "grad_norm": 191.35165405273438, "kl/sequence_policy_ref": -265.8605842590332, "kl/vocab_forward": 241.83115577697754, "kl/vocab_js": 29.952810764312744, "kl/vocab_reverse": 117.4506607055664, "kl/vocab_symmetric": 359.28198051452637, "learning_rate": 2e-06, "logps/chosen": -902.8605346679688, "logps/rejected": -1808.0689544677734, "loss": 2.3052, "loss/dpo": 0.2984987065605812, "misalign/J": 200.67014503479004, "misalign/J_aux_loss": 2.0067013800144196, "misalign/J_aux_loss_raw": 200.67014503479004, "misalign/J_over_reverse_kl": 1.9287290573120117, "misalign/J_per_token": 0.2536418605595827, "misalign/compressed_reward_absmax": 4862.0013427734375, "misalign/compressed_reward_range": 6898.528076171875, "misalign/entropy_a": 945.1607437133789, "misalign/entropy_b": 1346.2866134643555, "misalign/forward_kl_divergence": 241.83115577697754, "misalign/forward_kl_divergence_per_token": 0.34030735678970814, "misalign/gamma_abs_times_reward_std": 66157058.0, "misalign/gamma_bracketed_rate": 0.9856267645955086, "misalign/gamma_reward_residual": 0.000264001724190166, "misalign/gamma_star": 109703980.5, "misalign/js_divergence": 29.952810764312744, "misalign/reverse_kl_divergence": 117.4506607055664, "misalign/reverse_kl_divergence_per_token": 0.1695484183728695, "misalign/reward_a": 87.51940584182739, "misalign/reward_b": -12.946220338344574, "misalign/reward_improvement": 100.46563053131104, "misalign/reward_improvement_over_reverse_kl": 0.804968811571598, "misalign/reward_improvement_per_token": 0.11112680193036795, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -619.1823959350586, "misalign/reward_vocab_std": 829.3263244628906, "misalign/symmetric_kl": 359.28198051452637, "misalign/tv_distance": 112.75522899627686, "num_tokens": 5145509.0, "rewards/accuracies": 0.953125, "rewards/chosen": -15.322714567184448, "rewards/margins": 22.526688814163208, "rewards/rejected": -37.84940314292908, "step": 30, "support/residual_count": 151893.208984375, "support/residual_mass_policy": 0.034048222471028566, "support/residual_mass_reference": 0.04346996312960982, "support/residual_reward": -0.5197547674179077, "support/runtime_width": 42.78953218460083, "support/sampled_loser_rank": 0.5563570559024811, "support/sampled_reward_rank": -0.39020144287496805, "support/sampled_token_added_rate": 0.03398139285854995, "support/sampled_winner_rank": 0.5739484503865242, "support/selected_width": 42.78953218460083, "support/stored_width": 42.78953218460083 }, { "epoch": 0.484375, "grad_norm": 157.51846313476562, "kl/sequence_policy_ref": -242.7173252105713, "kl/vocab_forward": 220.59081268310547, "kl/vocab_js": 26.569517850875854, "kl/vocab_reverse": 104.70345973968506, "kl/vocab_symmetric": 325.2943916320801, "learning_rate": 2e-06, "logps/chosen": -677.5861015319824, "logps/rejected": -1586.1520690917969, "loss": 1.9096, "loss/dpo": 0.26202132055277616, "misalign/J": 164.75636100769043, "misalign/J_aux_loss": 1.6475635841488838, "misalign/J_aux_loss_raw": 164.75636100769043, "misalign/J_over_reverse_kl": 1.940863698720932, "misalign/J_per_token": 0.33494884334504604, "misalign/compressed_reward_absmax": 3763.813751220703, "misalign/compressed_reward_range": 5305.598388671875, "misalign/entropy_a": 752.1155014038086, "misalign/entropy_b": 1103.1325607299805, "misalign/forward_kl_divergence": 220.59081268310547, "misalign/forward_kl_divergence_per_token": 0.5400971993803978, "misalign/gamma_abs_times_reward_std": 49524192.5, "misalign/gamma_bracketed_rate": 0.9842683598399162, "misalign/gamma_reward_residual": 0.00013795335132726905, "misalign/gamma_star": 76385996.75, "misalign/js_divergence": 26.569517850875854, "misalign/reverse_kl_divergence": 104.70345973968506, "misalign/reverse_kl_divergence_per_token": 0.2469406109303236, "misalign/reward_a": 72.4141092300415, "misalign/reward_b": -14.692016035318375, "misalign/reward_improvement": 87.10610628128052, "misalign/reward_improvement_over_reverse_kl": 0.7758874297142029, "misalign/reward_improvement_per_token": 0.11436527967453003, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -336.6119108200073, "misalign/reward_vocab_std": 635.3649368286133, "misalign/symmetric_kl": 325.2943916320801, "misalign/tv_distance": 95.78610897064209, "num_tokens": 5302053.0, "rewards/accuracies": 0.9375, "rewards/chosen": -11.388235569000244, "rewards/margins": 25.766995549201965, "rewards/rejected": -37.155229806900024, "step": 31, "support/residual_count": 151893.30859375, "support/residual_mass_policy": 0.026038944022729993, "support/residual_mass_reference": 0.04330639448016882, "support/residual_reward": -0.4683221112936735, "support/runtime_width": 42.69255495071411, "support/sampled_loser_rank": 0.6677984669804573, "support/sampled_reward_rank": -0.38564055040478706, "support/sampled_token_added_rate": 0.03589020320214331, "support/sampled_winner_rank": 0.6767471358180046, "support/selected_width": 42.69255495071411, "support/stored_width": 42.69255495071411 }, { "epoch": 0.5, "grad_norm": 115.89986419677734, "kl/sequence_policy_ref": -291.1677303314209, "kl/vocab_forward": 264.76793098449707, "kl/vocab_js": 31.6591854095459, "kl/vocab_reverse": 126.53115463256836, "kl/vocab_symmetric": 391.29920196533203, "learning_rate": 2e-06, "logps/chosen": -758.1873931884766, "logps/rejected": -1748.2294845581055, "loss": 1.9955, "loss/dpo": 0.4298266823877448, "misalign/J": 156.56506061553955, "misalign/J_aux_loss": 1.5656505972146988, "misalign/J_aux_loss_raw": 156.56506061553955, "misalign/J_over_reverse_kl": 1.3217194080352783, "misalign/J_per_token": 0.21796293556690216, "misalign/compressed_reward_absmax": 4141.018249511719, "misalign/compressed_reward_range": 5932.4368896484375, "misalign/entropy_a": 796.6573028564453, "misalign/entropy_b": 1197.2570190429688, "misalign/forward_kl_divergence": 264.76793098449707, "misalign/forward_kl_divergence_per_token": 0.42143452540040016, "misalign/gamma_abs_times_reward_std": 42270098.25, "misalign/gamma_bracketed_rate": 0.9892738536000252, "misalign/gamma_reward_residual": 0.0006614696701490175, "misalign/gamma_star": 77172042.5, "misalign/js_divergence": 31.6591854095459, "misalign/reverse_kl_divergence": 126.53115463256836, "misalign/reverse_kl_divergence_per_token": 0.20403443090617657, "misalign/reward_a": 84.93575382232666, "misalign/reward_b": -16.301965177059174, "misalign/reward_improvement": 101.23770046234131, "misalign/reward_improvement_over_reverse_kl": 0.6962632201611996, "misalign/reward_improvement_per_token": 0.06460105488076806, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -461.3788146972656, "misalign/reward_vocab_std": 712.9334030151367, "misalign/symmetric_kl": 391.29920196533203, "misalign/tv_distance": 110.64313316345215, "num_tokens": 5464702.0, "rewards/accuracies": 0.890625, "rewards/chosen": -13.653896808624268, "rewards/margins": 30.925754070281982, "rewards/rejected": -44.57965087890625, "step": 32, "support/residual_count": 151893.287109375, "support/residual_mass_policy": 0.026596042443998158, "support/residual_mass_reference": 0.044471810571849346, "support/residual_reward": -0.42068428732454777, "support/runtime_width": 42.70789432525635, "support/sampled_loser_rank": 0.6153440810739994, "support/sampled_reward_rank": -0.4130892716348171, "support/sampled_token_added_rate": 0.034831034019589424, "support/sampled_winner_rank": 0.6262499615550041, "support/selected_width": 42.70789432525635, "support/stored_width": 42.70789432525635 }, { "epoch": 0.515625, "grad_norm": 126.86652374267578, "kl/sequence_policy_ref": -330.00819396972656, "kl/vocab_forward": 300.4244632720947, "kl/vocab_js": 34.439491748809814, "kl/vocab_reverse": 136.73623180389404, "kl/vocab_symmetric": 437.1608543395996, "learning_rate": 2e-06, "logps/chosen": -748.5886764526367, "logps/rejected": -1879.9018249511719, "loss": 1.9269, "loss/dpo": 0.15133077676370377, "misalign/J": 177.56014442443848, "misalign/J_aux_loss": 1.77560143917799, "misalign/J_aux_loss_raw": 177.56014442443848, "misalign/J_over_reverse_kl": 1.510396808385849, "misalign/J_per_token": 0.25404511764645576, "misalign/compressed_reward_absmax": 4184.280670166016, "misalign/compressed_reward_range": 5938.451599121094, "misalign/entropy_a": 794.0090255737305, "misalign/entropy_b": 1205.6077117919922, "misalign/forward_kl_divergence": 300.4244632720947, "misalign/forward_kl_divergence_per_token": 0.5809952989220619, "misalign/gamma_abs_times_reward_std": 47938589.5, "misalign/gamma_bracketed_rate": 0.9874916970729828, "misalign/gamma_reward_residual": 0.0018580270816528355, "misalign/gamma_star": 84848901.75, "misalign/js_divergence": 34.439491748809814, "misalign/reverse_kl_divergence": 136.73623180389404, "misalign/reverse_kl_divergence_per_token": 0.22614295408129692, "misalign/reward_a": 93.50389242172241, "misalign/reward_b": -11.571515798568726, "misalign/reward_improvement": 105.07538223266602, "misalign/reward_improvement_over_reverse_kl": 0.7212403789162636, "misalign/reward_improvement_per_token": 0.08620550157502294, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -539.4033613204956, "misalign/reward_vocab_std": 717.4302520751953, "misalign/symmetric_kl": 437.1608543395996, "misalign/tv_distance": 118.05486106872559, "num_tokens": 5633244.0, "rewards/accuracies": 0.96875, "rewards/chosen": -14.54743093252182, "rewards/margins": 36.90677738189697, "rewards/rejected": -51.454208850860596, "step": 33, "support/residual_count": 151893.3203125, "support/residual_mass_policy": 0.029681737767532468, "support/residual_mass_reference": 0.04252100153826177, "support/residual_reward": -0.4979167296551168, "support/runtime_width": 42.67843770980835, "support/sampled_loser_rank": 0.6283881887793541, "support/sampled_reward_rank": -0.38332303427159786, "support/sampled_token_added_rate": 0.03253701771609485, "support/sampled_winner_rank": 0.6316058188676834, "support/selected_width": 42.67843770980835, "support/stored_width": 42.67843770980835 }, { "epoch": 0.53125, "grad_norm": 189.66943359375, "kl/sequence_policy_ref": -295.09754180908203, "kl/vocab_forward": 265.09803009033203, "kl/vocab_js": 31.023924469947815, "kl/vocab_reverse": 124.39066219329834, "kl/vocab_symmetric": 389.488920211792, "learning_rate": 2e-06, "logps/chosen": -869.112964630127, "logps/rejected": -1561.075454711914, "loss": 2.3314, "loss/dpo": 0.4870968231589359, "misalign/J": 184.4345703125, "misalign/J_aux_loss": 1.8443456441164017, "misalign/J_aux_loss_raw": 184.4345703125, "misalign/J_over_reverse_kl": 2.160892277956009, "misalign/J_per_token": 0.36055343225598335, "misalign/compressed_reward_absmax": 4057.2051391601562, "misalign/compressed_reward_range": 5788.110046386719, "misalign/entropy_a": 733.9059944152832, "misalign/entropy_b": 1137.9230575561523, "misalign/forward_kl_divergence": 265.09803009033203, "misalign/forward_kl_divergence_per_token": 0.5678062625229359, "misalign/gamma_abs_times_reward_std": 71265453.5, "misalign/gamma_bracketed_rate": 0.9808945804834366, "misalign/gamma_reward_residual": 4.436415292730089e-05, "misalign/gamma_star": 82635977.5, "misalign/js_divergence": 31.023924469947815, "misalign/reverse_kl_divergence": 124.39066219329834, "misalign/reverse_kl_divergence_per_token": 0.22786439768970013, "misalign/reward_a": 93.78890228271484, "misalign/reward_b": -7.1546797305345535, "misalign/reward_improvement": 100.94355964660645, "misalign/reward_improvement_over_reverse_kl": 0.6350699551403522, "misalign/reward_improvement_per_token": 0.05376583803445101, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -452.1529130935669, "misalign/reward_vocab_std": 700.5487747192383, "misalign/symmetric_kl": 389.488920211792, "misalign/tv_distance": 107.56306266784668, "num_tokens": 5794166.0, "rewards/accuracies": 0.90625, "rewards/chosen": -14.67345118522644, "rewards/margins": 29.672606945037842, "rewards/rejected": -44.34605646133423, "step": 34, "support/residual_count": 151893.341796875, "support/residual_mass_policy": 0.023359368089586496, "support/residual_mass_reference": 0.03612355049699545, "support/residual_reward": -0.4855753555893898, "support/runtime_width": 42.65322256088257, "support/sampled_loser_rank": 0.5644064396619797, "support/sampled_reward_rank": -0.3794688871130347, "support/sampled_token_added_rate": 0.030243139481171966, "support/sampled_winner_rank": 0.5993468686938286, "support/selected_width": 42.65322256088257, "support/stored_width": 42.65322256088257 }, { "epoch": 0.546875, "grad_norm": 255.15330505371094, "kl/sequence_policy_ref": -334.76751708984375, "kl/vocab_forward": 300.60124015808105, "kl/vocab_js": 34.761489152908325, "kl/vocab_reverse": 139.5803165435791, "kl/vocab_symmetric": 440.1817283630371, "learning_rate": 2e-06, "logps/chosen": -799.9229431152344, "logps/rejected": -1850.2046356201172, "loss": 2.7865, "loss/dpo": 0.628988600539742, "misalign/J": 215.75445175170898, "misalign/J_aux_loss": 2.1575444042682648, "misalign/J_aux_loss_raw": 215.75445175170898, "misalign/J_over_reverse_kl": 1.6279902905225754, "misalign/J_per_token": 0.23844042047858238, "misalign/compressed_reward_absmax": 4261.061584472656, "misalign/compressed_reward_range": 6006.7581787109375, "misalign/entropy_a": 801.278564453125, "misalign/entropy_b": 1216.669448852539, "misalign/forward_kl_divergence": 300.60124015808105, "misalign/forward_kl_divergence_per_token": 0.4489123970270157, "misalign/gamma_abs_times_reward_std": 66620462.0, "misalign/gamma_bracketed_rate": 0.9877287149429321, "misalign/gamma_reward_residual": 0.00013351680354389828, "misalign/gamma_star": 108750856.5, "misalign/js_divergence": 34.761489152908325, "misalign/reverse_kl_divergence": 139.5803165435791, "misalign/reverse_kl_divergence_per_token": 0.20873420871794224, "misalign/reward_a": 96.30846405029297, "misalign/reward_b": -10.182962775230408, "misalign/reward_improvement": 106.4914083480835, "misalign/reward_improvement_over_reverse_kl": 0.7169731482863426, "misalign/reward_improvement_per_token": 0.1033505480736494, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -652.5512466430664, "misalign/reward_vocab_std": 728.8263397216797, "misalign/symmetric_kl": 440.1817283630371, "misalign/tv_distance": 117.93358135223389, "num_tokens": 5967652.0, "rewards/accuracies": 0.9375, "rewards/chosen": -16.057342648506165, "rewards/margins": 34.83882117271423, "rewards/rejected": -50.89616346359253, "step": 35, "support/residual_count": 151893.296875, "support/residual_mass_policy": 0.025821004761382937, "support/residual_mass_reference": 0.039951348677277565, "support/residual_reward": -0.5654018372297287, "support/runtime_width": 42.70614957809448, "support/sampled_loser_rank": 0.6046793200075626, "support/sampled_reward_rank": -0.2875976013019681, "support/sampled_token_added_rate": 0.03660787723492831, "support/sampled_winner_rank": 0.5936227701604366, "support/selected_width": 42.70614957809448, "support/stored_width": 42.70614957809448 }, { "epoch": 0.5625, "grad_norm": 372.3753967285156, "kl/sequence_policy_ref": -301.4326972961426, "kl/vocab_forward": 272.3033618927002, "kl/vocab_js": 31.47647452354431, "kl/vocab_reverse": 129.38499641418457, "kl/vocab_symmetric": 401.688533782959, "learning_rate": 2e-06, "logps/chosen": -617.8253440856934, "logps/rejected": -1656.3383178710938, "loss": 2.0888, "loss/dpo": 0.20348351792887343, "misalign/J": 188.5364990234375, "misalign/J_aux_loss": 1.8853649497032166, "misalign/J_aux_loss_raw": 188.5364990234375, "misalign/J_over_reverse_kl": 1.918866515159607, "misalign/J_per_token": 0.2657326404005289, "misalign/compressed_reward_absmax": 3813.5306091308594, "misalign/compressed_reward_range": 5372.776062011719, "misalign/entropy_a": 678.8123512268066, "misalign/entropy_b": 1029.1281280517578, "misalign/forward_kl_divergence": 272.3033618927002, "misalign/forward_kl_divergence_per_token": 0.5031169354915619, "misalign/gamma_abs_times_reward_std": 54824950.5, "misalign/gamma_bracketed_rate": 0.984458789229393, "misalign/gamma_reward_residual": 9.551036919219769e-05, "misalign/gamma_star": 76879923.5, "misalign/js_divergence": 31.47647452354431, "misalign/reverse_kl_divergence": 129.38499641418457, "misalign/reverse_kl_divergence_per_token": 0.20918168872594833, "misalign/reward_a": 85.22445583343506, "misalign/reward_b": -13.628453433513641, "misalign/reward_improvement": 98.85287952423096, "misalign/reward_improvement_over_reverse_kl": 0.6402187570929527, "misalign/reward_improvement_per_token": 0.06344311079010367, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -478.2143135070801, "misalign/reward_vocab_std": 661.5251770019531, "misalign/symmetric_kl": 401.688533782959, "misalign/tv_distance": 104.53119087219238, "num_tokens": 6125949.0, "rewards/accuracies": 0.953125, "rewards/chosen": -12.255744695663452, "rewards/margins": 35.775049448013306, "rewards/rejected": -48.03079414367676, "step": 36, "support/residual_count": 151893.57421875, "support/residual_mass_policy": 0.0228013499872759, "support/residual_mass_reference": 0.03604377689771354, "support/residual_reward": -0.5941759124398232, "support/runtime_width": 42.42575168609619, "support/sampled_loser_rank": 0.579292468726635, "support/sampled_reward_rank": -0.31505878642201424, "support/sampled_token_added_rate": 0.027777738636359572, "support/sampled_winner_rank": 0.6083027757704258, "support/selected_width": 42.42575168609619, "support/stored_width": 42.42575168609619 }, { "epoch": 0.5625, "eval_kl/sequence_policy_ref": -370.0239589214325, "eval_kl/vocab_forward": 326.3884401321411, "eval_kl/vocab_js": 39.170413970947266, "eval_kl/vocab_reverse": 163.1976842880249, "eval_kl/vocab_symmetric": 489.5863370895386, "eval_logps/chosen": -878.7963104248047, "eval_logps/rejected": -1961.517490386963, "eval_loss": 1.9275543689727783, "eval_loss/dpo": 0.07911084008669506, "eval_misalign/J": 184.84435880184174, "eval_misalign/J_aux_loss": 1.8484435249119997, "eval_misalign/J_aux_loss_raw": 184.84435880184174, "eval_misalign/J_over_reverse_kl": 1.7581309108063579, "eval_misalign/J_per_token": 0.26096369861625135, "eval_misalign/compressed_reward_absmax": 4236.706245422363, "eval_misalign/compressed_reward_range": 6027.056587219238, "eval_misalign/entropy_a": 859.5940890312195, "eval_misalign/entropy_b": 1298.0544710159302, "eval_misalign/forward_kl_divergence": 326.3884401321411, "eval_misalign/forward_kl_divergence_per_token": 0.5667336815968156, "eval_misalign/gamma_abs_times_reward_std": 44952110.546875, "eval_misalign/gamma_bracketed_rate": 0.9881090503185987, "eval_misalign/gamma_reward_residual": 0.0003100246618572555, "eval_misalign/gamma_star": 68325932.625, "eval_misalign/js_divergence": 39.170413970947266, "eval_misalign/reverse_kl_divergence": 163.1976842880249, "eval_misalign/reverse_kl_divergence_per_token": 0.2310976292937994, "eval_misalign/reward_a": 111.55592322349548, "eval_misalign/reward_b": -14.307281229645014, "eval_misalign/reward_improvement": 125.86316466331482, "eval_misalign/reward_improvement_over_reverse_kl": 0.6893182648345828, "eval_misalign/reward_improvement_per_token": 0.08624049881473184, "eval_misalign/reward_signal_low_rate": 0.0, "eval_misalign/reward_vocab_mean": -490.1155492067337, "eval_misalign/reward_vocab_std": 731.3202810287476, "eval_misalign/symmetric_kl": 489.5863370895386, "eval_misalign/tv_distance": 129.546555519104, "eval_rewards/accuracies": 0.98046875, "eval_rewards/chosen": -14.962035872042179, "eval_rewards/margins": 44.080720245838165, "eval_rewards/rejected": -59.04275727272034, "eval_runtime": 100.6796, "eval_samples_per_second": 5.085, "eval_steps_per_second": 0.636, "eval_support/residual_count": 151893.29125976562, "eval_support/residual_mass_policy": 0.02652598696295172, "eval_support/residual_mass_reference": 0.04279232310364023, "eval_support/residual_reward": -0.4695481152739376, "eval_support/runtime_width": 42.70963191986084, "eval_support/sampled_loser_rank": 0.6486562248319387, "eval_support/sampled_reward_rank": -0.37071577250026166, "eval_support/sampled_token_added_rate": 0.037317203474231064, "eval_support/sampled_winner_rank": 0.6506854901090264, "eval_support/selected_width": 42.70963191986084, "eval_support/stored_width": 42.70963191986084, "step": 36 }, { "epoch": 0.578125, "grad_norm": 538.4682006835938, "kl/sequence_policy_ref": -340.5377769470215, "kl/vocab_forward": 290.87598991394043, "kl/vocab_js": 34.37863755226135, "kl/vocab_reverse": 144.22690105438232, "kl/vocab_symmetric": 435.10305404663086, "learning_rate": 2e-06, "logps/chosen": -651.7600479125977, "logps/rejected": -1730.038101196289, "loss": 2.6095, "loss/dpo": 0.025369518539697822, "misalign/J": 258.41460514068604, "misalign/J_aux_loss": 2.58414613455534, "misalign/J_aux_loss_raw": 258.41460514068604, "misalign/J_over_reverse_kl": 3.293791249394417, "misalign/J_per_token": 0.3237530868500471, "misalign/compressed_reward_absmax": 3946.171417236328, "misalign/compressed_reward_range": 5592.2857666015625, "misalign/entropy_a": 679.9681549072266, "misalign/entropy_b": 1041.4010391235352, "misalign/forward_kl_divergence": 290.87598991394043, "misalign/forward_kl_divergence_per_token": 0.6766270510852337, "misalign/gamma_abs_times_reward_std": 77732721.625, "misalign/gamma_bracketed_rate": 0.9842484146356583, "misalign/gamma_reward_residual": 2.0939698629263148e-05, "misalign/gamma_star": 120903140.125, "misalign/js_divergence": 34.37863755226135, "misalign/reverse_kl_divergence": 144.22690105438232, "misalign/reverse_kl_divergence_per_token": 0.24700743332505226, "misalign/reward_a": 94.92676162719727, "misalign/reward_b": -12.971765249967575, "misalign/reward_improvement": 107.89847755432129, "misalign/reward_improvement_over_reverse_kl": 0.6338806599378586, "misalign/reward_improvement_per_token": 0.060572607442736626, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -481.5314302444458, "misalign/reward_vocab_std": 672.1194839477539, "misalign/symmetric_kl": 435.10305404663086, "misalign/tv_distance": 110.60170650482178, "num_tokens": 6285208.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.246494770050049, "rewards/margins": 43.61456775665283, "rewards/rejected": -55.861063957214355, "step": 37, "support/residual_count": 151893.3203125, "support/residual_mass_policy": 0.023932685144245625, "support/residual_mass_reference": 0.041495030745863914, "support/residual_reward": -0.5419074520468712, "support/runtime_width": 42.680593967437744, "support/sampled_loser_rank": 0.5905993320047855, "support/sampled_reward_rank": -0.3638652637600899, "support/sampled_token_added_rate": 0.030867979861795902, "support/sampled_winner_rank": 0.6219254210591316, "support/selected_width": 42.680593967437744, "support/stored_width": 42.680593967437744 }, { "epoch": 0.59375, "grad_norm": 103.29216766357422, "kl/sequence_policy_ref": -322.09266471862793, "kl/vocab_forward": 284.572021484375, "kl/vocab_js": 32.71440887451172, "kl/vocab_reverse": 135.91109657287598, "kl/vocab_symmetric": 420.4832000732422, "learning_rate": 2e-06, "logps/chosen": -666.5978660583496, "logps/rejected": -1681.1878814697266, "loss": 1.7536, "loss/dpo": 0.07719759906125201, "misalign/J": 167.64227294921875, "misalign/J_aux_loss": 1.6764226853847504, "misalign/J_aux_loss_raw": 167.64227294921875, "misalign/J_over_reverse_kl": 1.7161841690540314, "misalign/J_per_token": 0.27796192467212677, "misalign/compressed_reward_absmax": 3628.3260498046875, "misalign/compressed_reward_range": 5132.099914550781, "misalign/entropy_a": 690.4412384033203, "misalign/entropy_b": 1045.768310546875, "misalign/forward_kl_divergence": 284.572021484375, "misalign/forward_kl_divergence_per_token": 0.6853830218315125, "misalign/gamma_abs_times_reward_std": 51078868.0, "misalign/gamma_bracketed_rate": 0.9857224076986313, "misalign/gamma_reward_residual": 0.00023307789706450421, "misalign/gamma_star": 63338430.75, "misalign/js_divergence": 32.71440887451172, "misalign/reverse_kl_divergence": 135.91109657287598, "misalign/reverse_kl_divergence_per_token": 0.26742945425212383, "misalign/reward_a": 82.06362342834473, "misalign/reward_b": -14.572960376739502, "misalign/reward_improvement": 96.63658332824707, "misalign/reward_improvement_over_reverse_kl": 0.6172222569584846, "misalign/reward_improvement_per_token": 0.09521574154496193, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -462.82337760925293, "misalign/reward_vocab_std": 629.3218460083008, "misalign/symmetric_kl": 420.4832000732422, "misalign/tv_distance": 107.11642932891846, "num_tokens": 6439380.0, "rewards/accuracies": 0.96875, "rewards/chosen": -13.20527732372284, "rewards/margins": 38.00797891616821, "rewards/rejected": -51.21325659751892, "step": 38, "support/residual_count": 151893.580078125, "support/residual_mass_policy": 0.02642570063471794, "support/residual_mass_reference": 0.0395309254527092, "support/residual_reward": -0.619575060904026, "support/runtime_width": 42.41612482070923, "support/sampled_loser_rank": 0.6446574702858925, "support/sampled_reward_rank": -0.3532958813011646, "support/sampled_token_added_rate": 0.029658236424438655, "support/sampled_winner_rank": 0.6769233047962189, "support/selected_width": 42.41612482070923, "support/stored_width": 42.41612482070923 }, { "epoch": 0.609375, "grad_norm": 1020.2838745117188, "kl/sequence_policy_ref": -395.0605163574219, "kl/vocab_forward": 347.51882553100586, "kl/vocab_js": 42.35039806365967, "kl/vocab_reverse": 184.1147804260254, "kl/vocab_symmetric": 531.6339111328125, "learning_rate": 2e-06, "logps/chosen": -740.7064399719238, "logps/rejected": -2187.0342712402344, "loss": 3.4136, "loss/dpo": 0.032895612518908, "misalign/J": 338.06642150878906, "misalign/J_aux_loss": 3.3806639164686203, "misalign/J_aux_loss_raw": 338.06642150878906, "misalign/J_over_reverse_kl": 2.6226917803287506, "misalign/J_per_token": 0.3244504798203707, "misalign/compressed_reward_absmax": 4427.0225830078125, "misalign/compressed_reward_range": 6178.325927734375, "misalign/entropy_a": 868.1625289916992, "misalign/entropy_b": 1331.6126098632812, "misalign/forward_kl_divergence": 347.51882553100586, "misalign/forward_kl_divergence_per_token": 0.5268885493278503, "misalign/gamma_abs_times_reward_std": 78415596.0, "misalign/gamma_bracketed_rate": 0.9849846512079239, "misalign/gamma_reward_residual": 0.00048569267073617084, "misalign/gamma_star": 86910211.0, "misalign/js_divergence": 42.35039806365967, "misalign/reverse_kl_divergence": 184.1147804260254, "misalign/reverse_kl_divergence_per_token": 0.2407812997698784, "misalign/reward_a": 94.71883296966553, "misalign/reward_b": -19.811948537826538, "misalign/reward_improvement": 114.53073215484619, "misalign/reward_improvement_over_reverse_kl": 0.6183451376855373, "misalign/reward_improvement_per_token": 0.07184543320909142, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -322.06157875061035, "misalign/reward_vocab_std": 766.7786178588867, "misalign/symmetric_kl": 531.6339111328125, "misalign/tv_distance": 135.00493335723877, "num_tokens": 6618112.0, "rewards/accuracies": 0.984375, "rewards/chosen": -14.374651193618774, "rewards/margins": 50.26280069351196, "rewards/rejected": -64.63745069503784, "step": 39, "support/residual_count": 151892.978515625, "support/residual_mass_policy": 0.030612861970439553, "support/residual_mass_reference": 0.04724447149783373, "support/residual_reward": -0.27408459782600403, "support/runtime_width": 43.020020484924316, "support/sampled_loser_rank": 0.6019720807671547, "support/sampled_reward_rank": -0.4871169701218605, "support/sampled_token_added_rate": 0.038056216202676296, "support/sampled_winner_rank": 0.6120708398520947, "support/selected_width": 43.020020484924316, "support/stored_width": 43.020020484924316 }, { "epoch": 0.625, "grad_norm": 1029.6490478515625, "kl/sequence_policy_ref": -391.72465896606445, "kl/vocab_forward": 347.66312408447266, "kl/vocab_js": 43.16923809051514, "kl/vocab_reverse": 182.91918754577637, "kl/vocab_symmetric": 530.5825958251953, "learning_rate": 2e-06, "logps/chosen": -836.8339767456055, "logps/rejected": -2054.7322845458984, "loss": 2.4484, "loss/dpo": 0.08931858758296585, "misalign/J": 235.91084098815918, "misalign/J_aux_loss": 2.3591084629297256, "misalign/J_aux_loss_raw": 235.91084098815918, "misalign/J_over_reverse_kl": 2.021958939731121, "misalign/J_per_token": 0.2609993116930127, "misalign/compressed_reward_absmax": 4498.334259033203, "misalign/compressed_reward_range": 6353.229675292969, "misalign/entropy_a": 859.5539245605469, "misalign/entropy_b": 1316.3250732421875, "misalign/forward_kl_divergence": 347.66312408447266, "misalign/forward_kl_divergence_per_token": 0.4745451509952545, "misalign/gamma_abs_times_reward_std": 67711939.75, "misalign/gamma_bracketed_rate": 0.9849048256874084, "misalign/gamma_reward_residual": 6.0246247358008986e-05, "misalign/gamma_star": 116100731.0, "misalign/js_divergence": 43.16923809051514, "misalign/reverse_kl_divergence": 182.91918754577637, "misalign/reverse_kl_divergence_per_token": 0.22102734073996544, "misalign/reward_a": 119.16459369659424, "misalign/reward_b": -17.058857798576355, "misalign/reward_improvement": 136.22340965270996, "misalign/reward_improvement_over_reverse_kl": 0.6773070022463799, "misalign/reward_improvement_per_token": 0.09231107356026769, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -541.3807773590088, "misalign/reward_vocab_std": 752.7018508911133, "misalign/symmetric_kl": 530.5825958251953, "misalign/tv_distance": 137.86173248291016, "num_tokens": 6793032.0, "rewards/accuracies": 0.96875, "rewards/chosen": -15.797016501426697, "rewards/margins": 46.75089979171753, "rewards/rejected": -62.547916412353516, "step": 40, "support/residual_count": 151893.34375, "support/residual_mass_policy": 0.02554402849636972, "support/residual_mass_reference": 0.04131174925714731, "support/residual_reward": -0.43343046586960554, "support/runtime_width": 42.65411186218262, "support/sampled_loser_rank": 0.635368824005127, "support/sampled_reward_rank": -0.3783828802406788, "support/sampled_token_added_rate": 0.03763708798214793, "support/sampled_winner_rank": 0.632044330239296, "support/selected_width": 42.65411186218262, "support/stored_width": 42.65411186218262 }, { "epoch": 0.640625, "grad_norm": 123.83010864257812, "kl/sequence_policy_ref": -379.3154487609863, "kl/vocab_forward": 331.34838104248047, "kl/vocab_js": 39.656344413757324, "kl/vocab_reverse": 165.32555389404297, "kl/vocab_symmetric": 496.6741180419922, "learning_rate": 2e-06, "logps/chosen": -821.5552520751953, "logps/rejected": -2054.2618560791016, "loss": 2.1358, "loss/dpo": 0.3257599932614975, "misalign/J": 181.00886344909668, "misalign/J_aux_loss": 1.8100886344909668, "misalign/J_aux_loss_raw": 181.00886344909668, "misalign/J_over_reverse_kl": 1.6241952329874039, "misalign/J_per_token": 0.2575971782207489, "misalign/compressed_reward_absmax": 4115.048797607422, "misalign/compressed_reward_range": 5863.2059326171875, "misalign/entropy_a": 867.4101486206055, "misalign/entropy_b": 1317.5307006835938, "misalign/forward_kl_divergence": 331.34838104248047, "misalign/forward_kl_divergence_per_token": 0.5753209926187992, "misalign/gamma_abs_times_reward_std": 46303981.5, "misalign/gamma_bracketed_rate": 0.9869667664170265, "misalign/gamma_reward_residual": 8.94081304068095e-05, "misalign/gamma_star": 55120054.25, "misalign/js_divergence": 39.656344413757324, "misalign/reverse_kl_divergence": 165.32555389404297, "misalign/reverse_kl_divergence_per_token": 0.2224746011197567, "misalign/reward_a": 104.61750793457031, "misalign/reward_b": -15.681229546666145, "misalign/reward_improvement": 120.29868698120117, "misalign/reward_improvement_over_reverse_kl": 0.6759593263268471, "misalign/reward_improvement_per_token": 0.08714451128616929, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -421.68537425994873, "misalign/reward_vocab_std": 710.1155014038086, "misalign/symmetric_kl": 496.6741180419922, "misalign/tv_distance": 129.80499076843262, "num_tokens": 6964060.0, "rewards/accuracies": 0.96875, "rewards/chosen": -13.713181614875793, "rewards/margins": 48.436728954315186, "rewards/rejected": -62.14991092681885, "step": 41, "support/residual_count": 151893.20703125, "support/residual_mass_policy": 0.031454769195988774, "support/residual_mass_reference": 0.04673444852232933, "support/residual_reward": -0.3744997123721987, "support/runtime_width": 42.79198360443115, "support/sampled_loser_rank": 0.6497581750154495, "support/sampled_reward_rank": -0.433025848120451, "support/sampled_token_added_rate": 0.039118685061112046, "support/sampled_winner_rank": 0.6626867726445198, "support/selected_width": 42.79198360443115, "support/stored_width": 42.79198360443115 }, { "epoch": 0.65625, "grad_norm": 150.85992431640625, "kl/sequence_policy_ref": -384.6586265563965, "kl/vocab_forward": 345.101411819458, "kl/vocab_js": 41.772791624069214, "kl/vocab_reverse": 176.38330841064453, "kl/vocab_symmetric": 521.48486328125, "learning_rate": 2e-06, "logps/chosen": -863.8736877441406, "logps/rejected": -1970.995620727539, "loss": 1.7582, "loss/dpo": 0.017987981137206566, "misalign/J": 174.01841640472412, "misalign/J_aux_loss": 1.7401841282844543, "misalign/J_aux_loss_raw": 174.01841640472412, "misalign/J_over_reverse_kl": 1.5599696189165115, "misalign/J_per_token": 0.22202685475349426, "misalign/compressed_reward_absmax": 4183.230377197266, "misalign/compressed_reward_range": 5966.482116699219, "misalign/entropy_a": 820.5932464599609, "misalign/entropy_b": 1289.2757186889648, "misalign/forward_kl_divergence": 345.101411819458, "misalign/forward_kl_divergence_per_token": 0.5078705288469791, "misalign/gamma_abs_times_reward_std": 39975210.5, "misalign/gamma_bracketed_rate": 0.9893263578414917, "misalign/gamma_reward_residual": 7.978630480920401e-05, "misalign/gamma_star": 61503279.625, "misalign/js_divergence": 41.772791624069214, "misalign/reverse_kl_divergence": 176.38330841064453, "misalign/reverse_kl_divergence_per_token": 0.2160295583307743, "misalign/reward_a": 117.82565593719482, "misalign/reward_b": -13.596548825502396, "misalign/reward_improvement": 131.42217826843262, "misalign/reward_improvement_over_reverse_kl": 0.661291316151619, "misalign/reward_improvement_per_token": 0.08064441289752722, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -524.60085105896, "misalign/reward_vocab_std": 725.3326187133789, "misalign/symmetric_kl": 521.48486328125, "misalign/tv_distance": 134.1311206817627, "num_tokens": 7130526.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.464305222034454, "rewards/margins": 48.003116607666016, "rewards/rejected": -62.467421531677246, "step": 42, "support/residual_count": 151893.248046875, "support/residual_mass_policy": 0.023710966343060136, "support/residual_mass_reference": 0.03921722201630473, "support/residual_reward": -0.5094128809869289, "support/runtime_width": 42.75533056259155, "support/sampled_loser_rank": 0.6201684325933456, "support/sampled_reward_rank": -0.38710433803498745, "support/sampled_token_added_rate": 0.034145432990044355, "support/sampled_winner_rank": 0.6373011693358421, "support/selected_width": 42.75533056259155, "support/stored_width": 42.75533056259155 }, { "epoch": 0.671875, "grad_norm": 80.83043670654297, "kl/sequence_policy_ref": -430.89680099487305, "kl/vocab_forward": 383.08553886413574, "kl/vocab_js": 42.64789915084839, "kl/vocab_reverse": 176.26643562316895, "kl/vocab_symmetric": 559.3521347045898, "learning_rate": 2e-06, "logps/chosen": -795.0814590454102, "logps/rejected": -2073.403335571289, "loss": 1.947, "loss/dpo": 0.20042237156056064, "misalign/J": 174.65477752685547, "misalign/J_aux_loss": 1.7465477734804153, "misalign/J_aux_loss_raw": 174.65477752685547, "misalign/J_over_reverse_kl": 1.2193461656570435, "misalign/J_per_token": 0.21656284667551517, "misalign/compressed_reward_absmax": 4145.209014892578, "misalign/compressed_reward_range": 5894.319396972656, "misalign/entropy_a": 773.8354606628418, "misalign/entropy_b": 1241.6957702636719, "misalign/forward_kl_divergence": 383.08553886413574, "misalign/forward_kl_divergence_per_token": 0.6106686592102051, "misalign/gamma_abs_times_reward_std": 42811992.0, "misalign/gamma_bracketed_rate": 0.9881041571497917, "misalign/gamma_reward_residual": 0.0022615561604197865, "misalign/gamma_star": 57841907.5, "misalign/js_divergence": 42.64789915084839, "misalign/reverse_kl_divergence": 176.26643562316895, "misalign/reverse_kl_divergence_per_token": 0.20993488654494286, "misalign/reward_a": 112.22536277770996, "misalign/reward_b": -13.715306758880615, "misalign/reward_improvement": 125.94063758850098, "misalign/reward_improvement_over_reverse_kl": 0.7269639819860458, "misalign/reward_improvement_per_token": 0.13059457764029503, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -529.42356300354, "misalign/reward_vocab_std": 729.1826972961426, "misalign/symmetric_kl": 559.3521347045898, "misalign/tv_distance": 134.69511699676514, "num_tokens": 7295429.0, "rewards/accuracies": 0.96875, "rewards/chosen": -16.980572760105133, "rewards/margins": 52.21821975708008, "rewards/rejected": -69.19879198074341, "step": 43, "support/residual_count": 151893.271484375, "support/residual_mass_policy": 0.020681084133684635, "support/residual_mass_reference": 0.03672680747695267, "support/residual_reward": -0.5217800214886665, "support/runtime_width": 42.73170518875122, "support/sampled_loser_rank": 0.6487637758255005, "support/sampled_reward_rank": -0.33589007146656513, "support/sampled_token_added_rate": 0.0326521759852767, "support/sampled_winner_rank": 0.6681758984923363, "support/selected_width": 42.73170518875122, "support/stored_width": 42.73170518875122 }, { "epoch": 0.6875, "grad_norm": 126.96430206298828, "kl/sequence_policy_ref": -369.60080337524414, "kl/vocab_forward": 333.5518455505371, "kl/vocab_js": 38.65760946273804, "kl/vocab_reverse": 158.4586296081543, "kl/vocab_symmetric": 492.01073837280273, "learning_rate": 2e-06, "logps/chosen": -845.3204116821289, "logps/rejected": -1843.438232421875, "loss": 2.4664, "loss/dpo": 0.5317850863039725, "misalign/J": 193.46406745910645, "misalign/J_aux_loss": 1.9346406310796738, "misalign/J_aux_loss_raw": 193.46406745910645, "misalign/J_over_reverse_kl": 1.8410249948501587, "misalign/J_per_token": 0.21784362383186817, "misalign/compressed_reward_absmax": 4217.687438964844, "misalign/compressed_reward_range": 6031.9879150390625, "misalign/entropy_a": 751.3342437744141, "misalign/entropy_b": 1211.2969436645508, "misalign/forward_kl_divergence": 333.5518455505371, "misalign/forward_kl_divergence_per_token": 0.4570343755185604, "misalign/gamma_abs_times_reward_std": 54839250.375, "misalign/gamma_bracketed_rate": 0.9886480942368507, "misalign/gamma_reward_residual": 0.0009870923743164894, "misalign/gamma_star": 66258410.125, "misalign/js_divergence": 38.65760946273804, "misalign/reverse_kl_divergence": 158.4586296081543, "misalign/reverse_kl_divergence_per_token": 0.20796416513621807, "misalign/reward_a": 101.46306848526001, "misalign/reward_b": -14.63782051205635, "misalign/reward_improvement": 116.10085487365723, "misalign/reward_improvement_over_reverse_kl": 0.6505585312843323, "misalign/reward_improvement_per_token": 0.08616658858954906, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -558.583703994751, "misalign/reward_vocab_std": 734.9819030761719, "misalign/symmetric_kl": 492.01073837280273, "misalign/tv_distance": 125.39980697631836, "num_tokens": 7467937.0, "rewards/accuracies": 0.984375, "rewards/chosen": -16.166881680488586, "rewards/margins": 41.5863995552063, "rewards/rejected": -57.75328016281128, "step": 44, "support/residual_count": 151893.1171875, "support/residual_mass_policy": 0.02624459331855178, "support/residual_mass_reference": 0.04203576873987913, "support/residual_reward": -0.41040224581956863, "support/runtime_width": 42.882601737976074, "support/sampled_loser_rank": 0.597820907831192, "support/sampled_reward_rank": -0.4271346926689148, "support/sampled_token_added_rate": 0.03776927734725177, "support/sampled_winner_rank": 0.6288959942758083, "support/selected_width": 42.882601737976074, "support/stored_width": 42.882601737976074 }, { "epoch": 0.703125, "grad_norm": 263.4734191894531, "kl/sequence_policy_ref": -398.33577728271484, "kl/vocab_forward": 362.0641288757324, "kl/vocab_js": 40.054423570632935, "kl/vocab_reverse": 163.50957679748535, "kl/vocab_symmetric": 525.5739707946777, "learning_rate": 2e-06, "logps/chosen": -861.9217529296875, "logps/rejected": -1947.7921600341797, "loss": 2.821, "loss/dpo": 0.805006888877976, "misalign/J": 201.59605598449707, "misalign/J_aux_loss": 2.0159604400396347, "misalign/J_aux_loss_raw": 201.59605598449707, "misalign/J_over_reverse_kl": 1.4087589755654335, "misalign/J_per_token": 0.23312609270215034, "misalign/compressed_reward_absmax": 4261.386077880859, "misalign/compressed_reward_range": 5968.6805419921875, "misalign/entropy_a": 787.204231262207, "misalign/entropy_b": 1236.5688171386719, "misalign/forward_kl_divergence": 362.0641288757324, "misalign/forward_kl_divergence_per_token": 0.5964512750506401, "misalign/gamma_abs_times_reward_std": 58279984.25, "misalign/gamma_bracketed_rate": 0.9880961626768112, "misalign/gamma_reward_residual": 0.0008914316013033385, "misalign/gamma_star": 100481896.0, "misalign/js_divergence": 40.054423570632935, "misalign/reverse_kl_divergence": 163.50957679748535, "misalign/reverse_kl_divergence_per_token": 0.22092271596193314, "misalign/reward_a": 105.81989192962646, "misalign/reward_b": -13.749351739883423, "misalign/reward_improvement": 119.56921005249023, "misalign/reward_improvement_over_reverse_kl": 0.6199643798172474, "misalign/reward_improvement_per_token": 0.08543467940762639, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -543.9455938339233, "misalign/reward_vocab_std": 719.025936126709, "misalign/symmetric_kl": 525.5739707946777, "misalign/tv_distance": 128.89692497253418, "num_tokens": 7641400.0, "rewards/accuracies": 0.953125, "rewards/chosen": -18.768484711647034, "rewards/margins": 42.13018798828125, "rewards/rejected": -60.89867305755615, "step": 45, "support/residual_count": 151893.509765625, "support/residual_mass_policy": 0.023412443813867867, "support/residual_mass_reference": 0.03834630874916911, "support/residual_reward": -0.5029764696955681, "support/runtime_width": 42.48880910873413, "support/sampled_loser_rank": 0.6054023541510105, "support/sampled_reward_rank": -0.30370173789560795, "support/sampled_token_added_rate": 0.03430053312331438, "support/sampled_winner_rank": 0.6158883348107338, "support/selected_width": 42.48880910873413, "support/stored_width": 42.48880910873413 }, { "epoch": 0.71875, "grad_norm": 99.2862548828125, "kl/sequence_policy_ref": -352.9979362487793, "kl/vocab_forward": 320.0415687561035, "kl/vocab_js": 35.314189434051514, "kl/vocab_reverse": 143.37086391448975, "kl/vocab_symmetric": 463.41268157958984, "learning_rate": 2e-06, "logps/chosen": -780.6201934814453, "logps/rejected": -1745.756332397461, "loss": 1.967, "loss/dpo": 0.3397530964894031, "misalign/J": 162.72915935516357, "misalign/J_aux_loss": 1.6272915750741959, "misalign/J_aux_loss_raw": 162.72915935516357, "misalign/J_over_reverse_kl": 1.814868986606598, "misalign/J_per_token": 0.19831308163702488, "misalign/compressed_reward_absmax": 3964.359344482422, "misalign/compressed_reward_range": 5592.5999755859375, "misalign/entropy_a": 701.7979049682617, "misalign/entropy_b": 1128.1244354248047, "misalign/forward_kl_divergence": 320.0415687561035, "misalign/forward_kl_divergence_per_token": 0.5070049501955509, "misalign/gamma_abs_times_reward_std": 43917063.0, "misalign/gamma_bracketed_rate": 0.9886893406510353, "misalign/gamma_reward_residual": 0.00017012334137689322, "misalign/gamma_star": 66023362.0, "misalign/js_divergence": 35.314189434051514, "misalign/reverse_kl_divergence": 143.37086391448975, "misalign/reverse_kl_divergence_per_token": 0.18735219724476337, "misalign/reward_a": 90.78490257263184, "misalign/reward_b": -14.648303270339966, "misalign/reward_improvement": 105.43315505981445, "misalign/reward_improvement_over_reverse_kl": 0.6790562570095062, "misalign/reward_improvement_per_token": 0.09955848660320044, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -502.35912132263184, "misalign/reward_vocab_std": 681.5238342285156, "misalign/symmetric_kl": 463.41268157958984, "misalign/tv_distance": 114.9539966583252, "num_tokens": 7808320.0, "rewards/accuracies": 0.9375, "rewards/chosen": -15.536956906318665, "rewards/margins": 39.525673627853394, "rewards/rejected": -55.06262969970703, "step": 46, "support/residual_count": 151893.275390625, "support/residual_mass_policy": 0.027146896114572883, "support/residual_mass_reference": 0.04411950474604964, "support/residual_reward": -0.5129662416875362, "support/runtime_width": 42.72548723220825, "support/sampled_loser_rank": 0.5832961872220039, "support/sampled_reward_rank": -0.366660688072443, "support/sampled_token_added_rate": 0.03585506067611277, "support/sampled_winner_rank": 0.6062828227877617, "support/selected_width": 42.72548723220825, "support/stored_width": 42.72548723220825 }, { "epoch": 0.734375, "grad_norm": 95.33216857910156, "kl/sequence_policy_ref": -339.36829376220703, "kl/vocab_forward": 309.0083351135254, "kl/vocab_js": 34.39964842796326, "kl/vocab_reverse": 138.5347490310669, "kl/vocab_symmetric": 447.5432891845703, "learning_rate": 2e-06, "logps/chosen": -834.6276512145996, "logps/rejected": -1626.2992553710938, "loss": 2.1621, "loss/dpo": 0.5789944041461155, "misalign/J": 158.3114309310913, "misalign/J_aux_loss": 1.5831142514944077, "misalign/J_aux_loss_raw": 158.3114309310913, "misalign/J_over_reverse_kl": 1.3874521106481552, "misalign/J_per_token": 0.2954816836863756, "misalign/compressed_reward_absmax": 3782.6796264648438, "misalign/compressed_reward_range": 5458.201934814453, "misalign/entropy_a": 683.5626831054688, "misalign/entropy_b": 1110.1263809204102, "misalign/forward_kl_divergence": 309.0083351135254, "misalign/forward_kl_divergence_per_token": 0.7647044509649277, "misalign/gamma_abs_times_reward_std": 42300354.5, "misalign/gamma_bracketed_rate": 0.9892060980200768, "misalign/gamma_reward_residual": 0.00012735200425595394, "misalign/gamma_star": 70152250.0, "misalign/js_divergence": 34.39964842796326, "misalign/reverse_kl_divergence": 138.5347490310669, "misalign/reverse_kl_divergence_per_token": 0.3200199883431196, "misalign/reward_a": 96.08270931243896, "misalign/reward_b": -11.616803467273712, "misalign/reward_improvement": 107.69949054718018, "misalign/reward_improvement_over_reverse_kl": 0.6338916420936584, "misalign/reward_improvement_per_token": 0.0737753571011126, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -587.4529418945312, "misalign/reward_vocab_std": 666.2506561279297, "misalign/symmetric_kl": 447.5432891845703, "misalign/tv_distance": 114.28812599182129, "num_tokens": 7963180.0, "rewards/accuracies": 0.953125, "rewards/chosen": -15.032280445098877, "rewards/margins": 37.809099435806274, "rewards/rejected": -52.84138059616089, "step": 47, "support/residual_count": 151893.236328125, "support/residual_mass_policy": 0.02375667286105454, "support/residual_mass_reference": 0.04523510206490755, "support/residual_reward": -0.6332408636808395, "support/runtime_width": 42.76191568374634, "support/sampled_loser_rank": 0.6728832796216011, "support/sampled_reward_rank": -0.326838955283165, "support/sampled_token_added_rate": 0.029435024363920093, "support/sampled_winner_rank": 0.7073031365871429, "support/selected_width": 42.76191568374634, "support/stored_width": 42.76191568374634 }, { "epoch": 0.75, "grad_norm": 92.29680633544922, "kl/sequence_policy_ref": -349.18139266967773, "kl/vocab_forward": 318.0182914733887, "kl/vocab_js": 36.353710412979126, "kl/vocab_reverse": 146.12834548950195, "kl/vocab_symmetric": 464.14682960510254, "learning_rate": 2e-06, "logps/chosen": -845.9873504638672, "logps/rejected": -1924.1956024169922, "loss": 1.6528, "loss/dpo": 0.049382371892768256, "misalign/J": 160.33691692352295, "misalign/J_aux_loss": 1.6033690869808197, "misalign/J_aux_loss_raw": 160.33691692352295, "misalign/J_over_reverse_kl": 1.2529658675193787, "misalign/J_per_token": 0.20762860495597124, "misalign/compressed_reward_absmax": 4354.188568115234, "misalign/compressed_reward_range": 6215.323059082031, "misalign/entropy_a": 822.5513381958008, "misalign/entropy_b": 1289.558578491211, "misalign/forward_kl_divergence": 318.0182914733887, "misalign/forward_kl_divergence_per_token": 0.440604854375124, "misalign/gamma_abs_times_reward_std": 41074954.5, "misalign/gamma_bracketed_rate": 0.9895801991224289, "misalign/gamma_reward_residual": 0.00018931449562842317, "misalign/gamma_star": 57840401.875, "misalign/js_divergence": 36.353710412979126, "misalign/reverse_kl_divergence": 146.12834548950195, "misalign/reverse_kl_divergence_per_token": 0.18436118587851524, "misalign/reward_a": 101.43512630462646, "misalign/reward_b": -11.868624448776245, "misalign/reward_improvement": 113.30374765396118, "misalign/reward_improvement_over_reverse_kl": 0.7551356106996536, "misalign/reward_improvement_per_token": 0.1261532404460013, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -493.02664852142334, "misalign/reward_vocab_std": 759.0527877807617, "misalign/symmetric_kl": 464.14682960510254, "misalign/tv_distance": 123.64643812179565, "num_tokens": 8135261.0, "rewards/accuracies": 0.984375, "rewards/chosen": -14.46340036392212, "rewards/margins": 40.90948009490967, "rewards/rejected": -55.37287950515747, "step": 48, "support/residual_count": 151893.193359375, "support/residual_mass_policy": 0.026260258397087455, "support/residual_mass_reference": 0.042555712163448334, "support/residual_reward": -0.47505020070821047, "support/runtime_width": 42.80619525909424, "support/sampled_loser_rank": 0.625778254121542, "support/sampled_reward_rank": -0.3972127176821232, "support/sampled_token_added_rate": 0.03378989826887846, "support/sampled_winner_rank": 0.6260612569749355, "support/selected_width": 42.80619525909424, "support/stored_width": 42.80619525909424 }, { "epoch": 0.75, "eval_kl/sequence_policy_ref": -364.4170129299164, "eval_kl/vocab_forward": 328.8754801750183, "eval_kl/vocab_js": 37.498633831739426, "eval_kl/vocab_reverse": 150.4579164981842, "eval_kl/vocab_symmetric": 479.3336043357849, "eval_logps/chosen": -882.4125304222107, "eval_logps/rejected": -1946.6873836517334, "eval_loss": 1.7490763664245605, "eval_loss/dpo": 0.06740551616172231, "eval_misalign/J": 168.167093873024, "eval_misalign/J_aux_loss": 1.6816708873957396, "eval_misalign/J_aux_loss_raw": 168.167093873024, "eval_misalign/J_over_reverse_kl": 1.7788694016635418, "eval_misalign/J_per_token": 0.22979121375828981, "eval_misalign/compressed_reward_absmax": 4236.706272125244, "eval_misalign/compressed_reward_range": 6027.056537628174, "eval_misalign/entropy_a": 823.1465611457825, "eval_misalign/entropy_b": 1298.0544729232788, "eval_misalign/forward_kl_divergence": 328.8754801750183, "eval_misalign/forward_kl_divergence_per_token": 0.4856905459892005, "eval_misalign/gamma_abs_times_reward_std": 43424142.421875, "eval_misalign/gamma_bracketed_rate": 0.9878035467118025, "eval_misalign/gamma_reward_residual": 0.0007769855760599853, "eval_misalign/gamma_star": 61287659.90625, "eval_misalign/js_divergence": 37.498633831739426, "eval_misalign/reverse_kl_divergence": 150.4579164981842, "eval_misalign/reverse_kl_divergence_per_token": 0.19300507346633822, "eval_misalign/reward_a": 101.7501335144043, "eval_misalign/reward_b": -14.307281976565719, "eval_misalign/reward_improvement": 116.05739098787308, "eval_misalign/reward_improvement_over_reverse_kl": 0.7271916684694588, "eval_misalign/reward_improvement_per_token": 0.09193722825148143, "eval_misalign/reward_signal_low_rate": 0.0, "eval_misalign/reward_vocab_mean": -490.11554992198944, "eval_misalign/reward_vocab_std": 731.32026720047, "eval_misalign/symmetric_kl": 479.3336043357849, "eval_misalign/tv_distance": 127.07569408416748, "eval_rewards/accuracies": 0.984375, "eval_rewards/chosen": -15.323657296597958, "eval_rewards/margins": 42.23608899116516, "eval_rewards/rejected": -57.55974632501602, "eval_runtime": 100.8337, "eval_samples_per_second": 5.078, "eval_steps_per_second": 0.635, "eval_support/residual_count": 151893.29125976562, "eval_support/residual_mass_policy": 0.025306705734692514, "eval_support/residual_mass_reference": 0.04279232310364023, "eval_support/residual_reward": -0.4695481152739376, "eval_support/runtime_width": 42.70963191986084, "eval_support/sampled_loser_rank": 0.6486562248319387, "eval_support/sampled_reward_rank": -0.37071577250026166, "eval_support/sampled_token_added_rate": 0.037317203474231064, "eval_support/sampled_winner_rank": 0.6506854901090264, "eval_support/selected_width": 42.70963191986084, "eval_support/stored_width": 42.70963191986084, "step": 48 }, { "epoch": 0.765625, "grad_norm": 84.55875396728516, "kl/sequence_policy_ref": -355.52197265625, "kl/vocab_forward": 323.58666229248047, "kl/vocab_js": 38.03043556213379, "kl/vocab_reverse": 151.7306032180786, "kl/vocab_symmetric": 475.31740951538086, "learning_rate": 2e-06, "logps/chosen": -917.9943237304688, "logps/rejected": -1895.99658203125, "loss": 1.6704, "loss/dpo": 0.04557584371703283, "misalign/J": 162.48288917541504, "misalign/J_aux_loss": 1.624828889966011, "misalign/J_aux_loss_raw": 162.48288917541504, "misalign/J_over_reverse_kl": 1.4654640778899193, "misalign/J_per_token": 0.20114040188491344, "misalign/compressed_reward_absmax": 4219.985290527344, "misalign/compressed_reward_range": 5980.884826660156, "misalign/entropy_a": 811.0443420410156, "misalign/entropy_b": 1299.4309158325195, "misalign/forward_kl_divergence": 323.58666229248047, "misalign/forward_kl_divergence_per_token": 0.399110134691, "misalign/gamma_abs_times_reward_std": 53172764.0, "misalign/gamma_bracketed_rate": 0.9869352579116821, "misalign/gamma_reward_residual": 3.9546100197185297e-05, "misalign/gamma_star": 60015785.75, "misalign/js_divergence": 38.03043556213379, "misalign/reverse_kl_divergence": 151.7306032180786, "misalign/reverse_kl_divergence_per_token": 0.16637993790209293, "misalign/reward_a": 105.46696472167969, "misalign/reward_b": -16.47221863269806, "misalign/reward_improvement": 121.9391622543335, "misalign/reward_improvement_over_reverse_kl": 0.7262164875864983, "misalign/reward_improvement_per_token": 0.10510652232915163, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -500.18709564208984, "misalign/reward_vocab_std": 730.5291976928711, "misalign/symmetric_kl": 475.31740951538086, "misalign/tv_distance": 129.61585903167725, "num_tokens": 8305232.0, "rewards/accuracies": 0.984375, "rewards/chosen": -15.759445548057556, "rewards/margins": 39.58550405502319, "rewards/rejected": -55.34494924545288, "step": 49, "support/residual_count": 151893.353515625, "support/residual_mass_policy": 0.022411803365685046, "support/residual_mass_reference": 0.03780948673374951, "support/residual_reward": -0.5111432895064354, "support/runtime_width": 42.64701747894287, "support/sampled_loser_rank": 0.6062813438475132, "support/sampled_reward_rank": -0.361979590728879, "support/sampled_token_added_rate": 0.03365356335416436, "support/sampled_winner_rank": 0.626273512840271, "support/selected_width": 42.64701747894287, "support/stored_width": 42.64701747894287 }, { "epoch": 0.78125, "grad_norm": 120.11223602294922, "kl/sequence_policy_ref": -378.5549774169922, "kl/vocab_forward": 352.6853256225586, "kl/vocab_js": 37.74899077415466, "kl/vocab_reverse": 148.82802867889404, "kl/vocab_symmetric": 501.5135078430176, "learning_rate": 2e-06, "logps/chosen": -884.3883514404297, "logps/rejected": -1912.2733612060547, "loss": 2.1813, "loss/dpo": 0.1028611735179559, "misalign/J": 207.84398746490479, "misalign/J_aux_loss": 2.078439861536026, "misalign/J_aux_loss_raw": 207.84398746490479, "misalign/J_over_reverse_kl": 1.7021770626306534, "misalign/J_per_token": 0.2538851350545883, "misalign/compressed_reward_absmax": 4287.376739501953, "misalign/compressed_reward_range": 6082.450378417969, "misalign/entropy_a": 770.8007431030273, "misalign/entropy_b": 1264.5378875732422, "misalign/forward_kl_divergence": 352.6853256225586, "misalign/forward_kl_divergence_per_token": 0.5054213367402554, "misalign/gamma_abs_times_reward_std": 62323988.25, "misalign/gamma_bracketed_rate": 0.985651396214962, "misalign/gamma_reward_residual": 0.002676691350643523, "misalign/gamma_star": 81304675.0, "misalign/js_divergence": 37.74899077415466, "misalign/reverse_kl_divergence": 148.82802867889404, "misalign/reverse_kl_divergence_per_token": 0.19959929399192333, "misalign/reward_a": 91.68502187728882, "misalign/reward_b": -13.228169560432434, "misalign/reward_improvement": 104.91317367553711, "misalign/reward_improvement_over_reverse_kl": 0.6395450867712498, "misalign/reward_improvement_per_token": 0.07659031543880701, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -545.8368339538574, "misalign/reward_vocab_std": 735.2048110961914, "misalign/symmetric_kl": 501.5135078430176, "misalign/tv_distance": 127.10494995117188, "num_tokens": 8481546.0, "rewards/accuracies": 0.984375, "rewards/chosen": -16.9174667596817, "rewards/margins": 41.87606382369995, "rewards/rejected": -58.793529987335205, "step": 50, "support/residual_count": 151893.240234375, "support/residual_mass_policy": 0.028523427667096257, "support/residual_mass_reference": 0.04290076903998852, "support/residual_reward": -0.4372365176677704, "support/runtime_width": 42.76475811004639, "support/sampled_loser_rank": 0.5924237333238125, "support/sampled_reward_rank": -0.40600746124982834, "support/sampled_token_added_rate": 0.03576831449754536, "support/sampled_winner_rank": 0.618581123650074, "support/selected_width": 42.76475811004639, "support/stored_width": 42.76475811004639 }, { "epoch": 0.796875, "grad_norm": 74.84430694580078, "kl/sequence_policy_ref": -348.85993576049805, "kl/vocab_forward": 322.82544136047363, "kl/vocab_js": 34.94163501262665, "kl/vocab_reverse": 137.33496236801147, "kl/vocab_symmetric": 460.160680770874, "learning_rate": 2e-06, "logps/chosen": -807.7403802871704, "logps/rejected": -1745.3731842041016, "loss": 2.0267, "loss/dpo": 0.47453043650421023, "misalign/J": 155.21207237243652, "misalign/J_aux_loss": 1.5521207079291344, "misalign/J_aux_loss_raw": 155.21207237243652, "misalign/J_over_reverse_kl": 1.433588370680809, "misalign/J_per_token": 0.2532362565398216, "misalign/compressed_reward_absmax": 3888.342071533203, "misalign/compressed_reward_range": 5480.323181152344, "misalign/entropy_a": 695.3439636230469, "misalign/entropy_b": 1156.5927391052246, "misalign/forward_kl_divergence": 322.82544136047363, "misalign/forward_kl_divergence_per_token": 0.725763525813818, "misalign/gamma_abs_times_reward_std": 41670287.75, "misalign/gamma_bracketed_rate": 0.9877297207713127, "misalign/gamma_reward_residual": -6.590514796300795e-05, "misalign/gamma_star": 57525834.875, "misalign/js_divergence": 34.94163501262665, "misalign/reverse_kl_divergence": 137.33496236801147, "misalign/reverse_kl_divergence_per_token": 0.2455148883163929, "misalign/reward_a": 83.87791728973389, "misalign/reward_b": -14.364118754863739, "misalign/reward_improvement": 98.24201107025146, "misalign/reward_improvement_over_reverse_kl": 0.5628711394965649, "misalign/reward_improvement_per_token": 0.03299418743699789, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -404.0699119567871, "misalign/reward_vocab_std": 662.7630424499512, "misalign/symmetric_kl": 460.160680770874, "misalign/tv_distance": 116.86641025543213, "num_tokens": 8645585.0, "rewards/accuracies": 0.96875, "rewards/chosen": -16.726518213748932, "rewards/margins": 36.31895399093628, "rewards/rejected": -53.045472145080566, "step": 51, "support/residual_count": 151893.197265625, "support/residual_mass_policy": 0.024921999080106616, "support/residual_mass_reference": 0.04474550345912576, "support/residual_reward": -0.4795303028076887, "support/runtime_width": 42.801236152648926, "support/sampled_loser_rank": 0.624417282640934, "support/sampled_reward_rank": -0.37105782236903906, "support/sampled_token_added_rate": 0.032293472439050674, "support/sampled_winner_rank": 0.6685933172702789, "support/selected_width": 42.801236152648926, "support/stored_width": 42.801236152648926 }, { "epoch": 0.8125, "grad_norm": 86.25784301757812, "kl/sequence_policy_ref": -477.46276092529297, "kl/vocab_forward": 436.0279541015625, "kl/vocab_js": 44.9802827835083, "kl/vocab_reverse": 177.54276657104492, "kl/vocab_symmetric": 613.5709609985352, "learning_rate": 2e-06, "logps/chosen": -881.7166900634766, "logps/rejected": -2414.805862426758, "loss": 2.5689, "loss/dpo": 0.6295568409377225, "misalign/J": 193.93076133728027, "misalign/J_aux_loss": 1.939307525753975, "misalign/J_aux_loss_raw": 193.93076133728027, "misalign/J_over_reverse_kl": 1.406748965382576, "misalign/J_per_token": 0.2848157715052366, "misalign/compressed_reward_absmax": 4759.470764160156, "misalign/compressed_reward_range": 6730.137023925781, "misalign/entropy_a": 874.9066314697266, "misalign/entropy_b": 1443.2865905761719, "misalign/forward_kl_divergence": 436.0279541015625, "misalign/forward_kl_divergence_per_token": 0.5377647392451763, "misalign/gamma_abs_times_reward_std": 52484937.5, "misalign/gamma_bracketed_rate": 0.9844366908073425, "misalign/gamma_reward_residual": 0.0008822159904866567, "misalign/gamma_star": 86450198.0, "misalign/js_divergence": 44.9802827835083, "misalign/reverse_kl_divergence": 177.54276657104492, "misalign/reverse_kl_divergence_per_token": 0.1856713891029358, "misalign/reward_a": 107.41517734527588, "misalign/reward_b": -19.218833327293396, "misalign/reward_improvement": 126.63399696350098, "misalign/reward_improvement_over_reverse_kl": 0.6467054821550846, "misalign/reward_improvement_per_token": 0.07611742825247347, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -570.9265441894531, "misalign/reward_vocab_std": 818.5440444946289, "misalign/symmetric_kl": 613.5709609985352, "misalign/tv_distance": 146.72683811187744, "num_tokens": 8830616.0, "rewards/accuracies": 0.96875, "rewards/chosen": -20.915815234184265, "rewards/margins": 53.66092252731323, "rewards/rejected": -74.57673835754395, "step": 52, "support/residual_count": 151893.328125, "support/residual_mass_policy": 0.02075903001241386, "support/residual_mass_reference": 0.04033026983961463, "support/residual_reward": -0.39157247683033347, "support/runtime_width": 42.66762590408325, "support/sampled_loser_rank": 0.6271255537867546, "support/sampled_reward_rank": -0.3960692873224616, "support/sampled_token_added_rate": 0.0356892254203558, "support/sampled_winner_rank": 0.6444137506186962, "support/selected_width": 42.66762590408325, "support/stored_width": 42.66762590408325 }, { "epoch": 0.828125, "grad_norm": 101.57430267333984, "kl/sequence_policy_ref": -465.77392578125, "kl/vocab_forward": 446.5703926086426, "kl/vocab_js": 47.15303373336792, "kl/vocab_reverse": 184.80498790740967, "kl/vocab_symmetric": 631.3756484985352, "learning_rate": 2e-06, "logps/chosen": -1251.7670364379883, "logps/rejected": -2130.667724609375, "loss": 2.2582, "loss/dpo": 0.29861677209665966, "misalign/J": 195.95514106750488, "misalign/J_aux_loss": 1.959551364183426, "misalign/J_aux_loss_raw": 195.95514106750488, "misalign/J_over_reverse_kl": 1.1659726202487946, "misalign/J_per_token": 0.2168925404548645, "misalign/compressed_reward_absmax": 4992.182373046875, "misalign/compressed_reward_range": 7108.8575439453125, "misalign/entropy_a": 898.7776031494141, "misalign/entropy_b": 1514.9154586791992, "misalign/forward_kl_divergence": 446.5703926086426, "misalign/forward_kl_divergence_per_token": 0.5860045477747917, "misalign/gamma_abs_times_reward_std": 51011083.5, "misalign/gamma_bracketed_rate": 0.9903441444039345, "misalign/gamma_reward_residual": -0.0019245314256295387, "misalign/gamma_star": 85517778.5, "misalign/js_divergence": 47.15303373336792, "misalign/reverse_kl_divergence": 184.80498790740967, "misalign/reverse_kl_divergence_per_token": 0.21940777078270912, "misalign/reward_a": 114.58473777770996, "misalign/reward_b": -14.659283697605133, "misalign/reward_improvement": 129.2440366744995, "misalign/reward_improvement_over_reverse_kl": 0.6434826478362083, "misalign/reward_improvement_per_token": 0.05605245754122734, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -762.099235534668, "misalign/reward_vocab_std": 874.5540542602539, "misalign/symmetric_kl": 631.3756484985352, "misalign/tv_distance": 157.77043342590332, "num_tokens": 9025743.0, "rewards/accuracies": 0.953125, "rewards/chosen": -26.162310361862183, "rewards/margins": 40.83016753196716, "rewards/rejected": -66.99247980117798, "step": 53, "support/residual_count": 151893.298828125, "support/residual_mass_policy": 0.020821302896365523, "support/residual_mass_reference": 0.039504863787442446, "support/residual_reward": -0.5619673319160938, "support/runtime_width": 42.6985387802124, "support/sampled_loser_rank": 0.6461238414049149, "support/sampled_reward_rank": -0.34166772849857807, "support/sampled_token_added_rate": 0.03369109332561493, "support/sampled_winner_rank": 0.6555347442626953, "support/selected_width": 42.6985387802124, "support/stored_width": 42.6985387802124 }, { "epoch": 0.84375, "grad_norm": 298.0096435546875, "kl/sequence_policy_ref": -386.71077728271484, "kl/vocab_forward": 365.6785316467285, "kl/vocab_js": 35.59814095497131, "kl/vocab_reverse": 139.05288410186768, "kl/vocab_symmetric": 504.7316131591797, "learning_rate": 2e-06, "logps/chosen": -760.6196022033691, "logps/rejected": -1875.7993927001953, "loss": 2.171, "loss/dpo": 0.14888411300079252, "misalign/J": 202.2076416015625, "misalign/J_aux_loss": 2.022076301276684, "misalign/J_aux_loss_raw": 202.2076416015625, "misalign/J_over_reverse_kl": 2.347415864467621, "misalign/J_per_token": 0.2071497868746519, "misalign/compressed_reward_absmax": 4018.2659912109375, "misalign/compressed_reward_range": 5681.7738037109375, "misalign/entropy_a": 716.9813385009766, "misalign/entropy_b": 1167.6287994384766, "misalign/forward_kl_divergence": 365.6785316467285, "misalign/forward_kl_divergence_per_token": 0.49891503155231476, "misalign/gamma_abs_times_reward_std": 61696461.5, "misalign/gamma_bracketed_rate": 0.9874565973877907, "misalign/gamma_reward_residual": -0.004232324329905168, "misalign/gamma_star": 28188861.5, "misalign/js_divergence": 35.59814095497131, "misalign/reverse_kl_divergence": 139.05288410186768, "misalign/reverse_kl_divergence_per_token": 0.17908263765275478, "misalign/reward_a": 83.29793310165405, "misalign/reward_b": -13.053004205226898, "misalign/reward_improvement": 96.35093402862549, "misalign/reward_improvement_over_reverse_kl": 0.588286180049181, "misalign/reward_improvement_per_token": 0.07466917904093862, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -496.0263900756836, "misalign/reward_vocab_std": 698.2202835083008, "misalign/symmetric_kl": 504.7316131591797, "misalign/tv_distance": 115.95970249176025, "num_tokens": 9183237.0, "rewards/accuracies": 0.953125, "rewards/chosen": -18.112544775009155, "rewards/margins": 41.11706781387329, "rewards/rejected": -59.229612827301025, "step": 54, "support/residual_count": 151893.44140625, "support/residual_mass_policy": 0.02262613782659173, "support/residual_mass_reference": 0.03800426935777068, "support/residual_reward": -0.5341151673346758, "support/runtime_width": 42.55538368225098, "support/sampled_loser_rank": 0.5614169128239155, "support/sampled_reward_rank": -0.29479603469371796, "support/sampled_token_added_rate": 0.03493571188300848, "support/sampled_winner_rank": 0.5692789405584335, "support/selected_width": 42.55538368225098, "support/stored_width": 42.55538368225098 }, { "epoch": 0.859375, "grad_norm": 99.84505462646484, "kl/sequence_policy_ref": -418.7293930053711, "kl/vocab_forward": 371.4789810180664, "kl/vocab_js": 35.276673793792725, "kl/vocab_reverse": 138.0893669128418, "kl/vocab_symmetric": 509.5685806274414, "learning_rate": 2e-06, "logps/chosen": -689.4199485778809, "logps/rejected": -1892.0952606201172, "loss": 1.7759, "loss/dpo": 0.14481948580403764, "misalign/J": 163.1080617904663, "misalign/J_aux_loss": 1.6310805529356003, "misalign/J_aux_loss_raw": 163.1080617904663, "misalign/J_over_reverse_kl": 1.3716232553124428, "misalign/J_per_token": 0.2615004684776068, "misalign/compressed_reward_absmax": 3880.816650390625, "misalign/compressed_reward_range": 5555.395263671875, "misalign/entropy_a": 626.7629547119141, "misalign/entropy_b": 1067.2015533447266, "misalign/forward_kl_divergence": 371.4789810180664, "misalign/forward_kl_divergence_per_token": 0.8582677766680717, "misalign/gamma_abs_times_reward_std": 44081586.0, "misalign/gamma_bracketed_rate": 0.9872501865029335, "misalign/gamma_reward_residual": 0.007101273212128945, "misalign/gamma_star": 65061232.25, "misalign/js_divergence": 35.276673793792725, "misalign/reverse_kl_divergence": 138.0893669128418, "misalign/reverse_kl_divergence_per_token": 0.27060581743717194, "misalign/reward_a": 76.1998405456543, "misalign/reward_b": -13.09484925866127, "misalign/reward_improvement": 89.29467296600342, "misalign/reward_improvement_over_reverse_kl": 0.5231703817844391, "misalign/reward_improvement_per_token": 0.035240316297858953, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -446.2036018371582, "misalign/reward_vocab_std": 662.775749206543, "misalign/symmetric_kl": 509.5685806274414, "misalign/tv_distance": 113.11602115631104, "num_tokens": 9339840.0, "rewards/accuracies": 0.984375, "rewards/chosen": -17.44824308156967, "rewards/margins": 48.849395751953125, "rewards/rejected": -66.29763889312744, "step": 55, "support/residual_count": 151893.3671875, "support/residual_mass_policy": 0.019164926023222506, "support/residual_mass_reference": 0.03859002981334925, "support/residual_reward": -0.47019451297819614, "support/runtime_width": 42.63320541381836, "support/sampled_loser_rank": 0.6031154692173004, "support/sampled_reward_rank": -0.38968720100820065, "support/sampled_token_added_rate": 0.0316556547768414, "support/sampled_winner_rank": 0.6063822247087955, "support/selected_width": 42.63320541381836, "support/stored_width": 42.63320541381836 }, { "epoch": 0.875, "grad_norm": 68.76524353027344, "kl/sequence_policy_ref": -402.62422370910645, "kl/vocab_forward": 360.61656951904297, "kl/vocab_js": 36.79127216339111, "kl/vocab_reverse": 143.1292266845703, "kl/vocab_symmetric": 503.7460651397705, "learning_rate": 2e-06, "logps/chosen": -796.70947265625, "logps/rejected": -1951.9369506835938, "loss": 1.9268, "loss/dpo": 0.31827243250995924, "misalign/J": 160.8573293685913, "misalign/J_aux_loss": 1.608573243021965, "misalign/J_aux_loss_raw": 160.8573293685913, "misalign/J_over_reverse_kl": 1.212674729526043, "misalign/J_per_token": 0.23724722862243652, "misalign/compressed_reward_absmax": 3996.8917541503906, "misalign/compressed_reward_range": 5663.479064941406, "misalign/entropy_a": 717.5296401977539, "misalign/entropy_b": 1203.4962768554688, "misalign/forward_kl_divergence": 360.61656951904297, "misalign/forward_kl_divergence_per_token": 0.7455775737762451, "misalign/gamma_abs_times_reward_std": 49514251.0, "misalign/gamma_bracketed_rate": 0.9896730110049248, "misalign/gamma_reward_residual": 9.679878201040992e-05, "misalign/gamma_star": 57219496.125, "misalign/js_divergence": 36.79127216339111, "misalign/reverse_kl_divergence": 143.1292266845703, "misalign/reverse_kl_divergence_per_token": 0.26022260822355747, "misalign/reward_a": 82.29156970977783, "misalign/reward_b": -11.702201634645462, "misalign/reward_improvement": 93.99376678466797, "misalign/reward_improvement_over_reverse_kl": 0.5378214567899704, "misalign/reward_improvement_per_token": 0.04054644517600536, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -461.6409397125244, "misalign/reward_vocab_std": 698.131046295166, "misalign/symmetric_kl": 503.7460651397705, "misalign/tv_distance": 122.93070888519287, "num_tokens": 9506436.0, "rewards/accuracies": 0.953125, "rewards/chosen": -17.97724825143814, "rewards/margins": 44.570350646972656, "rewards/rejected": -62.54759979248047, "step": 56, "support/residual_count": 151893.306640625, "support/residual_mass_policy": 0.020866328850388527, "support/residual_mass_reference": 0.039209546288475394, "support/residual_reward": -0.41471402533352375, "support/runtime_width": 42.69208335876465, "support/sampled_loser_rank": 0.6182565689086914, "support/sampled_reward_rank": -0.4434542544186115, "support/sampled_token_added_rate": 0.03403148171491921, "support/sampled_winner_rank": 0.6329206973314285, "support/selected_width": 42.69208335876465, "support/stored_width": 42.69208335876465 }, { "epoch": 0.890625, "grad_norm": 94.79530334472656, "kl/sequence_policy_ref": -375.49475288391113, "kl/vocab_forward": 340.9373073577881, "kl/vocab_js": 33.93104815483093, "kl/vocab_reverse": 132.34368515014648, "kl/vocab_symmetric": 473.28116607666016, "learning_rate": 2e-06, "logps/chosen": -666.1807670593262, "logps/rejected": -1875.8219909667969, "loss": 1.7866, "loss/dpo": 0.24838248359601162, "misalign/J": 153.82119750976562, "misalign/J_aux_loss": 1.5382119417190552, "misalign/J_aux_loss_raw": 153.82119750976562, "misalign/J_over_reverse_kl": 1.4413592517375946, "misalign/J_per_token": 0.2751000728458166, "misalign/compressed_reward_absmax": 3810.44873046875, "misalign/compressed_reward_range": 5403.443389892578, "misalign/entropy_a": 678.8246765136719, "misalign/entropy_b": 1104.3003616333008, "misalign/forward_kl_divergence": 340.9373073577881, "misalign/forward_kl_divergence_per_token": 0.9033116102218628, "misalign/gamma_abs_times_reward_std": 43435399.0, "misalign/gamma_bracketed_rate": 0.9872131571173668, "misalign/gamma_reward_residual": 0.00028186071233449184, "misalign/gamma_star": 71246069.25, "misalign/js_divergence": 33.93104815483093, "misalign/reverse_kl_divergence": 132.34368515014648, "misalign/reverse_kl_divergence_per_token": 0.27233118936419487, "misalign/reward_a": 77.04429864883423, "misalign/reward_b": -15.656381011009216, "misalign/reward_improvement": 92.70064735412598, "misalign/reward_improvement_over_reverse_kl": 0.46722693368792534, "misalign/reward_improvement_per_token": -0.003935309126973152, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -453.2850399017334, "misalign/reward_vocab_std": 652.1301879882812, "misalign/symmetric_kl": 473.28116607666016, "misalign/tv_distance": 112.50246047973633, "num_tokens": 9663583.0, "rewards/accuracies": 0.953125, "rewards/chosen": -16.42910087108612, "rewards/margins": 42.240750551223755, "rewards/rejected": -58.6698522567749, "step": 57, "support/residual_count": 151893.123046875, "support/residual_mass_policy": 0.025635873898863792, "support/residual_mass_reference": 0.051683404948562384, "support/residual_reward": -0.43221110105514526, "support/runtime_width": 42.8780837059021, "support/sampled_loser_rank": 0.6225372664630413, "support/sampled_reward_rank": -0.4814031980931759, "support/sampled_token_added_rate": 0.044704005820676684, "support/sampled_winner_rank": 0.6614899709820747, "support/selected_width": 42.8780837059021, "support/stored_width": 42.8780837059021 }, { "epoch": 0.90625, "grad_norm": 63.19841003417969, "kl/sequence_policy_ref": -429.0251274108887, "kl/vocab_forward": 393.4870414733887, "kl/vocab_js": 40.02447009086609, "kl/vocab_reverse": 155.88319969177246, "kl/vocab_symmetric": 549.3705825805664, "learning_rate": 2e-06, "logps/chosen": -829.9505310058594, "logps/rejected": -2197.3387451171875, "loss": 2.1333, "loss/dpo": 0.4092487035359162, "misalign/J": 172.40936088562012, "misalign/J_aux_loss": 1.724093571305275, "misalign/J_aux_loss_raw": 172.40936088562012, "misalign/J_over_reverse_kl": 2.0524225011467934, "misalign/J_per_token": 0.27197333984076977, "misalign/compressed_reward_absmax": 4601.915740966797, "misalign/compressed_reward_range": 6508.782531738281, "misalign/entropy_a": 811.3467330932617, "misalign/entropy_b": 1340.6858673095703, "misalign/forward_kl_divergence": 393.4870414733887, "misalign/forward_kl_divergence_per_token": 0.5244201272726059, "misalign/gamma_abs_times_reward_std": 48018970.5, "misalign/gamma_bracketed_rate": 0.9816265851259232, "misalign/gamma_reward_residual": 0.00013214930413596448, "misalign/gamma_star": 68324594.0, "misalign/js_divergence": 40.02447009086609, "misalign/reverse_kl_divergence": 155.88319969177246, "misalign/reverse_kl_divergence_per_token": 0.17863536719232798, "misalign/reward_a": 84.13565301895142, "misalign/reward_b": -20.68537664413452, "misalign/reward_improvement": 104.8209924697876, "misalign/reward_improvement_over_reverse_kl": 0.602836437523365, "misalign/reward_improvement_per_token": 0.056509769055992365, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -554.3869686126709, "misalign/reward_vocab_std": 789.2750701904297, "misalign/symmetric_kl": 549.3705825805664, "misalign/tv_distance": 134.0800666809082, "num_tokens": 9847213.0, "rewards/accuracies": 0.984375, "rewards/chosen": -18.04733145236969, "rewards/margins": 49.71036195755005, "rewards/rejected": -67.75769662857056, "step": 58, "support/residual_count": 151893.20703125, "support/residual_mass_policy": 0.02004858397413045, "support/residual_mass_reference": 0.043993874453008175, "support/residual_reward": -0.44899558275938034, "support/runtime_width": 42.797929763793945, "support/sampled_loser_rank": 0.6209223605692387, "support/sampled_reward_rank": -0.39453774876892567, "support/sampled_token_added_rate": 0.03281214344315231, "support/sampled_winner_rank": 0.663671188056469, "support/selected_width": 42.797929763793945, "support/stored_width": 42.797929763793945 }, { "epoch": 0.921875, "grad_norm": 117.94857788085938, "kl/sequence_policy_ref": -443.7041206359863, "kl/vocab_forward": 409.88890075683594, "kl/vocab_js": 40.246328592300415, "kl/vocab_reverse": 156.80553817749023, "kl/vocab_symmetric": 566.6947135925293, "learning_rate": 2e-06, "logps/chosen": -869.2680358886719, "logps/rejected": -2070.011489868164, "loss": 3.0679, "loss/dpo": 1.1368572648187203, "misalign/J": 193.10802459716797, "misalign/J_aux_loss": 1.931080162525177, "misalign/J_aux_loss_raw": 193.10802459716797, "misalign/J_over_reverse_kl": 1.537727639079094, "misalign/J_per_token": 0.22832004725933075, "misalign/compressed_reward_absmax": 4309.089996337891, "misalign/compressed_reward_range": 6035.041198730469, "misalign/entropy_a": 770.8880767822266, "misalign/entropy_b": 1271.0429077148438, "misalign/forward_kl_divergence": 409.88890075683594, "misalign/forward_kl_divergence_per_token": 0.548376951366663, "misalign/gamma_abs_times_reward_std": 57010088.0, "misalign/gamma_bracketed_rate": 0.9858352392911911, "misalign/gamma_reward_residual": 0.00474659322799198, "misalign/gamma_star": 87458212.0, "misalign/js_divergence": 40.246328592300415, "misalign/reverse_kl_divergence": 156.80553817749023, "misalign/reverse_kl_divergence_per_token": 0.1863451935350895, "misalign/reward_a": 87.77897262573242, "misalign/reward_b": -18.043125957250595, "misalign/reward_improvement": 105.82207298278809, "misalign/reward_improvement_over_reverse_kl": 0.6192803308367729, "misalign/reward_improvement_per_token": 0.07612488837912679, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -637.8235740661621, "misalign/reward_vocab_std": 722.6581039428711, "misalign/symmetric_kl": 566.6947135925293, "misalign/tv_distance": 131.4430446624756, "num_tokens": 10021862.0, "rewards/accuracies": 0.953125, "rewards/chosen": -21.588929533958435, "rewards/margins": 45.56296682357788, "rewards/rejected": -67.15189743041992, "step": 59, "support/residual_count": 151893.498046875, "support/residual_mass_policy": 0.022956559900194407, "support/residual_mass_reference": 0.03896482312120497, "support/residual_reward": -0.552052453160286, "support/runtime_width": 42.50350904464722, "support/sampled_loser_rank": 0.5951877385377884, "support/sampled_reward_rank": -0.2748332447372377, "support/sampled_token_added_rate": 0.03523444454185665, "support/sampled_winner_rank": 0.6131882853806019, "support/selected_width": 42.50350904464722, "support/stored_width": 42.50350904464722 }, { "epoch": 0.9375, "grad_norm": 52.73037338256836, "kl/sequence_policy_ref": -394.9115695953369, "kl/vocab_forward": 363.2713165283203, "kl/vocab_js": 35.156471252441406, "kl/vocab_reverse": 136.1243805885315, "kl/vocab_symmetric": 499.39588928222656, "learning_rate": 2e-06, "logps/chosen": -676.7391777038574, "logps/rejected": -1959.0352096557617, "loss": 1.6395, "loss/dpo": 0.1764393468254366, "misalign/J": 146.30127716064453, "misalign/J_aux_loss": 1.4630127176642418, "misalign/J_aux_loss_raw": 146.30127716064453, "misalign/J_over_reverse_kl": 1.2657062262296677, "misalign/J_per_token": 0.22581798769533634, "misalign/compressed_reward_absmax": 4125.538238525391, "misalign/compressed_reward_range": 5841.7431640625, "misalign/entropy_a": 674.7129745483398, "misalign/entropy_b": 1137.8532485961914, "misalign/forward_kl_divergence": 363.2713165283203, "misalign/forward_kl_divergence_per_token": 0.5696188099682331, "misalign/gamma_abs_times_reward_std": 40711104.25, "misalign/gamma_bracketed_rate": 0.9862173870205879, "misalign/gamma_reward_residual": 0.0012141035936110711, "misalign/gamma_star": 54984167.75, "misalign/js_divergence": 35.156471252441406, "misalign/reverse_kl_divergence": 136.1243805885315, "misalign/reverse_kl_divergence_per_token": 0.18038787879049778, "misalign/reward_a": 68.60513997077942, "misalign/reward_b": -15.08642065525055, "misalign/reward_improvement": 83.69155693054199, "misalign/reward_improvement_over_reverse_kl": 0.6113412380218506, "misalign/reward_improvement_per_token": 0.08037259662523866, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -327.37775897979736, "misalign/reward_vocab_std": 704.0034561157227, "misalign/symmetric_kl": 499.39588928222656, "misalign/tv_distance": 114.7151231765747, "num_tokens": 10193216.0, "rewards/accuracies": 0.984375, "rewards/chosen": -16.442305088043213, "rewards/margins": 46.09770584106445, "rewards/rejected": -62.540010929107666, "step": 60, "support/residual_count": 151893.306640625, "support/residual_mass_policy": 0.020310348831117153, "support/residual_mass_reference": 0.03744203574024141, "support/residual_reward": -0.35806123074144125, "support/runtime_width": 42.69117593765259, "support/sampled_loser_rank": 0.5424250774085522, "support/sampled_reward_rank": -0.44320493191480637, "support/sampled_token_added_rate": 0.03534874925389886, "support/sampled_winner_rank": 0.5376182310283184, "support/selected_width": 42.69117593765259, "support/stored_width": 42.69117593765259 }, { "epoch": 0.9375, "eval_kl/sequence_policy_ref": -443.8763482570648, "eval_kl/vocab_forward": 408.00172185897827, "eval_kl/vocab_js": 39.08841371536255, "eval_kl/vocab_reverse": 151.38377118110657, "eval_kl/vocab_symmetric": 559.3856892585754, "eval_logps/chosen": -939.2870273590088, "eval_logps/rejected": -2048.731554031372, "eval_loss": 1.7673890590667725, "eval_loss/dpo": 0.14154351732228904, "eval_misalign/J": 162.58456230163574, "eval_misalign/J_aux_loss": 1.625845598988235, "eval_misalign/J_aux_loss_raw": 162.58456230163574, "eval_misalign/J_over_reverse_kl": 1.5481905555352569, "eval_misalign/J_per_token": 0.22693954594433308, "eval_misalign/compressed_reward_absmax": 4236.706272125244, "eval_misalign/compressed_reward_range": 6027.056491851807, "eval_misalign/entropy_a": 786.7468104362488, "eval_misalign/entropy_b": 1298.0544500350952, "eval_misalign/forward_kl_divergence": 408.00172185897827, "eval_misalign/forward_kl_divergence_per_token": 0.6159884915687144, "eval_misalign/gamma_abs_times_reward_std": 42470128.234375, "eval_misalign/gamma_bracketed_rate": 0.9880082719027996, "eval_misalign/gamma_reward_residual": 0.0014271657525881665, "eval_misalign/gamma_star": 53453969.375, "eval_misalign/js_divergence": 39.08841371536255, "eval_misalign/reverse_kl_divergence": 151.38377118110657, "eval_misalign/reverse_kl_divergence_per_token": 0.1974795301211998, "eval_misalign/reward_a": 88.27734404802322, "eval_misalign/reward_b": -14.307282019406557, "eval_misalign/reward_improvement": 102.58461898565292, "eval_misalign/reward_improvement_over_reverse_kl": 0.6177075733430684, "eval_misalign/reward_improvement_per_token": 0.07934931269846857, "eval_misalign/reward_signal_low_rate": 0.0, "eval_misalign/reward_vocab_mean": -490.11556017398834, "eval_misalign/reward_vocab_std": 731.320264339447, "eval_misalign/symmetric_kl": 559.3856892585754, "eval_misalign/tv_distance": 129.75177884101868, "eval_rewards/accuracies": 0.982421875, "eval_rewards/chosen": -21.011107921600342, "eval_rewards/margins": 46.75305512547493, "eval_rewards/rejected": -67.76416301727295, "eval_runtime": 101.586, "eval_samples_per_second": 5.04, "eval_steps_per_second": 0.63, "eval_support/residual_count": 151893.29125976562, "eval_support/residual_mass_policy": 0.023736586881568655, "eval_support/residual_mass_reference": 0.04279232310364023, "eval_support/residual_reward": -0.4695481152739376, "eval_support/runtime_width": 42.70963191986084, "eval_support/sampled_loser_rank": 0.6486562248319387, "eval_support/sampled_reward_rank": -0.37071577250026166, "eval_support/sampled_token_added_rate": 0.037317203474231064, "eval_support/sampled_winner_rank": 0.6506854901090264, "eval_support/selected_width": 42.70963191986084, "eval_support/stored_width": 42.70963191986084, "step": 60 }, { "epoch": 0.953125, "grad_norm": 66.27374267578125, "kl/sequence_policy_ref": -495.910831451416, "kl/vocab_forward": 454.87990951538086, "kl/vocab_js": 42.96171951293945, "kl/vocab_reverse": 166.5788345336914, "kl/vocab_symmetric": 621.4590148925781, "learning_rate": 2e-06, "logps/chosen": -882.9071960449219, "logps/rejected": -2324.2291259765625, "loss": 1.9092, "loss/dpo": 0.00656887573249243, "misalign/J": 190.26472282409668, "misalign/J_aux_loss": 1.9026471227407455, "misalign/J_aux_loss_raw": 190.26472282409668, "misalign/J_over_reverse_kl": 1.2592844367027283, "misalign/J_per_token": 0.22797120176255703, "misalign/compressed_reward_absmax": 4472.839447021484, "misalign/compressed_reward_range": 6308.651184082031, "misalign/entropy_a": 808.8803405761719, "misalign/entropy_b": 1358.7424087524414, "misalign/forward_kl_divergence": 454.87990951538086, "misalign/forward_kl_divergence_per_token": 0.6771250255405903, "misalign/gamma_abs_times_reward_std": 54845561.25, "misalign/gamma_bracketed_rate": 0.9894062727689743, "misalign/gamma_reward_residual": 0.0002826992104019155, "misalign/gamma_star": 55459094.75, "misalign/js_divergence": 42.96171951293945, "misalign/reverse_kl_divergence": 166.5788345336914, "misalign/reverse_kl_divergence_per_token": 0.2539278268814087, "misalign/reward_a": 87.17640686035156, "misalign/reward_b": -18.71793007850647, "misalign/reward_improvement": 105.89432716369629, "misalign/reward_improvement_over_reverse_kl": 0.5326569080352783, "misalign/reward_improvement_per_token": 0.01839596056379378, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -465.77495288848877, "misalign/reward_vocab_std": 756.1640853881836, "misalign/symmetric_kl": 621.4590148925781, "misalign/tv_distance": 140.44063568115234, "num_tokens": 10376767.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.160260915756226, "rewards/margins": 52.86164569854736, "rewards/rejected": -76.02190685272217, "step": 61, "support/residual_count": 151893.14453125, "support/residual_mass_policy": 0.020877071423456073, "support/residual_mass_reference": 0.04333586525171995, "support/residual_reward": -0.3012515353038907, "support/runtime_width": 42.852439880371094, "support/sampled_loser_rank": 0.6411669962108135, "support/sampled_reward_rank": -0.4843590557575226, "support/sampled_token_added_rate": 0.03510869154706597, "support/sampled_winner_rank": 0.7503333985805511, "support/selected_width": 42.852439880371094, "support/stored_width": 42.852439880371094 }, { "epoch": 0.96875, "grad_norm": 135.78811645507812, "kl/sequence_policy_ref": -521.3121376037598, "kl/vocab_forward": 479.5559272766113, "kl/vocab_js": 44.95201635360718, "kl/vocab_reverse": 172.62139701843262, "kl/vocab_symmetric": 652.1775360107422, "learning_rate": 2e-06, "logps/chosen": -1089.0823440551758, "logps/rejected": -2275.8909606933594, "loss": 2.0802, "loss/dpo": 0.21537577021728538, "misalign/J": 186.48083686828613, "misalign/J_aux_loss": 1.8648083209991455, "misalign/J_aux_loss_raw": 186.48083686828613, "misalign/J_over_reverse_kl": 1.9623293429613113, "misalign/J_per_token": 0.2417179737240076, "misalign/compressed_reward_absmax": 4746.217712402344, "misalign/compressed_reward_range": 6676.980529785156, "misalign/entropy_a": 843.7399063110352, "misalign/entropy_b": 1419.2857818603516, "misalign/forward_kl_divergence": 479.5559272766113, "misalign/forward_kl_divergence_per_token": 0.5206913501024246, "misalign/gamma_abs_times_reward_std": 52206773.75, "misalign/gamma_bracketed_rate": 0.9809182211756706, "misalign/gamma_reward_residual": 0.0015386186018986336, "misalign/gamma_star": 70413638.5, "misalign/js_divergence": 44.95201635360718, "misalign/reverse_kl_divergence": 172.62139701843262, "misalign/reverse_kl_divergence_per_token": 0.17035234905779362, "misalign/reward_a": 94.845046043396, "misalign/reward_b": -16.313238620758057, "misalign/reward_improvement": 111.15826416015625, "misalign/reward_improvement_over_reverse_kl": 0.5862268581986427, "misalign/reward_improvement_per_token": 0.0937041062861681, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -627.9793968200684, "misalign/reward_vocab_std": 811.575309753418, "misalign/symmetric_kl": 652.1775360107422, "misalign/tv_distance": 147.92323780059814, "num_tokens": 10568093.0, "rewards/accuracies": 0.953125, "rewards/chosen": -28.54444169998169, "rewards/margins": 47.17354154586792, "rewards/rejected": -75.7179822921753, "step": 62, "support/residual_count": 151893.359375, "support/residual_mass_policy": 0.022450818214565516, "support/residual_mass_reference": 0.04165853979066014, "support/residual_reward": -0.45337859727442265, "support/runtime_width": 42.641594886779785, "support/sampled_loser_rank": 0.6541027873754501, "support/sampled_reward_rank": -0.38643040135502815, "support/sampled_token_added_rate": 0.037647833582013845, "support/sampled_winner_rank": 0.6750443577766418, "support/selected_width": 42.641594886779785, "support/stored_width": 42.641594886779785 }, { "epoch": 0.984375, "grad_norm": 157.7623748779297, "kl/sequence_policy_ref": -430.5328941345215, "kl/vocab_forward": 418.2077407836914, "kl/vocab_js": 37.59886360168457, "kl/vocab_reverse": 146.4429168701172, "kl/vocab_symmetric": 564.6509666442871, "learning_rate": 2e-06, "logps/chosen": -991.5014877319336, "logps/rejected": -1757.1116333007812, "loss": 2.0983, "loss/dpo": 0.4923999092846994, "misalign/J": 160.58584594726562, "misalign/J_aux_loss": 1.6058583855628967, "misalign/J_aux_loss_raw": 160.58584594726562, "misalign/J_over_reverse_kl": 1.1775383204221725, "misalign/J_per_token": 0.2121292594820261, "misalign/compressed_reward_absmax": 4109.173980712891, "misalign/compressed_reward_range": 5852.2374267578125, "misalign/entropy_a": 698.0654525756836, "misalign/entropy_b": 1176.365493774414, "misalign/forward_kl_divergence": 418.2077407836914, "misalign/forward_kl_divergence_per_token": 0.7069764323532581, "misalign/gamma_abs_times_reward_std": 45094587.0, "misalign/gamma_bracketed_rate": 0.9876261055469513, "misalign/gamma_reward_residual": 0.00018861131684388965, "misalign/gamma_star": 68429815.0, "misalign/js_divergence": 37.59886360168457, "misalign/reverse_kl_divergence": 146.4429168701172, "misalign/reverse_kl_divergence_per_token": 0.1941323447972536, "misalign/reward_a": 87.26894760131836, "misalign/reward_b": -12.587202161550522, "misalign/reward_improvement": 99.85614490509033, "misalign/reward_improvement_over_reverse_kl": 0.5914403721690178, "misalign/reward_improvement_per_token": 0.09591837041079998, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -588.2237000465393, "misalign/reward_vocab_std": 723.0128402709961, "misalign/symmetric_kl": 564.6509666442871, "misalign/tv_distance": 122.21442317962646, "num_tokens": 10736658.0, "rewards/accuracies": 0.9375, "rewards/chosen": -24.446208238601685, "rewards/margins": 37.21416687965393, "rewards/rejected": -61.6603741645813, "step": 63, "support/residual_count": 151893.123046875, "support/residual_mass_policy": 0.026434314902871847, "support/residual_mass_reference": 0.04181864345446229, "support/residual_reward": -0.5436067841947079, "support/runtime_width": 42.87605428695679, "support/sampled_loser_rank": 0.5825388208031654, "support/sampled_reward_rank": -0.3311811648309231, "support/sampled_token_added_rate": 0.032393347937613726, "support/sampled_winner_rank": 0.6503275334835052, "support/selected_width": 42.87605428695679, "support/stored_width": 42.87605428695679 }, { "epoch": 1.0, "grad_norm": 53.39131164550781, "kl/sequence_policy_ref": -518.7998046875, "kl/vocab_forward": 483.2027702331543, "kl/vocab_js": 43.7255322933197, "kl/vocab_reverse": 169.42027759552002, "kl/vocab_symmetric": 652.6233749389648, "learning_rate": 2e-06, "logps/chosen": -1134.265338897705, "logps/rejected": -2165.7147216796875, "loss": 1.7421, "loss/dpo": 0.08747747553955643, "misalign/J": 165.4585018157959, "misalign/J_aux_loss": 1.6545849740505219, "misalign/J_aux_loss_raw": 165.4585018157959, "misalign/J_over_reverse_kl": 1.0515589267015457, "misalign/J_per_token": 0.1934837531298399, "misalign/compressed_reward_absmax": 4486.107025146484, "misalign/compressed_reward_range": 6402.580139160156, "misalign/entropy_a": 834.7258224487305, "misalign/entropy_b": 1397.0357360839844, "misalign/forward_kl_divergence": 483.2027702331543, "misalign/forward_kl_divergence_per_token": 0.6137732639908791, "misalign/gamma_abs_times_reward_std": 49280889.75, "misalign/gamma_bracketed_rate": 0.9895988926291466, "misalign/gamma_reward_residual": 0.00024569173842792225, "misalign/gamma_star": 40878552.0, "misalign/js_divergence": 43.7255322933197, "misalign/reverse_kl_divergence": 169.42027759552002, "misalign/reverse_kl_divergence_per_token": 0.1976974420249462, "misalign/reward_a": 98.72580575942993, "misalign/reward_b": -13.32999886572361, "misalign/reward_improvement": 112.05579376220703, "misalign/reward_improvement_over_reverse_kl": 0.6203483864665031, "misalign/reward_improvement_per_token": 0.10607674531638622, "misalign/reward_signal_low_rate": 0.0, "misalign/reward_vocab_mean": -555.1519546508789, "misalign/reward_vocab_std": 773.9923553466797, "misalign/symmetric_kl": 652.6233749389648, "misalign/tv_distance": 142.4766387939453, "num_tokens": 10915114.0, "rewards/accuracies": 0.984375, "rewards/chosen": -29.072665452957153, "rewards/margins": 45.61463260650635, "rewards/rejected": -74.68729877471924, "step": 64, "support/residual_count": 151893.263671875, "support/residual_mass_policy": 0.024331093532964587, "support/residual_mass_reference": 0.04252262390218675, "support/residual_reward": -0.4992054486647248, "support/runtime_width": 42.739386558532715, "support/sampled_loser_rank": 0.6560819111764431, "support/sampled_reward_rank": -0.35881325230002403, "support/sampled_token_added_rate": 0.03401010250672698, "support/sampled_winner_rank": 0.6759711802005768, "support/selected_width": 42.739386558532715, "support/stored_width": 42.739386558532715 }, { "epoch": 1.0, "step": 64, "total_flos": 4.111435733296742e+16, "train_loss": 1.8484279848635197, "train_runtime": 2097.8551, "train_samples_per_second": 1.952, "train_steps_per_second": 0.031 }, { "epoch": 1.0, "eval_kl/sequence_policy_ref": -484.36079502105713, "eval_kl/vocab_forward": 447.0612106323242, "eval_kl/vocab_js": 40.982308477163315, "eval_kl/vocab_reverse": 159.29439425468445, "eval_kl/vocab_symmetric": 606.3558564186096, "eval_logps/chosen": -960.8710179328918, "eval_logps/rejected": -2108.1164512634277, "eval_loss": 1.7901853322982788, "eval_loss/dpo": 0.1378665120205333, "eval_misalign/J": 165.23188638687134, "eval_misalign/J_aux_loss": 1.6523188361898065, "eval_misalign/J_aux_loss_raw": 165.23188638687134, "eval_misalign/J_over_reverse_kl": 1.4319368861615658, "eval_misalign/J_per_token": 0.2249188085552305, "eval_misalign/compressed_reward_absmax": 4236.7062911987305, "eval_misalign/compressed_reward_range": 6027.056602478027, "eval_misalign/entropy_a": 773.5548601150513, "eval_misalign/entropy_b": 1298.0544300079346, "eval_misalign/forward_kl_divergence": 447.0612106323242, "eval_misalign/forward_kl_divergence_per_token": 0.6650652508251369, "eval_misalign/gamma_abs_times_reward_std": 42179871.046875, "eval_misalign/gamma_bracketed_rate": 0.9879104141145945, "eval_misalign/gamma_reward_residual": 0.0007516551395241322, "eval_misalign/gamma_star": 54525594.609375, "eval_misalign/js_divergence": 40.982308477163315, "eval_misalign/reverse_kl_divergence": 159.29439425468445, "eval_misalign/reverse_kl_divergence_per_token": 0.20468837535008788, "eval_misalign/reward_a": 88.6484876871109, "eval_misalign/reward_b": -14.307281013578176, "eval_misalign/reward_improvement": 102.95576465129852, "eval_misalign/reward_improvement_over_reverse_kl": 0.5992142450995743, "eval_misalign/reward_improvement_per_token": 0.0850256277481094, "eval_misalign/reward_signal_low_rate": 0.0, "eval_misalign/reward_vocab_mean": -490.11554074287415, "eval_misalign/reward_vocab_std": 731.3202610015869, "eval_misalign/symmetric_kl": 606.3558564186096, "eval_misalign/tv_distance": 133.03198266029358, "eval_rewards/accuracies": 0.984375, "eval_rewards/chosen": -23.169506430625916, "eval_rewards/margins": 50.53314882516861, "eval_rewards/rejected": -73.70265424251556, "eval_runtime": 101.2427, "eval_samples_per_second": 5.057, "eval_steps_per_second": 0.632, "eval_support/residual_count": 151893.29125976562, "eval_support/residual_mass_policy": 0.023531361599452794, "eval_support/residual_mass_reference": 0.04279232310364023, "eval_support/residual_reward": -0.4695481152739376, "eval_support/runtime_width": 42.70963191986084, "eval_support/sampled_loser_rank": 0.6486562248319387, "eval_support/sampled_reward_rank": -0.37071577250026166, "eval_support/sampled_token_added_rate": 0.037317203474231064, "eval_support/sampled_winner_rank": 0.6506854901090264, "eval_support/selected_width": 42.70963191986084, "eval_support/stored_width": 42.70963191986084, "step": 64 } ], "logging_steps": 1, "max_steps": 64, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 6, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.111435733296742e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }