Files
llama-3-8b-base-beta-dpo-hh…/trainer_state.json
ModelHub XC 2349f340b8 初始化项目,由ModelHub XC社区提供模型
Model: W-61/llama-3-8b-base-beta-dpo-hh-harmless-8xh200
Source: Original Platform
2026-05-25 19:35:17 +08:00

1027 lines
36 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 330,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"beta_dpo/beta_used": 0.10009249299764633,
"beta_dpo/beta_used_raw": 0.10009249299764633,
"beta_dpo/gap_mean": 0.0012140885228291154,
"beta_dpo/gap_std": 0.029596734791994095,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.0030303030303030303,
"grad_norm": 11.079418182373047,
"learning_rate": 0.0,
"logits/chosen": -0.818070113658905,
"logits/rejected": -0.7612971663475037,
"loss": 0.6929,
"step": 1
},
{
"beta_dpo/beta_used": 0.10004878044128418,
"beta_dpo/beta_used_raw": 0.10004878044128418,
"beta_dpo/gap_mean": -0.003181760897859931,
"beta_dpo/gap_std": 0.09769059717655182,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.015151515151515152,
"grad_norm": 12.246779441833496,
"learning_rate": 6.060606060606061e-08,
"logits/chosen": -0.8416346907615662,
"logits/rejected": -0.8071619272232056,
"loss": 0.6934,
"step": 5
},
{
"beta_dpo/beta_used": 0.10060784965753555,
"beta_dpo/beta_used_raw": 0.10060784965753555,
"beta_dpo/gap_mean": -0.0015905939508229494,
"beta_dpo/gap_std": 0.1881129890680313,
"beta_dpo/mask_keep_frac": 0.7749999761581421,
"epoch": 0.030303030303030304,
"grad_norm": 11.778424263000488,
"learning_rate": 1.3636363636363635e-07,
"logits/chosen": -0.7911893129348755,
"logits/rejected": -0.7587390542030334,
"loss": 0.6928,
"step": 10
},
{
"beta_dpo/beta_used": 0.10040197521448135,
"beta_dpo/beta_used_raw": 0.10040197521448135,
"beta_dpo/gap_mean": 0.0006210329011082649,
"beta_dpo/gap_std": 0.24522730708122253,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.045454545454545456,
"grad_norm": 12.626185417175293,
"learning_rate": 2.121212121212121e-07,
"logits/chosen": -0.8082472085952759,
"logits/rejected": -0.8093615770339966,
"loss": 0.6928,
"step": 15
},
{
"beta_dpo/beta_used": 0.10040859878063202,
"beta_dpo/beta_used_raw": 0.10040859878063202,
"beta_dpo/gap_mean": 0.008134648203849792,
"beta_dpo/gap_std": 0.2810249626636505,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.06060606060606061,
"grad_norm": 12.163843154907227,
"learning_rate": 2.878787878787879e-07,
"logits/chosen": -0.7914258241653442,
"logits/rejected": -0.7522870302200317,
"loss": 0.6925,
"step": 20
},
{
"beta_dpo/beta_used": 0.10019676387310028,
"beta_dpo/beta_used_raw": 0.10019676387310028,
"beta_dpo/gap_mean": 0.007132118102163076,
"beta_dpo/gap_std": 0.3137893080711365,
"beta_dpo/mask_keep_frac": 0.800000011920929,
"epoch": 0.07575757575757576,
"grad_norm": 12.878430366516113,
"learning_rate": 3.636363636363636e-07,
"logits/chosen": -0.7768210172653198,
"logits/rejected": -0.771538496017456,
"loss": 0.6926,
"step": 25
},
{
"beta_dpo/beta_used": 0.10199077427387238,
"beta_dpo/beta_used_raw": 0.10199077427387238,
"beta_dpo/gap_mean": 0.015979086980223656,
"beta_dpo/gap_std": 0.34232962131500244,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.09090909090909091,
"grad_norm": 11.947314262390137,
"learning_rate": 4.3939393939393937e-07,
"logits/chosen": -0.8367147445678711,
"logits/rejected": -0.8112382888793945,
"loss": 0.6907,
"step": 30
},
{
"beta_dpo/beta_used": 0.10177697986364365,
"beta_dpo/beta_used_raw": 0.10177697986364365,
"beta_dpo/gap_mean": 0.0375533364713192,
"beta_dpo/gap_std": 0.3859425187110901,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.10606060606060606,
"grad_norm": 14.33592700958252,
"learning_rate": 4.999860140229787e-07,
"logits/chosen": -0.8096274137496948,
"logits/rejected": -0.7928019762039185,
"loss": 0.6898,
"step": 35
},
{
"beta_dpo/beta_used": 0.10338791459798813,
"beta_dpo/beta_used_raw": 0.10338791459798813,
"beta_dpo/gap_mean": 0.06975066661834717,
"beta_dpo/gap_std": 0.45846351981163025,
"beta_dpo/mask_keep_frac": 0.824999988079071,
"epoch": 0.12121212121212122,
"grad_norm": 11.904743194580078,
"learning_rate": 4.994966691179711e-07,
"logits/chosen": -0.7240467667579651,
"logits/rejected": -0.6869294047355652,
"loss": 0.6868,
"step": 40
},
{
"beta_dpo/beta_used": 0.105168916285038,
"beta_dpo/beta_used_raw": 0.105168916285038,
"beta_dpo/gap_mean": 0.14308178424835205,
"beta_dpo/gap_std": 0.5644584894180298,
"beta_dpo/mask_keep_frac": 0.800000011920929,
"epoch": 0.13636363636363635,
"grad_norm": 13.17418098449707,
"learning_rate": 4.983095894354857e-07,
"logits/chosen": -0.7734057307243347,
"logits/rejected": -0.7477155923843384,
"loss": 0.6818,
"step": 45
},
{
"beta_dpo/beta_used": 0.10223841667175293,
"beta_dpo/beta_used_raw": 0.10223841667175293,
"beta_dpo/gap_mean": 0.21264997124671936,
"beta_dpo/gap_std": 0.7354207038879395,
"beta_dpo/mask_keep_frac": 0.7749999761581421,
"epoch": 0.15151515151515152,
"grad_norm": 12.405279159545898,
"learning_rate": 4.964280947263676e-07,
"logits/chosen": -0.7339795827865601,
"logits/rejected": -0.7022608518600464,
"loss": 0.6815,
"step": 50
},
{
"beta_dpo/beta_used": 0.10513879358768463,
"beta_dpo/beta_used_raw": 0.10513879358768463,
"beta_dpo/gap_mean": 0.27966898679733276,
"beta_dpo/gap_std": 1.0065762996673584,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.16666666666666666,
"grad_norm": 13.70584774017334,
"learning_rate": 4.938574467213517e-07,
"logits/chosen": -0.7537848949432373,
"logits/rejected": -0.7295504808425903,
"loss": 0.6752,
"step": 55
},
{
"beta_dpo/beta_used": 0.10337547957897186,
"beta_dpo/beta_used_raw": 0.10337547957897186,
"beta_dpo/gap_mean": 0.3844713568687439,
"beta_dpo/gap_std": 1.2807694673538208,
"beta_dpo/mask_keep_frac": 0.762499988079071,
"epoch": 0.18181818181818182,
"grad_norm": 12.184106826782227,
"learning_rate": 4.906048344162676e-07,
"logits/chosen": -0.7029341459274292,
"logits/rejected": -0.6750706434249878,
"loss": 0.6718,
"step": 60
},
{
"beta_dpo/beta_used": 0.10123707354068756,
"beta_dpo/beta_used_raw": 0.10123707354068756,
"beta_dpo/gap_mean": 0.5187833309173584,
"beta_dpo/gap_std": 1.5582863092422485,
"beta_dpo/mask_keep_frac": 0.800000011920929,
"epoch": 0.19696969696969696,
"grad_norm": 12.474862098693848,
"learning_rate": 4.866793539675126e-07,
"logits/chosen": -0.7182232737541199,
"logits/rejected": -0.6864453554153442,
"loss": 0.668,
"step": 65
},
{
"beta_dpo/beta_used": 0.10362961143255234,
"beta_dpo/beta_used_raw": 0.10362961143255234,
"beta_dpo/gap_mean": 0.6425492763519287,
"beta_dpo/gap_std": 1.8649520874023438,
"beta_dpo/mask_keep_frac": 0.800000011920929,
"epoch": 0.21212121212121213,
"grad_norm": 13.411380767822266,
"learning_rate": 4.820919832540181e-07,
"logits/chosen": -0.6498057842254639,
"logits/rejected": -0.6468607783317566,
"loss": 0.6611,
"step": 70
},
{
"beta_dpo/beta_used": 0.10772015154361725,
"beta_dpo/beta_used_raw": 0.10772015154361725,
"beta_dpo/gap_mean": 0.7031647562980652,
"beta_dpo/gap_std": 2.167182683944702,
"beta_dpo/mask_keep_frac": 0.862500011920929,
"epoch": 0.22727272727272727,
"grad_norm": 12.674415588378906,
"learning_rate": 4.768555511768486e-07,
"logits/chosen": -0.6153755187988281,
"logits/rejected": -0.606307327747345,
"loss": 0.653,
"step": 75
},
{
"beta_dpo/beta_used": 0.10870923101902008,
"beta_dpo/beta_used_raw": 0.10870923101902008,
"beta_dpo/gap_mean": 0.8461316227912903,
"beta_dpo/gap_std": 2.5076112747192383,
"beta_dpo/mask_keep_frac": 0.8374999761581421,
"epoch": 0.24242424242424243,
"grad_norm": 13.425226211547852,
"learning_rate": 4.7098470178228755e-07,
"logits/chosen": -0.6497966647148132,
"logits/rejected": -0.6329380869865417,
"loss": 0.6466,
"step": 80
},
{
"beta_dpo/beta_used": 0.1060580238699913,
"beta_dpo/beta_used_raw": 0.1060580238699913,
"beta_dpo/gap_mean": 0.9982147216796875,
"beta_dpo/gap_std": 2.806090831756592,
"beta_dpo/mask_keep_frac": 0.800000011920929,
"epoch": 0.25757575757575757,
"grad_norm": 9.75727653503418,
"learning_rate": 4.6449585330874425e-07,
"logits/chosen": -0.6012470722198486,
"logits/rejected": -0.5752061605453491,
"loss": 0.6435,
"step": 85
},
{
"beta_dpo/beta_used": 0.11574982106685638,
"beta_dpo/beta_used_raw": 0.11574982106685638,
"beta_dpo/gap_mean": 1.2254174947738647,
"beta_dpo/gap_std": 3.2572083473205566,
"beta_dpo/mask_keep_frac": 0.800000011920929,
"epoch": 0.2727272727272727,
"grad_norm": 10.738388061523438,
"learning_rate": 4.5740715227200897e-07,
"logits/chosen": -0.650251567363739,
"logits/rejected": -0.6243180632591248,
"loss": 0.6219,
"step": 90
},
{
"beta_dpo/beta_used": 0.09826114773750305,
"beta_dpo/beta_used_raw": 0.09826114773750305,
"beta_dpo/gap_mean": 1.4264709949493408,
"beta_dpo/gap_std": 3.7166686058044434,
"beta_dpo/mask_keep_frac": 0.762499988079071,
"epoch": 0.2878787878787879,
"grad_norm": 13.121673583984375,
"learning_rate": 4.4973842271726024e-07,
"logits/chosen": -0.5675602555274963,
"logits/rejected": -0.5547417402267456,
"loss": 0.6362,
"step": 95
},
{
"beta_dpo/beta_used": 0.10674748569726944,
"beta_dpo/beta_used_raw": 0.10674748569726944,
"beta_dpo/gap_mean": 1.5260875225067139,
"beta_dpo/gap_std": 4.1418657302856445,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.30303030303030304,
"grad_norm": 15.6002197265625,
"learning_rate": 4.415111107797445e-07,
"logits/chosen": -0.5712032914161682,
"logits/rejected": -0.5290790796279907,
"loss": 0.6231,
"step": 100
},
{
"epoch": 0.30303030303030304,
"eval_beta_dpo/beta_used": 0.11167524755001068,
"eval_beta_dpo/beta_used_raw": 0.11167524755001068,
"eval_beta_dpo/gap_mean": 1.9525233507156372,
"eval_beta_dpo/gap_std": 4.847992897033691,
"eval_beta_dpo/mask_keep_frac": 1.0,
"eval_logits/chosen": -0.5574179887771606,
"eval_logits/rejected": -0.540048360824585,
"eval_loss": 0.6185675263404846,
"eval_runtime": 18.8608,
"eval_samples_per_second": 122.105,
"eval_steps_per_second": 0.954,
"step": 100
},
{
"beta_dpo/beta_used": 0.06386379897594452,
"beta_dpo/beta_used_raw": 0.06386379897594452,
"beta_dpo/gap_mean": 2.0449135303497314,
"beta_dpo/gap_std": 5.11466121673584,
"beta_dpo/mask_keep_frac": 0.887499988079071,
"epoch": 0.3181818181818182,
"grad_norm": 10.90100383758545,
"learning_rate": 4.327482247091679e-07,
"logits/chosen": -0.5555615425109863,
"logits/rejected": -0.528151273727417,
"loss": 0.6534,
"step": 105
},
{
"beta_dpo/beta_used": 0.08590348809957504,
"beta_dpo/beta_used_raw": 0.08590348809957504,
"beta_dpo/gap_mean": 2.1610352993011475,
"beta_dpo/gap_std": 5.504552364349365,
"beta_dpo/mask_keep_frac": 0.800000011920929,
"epoch": 0.3333333333333333,
"grad_norm": 7.672910690307617,
"learning_rate": 4.234742705255272e-07,
"logits/chosen": -0.4595974385738373,
"logits/rejected": -0.45340991020202637,
"loss": 0.6317,
"step": 110
},
{
"beta_dpo/beta_used": 0.10557971149682999,
"beta_dpo/beta_used_raw": 0.10557971149682999,
"beta_dpo/gap_mean": 2.390939474105835,
"beta_dpo/gap_std": 5.818662166595459,
"beta_dpo/mask_keep_frac": 0.862500011920929,
"epoch": 0.3484848484848485,
"grad_norm": 8.269521713256836,
"learning_rate": 4.137151834863213e-07,
"logits/chosen": -0.5435389280319214,
"logits/rejected": -0.4987867474555969,
"loss": 0.5959,
"step": 115
},
{
"beta_dpo/beta_used": 0.08998899161815643,
"beta_dpo/beta_used_raw": 0.08998899161815643,
"beta_dpo/gap_mean": 2.3944687843322754,
"beta_dpo/gap_std": 6.05053186416626,
"beta_dpo/mask_keep_frac": 0.8374999761581421,
"epoch": 0.36363636363636365,
"grad_norm": 13.379582405090332,
"learning_rate": 4.0349825555680045e-07,
"logits/chosen": -0.5789726972579956,
"logits/rejected": -0.5432100296020508,
"loss": 0.6198,
"step": 120
},
{
"beta_dpo/beta_used": 0.08791515231132507,
"beta_dpo/beta_used_raw": 0.08791515231132507,
"beta_dpo/gap_mean": 2.5297319889068604,
"beta_dpo/gap_std": 6.210949897766113,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.3787878787878788,
"grad_norm": 7.562979221343994,
"learning_rate": 3.9285205908608934e-07,
"logits/chosen": -0.5596938729286194,
"logits/rejected": -0.5469728708267212,
"loss": 0.6146,
"step": 125
},
{
"beta_dpo/beta_used": 0.11058609187602997,
"beta_dpo/beta_used_raw": 0.11058609187602997,
"beta_dpo/gap_mean": 2.536633014678955,
"beta_dpo/gap_std": 6.392093181610107,
"beta_dpo/mask_keep_frac": 0.7875000238418579,
"epoch": 0.3939393939393939,
"grad_norm": 23.452016830444336,
"learning_rate": 3.818063669026256e-07,
"logits/chosen": -0.5439124703407288,
"logits/rejected": -0.5279029607772827,
"loss": 0.5928,
"step": 130
},
{
"beta_dpo/beta_used": 0.10615509748458862,
"beta_dpo/beta_used_raw": 0.10615509748458862,
"beta_dpo/gap_mean": 2.8626952171325684,
"beta_dpo/gap_std": 6.557906150817871,
"beta_dpo/mask_keep_frac": 0.862500011920929,
"epoch": 0.4090909090909091,
"grad_norm": 16.79780387878418,
"learning_rate": 3.7039206905237656e-07,
"logits/chosen": -0.556363582611084,
"logits/rejected": -0.5632845163345337,
"loss": 0.5811,
"step": 135
},
{
"beta_dpo/beta_used": 0.1162651777267456,
"beta_dpo/beta_used_raw": 0.1162651777267456,
"beta_dpo/gap_mean": 3.088381290435791,
"beta_dpo/gap_std": 6.59566593170166,
"beta_dpo/mask_keep_frac": 0.7875000238418579,
"epoch": 0.42424242424242425,
"grad_norm": 14.226531982421875,
"learning_rate": 3.586410864126781e-07,
"logits/chosen": -0.5420447587966919,
"logits/rejected": -0.5301133990287781,
"loss": 0.5488,
"step": 140
},
{
"beta_dpo/beta_used": 0.11434066295623779,
"beta_dpo/beta_used_raw": 0.11434066295623779,
"beta_dpo/gap_mean": 3.461772918701172,
"beta_dpo/gap_std": 6.666165828704834,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.4393939393939394,
"grad_norm": 13.191394805908203,
"learning_rate": 3.465862814232821e-07,
"logits/chosen": -0.49957942962646484,
"logits/rejected": -0.4835745394229889,
"loss": 0.5499,
"step": 145
},
{
"beta_dpo/beta_used": 0.12056032568216324,
"beta_dpo/beta_used_raw": 0.12056032568216324,
"beta_dpo/gap_mean": 3.900587797164917,
"beta_dpo/gap_std": 6.922667026519775,
"beta_dpo/mask_keep_frac": 0.7749999761581421,
"epoch": 0.45454545454545453,
"grad_norm": 10.217402458190918,
"learning_rate": 3.3426136618426043e-07,
"logits/chosen": -0.5163663625717163,
"logits/rejected": -0.4923931062221527,
"loss": 0.5155,
"step": 150
},
{
"beta_dpo/beta_used": 0.08996663987636566,
"beta_dpo/beta_used_raw": 0.08996663987636566,
"beta_dpo/gap_mean": 4.022343635559082,
"beta_dpo/gap_std": 7.262037754058838,
"beta_dpo/mask_keep_frac": 0.762499988079071,
"epoch": 0.4696969696969697,
"grad_norm": 6.328583240509033,
"learning_rate": 3.2170080817777257e-07,
"logits/chosen": -0.47460970282554626,
"logits/rejected": -0.4646075665950775,
"loss": 0.5723,
"step": 155
},
{
"beta_dpo/beta_used": 0.09257197380065918,
"beta_dpo/beta_used_raw": 0.09257197380065918,
"beta_dpo/gap_mean": 4.135162353515625,
"beta_dpo/gap_std": 7.709047794342041,
"beta_dpo/mask_keep_frac": 0.8374999761581421,
"epoch": 0.48484848484848486,
"grad_norm": 2.340575933456421,
"learning_rate": 3.0893973387735683e-07,
"logits/chosen": -0.549339234828949,
"logits/rejected": -0.5254893898963928,
"loss": 0.5706,
"step": 160
},
{
"beta_dpo/beta_used": 0.1215561255812645,
"beta_dpo/beta_used_raw": 0.1215561255812645,
"beta_dpo/gap_mean": 4.385509490966797,
"beta_dpo/gap_std": 8.18330192565918,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.5,
"grad_norm": 27.537439346313477,
"learning_rate": 2.9601383051430505e-07,
"logits/chosen": -0.4928368926048279,
"logits/rejected": -0.46984148025512695,
"loss": 0.5273,
"step": 165
},
{
"beta_dpo/beta_used": 0.08485610783100128,
"beta_dpo/beta_used_raw": 0.08485610783100128,
"beta_dpo/gap_mean": 4.619694709777832,
"beta_dpo/gap_std": 8.622313499450684,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.5151515151515151,
"grad_norm": 10.716350555419922,
"learning_rate": 2.8295924627584004e-07,
"logits/chosen": -0.47423356771469116,
"logits/rejected": -0.43696826696395874,
"loss": 0.5656,
"step": 170
},
{
"beta_dpo/beta_used": 0.10904519259929657,
"beta_dpo/beta_used_raw": 0.10904519259929657,
"beta_dpo/gap_mean": 4.983495712280273,
"beta_dpo/gap_std": 9.088811874389648,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.5303030303030303,
"grad_norm": 16.4443416595459,
"learning_rate": 2.698124892141971e-07,
"logits/chosen": -0.4739559590816498,
"logits/rejected": -0.452726274728775,
"loss": 0.5275,
"step": 175
},
{
"beta_dpo/beta_used": 0.09932375699281693,
"beta_dpo/beta_used_raw": 0.09932375699281693,
"beta_dpo/gap_mean": 5.506978511810303,
"beta_dpo/gap_std": 9.59619426727295,
"beta_dpo/mask_keep_frac": 0.887499988079071,
"epoch": 0.5454545454545454,
"grad_norm": 6.31719446182251,
"learning_rate": 2.5661032514931834e-07,
"logits/chosen": -0.5071254968643188,
"logits/rejected": -0.4881424307823181,
"loss": 0.5367,
"step": 180
},
{
"beta_dpo/beta_used": 0.08257903903722763,
"beta_dpo/beta_used_raw": 0.08257903903722763,
"beta_dpo/gap_mean": 5.807556629180908,
"beta_dpo/gap_std": 10.00381088256836,
"beta_dpo/mask_keep_frac": 0.9125000238418579,
"epoch": 0.5606060606060606,
"grad_norm": 16.983186721801758,
"learning_rate": 2.4338967485068164e-07,
"logits/chosen": -0.44962626695632935,
"logits/rejected": -0.4310552179813385,
"loss": 0.5442,
"step": 185
},
{
"beta_dpo/beta_used": 0.1385645568370819,
"beta_dpo/beta_used_raw": 0.1385645568370819,
"beta_dpo/gap_mean": 5.958134651184082,
"beta_dpo/gap_std": 10.562962532043457,
"beta_dpo/mask_keep_frac": 0.7749999761581421,
"epoch": 0.5757575757575758,
"grad_norm": 31.49508285522461,
"learning_rate": 2.3018751078580283e-07,
"logits/chosen": -0.4748384356498718,
"logits/rejected": -0.4529237151145935,
"loss": 0.4962,
"step": 190
},
{
"beta_dpo/beta_used": 0.10011277347803116,
"beta_dpo/beta_used_raw": 0.09850181639194489,
"beta_dpo/gap_mean": 6.100876808166504,
"beta_dpo/gap_std": 11.020359992980957,
"beta_dpo/mask_keep_frac": 0.862500011920929,
"epoch": 0.5909090909090909,
"grad_norm": 17.15842056274414,
"learning_rate": 2.170407537241599e-07,
"logits/chosen": -0.4534582495689392,
"logits/rejected": -0.42914143204689026,
"loss": 0.5502,
"step": 195
},
{
"beta_dpo/beta_used": 0.1180671900510788,
"beta_dpo/beta_used_raw": 0.1180671900510788,
"beta_dpo/gap_mean": 6.612210273742676,
"beta_dpo/gap_std": 11.322927474975586,
"beta_dpo/mask_keep_frac": 0.7875000238418579,
"epoch": 0.6060606060606061,
"grad_norm": 13.65029239654541,
"learning_rate": 2.0398616948569493e-07,
"logits/chosen": -0.4936196208000183,
"logits/rejected": -0.4612639546394348,
"loss": 0.498,
"step": 200
},
{
"epoch": 0.6060606060606061,
"eval_beta_dpo/beta_used": 0.10561517626047134,
"eval_beta_dpo/beta_used_raw": 0.10561517626047134,
"eval_beta_dpo/gap_mean": 6.780107498168945,
"eval_beta_dpo/gap_std": 11.72070598602295,
"eval_beta_dpo/mask_keep_frac": 1.0,
"eval_logits/chosen": -0.4722588062286377,
"eval_logits/rejected": -0.45819586515426636,
"eval_loss": 0.5506138801574707,
"eval_runtime": 18.8213,
"eval_samples_per_second": 122.361,
"eval_steps_per_second": 0.956,
"step": 200
},
{
"beta_dpo/beta_used": 0.08741272985935211,
"beta_dpo/beta_used_raw": 0.08735300600528717,
"beta_dpo/gap_mean": 7.251504421234131,
"beta_dpo/gap_std": 11.868724822998047,
"beta_dpo/mask_keep_frac": 0.762499988079071,
"epoch": 0.6212121212121212,
"grad_norm": 0.15343494713306427,
"learning_rate": 1.9106026612264315e-07,
"logits/chosen": -0.4946843981742859,
"logits/rejected": -0.46265077590942383,
"loss": 0.5233,
"step": 205
},
{
"beta_dpo/beta_used": 0.08492619544267654,
"beta_dpo/beta_used_raw": 0.08492619544267654,
"beta_dpo/gap_mean": 7.168964385986328,
"beta_dpo/gap_std": 11.9141845703125,
"beta_dpo/mask_keep_frac": 0.800000011920929,
"epoch": 0.6363636363636364,
"grad_norm": 38.745361328125,
"learning_rate": 1.782991918222275e-07,
"logits/chosen": -0.42799100279808044,
"logits/rejected": -0.4196823239326477,
"loss": 0.5237,
"step": 210
},
{
"beta_dpo/beta_used": 0.08925200998783112,
"beta_dpo/beta_used_raw": 0.08484373241662979,
"beta_dpo/gap_mean": 7.09285831451416,
"beta_dpo/gap_std": 12.202669143676758,
"beta_dpo/mask_keep_frac": 0.862500011920929,
"epoch": 0.6515151515151515,
"grad_norm": 39.51192092895508,
"learning_rate": 1.6573863381573954e-07,
"logits/chosen": -0.43246760964393616,
"logits/rejected": -0.4298061430454254,
"loss": 0.5466,
"step": 215
},
{
"beta_dpo/beta_used": 0.1373816877603531,
"beta_dpo/beta_used_raw": 0.1373816877603531,
"beta_dpo/gap_mean": 7.408307075500488,
"beta_dpo/gap_std": 12.6698579788208,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.6666666666666666,
"grad_norm": 66.92206573486328,
"learning_rate": 1.534137185767178e-07,
"logits/chosen": -0.5049004554748535,
"logits/rejected": -0.4828864634037018,
"loss": 0.4731,
"step": 220
},
{
"beta_dpo/beta_used": 0.11999156326055527,
"beta_dpo/beta_used_raw": 0.11999156326055527,
"beta_dpo/gap_mean": 7.8069658279418945,
"beta_dpo/gap_std": 12.916173934936523,
"beta_dpo/mask_keep_frac": 0.7124999761581421,
"epoch": 0.6818181818181818,
"grad_norm": 5.55664587020874,
"learning_rate": 1.4135891358732205e-07,
"logits/chosen": -0.4607675075531006,
"logits/rejected": -0.429083913564682,
"loss": 0.4933,
"step": 225
},
{
"beta_dpo/beta_used": 0.11390962451696396,
"beta_dpo/beta_used_raw": 0.11390962451696396,
"beta_dpo/gap_mean": 7.83342981338501,
"beta_dpo/gap_std": 12.932693481445312,
"beta_dpo/mask_keep_frac": 0.7875000238418579,
"epoch": 0.696969696969697,
"grad_norm": 32.68361282348633,
"learning_rate": 1.2960793094762345e-07,
"logits/chosen": -0.41661542654037476,
"logits/rejected": -0.4079780578613281,
"loss": 0.4954,
"step": 230
},
{
"beta_dpo/beta_used": 0.09100167453289032,
"beta_dpo/beta_used_raw": 0.09100167453289032,
"beta_dpo/gap_mean": 8.167860984802246,
"beta_dpo/gap_std": 12.970059394836426,
"beta_dpo/mask_keep_frac": 0.862500011920929,
"epoch": 0.7121212121212122,
"grad_norm": 1.9182671308517456,
"learning_rate": 1.1819363309737438e-07,
"logits/chosen": -0.4386097490787506,
"logits/rejected": -0.42474693059921265,
"loss": 0.5136,
"step": 235
},
{
"beta_dpo/beta_used": 0.11001662909984589,
"beta_dpo/beta_used_raw": 0.11001662909984589,
"beta_dpo/gap_mean": 8.317561149597168,
"beta_dpo/gap_std": 13.424278259277344,
"beta_dpo/mask_keep_frac": 0.800000011920929,
"epoch": 0.7272727272727273,
"grad_norm": 17.994626998901367,
"learning_rate": 1.0714794091391072e-07,
"logits/chosen": -0.4545617997646332,
"logits/rejected": -0.4394044280052185,
"loss": 0.4769,
"step": 240
},
{
"beta_dpo/beta_used": 0.07068195939064026,
"beta_dpo/beta_used_raw": 0.07068195939064026,
"beta_dpo/gap_mean": 8.271533966064453,
"beta_dpo/gap_std": 13.785310745239258,
"beta_dpo/mask_keep_frac": 0.824999988079071,
"epoch": 0.7424242424242424,
"grad_norm": 9.725923538208008,
"learning_rate": 9.650174444319956e-08,
"logits/chosen": -0.45390695333480835,
"logits/rejected": -0.43619924783706665,
"loss": 0.5268,
"step": 245
},
{
"beta_dpo/beta_used": 0.08607280999422073,
"beta_dpo/beta_used_raw": 0.08015486598014832,
"beta_dpo/gap_mean": 8.123547554016113,
"beta_dpo/gap_std": 14.15746021270752,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7575757575757576,
"grad_norm": 19.712242126464844,
"learning_rate": 8.628481651367875e-08,
"logits/chosen": -0.4595223069190979,
"logits/rejected": -0.4408304691314697,
"loss": 0.5287,
"step": 250
},
{
"beta_dpo/beta_used": 0.0958368107676506,
"beta_dpo/beta_used_raw": 0.08722580969333649,
"beta_dpo/gap_mean": 8.267644882202148,
"beta_dpo/gap_std": 14.14880657196045,
"beta_dpo/mask_keep_frac": 0.8999999761581421,
"epoch": 0.7727272727272727,
"grad_norm": 61.9700927734375,
"learning_rate": 7.652572947447272e-08,
"logits/chosen": -0.44903382658958435,
"logits/rejected": -0.4424815773963928,
"loss": 0.5257,
"step": 255
},
{
"beta_dpo/beta_used": 0.07386674731969833,
"beta_dpo/beta_used_raw": 0.06767500936985016,
"beta_dpo/gap_mean": 8.649662017822266,
"beta_dpo/gap_std": 14.375146865844727,
"beta_dpo/mask_keep_frac": 0.7875000238418579,
"epoch": 0.7878787878787878,
"grad_norm": 20.901798248291016,
"learning_rate": 6.725177529083209e-08,
"logits/chosen": -0.46160441637039185,
"logits/rejected": -0.44480133056640625,
"loss": 0.5284,
"step": 260
},
{
"beta_dpo/beta_used": 0.08889990299940109,
"beta_dpo/beta_used_raw": 0.05368128418922424,
"beta_dpo/gap_mean": 8.253731727600098,
"beta_dpo/gap_std": 14.49620532989502,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.803030303030303,
"grad_norm": 36.13115692138672,
"learning_rate": 5.848888922025552e-08,
"logits/chosen": -0.4071124196052551,
"logits/rejected": -0.38313764333724976,
"loss": 0.5524,
"step": 265
},
{
"beta_dpo/beta_used": 0.05102431774139404,
"beta_dpo/beta_used_raw": 0.05102431774139404,
"beta_dpo/gap_mean": 8.481303215026855,
"beta_dpo/gap_std": 14.435537338256836,
"beta_dpo/mask_keep_frac": 0.7875000238418579,
"epoch": 0.8181818181818182,
"grad_norm": 4.406769275665283,
"learning_rate": 5.026157728273966e-08,
"logits/chosen": -0.43619123101234436,
"logits/rejected": -0.40814194083213806,
"loss": 0.5676,
"step": 270
},
{
"beta_dpo/beta_used": 0.08905264735221863,
"beta_dpo/beta_used_raw": 0.08905264735221863,
"beta_dpo/gap_mean": 8.75959587097168,
"beta_dpo/gap_std": 14.441301345825195,
"beta_dpo/mask_keep_frac": 0.7875000238418579,
"epoch": 0.8333333333333334,
"grad_norm": 13.085917472839355,
"learning_rate": 4.259284772799099e-08,
"logits/chosen": -0.43446803092956543,
"logits/rejected": -0.4283529818058014,
"loss": 0.5225,
"step": 275
},
{
"beta_dpo/beta_used": 0.1104244738817215,
"beta_dpo/beta_used_raw": 0.1104244738817215,
"beta_dpo/gap_mean": 8.6881103515625,
"beta_dpo/gap_std": 14.51659870147705,
"beta_dpo/mask_keep_frac": 0.7875000238418579,
"epoch": 0.8484848484848485,
"grad_norm": 47.124366760253906,
"learning_rate": 3.550414669125573e-08,
"logits/chosen": -0.4580152630805969,
"logits/rejected": -0.4392933249473572,
"loss": 0.4767,
"step": 280
},
{
"beta_dpo/beta_used": 0.14569848775863647,
"beta_dpo/beta_used_raw": 0.14569848775863647,
"beta_dpo/gap_mean": 9.179306030273438,
"beta_dpo/gap_std": 14.847735404968262,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.8636363636363636,
"grad_norm": 43.69351577758789,
"learning_rate": 2.9015298217712453e-08,
"logits/chosen": -0.42454952001571655,
"logits/rejected": -0.3965614438056946,
"loss": 0.4529,
"step": 285
},
{
"beta_dpo/beta_used": 0.06421518325805664,
"beta_dpo/beta_used_raw": 0.056242913007736206,
"beta_dpo/gap_mean": 9.178163528442383,
"beta_dpo/gap_std": 14.94957160949707,
"beta_dpo/mask_keep_frac": 0.7749999761581421,
"epoch": 0.8787878787878788,
"grad_norm": 19.567977905273438,
"learning_rate": 2.3144448823151392e-08,
"logits/chosen": -0.4124082624912262,
"logits/rejected": -0.38752835988998413,
"loss": 0.5666,
"step": 290
},
{
"beta_dpo/beta_used": 0.11043484508991241,
"beta_dpo/beta_used_raw": 0.11043484508991241,
"beta_dpo/gap_mean": 9.004778861999512,
"beta_dpo/gap_std": 15.063299179077148,
"beta_dpo/mask_keep_frac": 0.737500011920929,
"epoch": 0.8939393939393939,
"grad_norm": 45.88330841064453,
"learning_rate": 1.7908016745981856e-08,
"logits/chosen": -0.41249990463256836,
"logits/rejected": -0.41048282384872437,
"loss": 0.4783,
"step": 295
},
{
"beta_dpo/beta_used": 0.06652533262968063,
"beta_dpo/beta_used_raw": 0.05020095035433769,
"beta_dpo/gap_mean": 9.056544303894043,
"beta_dpo/gap_std": 15.056539535522461,
"beta_dpo/mask_keep_frac": 0.762499988079071,
"epoch": 0.9090909090909091,
"grad_norm": 0.25523823499679565,
"learning_rate": 1.3320646032487393e-08,
"logits/chosen": -0.4351003170013428,
"logits/rejected": -0.42235302925109863,
"loss": 0.5615,
"step": 300
},
{
"epoch": 0.9090909090909091,
"eval_beta_dpo/beta_used": 0.10696752369403839,
"eval_beta_dpo/beta_used_raw": 0.10696752369403839,
"eval_beta_dpo/gap_mean": 8.805192947387695,
"eval_beta_dpo/gap_std": 15.178271293640137,
"eval_beta_dpo/mask_keep_frac": 1.0,
"eval_logits/chosen": -0.4217662513256073,
"eval_logits/rejected": -0.4089266359806061,
"eval_loss": 0.5633069276809692,
"eval_runtime": 18.8692,
"eval_samples_per_second": 122.051,
"eval_steps_per_second": 0.954,
"step": 300
},
{
"beta_dpo/beta_used": 0.0679563358426094,
"beta_dpo/beta_used_raw": 0.06361763179302216,
"beta_dpo/gap_mean": 9.039968490600586,
"beta_dpo/gap_std": 15.006390571594238,
"beta_dpo/mask_keep_frac": 0.7749999761581421,
"epoch": 0.9242424242424242,
"grad_norm": 26.64524269104004,
"learning_rate": 9.395165583732379e-09,
"logits/chosen": -0.40837812423706055,
"logits/rejected": -0.3757531940937042,
"loss": 0.5354,
"step": 305
},
{
"beta_dpo/beta_used": 0.09475517272949219,
"beta_dpo/beta_used_raw": 0.09475517272949219,
"beta_dpo/gap_mean": 9.129568099975586,
"beta_dpo/gap_std": 14.912490844726562,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.9393939393939394,
"grad_norm": 17.02347755432129,
"learning_rate": 6.142553278648238e-09,
"logits/chosen": -0.4192012846469879,
"logits/rejected": -0.4020632803440094,
"loss": 0.4862,
"step": 310
},
{
"beta_dpo/beta_used": 0.09896779805421829,
"beta_dpo/beta_used_raw": 0.09896779805421829,
"beta_dpo/gap_mean": 9.311323165893555,
"beta_dpo/gap_std": 14.838136672973633,
"beta_dpo/mask_keep_frac": 0.7749999761581421,
"epoch": 0.9545454545454546,
"grad_norm": 13.178363800048828,
"learning_rate": 3.5719052736323806e-09,
"logits/chosen": -0.41689127683639526,
"logits/rejected": -0.41213899850845337,
"loss": 0.5065,
"step": 315
},
{
"beta_dpo/beta_used": 0.05972599983215332,
"beta_dpo/beta_used_raw": 0.048868484795093536,
"beta_dpo/gap_mean": 9.482072830200195,
"beta_dpo/gap_std": 15.056081771850586,
"beta_dpo/mask_keep_frac": 0.8999999761581421,
"epoch": 0.9696969696969697,
"grad_norm": 16.041927337646484,
"learning_rate": 1.690410564514244e-09,
"logits/chosen": -0.42210960388183594,
"logits/rejected": -0.38882067799568176,
"loss": 0.5702,
"step": 320
},
{
"beta_dpo/beta_used": 0.12381196022033691,
"beta_dpo/beta_used_raw": 0.12381196022033691,
"beta_dpo/gap_mean": 9.218812942504883,
"beta_dpo/gap_std": 15.04699993133545,
"beta_dpo/mask_keep_frac": 0.887499988079071,
"epoch": 0.9848484848484849,
"grad_norm": 30.680978775024414,
"learning_rate": 5.033308820289184e-10,
"logits/chosen": -0.4276047348976135,
"logits/rejected": -0.4020787179470062,
"loss": 0.4571,
"step": 325
},
{
"beta_dpo/beta_used": 0.08325864374637604,
"beta_dpo/beta_used_raw": 0.07991620153188705,
"beta_dpo/gap_mean": 9.292040824890137,
"beta_dpo/gap_std": 15.013906478881836,
"beta_dpo/mask_keep_frac": 0.862500011920929,
"epoch": 1.0,
"grad_norm": 12.934744834899902,
"learning_rate": 1.3985977021235829e-11,
"logits/chosen": -0.45221251249313354,
"logits/rejected": -0.42801961302757263,
"loss": 0.5248,
"step": 330
},
{
"epoch": 1.0,
"step": 330,
"total_flos": 0.0,
"train_loss": 0.5772968926213005,
"train_runtime": 1407.4268,
"train_samples_per_second": 30.08,
"train_steps_per_second": 0.234
}
],
"logging_steps": 5,
"max_steps": 330,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}