{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984301412872841, "eval_steps": 500, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020931449502878076, "grad_norm": 4.5917708857834985, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -0.8526347279548645, "logits/rejected": -0.7768423557281494, "logps/chosen": -363.13519287109375, "logps/rejected": -364.9631042480469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.020931449502878074, "grad_norm": 4.404674449441554, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -0.7482305765151978, "logits/rejected": -0.7081854343414307, "logps/chosen": -311.2024841308594, "logps/rejected": -284.1365966796875, "loss": 0.6931, "rewards/accuracies": 0.4375, "rewards/chosen": 2.1014602680224925e-05, "rewards/margins": 8.458160300506279e-05, "rewards/rejected": -6.356705853249878e-05, "step": 10 }, { "epoch": 0.04186289900575615, "grad_norm": 4.2921416180433765, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -0.7403894066810608, "logits/rejected": -0.6793709993362427, "logps/chosen": -324.6893005371094, "logps/rejected": -290.2327575683594, "loss": 0.693, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0014148516347631812, "rewards/margins": 0.0002438486844766885, "rewards/rejected": 0.0011710028629750013, "step": 20 }, { "epoch": 0.06279434850863422, "grad_norm": 4.218970991450984, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -0.7561457753181458, "logits/rejected": -0.7098526954650879, "logps/chosen": -295.118408203125, "logps/rejected": -255.83407592773438, "loss": 0.6917, "rewards/accuracies": 0.53125, "rewards/chosen": 0.008535891771316528, "rewards/margins": 0.0028298485558480024, "rewards/rejected": 0.005706042982637882, "step": 30 }, { "epoch": 0.0837257980115123, "grad_norm": 3.614368719783126, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -0.7299980521202087, "logits/rejected": -0.6768942475318909, "logps/chosen": -267.0862121582031, "logps/rejected": -267.53863525390625, "loss": 0.6885, "rewards/accuracies": 0.625, "rewards/chosen": 0.020161841064691544, "rewards/margins": 0.008152564987540245, "rewards/rejected": 0.012009273283183575, "step": 40 }, { "epoch": 0.10465724751439037, "grad_norm": 3.6040620075273546, "learning_rate": 4.999731868769026e-07, "logits/chosen": -0.7151128053665161, "logits/rejected": -0.6647322177886963, "logps/chosen": -296.5942077636719, "logps/rejected": -277.5081787109375, "loss": 0.6833, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0398605577647686, "rewards/margins": 0.02836265228688717, "rewards/rejected": 0.011497899889945984, "step": 50 }, { "epoch": 0.12558869701726844, "grad_norm": 3.5922190973220163, "learning_rate": 4.990353313429303e-07, "logits/chosen": -0.7289865016937256, "logits/rejected": -0.6785635352134705, "logps/chosen": -262.1878967285156, "logps/rejected": -253.5371856689453, "loss": 0.6783, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.0495939627289772, "rewards/margins": 0.042832277715206146, "rewards/rejected": 0.0067616915330290794, "step": 60 }, { "epoch": 0.14652014652014653, "grad_norm": 3.6371057840364927, "learning_rate": 4.967625656594781e-07, "logits/chosen": -0.6846636533737183, "logits/rejected": -0.6486319303512573, "logps/chosen": -304.8815002441406, "logps/rejected": -293.3005065917969, "loss": 0.6683, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.014333389699459076, "rewards/margins": 0.050421230494976044, "rewards/rejected": -0.036087844520807266, "step": 70 }, { "epoch": 0.1674515960230246, "grad_norm": 4.641698206330642, "learning_rate": 4.93167072587771e-07, "logits/chosen": -0.7803142070770264, "logits/rejected": -0.6576212048530579, "logps/chosen": -338.9702453613281, "logps/rejected": -270.46124267578125, "loss": 0.6684, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.041751302778720856, "rewards/margins": 0.06293781846761703, "rewards/rejected": -0.10468912124633789, "step": 80 }, { "epoch": 0.18838304552590268, "grad_norm": 5.809449166665223, "learning_rate": 4.882681251368548e-07, "logits/chosen": -0.730857253074646, "logits/rejected": -0.6790161728858948, "logps/chosen": -270.4856872558594, "logps/rejected": -280.201171875, "loss": 0.6533, "rewards/accuracies": 0.65625, "rewards/chosen": -0.05735975503921509, "rewards/margins": 0.0867304801940918, "rewards/rejected": -0.14409023523330688, "step": 90 }, { "epoch": 0.20931449502878074, "grad_norm": 6.629060471777615, "learning_rate": 4.820919832540181e-07, "logits/chosen": -0.819484531879425, "logits/rejected": -0.7444473505020142, "logps/chosen": -320.7297058105469, "logps/rejected": -315.49786376953125, "loss": 0.6452, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.106062151491642, "rewards/margins": 0.15797743201255798, "rewards/rejected": -0.2640395760536194, "step": 100 }, { "epoch": 0.2302459445316588, "grad_norm": 8.885427777088127, "learning_rate": 4.7467175306295647e-07, "logits/chosen": -0.7485495805740356, "logits/rejected": -0.6900595426559448, "logps/chosen": -313.8240966796875, "logps/rejected": -310.7196960449219, "loss": 0.6442, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1286977082490921, "rewards/margins": 0.1550484299659729, "rewards/rejected": -0.2837461233139038, "step": 110 }, { "epoch": 0.25117739403453687, "grad_norm": 10.779857080720818, "learning_rate": 4.6604720940421207e-07, "logits/chosen": -0.6856316328048706, "logits/rejected": -0.6849483251571655, "logps/chosen": -303.8964538574219, "logps/rejected": -321.4309387207031, "loss": 0.6199, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2916944622993469, "rewards/margins": 0.21943287551403046, "rewards/rejected": -0.5111273527145386, "step": 120 }, { "epoch": 0.272108843537415, "grad_norm": 11.982309184139016, "learning_rate": 4.5626458262912735e-07, "logits/chosen": -0.6801525950431824, "logits/rejected": -0.6393054723739624, "logps/chosen": -319.20672607421875, "logps/rejected": -333.1614685058594, "loss": 0.609, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.27636662125587463, "rewards/margins": 0.2492021769285202, "rewards/rejected": -0.5255688428878784, "step": 130 }, { "epoch": 0.29304029304029305, "grad_norm": 19.151178395769573, "learning_rate": 4.453763107901675e-07, "logits/chosen": -0.7184507846832275, "logits/rejected": -0.6374621987342834, "logps/chosen": -356.2397766113281, "logps/rejected": -337.32354736328125, "loss": 0.6109, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3082619607448578, "rewards/margins": 0.31162941455841064, "rewards/rejected": -0.6198914051055908, "step": 140 }, { "epoch": 0.3139717425431711, "grad_norm": 13.824909994267095, "learning_rate": 4.3344075855595097e-07, "logits/chosen": -0.6901696920394897, "logits/rejected": -0.6279430389404297, "logps/chosen": -353.95184326171875, "logps/rejected": -346.9781494140625, "loss": 0.6132, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6322077512741089, "rewards/margins": 0.2923499643802643, "rewards/rejected": -0.9245575666427612, "step": 150 }, { "epoch": 0.3349031920460492, "grad_norm": 13.009971235384292, "learning_rate": 4.2052190435769554e-07, "logits/chosen": -0.7091597318649292, "logits/rejected": -0.6455188989639282, "logps/chosen": -340.92657470703125, "logps/rejected": -347.0225830078125, "loss": 0.603, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5523598790168762, "rewards/margins": 0.33642885088920593, "rewards/rejected": -0.8887887001037598, "step": 160 }, { "epoch": 0.35583464154892724, "grad_norm": 13.829339790258013, "learning_rate": 4.0668899744407567e-07, "logits/chosen": -0.6223039627075195, "logits/rejected": -0.5784906148910522, "logps/chosen": -351.1839904785156, "logps/rejected": -358.60479736328125, "loss": 0.5953, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8050142526626587, "rewards/margins": 0.3386740982532501, "rewards/rejected": -1.143688440322876, "step": 170 }, { "epoch": 0.37676609105180536, "grad_norm": 20.849086588528724, "learning_rate": 3.920161866827889e-07, "logits/chosen": -0.6424199342727661, "logits/rejected": -0.5930343270301819, "logps/chosen": -358.6197204589844, "logps/rejected": -367.137451171875, "loss": 0.5849, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8172851800918579, "rewards/margins": 0.3052961528301239, "rewards/rejected": -1.1225812435150146, "step": 180 }, { "epoch": 0.3976975405546834, "grad_norm": 17.753231429350524, "learning_rate": 3.765821230985757e-07, "logits/chosen": -0.6292937994003296, "logits/rejected": -0.615179717540741, "logps/chosen": -343.19952392578125, "logps/rejected": -375.33929443359375, "loss": 0.5817, "rewards/accuracies": 0.6875, "rewards/chosen": -0.68468177318573, "rewards/margins": 0.34494417905807495, "rewards/rejected": -1.0296258926391602, "step": 190 }, { "epoch": 0.4186289900575615, "grad_norm": 28.151101969379706, "learning_rate": 3.604695382782159e-07, "logits/chosen": -0.5903419256210327, "logits/rejected": -0.5930633544921875, "logps/chosen": -360.65179443359375, "logps/rejected": -412.225830078125, "loss": 0.5821, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0000778436660767, "rewards/margins": 0.33244088292121887, "rewards/rejected": -1.3325188159942627, "step": 200 }, { "epoch": 0.43956043956043955, "grad_norm": 17.118814416105273, "learning_rate": 3.4376480090239047e-07, "logits/chosen": -0.6688283085823059, "logits/rejected": -0.5644041895866394, "logps/chosen": -433.589599609375, "logps/rejected": -432.74993896484375, "loss": 0.5853, "rewards/accuracies": 0.75, "rewards/chosen": -1.17396080493927, "rewards/margins": 0.4916624426841736, "rewards/rejected": -1.6656233072280884, "step": 210 }, { "epoch": 0.4604918890633176, "grad_norm": 23.06196255102623, "learning_rate": 3.265574537815398e-07, "logits/chosen": -0.5818850994110107, "logits/rejected": -0.5700303316116333, "logps/chosen": -351.98638916015625, "logps/rejected": -410.9193420410156, "loss": 0.5704, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0643993616104126, "rewards/margins": 0.5418257713317871, "rewards/rejected": -1.6062252521514893, "step": 220 }, { "epoch": 0.48142333856619574, "grad_norm": 21.04733996523729, "learning_rate": 3.0893973387735683e-07, "logits/chosen": -0.6803761720657349, "logits/rejected": -0.6139528751373291, "logps/chosen": -355.9638671875, "logps/rejected": -392.5525207519531, "loss": 0.5753, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.8678590059280396, "rewards/margins": 0.6038464307785034, "rewards/rejected": -1.4717055559158325, "step": 230 }, { "epoch": 0.5023547880690737, "grad_norm": 18.458075241435953, "learning_rate": 2.910060778827554e-07, "logits/chosen": -0.6669884920120239, "logits/rejected": -0.5953234434127808, "logps/chosen": -363.3609313964844, "logps/rejected": -395.0166320800781, "loss": 0.5445, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6804043054580688, "rewards/margins": 0.501660168170929, "rewards/rejected": -1.1820645332336426, "step": 240 }, { "epoch": 0.5232862375719518, "grad_norm": 24.001388696637466, "learning_rate": 2.7285261601056697e-07, "logits/chosen": -0.6821622252464294, "logits/rejected": -0.5736308693885803, "logps/chosen": -392.59375, "logps/rejected": -420.7662658691406, "loss": 0.5592, "rewards/accuracies": 0.75, "rewards/chosen": -1.037414789199829, "rewards/margins": 0.6777431964874268, "rewards/rejected": -1.7151581048965454, "step": 250 }, { "epoch": 0.54421768707483, "grad_norm": 20.80336944317341, "learning_rate": 2.5457665670441937e-07, "logits/chosen": -0.6666806936264038, "logits/rejected": -0.6545027494430542, "logps/chosen": -379.15423583984375, "logps/rejected": -414.10302734375, "loss": 0.549, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1116763353347778, "rewards/margins": 0.48681873083114624, "rewards/rejected": -1.5984950065612793, "step": 260 }, { "epoch": 0.565149136577708, "grad_norm": 18.999186941850777, "learning_rate": 2.3627616503391812e-07, "logits/chosen": -0.6578361988067627, "logits/rejected": -0.6166576147079468, "logps/chosen": -415.3619079589844, "logps/rejected": -453.8072814941406, "loss": 0.5596, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0673983097076416, "rewards/margins": 0.6960801482200623, "rewards/rejected": -1.7634785175323486, "step": 270 }, { "epoch": 0.5860805860805861, "grad_norm": 25.011257430836675, "learning_rate": 2.1804923757009882e-07, "logits/chosen": -0.5656932592391968, "logits/rejected": -0.5232654809951782, "logps/chosen": -409.4795837402344, "logps/rejected": -441.3401794433594, "loss": 0.5636, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4023996591567993, "rewards/margins": 0.5914163589477539, "rewards/rejected": -1.9938161373138428, "step": 280 }, { "epoch": 0.6070120355834642, "grad_norm": 16.649025738470474, "learning_rate": 1.9999357655598891e-07, "logits/chosen": -0.617931067943573, "logits/rejected": -0.5761314034461975, "logps/chosen": -406.4532775878906, "logps/rejected": -464.67822265625, "loss": 0.5591, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4842880964279175, "rewards/margins": 0.5774600505828857, "rewards/rejected": -2.0617482662200928, "step": 290 }, { "epoch": 0.6279434850863422, "grad_norm": 16.730004932577106, "learning_rate": 1.8220596619089573e-07, "logits/chosen": -0.6562352180480957, "logits/rejected": -0.5790780186653137, "logps/chosen": -451.573486328125, "logps/rejected": -457.2960510253906, "loss": 0.5394, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1794800758361816, "rewards/margins": 0.5321540832519531, "rewards/rejected": -1.7116340398788452, "step": 300 }, { "epoch": 0.6488749345892203, "grad_norm": 20.824128290150536, "learning_rate": 1.647817538357072e-07, "logits/chosen": -0.6351410150527954, "logits/rejected": -0.5707007050514221, "logps/chosen": -443.58978271484375, "logps/rejected": -464.4762268066406, "loss": 0.5379, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2887599468231201, "rewards/margins": 0.7534160614013672, "rewards/rejected": -2.042175769805908, "step": 310 }, { "epoch": 0.6698063840920984, "grad_norm": 21.443811248225554, "learning_rate": 1.478143389201113e-07, "logits/chosen": -0.652029275894165, "logits/rejected": -0.5766469240188599, "logps/chosen": -430.428466796875, "logps/rejected": -456.797607421875, "loss": 0.5387, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6468877792358398, "rewards/margins": 0.5889655351638794, "rewards/rejected": -2.235853433609009, "step": 320 }, { "epoch": 0.6907378335949764, "grad_norm": 18.054876727682743, "learning_rate": 1.3139467229135998e-07, "logits/chosen": -0.6749883890151978, "logits/rejected": -0.6647608876228333, "logps/chosen": -418.3038635253906, "logps/rejected": -472.7947692871094, "loss": 0.5367, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3353520631790161, "rewards/margins": 0.6326448917388916, "rewards/rejected": -1.9679968357086182, "step": 330 }, { "epoch": 0.7116692830978545, "grad_norm": 26.52212132673544, "learning_rate": 1.1561076868822755e-07, "logits/chosen": -0.6122914552688599, "logits/rejected": -0.5800005793571472, "logps/chosen": -433.57635498046875, "logps/rejected": -465.54669189453125, "loss": 0.5428, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.294762134552002, "rewards/margins": 0.587591290473938, "rewards/rejected": -1.8823535442352295, "step": 340 }, { "epoch": 0.7326007326007326, "grad_norm": 19.655279922879483, "learning_rate": 1.0054723495346482e-07, "logits/chosen": -0.6827625036239624, "logits/rejected": -0.6138468980789185, "logps/chosen": -397.7406921386719, "logps/rejected": -434.5318298339844, "loss": 0.5167, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2331862449645996, "rewards/margins": 0.6665691137313843, "rewards/rejected": -1.8997554779052734, "step": 350 }, { "epoch": 0.7535321821036107, "grad_norm": 19.235838308842027, "learning_rate": 8.628481651367875e-08, "logits/chosen": -0.6631180047988892, "logits/rejected": -0.5895651578903198, "logps/chosen": -461.24176025390625, "logps/rejected": -491.12176513671875, "loss": 0.5577, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4328138828277588, "rewards/margins": 0.6876929402351379, "rewards/rejected": -2.120506763458252, "step": 360 }, { "epoch": 0.7744636316064888, "grad_norm": 19.962323367834856, "learning_rate": 7.289996455765748e-08, "logits/chosen": -0.6887942552566528, "logits/rejected": -0.6230372190475464, "logps/chosen": -414.96270751953125, "logps/rejected": -449.39874267578125, "loss": 0.5411, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4094620943069458, "rewards/margins": 0.6524969935417175, "rewards/rejected": -2.0619590282440186, "step": 370 }, { "epoch": 0.7953950811093669, "grad_norm": 17.262620635801852, "learning_rate": 6.046442623320145e-08, "logits/chosen": -0.6020098924636841, "logits/rejected": -0.6000246405601501, "logps/chosen": -398.9418029785156, "logps/rejected": -501.1026916503906, "loss": 0.5387, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.440071940422058, "rewards/margins": 0.8659427762031555, "rewards/rejected": -2.3060147762298584, "step": 380 }, { "epoch": 0.8163265306122449, "grad_norm": 23.268294087473826, "learning_rate": 4.904486005914027e-08, "logits/chosen": -0.7086952328681946, "logits/rejected": -0.6402121782302856, "logps/chosen": -475.9898376464844, "logps/rejected": -513.5247802734375, "loss": 0.5207, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.3465341329574585, "rewards/margins": 0.7135976552963257, "rewards/rejected": -2.060131788253784, "step": 390 }, { "epoch": 0.837257980115123, "grad_norm": 20.66365114424933, "learning_rate": 3.8702478614051345e-08, "logits/chosen": -0.641961932182312, "logits/rejected": -0.5901409983634949, "logps/chosen": -395.93157958984375, "logps/rejected": -442.00244140625, "loss": 0.5353, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2355709075927734, "rewards/margins": 0.6957671642303467, "rewards/rejected": -1.9313379526138306, "step": 400 }, { "epoch": 0.858189429618001, "grad_norm": 18.94839439294029, "learning_rate": 2.9492720416985e-08, "logits/chosen": -0.7429651021957397, "logits/rejected": -0.6672912836074829, "logps/chosen": -440.3075256347656, "logps/rejected": -467.0520935058594, "loss": 0.5506, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2028748989105225, "rewards/margins": 0.7443949580192566, "rewards/rejected": -1.9472697973251343, "step": 410 }, { "epoch": 0.8791208791208791, "grad_norm": 19.442404647939284, "learning_rate": 2.1464952759020856e-08, "logits/chosen": -0.597920298576355, "logits/rejected": -0.586058497428894, "logps/chosen": -406.3937072753906, "logps/rejected": -479.6614685058594, "loss": 0.5384, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3360610008239746, "rewards/margins": 0.6579158902168274, "rewards/rejected": -1.9939768314361572, "step": 420 }, { "epoch": 0.9000523286237572, "grad_norm": 19.433771609383193, "learning_rate": 1.4662207078575684e-08, "logits/chosen": -0.6416221857070923, "logits/rejected": -0.5748856663703918, "logps/chosen": -432.43292236328125, "logps/rejected": -473.4640197753906, "loss": 0.5268, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.289609670639038, "rewards/margins": 0.6828420162200928, "rewards/rejected": -1.9724515676498413, "step": 430 }, { "epoch": 0.9209837781266352, "grad_norm": 25.563733256044383, "learning_rate": 9.12094829893642e-09, "logits/chosen": -0.7145225405693054, "logits/rejected": -0.6558529138565063, "logps/chosen": -401.7915344238281, "logps/rejected": -416.718994140625, "loss": 0.5331, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2713123559951782, "rewards/margins": 0.5642818212509155, "rewards/rejected": -1.8355941772460938, "step": 440 }, { "epoch": 0.9419152276295133, "grad_norm": 18.949969564615213, "learning_rate": 4.8708793644441086e-09, "logits/chosen": -0.5803197622299194, "logits/rejected": -0.5434113144874573, "logps/chosen": -410.92730712890625, "logps/rejected": -468.50323486328125, "loss": 0.534, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2884615659713745, "rewards/margins": 0.7078900337219238, "rewards/rejected": -1.9963515996932983, "step": 450 }, { "epoch": 0.9628466771323915, "grad_norm": 22.7412601308732, "learning_rate": 1.9347820230782295e-09, "logits/chosen": -0.6422279477119446, "logits/rejected": -0.5603567361831665, "logps/chosen": -407.45465087890625, "logps/rejected": -431.9442443847656, "loss": 0.5408, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2375915050506592, "rewards/margins": 0.686954140663147, "rewards/rejected": -1.9245456457138062, "step": 460 }, { "epoch": 0.9837781266352695, "grad_norm": 17.505664359316036, "learning_rate": 3.2839470889836627e-10, "logits/chosen": -0.6509039998054504, "logits/rejected": -0.6031205654144287, "logps/chosen": -423.6900939941406, "logps/rejected": -475.7867736816406, "loss": 0.5134, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1757431030273438, "rewards/margins": 0.6970219612121582, "rewards/rejected": -1.8727651834487915, "step": 470 }, { "epoch": 0.9984301412872841, "eval_logits/chosen": -0.5977884531021118, "eval_logits/rejected": -0.5622259378433228, "eval_logps/chosen": -401.6270751953125, "eval_logps/rejected": -481.71246337890625, "eval_loss": 0.5371974110603333, "eval_rewards/accuracies": 0.765625, "eval_rewards/chosen": -1.1911048889160156, "eval_rewards/margins": 0.8110275268554688, "eval_rewards/rejected": -2.0021324157714844, "eval_runtime": 196.6339, "eval_samples_per_second": 10.171, "eval_steps_per_second": 0.163, "step": 477 }, { "epoch": 0.9984301412872841, "step": 477, "total_flos": 0.0, "train_loss": 0.5847830807387954, "train_runtime": 56722.3251, "train_samples_per_second": 1.078, "train_steps_per_second": 0.008 } ], "logging_steps": 10, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }