{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 0.3326352834701538, "learning_rate": 4.99989327925842e-05, "loss": 1.8387, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.3947046995162964, "learning_rate": 4.999573126145132e-05, "loss": 1.7528, "step": 20 }, { "epoch": 0.02, "grad_norm": 0.3496498167514801, "learning_rate": 4.999039567993719e-05, "loss": 1.7639, "step": 30 }, { "epoch": 0.02, "grad_norm": 0.2980650067329407, "learning_rate": 4.998292650357558e-05, "loss": 1.7576, "step": 40 }, { "epoch": 0.03, "grad_norm": 0.3043113350868225, "learning_rate": 4.997332437005931e-05, "loss": 1.6951, "step": 50 }, { "epoch": 0.04, "grad_norm": 0.30196303129196167, "learning_rate": 4.996159009918585e-05, "loss": 1.7508, "step": 60 }, { "epoch": 0.04, "grad_norm": 0.3021036684513092, "learning_rate": 4.994772469278726e-05, "loss": 1.7063, "step": 70 }, { "epoch": 0.05, "grad_norm": 0.25009843707084656, "learning_rate": 4.993172933464471e-05, "loss": 1.7439, "step": 80 }, { "epoch": 0.05, "grad_norm": 0.269055038690567, "learning_rate": 4.9913605390387365e-05, "loss": 1.7166, "step": 90 }, { "epoch": 0.06, "grad_norm": 0.28669387102127075, "learning_rate": 4.989335440737586e-05, "loss": 1.682, "step": 100 }, { "epoch": 0.06, "grad_norm": 0.2838974595069885, "learning_rate": 4.987097811457014e-05, "loss": 1.7938, "step": 110 }, { "epoch": 0.07, "grad_norm": 0.275068074464798, "learning_rate": 4.984647842238185e-05, "loss": 1.689, "step": 120 }, { "epoch": 0.08, "grad_norm": 0.3068424463272095, "learning_rate": 4.981985742251123e-05, "loss": 1.7041, "step": 130 }, { "epoch": 0.08, "grad_norm": 0.27469873428344727, "learning_rate": 4.979111738776857e-05, "loss": 1.6888, "step": 140 }, { "epoch": 0.09, "grad_norm": 0.26790323853492737, "learning_rate": 4.976026077188013e-05, "loss": 1.6397, "step": 150 }, { "epoch": 0.09, "grad_norm": 0.25793883204460144, "learning_rate": 4.972729020927865e-05, "loss": 1.6512, "step": 160 }, { "epoch": 0.1, "grad_norm": 0.27911970019340515, "learning_rate": 4.9692208514878444e-05, "loss": 1.7302, "step": 170 }, { "epoch": 0.11, "grad_norm": 0.30416035652160645, "learning_rate": 4.965501868383506e-05, "loss": 1.6873, "step": 180 }, { "epoch": 0.11, "grad_norm": 0.28593677282333374, "learning_rate": 4.961572389128959e-05, "loss": 1.7116, "step": 190 }, { "epoch": 0.12, "grad_norm": 0.2666521370410919, "learning_rate": 4.957432749209755e-05, "loss": 1.6284, "step": 200 }, { "epoch": 0.12, "grad_norm": 0.29738298058509827, "learning_rate": 4.953083302054247e-05, "loss": 1.6265, "step": 210 }, { "epoch": 0.13, "grad_norm": 0.29092609882354736, "learning_rate": 4.948524419003415e-05, "loss": 1.6713, "step": 220 }, { "epoch": 0.14, "grad_norm": 0.25735458731651306, "learning_rate": 4.943756489279164e-05, "loss": 1.7279, "step": 230 }, { "epoch": 0.14, "grad_norm": 0.25443825125694275, "learning_rate": 4.938779919951092e-05, "loss": 1.6343, "step": 240 }, { "epoch": 0.15, "grad_norm": 0.25252780318260193, "learning_rate": 4.933595135901732e-05, "loss": 1.7045, "step": 250 }, { "epoch": 0.15, "grad_norm": 0.2798963785171509, "learning_rate": 4.928202579790285e-05, "loss": 1.6339, "step": 260 }, { "epoch": 0.16, "grad_norm": 0.2597511410713196, "learning_rate": 4.9226027120148195e-05, "loss": 1.7226, "step": 270 }, { "epoch": 0.16, "grad_norm": 0.28118622303009033, "learning_rate": 4.916796010672969e-05, "loss": 1.7111, "step": 280 }, { "epoch": 0.17, "grad_norm": 0.25402092933654785, "learning_rate": 4.9107829715211124e-05, "loss": 1.7058, "step": 290 }, { "epoch": 0.18, "grad_norm": 0.29867735505104065, "learning_rate": 4.9045641079320484e-05, "loss": 1.6785, "step": 300 }, { "epoch": 0.18, "grad_norm": 0.2505153715610504, "learning_rate": 4.8981399508511624e-05, "loss": 1.7509, "step": 310 }, { "epoch": 0.19, "grad_norm": 0.21384477615356445, "learning_rate": 4.891511048751102e-05, "loss": 1.8352, "step": 320 }, { "epoch": 0.19, "grad_norm": 0.26775240898132324, "learning_rate": 4.884677967584945e-05, "loss": 1.7624, "step": 330 }, { "epoch": 0.2, "grad_norm": 0.29322007298469543, "learning_rate": 4.877641290737884e-05, "loss": 1.706, "step": 340 }, { "epoch": 0.21, "grad_norm": 0.25446540117263794, "learning_rate": 4.870401618977415e-05, "loss": 1.7385, "step": 350 }, { "epoch": 0.21, "grad_norm": 0.2639479339122772, "learning_rate": 4.862959570402049e-05, "loss": 1.6283, "step": 360 }, { "epoch": 0.22, "grad_norm": 0.270839661359787, "learning_rate": 4.8553157803885404e-05, "loss": 1.6634, "step": 370 }, { "epoch": 0.22, "grad_norm": 0.26396167278289795, "learning_rate": 4.8474709015376416e-05, "loss": 1.6037, "step": 380 }, { "epoch": 0.23, "grad_norm": 0.24521879851818085, "learning_rate": 4.8394256036183816e-05, "loss": 1.6235, "step": 390 }, { "epoch": 0.24, "grad_norm": 0.278817355632782, "learning_rate": 4.8311805735108894e-05, "loss": 1.6136, "step": 400 }, { "epoch": 0.24, "grad_norm": 0.27180808782577515, "learning_rate": 4.822736515147748e-05, "loss": 1.6092, "step": 410 }, { "epoch": 0.25, "grad_norm": 0.2518569827079773, "learning_rate": 4.814094149453891e-05, "loss": 1.7227, "step": 420 }, { "epoch": 0.25, "grad_norm": 0.29305458068847656, "learning_rate": 4.805254214285061e-05, "loss": 1.6615, "step": 430 }, { "epoch": 0.26, "grad_norm": 0.28227561712265015, "learning_rate": 4.796217464364808e-05, "loss": 1.6513, "step": 440 }, { "epoch": 0.26, "grad_norm": 0.2649269998073578, "learning_rate": 4.786984671220053e-05, "loss": 1.645, "step": 450 }, { "epoch": 0.27, "grad_norm": 0.2525152564048767, "learning_rate": 4.777556623115221e-05, "loss": 1.7315, "step": 460 }, { "epoch": 0.28, "grad_norm": 0.3066771328449249, "learning_rate": 4.767934124984941e-05, "loss": 1.6781, "step": 470 }, { "epoch": 0.28, "grad_norm": 0.253776878118515, "learning_rate": 4.758117998365322e-05, "loss": 1.6885, "step": 480 }, { "epoch": 0.29, "grad_norm": 0.2555100917816162, "learning_rate": 4.748109081323814e-05, "loss": 1.6221, "step": 490 }, { "epoch": 0.29, "grad_norm": 0.2828095853328705, "learning_rate": 4.7379082283876566e-05, "loss": 1.6639, "step": 500 }, { "epoch": 0.3, "grad_norm": 0.22167381644248962, "learning_rate": 4.72751631047092e-05, "loss": 1.714, "step": 510 }, { "epoch": 0.31, "grad_norm": 0.2374366968870163, "learning_rate": 4.716934214800155e-05, "loss": 1.7015, "step": 520 }, { "epoch": 0.31, "grad_norm": 0.25729697942733765, "learning_rate": 4.70616284483864e-05, "loss": 1.6717, "step": 530 }, { "epoch": 0.32, "grad_norm": 0.24997037649154663, "learning_rate": 4.695203120209245e-05, "loss": 1.7098, "step": 540 }, { "epoch": 0.32, "grad_norm": 0.24944821000099182, "learning_rate": 4.684055976615924e-05, "loss": 1.6521, "step": 550 }, { "epoch": 0.33, "grad_norm": 0.23309342563152313, "learning_rate": 4.672722365763821e-05, "loss": 1.7418, "step": 560 }, { "epoch": 0.34, "grad_norm": 0.24750587344169617, "learning_rate": 4.66120325527802e-05, "loss": 1.7373, "step": 570 }, { "epoch": 0.34, "grad_norm": 0.2507722079753876, "learning_rate": 4.649499628620931e-05, "loss": 1.6818, "step": 580 }, { "epoch": 0.35, "grad_norm": 0.19257722795009613, "learning_rate": 4.637612485008328e-05, "loss": 1.7484, "step": 590 }, { "epoch": 0.35, "grad_norm": 0.24218104779720306, "learning_rate": 4.625542839324036e-05, "loss": 1.6264, "step": 600 }, { "epoch": 0.36, "grad_norm": 0.26412564516067505, "learning_rate": 4.6132917220332846e-05, "loss": 1.6143, "step": 610 }, { "epoch": 0.36, "grad_norm": 0.24778200685977936, "learning_rate": 4.600860179094732e-05, "loss": 1.5942, "step": 620 }, { "epoch": 0.37, "grad_norm": 0.25658294558525085, "learning_rate": 4.588249271871164e-05, "loss": 1.6639, "step": 630 }, { "epoch": 0.38, "grad_norm": 0.2580535411834717, "learning_rate": 4.575460077038877e-05, "loss": 1.6117, "step": 640 }, { "epoch": 0.38, "grad_norm": 0.2333957999944687, "learning_rate": 4.5624936864957556e-05, "loss": 1.6668, "step": 650 }, { "epoch": 0.39, "grad_norm": 0.26032692193984985, "learning_rate": 4.5493512072680536e-05, "loss": 1.6715, "step": 660 }, { "epoch": 0.39, "grad_norm": 0.2909170985221863, "learning_rate": 4.536033761415871e-05, "loss": 1.7098, "step": 670 }, { "epoch": 0.4, "grad_norm": 0.252714604139328, "learning_rate": 4.522542485937369e-05, "loss": 1.619, "step": 680 }, { "epoch": 0.41, "grad_norm": 0.23777702450752258, "learning_rate": 4.5088785326716844e-05, "loss": 1.5681, "step": 690 }, { "epoch": 0.41, "grad_norm": 0.2274021953344345, "learning_rate": 4.4950430682006e-05, "loss": 1.6849, "step": 700 }, { "epoch": 0.42, "grad_norm": 0.23289409279823303, "learning_rate": 4.4810372737489345e-05, "loss": 1.7016, "step": 710 }, { "epoch": 0.42, "grad_norm": 0.25084224343299866, "learning_rate": 4.4668623450837085e-05, "loss": 1.656, "step": 720 }, { "epoch": 0.43, "grad_norm": 0.30249300599098206, "learning_rate": 4.452519492412039e-05, "loss": 1.6872, "step": 730 }, { "epoch": 0.44, "grad_norm": 0.25141090154647827, "learning_rate": 4.4380099402778244e-05, "loss": 1.5979, "step": 740 }, { "epoch": 0.44, "grad_norm": 0.2582787573337555, "learning_rate": 4.423334927457198e-05, "loss": 1.6975, "step": 750 }, { "epoch": 0.45, "grad_norm": 0.2253919243812561, "learning_rate": 4.408495706852758e-05, "loss": 1.6676, "step": 760 }, { "epoch": 0.45, "grad_norm": 0.24296186864376068, "learning_rate": 4.393493545386607e-05, "loss": 1.6405, "step": 770 }, { "epoch": 0.46, "grad_norm": 0.23083582520484924, "learning_rate": 4.378329723892184e-05, "loss": 1.755, "step": 780 }, { "epoch": 0.46, "grad_norm": 0.2642640769481659, "learning_rate": 4.363005537004907e-05, "loss": 1.6541, "step": 790 }, { "epoch": 0.47, "grad_norm": 0.2347261905670166, "learning_rate": 4.347522293051648e-05, "loss": 1.6949, "step": 800 }, { "epoch": 0.48, "grad_norm": 0.2511994540691376, "learning_rate": 4.331881313939029e-05, "loss": 1.7085, "step": 810 }, { "epoch": 0.48, "grad_norm": 0.22544771432876587, "learning_rate": 4.3160839350405606e-05, "loss": 1.7079, "step": 820 }, { "epoch": 0.49, "grad_norm": 0.26223573088645935, "learning_rate": 4.300131505082637e-05, "loss": 1.5977, "step": 830 }, { "epoch": 0.49, "grad_norm": 0.22108006477355957, "learning_rate": 4.284025386029381e-05, "loss": 1.7442, "step": 840 }, { "epoch": 0.5, "grad_norm": 0.2781137228012085, "learning_rate": 4.267766952966369e-05, "loss": 1.6607, "step": 850 }, { "epoch": 0.51, "grad_norm": 0.2906346619129181, "learning_rate": 4.2513575939832275e-05, "loss": 1.6634, "step": 860 }, { "epoch": 0.51, "grad_norm": 0.2514524757862091, "learning_rate": 4.234798710055125e-05, "loss": 1.6332, "step": 870 }, { "epoch": 0.52, "grad_norm": 0.2293555736541748, "learning_rate": 4.218091714923157e-05, "loss": 1.6625, "step": 880 }, { "epoch": 0.52, "grad_norm": 0.24143122136592865, "learning_rate": 4.201238034973654e-05, "loss": 1.7808, "step": 890 }, { "epoch": 0.53, "grad_norm": 0.2245650440454483, "learning_rate": 4.184239109116393e-05, "loss": 1.6353, "step": 900 }, { "epoch": 0.54, "grad_norm": 0.2478613555431366, "learning_rate": 4.1670963886617535e-05, "loss": 1.6436, "step": 910 }, { "epoch": 0.54, "grad_norm": 0.24913890659809113, "learning_rate": 4.149811337196807e-05, "loss": 1.6229, "step": 920 }, { "epoch": 0.55, "grad_norm": 0.23885607719421387, "learning_rate": 4.132385430460361e-05, "loss": 1.6778, "step": 930 }, { "epoch": 0.55, "grad_norm": 0.24098117649555206, "learning_rate": 4.1148201562169685e-05, "loss": 1.6253, "step": 940 }, { "epoch": 0.56, "grad_norm": 0.232025146484375, "learning_rate": 4.097117014129903e-05, "loss": 1.7341, "step": 950 }, { "epoch": 0.56, "grad_norm": 0.25174739956855774, "learning_rate": 4.079277515633127e-05, "loss": 1.6617, "step": 960 }, { "epoch": 0.57, "grad_norm": 0.23565252125263214, "learning_rate": 4.0613031838022486e-05, "loss": 1.7218, "step": 970 }, { "epoch": 0.58, "grad_norm": 0.24838000535964966, "learning_rate": 4.0431955532244827e-05, "loss": 1.6409, "step": 980 }, { "epoch": 0.58, "grad_norm": 0.264049768447876, "learning_rate": 4.0249561698676416e-05, "loss": 1.6628, "step": 990 }, { "epoch": 0.59, "grad_norm": 0.2418060302734375, "learning_rate": 4.0065865909481417e-05, "loss": 1.63, "step": 1000 }, { "epoch": 0.59, "grad_norm": 0.2619737684726715, "learning_rate": 3.988088384798047e-05, "loss": 1.7268, "step": 1010 }, { "epoch": 0.6, "grad_norm": 0.24747079610824585, "learning_rate": 3.969463130731183e-05, "loss": 1.683, "step": 1020 }, { "epoch": 0.61, "grad_norm": 0.2688976228237152, "learning_rate": 3.950712418908289e-05, "loss": 1.6266, "step": 1030 }, { "epoch": 0.61, "grad_norm": 0.23298139870166779, "learning_rate": 3.931837850201263e-05, "loss": 1.6665, "step": 1040 }, { "epoch": 0.62, "grad_norm": 0.2630896270275116, "learning_rate": 3.91284103605648e-05, "loss": 1.6105, "step": 1050 }, { "epoch": 0.62, "grad_norm": 0.23678737878799438, "learning_rate": 3.893723598357214e-05, "loss": 1.6338, "step": 1060 }, { "epoch": 0.63, "grad_norm": 0.23340342938899994, "learning_rate": 3.874487169285168e-05, "loss": 1.6318, "step": 1070 }, { "epoch": 0.64, "grad_norm": 0.261382520198822, "learning_rate": 3.855133391181124e-05, "loss": 1.6206, "step": 1080 }, { "epoch": 0.64, "grad_norm": 0.26030778884887695, "learning_rate": 3.835663916404721e-05, "loss": 1.6291, "step": 1090 }, { "epoch": 0.65, "grad_norm": 0.2665380835533142, "learning_rate": 3.81608040719339e-05, "loss": 1.617, "step": 1100 }, { "epoch": 0.65, "grad_norm": 0.2118641436100006, "learning_rate": 3.7963845355204304e-05, "loss": 1.6328, "step": 1110 }, { "epoch": 0.66, "grad_norm": 0.33383819460868835, "learning_rate": 3.7765779829522675e-05, "loss": 1.6419, "step": 1120 }, { "epoch": 0.66, "grad_norm": 0.27692610025405884, "learning_rate": 3.7566624405048844e-05, "loss": 1.6831, "step": 1130 }, { "epoch": 0.67, "grad_norm": 0.22456732392311096, "learning_rate": 3.7366396084994475e-05, "loss": 1.7071, "step": 1140 }, { "epoch": 0.68, "grad_norm": 0.25981763005256653, "learning_rate": 3.716511196417141e-05, "loss": 1.6056, "step": 1150 }, { "epoch": 0.68, "grad_norm": 0.2628863453865051, "learning_rate": 3.696278922753216e-05, "loss": 1.625, "step": 1160 }, { "epoch": 0.69, "grad_norm": 0.2798576354980469, "learning_rate": 3.6759445148702735e-05, "loss": 1.6918, "step": 1170 }, { "epoch": 0.69, "grad_norm": 0.24460268020629883, "learning_rate": 3.655509708850783e-05, "loss": 1.6883, "step": 1180 }, { "epoch": 0.7, "grad_norm": 0.27063578367233276, "learning_rate": 3.634976249348867e-05, "loss": 1.6268, "step": 1190 }, { "epoch": 0.71, "grad_norm": 0.2729038596153259, "learning_rate": 3.6143458894413465e-05, "loss": 1.6014, "step": 1200 }, { "epoch": 0.71, "grad_norm": 0.23127633333206177, "learning_rate": 3.593620390478066e-05, "loss": 1.6393, "step": 1210 }, { "epoch": 0.72, "grad_norm": 0.23226168751716614, "learning_rate": 3.572801521931522e-05, "loss": 1.6408, "step": 1220 }, { "epoch": 0.72, "grad_norm": 0.23845529556274414, "learning_rate": 3.551891061245788e-05, "loss": 1.6366, "step": 1230 }, { "epoch": 0.73, "grad_norm": 0.2765669524669647, "learning_rate": 3.5308907936847594e-05, "loss": 1.6158, "step": 1240 }, { "epoch": 0.74, "grad_norm": 0.3147326707839966, "learning_rate": 3.509802512179737e-05, "loss": 1.6012, "step": 1250 }, { "epoch": 0.74, "grad_norm": 0.2679651081562042, "learning_rate": 3.488628017176356e-05, "loss": 1.6459, "step": 1260 }, { "epoch": 0.75, "grad_norm": 0.24452731013298035, "learning_rate": 3.467369116480864e-05, "loss": 1.599, "step": 1270 }, { "epoch": 0.75, "grad_norm": 0.2260725051164627, "learning_rate": 3.446027625105776e-05, "loss": 1.6405, "step": 1280 }, { "epoch": 0.76, "grad_norm": 0.2767278552055359, "learning_rate": 3.424605365114923e-05, "loss": 1.6976, "step": 1290 }, { "epoch": 0.76, "grad_norm": 0.23728059232234955, "learning_rate": 3.403104165467883e-05, "loss": 1.7039, "step": 1300 }, { "epoch": 0.77, "grad_norm": 0.2619900107383728, "learning_rate": 3.381525861863831e-05, "loss": 1.6404, "step": 1310 }, { "epoch": 0.78, "grad_norm": 0.2897908389568329, "learning_rate": 3.3598722965848204e-05, "loss": 1.6433, "step": 1320 }, { "epoch": 0.78, "grad_norm": 0.2271248698234558, "learning_rate": 3.3381453183384846e-05, "loss": 1.6096, "step": 1330 }, { "epoch": 0.79, "grad_norm": 0.31233999133110046, "learning_rate": 3.316346782100208e-05, "loss": 1.7015, "step": 1340 }, { "epoch": 0.79, "grad_norm": 0.2572289705276489, "learning_rate": 3.294478548954754e-05, "loss": 1.6442, "step": 1350 }, { "epoch": 0.8, "grad_norm": 0.2430019974708557, "learning_rate": 3.272542485937369e-05, "loss": 1.6863, "step": 1360 }, { "epoch": 0.81, "grad_norm": 0.2512265741825104, "learning_rate": 3.250540465874382e-05, "loss": 1.6767, "step": 1370 }, { "epoch": 0.81, "grad_norm": 0.2533564567565918, "learning_rate": 3.228474367223312e-05, "loss": 1.6692, "step": 1380 }, { "epoch": 0.82, "grad_norm": 0.25457942485809326, "learning_rate": 3.206346073912488e-05, "loss": 1.6627, "step": 1390 }, { "epoch": 0.82, "grad_norm": 0.24955084919929504, "learning_rate": 3.1841574751802076e-05, "loss": 1.6625, "step": 1400 }, { "epoch": 0.83, "grad_norm": 0.2384202629327774, "learning_rate": 3.1619104654134395e-05, "loss": 1.7596, "step": 1410 }, { "epoch": 0.84, "grad_norm": 0.27582287788391113, "learning_rate": 3.1396069439860894e-05, "loss": 1.7282, "step": 1420 }, { "epoch": 0.84, "grad_norm": 0.2678006887435913, "learning_rate": 3.117248815096833e-05, "loss": 1.6456, "step": 1430 }, { "epoch": 0.85, "grad_norm": 0.23636989295482635, "learning_rate": 3.094837987606547e-05, "loss": 1.5889, "step": 1440 }, { "epoch": 0.85, "grad_norm": 0.2650662362575531, "learning_rate": 3.072376374875335e-05, "loss": 1.6597, "step": 1450 }, { "epoch": 0.86, "grad_norm": 0.2590391933917999, "learning_rate": 3.049865894599172e-05, "loss": 1.6503, "step": 1460 }, { "epoch": 0.86, "grad_norm": 0.26722925901412964, "learning_rate": 3.027308468646175e-05, "loss": 1.6738, "step": 1470 }, { "epoch": 0.87, "grad_norm": 0.23559637367725372, "learning_rate": 3.0047060228925256e-05, "loss": 1.5497, "step": 1480 }, { "epoch": 0.88, "grad_norm": 0.2601984143257141, "learning_rate": 2.9820604870580427e-05, "loss": 1.5991, "step": 1490 }, { "epoch": 0.88, "grad_norm": 0.24394232034683228, "learning_rate": 2.9593737945414264e-05, "loss": 1.7422, "step": 1500 }, { "epoch": 0.89, "grad_norm": 0.28175410628318787, "learning_rate": 2.9366478822551975e-05, "loss": 1.7054, "step": 1510 }, { "epoch": 0.89, "grad_norm": 0.2808462977409363, "learning_rate": 2.913884690460325e-05, "loss": 1.6179, "step": 1520 }, { "epoch": 0.9, "grad_norm": 0.2547930181026459, "learning_rate": 2.8910861626005776e-05, "loss": 1.6518, "step": 1530 }, { "epoch": 0.91, "grad_norm": 0.23432618379592896, "learning_rate": 2.868254245136594e-05, "loss": 1.6254, "step": 1540 }, { "epoch": 0.91, "grad_norm": 0.24667441844940186, "learning_rate": 2.8453908873797058e-05, "loss": 1.6296, "step": 1550 }, { "epoch": 0.92, "grad_norm": 0.2473803460597992, "learning_rate": 2.8224980413255086e-05, "loss": 1.6373, "step": 1560 }, { "epoch": 0.92, "grad_norm": 0.2975025475025177, "learning_rate": 2.7995776614872084e-05, "loss": 1.6004, "step": 1570 }, { "epoch": 0.93, "grad_norm": 0.2573077976703644, "learning_rate": 2.776631704728752e-05, "loss": 1.6139, "step": 1580 }, { "epoch": 0.94, "grad_norm": 0.24170945584774017, "learning_rate": 2.7536621300977576e-05, "loss": 1.6792, "step": 1590 }, { "epoch": 0.94, "grad_norm": 0.31662338972091675, "learning_rate": 2.7306708986582553e-05, "loss": 1.5272, "step": 1600 }, { "epoch": 0.95, "grad_norm": 0.24606853723526, "learning_rate": 2.70765997332326e-05, "loss": 1.6827, "step": 1610 }, { "epoch": 0.95, "grad_norm": 0.2409406304359436, "learning_rate": 2.6846313186871853e-05, "loss": 1.6965, "step": 1620 }, { "epoch": 0.96, "grad_norm": 0.24571745097637177, "learning_rate": 2.6615869008581107e-05, "loss": 1.6486, "step": 1630 }, { "epoch": 0.96, "grad_norm": 0.23941560089588165, "learning_rate": 2.638528687289925e-05, "loss": 1.6894, "step": 1640 }, { "epoch": 0.97, "grad_norm": 0.2921188473701477, "learning_rate": 2.6154586466143495e-05, "loss": 1.5936, "step": 1650 }, { "epoch": 0.98, "grad_norm": 0.2708994150161743, "learning_rate": 2.592378748472863e-05, "loss": 1.5613, "step": 1660 }, { "epoch": 0.98, "grad_norm": 0.28650519251823425, "learning_rate": 2.569290963348541e-05, "loss": 1.6684, "step": 1670 }, { "epoch": 0.99, "grad_norm": 0.28054314851760864, "learning_rate": 2.5461972623978247e-05, "loss": 1.6376, "step": 1680 }, { "epoch": 0.99, "grad_norm": 0.23524071276187897, "learning_rate": 2.5230996172822275e-05, "loss": 1.6724, "step": 1690 }, { "epoch": 1.0, "grad_norm": 0.32756513357162476, "learning_rate": 2.5e-05, "loss": 1.6852, "step": 1700 } ], "logging_steps": 10, "max_steps": 3400, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 3.332749990819791e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }