1212 lines
27 KiB
JSON
1212 lines
27 KiB
JSON
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.0,
|
|
"eval_steps": 500,
|
|
"global_step": 1700,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.01,
|
|
"grad_norm": 0.3326352834701538,
|
|
"learning_rate": 4.99989327925842e-05,
|
|
"loss": 1.8387,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.01,
|
|
"grad_norm": 0.3947046995162964,
|
|
"learning_rate": 4.999573126145132e-05,
|
|
"loss": 1.7528,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.02,
|
|
"grad_norm": 0.3496498167514801,
|
|
"learning_rate": 4.999039567993719e-05,
|
|
"loss": 1.7639,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.02,
|
|
"grad_norm": 0.2980650067329407,
|
|
"learning_rate": 4.998292650357558e-05,
|
|
"loss": 1.7576,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.03,
|
|
"grad_norm": 0.3043113350868225,
|
|
"learning_rate": 4.997332437005931e-05,
|
|
"loss": 1.6951,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.04,
|
|
"grad_norm": 0.30196303129196167,
|
|
"learning_rate": 4.996159009918585e-05,
|
|
"loss": 1.7508,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.04,
|
|
"grad_norm": 0.3021036684513092,
|
|
"learning_rate": 4.994772469278726e-05,
|
|
"loss": 1.7063,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.05,
|
|
"grad_norm": 0.25009843707084656,
|
|
"learning_rate": 4.993172933464471e-05,
|
|
"loss": 1.7439,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.05,
|
|
"grad_norm": 0.269055038690567,
|
|
"learning_rate": 4.9913605390387365e-05,
|
|
"loss": 1.7166,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.06,
|
|
"grad_norm": 0.28669387102127075,
|
|
"learning_rate": 4.989335440737586e-05,
|
|
"loss": 1.682,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.06,
|
|
"grad_norm": 0.2838974595069885,
|
|
"learning_rate": 4.987097811457014e-05,
|
|
"loss": 1.7938,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.07,
|
|
"grad_norm": 0.275068074464798,
|
|
"learning_rate": 4.984647842238185e-05,
|
|
"loss": 1.689,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.08,
|
|
"grad_norm": 0.3068424463272095,
|
|
"learning_rate": 4.981985742251123e-05,
|
|
"loss": 1.7041,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.08,
|
|
"grad_norm": 0.27469873428344727,
|
|
"learning_rate": 4.979111738776857e-05,
|
|
"loss": 1.6888,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.09,
|
|
"grad_norm": 0.26790323853492737,
|
|
"learning_rate": 4.976026077188013e-05,
|
|
"loss": 1.6397,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.09,
|
|
"grad_norm": 0.25793883204460144,
|
|
"learning_rate": 4.972729020927865e-05,
|
|
"loss": 1.6512,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.1,
|
|
"grad_norm": 0.27911970019340515,
|
|
"learning_rate": 4.9692208514878444e-05,
|
|
"loss": 1.7302,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.11,
|
|
"grad_norm": 0.30416035652160645,
|
|
"learning_rate": 4.965501868383506e-05,
|
|
"loss": 1.6873,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.11,
|
|
"grad_norm": 0.28593677282333374,
|
|
"learning_rate": 4.961572389128959e-05,
|
|
"loss": 1.7116,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.12,
|
|
"grad_norm": 0.2666521370410919,
|
|
"learning_rate": 4.957432749209755e-05,
|
|
"loss": 1.6284,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.12,
|
|
"grad_norm": 0.29738298058509827,
|
|
"learning_rate": 4.953083302054247e-05,
|
|
"loss": 1.6265,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.13,
|
|
"grad_norm": 0.29092609882354736,
|
|
"learning_rate": 4.948524419003415e-05,
|
|
"loss": 1.6713,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.14,
|
|
"grad_norm": 0.25735458731651306,
|
|
"learning_rate": 4.943756489279164e-05,
|
|
"loss": 1.7279,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.14,
|
|
"grad_norm": 0.25443825125694275,
|
|
"learning_rate": 4.938779919951092e-05,
|
|
"loss": 1.6343,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.15,
|
|
"grad_norm": 0.25252780318260193,
|
|
"learning_rate": 4.933595135901732e-05,
|
|
"loss": 1.7045,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.15,
|
|
"grad_norm": 0.2798963785171509,
|
|
"learning_rate": 4.928202579790285e-05,
|
|
"loss": 1.6339,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.16,
|
|
"grad_norm": 0.2597511410713196,
|
|
"learning_rate": 4.9226027120148195e-05,
|
|
"loss": 1.7226,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.16,
|
|
"grad_norm": 0.28118622303009033,
|
|
"learning_rate": 4.916796010672969e-05,
|
|
"loss": 1.7111,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.17,
|
|
"grad_norm": 0.25402092933654785,
|
|
"learning_rate": 4.9107829715211124e-05,
|
|
"loss": 1.7058,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.18,
|
|
"grad_norm": 0.29867735505104065,
|
|
"learning_rate": 4.9045641079320484e-05,
|
|
"loss": 1.6785,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.18,
|
|
"grad_norm": 0.2505153715610504,
|
|
"learning_rate": 4.8981399508511624e-05,
|
|
"loss": 1.7509,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.19,
|
|
"grad_norm": 0.21384477615356445,
|
|
"learning_rate": 4.891511048751102e-05,
|
|
"loss": 1.8352,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.19,
|
|
"grad_norm": 0.26775240898132324,
|
|
"learning_rate": 4.884677967584945e-05,
|
|
"loss": 1.7624,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.2,
|
|
"grad_norm": 0.29322007298469543,
|
|
"learning_rate": 4.877641290737884e-05,
|
|
"loss": 1.706,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.21,
|
|
"grad_norm": 0.25446540117263794,
|
|
"learning_rate": 4.870401618977415e-05,
|
|
"loss": 1.7385,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.21,
|
|
"grad_norm": 0.2639479339122772,
|
|
"learning_rate": 4.862959570402049e-05,
|
|
"loss": 1.6283,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.22,
|
|
"grad_norm": 0.270839661359787,
|
|
"learning_rate": 4.8553157803885404e-05,
|
|
"loss": 1.6634,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.22,
|
|
"grad_norm": 0.26396167278289795,
|
|
"learning_rate": 4.8474709015376416e-05,
|
|
"loss": 1.6037,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.23,
|
|
"grad_norm": 0.24521879851818085,
|
|
"learning_rate": 4.8394256036183816e-05,
|
|
"loss": 1.6235,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.24,
|
|
"grad_norm": 0.278817355632782,
|
|
"learning_rate": 4.8311805735108894e-05,
|
|
"loss": 1.6136,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.24,
|
|
"grad_norm": 0.27180808782577515,
|
|
"learning_rate": 4.822736515147748e-05,
|
|
"loss": 1.6092,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.25,
|
|
"grad_norm": 0.2518569827079773,
|
|
"learning_rate": 4.814094149453891e-05,
|
|
"loss": 1.7227,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.25,
|
|
"grad_norm": 0.29305458068847656,
|
|
"learning_rate": 4.805254214285061e-05,
|
|
"loss": 1.6615,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.26,
|
|
"grad_norm": 0.28227561712265015,
|
|
"learning_rate": 4.796217464364808e-05,
|
|
"loss": 1.6513,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.26,
|
|
"grad_norm": 0.2649269998073578,
|
|
"learning_rate": 4.786984671220053e-05,
|
|
"loss": 1.645,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.27,
|
|
"grad_norm": 0.2525152564048767,
|
|
"learning_rate": 4.777556623115221e-05,
|
|
"loss": 1.7315,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.28,
|
|
"grad_norm": 0.3066771328449249,
|
|
"learning_rate": 4.767934124984941e-05,
|
|
"loss": 1.6781,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.28,
|
|
"grad_norm": 0.253776878118515,
|
|
"learning_rate": 4.758117998365322e-05,
|
|
"loss": 1.6885,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.29,
|
|
"grad_norm": 0.2555100917816162,
|
|
"learning_rate": 4.748109081323814e-05,
|
|
"loss": 1.6221,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.29,
|
|
"grad_norm": 0.2828095853328705,
|
|
"learning_rate": 4.7379082283876566e-05,
|
|
"loss": 1.6639,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.3,
|
|
"grad_norm": 0.22167381644248962,
|
|
"learning_rate": 4.72751631047092e-05,
|
|
"loss": 1.714,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.31,
|
|
"grad_norm": 0.2374366968870163,
|
|
"learning_rate": 4.716934214800155e-05,
|
|
"loss": 1.7015,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.31,
|
|
"grad_norm": 0.25729697942733765,
|
|
"learning_rate": 4.70616284483864e-05,
|
|
"loss": 1.6717,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.32,
|
|
"grad_norm": 0.24997037649154663,
|
|
"learning_rate": 4.695203120209245e-05,
|
|
"loss": 1.7098,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.32,
|
|
"grad_norm": 0.24944821000099182,
|
|
"learning_rate": 4.684055976615924e-05,
|
|
"loss": 1.6521,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.33,
|
|
"grad_norm": 0.23309342563152313,
|
|
"learning_rate": 4.672722365763821e-05,
|
|
"loss": 1.7418,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.34,
|
|
"grad_norm": 0.24750587344169617,
|
|
"learning_rate": 4.66120325527802e-05,
|
|
"loss": 1.7373,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.34,
|
|
"grad_norm": 0.2507722079753876,
|
|
"learning_rate": 4.649499628620931e-05,
|
|
"loss": 1.6818,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.35,
|
|
"grad_norm": 0.19257722795009613,
|
|
"learning_rate": 4.637612485008328e-05,
|
|
"loss": 1.7484,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.35,
|
|
"grad_norm": 0.24218104779720306,
|
|
"learning_rate": 4.625542839324036e-05,
|
|
"loss": 1.6264,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.36,
|
|
"grad_norm": 0.26412564516067505,
|
|
"learning_rate": 4.6132917220332846e-05,
|
|
"loss": 1.6143,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.36,
|
|
"grad_norm": 0.24778200685977936,
|
|
"learning_rate": 4.600860179094732e-05,
|
|
"loss": 1.5942,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.37,
|
|
"grad_norm": 0.25658294558525085,
|
|
"learning_rate": 4.588249271871164e-05,
|
|
"loss": 1.6639,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.38,
|
|
"grad_norm": 0.2580535411834717,
|
|
"learning_rate": 4.575460077038877e-05,
|
|
"loss": 1.6117,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.38,
|
|
"grad_norm": 0.2333957999944687,
|
|
"learning_rate": 4.5624936864957556e-05,
|
|
"loss": 1.6668,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.39,
|
|
"grad_norm": 0.26032692193984985,
|
|
"learning_rate": 4.5493512072680536e-05,
|
|
"loss": 1.6715,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.39,
|
|
"grad_norm": 0.2909170985221863,
|
|
"learning_rate": 4.536033761415871e-05,
|
|
"loss": 1.7098,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.4,
|
|
"grad_norm": 0.252714604139328,
|
|
"learning_rate": 4.522542485937369e-05,
|
|
"loss": 1.619,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.41,
|
|
"grad_norm": 0.23777702450752258,
|
|
"learning_rate": 4.5088785326716844e-05,
|
|
"loss": 1.5681,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.41,
|
|
"grad_norm": 0.2274021953344345,
|
|
"learning_rate": 4.4950430682006e-05,
|
|
"loss": 1.6849,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.42,
|
|
"grad_norm": 0.23289409279823303,
|
|
"learning_rate": 4.4810372737489345e-05,
|
|
"loss": 1.7016,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.42,
|
|
"grad_norm": 0.25084224343299866,
|
|
"learning_rate": 4.4668623450837085e-05,
|
|
"loss": 1.656,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.43,
|
|
"grad_norm": 0.30249300599098206,
|
|
"learning_rate": 4.452519492412039e-05,
|
|
"loss": 1.6872,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.44,
|
|
"grad_norm": 0.25141090154647827,
|
|
"learning_rate": 4.4380099402778244e-05,
|
|
"loss": 1.5979,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.44,
|
|
"grad_norm": 0.2582787573337555,
|
|
"learning_rate": 4.423334927457198e-05,
|
|
"loss": 1.6975,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.45,
|
|
"grad_norm": 0.2253919243812561,
|
|
"learning_rate": 4.408495706852758e-05,
|
|
"loss": 1.6676,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.45,
|
|
"grad_norm": 0.24296186864376068,
|
|
"learning_rate": 4.393493545386607e-05,
|
|
"loss": 1.6405,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.46,
|
|
"grad_norm": 0.23083582520484924,
|
|
"learning_rate": 4.378329723892184e-05,
|
|
"loss": 1.755,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.46,
|
|
"grad_norm": 0.2642640769481659,
|
|
"learning_rate": 4.363005537004907e-05,
|
|
"loss": 1.6541,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.47,
|
|
"grad_norm": 0.2347261905670166,
|
|
"learning_rate": 4.347522293051648e-05,
|
|
"loss": 1.6949,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.48,
|
|
"grad_norm": 0.2511994540691376,
|
|
"learning_rate": 4.331881313939029e-05,
|
|
"loss": 1.7085,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.48,
|
|
"grad_norm": 0.22544771432876587,
|
|
"learning_rate": 4.3160839350405606e-05,
|
|
"loss": 1.7079,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.49,
|
|
"grad_norm": 0.26223573088645935,
|
|
"learning_rate": 4.300131505082637e-05,
|
|
"loss": 1.5977,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.49,
|
|
"grad_norm": 0.22108006477355957,
|
|
"learning_rate": 4.284025386029381e-05,
|
|
"loss": 1.7442,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.5,
|
|
"grad_norm": 0.2781137228012085,
|
|
"learning_rate": 4.267766952966369e-05,
|
|
"loss": 1.6607,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.51,
|
|
"grad_norm": 0.2906346619129181,
|
|
"learning_rate": 4.2513575939832275e-05,
|
|
"loss": 1.6634,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.51,
|
|
"grad_norm": 0.2514524757862091,
|
|
"learning_rate": 4.234798710055125e-05,
|
|
"loss": 1.6332,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.52,
|
|
"grad_norm": 0.2293555736541748,
|
|
"learning_rate": 4.218091714923157e-05,
|
|
"loss": 1.6625,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.52,
|
|
"grad_norm": 0.24143122136592865,
|
|
"learning_rate": 4.201238034973654e-05,
|
|
"loss": 1.7808,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.53,
|
|
"grad_norm": 0.2245650440454483,
|
|
"learning_rate": 4.184239109116393e-05,
|
|
"loss": 1.6353,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.54,
|
|
"grad_norm": 0.2478613555431366,
|
|
"learning_rate": 4.1670963886617535e-05,
|
|
"loss": 1.6436,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.54,
|
|
"grad_norm": 0.24913890659809113,
|
|
"learning_rate": 4.149811337196807e-05,
|
|
"loss": 1.6229,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.55,
|
|
"grad_norm": 0.23885607719421387,
|
|
"learning_rate": 4.132385430460361e-05,
|
|
"loss": 1.6778,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.55,
|
|
"grad_norm": 0.24098117649555206,
|
|
"learning_rate": 4.1148201562169685e-05,
|
|
"loss": 1.6253,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.56,
|
|
"grad_norm": 0.232025146484375,
|
|
"learning_rate": 4.097117014129903e-05,
|
|
"loss": 1.7341,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.56,
|
|
"grad_norm": 0.25174739956855774,
|
|
"learning_rate": 4.079277515633127e-05,
|
|
"loss": 1.6617,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.57,
|
|
"grad_norm": 0.23565252125263214,
|
|
"learning_rate": 4.0613031838022486e-05,
|
|
"loss": 1.7218,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.58,
|
|
"grad_norm": 0.24838000535964966,
|
|
"learning_rate": 4.0431955532244827e-05,
|
|
"loss": 1.6409,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.58,
|
|
"grad_norm": 0.264049768447876,
|
|
"learning_rate": 4.0249561698676416e-05,
|
|
"loss": 1.6628,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 0.59,
|
|
"grad_norm": 0.2418060302734375,
|
|
"learning_rate": 4.0065865909481417e-05,
|
|
"loss": 1.63,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.59,
|
|
"grad_norm": 0.2619737684726715,
|
|
"learning_rate": 3.988088384798047e-05,
|
|
"loss": 1.7268,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 0.6,
|
|
"grad_norm": 0.24747079610824585,
|
|
"learning_rate": 3.969463130731183e-05,
|
|
"loss": 1.683,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.61,
|
|
"grad_norm": 0.2688976228237152,
|
|
"learning_rate": 3.950712418908289e-05,
|
|
"loss": 1.6266,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 0.61,
|
|
"grad_norm": 0.23298139870166779,
|
|
"learning_rate": 3.931837850201263e-05,
|
|
"loss": 1.6665,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 0.62,
|
|
"grad_norm": 0.2630896270275116,
|
|
"learning_rate": 3.91284103605648e-05,
|
|
"loss": 1.6105,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 0.62,
|
|
"grad_norm": 0.23678737878799438,
|
|
"learning_rate": 3.893723598357214e-05,
|
|
"loss": 1.6338,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 0.63,
|
|
"grad_norm": 0.23340342938899994,
|
|
"learning_rate": 3.874487169285168e-05,
|
|
"loss": 1.6318,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 0.64,
|
|
"grad_norm": 0.261382520198822,
|
|
"learning_rate": 3.855133391181124e-05,
|
|
"loss": 1.6206,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.64,
|
|
"grad_norm": 0.26030778884887695,
|
|
"learning_rate": 3.835663916404721e-05,
|
|
"loss": 1.6291,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 0.65,
|
|
"grad_norm": 0.2665380835533142,
|
|
"learning_rate": 3.81608040719339e-05,
|
|
"loss": 1.617,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.65,
|
|
"grad_norm": 0.2118641436100006,
|
|
"learning_rate": 3.7963845355204304e-05,
|
|
"loss": 1.6328,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 0.66,
|
|
"grad_norm": 0.33383819460868835,
|
|
"learning_rate": 3.7765779829522675e-05,
|
|
"loss": 1.6419,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 0.66,
|
|
"grad_norm": 0.27692610025405884,
|
|
"learning_rate": 3.7566624405048844e-05,
|
|
"loss": 1.6831,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 0.67,
|
|
"grad_norm": 0.22456732392311096,
|
|
"learning_rate": 3.7366396084994475e-05,
|
|
"loss": 1.7071,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.68,
|
|
"grad_norm": 0.25981763005256653,
|
|
"learning_rate": 3.716511196417141e-05,
|
|
"loss": 1.6056,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 0.68,
|
|
"grad_norm": 0.2628863453865051,
|
|
"learning_rate": 3.696278922753216e-05,
|
|
"loss": 1.625,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 0.69,
|
|
"grad_norm": 0.2798576354980469,
|
|
"learning_rate": 3.6759445148702735e-05,
|
|
"loss": 1.6918,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 0.69,
|
|
"grad_norm": 0.24460268020629883,
|
|
"learning_rate": 3.655509708850783e-05,
|
|
"loss": 1.6883,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.7,
|
|
"grad_norm": 0.27063578367233276,
|
|
"learning_rate": 3.634976249348867e-05,
|
|
"loss": 1.6268,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 0.71,
|
|
"grad_norm": 0.2729038596153259,
|
|
"learning_rate": 3.6143458894413465e-05,
|
|
"loss": 1.6014,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.71,
|
|
"grad_norm": 0.23127633333206177,
|
|
"learning_rate": 3.593620390478066e-05,
|
|
"loss": 1.6393,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 0.72,
|
|
"grad_norm": 0.23226168751716614,
|
|
"learning_rate": 3.572801521931522e-05,
|
|
"loss": 1.6408,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 0.72,
|
|
"grad_norm": 0.23845529556274414,
|
|
"learning_rate": 3.551891061245788e-05,
|
|
"loss": 1.6366,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 0.73,
|
|
"grad_norm": 0.2765669524669647,
|
|
"learning_rate": 3.5308907936847594e-05,
|
|
"loss": 1.6158,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 0.74,
|
|
"grad_norm": 0.3147326707839966,
|
|
"learning_rate": 3.509802512179737e-05,
|
|
"loss": 1.6012,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 0.74,
|
|
"grad_norm": 0.2679651081562042,
|
|
"learning_rate": 3.488628017176356e-05,
|
|
"loss": 1.6459,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 0.75,
|
|
"grad_norm": 0.24452731013298035,
|
|
"learning_rate": 3.467369116480864e-05,
|
|
"loss": 1.599,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 0.75,
|
|
"grad_norm": 0.2260725051164627,
|
|
"learning_rate": 3.446027625105776e-05,
|
|
"loss": 1.6405,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 0.76,
|
|
"grad_norm": 0.2767278552055359,
|
|
"learning_rate": 3.424605365114923e-05,
|
|
"loss": 1.6976,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 0.76,
|
|
"grad_norm": 0.23728059232234955,
|
|
"learning_rate": 3.403104165467883e-05,
|
|
"loss": 1.7039,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.77,
|
|
"grad_norm": 0.2619900107383728,
|
|
"learning_rate": 3.381525861863831e-05,
|
|
"loss": 1.6404,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 0.78,
|
|
"grad_norm": 0.2897908389568329,
|
|
"learning_rate": 3.3598722965848204e-05,
|
|
"loss": 1.6433,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 0.78,
|
|
"grad_norm": 0.2271248698234558,
|
|
"learning_rate": 3.3381453183384846e-05,
|
|
"loss": 1.6096,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 0.79,
|
|
"grad_norm": 0.31233999133110046,
|
|
"learning_rate": 3.316346782100208e-05,
|
|
"loss": 1.7015,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 0.79,
|
|
"grad_norm": 0.2572289705276489,
|
|
"learning_rate": 3.294478548954754e-05,
|
|
"loss": 1.6442,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 0.8,
|
|
"grad_norm": 0.2430019974708557,
|
|
"learning_rate": 3.272542485937369e-05,
|
|
"loss": 1.6863,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 0.81,
|
|
"grad_norm": 0.2512265741825104,
|
|
"learning_rate": 3.250540465874382e-05,
|
|
"loss": 1.6767,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 0.81,
|
|
"grad_norm": 0.2533564567565918,
|
|
"learning_rate": 3.228474367223312e-05,
|
|
"loss": 1.6692,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 0.82,
|
|
"grad_norm": 0.25457942485809326,
|
|
"learning_rate": 3.206346073912488e-05,
|
|
"loss": 1.6627,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 0.82,
|
|
"grad_norm": 0.24955084919929504,
|
|
"learning_rate": 3.1841574751802076e-05,
|
|
"loss": 1.6625,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.83,
|
|
"grad_norm": 0.2384202629327774,
|
|
"learning_rate": 3.1619104654134395e-05,
|
|
"loss": 1.7596,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 0.84,
|
|
"grad_norm": 0.27582287788391113,
|
|
"learning_rate": 3.1396069439860894e-05,
|
|
"loss": 1.7282,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 0.84,
|
|
"grad_norm": 0.2678006887435913,
|
|
"learning_rate": 3.117248815096833e-05,
|
|
"loss": 1.6456,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 0.85,
|
|
"grad_norm": 0.23636989295482635,
|
|
"learning_rate": 3.094837987606547e-05,
|
|
"loss": 1.5889,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 0.85,
|
|
"grad_norm": 0.2650662362575531,
|
|
"learning_rate": 3.072376374875335e-05,
|
|
"loss": 1.6597,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 0.86,
|
|
"grad_norm": 0.2590391933917999,
|
|
"learning_rate": 3.049865894599172e-05,
|
|
"loss": 1.6503,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 0.86,
|
|
"grad_norm": 0.26722925901412964,
|
|
"learning_rate": 3.027308468646175e-05,
|
|
"loss": 1.6738,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 0.87,
|
|
"grad_norm": 0.23559637367725372,
|
|
"learning_rate": 3.0047060228925256e-05,
|
|
"loss": 1.5497,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 0.88,
|
|
"grad_norm": 0.2601984143257141,
|
|
"learning_rate": 2.9820604870580427e-05,
|
|
"loss": 1.5991,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 0.88,
|
|
"grad_norm": 0.24394232034683228,
|
|
"learning_rate": 2.9593737945414264e-05,
|
|
"loss": 1.7422,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.89,
|
|
"grad_norm": 0.28175410628318787,
|
|
"learning_rate": 2.9366478822551975e-05,
|
|
"loss": 1.7054,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 0.89,
|
|
"grad_norm": 0.2808462977409363,
|
|
"learning_rate": 2.913884690460325e-05,
|
|
"loss": 1.6179,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 0.9,
|
|
"grad_norm": 0.2547930181026459,
|
|
"learning_rate": 2.8910861626005776e-05,
|
|
"loss": 1.6518,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 0.91,
|
|
"grad_norm": 0.23432618379592896,
|
|
"learning_rate": 2.868254245136594e-05,
|
|
"loss": 1.6254,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 0.91,
|
|
"grad_norm": 0.24667441844940186,
|
|
"learning_rate": 2.8453908873797058e-05,
|
|
"loss": 1.6296,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 0.92,
|
|
"grad_norm": 0.2473803460597992,
|
|
"learning_rate": 2.8224980413255086e-05,
|
|
"loss": 1.6373,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 0.92,
|
|
"grad_norm": 0.2975025475025177,
|
|
"learning_rate": 2.7995776614872084e-05,
|
|
"loss": 1.6004,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 0.93,
|
|
"grad_norm": 0.2573077976703644,
|
|
"learning_rate": 2.776631704728752e-05,
|
|
"loss": 1.6139,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 0.94,
|
|
"grad_norm": 0.24170945584774017,
|
|
"learning_rate": 2.7536621300977576e-05,
|
|
"loss": 1.6792,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 0.94,
|
|
"grad_norm": 0.31662338972091675,
|
|
"learning_rate": 2.7306708986582553e-05,
|
|
"loss": 1.5272,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.95,
|
|
"grad_norm": 0.24606853723526,
|
|
"learning_rate": 2.70765997332326e-05,
|
|
"loss": 1.6827,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 0.95,
|
|
"grad_norm": 0.2409406304359436,
|
|
"learning_rate": 2.6846313186871853e-05,
|
|
"loss": 1.6965,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 0.96,
|
|
"grad_norm": 0.24571745097637177,
|
|
"learning_rate": 2.6615869008581107e-05,
|
|
"loss": 1.6486,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 0.96,
|
|
"grad_norm": 0.23941560089588165,
|
|
"learning_rate": 2.638528687289925e-05,
|
|
"loss": 1.6894,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 0.97,
|
|
"grad_norm": 0.2921188473701477,
|
|
"learning_rate": 2.6154586466143495e-05,
|
|
"loss": 1.5936,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 0.98,
|
|
"grad_norm": 0.2708994150161743,
|
|
"learning_rate": 2.592378748472863e-05,
|
|
"loss": 1.5613,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 0.98,
|
|
"grad_norm": 0.28650519251823425,
|
|
"learning_rate": 2.569290963348541e-05,
|
|
"loss": 1.6684,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 0.99,
|
|
"grad_norm": 0.28054314851760864,
|
|
"learning_rate": 2.5461972623978247e-05,
|
|
"loss": 1.6376,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 0.99,
|
|
"grad_norm": 0.23524071276187897,
|
|
"learning_rate": 2.5230996172822275e-05,
|
|
"loss": 1.6724,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.32756513357162476,
|
|
"learning_rate": 2.5e-05,
|
|
"loss": 1.6852,
|
|
"step": 1700
|
|
}
|
|
],
|
|
"logging_steps": 10,
|
|
"max_steps": 3400,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 2,
|
|
"save_steps": 500,
|
|
"total_flos": 3.332749990819791e+18,
|
|
"train_batch_size": 8,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|