Files
WidsoMenter-8B/trainer_state.json
ModelHub XC c33f2a658d 初始化项目,由ModelHub XC社区提供模型
Model: linjh1118/WidsoMenter-8B
Source: Original Platform
2026-06-01 10:32:18 +08:00

1212 lines
27 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1700,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 0.3326352834701538,
"learning_rate": 4.99989327925842e-05,
"loss": 1.8387,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 0.3947046995162964,
"learning_rate": 4.999573126145132e-05,
"loss": 1.7528,
"step": 20
},
{
"epoch": 0.02,
"grad_norm": 0.3496498167514801,
"learning_rate": 4.999039567993719e-05,
"loss": 1.7639,
"step": 30
},
{
"epoch": 0.02,
"grad_norm": 0.2980650067329407,
"learning_rate": 4.998292650357558e-05,
"loss": 1.7576,
"step": 40
},
{
"epoch": 0.03,
"grad_norm": 0.3043113350868225,
"learning_rate": 4.997332437005931e-05,
"loss": 1.6951,
"step": 50
},
{
"epoch": 0.04,
"grad_norm": 0.30196303129196167,
"learning_rate": 4.996159009918585e-05,
"loss": 1.7508,
"step": 60
},
{
"epoch": 0.04,
"grad_norm": 0.3021036684513092,
"learning_rate": 4.994772469278726e-05,
"loss": 1.7063,
"step": 70
},
{
"epoch": 0.05,
"grad_norm": 0.25009843707084656,
"learning_rate": 4.993172933464471e-05,
"loss": 1.7439,
"step": 80
},
{
"epoch": 0.05,
"grad_norm": 0.269055038690567,
"learning_rate": 4.9913605390387365e-05,
"loss": 1.7166,
"step": 90
},
{
"epoch": 0.06,
"grad_norm": 0.28669387102127075,
"learning_rate": 4.989335440737586e-05,
"loss": 1.682,
"step": 100
},
{
"epoch": 0.06,
"grad_norm": 0.2838974595069885,
"learning_rate": 4.987097811457014e-05,
"loss": 1.7938,
"step": 110
},
{
"epoch": 0.07,
"grad_norm": 0.275068074464798,
"learning_rate": 4.984647842238185e-05,
"loss": 1.689,
"step": 120
},
{
"epoch": 0.08,
"grad_norm": 0.3068424463272095,
"learning_rate": 4.981985742251123e-05,
"loss": 1.7041,
"step": 130
},
{
"epoch": 0.08,
"grad_norm": 0.27469873428344727,
"learning_rate": 4.979111738776857e-05,
"loss": 1.6888,
"step": 140
},
{
"epoch": 0.09,
"grad_norm": 0.26790323853492737,
"learning_rate": 4.976026077188013e-05,
"loss": 1.6397,
"step": 150
},
{
"epoch": 0.09,
"grad_norm": 0.25793883204460144,
"learning_rate": 4.972729020927865e-05,
"loss": 1.6512,
"step": 160
},
{
"epoch": 0.1,
"grad_norm": 0.27911970019340515,
"learning_rate": 4.9692208514878444e-05,
"loss": 1.7302,
"step": 170
},
{
"epoch": 0.11,
"grad_norm": 0.30416035652160645,
"learning_rate": 4.965501868383506e-05,
"loss": 1.6873,
"step": 180
},
{
"epoch": 0.11,
"grad_norm": 0.28593677282333374,
"learning_rate": 4.961572389128959e-05,
"loss": 1.7116,
"step": 190
},
{
"epoch": 0.12,
"grad_norm": 0.2666521370410919,
"learning_rate": 4.957432749209755e-05,
"loss": 1.6284,
"step": 200
},
{
"epoch": 0.12,
"grad_norm": 0.29738298058509827,
"learning_rate": 4.953083302054247e-05,
"loss": 1.6265,
"step": 210
},
{
"epoch": 0.13,
"grad_norm": 0.29092609882354736,
"learning_rate": 4.948524419003415e-05,
"loss": 1.6713,
"step": 220
},
{
"epoch": 0.14,
"grad_norm": 0.25735458731651306,
"learning_rate": 4.943756489279164e-05,
"loss": 1.7279,
"step": 230
},
{
"epoch": 0.14,
"grad_norm": 0.25443825125694275,
"learning_rate": 4.938779919951092e-05,
"loss": 1.6343,
"step": 240
},
{
"epoch": 0.15,
"grad_norm": 0.25252780318260193,
"learning_rate": 4.933595135901732e-05,
"loss": 1.7045,
"step": 250
},
{
"epoch": 0.15,
"grad_norm": 0.2798963785171509,
"learning_rate": 4.928202579790285e-05,
"loss": 1.6339,
"step": 260
},
{
"epoch": 0.16,
"grad_norm": 0.2597511410713196,
"learning_rate": 4.9226027120148195e-05,
"loss": 1.7226,
"step": 270
},
{
"epoch": 0.16,
"grad_norm": 0.28118622303009033,
"learning_rate": 4.916796010672969e-05,
"loss": 1.7111,
"step": 280
},
{
"epoch": 0.17,
"grad_norm": 0.25402092933654785,
"learning_rate": 4.9107829715211124e-05,
"loss": 1.7058,
"step": 290
},
{
"epoch": 0.18,
"grad_norm": 0.29867735505104065,
"learning_rate": 4.9045641079320484e-05,
"loss": 1.6785,
"step": 300
},
{
"epoch": 0.18,
"grad_norm": 0.2505153715610504,
"learning_rate": 4.8981399508511624e-05,
"loss": 1.7509,
"step": 310
},
{
"epoch": 0.19,
"grad_norm": 0.21384477615356445,
"learning_rate": 4.891511048751102e-05,
"loss": 1.8352,
"step": 320
},
{
"epoch": 0.19,
"grad_norm": 0.26775240898132324,
"learning_rate": 4.884677967584945e-05,
"loss": 1.7624,
"step": 330
},
{
"epoch": 0.2,
"grad_norm": 0.29322007298469543,
"learning_rate": 4.877641290737884e-05,
"loss": 1.706,
"step": 340
},
{
"epoch": 0.21,
"grad_norm": 0.25446540117263794,
"learning_rate": 4.870401618977415e-05,
"loss": 1.7385,
"step": 350
},
{
"epoch": 0.21,
"grad_norm": 0.2639479339122772,
"learning_rate": 4.862959570402049e-05,
"loss": 1.6283,
"step": 360
},
{
"epoch": 0.22,
"grad_norm": 0.270839661359787,
"learning_rate": 4.8553157803885404e-05,
"loss": 1.6634,
"step": 370
},
{
"epoch": 0.22,
"grad_norm": 0.26396167278289795,
"learning_rate": 4.8474709015376416e-05,
"loss": 1.6037,
"step": 380
},
{
"epoch": 0.23,
"grad_norm": 0.24521879851818085,
"learning_rate": 4.8394256036183816e-05,
"loss": 1.6235,
"step": 390
},
{
"epoch": 0.24,
"grad_norm": 0.278817355632782,
"learning_rate": 4.8311805735108894e-05,
"loss": 1.6136,
"step": 400
},
{
"epoch": 0.24,
"grad_norm": 0.27180808782577515,
"learning_rate": 4.822736515147748e-05,
"loss": 1.6092,
"step": 410
},
{
"epoch": 0.25,
"grad_norm": 0.2518569827079773,
"learning_rate": 4.814094149453891e-05,
"loss": 1.7227,
"step": 420
},
{
"epoch": 0.25,
"grad_norm": 0.29305458068847656,
"learning_rate": 4.805254214285061e-05,
"loss": 1.6615,
"step": 430
},
{
"epoch": 0.26,
"grad_norm": 0.28227561712265015,
"learning_rate": 4.796217464364808e-05,
"loss": 1.6513,
"step": 440
},
{
"epoch": 0.26,
"grad_norm": 0.2649269998073578,
"learning_rate": 4.786984671220053e-05,
"loss": 1.645,
"step": 450
},
{
"epoch": 0.27,
"grad_norm": 0.2525152564048767,
"learning_rate": 4.777556623115221e-05,
"loss": 1.7315,
"step": 460
},
{
"epoch": 0.28,
"grad_norm": 0.3066771328449249,
"learning_rate": 4.767934124984941e-05,
"loss": 1.6781,
"step": 470
},
{
"epoch": 0.28,
"grad_norm": 0.253776878118515,
"learning_rate": 4.758117998365322e-05,
"loss": 1.6885,
"step": 480
},
{
"epoch": 0.29,
"grad_norm": 0.2555100917816162,
"learning_rate": 4.748109081323814e-05,
"loss": 1.6221,
"step": 490
},
{
"epoch": 0.29,
"grad_norm": 0.2828095853328705,
"learning_rate": 4.7379082283876566e-05,
"loss": 1.6639,
"step": 500
},
{
"epoch": 0.3,
"grad_norm": 0.22167381644248962,
"learning_rate": 4.72751631047092e-05,
"loss": 1.714,
"step": 510
},
{
"epoch": 0.31,
"grad_norm": 0.2374366968870163,
"learning_rate": 4.716934214800155e-05,
"loss": 1.7015,
"step": 520
},
{
"epoch": 0.31,
"grad_norm": 0.25729697942733765,
"learning_rate": 4.70616284483864e-05,
"loss": 1.6717,
"step": 530
},
{
"epoch": 0.32,
"grad_norm": 0.24997037649154663,
"learning_rate": 4.695203120209245e-05,
"loss": 1.7098,
"step": 540
},
{
"epoch": 0.32,
"grad_norm": 0.24944821000099182,
"learning_rate": 4.684055976615924e-05,
"loss": 1.6521,
"step": 550
},
{
"epoch": 0.33,
"grad_norm": 0.23309342563152313,
"learning_rate": 4.672722365763821e-05,
"loss": 1.7418,
"step": 560
},
{
"epoch": 0.34,
"grad_norm": 0.24750587344169617,
"learning_rate": 4.66120325527802e-05,
"loss": 1.7373,
"step": 570
},
{
"epoch": 0.34,
"grad_norm": 0.2507722079753876,
"learning_rate": 4.649499628620931e-05,
"loss": 1.6818,
"step": 580
},
{
"epoch": 0.35,
"grad_norm": 0.19257722795009613,
"learning_rate": 4.637612485008328e-05,
"loss": 1.7484,
"step": 590
},
{
"epoch": 0.35,
"grad_norm": 0.24218104779720306,
"learning_rate": 4.625542839324036e-05,
"loss": 1.6264,
"step": 600
},
{
"epoch": 0.36,
"grad_norm": 0.26412564516067505,
"learning_rate": 4.6132917220332846e-05,
"loss": 1.6143,
"step": 610
},
{
"epoch": 0.36,
"grad_norm": 0.24778200685977936,
"learning_rate": 4.600860179094732e-05,
"loss": 1.5942,
"step": 620
},
{
"epoch": 0.37,
"grad_norm": 0.25658294558525085,
"learning_rate": 4.588249271871164e-05,
"loss": 1.6639,
"step": 630
},
{
"epoch": 0.38,
"grad_norm": 0.2580535411834717,
"learning_rate": 4.575460077038877e-05,
"loss": 1.6117,
"step": 640
},
{
"epoch": 0.38,
"grad_norm": 0.2333957999944687,
"learning_rate": 4.5624936864957556e-05,
"loss": 1.6668,
"step": 650
},
{
"epoch": 0.39,
"grad_norm": 0.26032692193984985,
"learning_rate": 4.5493512072680536e-05,
"loss": 1.6715,
"step": 660
},
{
"epoch": 0.39,
"grad_norm": 0.2909170985221863,
"learning_rate": 4.536033761415871e-05,
"loss": 1.7098,
"step": 670
},
{
"epoch": 0.4,
"grad_norm": 0.252714604139328,
"learning_rate": 4.522542485937369e-05,
"loss": 1.619,
"step": 680
},
{
"epoch": 0.41,
"grad_norm": 0.23777702450752258,
"learning_rate": 4.5088785326716844e-05,
"loss": 1.5681,
"step": 690
},
{
"epoch": 0.41,
"grad_norm": 0.2274021953344345,
"learning_rate": 4.4950430682006e-05,
"loss": 1.6849,
"step": 700
},
{
"epoch": 0.42,
"grad_norm": 0.23289409279823303,
"learning_rate": 4.4810372737489345e-05,
"loss": 1.7016,
"step": 710
},
{
"epoch": 0.42,
"grad_norm": 0.25084224343299866,
"learning_rate": 4.4668623450837085e-05,
"loss": 1.656,
"step": 720
},
{
"epoch": 0.43,
"grad_norm": 0.30249300599098206,
"learning_rate": 4.452519492412039e-05,
"loss": 1.6872,
"step": 730
},
{
"epoch": 0.44,
"grad_norm": 0.25141090154647827,
"learning_rate": 4.4380099402778244e-05,
"loss": 1.5979,
"step": 740
},
{
"epoch": 0.44,
"grad_norm": 0.2582787573337555,
"learning_rate": 4.423334927457198e-05,
"loss": 1.6975,
"step": 750
},
{
"epoch": 0.45,
"grad_norm": 0.2253919243812561,
"learning_rate": 4.408495706852758e-05,
"loss": 1.6676,
"step": 760
},
{
"epoch": 0.45,
"grad_norm": 0.24296186864376068,
"learning_rate": 4.393493545386607e-05,
"loss": 1.6405,
"step": 770
},
{
"epoch": 0.46,
"grad_norm": 0.23083582520484924,
"learning_rate": 4.378329723892184e-05,
"loss": 1.755,
"step": 780
},
{
"epoch": 0.46,
"grad_norm": 0.2642640769481659,
"learning_rate": 4.363005537004907e-05,
"loss": 1.6541,
"step": 790
},
{
"epoch": 0.47,
"grad_norm": 0.2347261905670166,
"learning_rate": 4.347522293051648e-05,
"loss": 1.6949,
"step": 800
},
{
"epoch": 0.48,
"grad_norm": 0.2511994540691376,
"learning_rate": 4.331881313939029e-05,
"loss": 1.7085,
"step": 810
},
{
"epoch": 0.48,
"grad_norm": 0.22544771432876587,
"learning_rate": 4.3160839350405606e-05,
"loss": 1.7079,
"step": 820
},
{
"epoch": 0.49,
"grad_norm": 0.26223573088645935,
"learning_rate": 4.300131505082637e-05,
"loss": 1.5977,
"step": 830
},
{
"epoch": 0.49,
"grad_norm": 0.22108006477355957,
"learning_rate": 4.284025386029381e-05,
"loss": 1.7442,
"step": 840
},
{
"epoch": 0.5,
"grad_norm": 0.2781137228012085,
"learning_rate": 4.267766952966369e-05,
"loss": 1.6607,
"step": 850
},
{
"epoch": 0.51,
"grad_norm": 0.2906346619129181,
"learning_rate": 4.2513575939832275e-05,
"loss": 1.6634,
"step": 860
},
{
"epoch": 0.51,
"grad_norm": 0.2514524757862091,
"learning_rate": 4.234798710055125e-05,
"loss": 1.6332,
"step": 870
},
{
"epoch": 0.52,
"grad_norm": 0.2293555736541748,
"learning_rate": 4.218091714923157e-05,
"loss": 1.6625,
"step": 880
},
{
"epoch": 0.52,
"grad_norm": 0.24143122136592865,
"learning_rate": 4.201238034973654e-05,
"loss": 1.7808,
"step": 890
},
{
"epoch": 0.53,
"grad_norm": 0.2245650440454483,
"learning_rate": 4.184239109116393e-05,
"loss": 1.6353,
"step": 900
},
{
"epoch": 0.54,
"grad_norm": 0.2478613555431366,
"learning_rate": 4.1670963886617535e-05,
"loss": 1.6436,
"step": 910
},
{
"epoch": 0.54,
"grad_norm": 0.24913890659809113,
"learning_rate": 4.149811337196807e-05,
"loss": 1.6229,
"step": 920
},
{
"epoch": 0.55,
"grad_norm": 0.23885607719421387,
"learning_rate": 4.132385430460361e-05,
"loss": 1.6778,
"step": 930
},
{
"epoch": 0.55,
"grad_norm": 0.24098117649555206,
"learning_rate": 4.1148201562169685e-05,
"loss": 1.6253,
"step": 940
},
{
"epoch": 0.56,
"grad_norm": 0.232025146484375,
"learning_rate": 4.097117014129903e-05,
"loss": 1.7341,
"step": 950
},
{
"epoch": 0.56,
"grad_norm": 0.25174739956855774,
"learning_rate": 4.079277515633127e-05,
"loss": 1.6617,
"step": 960
},
{
"epoch": 0.57,
"grad_norm": 0.23565252125263214,
"learning_rate": 4.0613031838022486e-05,
"loss": 1.7218,
"step": 970
},
{
"epoch": 0.58,
"grad_norm": 0.24838000535964966,
"learning_rate": 4.0431955532244827e-05,
"loss": 1.6409,
"step": 980
},
{
"epoch": 0.58,
"grad_norm": 0.264049768447876,
"learning_rate": 4.0249561698676416e-05,
"loss": 1.6628,
"step": 990
},
{
"epoch": 0.59,
"grad_norm": 0.2418060302734375,
"learning_rate": 4.0065865909481417e-05,
"loss": 1.63,
"step": 1000
},
{
"epoch": 0.59,
"grad_norm": 0.2619737684726715,
"learning_rate": 3.988088384798047e-05,
"loss": 1.7268,
"step": 1010
},
{
"epoch": 0.6,
"grad_norm": 0.24747079610824585,
"learning_rate": 3.969463130731183e-05,
"loss": 1.683,
"step": 1020
},
{
"epoch": 0.61,
"grad_norm": 0.2688976228237152,
"learning_rate": 3.950712418908289e-05,
"loss": 1.6266,
"step": 1030
},
{
"epoch": 0.61,
"grad_norm": 0.23298139870166779,
"learning_rate": 3.931837850201263e-05,
"loss": 1.6665,
"step": 1040
},
{
"epoch": 0.62,
"grad_norm": 0.2630896270275116,
"learning_rate": 3.91284103605648e-05,
"loss": 1.6105,
"step": 1050
},
{
"epoch": 0.62,
"grad_norm": 0.23678737878799438,
"learning_rate": 3.893723598357214e-05,
"loss": 1.6338,
"step": 1060
},
{
"epoch": 0.63,
"grad_norm": 0.23340342938899994,
"learning_rate": 3.874487169285168e-05,
"loss": 1.6318,
"step": 1070
},
{
"epoch": 0.64,
"grad_norm": 0.261382520198822,
"learning_rate": 3.855133391181124e-05,
"loss": 1.6206,
"step": 1080
},
{
"epoch": 0.64,
"grad_norm": 0.26030778884887695,
"learning_rate": 3.835663916404721e-05,
"loss": 1.6291,
"step": 1090
},
{
"epoch": 0.65,
"grad_norm": 0.2665380835533142,
"learning_rate": 3.81608040719339e-05,
"loss": 1.617,
"step": 1100
},
{
"epoch": 0.65,
"grad_norm": 0.2118641436100006,
"learning_rate": 3.7963845355204304e-05,
"loss": 1.6328,
"step": 1110
},
{
"epoch": 0.66,
"grad_norm": 0.33383819460868835,
"learning_rate": 3.7765779829522675e-05,
"loss": 1.6419,
"step": 1120
},
{
"epoch": 0.66,
"grad_norm": 0.27692610025405884,
"learning_rate": 3.7566624405048844e-05,
"loss": 1.6831,
"step": 1130
},
{
"epoch": 0.67,
"grad_norm": 0.22456732392311096,
"learning_rate": 3.7366396084994475e-05,
"loss": 1.7071,
"step": 1140
},
{
"epoch": 0.68,
"grad_norm": 0.25981763005256653,
"learning_rate": 3.716511196417141e-05,
"loss": 1.6056,
"step": 1150
},
{
"epoch": 0.68,
"grad_norm": 0.2628863453865051,
"learning_rate": 3.696278922753216e-05,
"loss": 1.625,
"step": 1160
},
{
"epoch": 0.69,
"grad_norm": 0.2798576354980469,
"learning_rate": 3.6759445148702735e-05,
"loss": 1.6918,
"step": 1170
},
{
"epoch": 0.69,
"grad_norm": 0.24460268020629883,
"learning_rate": 3.655509708850783e-05,
"loss": 1.6883,
"step": 1180
},
{
"epoch": 0.7,
"grad_norm": 0.27063578367233276,
"learning_rate": 3.634976249348867e-05,
"loss": 1.6268,
"step": 1190
},
{
"epoch": 0.71,
"grad_norm": 0.2729038596153259,
"learning_rate": 3.6143458894413465e-05,
"loss": 1.6014,
"step": 1200
},
{
"epoch": 0.71,
"grad_norm": 0.23127633333206177,
"learning_rate": 3.593620390478066e-05,
"loss": 1.6393,
"step": 1210
},
{
"epoch": 0.72,
"grad_norm": 0.23226168751716614,
"learning_rate": 3.572801521931522e-05,
"loss": 1.6408,
"step": 1220
},
{
"epoch": 0.72,
"grad_norm": 0.23845529556274414,
"learning_rate": 3.551891061245788e-05,
"loss": 1.6366,
"step": 1230
},
{
"epoch": 0.73,
"grad_norm": 0.2765669524669647,
"learning_rate": 3.5308907936847594e-05,
"loss": 1.6158,
"step": 1240
},
{
"epoch": 0.74,
"grad_norm": 0.3147326707839966,
"learning_rate": 3.509802512179737e-05,
"loss": 1.6012,
"step": 1250
},
{
"epoch": 0.74,
"grad_norm": 0.2679651081562042,
"learning_rate": 3.488628017176356e-05,
"loss": 1.6459,
"step": 1260
},
{
"epoch": 0.75,
"grad_norm": 0.24452731013298035,
"learning_rate": 3.467369116480864e-05,
"loss": 1.599,
"step": 1270
},
{
"epoch": 0.75,
"grad_norm": 0.2260725051164627,
"learning_rate": 3.446027625105776e-05,
"loss": 1.6405,
"step": 1280
},
{
"epoch": 0.76,
"grad_norm": 0.2767278552055359,
"learning_rate": 3.424605365114923e-05,
"loss": 1.6976,
"step": 1290
},
{
"epoch": 0.76,
"grad_norm": 0.23728059232234955,
"learning_rate": 3.403104165467883e-05,
"loss": 1.7039,
"step": 1300
},
{
"epoch": 0.77,
"grad_norm": 0.2619900107383728,
"learning_rate": 3.381525861863831e-05,
"loss": 1.6404,
"step": 1310
},
{
"epoch": 0.78,
"grad_norm": 0.2897908389568329,
"learning_rate": 3.3598722965848204e-05,
"loss": 1.6433,
"step": 1320
},
{
"epoch": 0.78,
"grad_norm": 0.2271248698234558,
"learning_rate": 3.3381453183384846e-05,
"loss": 1.6096,
"step": 1330
},
{
"epoch": 0.79,
"grad_norm": 0.31233999133110046,
"learning_rate": 3.316346782100208e-05,
"loss": 1.7015,
"step": 1340
},
{
"epoch": 0.79,
"grad_norm": 0.2572289705276489,
"learning_rate": 3.294478548954754e-05,
"loss": 1.6442,
"step": 1350
},
{
"epoch": 0.8,
"grad_norm": 0.2430019974708557,
"learning_rate": 3.272542485937369e-05,
"loss": 1.6863,
"step": 1360
},
{
"epoch": 0.81,
"grad_norm": 0.2512265741825104,
"learning_rate": 3.250540465874382e-05,
"loss": 1.6767,
"step": 1370
},
{
"epoch": 0.81,
"grad_norm": 0.2533564567565918,
"learning_rate": 3.228474367223312e-05,
"loss": 1.6692,
"step": 1380
},
{
"epoch": 0.82,
"grad_norm": 0.25457942485809326,
"learning_rate": 3.206346073912488e-05,
"loss": 1.6627,
"step": 1390
},
{
"epoch": 0.82,
"grad_norm": 0.24955084919929504,
"learning_rate": 3.1841574751802076e-05,
"loss": 1.6625,
"step": 1400
},
{
"epoch": 0.83,
"grad_norm": 0.2384202629327774,
"learning_rate": 3.1619104654134395e-05,
"loss": 1.7596,
"step": 1410
},
{
"epoch": 0.84,
"grad_norm": 0.27582287788391113,
"learning_rate": 3.1396069439860894e-05,
"loss": 1.7282,
"step": 1420
},
{
"epoch": 0.84,
"grad_norm": 0.2678006887435913,
"learning_rate": 3.117248815096833e-05,
"loss": 1.6456,
"step": 1430
},
{
"epoch": 0.85,
"grad_norm": 0.23636989295482635,
"learning_rate": 3.094837987606547e-05,
"loss": 1.5889,
"step": 1440
},
{
"epoch": 0.85,
"grad_norm": 0.2650662362575531,
"learning_rate": 3.072376374875335e-05,
"loss": 1.6597,
"step": 1450
},
{
"epoch": 0.86,
"grad_norm": 0.2590391933917999,
"learning_rate": 3.049865894599172e-05,
"loss": 1.6503,
"step": 1460
},
{
"epoch": 0.86,
"grad_norm": 0.26722925901412964,
"learning_rate": 3.027308468646175e-05,
"loss": 1.6738,
"step": 1470
},
{
"epoch": 0.87,
"grad_norm": 0.23559637367725372,
"learning_rate": 3.0047060228925256e-05,
"loss": 1.5497,
"step": 1480
},
{
"epoch": 0.88,
"grad_norm": 0.2601984143257141,
"learning_rate": 2.9820604870580427e-05,
"loss": 1.5991,
"step": 1490
},
{
"epoch": 0.88,
"grad_norm": 0.24394232034683228,
"learning_rate": 2.9593737945414264e-05,
"loss": 1.7422,
"step": 1500
},
{
"epoch": 0.89,
"grad_norm": 0.28175410628318787,
"learning_rate": 2.9366478822551975e-05,
"loss": 1.7054,
"step": 1510
},
{
"epoch": 0.89,
"grad_norm": 0.2808462977409363,
"learning_rate": 2.913884690460325e-05,
"loss": 1.6179,
"step": 1520
},
{
"epoch": 0.9,
"grad_norm": 0.2547930181026459,
"learning_rate": 2.8910861626005776e-05,
"loss": 1.6518,
"step": 1530
},
{
"epoch": 0.91,
"grad_norm": 0.23432618379592896,
"learning_rate": 2.868254245136594e-05,
"loss": 1.6254,
"step": 1540
},
{
"epoch": 0.91,
"grad_norm": 0.24667441844940186,
"learning_rate": 2.8453908873797058e-05,
"loss": 1.6296,
"step": 1550
},
{
"epoch": 0.92,
"grad_norm": 0.2473803460597992,
"learning_rate": 2.8224980413255086e-05,
"loss": 1.6373,
"step": 1560
},
{
"epoch": 0.92,
"grad_norm": 0.2975025475025177,
"learning_rate": 2.7995776614872084e-05,
"loss": 1.6004,
"step": 1570
},
{
"epoch": 0.93,
"grad_norm": 0.2573077976703644,
"learning_rate": 2.776631704728752e-05,
"loss": 1.6139,
"step": 1580
},
{
"epoch": 0.94,
"grad_norm": 0.24170945584774017,
"learning_rate": 2.7536621300977576e-05,
"loss": 1.6792,
"step": 1590
},
{
"epoch": 0.94,
"grad_norm": 0.31662338972091675,
"learning_rate": 2.7306708986582553e-05,
"loss": 1.5272,
"step": 1600
},
{
"epoch": 0.95,
"grad_norm": 0.24606853723526,
"learning_rate": 2.70765997332326e-05,
"loss": 1.6827,
"step": 1610
},
{
"epoch": 0.95,
"grad_norm": 0.2409406304359436,
"learning_rate": 2.6846313186871853e-05,
"loss": 1.6965,
"step": 1620
},
{
"epoch": 0.96,
"grad_norm": 0.24571745097637177,
"learning_rate": 2.6615869008581107e-05,
"loss": 1.6486,
"step": 1630
},
{
"epoch": 0.96,
"grad_norm": 0.23941560089588165,
"learning_rate": 2.638528687289925e-05,
"loss": 1.6894,
"step": 1640
},
{
"epoch": 0.97,
"grad_norm": 0.2921188473701477,
"learning_rate": 2.6154586466143495e-05,
"loss": 1.5936,
"step": 1650
},
{
"epoch": 0.98,
"grad_norm": 0.2708994150161743,
"learning_rate": 2.592378748472863e-05,
"loss": 1.5613,
"step": 1660
},
{
"epoch": 0.98,
"grad_norm": 0.28650519251823425,
"learning_rate": 2.569290963348541e-05,
"loss": 1.6684,
"step": 1670
},
{
"epoch": 0.99,
"grad_norm": 0.28054314851760864,
"learning_rate": 2.5461972623978247e-05,
"loss": 1.6376,
"step": 1680
},
{
"epoch": 0.99,
"grad_norm": 0.23524071276187897,
"learning_rate": 2.5230996172822275e-05,
"loss": 1.6724,
"step": 1690
},
{
"epoch": 1.0,
"grad_norm": 0.32756513357162476,
"learning_rate": 2.5e-05,
"loss": 1.6852,
"step": 1700
}
],
"logging_steps": 10,
"max_steps": 3400,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"total_flos": 3.332749990819791e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}