{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1623,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
|
|
{
|
|
"epoch": 0.0030807147258163892,
|
|
"grad_norm": 1.9506609439849854,
|
|
"learning_rate": 9.756097560975611e-06,
|
|
"loss": 1.0043,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.0061614294516327784,
|
|
"grad_norm": 1.3290749788284302,
|
|
"learning_rate": 2.1951219512195124e-05,
|
|
"loss": 1.0881,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.009242144177449169,
|
|
"grad_norm": 1.0360522270202637,
|
|
"learning_rate": 3.414634146341464e-05,
|
|
"loss": 1.1051,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.012322858903265557,
|
|
"grad_norm": 1.3245832920074463,
|
|
"learning_rate": 4.634146341463415e-05,
|
|
"loss": 0.9605,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.015403573629081947,
|
|
"grad_norm": 3.1614928245544434,
|
|
"learning_rate": 5.853658536585366e-05,
|
|
"loss": 0.8698,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.018484288354898338,
|
|
"grad_norm": 1.4028220176696777,
|
|
"learning_rate": 7.073170731707317e-05,
|
|
"loss": 0.8835,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.021565003080714726,
|
|
"grad_norm": 0.9983139634132385,
|
|
"learning_rate": 8.292682926829268e-05,
|
|
"loss": 0.9123,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.024645717806531114,
|
|
"grad_norm": 1.0538513660430908,
|
|
"learning_rate": 9.51219512195122e-05,
|
|
"loss": 0.885,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.027726432532347505,
|
|
"grad_norm": 1.131333827972412,
|
|
"learning_rate": 0.00010731707317073172,
|
|
"loss": 0.8446,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.030807147258163893,
|
|
"grad_norm": 2.132134199142456,
|
|
"learning_rate": 0.00011951219512195122,
|
|
"loss": 0.7956,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.033887861983980284,
|
|
"grad_norm": 1.0456063747406006,
|
|
"learning_rate": 0.00013170731707317076,
|
|
"loss": 0.8816,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.036968576709796676,
|
|
"grad_norm": 0.8690502643585205,
|
|
"learning_rate": 0.00014390243902439025,
|
|
"loss": 0.8685,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.04004929143561306,
|
|
"grad_norm": 1.0242969989776611,
|
|
"learning_rate": 0.00015609756097560978,
|
|
"loss": 0.8768,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.04313000616142945,
|
|
"grad_norm": 1.328539490699768,
|
|
"learning_rate": 0.00016829268292682927,
|
|
"loss": 0.9351,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.04621072088724584,
|
|
"grad_norm": 1.3852390050888062,
|
|
"learning_rate": 0.0001804878048780488,
|
|
"loss": 0.786,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.04929143561306223,
|
|
"grad_norm": 0.9410791397094727,
|
|
"learning_rate": 0.0001926829268292683,
|
|
"loss": 0.8309,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.05237215033887862,
|
|
"grad_norm": 0.8410763144493103,
|
|
"learning_rate": 0.0001999991687649223,
|
|
"loss": 0.9027,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.05545286506469501,
|
|
"grad_norm": 0.915637195110321,
|
|
"learning_rate": 0.00019998981752900036,
|
|
"loss": 0.9057,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.0585335797905114,
|
|
"grad_norm": 1.2468522787094116,
|
|
"learning_rate": 0.00019997007698817557,
|
|
"loss": 0.9095,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.061614294516327786,
|
|
"grad_norm": 1.3756437301635742,
|
|
"learning_rate": 0.00019993994919356167,
|
|
"loss": 0.8025,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.06469500924214418,
|
|
"grad_norm": 0.8679229021072388,
|
|
"learning_rate": 0.00019989943727554598,
|
|
"loss": 0.8418,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.06777572396796057,
|
|
"grad_norm": 0.8622983694076538,
|
|
"learning_rate": 0.00019984854544346367,
|
|
"loss": 0.8821,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.07085643869377696,
|
|
"grad_norm": 0.8575367331504822,
|
|
"learning_rate": 0.00019978727898516086,
|
|
"loss": 0.9431,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.07393715341959335,
|
|
"grad_norm": 1.2082788944244385,
|
|
"learning_rate": 0.0001997156442664449,
|
|
"loss": 0.8149,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.07701786814540973,
|
|
"grad_norm": 1.507377028465271,
|
|
"learning_rate": 0.00019963364873042298,
|
|
"loss": 0.7926,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.08009858287122612,
|
|
"grad_norm": 0.8779996037483215,
|
|
"learning_rate": 0.0001995413008967289,
|
|
"loss": 0.923,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.08317929759704251,
|
|
"grad_norm": 0.8468291163444519,
|
|
"learning_rate": 0.00019943861036063768,
|
|
"loss": 0.8893,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.0862600123228589,
|
|
"grad_norm": 0.7504271864891052,
|
|
"learning_rate": 0.00019932558779206874,
|
|
"loss": 0.8932,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.0893407270486753,
|
|
"grad_norm": 1.057389736175537,
|
|
"learning_rate": 0.00019920224493447702,
|
|
"loss": 0.8907,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.09242144177449169,
|
|
"grad_norm": 1.2131892442703247,
|
|
"learning_rate": 0.00019906859460363307,
|
|
"loss": 0.7649,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.09550215650030808,
|
|
"grad_norm": 0.7876495718955994,
|
|
"learning_rate": 0.00019892465068629131,
|
|
"loss": 0.8601,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.09858287122612445,
|
|
"grad_norm": 0.7372773885726929,
|
|
"learning_rate": 0.0001987704281387471,
|
|
"loss": 0.9682,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.10166358595194085,
|
|
"grad_norm": 0.9224637150764465,
|
|
"learning_rate": 0.00019860594298528282,
|
|
"loss": 0.8882,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.10474430067775724,
|
|
"grad_norm": 1.195654034614563,
|
|
"learning_rate": 0.0001984312123165028,
|
|
"loss": 0.9287,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.10782501540357363,
|
|
"grad_norm": 1.0784906148910522,
|
|
"learning_rate": 0.0001982462542875576,
|
|
"loss": 0.7491,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.11090573012939002,
|
|
"grad_norm": 0.6587386131286621,
|
|
"learning_rate": 0.00019805108811625773,
|
|
"loss": 0.8577,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.11398644485520641,
|
|
"grad_norm": 0.699715256690979,
|
|
"learning_rate": 0.00019784573408107657,
|
|
"loss": 0.8966,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.1170671595810228,
|
|
"grad_norm": 0.9360162615776062,
|
|
"learning_rate": 0.00019763021351904358,
|
|
"loss": 0.8773,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.12014787430683918,
|
|
"grad_norm": 1.123854160308838,
|
|
"learning_rate": 0.00019740454882352732,
|
|
"loss": 0.8704,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.12322858903265557,
|
|
"grad_norm": 1.2856158018112183,
|
|
"learning_rate": 0.0001971687634419086,
|
|
"loss": 0.7832,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.12630930375847196,
|
|
"grad_norm": 0.7385960221290588,
|
|
"learning_rate": 0.0001969228818731442,
|
|
"loss": 0.8582,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.12939001848428835,
|
|
"grad_norm": 0.6473488211631775,
|
|
"learning_rate": 0.00019666692966522145,
|
|
"loss": 0.8792,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.13247073321010475,
|
|
"grad_norm": 0.746126115322113,
|
|
"learning_rate": 0.00019640093341250357,
|
|
"loss": 0.8736,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.13555144793592114,
|
|
"grad_norm": 1.0111432075500488,
|
|
"learning_rate": 0.0001961249207529665,
|
|
"loss": 0.8853,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.13863216266173753,
|
|
"grad_norm": 1.2317752838134766,
|
|
"learning_rate": 0.00019583892036532726,
|
|
"loss": 0.7865,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.14171287738755392,
|
|
"grad_norm": 0.6940191388130188,
|
|
"learning_rate": 0.00019554296196606395,
|
|
"loss": 0.8703,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.1447935921133703,
|
|
"grad_norm": 0.7127991318702698,
|
|
"learning_rate": 0.00019523707630632835,
|
|
"loss": 0.8262,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.1478743068391867,
|
|
"grad_norm": 0.6165663003921509,
|
|
"learning_rate": 0.00019492129516875055,
|
|
"loss": 0.9039,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.15095502156500307,
|
|
"grad_norm": 1.2010482549667358,
|
|
"learning_rate": 0.00019459565136413666,
|
|
"loss": 0.9394,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.15403573629081946,
|
|
"grad_norm": 1.4223132133483887,
|
|
"learning_rate": 0.0001942601787280598,
|
|
"loss": 0.7718,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.15711645101663585,
|
|
"grad_norm": 0.8233282566070557,
|
|
"learning_rate": 0.00019391491211734425,
|
|
"loss": 0.8706,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.16019716574245224,
|
|
"grad_norm": 0.6466130018234253,
|
|
"learning_rate": 0.0001935598874064438,
|
|
"loss": 0.8579,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.16327788046826863,
|
|
"grad_norm": 0.7207822799682617,
|
|
"learning_rate": 0.00019319514148371435,
|
|
"loss": 0.7989,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.16635859519408502,
|
|
"grad_norm": 1.073905110359192,
|
|
"learning_rate": 0.00019282071224758091,
|
|
"loss": 0.8333,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.16943930991990142,
|
|
"grad_norm": 1.2006767988204956,
|
|
"learning_rate": 0.00019243663860259993,
|
|
"loss": 0.7993,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 0.1725200246457178,
|
|
"grad_norm": 0.6434801816940308,
|
|
"learning_rate": 0.00019204296045541685,
|
|
"loss": 0.8851,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.1756007393715342,
|
|
"grad_norm": 0.6209431290626526,
|
|
"learning_rate": 0.0001916397187106199,
|
|
"loss": 0.8257,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 0.1786814540973506,
|
|
"grad_norm": 0.5881760120391846,
|
|
"learning_rate": 0.00019122695526648968,
|
|
"loss": 0.8571,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.18176216882316698,
|
|
"grad_norm": 1.2488080263137817,
|
|
"learning_rate": 0.00019080471301064598,
|
|
"loss": 0.895,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 0.18484288354898337,
|
|
"grad_norm": 1.179758071899414,
|
|
"learning_rate": 0.00019037303581559143,
|
|
"loss": 0.7412,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.18792359827479976,
|
|
"grad_norm": 0.6266173720359802,
|
|
"learning_rate": 0.00018993196853415317,
|
|
"loss": 0.8424,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 0.19100431300061615,
|
|
"grad_norm": 0.781024694442749,
|
|
"learning_rate": 0.00018948155699482244,
|
|
"loss": 0.818,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.19408502772643252,
|
|
"grad_norm": 0.857995331287384,
|
|
"learning_rate": 0.00018902184799699263,
|
|
"loss": 0.9148,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 0.1971657424522489,
|
|
"grad_norm": 0.7764838933944702,
|
|
"learning_rate": 0.00018855288930609692,
|
|
"loss": 0.7969,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.2002464571780653,
|
|
"grad_norm": 0.9209476113319397,
|
|
"learning_rate": 0.00018807472964864515,
|
|
"loss": 0.7768,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 0.2033271719038817,
|
|
"grad_norm": 0.6861506104469299,
|
|
"learning_rate": 0.00018758741870716092,
|
|
"loss": 0.8849,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.20640788662969808,
|
|
"grad_norm": 0.5947197675704956,
|
|
"learning_rate": 0.00018709100711501955,
|
|
"loss": 0.8672,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 0.20948860135551448,
|
|
"grad_norm": 0.6780321002006531,
|
|
"learning_rate": 0.0001865855464511869,
|
|
"loss": 0.7919,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.21256931608133087,
|
|
"grad_norm": 1.1705702543258667,
|
|
"learning_rate": 0.00018607108923486025,
|
|
"loss": 0.7502,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 0.21565003080714726,
|
|
"grad_norm": 1.004110336303711,
|
|
"learning_rate": 0.00018554768892001136,
|
|
"loss": 0.7369,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.21873074553296365,
|
|
"grad_norm": 0.6980533003807068,
|
|
"learning_rate": 0.00018501539988983234,
|
|
"loss": 0.7703,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 0.22181146025878004,
|
|
"grad_norm": 0.6246406435966492,
|
|
"learning_rate": 0.0001844742774510851,
|
|
"loss": 0.8441,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.22489217498459643,
|
|
"grad_norm": 0.5528222322463989,
|
|
"learning_rate": 0.00018392437782835475,
|
|
"loss": 0.8385,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 0.22797288971041282,
|
|
"grad_norm": 0.9303919076919556,
|
|
"learning_rate": 0.00018336575815820766,
|
|
"loss": 0.8384,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.23105360443622922,
|
|
"grad_norm": 1.3066872358322144,
|
|
"learning_rate": 0.00018279847648325478,
|
|
"loss": 0.7767,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 0.2341343191620456,
|
|
"grad_norm": 0.7670502662658691,
|
|
"learning_rate": 0.0001822225917461208,
|
|
"loss": 0.8032,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.23721503388786197,
|
|
"grad_norm": 0.7942723631858826,
|
|
"learning_rate": 0.0001816381637833198,
|
|
"loss": 0.8288,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 0.24029574861367836,
|
|
"grad_norm": 0.6688534617424011,
|
|
"learning_rate": 0.00018104525331903799,
|
|
"loss": 0.8631,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.24337646333949475,
|
|
"grad_norm": 0.9976694583892822,
|
|
"learning_rate": 0.00018044392195882427,
|
|
"loss": 0.9414,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 0.24645717806531114,
|
|
"grad_norm": 1.142532229423523,
|
|
"learning_rate": 0.00017983423218318918,
|
|
"loss": 0.7797,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.24953789279112754,
|
|
"grad_norm": 0.6933532953262329,
|
|
"learning_rate": 0.00017921624734111292,
|
|
"loss": 0.8074,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 0.2526186075169439,
|
|
"grad_norm": 0.7068799138069153,
|
|
"learning_rate": 0.00017859003164346336,
|
|
"loss": 0.9038,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.2556993222427603,
|
|
"grad_norm": 0.7113360166549683,
|
|
"learning_rate": 0.0001779556501563239,
|
|
"loss": 0.8077,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 0.2587800369685767,
|
|
"grad_norm": 1.0284174680709839,
|
|
"learning_rate": 0.00017731316879423327,
|
|
"loss": 0.9117,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.2618607516943931,
|
|
"grad_norm": 1.0717129707336426,
|
|
"learning_rate": 0.00017666265431333654,
|
|
"loss": 0.8241,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 0.2649414664202095,
|
|
"grad_norm": 0.6098302006721497,
|
|
"learning_rate": 0.000176004174304449,
|
|
"loss": 0.8526,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.2680221811460259,
|
|
"grad_norm": 0.6730021834373474,
|
|
"learning_rate": 0.00017533779718603313,
|
|
"loss": 0.8473,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 0.2711028958718423,
|
|
"grad_norm": 0.627232551574707,
|
|
"learning_rate": 0.00017466359219708985,
|
|
"loss": 0.7787,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.27418361059765867,
|
|
"grad_norm": 0.7510082125663757,
|
|
"learning_rate": 0.00017398162938996422,
|
|
"loss": 0.7869,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 0.27726432532347506,
|
|
"grad_norm": 0.800914466381073,
|
|
"learning_rate": 0.00017329197962306664,
|
|
"loss": 0.7307,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.28034504004929145,
|
|
"grad_norm": 0.688615083694458,
|
|
"learning_rate": 0.00017259471455351072,
|
|
"loss": 0.8078,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 0.28342575477510784,
|
|
"grad_norm": 0.6369125247001648,
|
|
"learning_rate": 0.0001718899066296675,
|
|
"loss": 0.827,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.28650646950092423,
|
|
"grad_norm": 0.7632527351379395,
|
|
"learning_rate": 0.000171177629083638,
|
|
"loss": 0.8171,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 0.2895871842267406,
|
|
"grad_norm": 0.8901572227478027,
|
|
"learning_rate": 0.0001704579559236441,
|
|
"loss": 0.8534,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.292667898952557,
|
|
"grad_norm": 1.2739449739456177,
|
|
"learning_rate": 0.00016973096192633884,
|
|
"loss": 0.718,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 0.2957486136783734,
|
|
"grad_norm": 0.655415952205658,
|
|
"learning_rate": 0.00016899672262903677,
|
|
"loss": 0.7889,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.2988293284041898,
|
|
"grad_norm": 0.6401821970939636,
|
|
"learning_rate": 0.00016825531432186543,
|
|
"loss": 0.9173,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 0.30191004313000613,
|
|
"grad_norm": 0.5191354155540466,
|
|
"learning_rate": 0.00016750681403983846,
|
|
"loss": 0.931,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.3049907578558225,
|
|
"grad_norm": 0.8743457794189453,
|
|
"learning_rate": 0.00016675129955485152,
|
|
"loss": 0.8111,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 0.3080714725816389,
|
|
"grad_norm": 1.0666691064834595,
|
|
"learning_rate": 0.00016598884936760131,
|
|
"loss": 0.736,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.3111521873074553,
|
|
"grad_norm": 0.606925368309021,
|
|
"learning_rate": 0.00016521954269942918,
|
|
"loss": 0.8742,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 0.3142329020332717,
|
|
"grad_norm": 0.5690024495124817,
|
|
"learning_rate": 0.00016444345948408984,
|
|
"loss": 0.8288,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.3173136167590881,
|
|
"grad_norm": 0.7045788168907166,
|
|
"learning_rate": 0.0001636606803594457,
|
|
"loss": 0.8422,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 0.3203943314849045,
|
|
"grad_norm": 0.9720426201820374,
|
|
"learning_rate": 0.0001628712866590885,
|
|
"loss": 0.8012,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.3234750462107209,
|
|
"grad_norm": 1.1168466806411743,
|
|
"learning_rate": 0.00016207536040388845,
|
|
"loss": 0.7338,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 0.32655576093653726,
|
|
"grad_norm": 0.6451901197433472,
|
|
"learning_rate": 0.0001612729842934718,
|
|
"loss": 0.8471,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.32963647566235366,
|
|
"grad_norm": 0.7933263778686523,
|
|
"learning_rate": 0.00016046424169762827,
|
|
"loss": 0.8995,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 0.33271719038817005,
|
|
"grad_norm": 0.6123836636543274,
|
|
"learning_rate": 0.0001596492166476485,
|
|
"loss": 0.8341,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.33579790511398644,
|
|
"grad_norm": 0.8381984233856201,
|
|
"learning_rate": 0.0001588279938275929,
|
|
"loss": 0.8531,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 0.33887861983980283,
|
|
"grad_norm": 1.155434250831604,
|
|
"learning_rate": 0.00015800065856549269,
|
|
"loss": 0.6915,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.3419593345656192,
|
|
"grad_norm": 0.634437084197998,
|
|
"learning_rate": 0.00015716729682448393,
|
|
"loss": 0.8241,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 0.3450400492914356,
|
|
"grad_norm": 0.6219022274017334,
|
|
"learning_rate": 0.0001563279951938758,
|
|
"loss": 0.8461,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.348120764017252,
|
|
"grad_norm": 0.4845116138458252,
|
|
"learning_rate": 0.00015548284088015354,
|
|
"loss": 0.8311,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 0.3512014787430684,
|
|
"grad_norm": 0.9370896816253662,
|
|
"learning_rate": 0.00015463192169791741,
|
|
"loss": 0.8278,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.3542821934688848,
|
|
"grad_norm": 1.1633367538452148,
|
|
"learning_rate": 0.0001537753260607584,
|
|
"loss": 0.7536,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 0.3573629081947012,
|
|
"grad_norm": 0.5799803733825684,
|
|
"learning_rate": 0.00015291314297207175,
|
|
"loss": 0.7999,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.36044362292051757,
|
|
"grad_norm": 0.47321441769599915,
|
|
"learning_rate": 0.0001520454620158093,
|
|
"loss": 0.8864,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 0.36352433764633396,
|
|
"grad_norm": 0.5589901804924011,
|
|
"learning_rate": 0.00015117237334717117,
|
|
"loss": 0.7921,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.36660505237215035,
|
|
"grad_norm": 0.9199019074440002,
|
|
"learning_rate": 0.00015029396768323846,
|
|
"loss": 0.8999,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 0.36968576709796674,
|
|
"grad_norm": 1.062154769897461,
|
|
"learning_rate": 0.00014941033629354734,
|
|
"loss": 0.815,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.37276648182378314,
|
|
"grad_norm": 0.7027643918991089,
|
|
"learning_rate": 0.00014852157099060596,
|
|
"loss": 0.8644,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 0.3758471965495995,
|
|
"grad_norm": 0.5963114500045776,
|
|
"learning_rate": 0.00014762776412035456,
|
|
"loss": 0.87,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.3789279112754159,
|
|
"grad_norm": 0.5938383340835571,
|
|
"learning_rate": 0.00014672900855257056,
|
|
"loss": 0.8137,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 0.3820086260012323,
|
|
"grad_norm": 0.9398343563079834,
|
|
"learning_rate": 0.00014582539767121904,
|
|
"loss": 0.8821,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.3850893407270487,
|
|
"grad_norm": 0.99173504114151,
|
|
"learning_rate": 0.0001449170253647498,
|
|
"loss": 0.6784,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 0.38817005545286504,
|
|
"grad_norm": 0.7030093669891357,
|
|
"learning_rate": 0.0001440039860163419,
|
|
"loss": 0.832,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.39125077017868143,
|
|
"grad_norm": 0.6641373038291931,
|
|
"learning_rate": 0.00014308637449409706,
|
|
"loss": 0.826,
|
|
"step": 635
|
|
},
|
|
{
|
|
"epoch": 0.3943314849044978,
|
|
"grad_norm": 0.6074934005737305,
|
|
"learning_rate": 0.00014216428614118243,
|
|
"loss": 0.8566,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.3974121996303142,
|
|
"grad_norm": 0.8580813407897949,
|
|
"learning_rate": 0.00014123781676592418,
|
|
"loss": 0.8423,
|
|
"step": 645
|
|
},
|
|
{
|
|
"epoch": 0.4004929143561306,
|
|
"grad_norm": 1.0865176916122437,
|
|
"learning_rate": 0.00014030706263185247,
|
|
"loss": 0.769,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.403573629081947,
|
|
"grad_norm": 0.6008116602897644,
|
|
"learning_rate": 0.00013937212044769955,
|
|
"loss": 0.8003,
|
|
"step": 655
|
|
},
|
|
{
|
|
"epoch": 0.4066543438077634,
|
|
"grad_norm": 0.7620061635971069,
|
|
"learning_rate": 0.0001384330873573513,
|
|
"loss": 0.863,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.4097350585335798,
|
|
"grad_norm": 0.5859779119491577,
|
|
"learning_rate": 0.00013749006092975347,
|
|
"loss": 0.8224,
|
|
"step": 665
|
|
},
|
|
{
|
|
"epoch": 0.41281577325939617,
|
|
"grad_norm": 0.8230900764465332,
|
|
"learning_rate": 0.00013654313914877414,
|
|
"loss": 0.8245,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.41589648798521256,
|
|
"grad_norm": 0.9619393348693848,
|
|
"learning_rate": 0.00013559242040302272,
|
|
"loss": 0.7234,
|
|
"step": 675
|
|
},
|
|
{
|
|
"epoch": 0.41897720271102895,
|
|
"grad_norm": 0.687427282333374,
|
|
"learning_rate": 0.00013463800347562706,
|
|
"loss": 0.8489,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.42205791743684534,
|
|
"grad_norm": 0.5357842445373535,
|
|
"learning_rate": 0.00013367998753396944,
|
|
"loss": 0.755,
|
|
"step": 685
|
|
},
|
|
{
|
|
"epoch": 0.42513863216266173,
|
|
"grad_norm": 0.5615360736846924,
|
|
"learning_rate": 0.00013271847211938285,
|
|
"loss": 0.8116,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.4282193468884781,
|
|
"grad_norm": 1.0073713064193726,
|
|
"learning_rate": 0.0001317535571368082,
|
|
"loss": 0.8125,
|
|
"step": 695
|
|
},
|
|
{
|
|
"epoch": 0.4313000616142945,
|
|
"grad_norm": 0.9638437032699585,
|
|
"learning_rate": 0.00013078534284441382,
|
|
"loss": 0.7871,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.4343807763401109,
|
|
"grad_norm": 0.6262618899345398,
|
|
"learning_rate": 0.00012981392984317834,
|
|
"loss": 0.7716,
|
|
"step": 705
|
|
},
|
|
{
|
|
"epoch": 0.4374614910659273,
|
|
"grad_norm": 0.6202210187911987,
|
|
"learning_rate": 0.00012883941906643786,
|
|
"loss": 0.8464,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.4405422057917437,
|
|
"grad_norm": 0.636193037033081,
|
|
"learning_rate": 0.00012786191176939848,
|
|
"loss": 0.7936,
|
|
"step": 715
|
|
},
|
|
{
|
|
"epoch": 0.4436229205175601,
|
|
"grad_norm": 0.7928021550178528,
|
|
"learning_rate": 0.00012688150951861582,
|
|
"loss": 0.7535,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.4467036352433765,
|
|
"grad_norm": 1.1193562746047974,
|
|
"learning_rate": 0.00012589831418144154,
|
|
"loss": 0.7378,
|
|
"step": 725
|
|
},
|
|
{
|
|
"epoch": 0.44978434996919286,
|
|
"grad_norm": 0.5501682162284851,
|
|
"learning_rate": 0.00012491242791543922,
|
|
"loss": 0.8251,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.45286506469500926,
|
|
"grad_norm": 0.5638311505317688,
|
|
"learning_rate": 0.00012392395315776963,
|
|
"loss": 0.8488,
|
|
"step": 735
|
|
},
|
|
{
|
|
"epoch": 0.45594577942082565,
|
|
"grad_norm": 0.5718980431556702,
|
|
"learning_rate": 0.00012293299261454725,
|
|
"loss": 0.8058,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.45902649414664204,
|
|
"grad_norm": 0.9007247090339661,
|
|
"learning_rate": 0.00012193964925016872,
|
|
"loss": 0.7745,
|
|
"step": 745
|
|
},
|
|
{
|
|
"epoch": 0.46210720887245843,
|
|
"grad_norm": 1.2628989219665527,
|
|
"learning_rate": 0.00012094402627661447,
|
|
"loss": 0.7156,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.4651879235982748,
|
|
"grad_norm": 0.6456303596496582,
|
|
"learning_rate": 0.00011994622714272448,
|
|
"loss": 0.8305,
|
|
"step": 755
|
|
},
|
|
{
|
|
"epoch": 0.4682686383240912,
|
|
"grad_norm": 0.7610649466514587,
|
|
"learning_rate": 0.00011894635552344975,
|
|
"loss": 0.8419,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.4713493530499076,
|
|
"grad_norm": 0.415209025144577,
|
|
"learning_rate": 0.00011794451530908011,
|
|
"loss": 0.7674,
|
|
"step": 765
|
|
},
|
|
{
|
|
"epoch": 0.47443006777572394,
|
|
"grad_norm": 0.8516692519187927,
|
|
"learning_rate": 0.00011694081059444946,
|
|
"loss": 0.8148,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.47751078250154033,
|
|
"grad_norm": 1.0319640636444092,
|
|
"learning_rate": 0.0001159353456681201,
|
|
"loss": 0.7435,
|
|
"step": 775
|
|
},
|
|
{
|
|
"epoch": 0.4805914972273567,
|
|
"grad_norm": 0.5645594000816345,
|
|
"learning_rate": 0.00011492822500154667,
|
|
"loss": 0.7572,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.4836722119531731,
|
|
"grad_norm": 0.5694653987884521,
|
|
"learning_rate": 0.00011391955323822126,
|
|
"loss": 0.7624,
|
|
"step": 785
|
|
},
|
|
{
|
|
"epoch": 0.4867529266789895,
|
|
"grad_norm": 0.5023643374443054,
|
|
"learning_rate": 0.00011290943518280057,
|
|
"loss": 0.8524,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.4898336414048059,
|
|
"grad_norm": 0.9564359784126282,
|
|
"learning_rate": 0.0001118979757902162,
|
|
"loss": 0.7575,
|
|
"step": 795
|
|
},
|
|
{
|
|
"epoch": 0.4929143561306223,
|
|
"grad_norm": 1.1121801137924194,
|
|
"learning_rate": 0.00011088528015476964,
|
|
"loss": 0.7799,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.4959950708564387,
|
|
"grad_norm": 0.5354447364807129,
|
|
"learning_rate": 0.00010987145349921251,
|
|
"loss": 0.7722,
|
|
"step": 805
|
|
},
|
|
{
|
|
"epoch": 0.49907578558225507,
|
|
"grad_norm": 0.5652374625205994,
|
|
"learning_rate": 0.0001088566011638134,
|
|
"loss": 0.7795,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.5021565003080715,
|
|
"grad_norm": 0.5385531783103943,
|
|
"learning_rate": 0.00010784082859541292,
|
|
"loss": 0.7919,
|
|
"step": 815
|
|
},
|
|
{
|
|
"epoch": 0.5052372150338879,
|
|
"grad_norm": 0.7922360897064209,
|
|
"learning_rate": 0.0001068242413364671,
|
|
"loss": 0.8262,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.5083179297597042,
|
|
"grad_norm": 1.1030114889144897,
|
|
"learning_rate": 0.00010580694501408138,
|
|
"loss": 0.7565,
|
|
"step": 825
|
|
},
|
|
{
|
|
"epoch": 0.5113986444855206,
|
|
"grad_norm": 0.5422970056533813,
|
|
"learning_rate": 0.00010478904532903535,
|
|
"loss": 0.7479,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.514479359211337,
|
|
"grad_norm": 0.5856009125709534,
|
|
"learning_rate": 0.00010377064804480025,
|
|
"loss": 0.8519,
|
|
"step": 835
|
|
},
|
|
{
|
|
"epoch": 0.5175600739371534,
|
|
"grad_norm": 0.5326692461967468,
|
|
"learning_rate": 0.00010275185897654971,
|
|
"loss": 0.7604,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.5206407886629698,
|
|
"grad_norm": 0.9409604072570801,
|
|
"learning_rate": 0.00010173278398016501,
|
|
"loss": 0.7729,
|
|
"step": 845
|
|
},
|
|
{
|
|
"epoch": 0.5237215033887862,
|
|
"grad_norm": 0.8584169149398804,
|
|
"learning_rate": 0.00010071352894123654,
|
|
"loss": 0.7233,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.5268022181146026,
|
|
"grad_norm": 0.5935032963752747,
|
|
"learning_rate": 9.969419976406165e-05,
|
|
"loss": 0.7798,
|
|
"step": 855
|
|
},
|
|
{
|
|
"epoch": 0.529882932840419,
|
|
"grad_norm": 0.6444724202156067,
|
|
"learning_rate": 9.867490236064108e-05,
|
|
"loss": 0.7783,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.5329636475662354,
|
|
"grad_norm": 0.5421064496040344,
|
|
"learning_rate": 9.765574263967396e-05,
|
|
"loss": 0.8457,
|
|
"step": 865
|
|
},
|
|
{
|
|
"epoch": 0.5360443622920518,
|
|
"grad_norm": 0.7332625985145569,
|
|
"learning_rate": 9.66368264955539e-05,
|
|
"loss": 0.7861,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.5391250770178682,
|
|
"grad_norm": 0.9407272338867188,
|
|
"learning_rate": 9.56182597973658e-05,
|
|
"loss": 0.738,
|
|
"step": 875
|
|
},
|
|
{
|
|
"epoch": 0.5422057917436846,
|
|
"grad_norm": 0.5053693652153015,
|
|
"learning_rate": 9.460014837788605e-05,
|
|
"loss": 0.7868,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.5452865064695009,
|
|
"grad_norm": 0.5454622507095337,
|
|
"learning_rate": 9.358259802258581e-05,
|
|
"loss": 0.8042,
|
|
"step": 885
|
|
},
|
|
{
|
|
"epoch": 0.5483672211953173,
|
|
"grad_norm": 0.5724400281906128,
|
|
"learning_rate": 9.256571445863972e-05,
|
|
"loss": 0.7704,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.5514479359211337,
|
|
"grad_norm": 0.843951404094696,
|
|
"learning_rate": 9.154960334394027e-05,
|
|
"loss": 0.8044,
|
|
"step": 895
|
|
},
|
|
{
|
|
"epoch": 0.5545286506469501,
|
|
"grad_norm": 1.0994093418121338,
|
|
"learning_rate": 9.053437025611973e-05,
|
|
"loss": 0.7098,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.5576093653727665,
|
|
"grad_norm": 0.6525376439094543,
|
|
"learning_rate": 8.952012068158027e-05,
|
|
"loss": 0.8139,
|
|
"step": 905
|
|
},
|
|
{
|
|
"epoch": 0.5606900800985829,
|
|
"grad_norm": 0.49181467294692993,
|
|
"learning_rate": 8.850696000453326e-05,
|
|
"loss": 0.8226,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.5637707948243993,
|
|
"grad_norm": 0.5223445296287537,
|
|
"learning_rate": 8.749499349604993e-05,
|
|
"loss": 0.7899,
|
|
"step": 915
|
|
},
|
|
{
|
|
"epoch": 0.5668515095502157,
|
|
"grad_norm": 0.8057864308357239,
|
|
"learning_rate": 8.64843263031228e-05,
|
|
"loss": 0.7831,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.5699322242760321,
|
|
"grad_norm": 1.0692858695983887,
|
|
"learning_rate": 8.547506343774097e-05,
|
|
"loss": 0.6825,
|
|
"step": 925
|
|
},
|
|
{
|
|
"epoch": 0.5730129390018485,
|
|
"grad_norm": 0.6106774806976318,
|
|
"learning_rate": 8.446730976597878e-05,
|
|
"loss": 0.8087,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.5760936537276649,
|
|
"grad_norm": 0.657316267490387,
|
|
"learning_rate": 8.346116999709975e-05,
|
|
"loss": 0.7957,
|
|
"step": 935
|
|
},
|
|
{
|
|
"epoch": 0.5791743684534812,
|
|
"grad_norm": 0.4877719581127167,
|
|
"learning_rate": 8.245674867267724e-05,
|
|
"loss": 0.7564,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.5822550831792976,
|
|
"grad_norm": 0.8313435912132263,
|
|
"learning_rate": 8.145415015573183e-05,
|
|
"loss": 0.8325,
|
|
"step": 945
|
|
},
|
|
{
|
|
"epoch": 0.585335797905114,
|
|
"grad_norm": 0.9311845302581787,
|
|
"learning_rate": 8.045347861988789e-05,
|
|
"loss": 0.6653,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.5884165126309304,
|
|
"grad_norm": 0.6919590830802917,
|
|
"learning_rate": 7.945483803854936e-05,
|
|
"loss": 0.7861,
|
|
"step": 955
|
|
},
|
|
{
|
|
"epoch": 0.5914972273567468,
|
|
"grad_norm": 0.5540125370025635,
|
|
"learning_rate": 7.845833217409675e-05,
|
|
"loss": 0.8685,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.5945779420825632,
|
|
"grad_norm": 0.5159959197044373,
|
|
"learning_rate": 7.746406456710564e-05,
|
|
"loss": 0.7727,
|
|
"step": 965
|
|
},
|
|
{
|
|
"epoch": 0.5976586568083796,
|
|
"grad_norm": 0.8416092395782471,
|
|
"learning_rate": 7.64721385255886e-05,
|
|
"loss": 0.7798,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.600739371534196,
|
|
"grad_norm": 1.0320320129394531,
|
|
"learning_rate": 7.548265711426104e-05,
|
|
"loss": 0.6814,
|
|
"step": 975
|
|
},
|
|
{
|
|
"epoch": 0.6038200862600123,
|
|
"grad_norm": 0.626734733581543,
|
|
"learning_rate": 7.449572314383237e-05,
|
|
"loss": 0.8495,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.6069008009858287,
|
|
"grad_norm": 0.6575467586517334,
|
|
"learning_rate": 7.351143916032374e-05,
|
|
"loss": 0.8553,
|
|
"step": 985
|
|
},
|
|
{
|
|
"epoch": 0.609981515711645,
|
|
"grad_norm": 0.5149129629135132,
|
|
"learning_rate": 7.252990743441293e-05,
|
|
"loss": 0.7646,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 0.6130622304374614,
|
|
"grad_norm": 0.8680890798568726,
|
|
"learning_rate": 7.155122995080827e-05,
|
|
"loss": 0.8524,
|
|
"step": 995
|
|
},
|
|
{
|
|
"epoch": 0.6161429451632778,
|
|
"grad_norm": 1.129459023475647,
|
|
"learning_rate": 7.057550839765188e-05,
|
|
"loss": 0.7465,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.6192236598890942,
|
|
"grad_norm": 0.7005685567855835,
|
|
"learning_rate": 6.960284415595407e-05,
|
|
"loss": 0.7945,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"epoch": 0.6223043746149106,
|
|
"grad_norm": 0.5934234261512756,
|
|
"learning_rate": 6.863333828905929e-05,
|
|
"loss": 0.9171,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 0.625385089340727,
|
|
"grad_norm": 0.5575773119926453,
|
|
"learning_rate": 6.766709153214542e-05,
|
|
"loss": 0.8228,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"epoch": 0.6284658040665434,
|
|
"grad_norm": 0.7771855592727661,
|
|
"learning_rate": 6.670420428175705e-05,
|
|
"loss": 0.8146,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.6315465187923598,
|
|
"grad_norm": 0.8821945190429688,
|
|
"learning_rate": 6.574477658537375e-05,
|
|
"loss": 0.7083,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"epoch": 0.6346272335181762,
|
|
"grad_norm": 0.6677148342132568,
|
|
"learning_rate": 6.4788908131015e-05,
|
|
"loss": 0.7567,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 0.6377079482439926,
|
|
"grad_norm": 0.5348629951477051,
|
|
"learning_rate": 6.38366982368819e-05,
|
|
"loss": 0.7256,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"epoch": 0.640788662969809,
|
|
"grad_norm": 0.539318323135376,
|
|
"learning_rate": 6.288824584103816e-05,
|
|
"loss": 0.7838,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 0.6438693776956254,
|
|
"grad_norm": 0.8003283739089966,
|
|
"learning_rate": 6.194364949112953e-05,
|
|
"loss": 0.7608,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"epoch": 0.6469500924214417,
|
|
"grad_norm": 0.9320285320281982,
|
|
"learning_rate": 6.100300733414474e-05,
|
|
"loss": 0.6823,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 0.6500308071472581,
|
|
"grad_norm": 0.6213887333869934,
|
|
"learning_rate": 6.0066417106217455e-05,
|
|
"loss": 0.7781,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"epoch": 0.6531115218730745,
|
|
"grad_norm": 0.5485357046127319,
|
|
"learning_rate": 5.9133976122471214e-05,
|
|
"loss": 0.8137,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 0.6561922365988909,
|
|
"grad_norm": 0.5012507438659668,
|
|
"learning_rate": 5.82057812669081e-05,
|
|
"loss": 0.7741,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"epoch": 0.6592729513247073,
|
|
"grad_norm": 0.6728748083114624,
|
|
"learning_rate": 5.728192898234195e-05,
|
|
"loss": 0.7326,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 0.6623536660505237,
|
|
"grad_norm": 1.107875943183899,
|
|
"learning_rate": 5.6362515260377835e-05,
|
|
"loss": 0.676,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"epoch": 0.6654343807763401,
|
|
"grad_norm": 0.6420050263404846,
|
|
"learning_rate": 5.544763563143793e-05,
|
|
"loss": 0.8215,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.6685150955021565,
|
|
"grad_norm": 0.5939176082611084,
|
|
"learning_rate": 5.4537385154835864e-05,
|
|
"loss": 0.7669,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"epoch": 0.6715958102279729,
|
|
"grad_norm": 0.4939870536327362,
|
|
"learning_rate": 5.363185840889935e-05,
|
|
"loss": 0.7439,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 0.6746765249537893,
|
|
"grad_norm": 0.7583256363868713,
|
|
"learning_rate": 5.273114948114346e-05,
|
|
"loss": 0.81,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"epoch": 0.6777572396796057,
|
|
"grad_norm": 1.1161051988601685,
|
|
"learning_rate": 5.1835351958494515e-05,
|
|
"loss": 0.683,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.680837954405422,
|
|
"grad_norm": 0.5778560042381287,
|
|
"learning_rate": 5.094455891756587e-05,
|
|
"loss": 0.7839,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"epoch": 0.6839186691312384,
|
|
"grad_norm": 0.5644716024398804,
|
|
"learning_rate": 5.00588629149872e-05,
|
|
"loss": 0.8279,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 0.6869993838570548,
|
|
"grad_norm": 0.525404691696167,
|
|
"learning_rate": 4.91783559777873e-05,
|
|
"loss": 0.7721,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"epoch": 0.6900800985828712,
|
|
"grad_norm": 0.9260256290435791,
|
|
"learning_rate": 4.830312959383238e-05,
|
|
"loss": 0.7555,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 0.6931608133086876,
|
|
"grad_norm": 0.8119556903839111,
|
|
"learning_rate": 4.7433274702319815e-05,
|
|
"loss": 0.6236,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"epoch": 0.696241528034504,
|
|
"grad_norm": 0.6038579344749451,
|
|
"learning_rate": 4.656888168432962e-05,
|
|
"loss": 0.7604,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 0.6993222427603204,
|
|
"grad_norm": 0.6174635887145996,
|
|
"learning_rate": 4.571004035343315e-05,
|
|
"loss": 0.8142,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"epoch": 0.7024029574861368,
|
|
"grad_norm": 0.470345139503479,
|
|
"learning_rate": 4.485683994636144e-05,
|
|
"loss": 0.7306,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.7054836722119532,
|
|
"grad_norm": 0.8356139659881592,
|
|
"learning_rate": 4.400936911373308e-05,
|
|
"loss": 0.8464,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"epoch": 0.7085643869377696,
|
|
"grad_norm": 0.8929637670516968,
|
|
"learning_rate": 4.3167715910842966e-05,
|
|
"loss": 0.6958,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 0.711645101663586,
|
|
"grad_norm": 0.5765171051025391,
|
|
"learning_rate": 4.2331967788513295e-05,
|
|
"loss": 0.7714,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"epoch": 0.7147258163894024,
|
|
"grad_norm": 0.5743905901908875,
|
|
"learning_rate": 4.1502211584006836e-05,
|
|
"loss": 0.7777,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 0.7178065311152187,
|
|
"grad_norm": 0.47613048553466797,
|
|
"learning_rate": 4.067853351200446e-05,
|
|
"loss": 0.8229,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"epoch": 0.7208872458410351,
|
|
"grad_norm": 0.6945850849151611,
|
|
"learning_rate": 3.986101915564695e-05,
|
|
"loss": 0.7934,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 0.7239679605668515,
|
|
"grad_norm": 1.2221349477767944,
|
|
"learning_rate": 3.904975345764262e-05,
|
|
"loss": 0.7063,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"epoch": 0.7270486752926679,
|
|
"grad_norm": 0.6141952872276306,
|
|
"learning_rate": 3.824482071144163e-05,
|
|
"loss": 0.7582,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.7301293900184843,
|
|
"grad_norm": 0.6697849035263062,
|
|
"learning_rate": 3.744630455247739e-05,
|
|
"loss": 0.7688,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"epoch": 0.7332101047443007,
|
|
"grad_norm": 0.45467403531074524,
|
|
"learning_rate": 3.6654287949476626e-05,
|
|
"loss": 0.7579,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 0.7362908194701171,
|
|
"grad_norm": 0.7754799127578735,
|
|
"learning_rate": 3.586885319583858e-05,
|
|
"loss": 0.7618,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"epoch": 0.7393715341959335,
|
|
"grad_norm": 1.0018911361694336,
|
|
"learning_rate": 3.5090081901084525e-05,
|
|
"loss": 0.6976,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.7424522489217499,
|
|
"grad_norm": 0.6909040808677673,
|
|
"learning_rate": 3.431805498237808e-05,
|
|
"loss": 0.8187,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"epoch": 0.7455329636475663,
|
|
"grad_norm": 0.5961218476295471,
|
|
"learning_rate": 3.355285265611784e-05,
|
|
"loss": 0.7922,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 0.7486136783733827,
|
|
"grad_norm": 0.4505811333656311,
|
|
"learning_rate": 3.279455442960238e-05,
|
|
"loss": 0.7868,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"epoch": 0.751694393099199,
|
|
"grad_norm": 0.7794224619865417,
|
|
"learning_rate": 3.204323909276924e-05,
|
|
"loss": 0.813,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 0.7547751078250154,
|
|
"grad_norm": 0.8165008425712585,
|
|
"learning_rate": 3.1298984710008484e-05,
|
|
"loss": 0.666,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"epoch": 0.7578558225508318,
|
|
"grad_norm": 0.72403484582901,
|
|
"learning_rate": 3.056186861205136e-05,
|
|
"loss": 0.7433,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 0.7609365372766482,
|
|
"grad_norm": 0.5372538566589355,
|
|
"learning_rate": 2.9831967387935467e-05,
|
|
"loss": 0.8582,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"epoch": 0.7640172520024646,
|
|
"grad_norm": 0.5056626796722412,
|
|
"learning_rate": 2.9109356877046712e-05,
|
|
"loss": 0.7281,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 0.767097966728281,
|
|
"grad_norm": 0.8713842630386353,
|
|
"learning_rate": 2.8394112161239605e-05,
|
|
"loss": 0.794,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"epoch": 0.7701786814540974,
|
|
"grad_norm": 1.0365170240402222,
|
|
"learning_rate": 2.7686307557035685e-05,
|
|
"loss": 0.6647,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 0.7732593961799138,
|
|
"grad_norm": 0.534180760383606,
|
|
"learning_rate": 2.6986016607901908e-05,
|
|
"loss": 0.7198,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"epoch": 0.7763401109057301,
|
|
"grad_norm": 0.5429548025131226,
|
|
"learning_rate": 2.629331207660931e-05,
|
|
"loss": 0.7513,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 0.7794208256315465,
|
|
"grad_norm": 0.49868354201316833,
|
|
"learning_rate": 2.5608265937672436e-05,
|
|
"loss": 0.7611,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"epoch": 0.7825015403573629,
|
|
"grad_norm": 0.7774792909622192,
|
|
"learning_rate": 2.4930949369871203e-05,
|
|
"loss": 0.7762,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 0.7855822550831792,
|
|
"grad_norm": 1.040366291999817,
|
|
"learning_rate": 2.426143274885493e-05,
|
|
"loss": 0.635,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"epoch": 0.7886629698089956,
|
|
"grad_norm": 0.6801935434341431,
|
|
"learning_rate": 2.359978563983022e-05,
|
|
"loss": 0.7553,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 0.791743684534812,
|
|
"grad_norm": 0.4836885929107666,
|
|
"learning_rate": 2.2946076790332827e-05,
|
|
"loss": 0.7228,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"epoch": 0.7948243992606284,
|
|
"grad_norm": 0.5009395480155945,
|
|
"learning_rate": 2.2300374123084522e-05,
|
|
"loss": 0.706,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 0.7979051139864448,
|
|
"grad_norm": 0.8395462036132812,
|
|
"learning_rate": 2.166274472893567e-05,
|
|
"loss": 0.7395,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"epoch": 0.8009858287122612,
|
|
"grad_norm": 0.9128504395484924,
|
|
"learning_rate": 2.1033254859894226e-05,
|
|
"loss": 0.6511,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.8040665434380776,
|
|
"grad_norm": 0.4820072650909424,
|
|
"learning_rate": 2.041196992224206e-05,
|
|
"loss": 0.7469,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"epoch": 0.807147258163894,
|
|
"grad_norm": 0.5351672768592834,
|
|
"learning_rate": 1.9798954469738762e-05,
|
|
"loss": 0.7357,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 0.8102279728897104,
|
|
"grad_norm": 0.46705517172813416,
|
|
"learning_rate": 1.919427219691453e-05,
|
|
"loss": 0.7738,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"epoch": 0.8133086876155268,
|
|
"grad_norm": 0.767042875289917,
|
|
"learning_rate": 1.8597985932451856e-05,
|
|
"loss": 0.7527,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 0.8163894023413432,
|
|
"grad_norm": 1.0822641849517822,
|
|
"learning_rate": 1.8010157632657543e-05,
|
|
"loss": 0.6507,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"epoch": 0.8194701170671596,
|
|
"grad_norm": 0.6491265892982483,
|
|
"learning_rate": 1.7430848375025176e-05,
|
|
"loss": 0.7882,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 0.822550831792976,
|
|
"grad_norm": 0.6176543831825256,
|
|
"learning_rate": 1.686011835188891e-05,
|
|
"loss": 0.7986,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"epoch": 0.8256315465187923,
|
|
"grad_norm": 0.41295069456100464,
|
|
"learning_rate": 1.6298026864169335e-05,
|
|
"loss": 0.7427,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 0.8287122612446087,
|
|
"grad_norm": 0.6847618818283081,
|
|
"learning_rate": 1.5744632315211815e-05,
|
|
"loss": 0.7551,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"epoch": 0.8317929759704251,
|
|
"grad_norm": 0.9406694769859314,
|
|
"learning_rate": 1.5199992204718294e-05,
|
|
"loss": 0.6678,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 0.8348736906962415,
|
|
"grad_norm": 0.609660804271698,
|
|
"learning_rate": 1.4664163122772689e-05,
|
|
"loss": 0.7761,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"epoch": 0.8379544054220579,
|
|
"grad_norm": 0.6481778621673584,
|
|
"learning_rate": 1.4137200743961188e-05,
|
|
"loss": 0.7838,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 0.8410351201478743,
|
|
"grad_norm": 0.4781896770000458,
|
|
"learning_rate": 1.3619159821587235e-05,
|
|
"loss": 0.7483,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"epoch": 0.8441158348736907,
|
|
"grad_norm": 0.6182886958122253,
|
|
"learning_rate": 1.3110094181982657e-05,
|
|
"loss": 0.7589,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 0.8471965495995071,
|
|
"grad_norm": 1.1230385303497314,
|
|
"learning_rate": 1.261005671891482e-05,
|
|
"loss": 0.6927,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"epoch": 0.8502772643253235,
|
|
"grad_norm": 0.6041083335876465,
|
|
"learning_rate": 1.2119099388090716e-05,
|
|
"loss": 0.7507,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 0.8533579790511399,
|
|
"grad_norm": 0.5692098140716553,
|
|
"learning_rate": 1.1637273201758748e-05,
|
|
"loss": 0.7854,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"epoch": 0.8564386937769563,
|
|
"grad_norm": 0.4730149209499359,
|
|
"learning_rate": 1.1164628223408168e-05,
|
|
"loss": 0.7449,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 0.8595194085027726,
|
|
"grad_norm": 0.868976354598999,
|
|
"learning_rate": 1.0701213562567492e-05,
|
|
"loss": 0.7069,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"epoch": 0.862600123228589,
|
|
"grad_norm": 1.2989338636398315,
|
|
"learning_rate": 1.0247077369701653e-05,
|
|
"loss": 0.6703,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.8656808379544054,
|
|
"grad_norm": 0.5826016068458557,
|
|
"learning_rate": 9.802266831209206e-06,
|
|
"loss": 0.7252,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"epoch": 0.8687615526802218,
|
|
"grad_norm": 0.5001908540725708,
|
|
"learning_rate": 9.366828164519258e-06,
|
|
"loss": 0.7289,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 0.8718422674060382,
|
|
"grad_norm": 0.49539172649383545,
|
|
"learning_rate": 8.940806613289498e-06,
|
|
"loss": 0.7756,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"epoch": 0.8749229821318546,
|
|
"grad_norm": 0.8087944984436035,
|
|
"learning_rate": 8.524246442705153e-06,
|
|
"loss": 0.7602,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 0.878003696857671,
|
|
"grad_norm": 1.1548421382904053,
|
|
"learning_rate": 8.117190934879593e-06,
|
|
"loss": 0.659,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"epoch": 0.8810844115834874,
|
|
"grad_norm": 0.6721958518028259,
|
|
"learning_rate": 7.719682384357308e-06,
|
|
"loss": 0.7557,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 0.8841651263093038,
|
|
"grad_norm": 0.5357010364532471,
|
|
"learning_rate": 7.33176209371923e-06,
|
|
"loss": 0.7661,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"epoch": 0.8872458410351202,
|
|
"grad_norm": 0.5159677863121033,
|
|
"learning_rate": 6.953470369291348e-06,
|
|
"loss": 0.743,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 0.8903265557609366,
|
|
"grad_norm": 0.8002091646194458,
|
|
"learning_rate": 6.5848465169566e-06,
|
|
"loss": 0.7694,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"epoch": 0.893407270486753,
|
|
"grad_norm": 0.9943484663963318,
|
|
"learning_rate": 6.225928838071016e-06,
|
|
"loss": 0.6581,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 0.8964879852125693,
|
|
"grad_norm": 0.7004392743110657,
|
|
"learning_rate": 5.876754625483904e-06,
|
|
"loss": 0.6769,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"epoch": 0.8995686999383857,
|
|
"grad_norm": 0.5294577479362488,
|
|
"learning_rate": 5.537360159663108e-06,
|
|
"loss": 0.7258,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 0.9026494146642021,
|
|
"grad_norm": 0.5086947679519653,
|
|
"learning_rate": 5.207780704925314e-06,
|
|
"loss": 0.7234,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"epoch": 0.9057301293900185,
|
|
"grad_norm": 0.6993657350540161,
|
|
"learning_rate": 4.888050505771868e-06,
|
|
"loss": 0.7496,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 0.9088108441158349,
|
|
"grad_norm": 1.0011433362960815,
|
|
"learning_rate": 4.578202783330799e-06,
|
|
"loss": 0.6805,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"epoch": 0.9118915588416513,
|
|
"grad_norm": 0.5664235353469849,
|
|
"learning_rate": 4.2782697319048605e-06,
|
|
"loss": 0.7404,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 0.9149722735674677,
|
|
"grad_norm": 0.5276147723197937,
|
|
"learning_rate": 3.988282515626585e-06,
|
|
"loss": 0.732,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"epoch": 0.9180529882932841,
|
|
"grad_norm": 0.512634813785553,
|
|
"learning_rate": 3.7082712652200867e-06,
|
|
"loss": 0.7366,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 0.9211337030191005,
|
|
"grad_norm": 0.7594392895698547,
|
|
"learning_rate": 3.438265074870417e-06,
|
|
"loss": 0.7826,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"epoch": 0.9242144177449169,
|
|
"grad_norm": 1.0028218030929565,
|
|
"learning_rate": 3.1782919992006333e-06,
|
|
"loss": 0.665,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.9272951324707333,
|
|
"grad_norm": 0.5730507969856262,
|
|
"learning_rate": 2.9283790503567222e-06,
|
|
"loss": 0.7497,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"epoch": 0.9303758471965496,
|
|
"grad_norm": 0.5541625618934631,
|
|
"learning_rate": 2.6885521952010105e-06,
|
|
"loss": 0.7605,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 0.933456561922366,
|
|
"grad_norm": 0.5235902070999146,
|
|
"learning_rate": 2.458836352614069e-06,
|
|
"loss": 0.7322,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"epoch": 0.9365372766481824,
|
|
"grad_norm": 0.787257730960846,
|
|
"learning_rate": 2.239255390905581e-06,
|
|
"loss": 0.7623,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 0.9396179913739988,
|
|
"grad_norm": 1.2801662683486938,
|
|
"learning_rate": 2.029832125334319e-06,
|
|
"loss": 0.6707,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"epoch": 0.9426987060998152,
|
|
"grad_norm": 0.604186475276947,
|
|
"learning_rate": 1.8305883157375804e-06,
|
|
"loss": 0.733,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 0.9457794208256316,
|
|
"grad_norm": 0.5822204351425171,
|
|
"learning_rate": 1.6415446642702337e-06,
|
|
"loss": 0.7948,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"epoch": 0.9488601355514479,
|
|
"grad_norm": 0.49571382999420166,
|
|
"learning_rate": 1.462720813253682e-06,
|
|
"loss": 0.743,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 0.9519408502772643,
|
|
"grad_norm": 0.8096848726272583,
|
|
"learning_rate": 1.2941353431350056e-06,
|
|
"loss": 0.7614,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"epoch": 0.9550215650030807,
|
|
"grad_norm": 1.2853180170059204,
|
|
"learning_rate": 1.135805770556364e-06,
|
|
"loss": 0.7004,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 0.958102279728897,
|
|
"grad_norm": 0.5894697308540344,
|
|
"learning_rate": 9.877485465349058e-07,
|
|
"loss": 0.8042,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"epoch": 0.9611829944547134,
|
|
"grad_norm": 0.5564848780632019,
|
|
"learning_rate": 8.499790547535025e-07,
|
|
"loss": 0.742,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 0.9642637091805298,
|
|
"grad_norm": 0.4396416246891022,
|
|
"learning_rate": 7.225116099623286e-07,
|
|
"loss": 0.6936,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"epoch": 0.9673444239063462,
|
|
"grad_norm": 0.8373792171478271,
|
|
"learning_rate": 6.053594564914611e-07,
|
|
"loss": 0.7837,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 0.9704251386321626,
|
|
"grad_norm": 1.2192115783691406,
|
|
"learning_rate": 4.985347668747809e-07,
|
|
"loss": 0.6541,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"epoch": 0.973505853357979,
|
|
"grad_norm": 0.6488357782363892,
|
|
"learning_rate": 4.0204864058522864e-07,
|
|
"loss": 0.8073,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 0.9765865680837954,
|
|
"grad_norm": 0.456813782453537,
|
|
"learning_rate": 3.15911102881461e-07,
|
|
"loss": 0.837,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"epoch": 0.9796672828096118,
|
|
"grad_norm": 0.514924168586731,
|
|
"learning_rate": 2.40131103766239e-07,
|
|
"loss": 0.7338,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 0.9827479975354282,
|
|
"grad_norm": 0.7612470984458923,
|
|
"learning_rate": 1.747165170564724e-07,
|
|
"loss": 0.7998,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"epoch": 0.9858287122612446,
|
|
"grad_norm": 0.8824509978294373,
|
|
"learning_rate": 1.1967413956510686e-07,
|
|
"loss": 0.6986,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.988909426987061,
|
|
"grad_norm": 0.6480757594108582,
|
|
"learning_rate": 7.500969039491157e-08,
|
|
"loss": 0.7331,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"epoch": 0.9919901417128774,
|
|
"grad_norm": 0.6705480217933655,
|
|
"learning_rate": 4.0727810344254325e-08,
|
|
"loss": 0.8027,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 0.9950708564386938,
|
|
"grad_norm": 0.4745963215827942,
|
|
"learning_rate": 1.6832061424865153e-08,
|
|
"loss": 0.7123,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"epoch": 0.9981515711645101,
|
|
"grad_norm": 0.5832483172416687,
|
|
"learning_rate": 3.3249264917878387e-09,
|
|
"loss": 0.6986,
|
|
"step": 1620
|
|
},
{
"epoch": 1.0,
"step": 1623,
"total_flos": 123288491655168.0,
"train_loss": 0.7956359339436115,
"train_runtime": 17588.8577,
"train_samples_per_second": 2.952,
"train_steps_per_second": 0.092
}
],
"logging_steps": 5,
"max_steps": 1623,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 123288491655168.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}