1181 lines
33 KiB
JSON
1181 lines
33 KiB
JSON
{
|
|
"best_global_step": 120,
|
|
"best_metric": 0.33126009,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v7-20250507-004227/checkpoint-120",
|
|
"epoch": 2.9826262626262627,
|
|
"eval_steps": 20,
|
|
"global_step": 462,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.006464646464646465,
|
|
"grad_norm": 2.4505362510681152,
|
|
"learning_rate": 9.999884400986087e-06,
|
|
"loss": 0.4081788659095764,
|
|
"memory(GiB)": 27.77,
|
|
"step": 1,
|
|
"token_acc": 0.8560397131825703,
|
|
"train_speed(iter/s)": 0.065308
|
|
},
|
|
{
|
|
"epoch": 0.03232323232323232,
|
|
"grad_norm": 1.3398605585098267,
|
|
"learning_rate": 9.997110291906109e-06,
|
|
"loss": 0.3790343999862671,
|
|
"memory(GiB)": 27.77,
|
|
"step": 5,
|
|
"token_acc": 0.8759903354497949,
|
|
"train_speed(iter/s)": 0.120195
|
|
},
|
|
{
|
|
"epoch": 0.06464646464646465,
|
|
"grad_norm": 1.0211882591247559,
|
|
"learning_rate": 9.988444507789584e-06,
|
|
"loss": 0.3569159030914307,
|
|
"memory(GiB)": 27.77,
|
|
"step": 10,
|
|
"token_acc": 0.8904844941361507,
|
|
"train_speed(iter/s)": 0.137392
|
|
},
|
|
{
|
|
"epoch": 0.09696969696969697,
|
|
"grad_norm": 1.0586270093917847,
|
|
"learning_rate": 9.97401266428502e-06,
|
|
"loss": 0.36738641262054444,
|
|
"memory(GiB)": 27.77,
|
|
"step": 15,
|
|
"token_acc": 0.8821489760952925,
|
|
"train_speed(iter/s)": 0.140251
|
|
},
|
|
{
|
|
"epoch": 0.1292929292929293,
|
|
"grad_norm": 1.1482552289962769,
|
|
"learning_rate": 9.953831442918418e-06,
|
|
"loss": 0.3260908842086792,
|
|
"memory(GiB)": 27.77,
|
|
"step": 20,
|
|
"token_acc": 0.8923615160349854,
|
|
"train_speed(iter/s)": 0.144117
|
|
},
|
|
{
|
|
"epoch": 0.1292929292929293,
|
|
"eval_loss": 0.3649641275405884,
|
|
"eval_runtime": 5.3926,
|
|
"eval_samples_per_second": 18.544,
|
|
"eval_steps_per_second": 4.636,
|
|
"eval_token_acc": 0.8829422873787651,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.16161616161616163,
|
|
"grad_norm": 1.0274701118469238,
|
|
"learning_rate": 9.927924170825266e-06,
|
|
"loss": 0.32098817825317383,
|
|
"memory(GiB)": 27.77,
|
|
"step": 25,
|
|
"token_acc": 0.8865429663420047,
|
|
"train_speed(iter/s)": 0.132926
|
|
},
|
|
{
|
|
"epoch": 0.19393939393939394,
|
|
"grad_norm": 0.9113245010375977,
|
|
"learning_rate": 9.896320793787106e-06,
|
|
"loss": 0.35467684268951416,
|
|
"memory(GiB)": 27.77,
|
|
"step": 30,
|
|
"token_acc": 0.8852310260970564,
|
|
"train_speed(iter/s)": 0.137888
|
|
},
|
|
{
|
|
"epoch": 0.22626262626262628,
|
|
"grad_norm": 0.9023920893669128,
|
|
"learning_rate": 9.859057841617709e-06,
|
|
"loss": 0.3223384380340576,
|
|
"memory(GiB)": 27.77,
|
|
"step": 35,
|
|
"token_acc": 0.8949514563106796,
|
|
"train_speed(iter/s)": 0.140106
|
|
},
|
|
{
|
|
"epoch": 0.2585858585858586,
|
|
"grad_norm": 0.9127278923988342,
|
|
"learning_rate": 9.816178385938867e-06,
|
|
"loss": 0.3180943489074707,
|
|
"memory(GiB)": 27.77,
|
|
"step": 40,
|
|
"token_acc": 0.90066669149689,
|
|
"train_speed(iter/s)": 0.142488
|
|
},
|
|
{
|
|
"epoch": 0.2585858585858586,
|
|
"eval_loss": 0.3500390648841858,
|
|
"eval_runtime": 5.3821,
|
|
"eval_samples_per_second": 18.58,
|
|
"eval_steps_per_second": 4.645,
|
|
"eval_token_acc": 0.8875516148650812,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.2909090909090909,
|
|
"grad_norm": 0.947066068649292,
|
|
"learning_rate": 9.767731990394638e-06,
|
|
"loss": 0.33401944637298586,
|
|
"memory(GiB)": 27.77,
|
|
"step": 45,
|
|
"token_acc": 0.8919813402256824,
|
|
"train_speed(iter/s)": 0.136919
|
|
},
|
|
{
|
|
"epoch": 0.32323232323232326,
|
|
"grad_norm": 2.0620720386505127,
|
|
"learning_rate": 9.71377465336155e-06,
|
|
"loss": 0.3351354837417603,
|
|
"memory(GiB)": 27.77,
|
|
"step": 50,
|
|
"token_acc": 0.8765755647073505,
|
|
"train_speed(iter/s)": 0.139127
|
|
},
|
|
{
|
|
"epoch": 0.35555555555555557,
|
|
"grad_norm": 0.863703191280365,
|
|
"learning_rate": 9.654368743221022e-06,
|
|
"loss": 0.3273132801055908,
|
|
"memory(GiB)": 27.77,
|
|
"step": 55,
|
|
"token_acc": 0.8912860949877706,
|
|
"train_speed(iter/s)": 0.140614
|
|
},
|
|
{
|
|
"epoch": 0.3878787878787879,
|
|
"grad_norm": 0.890646755695343,
|
|
"learning_rate": 9.589582926268798e-06,
|
|
"loss": 0.3155367374420166,
|
|
"memory(GiB)": 30.08,
|
|
"step": 60,
|
|
"token_acc": 0.9143575243480992,
|
|
"train_speed(iter/s)": 0.142452
|
|
},
|
|
{
|
|
"epoch": 0.3878787878787879,
|
|
"eval_loss": 0.3454614281654358,
|
|
"eval_runtime": 5.3711,
|
|
"eval_samples_per_second": 18.618,
|
|
"eval_steps_per_second": 4.655,
|
|
"eval_token_acc": 0.8889920297045549,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.4202020202020202,
|
|
"grad_norm": 0.8996425271034241,
|
|
"learning_rate": 9.519492087344724e-06,
|
|
"loss": 0.2981250762939453,
|
|
"memory(GiB)": 30.08,
|
|
"step": 65,
|
|
"token_acc": 0.9051183738056113,
|
|
"train_speed(iter/s)": 0.137618
|
|
},
|
|
{
|
|
"epoch": 0.45252525252525255,
|
|
"grad_norm": 0.9426372647285461,
|
|
"learning_rate": 9.444177243274619e-06,
|
|
"loss": 0.3414067029953003,
|
|
"memory(GiB)": 30.08,
|
|
"step": 70,
|
|
"token_acc": 0.8901309721453606,
|
|
"train_speed(iter/s)": 0.139706
|
|
},
|
|
{
|
|
"epoch": 0.48484848484848486,
|
|
"grad_norm": 0.8184367418289185,
|
|
"learning_rate": 9.363725449224281e-06,
|
|
"loss": 0.32115802764892576,
|
|
"memory(GiB)": 30.08,
|
|
"step": 75,
|
|
"token_acc": 0.8985269424515341,
|
|
"train_speed(iter/s)": 0.14097
|
|
},
|
|
{
|
|
"epoch": 0.5171717171717172,
|
|
"grad_norm": 0.9321162104606628,
|
|
"learning_rate": 9.278229698073889e-06,
|
|
"loss": 0.31497313976287844,
|
|
"memory(GiB)": 30.08,
|
|
"step": 80,
|
|
"token_acc": 0.9010191988622896,
|
|
"train_speed(iter/s)": 0.141741
|
|
},
|
|
{
|
|
"epoch": 0.5171717171717172,
|
|
"eval_loss": 0.33993807435035706,
|
|
"eval_runtime": 5.3957,
|
|
"eval_samples_per_second": 18.533,
|
|
"eval_steps_per_second": 4.633,
|
|
"eval_token_acc": 0.8895361864216894,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.5494949494949495,
|
|
"grad_norm": 0.8877705931663513,
|
|
"learning_rate": 9.187788812929074e-06,
|
|
"loss": 0.32355318069458006,
|
|
"memory(GiB)": 30.08,
|
|
"step": 85,
|
|
"token_acc": 0.8971333885666943,
|
|
"train_speed(iter/s)": 0.138684
|
|
},
|
|
{
|
|
"epoch": 0.5818181818181818,
|
|
"grad_norm": 1.019900918006897,
|
|
"learning_rate": 9.092507332892968e-06,
|
|
"loss": 0.33187189102172854,
|
|
"memory(GiB)": 30.08,
|
|
"step": 90,
|
|
"token_acc": 0.8973049754299754,
|
|
"train_speed(iter/s)": 0.140038
|
|
},
|
|
{
|
|
"epoch": 0.6141414141414141,
|
|
"grad_norm": 1.0134016275405884,
|
|
"learning_rate": 8.992495392231195e-06,
|
|
"loss": 0.3340008020401001,
|
|
"memory(GiB)": 30.08,
|
|
"step": 95,
|
|
"token_acc": 0.9059227157818707,
|
|
"train_speed(iter/s)": 0.141282
|
|
},
|
|
{
|
|
"epoch": 0.6464646464646465,
|
|
"grad_norm": 0.9215405583381653,
|
|
"learning_rate": 8.88786859306952e-06,
|
|
"loss": 0.306801438331604,
|
|
"memory(GiB)": 30.08,
|
|
"step": 100,
|
|
"token_acc": 0.8903882234088613,
|
|
"train_speed(iter/s)": 0.142097
|
|
},
|
|
{
|
|
"epoch": 0.6464646464646465,
|
|
"eval_loss": 0.3363126516342163,
|
|
"eval_runtime": 5.4132,
|
|
"eval_samples_per_second": 18.473,
|
|
"eval_steps_per_second": 4.618,
|
|
"eval_token_acc": 0.8896002048589994,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.6787878787878788,
|
|
"grad_norm": 0.9498901963233948,
|
|
"learning_rate": 8.778747871771293e-06,
|
|
"loss": 0.31042842864990233,
|
|
"memory(GiB)": 30.08,
|
|
"step": 105,
|
|
"token_acc": 0.889660103071286,
|
|
"train_speed(iter/s)": 0.139949
|
|
},
|
|
{
|
|
"epoch": 0.7111111111111111,
|
|
"grad_norm": 0.8483244180679321,
|
|
"learning_rate": 8.665259359149132e-06,
|
|
"loss": 0.3191797733306885,
|
|
"memory(GiB)": 30.08,
|
|
"step": 110,
|
|
"token_acc": 0.909381808278867,
|
|
"train_speed(iter/s)": 0.140731
|
|
},
|
|
{
|
|
"epoch": 0.7434343434343434,
|
|
"grad_norm": 0.7640553116798401,
|
|
"learning_rate": 8.547534234672435e-06,
|
|
"loss": 0.2995746374130249,
|
|
"memory(GiB)": 30.08,
|
|
"step": 115,
|
|
"token_acc": 0.8994715117849015,
|
|
"train_speed(iter/s)": 0.141551
|
|
},
|
|
{
|
|
"epoch": 0.7757575757575758,
|
|
"grad_norm": 0.9591003656387329,
|
|
"learning_rate": 8.425708574839221e-06,
|
|
"loss": 0.32628965377807617,
|
|
"memory(GiB)": 30.08,
|
|
"step": 120,
|
|
"token_acc": 0.8891077731264877,
|
|
"train_speed(iter/s)": 0.142186
|
|
},
|
|
{
|
|
"epoch": 0.7757575757575758,
|
|
"eval_loss": 0.331260085105896,
|
|
"eval_runtime": 5.3939,
|
|
"eval_samples_per_second": 18.539,
|
|
"eval_steps_per_second": 4.635,
|
|
"eval_token_acc": 0.8917128132902276,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.8080808080808081,
|
|
"grad_norm": 0.9353374242782593,
|
|
"learning_rate": 8.299923195887599e-06,
|
|
"loss": 0.3271709680557251,
|
|
"memory(GiB)": 30.08,
|
|
"step": 125,
|
|
"token_acc": 0.8903564002694234,
|
|
"train_speed(iter/s)": 0.140229
|
|
},
|
|
{
|
|
"epoch": 0.8404040404040404,
|
|
"grad_norm": 0.9060182571411133,
|
|
"learning_rate": 8.170323491028625e-06,
|
|
"loss": 0.3163918018341064,
|
|
"memory(GiB)": 30.08,
|
|
"step": 130,
|
|
"token_acc": 0.8912760416666666,
|
|
"train_speed(iter/s)": 0.140553
|
|
},
|
|
{
|
|
"epoch": 0.8727272727272727,
|
|
"grad_norm": 0.8269082903862,
|
|
"learning_rate": 8.03705926238874e-06,
|
|
"loss": 0.3141618251800537,
|
|
"memory(GiB)": 30.08,
|
|
"step": 135,
|
|
"token_acc": 0.8898337308583083,
|
|
"train_speed(iter/s)": 0.141211
|
|
},
|
|
{
|
|
"epoch": 0.9050505050505051,
|
|
"grad_norm": 0.8577111959457397,
|
|
"learning_rate": 7.900284547855992e-06,
|
|
"loss": 0.3134615898132324,
|
|
"memory(GiB)": 30.08,
|
|
"step": 140,
|
|
"token_acc": 0.9006930194742344,
|
|
"train_speed(iter/s)": 0.141613
|
|
},
|
|
{
|
|
"epoch": 0.9050505050505051,
|
|
"eval_loss": 0.3313320279121399,
|
|
"eval_runtime": 5.3884,
|
|
"eval_samples_per_second": 18.558,
|
|
"eval_steps_per_second": 4.64,
|
|
"eval_token_acc": 0.8926410806312218,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.9373737373737374,
|
|
"grad_norm": 0.8400760293006897,
|
|
"learning_rate": 7.760157443030234e-06,
|
|
"loss": 0.2992702007293701,
|
|
"memory(GiB)": 30.08,
|
|
"step": 145,
|
|
"token_acc": 0.9021267154765301,
|
|
"train_speed(iter/s)": 0.139829
|
|
},
|
|
{
|
|
"epoch": 0.9696969696969697,
|
|
"grad_norm": 0.8579837679862976,
|
|
"learning_rate": 7.616839918483061e-06,
|
|
"loss": 0.32117404937744143,
|
|
"memory(GiB)": 30.08,
|
|
"step": 150,
|
|
"token_acc": 0.8848251385041551,
|
|
"train_speed(iter/s)": 0.140369
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.8645791411399841,
|
|
"learning_rate": 7.470497632538743e-06,
|
|
"loss": 0.3043407440185547,
|
|
"memory(GiB)": 30.08,
|
|
"step": 155,
|
|
"token_acc": 0.903212915601023,
|
|
"train_speed(iter/s)": 0.141142
|
|
},
|
|
{
|
|
"epoch": 1.0323232323232323,
|
|
"grad_norm": 0.8486159443855286,
|
|
"learning_rate": 7.321299739792553e-06,
|
|
"loss": 0.2472972869873047,
|
|
"memory(GiB)": 30.08,
|
|
"step": 160,
|
|
"token_acc": 0.9189923065319052,
|
|
"train_speed(iter/s)": 0.14175
|
|
},
|
|
{
|
|
"epoch": 1.0323232323232323,
|
|
"eval_loss": 0.33611831068992615,
|
|
"eval_runtime": 5.3694,
|
|
"eval_samples_per_second": 18.624,
|
|
"eval_steps_per_second": 4.656,
|
|
"eval_token_acc": 0.8918088409461925,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 1.0646464646464646,
|
|
"grad_norm": 0.8444439768791199,
|
|
"learning_rate": 7.169418695587791e-06,
|
|
"loss": 0.22124772071838378,
|
|
"memory(GiB)": 30.08,
|
|
"step": 165,
|
|
"token_acc": 0.92039636166496,
|
|
"train_speed(iter/s)": 0.14026
|
|
},
|
|
{
|
|
"epoch": 1.096969696969697,
|
|
"grad_norm": 0.8758794069290161,
|
|
"learning_rate": 7.015030056677559e-06,
|
|
"loss": 0.231048059463501,
|
|
"memory(GiB)": 30.08,
|
|
"step": 170,
|
|
"token_acc": 0.927355278093076,
|
|
"train_speed(iter/s)": 0.141102
|
|
},
|
|
{
|
|
"epoch": 1.1292929292929292,
|
|
"grad_norm": 0.9414038062095642,
|
|
"learning_rate": 6.858312278301638e-06,
|
|
"loss": 0.2431964874267578,
|
|
"memory(GiB)": 30.08,
|
|
"step": 175,
|
|
"token_acc": 0.914981199287552,
|
|
"train_speed(iter/s)": 0.141563
|
|
},
|
|
{
|
|
"epoch": 1.1616161616161615,
|
|
"grad_norm": 0.8615570664405823,
|
|
"learning_rate": 6.699446507913083e-06,
|
|
"loss": 0.22901198863983155,
|
|
"memory(GiB)": 30.08,
|
|
"step": 180,
|
|
"token_acc": 0.9300724249884048,
|
|
"train_speed(iter/s)": 0.141935
|
|
},
|
|
{
|
|
"epoch": 1.1616161616161615,
|
|
"eval_loss": 0.34168365597724915,
|
|
"eval_runtime": 5.3971,
|
|
"eval_samples_per_second": 18.528,
|
|
"eval_steps_per_second": 4.632,
|
|
"eval_token_acc": 0.8907205275119234,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 1.1939393939393939,
|
|
"grad_norm": 0.8244655132293701,
|
|
"learning_rate": 6.53861637579291e-06,
|
|
"loss": 0.2308629035949707,
|
|
"memory(GiB)": 30.08,
|
|
"step": 185,
|
|
"token_acc": 0.9109640722038423,
|
|
"train_speed(iter/s)": 0.140603
|
|
},
|
|
{
|
|
"epoch": 1.2262626262626264,
|
|
"grad_norm": 0.9139054417610168,
|
|
"learning_rate": 6.376007782794926e-06,
|
|
"loss": 0.2585730791091919,
|
|
"memory(GiB)": 30.08,
|
|
"step": 190,
|
|
"token_acc": 0.9028991841491841,
|
|
"train_speed(iter/s)": 0.141319
|
|
},
|
|
{
|
|
"epoch": 1.2585858585858585,
|
|
"grad_norm": 0.7501769065856934,
|
|
"learning_rate": 6.211808685466063e-06,
|
|
"loss": 0.2274195671081543,
|
|
"memory(GiB)": 30.08,
|
|
"step": 195,
|
|
"token_acc": 0.9299425265767627,
|
|
"train_speed(iter/s)": 0.142053
|
|
},
|
|
{
|
|
"epoch": 1.290909090909091,
|
|
"grad_norm": 0.8027601838111877,
|
|
"learning_rate": 6.046208878790543e-06,
|
|
"loss": 0.2291938304901123,
|
|
"memory(GiB)": 30.08,
|
|
"step": 200,
|
|
"token_acc": 0.9253513490971267,
|
|
"train_speed(iter/s)": 0.142337
|
|
},
|
|
{
|
|
"epoch": 1.290909090909091,
|
|
"eval_loss": 0.34100720286369324,
|
|
"eval_runtime": 5.3678,
|
|
"eval_samples_per_second": 18.63,
|
|
"eval_steps_per_second": 4.657,
|
|
"eval_token_acc": 0.8911046381357831,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 1.3232323232323233,
|
|
"grad_norm": 0.8584316372871399,
|
|
"learning_rate": 5.879399776809047e-06,
|
|
"loss": 0.21250443458557128,
|
|
"memory(GiB)": 30.08,
|
|
"step": 205,
|
|
"token_acc": 0.9207754541291406,
|
|
"train_speed(iter/s)": 0.141123
|
|
},
|
|
{
|
|
"epoch": 1.3555555555555556,
|
|
"grad_norm": 0.8164386749267578,
|
|
"learning_rate": 5.711574191366427e-06,
|
|
"loss": 0.23753111362457274,
|
|
"memory(GiB)": 30.08,
|
|
"step": 210,
|
|
"token_acc": 0.9153866525423728,
|
|
"train_speed(iter/s)": 0.141492
|
|
},
|
|
{
|
|
"epoch": 1.387878787878788,
|
|
"grad_norm": 0.8197464346885681,
|
|
"learning_rate": 5.542926109243727e-06,
|
|
"loss": 0.2262495279312134,
|
|
"memory(GiB)": 30.08,
|
|
"step": 215,
|
|
"token_acc": 0.9298754093424173,
|
|
"train_speed(iter/s)": 0.141871
|
|
},
|
|
{
|
|
"epoch": 1.4202020202020202,
|
|
"grad_norm": 0.8861284255981445,
|
|
"learning_rate": 5.373650467932122e-06,
|
|
"loss": 0.23235108852386474,
|
|
"memory(GiB)": 30.08,
|
|
"step": 220,
|
|
"token_acc": 0.916040434865535,
|
|
"train_speed(iter/s)": 0.142257
|
|
},
|
|
{
|
|
"epoch": 1.4202020202020202,
|
|
"eval_loss": 0.34229058027267456,
|
|
"eval_runtime": 5.3952,
|
|
"eval_samples_per_second": 18.535,
|
|
"eval_steps_per_second": 4.634,
|
|
"eval_token_acc": 0.8919368778208124,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 1.4525252525252526,
|
|
"grad_norm": 0.8957362771034241,
|
|
"learning_rate": 5.2039429303079294e-06,
|
|
"loss": 0.2363661289215088,
|
|
"memory(GiB)": 30.08,
|
|
"step": 225,
|
|
"token_acc": 0.9184887277670782,
|
|
"train_speed(iter/s)": 0.141447
|
|
},
|
|
{
|
|
"epoch": 1.4848484848484849,
|
|
"grad_norm": 0.8692964911460876,
|
|
"learning_rate": 5.033999658469174e-06,
|
|
"loss": 0.22849671840667723,
|
|
"memory(GiB)": 30.08,
|
|
"step": 230,
|
|
"token_acc": 0.9200207931085698,
|
|
"train_speed(iter/s)": 0.141745
|
|
},
|
|
{
|
|
"epoch": 1.5171717171717172,
|
|
"grad_norm": 0.8732675909996033,
|
|
"learning_rate": 4.864017086995112e-06,
|
|
"loss": 0.22888550758361817,
|
|
"memory(GiB)": 30.08,
|
|
"step": 235,
|
|
"token_acc": 0.9242218099360956,
|
|
"train_speed(iter/s)": 0.142114
|
|
},
|
|
{
|
|
"epoch": 1.5494949494949495,
|
|
"grad_norm": 0.8548147082328796,
|
|
"learning_rate": 4.694191695890788e-06,
|
|
"loss": 0.24225883483886718,
|
|
"memory(GiB)": 30.08,
|
|
"step": 240,
|
|
"token_acc": 0.9354398726983405,
|
|
"train_speed(iter/s)": 0.142537
|
|
},
|
|
{
|
|
"epoch": 1.5494949494949495,
|
|
"eval_loss": 0.3384065330028534,
|
|
"eval_runtime": 5.3801,
|
|
"eval_samples_per_second": 18.587,
|
|
"eval_steps_per_second": 4.647,
|
|
"eval_token_acc": 0.8922569700073621,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 1.5818181818181818,
|
|
"grad_norm": 0.8377289772033691,
|
|
"learning_rate": 4.524719783479088e-06,
|
|
"loss": 0.20645420551300048,
|
|
"memory(GiB)": 30.08,
|
|
"step": 245,
|
|
"token_acc": 0.9174505252870755,
|
|
"train_speed(iter/s)": 0.14148
|
|
},
|
|
{
|
|
"epoch": 1.614141414141414,
|
|
"grad_norm": 0.779121994972229,
|
|
"learning_rate": 4.355797239502807e-06,
|
|
"loss": 0.2250507354736328,
|
|
"memory(GiB)": 30.08,
|
|
"step": 250,
|
|
"token_acc": 0.9264376661536309,
|
|
"train_speed(iter/s)": 0.141711
|
|
},
|
|
{
|
|
"epoch": 1.6464646464646466,
|
|
"grad_norm": 0.8410191535949707,
|
|
"learning_rate": 4.187619318698971e-06,
|
|
"loss": 0.2303227186203003,
|
|
"memory(GiB)": 30.08,
|
|
"step": 255,
|
|
"token_acc": 0.9288690903865497,
|
|
"train_speed(iter/s)": 0.142159
|
|
},
|
|
{
|
|
"epoch": 1.6787878787878787,
|
|
"grad_norm": 0.8751044273376465,
|
|
"learning_rate": 4.020380415107167e-06,
|
|
"loss": 0.24010176658630372,
|
|
"memory(GiB)": 30.08,
|
|
"step": 260,
|
|
"token_acc": 0.9276785345930113,
|
|
"train_speed(iter/s)": 0.142379
|
|
},
|
|
{
|
|
"epoch": 1.6787878787878787,
|
|
"eval_loss": 0.33762192726135254,
|
|
"eval_runtime": 5.3885,
|
|
"eval_samples_per_second": 18.558,
|
|
"eval_steps_per_second": 4.639,
|
|
"eval_token_acc": 0.8926090714125668,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 1.7111111111111112,
|
|
"grad_norm": 0.7685917019844055,
|
|
"learning_rate": 3.854273837372724e-06,
|
|
"loss": 0.25291612148284914,
|
|
"memory(GiB)": 30.08,
|
|
"step": 265,
|
|
"token_acc": 0.9064826915478832,
|
|
"train_speed(iter/s)": 0.141534
|
|
},
|
|
{
|
|
"epoch": 1.7434343434343433,
|
|
"grad_norm": 0.8771150708198547,
|
|
"learning_rate": 3.689491585304491e-06,
|
|
"loss": 0.23574538230895997,
|
|
"memory(GiB)": 30.08,
|
|
"step": 270,
|
|
"token_acc": 0.9073804876022761,
|
|
"train_speed(iter/s)": 0.141801
|
|
},
|
|
{
|
|
"epoch": 1.7757575757575759,
|
|
"grad_norm": 0.8586969375610352,
|
|
"learning_rate": 3.526224127945479e-06,
|
|
"loss": 0.24325270652770997,
|
|
"memory(GiB)": 30.08,
|
|
"step": 275,
|
|
"token_acc": 0.9250533832744076,
|
|
"train_speed(iter/s)": 0.142307
|
|
},
|
|
{
|
|
"epoch": 1.808080808080808,
|
|
"grad_norm": 0.8120052814483643,
|
|
"learning_rate": 3.3646601834128924e-06,
|
|
"loss": 0.2067141056060791,
|
|
"memory(GiB)": 30.08,
|
|
"step": 280,
|
|
"token_acc": 0.9247845178077736,
|
|
"train_speed(iter/s)": 0.142568
|
|
},
|
|
{
|
|
"epoch": 1.808080808080808,
|
|
"eval_loss": 0.3372032642364502,
|
|
"eval_runtime": 5.3852,
|
|
"eval_samples_per_second": 18.569,
|
|
"eval_steps_per_second": 4.642,
|
|
"eval_token_acc": 0.8925130437566019,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 1.8404040404040405,
|
|
"grad_norm": 0.7563571929931641,
|
|
"learning_rate": 3.204986500762006e-06,
|
|
"loss": 0.22141036987304688,
|
|
"memory(GiB)": 30.08,
|
|
"step": 285,
|
|
"token_acc": 0.9158564914393874,
|
|
"train_speed(iter/s)": 0.141564
|
|
},
|
|
{
|
|
"epoch": 1.8727272727272726,
|
|
"grad_norm": 0.840555727481842,
|
|
"learning_rate": 3.0473876441260786e-06,
|
|
"loss": 0.22226524353027344,
|
|
"memory(GiB)": 30.08,
|
|
"step": 290,
|
|
"token_acc": 0.9322453534191164,
|
|
"train_speed(iter/s)": 0.14182
|
|
},
|
|
{
|
|
"epoch": 1.905050505050505,
|
|
"grad_norm": 0.8599358797073364,
|
|
"learning_rate": 2.8920457793817507e-06,
|
|
"loss": 0.22878422737121581,
|
|
"memory(GiB)": 30.08,
|
|
"step": 295,
|
|
"token_acc": 0.9275855327468231,
|
|
"train_speed(iter/s)": 0.142089
|
|
},
|
|
{
|
|
"epoch": 1.9373737373737374,
|
|
"grad_norm": 0.9196203947067261,
|
|
"learning_rate": 2.7391404635865725e-06,
|
|
"loss": 0.23831405639648437,
|
|
"memory(GiB)": 30.08,
|
|
"step": 300,
|
|
"token_acc": 0.9162388743213797,
|
|
"train_speed(iter/s)": 0.142402
|
|
},
|
|
{
|
|
"epoch": 1.9373737373737374,
|
|
"eval_loss": 0.33528250455856323,
|
|
"eval_runtime": 5.3837,
|
|
"eval_samples_per_second": 18.575,
|
|
"eval_steps_per_second": 4.644,
|
|
"eval_token_acc": 0.8929931820364265,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 1.9696969696969697,
|
|
"grad_norm": 0.757847785949707,
|
|
"learning_rate": 2.5888484374320033e-06,
|
|
"loss": 0.2106797695159912,
|
|
"memory(GiB)": 30.08,
|
|
"step": 305,
|
|
"token_acc": 0.9235401079083078,
|
|
"train_speed(iter/s)": 0.141615
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 0.967450737953186,
|
|
"learning_rate": 2.4413434209518137e-06,
|
|
"loss": 0.21637356281280518,
|
|
"memory(GiB)": 30.08,
|
|
"step": 310,
|
|
"token_acc": 0.9329970868298622,
|
|
"train_speed(iter/s)": 0.141903
|
|
},
|
|
{
|
|
"epoch": 2.0323232323232325,
|
|
"grad_norm": 0.7503668665885925,
|
|
"learning_rate": 2.296795912722014e-06,
|
|
"loss": 0.16243449449539185,
|
|
"memory(GiB)": 30.08,
|
|
"step": 315,
|
|
"token_acc": 0.9508763656370353,
|
|
"train_speed(iter/s)": 0.141997
|
|
},
|
|
{
|
|
"epoch": 2.0646464646464646,
|
|
"grad_norm": 0.8131990432739258,
|
|
"learning_rate": 2.1553729927843894e-06,
|
|
"loss": 0.17449368238449098,
|
|
"memory(GiB)": 30.08,
|
|
"step": 320,
|
|
"token_acc": 0.9495816440955749,
|
|
"train_speed(iter/s)": 0.142202
|
|
},
|
|
{
|
|
"epoch": 2.0646464646464646,
|
|
"eval_loss": 0.3504800796508789,
|
|
"eval_runtime": 5.4562,
|
|
"eval_samples_per_second": 18.328,
|
|
"eval_steps_per_second": 4.582,
|
|
"eval_token_acc": 0.8927691175058416,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 2.096969696969697,
|
|
"grad_norm": 0.8142232894897461,
|
|
"learning_rate": 2.017238129521506e-06,
|
|
"loss": 0.16946163177490234,
|
|
"memory(GiB)": 30.08,
|
|
"step": 325,
|
|
"token_acc": 0.9349265764468759,
|
|
"train_speed(iter/s)": 0.141472
|
|
},
|
|
{
|
|
"epoch": 2.1292929292929292,
|
|
"grad_norm": 0.8298311829566956,
|
|
"learning_rate": 1.8825509907063328e-06,
|
|
"loss": 0.1755598545074463,
|
|
"memory(GiB)": 30.08,
|
|
"step": 330,
|
|
"token_acc": 0.9531752999707346,
|
|
"train_speed(iter/s)": 0.141802
|
|
},
|
|
{
|
|
"epoch": 2.1616161616161618,
|
|
"grad_norm": 0.7940059304237366,
|
|
"learning_rate": 1.7514672589449378e-06,
|
|
"loss": 0.1952407479286194,
|
|
"memory(GiB)": 30.08,
|
|
"step": 335,
|
|
"token_acc": 0.9343925770825635,
|
|
"train_speed(iter/s)": 0.142047
|
|
},
|
|
{
|
|
"epoch": 2.193939393939394,
|
|
"grad_norm": 0.7858513593673706,
|
|
"learning_rate": 1.6241384517255854e-06,
|
|
"loss": 0.16918621063232422,
|
|
"memory(GiB)": 30.08,
|
|
"step": 340,
|
|
"token_acc": 0.9412830735773831,
|
|
"train_speed(iter/s)": 0.142253
|
|
},
|
|
{
|
|
"epoch": 2.193939393939394,
|
|
"eval_loss": 0.3576539158821106,
|
|
"eval_runtime": 5.351,
|
|
"eval_samples_per_second": 18.688,
|
|
"eval_steps_per_second": 4.672,
|
|
"eval_token_acc": 0.8910406196984731,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 2.2262626262626264,
|
|
"grad_norm": 0.7290251851081848,
|
|
"learning_rate": 1.500711746282192e-06,
|
|
"loss": 0.1872728943824768,
|
|
"memory(GiB)": 30.08,
|
|
"step": 345,
|
|
"token_acc": 0.9292867611138251,
|
|
"train_speed(iter/s)": 0.141644
|
|
},
|
|
{
|
|
"epoch": 2.2585858585858585,
|
|
"grad_norm": 0.7997108101844788,
|
|
"learning_rate": 1.3813298094746491e-06,
|
|
"loss": 0.16540231704711914,
|
|
"memory(GiB)": 30.08,
|
|
"step": 350,
|
|
"token_acc": 0.9447741310403294,
|
|
"train_speed(iter/s)": 0.141801
|
|
},
|
|
{
|
|
"epoch": 2.290909090909091,
|
|
"grad_norm": 0.7840582728385925,
|
|
"learning_rate": 1.2661306328825818e-06,
|
|
"loss": 0.17242782115936278,
|
|
"memory(GiB)": 30.08,
|
|
"step": 355,
|
|
"token_acc": 0.9399465492847037,
|
|
"train_speed(iter/s)": 0.142023
|
|
},
|
|
{
|
|
"epoch": 2.323232323232323,
|
|
"grad_norm": 0.7512005567550659,
|
|
"learning_rate": 1.1552473733031893e-06,
|
|
"loss": 0.1620992064476013,
|
|
"memory(GiB)": 30.08,
|
|
"step": 360,
|
|
"token_acc": 0.9435792877983619,
|
|
"train_speed(iter/s)": 0.142359
|
|
},
|
|
{
|
|
"epoch": 2.323232323232323,
|
|
"eval_loss": 0.36009594798088074,
|
|
"eval_runtime": 5.3832,
|
|
"eval_samples_per_second": 18.576,
|
|
"eval_steps_per_second": 4.644,
|
|
"eval_token_acc": 0.8920008962581223,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 2.3555555555555556,
|
|
"grad_norm": 0.7732217311859131,
|
|
"learning_rate": 1.0488081988375493e-06,
|
|
"loss": 0.16843740940093993,
|
|
"memory(GiB)": 30.08,
|
|
"step": 365,
|
|
"token_acc": 0.9334714548802947,
|
|
"train_speed(iter/s)": 0.141824
|
|
},
|
|
{
|
|
"epoch": 2.3878787878787877,
|
|
"grad_norm": 0.7981094121932983,
|
|
"learning_rate": 9.469361407432431e-07,
|
|
"loss": 0.1794123411178589,
|
|
"memory(GiB)": 30.08,
|
|
"step": 370,
|
|
"token_acc": 0.9482818106541541,
|
|
"train_speed(iter/s)": 0.142017
|
|
},
|
|
{
|
|
"epoch": 2.4202020202020202,
|
|
"grad_norm": 0.7665418982505798,
|
|
"learning_rate": 8.497489512245971e-07,
|
|
"loss": 0.1843852996826172,
|
|
"memory(GiB)": 30.08,
|
|
"step": 375,
|
|
"token_acc": 0.9573796089286348,
|
|
"train_speed(iter/s)": 0.142235
|
|
},
|
|
{
|
|
"epoch": 2.4525252525252528,
|
|
"grad_norm": 0.9355995655059814,
|
|
"learning_rate": 7.573589673248833e-07,
|
|
"loss": 0.17202303409576417,
|
|
"memory(GiB)": 30.08,
|
|
"step": 380,
|
|
"token_acc": 0.9362966839881864,
|
|
"train_speed(iter/s)": 0.142515
|
|
},
|
|
{
|
|
"epoch": 2.4525252525252528,
|
|
"eval_loss": 0.36116844415664673,
|
|
"eval_runtime": 5.3783,
|
|
"eval_samples_per_second": 18.593,
|
|
"eval_steps_per_second": 4.648,
|
|
"eval_token_acc": 0.8914567395409878,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 2.484848484848485,
|
|
"grad_norm": 0.7312317490577698,
|
|
"learning_rate": 6.698729810778065e-07,
|
|
"loss": 0.17411458492279053,
|
|
"memory(GiB)": 30.08,
|
|
"step": 385,
|
|
"token_acc": 0.9338178444410082,
|
|
"train_speed(iter/s)": 0.141893
|
|
},
|
|
{
|
|
"epoch": 2.517171717171717,
|
|
"grad_norm": 0.7563744187355042,
|
|
"learning_rate": 5.873921160683943e-07,
|
|
"loss": 0.1915157437324524,
|
|
"memory(GiB)": 30.08,
|
|
"step": 390,
|
|
"token_acc": 0.9295483460559797,
|
|
"train_speed(iter/s)": 0.142171
|
|
},
|
|
{
|
|
"epoch": 2.5494949494949495,
|
|
"grad_norm": 0.7823712229728699,
|
|
"learning_rate": 5.100117105459279e-07,
|
|
"loss": 0.15321061611175538,
|
|
"memory(GiB)": 30.08,
|
|
"step": 395,
|
|
"token_acc": 0.9472502392696753,
|
|
"train_speed(iter/s)": 0.142376
|
|
},
|
|
{
|
|
"epoch": 2.581818181818182,
|
|
"grad_norm": 0.6383055448532104,
|
|
"learning_rate": 4.3782120722406565e-07,
|
|
"loss": 0.16857578754425048,
|
|
"memory(GiB)": 30.08,
|
|
"step": 400,
|
|
"token_acc": 0.9525385172164202,
|
|
"train_speed(iter/s)": 0.142658
|
|
},
|
|
{
|
|
"epoch": 2.581818181818182,
|
|
"eval_loss": 0.36100760102272034,
|
|
"eval_runtime": 5.3454,
|
|
"eval_samples_per_second": 18.708,
|
|
"eval_steps_per_second": 4.677,
|
|
"eval_token_acc": 0.891264684229058,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 2.614141414141414,
|
|
"grad_norm": 0.8574426174163818,
|
|
"learning_rate": 3.709040498955102e-07,
|
|
"loss": 0.18224529027938843,
|
|
"memory(GiB)": 30.08,
|
|
"step": 405,
|
|
"token_acc": 0.9255623050402233,
|
|
"train_speed(iter/s)": 0.142038
|
|
},
|
|
{
|
|
"epoch": 2.6464646464646466,
|
|
"grad_norm": 0.9696727395057678,
|
|
"learning_rate": 3.0933758698072023e-07,
|
|
"loss": 0.18939828872680664,
|
|
"memory(GiB)": 30.08,
|
|
"step": 410,
|
|
"token_acc": 0.9416907375312922,
|
|
"train_speed(iter/s)": 0.142235
|
|
},
|
|
{
|
|
"epoch": 2.6787878787878787,
|
|
"grad_norm": 0.7818398475646973,
|
|
"learning_rate": 2.531929821221768e-07,
|
|
"loss": 0.19069280624389648,
|
|
"memory(GiB)": 30.08,
|
|
"step": 415,
|
|
"token_acc": 0.9258034817542685,
|
|
"train_speed(iter/s)": 0.142409
|
|
},
|
|
{
|
|
"epoch": 2.7111111111111112,
|
|
"grad_norm": 0.8981226086616516,
|
|
"learning_rate": 2.0253513192751374e-07,
|
|
"loss": 0.17302310466766357,
|
|
"memory(GiB)": 30.08,
|
|
"step": 420,
|
|
"token_acc": 0.950883135736753,
|
|
"train_speed(iter/s)": 0.142691
|
|
},
|
|
{
|
|
"epoch": 2.7111111111111112,
|
|
"eval_loss": 0.3606036305427551,
|
|
"eval_runtime": 5.3784,
|
|
"eval_samples_per_second": 18.593,
|
|
"eval_steps_per_second": 4.648,
|
|
"eval_token_acc": 0.8917128132902276,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 2.7434343434343433,
|
|
"grad_norm": 0.754592776298523,
|
|
"learning_rate": 1.5742259095662126e-07,
|
|
"loss": 0.16562799215316773,
|
|
"memory(GiB)": 30.08,
|
|
"step": 425,
|
|
"token_acc": 0.9294097342078012,
|
|
"train_speed(iter/s)": 0.14213
|
|
},
|
|
{
|
|
"epoch": 2.775757575757576,
|
|
"grad_norm": 0.811010479927063,
|
|
"learning_rate": 1.1790750403941231e-07,
|
|
"loss": 0.17516304254531861,
|
|
"memory(GiB)": 30.08,
|
|
"step": 430,
|
|
"token_acc": 0.953036002149382,
|
|
"train_speed(iter/s)": 0.142302
|
|
},
|
|
{
|
|
"epoch": 2.808080808080808,
|
|
"grad_norm": 0.8035722374916077,
|
|
"learning_rate": 8.403554600248498e-08,
|
|
"loss": 0.16143158674240113,
|
|
"memory(GiB)": 30.08,
|
|
"step": 435,
|
|
"token_acc": 0.9470889436753271,
|
|
"train_speed(iter/s)": 0.142493
|
|
},
|
|
{
|
|
"epoch": 2.8404040404040405,
|
|
"grad_norm": 0.7885386347770691,
|
|
"learning_rate": 5.584586887435739e-08,
|
|
"loss": 0.16893348693847657,
|
|
"memory(GiB)": 30.08,
|
|
"step": 440,
|
|
"token_acc": 0.946100607044813,
|
|
"train_speed(iter/s)": 0.142701
|
|
},
|
|
{
|
|
"epoch": 2.8404040404040405,
|
|
"eval_loss": 0.36063292622566223,
|
|
"eval_runtime": 5.3864,
|
|
"eval_samples_per_second": 18.565,
|
|
"eval_steps_per_second": 4.641,
|
|
"eval_token_acc": 0.8917448225088825,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 2.8727272727272726,
|
|
"grad_norm": 0.8363362550735474,
|
|
"learning_rate": 3.337105663029361e-08,
|
|
"loss": 0.166959547996521,
|
|
"memory(GiB)": 30.08,
|
|
"step": 445,
|
|
"token_acc": 0.9339094103124109,
|
|
"train_speed(iter/s)": 0.142143
|
|
},
|
|
{
|
|
"epoch": 2.905050505050505,
|
|
"grad_norm": 0.817148745059967,
|
|
"learning_rate": 1.6637087529033925e-08,
|
|
"loss": 0.16920559406280516,
|
|
"memory(GiB)": 30.08,
|
|
"step": 450,
|
|
"token_acc": 0.9441476444876153,
|
|
"train_speed(iter/s)": 0.142396
|
|
},
|
|
{
|
|
"epoch": 2.937373737373737,
|
|
"grad_norm": 0.7608515620231628,
|
|
"learning_rate": 5.6633040849601865e-09,
|
|
"loss": 0.16781603097915648,
|
|
"memory(GiB)": 30.08,
|
|
"step": 455,
|
|
"token_acc": 0.9337727971874313,
|
|
"train_speed(iter/s)": 0.142524
|
|
},
|
|
{
|
|
"epoch": 2.9696969696969697,
|
|
"grad_norm": 0.8431264162063599,
|
|
"learning_rate": 4.623907104084335e-10,
|
|
"loss": 0.2008026123046875,
|
|
"memory(GiB)": 30.08,
|
|
"step": 460,
|
|
"token_acc": 0.9438367531683766,
|
|
"train_speed(iter/s)": 0.142801
|
|
},
|
|
{
|
|
"epoch": 2.9696969696969697,
|
|
"eval_loss": 0.36077243089675903,
|
|
"eval_runtime": 5.3777,
|
|
"eval_samples_per_second": 18.595,
|
|
"eval_steps_per_second": 4.649,
|
|
"eval_token_acc": 0.8913927211036778,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 2.9826262626262627,
|
|
"eval_loss": 0.3604045808315277,
|
|
"eval_runtime": 5.3956,
|
|
"eval_samples_per_second": 18.534,
|
|
"eval_steps_per_second": 4.633,
|
|
"eval_token_acc": 0.8918728593835025,
|
|
"step": 462
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 462,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 4.754364855085957e+17,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|