1181 lines
33 KiB
JSON
1181 lines
33 KiB
JSON
{
|
|
"best_global_step": 300,
|
|
"best_metric": 0.40253255,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v8-20250507-004645/checkpoint-300",
|
|
"epoch": 2.9826262626262627,
|
|
"eval_steps": 20,
|
|
"global_step": 462,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.006464646464646465,
|
|
"grad_norm": 4.932199478149414,
|
|
"learning_rate": 9.999884400986087e-06,
|
|
"loss": 0.7780591249465942,
|
|
"memory(GiB)": 27.73,
|
|
"step": 1,
|
|
"token_acc": 0.782099343955014,
|
|
"train_speed(iter/s)": 0.064891
|
|
},
|
|
{
|
|
"epoch": 0.03232323232323232,
|
|
"grad_norm": 2.3600621223449707,
|
|
"learning_rate": 9.997110291906109e-06,
|
|
"loss": 0.6091042757034302,
|
|
"memory(GiB)": 27.73,
|
|
"step": 5,
|
|
"token_acc": 0.8179287124866458,
|
|
"train_speed(iter/s)": 0.118621
|
|
},
|
|
{
|
|
"epoch": 0.06464646464646465,
|
|
"grad_norm": 1.088510274887085,
|
|
"learning_rate": 9.988444507789584e-06,
|
|
"loss": 0.4719734191894531,
|
|
"memory(GiB)": 27.73,
|
|
"step": 10,
|
|
"token_acc": 0.8583190394511149,
|
|
"train_speed(iter/s)": 0.135341
|
|
},
|
|
{
|
|
"epoch": 0.09696969696969697,
|
|
"grad_norm": 1.0002374649047852,
|
|
"learning_rate": 9.97401266428502e-06,
|
|
"loss": 0.47036895751953123,
|
|
"memory(GiB)": 27.73,
|
|
"step": 15,
|
|
"token_acc": 0.8504078264405482,
|
|
"train_speed(iter/s)": 0.137962
|
|
},
|
|
{
|
|
"epoch": 0.1292929292929293,
|
|
"grad_norm": 0.9563055038452148,
|
|
"learning_rate": 9.953831442918418e-06,
|
|
"loss": 0.42792816162109376,
|
|
"memory(GiB)": 27.73,
|
|
"step": 20,
|
|
"token_acc": 0.863187115610118,
|
|
"train_speed(iter/s)": 0.141664
|
|
},
|
|
{
|
|
"epoch": 0.1292929292929293,
|
|
"eval_loss": 0.4551742970943451,
|
|
"eval_runtime": 5.4465,
|
|
"eval_samples_per_second": 18.36,
|
|
"eval_steps_per_second": 4.59,
|
|
"eval_token_acc": 0.8559020470633251,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.16161616161616163,
|
|
"grad_norm": 0.8774688243865967,
|
|
"learning_rate": 9.927924170825266e-06,
|
|
"loss": 0.41231346130371094,
|
|
"memory(GiB)": 27.73,
|
|
"step": 25,
|
|
"token_acc": 0.8627429786160803,
|
|
"train_speed(iter/s)": 0.130766
|
|
},
|
|
{
|
|
"epoch": 0.19393939393939394,
|
|
"grad_norm": 0.7832381129264832,
|
|
"learning_rate": 9.896320793787106e-06,
|
|
"loss": 0.4305295467376709,
|
|
"memory(GiB)": 27.73,
|
|
"step": 30,
|
|
"token_acc": 0.8626980747248807,
|
|
"train_speed(iter/s)": 0.135526
|
|
},
|
|
{
|
|
"epoch": 0.22626262626262628,
|
|
"grad_norm": 0.7605119943618774,
|
|
"learning_rate": 9.859057841617709e-06,
|
|
"loss": 0.40700688362121584,
|
|
"memory(GiB)": 27.77,
|
|
"step": 35,
|
|
"token_acc": 0.8702584217812644,
|
|
"train_speed(iter/s)": 0.137556
|
|
},
|
|
{
|
|
"epoch": 0.2585858585858586,
|
|
"grad_norm": 0.7823914289474487,
|
|
"learning_rate": 9.816178385938867e-06,
|
|
"loss": 0.40500674247741697,
|
|
"memory(GiB)": 27.77,
|
|
"step": 40,
|
|
"token_acc": 0.8749971213412246,
|
|
"train_speed(iter/s)": 0.139743
|
|
},
|
|
{
|
|
"epoch": 0.2585858585858586,
|
|
"eval_loss": 0.4355735778808594,
|
|
"eval_runtime": 5.436,
|
|
"eval_samples_per_second": 18.396,
|
|
"eval_steps_per_second": 4.599,
|
|
"eval_token_acc": 0.8621580256361201,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.2909090909090909,
|
|
"grad_norm": 0.8007071018218994,
|
|
"learning_rate": 9.767731990394638e-06,
|
|
"loss": 0.41349210739135744,
|
|
"memory(GiB)": 27.77,
|
|
"step": 45,
|
|
"token_acc": 0.8671768894761958,
|
|
"train_speed(iter/s)": 0.134577
|
|
},
|
|
{
|
|
"epoch": 0.32323232323232326,
|
|
"grad_norm": 0.8084492683410645,
|
|
"learning_rate": 9.71377465336155e-06,
|
|
"loss": 0.41720309257507326,
|
|
"memory(GiB)": 27.77,
|
|
"step": 50,
|
|
"token_acc": 0.8574223526534605,
|
|
"train_speed(iter/s)": 0.136609
|
|
},
|
|
{
|
|
"epoch": 0.35555555555555557,
|
|
"grad_norm": 0.757235050201416,
|
|
"learning_rate": 9.654368743221022e-06,
|
|
"loss": 0.41148929595947265,
|
|
"memory(GiB)": 27.77,
|
|
"step": 55,
|
|
"token_acc": 0.8688445445767622,
|
|
"train_speed(iter/s)": 0.138016
|
|
},
|
|
{
|
|
"epoch": 0.3878787878787879,
|
|
"grad_norm": 0.787464439868927,
|
|
"learning_rate": 9.589582926268798e-06,
|
|
"loss": 0.40866241455078123,
|
|
"memory(GiB)": 30.15,
|
|
"step": 60,
|
|
"token_acc": 0.8828734404289198,
|
|
"train_speed(iter/s)": 0.139782
|
|
},
|
|
{
|
|
"epoch": 0.3878787878787879,
|
|
"eval_loss": 0.4253135919570923,
|
|
"eval_runtime": 5.4547,
|
|
"eval_samples_per_second": 18.333,
|
|
"eval_steps_per_second": 4.583,
|
|
"eval_token_acc": 0.8650277405777693,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.4202020202020202,
|
|
"grad_norm": 0.8023079633712769,
|
|
"learning_rate": 9.519492087344724e-06,
|
|
"loss": 0.3891183137893677,
|
|
"memory(GiB)": 30.15,
|
|
"step": 65,
|
|
"token_acc": 0.8755363232975173,
|
|
"train_speed(iter/s)": 0.135184
|
|
},
|
|
{
|
|
"epoch": 0.45252525252525255,
|
|
"grad_norm": 0.762795090675354,
|
|
"learning_rate": 9.444177243274619e-06,
|
|
"loss": 0.4177716255187988,
|
|
"memory(GiB)": 30.15,
|
|
"step": 70,
|
|
"token_acc": 0.8719830172135309,
|
|
"train_speed(iter/s)": 0.137146
|
|
},
|
|
{
|
|
"epoch": 0.48484848484848486,
|
|
"grad_norm": 0.7374395728111267,
|
|
"learning_rate": 9.363725449224281e-06,
|
|
"loss": 0.3992452621459961,
|
|
"memory(GiB)": 30.15,
|
|
"step": 75,
|
|
"token_acc": 0.872343302756429,
|
|
"train_speed(iter/s)": 0.138292
|
|
},
|
|
{
|
|
"epoch": 0.5171717171717172,
|
|
"grad_norm": 0.7449392080307007,
|
|
"learning_rate": 9.278229698073889e-06,
|
|
"loss": 0.39937677383422854,
|
|
"memory(GiB)": 30.15,
|
|
"step": 80,
|
|
"token_acc": 0.8710408988995696,
|
|
"train_speed(iter/s)": 0.138969
|
|
},
|
|
{
|
|
"epoch": 0.5171717171717172,
|
|
"eval_loss": 0.4182729721069336,
|
|
"eval_runtime": 5.4623,
|
|
"eval_samples_per_second": 18.307,
|
|
"eval_steps_per_second": 4.577,
|
|
"eval_token_acc": 0.8654868949684331,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.5494949494949495,
|
|
"grad_norm": 0.7495951652526855,
|
|
"learning_rate": 9.187788812929074e-06,
|
|
"loss": 0.39512038230895996,
|
|
"memory(GiB)": 30.15,
|
|
"step": 85,
|
|
"token_acc": 0.870334291390984,
|
|
"train_speed(iter/s)": 0.136059
|
|
},
|
|
{
|
|
"epoch": 0.5818181818181818,
|
|
"grad_norm": 0.8733298182487488,
|
|
"learning_rate": 9.092507332892968e-06,
|
|
"loss": 0.4132417678833008,
|
|
"memory(GiB)": 30.15,
|
|
"step": 90,
|
|
"token_acc": 0.8729202391435325,
|
|
"train_speed(iter/s)": 0.137339
|
|
},
|
|
{
|
|
"epoch": 0.6141414141414141,
|
|
"grad_norm": 0.8373153805732727,
|
|
"learning_rate": 8.992495392231195e-06,
|
|
"loss": 0.40344934463500975,
|
|
"memory(GiB)": 30.15,
|
|
"step": 95,
|
|
"token_acc": 0.8812556053811659,
|
|
"train_speed(iter/s)": 0.138502
|
|
},
|
|
{
|
|
"epoch": 0.6464646464646465,
|
|
"grad_norm": 0.8512130379676819,
|
|
"learning_rate": 8.88786859306952e-06,
|
|
"loss": 0.3863351821899414,
|
|
"memory(GiB)": 30.16,
|
|
"step": 100,
|
|
"token_acc": 0.8727646779553727,
|
|
"train_speed(iter/s)": 0.139249
|
|
},
|
|
{
|
|
"epoch": 0.6464646464646465,
|
|
"eval_loss": 0.4132169485092163,
|
|
"eval_runtime": 5.4403,
|
|
"eval_samples_per_second": 18.381,
|
|
"eval_steps_per_second": 4.595,
|
|
"eval_token_acc": 0.8673235125310885,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.6787878787878788,
|
|
"grad_norm": 0.8247293829917908,
|
|
"learning_rate": 8.778747871771293e-06,
|
|
"loss": 0.40260896682739256,
|
|
"memory(GiB)": 30.16,
|
|
"step": 105,
|
|
"token_acc": 0.8619686556852478,
|
|
"train_speed(iter/s)": 0.137282
|
|
},
|
|
{
|
|
"epoch": 0.7111111111111111,
|
|
"grad_norm": 0.7707865834236145,
|
|
"learning_rate": 8.665259359149132e-06,
|
|
"loss": 0.3850682020187378,
|
|
"memory(GiB)": 30.16,
|
|
"step": 110,
|
|
"token_acc": 0.8824787229538045,
|
|
"train_speed(iter/s)": 0.138008
|
|
},
|
|
{
|
|
"epoch": 0.7434343434343434,
|
|
"grad_norm": 0.7147298455238342,
|
|
"learning_rate": 8.547534234672435e-06,
|
|
"loss": 0.37834107875823975,
|
|
"memory(GiB)": 30.16,
|
|
"step": 115,
|
|
"token_acc": 0.8782488780852655,
|
|
"train_speed(iter/s)": 0.138792
|
|
},
|
|
{
|
|
"epoch": 0.7757575757575758,
|
|
"grad_norm": 0.7929354906082153,
|
|
"learning_rate": 8.425708574839221e-06,
|
|
"loss": 0.40454673767089844,
|
|
"memory(GiB)": 30.16,
|
|
"step": 120,
|
|
"token_acc": 0.8664209147790658,
|
|
"train_speed(iter/s)": 0.139398
|
|
},
|
|
{
|
|
"epoch": 0.7757575757575758,
|
|
"eval_loss": 0.40885430574417114,
|
|
"eval_runtime": 5.4542,
|
|
"eval_samples_per_second": 18.334,
|
|
"eval_steps_per_second": 4.584,
|
|
"eval_token_acc": 0.8693323129902429,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.8080808080808081,
|
|
"grad_norm": 0.7925952076911926,
|
|
"learning_rate": 8.299923195887599e-06,
|
|
"loss": 0.39439709186553956,
|
|
"memory(GiB)": 30.16,
|
|
"step": 125,
|
|
"token_acc": 0.8721447484554281,
|
|
"train_speed(iter/s)": 0.137539
|
|
},
|
|
{
|
|
"epoch": 0.8404040404040404,
|
|
"grad_norm": 0.7893044352531433,
|
|
"learning_rate": 8.170323491028625e-06,
|
|
"loss": 0.39348788261413575,
|
|
"memory(GiB)": 30.16,
|
|
"step": 130,
|
|
"token_acc": 0.872299544278852,
|
|
"train_speed(iter/s)": 0.137807
|
|
},
|
|
{
|
|
"epoch": 0.8727272727272727,
|
|
"grad_norm": 0.7297965884208679,
|
|
"learning_rate": 8.03705926238874e-06,
|
|
"loss": 0.390042781829834,
|
|
"memory(GiB)": 30.16,
|
|
"step": 135,
|
|
"token_acc": 0.872897976215314,
|
|
"train_speed(iter/s)": 0.138413
|
|
},
|
|
{
|
|
"epoch": 0.9050505050505051,
|
|
"grad_norm": 0.7339494824409485,
|
|
"learning_rate": 7.900284547855992e-06,
|
|
"loss": 0.3968710660934448,
|
|
"memory(GiB)": 30.16,
|
|
"step": 140,
|
|
"token_acc": 0.8762483817273904,
|
|
"train_speed(iter/s)": 0.138796
|
|
},
|
|
{
|
|
"epoch": 0.9050505050505051,
|
|
"eval_loss": 0.40561485290527344,
|
|
"eval_runtime": 5.454,
|
|
"eval_samples_per_second": 18.335,
|
|
"eval_steps_per_second": 4.584,
|
|
"eval_token_acc": 0.8705184618327912,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.9373737373737374,
|
|
"grad_norm": 0.7347233891487122,
|
|
"learning_rate": 7.760157443030234e-06,
|
|
"loss": 0.3751286506652832,
|
|
"memory(GiB)": 30.16,
|
|
"step": 145,
|
|
"token_acc": 0.8804492278895648,
|
|
"train_speed(iter/s)": 0.137132
|
|
},
|
|
{
|
|
"epoch": 0.9696969696969697,
|
|
"grad_norm": 0.7385802268981934,
|
|
"learning_rate": 7.616839918483061e-06,
|
|
"loss": 0.38750033378601073,
|
|
"memory(GiB)": 30.16,
|
|
"step": 150,
|
|
"token_acc": 0.8667628785284477,
|
|
"train_speed(iter/s)": 0.137631
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.7280504107475281,
|
|
"learning_rate": 7.470497632538743e-06,
|
|
"loss": 0.38422205448150637,
|
|
"memory(GiB)": 30.16,
|
|
"step": 155,
|
|
"token_acc": 0.8743071565213126,
|
|
"train_speed(iter/s)": 0.138389
|
|
},
|
|
{
|
|
"epoch": 1.0323232323232323,
|
|
"grad_norm": 0.7759126424789429,
|
|
"learning_rate": 7.321299739792553e-06,
|
|
"loss": 0.33709375858306884,
|
|
"memory(GiB)": 30.16,
|
|
"step": 160,
|
|
"token_acc": 0.8903214253738025,
|
|
"train_speed(iter/s)": 0.138965
|
|
},
|
|
{
|
|
"epoch": 1.0323232323232323,
|
|
"eval_loss": 0.41121506690979004,
|
|
"eval_runtime": 5.4481,
|
|
"eval_samples_per_second": 18.355,
|
|
"eval_steps_per_second": 4.589,
|
|
"eval_token_acc": 0.8699062559785727,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 1.0646464646464646,
|
|
"grad_norm": 0.7367027997970581,
|
|
"learning_rate": 7.169418695587791e-06,
|
|
"loss": 0.3059047222137451,
|
|
"memory(GiB)": 30.16,
|
|
"step": 165,
|
|
"token_acc": 0.893117110476366,
|
|
"train_speed(iter/s)": 0.13755
|
|
},
|
|
{
|
|
"epoch": 1.096969696969697,
|
|
"grad_norm": 0.7874158024787903,
|
|
"learning_rate": 7.015030056677559e-06,
|
|
"loss": 0.3194535255432129,
|
|
"memory(GiB)": 30.16,
|
|
"step": 170,
|
|
"token_acc": 0.8963855982498197,
|
|
"train_speed(iter/s)": 0.13837
|
|
},
|
|
{
|
|
"epoch": 1.1292929292929292,
|
|
"grad_norm": 0.8298231959342957,
|
|
"learning_rate": 6.858312278301638e-06,
|
|
"loss": 0.32886972427368166,
|
|
"memory(GiB)": 30.16,
|
|
"step": 175,
|
|
"token_acc": 0.8890347381744879,
|
|
"train_speed(iter/s)": 0.138796
|
|
},
|
|
{
|
|
"epoch": 1.1616161616161615,
|
|
"grad_norm": 0.7421779632568359,
|
|
"learning_rate": 6.699446507913083e-06,
|
|
"loss": 0.3223016977310181,
|
|
"memory(GiB)": 30.16,
|
|
"step": 180,
|
|
"token_acc": 0.8996364289240989,
|
|
"train_speed(iter/s)": 0.139126
|
|
},
|
|
{
|
|
"epoch": 1.1616161616161615,
|
|
"eval_loss": 0.4112629294395447,
|
|
"eval_runtime": 5.4548,
|
|
"eval_samples_per_second": 18.333,
|
|
"eval_steps_per_second": 4.583,
|
|
"eval_token_acc": 0.8689114214654677,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 1.1939393939393939,
|
|
"grad_norm": 0.6949385404586792,
|
|
"learning_rate": 6.53861637579291e-06,
|
|
"loss": 0.3096341609954834,
|
|
"memory(GiB)": 30.16,
|
|
"step": 185,
|
|
"token_acc": 0.8867440022985204,
|
|
"train_speed(iter/s)": 0.137864
|
|
},
|
|
{
|
|
"epoch": 1.2262626262626264,
|
|
"grad_norm": 0.7675971984863281,
|
|
"learning_rate": 6.376007782794926e-06,
|
|
"loss": 0.3296669483184814,
|
|
"memory(GiB)": 30.16,
|
|
"step": 190,
|
|
"token_acc": 0.8872481430414091,
|
|
"train_speed(iter/s)": 0.138534
|
|
},
|
|
{
|
|
"epoch": 1.2585858585858585,
|
|
"grad_norm": 0.6753478646278381,
|
|
"learning_rate": 6.211808685466063e-06,
|
|
"loss": 0.31036269664764404,
|
|
"memory(GiB)": 30.16,
|
|
"step": 195,
|
|
"token_acc": 0.8989660334986432,
|
|
"train_speed(iter/s)": 0.139226
|
|
},
|
|
{
|
|
"epoch": 1.290909090909091,
|
|
"grad_norm": 0.7082095742225647,
|
|
"learning_rate": 6.046208878790543e-06,
|
|
"loss": 0.3189213752746582,
|
|
"memory(GiB)": 30.16,
|
|
"step": 200,
|
|
"token_acc": 0.893559169826382,
|
|
"train_speed(iter/s)": 0.139505
|
|
},
|
|
{
|
|
"epoch": 1.290909090909091,
|
|
"eval_loss": 0.4101768136024475,
|
|
"eval_runtime": 5.4502,
|
|
"eval_samples_per_second": 18.348,
|
|
"eval_steps_per_second": 4.587,
|
|
"eval_token_acc": 0.8691792615266883,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 1.3232323232323233,
|
|
"grad_norm": 0.7023712992668152,
|
|
"learning_rate": 5.879399776809047e-06,
|
|
"loss": 0.3078160285949707,
|
|
"memory(GiB)": 30.16,
|
|
"step": 205,
|
|
"token_acc": 0.8920373624341071,
|
|
"train_speed(iter/s)": 0.138308
|
|
},
|
|
{
|
|
"epoch": 1.3555555555555556,
|
|
"grad_norm": 0.7248120307922363,
|
|
"learning_rate": 5.711574191366427e-06,
|
|
"loss": 0.326322340965271,
|
|
"memory(GiB)": 30.16,
|
|
"step": 210,
|
|
"token_acc": 0.888576901881544,
|
|
"train_speed(iter/s)": 0.138642
|
|
},
|
|
{
|
|
"epoch": 1.387878787878788,
|
|
"grad_norm": 0.7424785494804382,
|
|
"learning_rate": 5.542926109243727e-06,
|
|
"loss": 0.3178426504135132,
|
|
"memory(GiB)": 30.16,
|
|
"step": 215,
|
|
"token_acc": 0.8996045025859446,
|
|
"train_speed(iter/s)": 0.138982
|
|
},
|
|
{
|
|
"epoch": 1.4202020202020202,
|
|
"grad_norm": 0.7585700154304504,
|
|
"learning_rate": 5.373650467932122e-06,
|
|
"loss": 0.31358323097229,
|
|
"memory(GiB)": 30.16,
|
|
"step": 220,
|
|
"token_acc": 0.8893666839273251,
|
|
"train_speed(iter/s)": 0.139322
|
|
},
|
|
{
|
|
"epoch": 1.4202020202020202,
|
|
"eval_loss": 0.4098529815673828,
|
|
"eval_runtime": 5.4526,
|
|
"eval_samples_per_second": 18.34,
|
|
"eval_steps_per_second": 4.585,
|
|
"eval_token_acc": 0.8699062559785727,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 1.4525252525252526,
|
|
"grad_norm": 0.7577831149101257,
|
|
"learning_rate": 5.2039429303079294e-06,
|
|
"loss": 0.3181041717529297,
|
|
"memory(GiB)": 30.16,
|
|
"step": 225,
|
|
"token_acc": 0.8940511833475905,
|
|
"train_speed(iter/s)": 0.138553
|
|
},
|
|
{
|
|
"epoch": 1.4848484848484849,
|
|
"grad_norm": 0.8157357573509216,
|
|
"learning_rate": 5.033999658469174e-06,
|
|
"loss": 0.3100062370300293,
|
|
"memory(GiB)": 30.16,
|
|
"step": 230,
|
|
"token_acc": 0.8942779905384095,
|
|
"train_speed(iter/s)": 0.138826
|
|
},
|
|
{
|
|
"epoch": 1.5171717171717172,
|
|
"grad_norm": 0.7473869919776917,
|
|
"learning_rate": 4.864017086995112e-06,
|
|
"loss": 0.3215769290924072,
|
|
"memory(GiB)": 30.16,
|
|
"step": 235,
|
|
"token_acc": 0.8864230396902226,
|
|
"train_speed(iter/s)": 0.139172
|
|
},
|
|
{
|
|
"epoch": 1.5494949494949495,
|
|
"grad_norm": 0.7379017472267151,
|
|
"learning_rate": 4.694191695890788e-06,
|
|
"loss": 0.32453505992889403,
|
|
"memory(GiB)": 30.16,
|
|
"step": 240,
|
|
"token_acc": 0.9024658286970259,
|
|
"train_speed(iter/s)": 0.139554
|
|
},
|
|
{
|
|
"epoch": 1.5494949494949495,
|
|
"eval_loss": 0.406717449426651,
|
|
"eval_runtime": 5.4667,
|
|
"eval_samples_per_second": 18.293,
|
|
"eval_steps_per_second": 4.573,
|
|
"eval_token_acc": 0.8711880619858428,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 1.5818181818181818,
|
|
"grad_norm": 0.7182732224464417,
|
|
"learning_rate": 4.524719783479088e-06,
|
|
"loss": 0.3010763645172119,
|
|
"memory(GiB)": 30.16,
|
|
"step": 245,
|
|
"token_acc": 0.8939800233960227,
|
|
"train_speed(iter/s)": 0.138563
|
|
},
|
|
{
|
|
"epoch": 1.614141414141414,
|
|
"grad_norm": 0.7385874390602112,
|
|
"learning_rate": 4.355797239502807e-06,
|
|
"loss": 0.30601317882537843,
|
|
"memory(GiB)": 30.16,
|
|
"step": 250,
|
|
"token_acc": 0.9005994116476079,
|
|
"train_speed(iter/s)": 0.138773
|
|
},
|
|
{
|
|
"epoch": 1.6464646464646466,
|
|
"grad_norm": 0.7460725903511047,
|
|
"learning_rate": 4.187619318698971e-06,
|
|
"loss": 0.32054686546325684,
|
|
"memory(GiB)": 30.16,
|
|
"step": 255,
|
|
"token_acc": 0.8981558249490219,
|
|
"train_speed(iter/s)": 0.139197
|
|
},
|
|
{
|
|
"epoch": 1.6787878787878787,
|
|
"grad_norm": 0.7663230299949646,
|
|
"learning_rate": 4.020380415107167e-06,
|
|
"loss": 0.32004489898681643,
|
|
"memory(GiB)": 30.16,
|
|
"step": 260,
|
|
"token_acc": 0.899984937490586,
|
|
"train_speed(iter/s)": 0.139402
|
|
},
|
|
{
|
|
"epoch": 1.6787878787878787,
|
|
"eval_loss": 0.406484454870224,
|
|
"eval_runtime": 5.4651,
|
|
"eval_samples_per_second": 18.298,
|
|
"eval_steps_per_second": 4.574,
|
|
"eval_token_acc": 0.8711306676870098,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 1.7111111111111112,
|
|
"grad_norm": 0.7412447929382324,
|
|
"learning_rate": 3.854273837372724e-06,
|
|
"loss": 0.3273331642150879,
|
|
"memory(GiB)": 30.16,
|
|
"step": 265,
|
|
"token_acc": 0.8869161986402602,
|
|
"train_speed(iter/s)": 0.138596
|
|
},
|
|
{
|
|
"epoch": 1.7434343434343433,
|
|
"grad_norm": 0.773398756980896,
|
|
"learning_rate": 3.689491585304491e-06,
|
|
"loss": 0.3207144498825073,
|
|
"memory(GiB)": 30.16,
|
|
"step": 270,
|
|
"token_acc": 0.8838720231835285,
|
|
"train_speed(iter/s)": 0.138842
|
|
},
|
|
{
|
|
"epoch": 1.7757575757575759,
|
|
"grad_norm": 0.737702488899231,
|
|
"learning_rate": 3.526224127945479e-06,
|
|
"loss": 0.32349045276641847,
|
|
"memory(GiB)": 30.16,
|
|
"step": 275,
|
|
"token_acc": 0.899477893067213,
|
|
"train_speed(iter/s)": 0.139328
|
|
},
|
|
{
|
|
"epoch": 1.808080808080808,
|
|
"grad_norm": 0.7224950194358826,
|
|
"learning_rate": 3.3646601834128924e-06,
|
|
"loss": 0.30070719718933103,
|
|
"memory(GiB)": 30.16,
|
|
"step": 280,
|
|
"token_acc": 0.8971495671394364,
|
|
"train_speed(iter/s)": 0.139562
|
|
},
|
|
{
|
|
"epoch": 1.808080808080808,
|
|
"eval_loss": 0.40459758043289185,
|
|
"eval_runtime": 5.4432,
|
|
"eval_samples_per_second": 18.371,
|
|
"eval_steps_per_second": 4.593,
|
|
"eval_token_acc": 0.8716854792423953,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 1.8404040404040405,
|
|
"grad_norm": 0.6978333592414856,
|
|
"learning_rate": 3.204986500762006e-06,
|
|
"loss": 0.31296162605285643,
|
|
"memory(GiB)": 30.16,
|
|
"step": 285,
|
|
"token_acc": 0.8931119696495075,
|
|
"train_speed(iter/s)": 0.138664
|
|
},
|
|
{
|
|
"epoch": 1.8727272727272726,
|
|
"grad_norm": 0.7149765491485596,
|
|
"learning_rate": 3.0473876441260786e-06,
|
|
"loss": 0.2978228569030762,
|
|
"memory(GiB)": 30.16,
|
|
"step": 290,
|
|
"token_acc": 0.9099453551912569,
|
|
"train_speed(iter/s)": 0.138912
|
|
},
|
|
{
|
|
"epoch": 1.905050505050505,
|
|
"grad_norm": 0.7401219010353088,
|
|
"learning_rate": 2.8920457793817507e-06,
|
|
"loss": 0.3145498752593994,
|
|
"memory(GiB)": 30.16,
|
|
"step": 295,
|
|
"token_acc": 0.8971085419769723,
|
|
"train_speed(iter/s)": 0.139171
|
|
},
|
|
{
|
|
"epoch": 1.9373737373737374,
|
|
"grad_norm": 0.7960948348045349,
|
|
"learning_rate": 2.7391404635865725e-06,
|
|
"loss": 0.31858437061309813,
|
|
"memory(GiB)": 30.16,
|
|
"step": 300,
|
|
"token_acc": 0.8927697189483228,
|
|
"train_speed(iter/s)": 0.139487
|
|
},
|
|
{
|
|
"epoch": 1.9373737373737374,
|
|
"eval_loss": 0.40253254771232605,
|
|
"eval_runtime": 5.4598,
|
|
"eval_samples_per_second": 18.316,
|
|
"eval_steps_per_second": 4.579,
|
|
"eval_token_acc": 0.871608953510618,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 1.9696969696969697,
|
|
"grad_norm": 0.7060583233833313,
|
|
"learning_rate": 2.5888484374320033e-06,
|
|
"loss": 0.3089438438415527,
|
|
"memory(GiB)": 30.16,
|
|
"step": 305,
|
|
"token_acc": 0.8951569409988135,
|
|
"train_speed(iter/s)": 0.138796
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 0.8663066625595093,
|
|
"learning_rate": 2.4413434209518137e-06,
|
|
"loss": 0.30643525123596194,
|
|
"memory(GiB)": 30.16,
|
|
"step": 310,
|
|
"token_acc": 0.9002104614752836,
|
|
"train_speed(iter/s)": 0.139084
|
|
},
|
|
{
|
|
"epoch": 2.0323232323232325,
|
|
"grad_norm": 0.6777763366699219,
|
|
"learning_rate": 2.296795912722014e-06,
|
|
"loss": 0.2622525691986084,
|
|
"memory(GiB)": 30.16,
|
|
"step": 315,
|
|
"token_acc": 0.9182915057915058,
|
|
"train_speed(iter/s)": 0.139164
|
|
},
|
|
{
|
|
"epoch": 2.0646464646464646,
|
|
"grad_norm": 0.7604569792747498,
|
|
"learning_rate": 2.1553729927843894e-06,
|
|
"loss": 0.2744235277175903,
|
|
"memory(GiB)": 30.16,
|
|
"step": 320,
|
|
"token_acc": 0.9147708067912951,
|
|
"train_speed(iter/s)": 0.139363
|
|
},
|
|
{
|
|
"epoch": 2.0646464646464646,
|
|
"eval_loss": 0.41404902935028076,
|
|
"eval_runtime": 5.4464,
|
|
"eval_samples_per_second": 18.361,
|
|
"eval_steps_per_second": 4.59,
|
|
"eval_token_acc": 0.8715898220776737,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 2.096969696969697,
|
|
"grad_norm": 0.7027745246887207,
|
|
"learning_rate": 2.017238129521506e-06,
|
|
"loss": 0.2601346492767334,
|
|
"memory(GiB)": 30.16,
|
|
"step": 325,
|
|
"token_acc": 0.9096833050957904,
|
|
"train_speed(iter/s)": 0.138685
|
|
},
|
|
{
|
|
"epoch": 2.1292929292929292,
|
|
"grad_norm": 0.7389653325080872,
|
|
"learning_rate": 1.8825509907063328e-06,
|
|
"loss": 0.26451470851898196,
|
|
"memory(GiB)": 30.16,
|
|
"step": 330,
|
|
"token_acc": 0.9258255445505091,
|
|
"train_speed(iter/s)": 0.138988
|
|
},
|
|
{
|
|
"epoch": 2.1616161616161618,
|
|
"grad_norm": 0.750593364238739,
|
|
"learning_rate": 1.7514672589449378e-06,
|
|
"loss": 0.283371901512146,
|
|
"memory(GiB)": 30.16,
|
|
"step": 335,
|
|
"token_acc": 0.904814352497736,
|
|
"train_speed(iter/s)": 0.139218
|
|
},
|
|
{
|
|
"epoch": 2.193939393939394,
|
|
"grad_norm": 0.6902281641960144,
|
|
"learning_rate": 1.6241384517255854e-06,
|
|
"loss": 0.2589299440383911,
|
|
"memory(GiB)": 30.16,
|
|
"step": 340,
|
|
"token_acc": 0.9170253055603375,
|
|
"train_speed(iter/s)": 0.139418
|
|
},
|
|
{
|
|
"epoch": 2.193939393939394,
|
|
"eval_loss": 0.4144185781478882,
|
|
"eval_runtime": 5.4505,
|
|
"eval_samples_per_second": 18.347,
|
|
"eval_steps_per_second": 4.587,
|
|
"eval_token_acc": 0.8716472163765066,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 2.2262626262626264,
|
|
"grad_norm": 0.6674037575721741,
|
|
"learning_rate": 1.500711746282192e-06,
|
|
"loss": 0.2723775148391724,
|
|
"memory(GiB)": 30.16,
|
|
"step": 345,
|
|
"token_acc": 0.9019237534484993,
|
|
"train_speed(iter/s)": 0.138831
|
|
},
|
|
{
|
|
"epoch": 2.2585858585858585,
|
|
"grad_norm": 0.7093120217323303,
|
|
"learning_rate": 1.3813298094746491e-06,
|
|
"loss": 0.2645721912384033,
|
|
"memory(GiB)": 30.16,
|
|
"step": 350,
|
|
"token_acc": 0.9119130680746748,
|
|
"train_speed(iter/s)": 0.138979
|
|
},
|
|
{
|
|
"epoch": 2.290909090909091,
|
|
"grad_norm": 0.6906498074531555,
|
|
"learning_rate": 1.2661306328825818e-06,
|
|
"loss": 0.259444522857666,
|
|
"memory(GiB)": 30.16,
|
|
"step": 355,
|
|
"token_acc": 0.9145142038672714,
|
|
"train_speed(iter/s)": 0.139182
|
|
},
|
|
{
|
|
"epoch": 2.323232323232323,
|
|
"grad_norm": 0.7055203318595886,
|
|
"learning_rate": 1.1552473733031893e-06,
|
|
"loss": 0.25058302879333494,
|
|
"memory(GiB)": 30.16,
|
|
"step": 360,
|
|
"token_acc": 0.9134637201070926,
|
|
"train_speed(iter/s)": 0.139508
|
|
},
|
|
{
|
|
"epoch": 2.323232323232323,
|
|
"eval_loss": 0.41360363364219666,
|
|
"eval_runtime": 5.4444,
|
|
"eval_samples_per_second": 18.367,
|
|
"eval_steps_per_second": 4.592,
|
|
"eval_token_acc": 0.8721828964989478,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 2.3555555555555556,
|
|
"grad_norm": 0.7174535393714905,
|
|
"learning_rate": 1.0488081988375493e-06,
|
|
"loss": 0.26172120571136476,
|
|
"memory(GiB)": 30.16,
|
|
"step": 365,
|
|
"token_acc": 0.9057994175722708,
|
|
"train_speed(iter/s)": 0.139028
|
|
},
|
|
{
|
|
"epoch": 2.3878787878787877,
|
|
"grad_norm": 0.7143301367759705,
|
|
"learning_rate": 9.469361407432431e-07,
|
|
"loss": 0.2703177213668823,
|
|
"memory(GiB)": 30.16,
|
|
"step": 370,
|
|
"token_acc": 0.9194112781795432,
|
|
"train_speed(iter/s)": 0.139208
|
|
},
|
|
{
|
|
"epoch": 2.4202020202020202,
|
|
"grad_norm": 0.7012506127357483,
|
|
"learning_rate": 8.497489512245971e-07,
|
|
"loss": 0.27690658569335935,
|
|
"memory(GiB)": 30.16,
|
|
"step": 375,
|
|
"token_acc": 0.9169483450919897,
|
|
"train_speed(iter/s)": 0.139394
|
|
},
|
|
{
|
|
"epoch": 2.4525252525252528,
|
|
"grad_norm": 0.7648996114730835,
|
|
"learning_rate": 7.573589673248833e-07,
|
|
"loss": 0.26938657760620116,
|
|
"memory(GiB)": 30.16,
|
|
"step": 380,
|
|
"token_acc": 0.9042763382008948,
|
|
"train_speed(iter/s)": 0.139668
|
|
},
|
|
{
|
|
"epoch": 2.4525252525252528,
|
|
"eval_loss": 0.41534754633903503,
|
|
"eval_runtime": 5.5436,
|
|
"eval_samples_per_second": 18.039,
|
|
"eval_steps_per_second": 4.51,
|
|
"eval_token_acc": 0.871436770614119,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 2.484848484848485,
|
|
"grad_norm": 0.6768928170204163,
|
|
"learning_rate": 6.698729810778065e-07,
|
|
"loss": 0.2653059482574463,
|
|
"memory(GiB)": 30.16,
|
|
"step": 385,
|
|
"token_acc": 0.9062684911242603,
|
|
"train_speed(iter/s)": 0.139087
|
|
},
|
|
{
|
|
"epoch": 2.517171717171717,
|
|
"grad_norm": 0.6828300952911377,
|
|
"learning_rate": 5.873921160683943e-07,
|
|
"loss": 0.27868268489837644,
|
|
"memory(GiB)": 30.16,
|
|
"step": 390,
|
|
"token_acc": 0.9041146306155998,
|
|
"train_speed(iter/s)": 0.139361
|
|
},
|
|
{
|
|
"epoch": 2.5494949494949495,
|
|
"grad_norm": 0.6979082822799683,
|
|
"learning_rate": 5.100117105459279e-07,
|
|
"loss": 0.24405245780944823,
|
|
"memory(GiB)": 30.16,
|
|
"step": 395,
|
|
"token_acc": 0.9200107009095773,
|
|
"train_speed(iter/s)": 0.139546
|
|
},
|
|
{
|
|
"epoch": 2.581818181818182,
|
|
"grad_norm": 0.6200575828552246,
|
|
"learning_rate": 4.3782120722406565e-07,
|
|
"loss": 0.2625063419342041,
|
|
"memory(GiB)": 30.16,
|
|
"step": 400,
|
|
"token_acc": 0.9186572124972302,
|
|
"train_speed(iter/s)": 0.139799
|
|
},
|
|
{
|
|
"epoch": 2.581818181818182,
|
|
"eval_loss": 0.41504132747650146,
|
|
"eval_runtime": 5.4443,
|
|
"eval_samples_per_second": 18.368,
|
|
"eval_steps_per_second": 4.592,
|
|
"eval_token_acc": 0.871723742108284,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 2.614141414141414,
|
|
"grad_norm": 0.7500612139701843,
|
|
"learning_rate": 3.709040498955102e-07,
|
|
"loss": 0.26823058128356936,
|
|
"memory(GiB)": 30.16,
|
|
"step": 405,
|
|
"token_acc": 0.8981710236522072,
|
|
"train_speed(iter/s)": 0.139221
|
|
},
|
|
{
|
|
"epoch": 2.6464646464646466,
|
|
"grad_norm": 0.8057283163070679,
|
|
"learning_rate": 3.0933758698072023e-07,
|
|
"loss": 0.27291839122772216,
|
|
"memory(GiB)": 30.16,
|
|
"step": 410,
|
|
"token_acc": 0.9183253730661121,
|
|
"train_speed(iter/s)": 0.139402
|
|
},
|
|
{
|
|
"epoch": 2.6787878787878787,
|
|
"grad_norm": 0.7092121243476868,
|
|
"learning_rate": 2.531929821221768e-07,
|
|
"loss": 0.28043303489685056,
|
|
"memory(GiB)": 30.16,
|
|
"step": 415,
|
|
"token_acc": 0.9004696220894495,
|
|
"train_speed(iter/s)": 0.13956
|
|
},
|
|
{
|
|
"epoch": 2.7111111111111112,
|
|
"grad_norm": 0.7463679909706116,
|
|
"learning_rate": 2.0253513192751374e-07,
|
|
"loss": 0.26267204284667967,
|
|
"memory(GiB)": 30.16,
|
|
"step": 420,
|
|
"token_acc": 0.9165323480546532,
|
|
"train_speed(iter/s)": 0.139835
|
|
},
|
|
{
|
|
"epoch": 2.7111111111111112,
|
|
"eval_loss": 0.41523492336273193,
|
|
"eval_runtime": 5.4494,
|
|
"eval_samples_per_second": 18.351,
|
|
"eval_steps_per_second": 4.588,
|
|
"eval_token_acc": 0.8714559020470634,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 2.7434343434343433,
|
|
"grad_norm": 0.6929903626441956,
|
|
"learning_rate": 1.5742259095662126e-07,
|
|
"loss": 0.26644191741943357,
|
|
"memory(GiB)": 30.16,
|
|
"step": 425,
|
|
"token_acc": 0.8992937483651583,
|
|
"train_speed(iter/s)": 0.139333
|
|
},
|
|
{
|
|
"epoch": 2.775757575757576,
|
|
"grad_norm": 0.707722008228302,
|
|
"learning_rate": 1.1790750403941231e-07,
|
|
"loss": 0.266437292098999,
|
|
"memory(GiB)": 30.16,
|
|
"step": 430,
|
|
"token_acc": 0.9212307137056753,
|
|
"train_speed(iter/s)": 0.139491
|
|
},
|
|
{
|
|
"epoch": 2.808080808080808,
|
|
"grad_norm": 0.7085736393928528,
|
|
"learning_rate": 8.403554600248498e-08,
|
|
"loss": 0.25338120460510255,
|
|
"memory(GiB)": 30.16,
|
|
"step": 435,
|
|
"token_acc": 0.9183108895950982,
|
|
"train_speed(iter/s)": 0.139672
|
|
},
|
|
{
|
|
"epoch": 2.8404040404040405,
|
|
"grad_norm": 0.7079160213470459,
|
|
"learning_rate": 5.584586887435739e-08,
|
|
"loss": 0.26110315322875977,
|
|
"memory(GiB)": 30.16,
|
|
"step": 440,
|
|
"token_acc": 0.9123924065558306,
|
|
"train_speed(iter/s)": 0.139879
|
|
},
|
|
{
|
|
"epoch": 2.8404040404040405,
|
|
"eval_loss": 0.4152638614177704,
|
|
"eval_runtime": 5.4468,
|
|
"eval_samples_per_second": 18.359,
|
|
"eval_steps_per_second": 4.59,
|
|
"eval_token_acc": 0.8718959250047829,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 2.8727272727272726,
|
|
"grad_norm": 0.7511946558952332,
|
|
"learning_rate": 3.337105663029361e-08,
|
|
"loss": 0.2701514482498169,
|
|
"memory(GiB)": 30.16,
|
|
"step": 445,
|
|
"token_acc": 0.9035763569457221,
|
|
"train_speed(iter/s)": 0.13938
|
|
},
|
|
{
|
|
"epoch": 2.905050505050505,
|
|
"grad_norm": 0.7157277464866638,
|
|
"learning_rate": 1.6637087529033925e-08,
|
|
"loss": 0.25613832473754883,
|
|
"memory(GiB)": 30.16,
|
|
"step": 450,
|
|
"token_acc": 0.9136720727064674,
|
|
"train_speed(iter/s)": 0.139627
|
|
},
|
|
{
|
|
"epoch": 2.937373737373737,
|
|
"grad_norm": 0.7009196281433105,
|
|
"learning_rate": 5.6633040849601865e-09,
|
|
"loss": 0.25980963706970217,
|
|
"memory(GiB)": 30.16,
|
|
"step": 455,
|
|
"token_acc": 0.9089919103920349,
|
|
"train_speed(iter/s)": 0.139745
|
|
},
|
|
{
|
|
"epoch": 2.9696969696969697,
|
|
"grad_norm": 0.7183138728141785,
|
|
"learning_rate": 4.623907104084335e-10,
|
|
"loss": 0.27978599071502686,
|
|
"memory(GiB)": 30.16,
|
|
"step": 460,
|
|
"token_acc": 0.9131394658753709,
|
|
"train_speed(iter/s)": 0.140016
|
|
},
|
|
{
|
|
"epoch": 2.9696969696969697,
|
|
"eval_loss": 0.41502535343170166,
|
|
"eval_runtime": 5.4392,
|
|
"eval_samples_per_second": 18.385,
|
|
"eval_steps_per_second": 4.596,
|
|
"eval_token_acc": 0.8717046106753396,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 2.9826262626262627,
|
|
"eval_loss": 0.41519150137901306,
|
|
"eval_runtime": 5.4401,
|
|
"eval_samples_per_second": 18.382,
|
|
"eval_steps_per_second": 4.596,
|
|
"eval_token_acc": 0.871608953510618,
|
|
"step": 462
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 462,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 6.033815544329667e+17,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|