Model: waltonfuture/qwen2.5vl-3b-sampled_15000_mixed-reflection-cot-32b Source: Original Platform
1750 lines
50 KiB
JSON
1750 lines
50 KiB
JSON
{
|
|
"best_global_step": 460,
|
|
"best_metric": 0.21736681,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v33-20250511-155921/checkpoint-460",
|
|
"epoch": 2.9911123081066524,
|
|
"eval_steps": 20,
|
|
"global_step": 696,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0043091839482897925,
|
|
"grad_norm": 2.5237181186676025,
|
|
"learning_rate": 9.99994906450425e-06,
|
|
"loss": 0.4181586503982544,
|
|
"memory(GiB)": 30.52,
|
|
"step": 1,
|
|
"token_acc": 0.8749789668517584,
|
|
"train_speed(iter/s)": 0.065716
|
|
},
|
|
{
|
|
"epoch": 0.02154591974144896,
|
|
"grad_norm": 1.6377122402191162,
|
|
"learning_rate": 9.99872666449397e-06,
|
|
"loss": 0.346223920583725,
|
|
"memory(GiB)": 30.53,
|
|
"step": 5,
|
|
"token_acc": 0.892663520418113,
|
|
"train_speed(iter/s)": 0.123965
|
|
},
|
|
{
|
|
"epoch": 0.04309183948289792,
|
|
"grad_norm": 1.0654209852218628,
|
|
"learning_rate": 9.994907306529203e-06,
|
|
"loss": 0.28658442497253417,
|
|
"memory(GiB)": 30.53,
|
|
"step": 10,
|
|
"token_acc": 0.9107923074276671,
|
|
"train_speed(iter/s)": 0.141394
|
|
},
|
|
{
|
|
"epoch": 0.06463775922434689,
|
|
"grad_norm": 0.9550829529762268,
|
|
"learning_rate": 9.988543871435342e-06,
|
|
"loss": 0.27949891090393064,
|
|
"memory(GiB)": 30.53,
|
|
"step": 15,
|
|
"token_acc": 0.9063064246085158,
|
|
"train_speed(iter/s)": 0.145526
|
|
},
|
|
{
|
|
"epoch": 0.08618367896579585,
|
|
"grad_norm": 0.9428064227104187,
|
|
"learning_rate": 9.979639600327522e-06,
|
|
"loss": 0.26371135711669924,
|
|
"memory(GiB)": 30.53,
|
|
"step": 20,
|
|
"token_acc": 0.9098609104557944,
|
|
"train_speed(iter/s)": 0.147486
|
|
},
|
|
{
|
|
"epoch": 0.08618367896579585,
|
|
"eval_loss": 0.28782597184181213,
|
|
"eval_runtime": 9.1011,
|
|
"eval_samples_per_second": 16.482,
|
|
"eval_steps_per_second": 4.175,
|
|
"eval_token_acc": 0.9104138732457316,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.10772959870724481,
|
|
"grad_norm": 0.8753194808959961,
|
|
"learning_rate": 9.96819902845557e-06,
|
|
"loss": 0.2613688468933105,
|
|
"memory(GiB)": 30.53,
|
|
"step": 25,
|
|
"token_acc": 0.9133286666148143,
|
|
"train_speed(iter/s)": 0.132937
|
|
},
|
|
{
|
|
"epoch": 0.12927551844869378,
|
|
"grad_norm": 0.8821267485618591,
|
|
"learning_rate": 9.954227982894034e-06,
|
|
"loss": 0.2568368434906006,
|
|
"memory(GiB)": 30.53,
|
|
"step": 30,
|
|
"token_acc": 0.9194433893857257,
|
|
"train_speed(iter/s)": 0.136714
|
|
},
|
|
{
|
|
"epoch": 0.15082143819014274,
|
|
"grad_norm": 0.8757939338684082,
|
|
"learning_rate": 9.937733579574263e-06,
|
|
"loss": 0.2422783136367798,
|
|
"memory(GiB)": 30.53,
|
|
"step": 35,
|
|
"token_acc": 0.9147548274711834,
|
|
"train_speed(iter/s)": 0.138761
|
|
},
|
|
{
|
|
"epoch": 0.1723673579315917,
|
|
"grad_norm": 0.849438488483429,
|
|
"learning_rate": 9.918724219660013e-06,
|
|
"loss": 0.24745543003082277,
|
|
"memory(GiB)": 30.53,
|
|
"step": 40,
|
|
"token_acc": 0.9217827626918536,
|
|
"train_speed(iter/s)": 0.141087
|
|
},
|
|
{
|
|
"epoch": 0.1723673579315917,
|
|
"eval_loss": 0.26439228653907776,
|
|
"eval_runtime": 9.0902,
|
|
"eval_samples_per_second": 16.501,
|
|
"eval_steps_per_second": 4.18,
|
|
"eval_token_acc": 0.9148833467417539,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.19391327767304067,
|
|
"grad_norm": 0.8241924047470093,
|
|
"learning_rate": 9.897209585268459e-06,
|
|
"loss": 0.2613823413848877,
|
|
"memory(GiB)": 30.53,
|
|
"step": 45,
|
|
"token_acc": 0.9152877021436631,
|
|
"train_speed(iter/s)": 0.13376
|
|
},
|
|
{
|
|
"epoch": 0.21545919741448963,
|
|
"grad_norm": 0.8378185033798218,
|
|
"learning_rate": 9.873200634538746e-06,
|
|
"loss": 0.2541953086853027,
|
|
"memory(GiB)": 30.53,
|
|
"step": 50,
|
|
"token_acc": 0.9222211442062029,
|
|
"train_speed(iter/s)": 0.135296
|
|
},
|
|
{
|
|
"epoch": 0.23700511715593858,
|
|
"grad_norm": 0.8501411080360413,
|
|
"learning_rate": 9.846709596050646e-06,
|
|
"loss": 0.24187664985656737,
|
|
"memory(GiB)": 30.53,
|
|
"step": 55,
|
|
"token_acc": 0.9192013726407736,
|
|
"train_speed(iter/s)": 0.136824
|
|
},
|
|
{
|
|
"epoch": 0.25855103689738757,
|
|
"grad_norm": 0.9697138071060181,
|
|
"learning_rate": 9.817749962596115e-06,
|
|
"loss": 0.2432713031768799,
|
|
"memory(GiB)": 30.53,
|
|
"step": 60,
|
|
"token_acc": 0.9142042416681518,
|
|
"train_speed(iter/s)": 0.138638
|
|
},
|
|
{
|
|
"epoch": 0.25855103689738757,
|
|
"eval_loss": 0.25323811173439026,
|
|
"eval_runtime": 9.0482,
|
|
"eval_samples_per_second": 16.578,
|
|
"eval_steps_per_second": 4.2,
|
|
"eval_token_acc": 0.9181013676588898,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.28009695663883655,
|
|
"grad_norm": 0.7712034583091736,
|
|
"learning_rate": 9.786336484306966e-06,
|
|
"loss": 0.25183610916137694,
|
|
"memory(GiB)": 30.53,
|
|
"step": 65,
|
|
"token_acc": 0.9200226351032726,
|
|
"train_speed(iter/s)": 0.134844
|
|
},
|
|
{
|
|
"epoch": 0.3016428763802855,
|
|
"grad_norm": 0.822388231754303,
|
|
"learning_rate": 9.752485161142103e-06,
|
|
"loss": 0.23003783226013183,
|
|
"memory(GiB)": 30.53,
|
|
"step": 70,
|
|
"token_acc": 0.9277887742117913,
|
|
"train_speed(iter/s)": 0.136026
|
|
},
|
|
{
|
|
"epoch": 0.32318879612173446,
|
|
"grad_norm": 0.792780339717865,
|
|
"learning_rate": 9.716213234738216e-06,
|
|
"loss": 0.2195737838745117,
|
|
"memory(GiB)": 30.53,
|
|
"step": 75,
|
|
"token_acc": 0.9205170406647666,
|
|
"train_speed(iter/s)": 0.137139
|
|
},
|
|
{
|
|
"epoch": 0.3447347158631834,
|
|
"grad_norm": 0.8536691069602966,
|
|
"learning_rate": 9.677539179628005e-06,
|
|
"loss": 0.2305692672729492,
|
|
"memory(GiB)": 30.53,
|
|
"step": 80,
|
|
"token_acc": 0.9304421364200957,
|
|
"train_speed(iter/s)": 0.138275
|
|
},
|
|
{
|
|
"epoch": 0.3447347158631834,
|
|
"eval_loss": 0.24655859172344208,
|
|
"eval_runtime": 9.1028,
|
|
"eval_samples_per_second": 16.479,
|
|
"eval_steps_per_second": 4.175,
|
|
"eval_token_acc": 0.9181907571288103,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.36628063560463237,
|
|
"grad_norm": 0.8240283727645874,
|
|
"learning_rate": 9.636482693830488e-06,
|
|
"loss": 0.2409297466278076,
|
|
"memory(GiB)": 30.53,
|
|
"step": 85,
|
|
"token_acc": 0.9139544242668143,
|
|
"train_speed(iter/s)": 0.134608
|
|
},
|
|
{
|
|
"epoch": 0.38782655534608135,
|
|
"grad_norm": 0.793641984462738,
|
|
"learning_rate": 9.59306468881811e-06,
|
|
"loss": 0.24126851558685303,
|
|
"memory(GiB)": 30.53,
|
|
"step": 90,
|
|
"token_acc": 0.9150926617579056,
|
|
"train_speed(iter/s)": 0.136497
|
|
},
|
|
{
|
|
"epoch": 0.4093724750875303,
|
|
"grad_norm": 0.9017167091369629,
|
|
"learning_rate": 9.547307278865823e-06,
|
|
"loss": 0.22098026275634766,
|
|
"memory(GiB)": 30.53,
|
|
"step": 95,
|
|
"token_acc": 0.9268885204180002,
|
|
"train_speed(iter/s)": 0.137464
|
|
},
|
|
{
|
|
"epoch": 0.43091839482897926,
|
|
"grad_norm": 0.9111009240150452,
|
|
"learning_rate": 9.499233769787534e-06,
|
|
"loss": 0.23211708068847656,
|
|
"memory(GiB)": 30.53,
|
|
"step": 100,
|
|
"token_acc": 0.9252183406113537,
|
|
"train_speed(iter/s)": 0.138704
|
|
},
|
|
{
|
|
"epoch": 0.43091839482897926,
|
|
"eval_loss": 0.24070490896701813,
|
|
"eval_runtime": 9.1064,
|
|
"eval_samples_per_second": 16.472,
|
|
"eval_steps_per_second": 4.173,
|
|
"eval_token_acc": 0.9203897380888532,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.45246431457042824,
|
|
"grad_norm": 0.9242920875549316,
|
|
"learning_rate": 9.448868647065644e-06,
|
|
"loss": 0.24608321189880372,
|
|
"memory(GiB)": 30.53,
|
|
"step": 105,
|
|
"token_acc": 0.9158947454709402,
|
|
"train_speed(iter/s)": 0.136144
|
|
},
|
|
{
|
|
"epoch": 0.47401023431187717,
|
|
"grad_norm": 0.7441526055335999,
|
|
"learning_rate": 9.396237563379761e-06,
|
|
"loss": 0.21680865287780762,
|
|
"memory(GiB)": 30.53,
|
|
"step": 110,
|
|
"token_acc": 0.9275543984829493,
|
|
"train_speed(iter/s)": 0.136882
|
|
},
|
|
{
|
|
"epoch": 0.49555615405332615,
|
|
"grad_norm": 0.8983182311058044,
|
|
"learning_rate": 9.341367325540921e-06,
|
|
"loss": 0.21554439067840575,
|
|
"memory(GiB)": 30.53,
|
|
"step": 115,
|
|
"token_acc": 0.9245874763890374,
|
|
"train_speed(iter/s)": 0.137509
|
|
},
|
|
{
|
|
"epoch": 0.5171020737947751,
|
|
"grad_norm": 0.7459524273872375,
|
|
"learning_rate": 9.284285880837947e-06,
|
|
"loss": 0.20305709838867186,
|
|
"memory(GiB)": 30.53,
|
|
"step": 120,
|
|
"token_acc": 0.9276550249465432,
|
|
"train_speed(iter/s)": 0.138213
|
|
},
|
|
{
|
|
"epoch": 0.5171020737947751,
|
|
"eval_loss": 0.23513969779014587,
|
|
"eval_runtime": 9.1131,
|
|
"eval_samples_per_second": 16.46,
|
|
"eval_steps_per_second": 4.17,
|
|
"eval_token_acc": 0.9224635737910074,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.5386479935362241,
|
|
"grad_norm": 0.6458992958068848,
|
|
"learning_rate": 9.225022302802951e-06,
|
|
"loss": 0.20988364219665528,
|
|
"memory(GiB)": 30.53,
|
|
"step": 125,
|
|
"token_acc": 0.923570767028711,
|
|
"train_speed(iter/s)": 0.135446
|
|
},
|
|
{
|
|
"epoch": 0.5601939132776731,
|
|
"grad_norm": 0.81890469789505,
|
|
"learning_rate": 9.163606776403182e-06,
|
|
"loss": 0.217695951461792,
|
|
"memory(GiB)": 30.53,
|
|
"step": 130,
|
|
"token_acc": 0.9229289737601318,
|
|
"train_speed(iter/s)": 0.13625
|
|
},
|
|
{
|
|
"epoch": 0.581739833019122,
|
|
"grad_norm": 0.9051012992858887,
|
|
"learning_rate": 9.100070582666796e-06,
|
|
"loss": 0.21849727630615234,
|
|
"memory(GiB)": 30.53,
|
|
"step": 135,
|
|
"token_acc": 0.9253172858079698,
|
|
"train_speed(iter/s)": 0.136893
|
|
},
|
|
{
|
|
"epoch": 0.603285752760571,
|
|
"grad_norm": 0.9010105729103088,
|
|
"learning_rate": 9.034446082750352e-06,
|
|
"loss": 0.22707552909851075,
|
|
"memory(GiB)": 30.53,
|
|
"step": 140,
|
|
"token_acc": 0.9273702731546916,
|
|
"train_speed(iter/s)": 0.137345
|
|
},
|
|
{
|
|
"epoch": 0.603285752760571,
|
|
"eval_loss": 0.23331375420093536,
|
|
"eval_runtime": 9.1166,
|
|
"eval_samples_per_second": 16.454,
|
|
"eval_steps_per_second": 4.168,
|
|
"eval_token_acc": 0.9220881380173416,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.6248316725020199,
|
|
"grad_norm": 0.7436938881874084,
|
|
"learning_rate": 8.966766701456177e-06,
|
|
"loss": 0.22292051315307618,
|
|
"memory(GiB)": 30.53,
|
|
"step": 145,
|
|
"token_acc": 0.9230008118530546,
|
|
"train_speed(iter/s)": 0.135251
|
|
},
|
|
{
|
|
"epoch": 0.6463775922434689,
|
|
"grad_norm": 0.7819850444793701,
|
|
"learning_rate": 8.897066910207958e-06,
|
|
"loss": 0.21327242851257325,
|
|
"memory(GiB)": 30.53,
|
|
"step": 150,
|
|
"token_acc": 0.9308567501527184,
|
|
"train_speed(iter/s)": 0.135932
|
|
},
|
|
{
|
|
"epoch": 0.6679235119849178,
|
|
"grad_norm": 0.8297721147537231,
|
|
"learning_rate": 8.825382209493284e-06,
|
|
"loss": 0.2241668701171875,
|
|
"memory(GiB)": 30.53,
|
|
"step": 155,
|
|
"token_acc": 0.924225955715661,
|
|
"train_speed(iter/s)": 0.136464
|
|
},
|
|
{
|
|
"epoch": 0.6894694317263668,
|
|
"grad_norm": 0.8930188417434692,
|
|
"learning_rate": 8.751749110782013e-06,
|
|
"loss": 0.21810550689697267,
|
|
"memory(GiB)": 30.53,
|
|
"step": 160,
|
|
"token_acc": 0.9215799614643545,
|
|
"train_speed(iter/s)": 0.137033
|
|
},
|
|
{
|
|
"epoch": 0.6894694317263668,
|
|
"eval_loss": 0.23200780153274536,
|
|
"eval_runtime": 9.1055,
|
|
"eval_samples_per_second": 16.474,
|
|
"eval_steps_per_second": 4.173,
|
|
"eval_token_acc": 0.9217484580316438,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.7110153514678158,
|
|
"grad_norm": 0.7813264727592468,
|
|
"learning_rate": 8.676205117929752e-06,
|
|
"loss": 0.21541600227355956,
|
|
"memory(GiB)": 30.53,
|
|
"step": 165,
|
|
"token_acc": 0.9275054882711329,
|
|
"train_speed(iter/s)": 0.135327
|
|
},
|
|
{
|
|
"epoch": 0.7325612712092647,
|
|
"grad_norm": 0.8045797348022461,
|
|
"learning_rate": 8.598788708075844e-06,
|
|
"loss": 0.20851638317108154,
|
|
"memory(GiB)": 30.53,
|
|
"step": 170,
|
|
"token_acc": 0.9192872543834669,
|
|
"train_speed(iter/s)": 0.135759
|
|
},
|
|
{
|
|
"epoch": 0.7541071909507137,
|
|
"grad_norm": 0.8655018210411072,
|
|
"learning_rate": 8.51953931204566e-06,
|
|
"loss": 0.20699663162231446,
|
|
"memory(GiB)": 30.53,
|
|
"step": 175,
|
|
"token_acc": 0.9224663747263059,
|
|
"train_speed(iter/s)": 0.136385
|
|
},
|
|
{
|
|
"epoch": 0.7756531106921627,
|
|
"grad_norm": 0.6923186779022217,
|
|
"learning_rate": 8.438497294267117e-06,
|
|
"loss": 0.19253411293029785,
|
|
"memory(GiB)": 30.53,
|
|
"step": 180,
|
|
"token_acc": 0.9316482201615316,
|
|
"train_speed(iter/s)": 0.13683
|
|
},
|
|
{
|
|
"epoch": 0.7756531106921627,
|
|
"eval_loss": 0.22997288405895233,
|
|
"eval_runtime": 9.1153,
|
|
"eval_samples_per_second": 16.456,
|
|
"eval_steps_per_second": 4.169,
|
|
"eval_token_acc": 0.9232859569142755,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.7971990304336116,
|
|
"grad_norm": 0.7510209083557129,
|
|
"learning_rate": 8.3557039322117e-06,
|
|
"loss": 0.2128201961517334,
|
|
"memory(GiB)": 30.53,
|
|
"step": 185,
|
|
"token_acc": 0.9327214400285154,
|
|
"train_speed(iter/s)": 0.1351
|
|
},
|
|
{
|
|
"epoch": 0.8187449501750605,
|
|
"grad_norm": 0.7761275768280029,
|
|
"learning_rate": 8.27120139537044e-06,
|
|
"loss": 0.20115509033203124,
|
|
"memory(GiB)": 30.53,
|
|
"step": 190,
|
|
"token_acc": 0.9393949424069906,
|
|
"train_speed(iter/s)": 0.135548
|
|
},
|
|
{
|
|
"epoch": 0.8402908699165096,
|
|
"grad_norm": 0.8497465252876282,
|
|
"learning_rate": 8.18503272377554e-06,
|
|
"loss": 0.2121565341949463,
|
|
"memory(GiB)": 30.53,
|
|
"step": 195,
|
|
"token_acc": 0.9327115256495669,
|
|
"train_speed(iter/s)": 0.136171
|
|
},
|
|
{
|
|
"epoch": 0.8618367896579585,
|
|
"grad_norm": 0.7839154601097107,
|
|
"learning_rate": 8.097241806078616e-06,
|
|
"loss": 0.21248257160186768,
|
|
"memory(GiB)": 30.53,
|
|
"step": 200,
|
|
"token_acc": 0.9275539014373717,
|
|
"train_speed(iter/s)": 0.136519
|
|
},
|
|
{
|
|
"epoch": 0.8618367896579585,
|
|
"eval_loss": 0.22366267442703247,
|
|
"eval_runtime": 9.0776,
|
|
"eval_samples_per_second": 16.524,
|
|
"eval_steps_per_second": 4.186,
|
|
"eval_token_acc": 0.9240189505676232,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.8833827093994074,
|
|
"grad_norm": 0.8640104532241821,
|
|
"learning_rate": 8.007873357196716e-06,
|
|
"loss": 0.2174234390258789,
|
|
"memory(GiB)": 30.53,
|
|
"step": 205,
|
|
"token_acc": 0.9313344114690327,
|
|
"train_speed(iter/s)": 0.135298
|
|
},
|
|
{
|
|
"epoch": 0.9049286291408565,
|
|
"grad_norm": 0.8847671151161194,
|
|
"learning_rate": 7.916972895537471e-06,
|
|
"loss": 0.21736545562744142,
|
|
"memory(GiB)": 30.53,
|
|
"step": 210,
|
|
"token_acc": 0.9281964485498724,
|
|
"train_speed(iter/s)": 0.135752
|
|
},
|
|
{
|
|
"epoch": 0.9264745488823054,
|
|
"grad_norm": 0.9129384160041809,
|
|
"learning_rate": 7.824586719815019e-06,
|
|
"loss": 0.19911231994628906,
|
|
"memory(GiB)": 30.53,
|
|
"step": 215,
|
|
"token_acc": 0.933074239549544,
|
|
"train_speed(iter/s)": 0.136175
|
|
},
|
|
{
|
|
"epoch": 0.9480204686237543,
|
|
"grad_norm": 0.746377170085907,
|
|
"learning_rate": 7.730761885468486e-06,
|
|
"loss": 0.2088994264602661,
|
|
"memory(GiB)": 30.53,
|
|
"step": 220,
|
|
"token_acc": 0.929672384883271,
|
|
"train_speed(iter/s)": 0.136769
|
|
},
|
|
{
|
|
"epoch": 0.9480204686237543,
|
|
"eval_loss": 0.22297517955303192,
|
|
"eval_runtime": 9.1184,
|
|
"eval_samples_per_second": 16.45,
|
|
"eval_steps_per_second": 4.167,
|
|
"eval_token_acc": 0.9244480200232412,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.9695663883652034,
|
|
"grad_norm": 0.911310076713562,
|
|
"learning_rate": 7.635546180695039e-06,
|
|
"loss": 0.21921830177307128,
|
|
"memory(GiB)": 30.53,
|
|
"step": 225,
|
|
"token_acc": 0.928082019245302,
|
|
"train_speed(iter/s)": 0.135629
|
|
},
|
|
{
|
|
"epoch": 0.9911123081066523,
|
|
"grad_norm": 0.7769207954406738,
|
|
"learning_rate": 7.538988102109728e-06,
|
|
"loss": 0.2161275863647461,
|
|
"memory(GiB)": 30.53,
|
|
"step": 230,
|
|
"token_acc": 0.9270204342784021,
|
|
"train_speed(iter/s)": 0.136241
|
|
},
|
|
{
|
|
"epoch": 1.0086183678965797,
|
|
"grad_norm": 0.6726126670837402,
|
|
"learning_rate": 7.441136830044495e-06,
|
|
"loss": 0.17683182954788207,
|
|
"memory(GiB)": 30.53,
|
|
"step": 235,
|
|
"token_acc": 0.943002946884209,
|
|
"train_speed(iter/s)": 0.136928
|
|
},
|
|
{
|
|
"epoch": 1.0301642876380286,
|
|
"grad_norm": 0.815794050693512,
|
|
"learning_rate": 7.342042203498952e-06,
|
|
"loss": 0.14148125648498536,
|
|
"memory(GiB)": 30.53,
|
|
"step": 240,
|
|
"token_acc": 0.9480684873355031,
|
|
"train_speed(iter/s)": 0.137322
|
|
},
|
|
{
|
|
"epoch": 1.0301642876380286,
|
|
"eval_loss": 0.2275795191526413,
|
|
"eval_runtime": 9.1077,
|
|
"eval_samples_per_second": 16.47,
|
|
"eval_steps_per_second": 4.172,
|
|
"eval_token_acc": 0.9249307231608117,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 1.0517102073794775,
|
|
"grad_norm": 0.8187179565429688,
|
|
"learning_rate": 7.241754694755674e-06,
|
|
"loss": 0.1443116307258606,
|
|
"memory(GiB)": 30.53,
|
|
"step": 245,
|
|
"token_acc": 0.9402755009664554,
|
|
"train_speed(iter/s)": 0.13621
|
|
},
|
|
{
|
|
"epoch": 1.0732561271209264,
|
|
"grad_norm": 0.8635661005973816,
|
|
"learning_rate": 7.140325383672938e-06,
|
|
"loss": 0.1403177261352539,
|
|
"memory(GiB)": 30.53,
|
|
"step": 250,
|
|
"token_acc": 0.9511112435202288,
|
|
"train_speed(iter/s)": 0.136646
|
|
},
|
|
{
|
|
"epoch": 1.0948020468623754,
|
|
"grad_norm": 0.7241800427436829,
|
|
"learning_rate": 7.037805931668006e-06,
|
|
"loss": 0.14431071281433105,
|
|
"memory(GiB)": 30.53,
|
|
"step": 255,
|
|
"token_acc": 0.9472287558048369,
|
|
"train_speed(iter/s)": 0.136936
|
|
},
|
|
{
|
|
"epoch": 1.1163479666038243,
|
|
"grad_norm": 0.7335503101348877,
|
|
"learning_rate": 6.934248555404197e-06,
|
|
"loss": 0.14122509956359863,
|
|
"memory(GiB)": 30.53,
|
|
"step": 260,
|
|
"token_acc": 0.9511055540931824,
|
|
"train_speed(iter/s)": 0.137293
|
|
},
|
|
{
|
|
"epoch": 1.1163479666038243,
|
|
"eval_loss": 0.22707919776439667,
|
|
"eval_runtime": 9.0673,
|
|
"eval_samples_per_second": 16.543,
|
|
"eval_steps_per_second": 4.191,
|
|
"eval_token_acc": 0.9248234557969072,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 1.1378938863452734,
|
|
"grad_norm": 0.8322599530220032,
|
|
"learning_rate": 6.8297060001951545e-06,
|
|
"loss": 0.14616423845291138,
|
|
"memory(GiB)": 30.53,
|
|
"step": 265,
|
|
"token_acc": 0.9405642484589853,
|
|
"train_speed(iter/s)": 0.136437
|
|
},
|
|
{
|
|
"epoch": 1.1594398060867224,
|
|
"grad_norm": 0.746890127658844,
|
|
"learning_rate": 6.724231513139853e-06,
|
|
"loss": 0.13995609283447266,
|
|
"memory(GiB)": 30.53,
|
|
"step": 270,
|
|
"token_acc": 0.9483776303064662,
|
|
"train_speed(iter/s)": 0.1367
|
|
},
|
|
{
|
|
"epoch": 1.1809857258281713,
|
|
"grad_norm": 0.8125350475311279,
|
|
"learning_rate": 6.617878816002032e-06,
|
|
"loss": 0.1372074842453003,
|
|
"memory(GiB)": 30.53,
|
|
"step": 275,
|
|
"token_acc": 0.9589013747076607,
|
|
"train_speed(iter/s)": 0.136989
|
|
},
|
|
{
|
|
"epoch": 1.2025316455696202,
|
|
"grad_norm": 0.7824741005897522,
|
|
"learning_rate": 6.510702077847864e-06,
|
|
"loss": 0.14701566696166993,
|
|
"memory(GiB)": 30.53,
|
|
"step": 280,
|
|
"token_acc": 0.9469290828625913,
|
|
"train_speed(iter/s)": 0.137369
|
|
},
|
|
{
|
|
"epoch": 1.2025316455696202,
|
|
"eval_loss": 0.22802643477916718,
|
|
"eval_runtime": 9.0755,
|
|
"eval_samples_per_second": 16.528,
|
|
"eval_steps_per_second": 4.187,
|
|
"eval_token_acc": 0.9248592115848753,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 1.2240775653110691,
|
|
"grad_norm": 0.6673188209533691,
|
|
"learning_rate": 6.402755887455792e-06,
|
|
"loss": 0.14538809061050414,
|
|
"memory(GiB)": 30.53,
|
|
"step": 285,
|
|
"token_acc": 0.9437563495666083,
|
|
"train_speed(iter/s)": 0.136354
|
|
},
|
|
{
|
|
"epoch": 1.2456234850525183,
|
|
"grad_norm": 0.7520856261253357,
|
|
"learning_rate": 6.294095225512604e-06,
|
|
"loss": 0.14223116636276245,
|
|
"memory(GiB)": 30.53,
|
|
"step": 290,
|
|
"token_acc": 0.9431994362226921,
|
|
"train_speed(iter/s)": 0.136648
|
|
},
|
|
{
|
|
"epoch": 1.2671694047939672,
|
|
"grad_norm": 0.8446849584579468,
|
|
"learning_rate": 6.184775436609885e-06,
|
|
"loss": 0.14606384038925171,
|
|
"memory(GiB)": 30.53,
|
|
"step": 295,
|
|
"token_acc": 0.9505753500623874,
|
|
"train_speed(iter/s)": 0.137043
|
|
},
|
|
{
|
|
"epoch": 1.2887153245354162,
|
|
"grad_norm": 0.9321854710578918,
|
|
"learning_rate": 6.074852201055121e-06,
|
|
"loss": 0.14932063817977906,
|
|
"memory(GiB)": 30.53,
|
|
"step": 300,
|
|
"token_acc": 0.9450343760123826,
|
|
"train_speed(iter/s)": 0.137433
|
|
},
|
|
{
|
|
"epoch": 1.2887153245354162,
|
|
"eval_loss": 0.22745147347450256,
|
|
"eval_runtime": 9.1349,
|
|
"eval_samples_per_second": 16.421,
|
|
"eval_steps_per_second": 4.16,
|
|
"eval_token_acc": 0.924591043175114,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 1.310261244276865,
|
|
"grad_norm": 0.8243806958198547,
|
|
"learning_rate": 5.964381506511823e-06,
|
|
"loss": 0.15078881978988648,
|
|
"memory(GiB)": 30.53,
|
|
"step": 305,
|
|
"token_acc": 0.9428590810419681,
|
|
"train_speed(iter/s)": 0.136632
|
|
},
|
|
{
|
|
"epoch": 1.331807164018314,
|
|
"grad_norm": 0.7823331356048584,
|
|
"learning_rate": 5.853419619483083e-06,
|
|
"loss": 0.14328973293304442,
|
|
"memory(GiB)": 30.53,
|
|
"step": 310,
|
|
"token_acc": 0.9486281555467584,
|
|
"train_speed(iter/s)": 0.136888
|
|
},
|
|
{
|
|
"epoch": 1.353353083759763,
|
|
"grad_norm": 0.7617143392562866,
|
|
"learning_rate": 5.742023056653131e-06,
|
|
"loss": 0.14642927646636963,
|
|
"memory(GiB)": 30.53,
|
|
"step": 315,
|
|
"token_acc": 0.9535409058393886,
|
|
"train_speed(iter/s)": 0.137158
|
|
},
|
|
{
|
|
"epoch": 1.3748990035012119,
|
|
"grad_norm": 0.8405448794364929,
|
|
"learning_rate": 5.630248556101448e-06,
|
|
"loss": 0.14387913942337036,
|
|
"memory(GiB)": 30.53,
|
|
"step": 320,
|
|
"token_acc": 0.9477073920984331,
|
|
"train_speed(iter/s)": 0.137447
|
|
},
|
|
{
|
|
"epoch": 1.3748990035012119,
|
|
"eval_loss": 0.2264009267091751,
|
|
"eval_runtime": 9.1222,
|
|
"eval_samples_per_second": 16.443,
|
|
"eval_steps_per_second": 4.166,
|
|
"eval_token_acc": 0.9254670599803343,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 1.3964449232426608,
|
|
"grad_norm": 0.7621841430664062,
|
|
"learning_rate": 5.51815304840412e-06,
|
|
"loss": 0.13728692531585693,
|
|
"memory(GiB)": 30.53,
|
|
"step": 325,
|
|
"token_acc": 0.9431869420146091,
|
|
"train_speed(iter/s)": 0.136636
|
|
},
|
|
{
|
|
"epoch": 1.41799084298411,
|
|
"grad_norm": 0.7314430475234985,
|
|
"learning_rate": 5.405793627637157e-06,
|
|
"loss": 0.14474726915359498,
|
|
"memory(GiB)": 32.82,
|
|
"step": 330,
|
|
"token_acc": 0.9499411071849234,
|
|
"train_speed(iter/s)": 0.136864
|
|
},
|
|
{
|
|
"epoch": 1.4395367627255589,
|
|
"grad_norm": 0.8704355359077454,
|
|
"learning_rate": 5.293227522296517e-06,
|
|
"loss": 0.14055614471435546,
|
|
"memory(GiB)": 32.82,
|
|
"step": 335,
|
|
"token_acc": 0.9562322700167892,
|
|
"train_speed(iter/s)": 0.13705
|
|
},
|
|
{
|
|
"epoch": 1.4610826824670078,
|
|
"grad_norm": 0.7439765930175781,
|
|
"learning_rate": 5.180512066149682e-06,
|
|
"loss": 0.14476253986358642,
|
|
"memory(GiB)": 32.82,
|
|
"step": 340,
|
|
"token_acc": 0.9437502448196169,
|
|
"train_speed(iter/s)": 0.137336
|
|
},
|
|
{
|
|
"epoch": 1.4610826824670078,
|
|
"eval_loss": 0.22488656640052795,
|
|
"eval_runtime": 9.1135,
|
|
"eval_samples_per_second": 16.459,
|
|
"eval_steps_per_second": 4.17,
|
|
"eval_token_acc": 0.9261464199517296,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 1.4826286022084567,
|
|
"grad_norm": 0.806131899356842,
|
|
"learning_rate": 5.06770466903361e-06,
|
|
"loss": 0.13870317935943605,
|
|
"memory(GiB)": 32.82,
|
|
"step": 345,
|
|
"token_acc": 0.942543247613352,
|
|
"train_speed(iter/s)": 0.136478
|
|
},
|
|
{
|
|
"epoch": 1.5041745219499059,
|
|
"grad_norm": 0.8332167267799377,
|
|
"learning_rate": 4.954862787613937e-06,
|
|
"loss": 0.13994078636169432,
|
|
"memory(GiB)": 32.82,
|
|
"step": 350,
|
|
"token_acc": 0.9441090757701915,
|
|
"train_speed(iter/s)": 0.136722
|
|
},
|
|
{
|
|
"epoch": 1.5257204416913548,
|
|
"grad_norm": 0.8114423751831055,
|
|
"learning_rate": 4.842043896120332e-06,
|
|
"loss": 0.1382569432258606,
|
|
"memory(GiB)": 32.82,
|
|
"step": 355,
|
|
"token_acc": 0.9571762441572724,
|
|
"train_speed(iter/s)": 0.137
|
|
},
|
|
{
|
|
"epoch": 1.5472663614328037,
|
|
"grad_norm": 0.76919025182724,
|
|
"learning_rate": 4.729305457072913e-06,
|
|
"loss": 0.14944992065429688,
|
|
"memory(GiB)": 32.82,
|
|
"step": 360,
|
|
"token_acc": 0.9474090514329827,
|
|
"train_speed(iter/s)": 0.137309
|
|
},
|
|
{
|
|
"epoch": 1.5472663614328037,
|
|
"eval_loss": 0.22209370136260986,
|
|
"eval_runtime": 9.1201,
|
|
"eval_samples_per_second": 16.447,
|
|
"eval_steps_per_second": 4.167,
|
|
"eval_token_acc": 0.9268257799231251,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 1.5688122811742526,
|
|
"grad_norm": 1.0794566869735718,
|
|
"learning_rate": 4.616704892014613e-06,
|
|
"loss": 0.14552514553070067,
|
|
"memory(GiB)": 32.82,
|
|
"step": 365,
|
|
"token_acc": 0.9390662094434187,
|
|
"train_speed(iter/s)": 0.136567
|
|
},
|
|
{
|
|
"epoch": 1.5903582009157016,
|
|
"grad_norm": 0.7622527480125427,
|
|
"learning_rate": 4.504299552264428e-06,
|
|
"loss": 0.134457790851593,
|
|
"memory(GiB)": 32.82,
|
|
"step": 370,
|
|
"token_acc": 0.9490503358128114,
|
|
"train_speed(iter/s)": 0.136729
|
|
},
|
|
{
|
|
"epoch": 1.6119041206571505,
|
|
"grad_norm": 0.8797208666801453,
|
|
"learning_rate": 4.392146689706426e-06,
|
|
"loss": 0.14710538387298583,
|
|
"memory(GiB)": 32.82,
|
|
"step": 375,
|
|
"token_acc": 0.9465866995942847,
|
|
"train_speed(iter/s)": 0.137072
|
|
},
|
|
{
|
|
"epoch": 1.6334500403985994,
|
|
"grad_norm": 0.8271977305412292,
|
|
"learning_rate": 4.280303427629404e-06,
|
|
"loss": 0.1435370683670044,
|
|
"memory(GiB)": 32.82,
|
|
"step": 380,
|
|
"token_acc": 0.9512855685695192,
|
|
"train_speed(iter/s)": 0.137365
|
|
},
|
|
{
|
|
"epoch": 1.6334500403985994,
|
|
"eval_loss": 0.22211501002311707,
|
|
"eval_runtime": 9.1527,
|
|
"eval_samples_per_second": 16.389,
|
|
"eval_steps_per_second": 4.152,
|
|
"eval_token_acc": 0.9266470009832841,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 1.6549959601400483,
|
|
"grad_norm": 0.7813441157341003,
|
|
"learning_rate": 4.168826731632052e-06,
|
|
"loss": 0.13174430131912232,
|
|
"memory(GiB)": 32.82,
|
|
"step": 385,
|
|
"token_acc": 0.9456577563647924,
|
|
"train_speed(iter/s)": 0.136683
|
|
},
|
|
{
|
|
"epoch": 1.6765418798814973,
|
|
"grad_norm": 0.8885687589645386,
|
|
"learning_rate": 4.057773380608411e-06,
|
|
"loss": 0.15410563945770264,
|
|
"memory(GiB)": 32.82,
|
|
"step": 390,
|
|
"token_acc": 0.9529198577680525,
|
|
"train_speed(iter/s)": 0.137045
|
|
},
|
|
{
|
|
"epoch": 1.6980877996229464,
|
|
"grad_norm": 0.8711886405944824,
|
|
"learning_rate": 3.947199937828447e-06,
|
|
"loss": 0.15066791772842408,
|
|
"memory(GiB)": 32.82,
|
|
"step": 395,
|
|
"token_acc": 0.9490923301005364,
|
|
"train_speed(iter/s)": 0.137378
|
|
},
|
|
{
|
|
"epoch": 1.7196337193643954,
|
|
"grad_norm": 0.7740168571472168,
|
|
"learning_rate": 3.8371627221284495e-06,
|
|
"loss": 0.14116008281707765,
|
|
"memory(GiB)": 32.82,
|
|
"step": 400,
|
|
"token_acc": 0.9508052422246854,
|
|
"train_speed(iter/s)": 0.137567
|
|
},
|
|
{
|
|
"epoch": 1.7196337193643954,
|
|
"eval_loss": 0.2199936956167221,
|
|
"eval_runtime": 9.2066,
|
|
"eval_samples_per_second": 16.293,
|
|
"eval_steps_per_second": 4.127,
|
|
"eval_token_acc": 0.9274157504246,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 1.7411796391058443,
|
|
"grad_norm": 0.7679008841514587,
|
|
"learning_rate": 3.727717779225912e-06,
|
|
"loss": 0.14058753252029418,
|
|
"memory(GiB)": 32.82,
|
|
"step": 405,
|
|
"token_acc": 0.944121915820029,
|
|
"train_speed(iter/s)": 0.136891
|
|
},
|
|
{
|
|
"epoch": 1.7627255588472934,
|
|
"grad_norm": 0.8533144593238831,
|
|
"learning_rate": 3.6189208531735354e-06,
|
|
"loss": 0.14932174682617189,
|
|
"memory(GiB)": 32.82,
|
|
"step": 410,
|
|
"token_acc": 0.9472646822204345,
|
|
"train_speed(iter/s)": 0.137222
|
|
},
|
|
{
|
|
"epoch": 1.7842714785887424,
|
|
"grad_norm": 0.6813901662826538,
|
|
"learning_rate": 3.510827357966876e-06,
|
|
"loss": 0.12951855659484862,
|
|
"memory(GiB)": 32.82,
|
|
"step": 415,
|
|
"token_acc": 0.9607936037903465,
|
|
"train_speed(iter/s)": 0.137481
|
|
},
|
|
{
|
|
"epoch": 1.8058173983301913,
|
|
"grad_norm": 0.6819447875022888,
|
|
"learning_rate": 3.403492349320101e-06,
|
|
"loss": 0.12727973461151124,
|
|
"memory(GiB)": 32.82,
|
|
"step": 420,
|
|
"token_acc": 0.9527016395506184,
|
|
"train_speed(iter/s)": 0.13761
|
|
},
|
|
{
|
|
"epoch": 1.8058173983301913,
|
|
"eval_loss": 0.21894590556621552,
|
|
"eval_runtime": 9.2314,
|
|
"eval_samples_per_second": 16.249,
|
|
"eval_steps_per_second": 4.116,
|
|
"eval_token_acc": 0.9276660409403772,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 1.8273633180716402,
|
|
"grad_norm": 0.7320638298988342,
|
|
"learning_rate": 3.29697049662423e-06,
|
|
"loss": 0.12717063426971437,
|
|
"memory(GiB)": 32.82,
|
|
"step": 425,
|
|
"token_acc": 0.948704977741805,
|
|
"train_speed(iter/s)": 0.136847
|
|
},
|
|
{
|
|
"epoch": 1.8489092378130891,
|
|
"grad_norm": 0.779474139213562,
|
|
"learning_rate": 3.191316055102146e-06,
|
|
"loss": 0.13753225803375244,
|
|
"memory(GiB)": 32.82,
|
|
"step": 430,
|
|
"token_acc": 0.9508869722421001,
|
|
"train_speed(iter/s)": 0.137066
|
|
},
|
|
{
|
|
"epoch": 1.870455157554538,
|
|
"grad_norm": 0.8016546368598938,
|
|
"learning_rate": 3.0865828381745515e-06,
|
|
"loss": 0.1339845299720764,
|
|
"memory(GiB)": 32.82,
|
|
"step": 435,
|
|
"token_acc": 0.9550213879844219,
|
|
"train_speed(iter/s)": 0.137237
|
|
},
|
|
{
|
|
"epoch": 1.892001077295987,
|
|
"grad_norm": 0.7220640778541565,
|
|
"learning_rate": 2.982824190050958e-06,
|
|
"loss": 0.14519211053848266,
|
|
"memory(GiB)": 32.82,
|
|
"step": 440,
|
|
"token_acc": 0.9519722211384651,
|
|
"train_speed(iter/s)": 0.137445
|
|
},
|
|
{
|
|
"epoch": 1.892001077295987,
|
|
"eval_loss": 0.21764494478702545,
|
|
"eval_runtime": 9.1305,
|
|
"eval_samples_per_second": 16.428,
|
|
"eval_steps_per_second": 4.162,
|
|
"eval_token_acc": 0.9279342093501386,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 1.913546997037436,
|
|
"grad_norm": 0.696753203868866,
|
|
"learning_rate": 2.8800929585596506e-06,
|
|
"loss": 0.13140536546707154,
|
|
"memory(GiB)": 32.82,
|
|
"step": 445,
|
|
"token_acc": 0.9466040818443505,
|
|
"train_speed(iter/s)": 0.136801
|
|
},
|
|
{
|
|
"epoch": 1.9350929167788848,
|
|
"grad_norm": 0.7188719511032104,
|
|
"learning_rate": 2.778441468230483e-06,
|
|
"loss": 0.1310037851333618,
|
|
"memory(GiB)": 32.82,
|
|
"step": 450,
|
|
"token_acc": 0.9548904329235702,
|
|
"train_speed(iter/s)": 0.136955
|
|
},
|
|
{
|
|
"epoch": 1.956638836520334,
|
|
"grad_norm": 0.777593731880188,
|
|
"learning_rate": 2.6779214936442056e-06,
|
|
"loss": 0.1402130603790283,
|
|
"memory(GiB)": 32.82,
|
|
"step": 455,
|
|
"token_acc": 0.9508758882829285,
|
|
"train_speed(iter/s)": 0.137143
|
|
},
|
|
{
|
|
"epoch": 1.978184756261783,
|
|
"grad_norm": 0.6985222697257996,
|
|
"learning_rate": 2.5785842330619038e-06,
|
|
"loss": 0.13579378128051758,
|
|
"memory(GiB)": 32.82,
|
|
"step": 460,
|
|
"token_acc": 0.949956619816068,
|
|
"train_speed(iter/s)": 0.137296
|
|
},
|
|
{
|
|
"epoch": 1.978184756261783,
|
|
"eval_loss": 0.21736681461334229,
|
|
"eval_runtime": 9.1209,
|
|
"eval_samples_per_second": 16.446,
|
|
"eval_steps_per_second": 4.166,
|
|
"eval_token_acc": 0.9278984535621704,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 1.9997306760032318,
|
|
"grad_norm": 0.7867130637168884,
|
|
"learning_rate": 2.480480282347961e-06,
|
|
"loss": 0.13830192089080812,
|
|
"memory(GiB)": 32.82,
|
|
"step": 465,
|
|
"token_acc": 0.9452121426518384,
|
|
"train_speed(iter/s)": 0.136755
|
|
},
|
|
{
|
|
"epoch": 2.0172367357931593,
|
|
"grad_norm": 0.6745245456695557,
|
|
"learning_rate": 2.383659609199873e-06,
|
|
"loss": 0.11236014366149902,
|
|
"memory(GiB)": 32.82,
|
|
"step": 470,
|
|
"token_acc": 0.9635579777931427,
|
|
"train_speed(iter/s)": 0.137215
|
|
},
|
|
{
|
|
"epoch": 2.0387826555346082,
|
|
"grad_norm": 0.6941452026367188,
|
|
"learning_rate": 2.2881715276979705e-06,
|
|
"loss": 0.09268745183944702,
|
|
"memory(GiB)": 32.82,
|
|
"step": 475,
|
|
"token_acc": 0.9666506095527476,
|
|
"train_speed(iter/s)": 0.13734
|
|
},
|
|
{
|
|
"epoch": 2.060328575276057,
|
|
"grad_norm": 0.7112278342247009,
|
|
"learning_rate": 2.1940646731880887e-06,
|
|
"loss": 0.09282677173614502,
|
|
"memory(GiB)": 32.82,
|
|
"step": 480,
|
|
"token_acc": 0.97280563727167,
|
|
"train_speed(iter/s)": 0.137502
|
|
},
|
|
{
|
|
"epoch": 2.060328575276057,
|
|
"eval_loss": 0.23180951178073883,
|
|
"eval_runtime": 9.1229,
|
|
"eval_samples_per_second": 16.442,
|
|
"eval_steps_per_second": 4.165,
|
|
"eval_token_acc": 0.9271118262268705,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 2.081874495017506,
|
|
"grad_norm": 0.7650997638702393,
|
|
"learning_rate": 2.101386977509907e-06,
|
|
"loss": 0.10469940900802613,
|
|
"memory(GiB)": 32.82,
|
|
"step": 485,
|
|
"token_acc": 0.9529728553554915,
|
|
"train_speed(iter/s)": 0.137
|
|
},
|
|
{
|
|
"epoch": 2.103420414758955,
|
|
"grad_norm": 0.8435229659080505,
|
|
"learning_rate": 2.010185644583641e-06,
|
|
"loss": 0.0978783905506134,
|
|
"memory(GiB)": 32.82,
|
|
"step": 490,
|
|
"token_acc": 0.9673804425410422,
|
|
"train_speed(iter/s)": 0.137178
|
|
},
|
|
{
|
|
"epoch": 2.124966334500404,
|
|
"grad_norm": 0.7170566916465759,
|
|
"learning_rate": 1.920507126367448e-06,
|
|
"loss": 0.0900570273399353,
|
|
"memory(GiB)": 32.82,
|
|
"step": 495,
|
|
"token_acc": 0.9669074241266755,
|
|
"train_speed(iter/s)": 0.137328
|
|
},
|
|
{
|
|
"epoch": 2.146512254241853,
|
|
"grad_norm": 0.7567837834358215,
|
|
"learning_rate": 1.8323970991978823e-06,
|
|
"loss": 0.08487753868103028,
|
|
"memory(GiB)": 32.82,
|
|
"step": 500,
|
|
"token_acc": 0.9661423064902595,
|
|
"train_speed(iter/s)": 0.137435
|
|
},
|
|
{
|
|
"epoch": 2.146512254241853,
|
|
"eval_loss": 0.23648440837860107,
|
|
"eval_runtime": 9.1362,
|
|
"eval_samples_per_second": 16.418,
|
|
"eval_steps_per_second": 4.159,
|
|
"eval_token_acc": 0.9269151693930455,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 2.168058173983302,
|
|
"grad_norm": 0.679600715637207,
|
|
"learning_rate": 1.7459004405253544e-06,
|
|
"loss": 0.08969470262527465,
|
|
"memory(GiB)": 32.82,
|
|
"step": 505,
|
|
"token_acc": 0.9617286751361162,
|
|
"train_speed(iter/s)": 0.136847
|
|
},
|
|
{
|
|
"epoch": 2.1896040937247507,
|
|
"grad_norm": 0.6843417882919312,
|
|
"learning_rate": 1.6610612060565235e-06,
|
|
"loss": 0.08585838079452515,
|
|
"memory(GiB)": 32.82,
|
|
"step": 510,
|
|
"token_acc": 0.9688960464822534,
|
|
"train_speed(iter/s)": 0.137034
|
|
},
|
|
{
|
|
"epoch": 2.2111500134661997,
|
|
"grad_norm": 0.7679196000099182,
|
|
"learning_rate": 1.5779226073152071e-06,
|
|
"loss": 0.09446129202842712,
|
|
"memory(GiB)": 32.82,
|
|
"step": 515,
|
|
"token_acc": 0.9684342952994548,
|
|
"train_speed(iter/s)": 0.137304
|
|
},
|
|
{
|
|
"epoch": 2.2326959332076486,
|
|
"grad_norm": 0.7014828324317932,
|
|
"learning_rate": 1.4965269896332884e-06,
|
|
"loss": 0.09327901601791382,
|
|
"memory(GiB)": 32.82,
|
|
"step": 520,
|
|
"token_acc": 0.969137028942314,
|
|
"train_speed(iter/s)": 0.137493
|
|
},
|
|
{
|
|
"epoch": 2.2326959332076486,
|
|
"eval_loss": 0.23686107993125916,
|
|
"eval_runtime": 9.0931,
|
|
"eval_samples_per_second": 16.496,
|
|
"eval_steps_per_second": 4.179,
|
|
"eval_token_acc": 0.9274157504246,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 2.254241852949098,
|
|
"grad_norm": 0.8045688271522522,
|
|
"learning_rate": 1.4169158105827768e-06,
|
|
"loss": 0.09479656219482421,
|
|
"memory(GiB)": 32.82,
|
|
"step": 525,
|
|
"token_acc": 0.95495518631706,
|
|
"train_speed(iter/s)": 0.137017
|
|
},
|
|
{
|
|
"epoch": 2.275787772690547,
|
|
"grad_norm": 0.6673620939254761,
|
|
"learning_rate": 1.3391296188600594e-06,
|
|
"loss": 0.09322519898414612,
|
|
"memory(GiB)": 32.82,
|
|
"step": 530,
|
|
"token_acc": 0.9673923560716013,
|
|
"train_speed(iter/s)": 0.137133
|
|
},
|
|
{
|
|
"epoch": 2.297333692431996,
|
|
"grad_norm": 0.7161964774131775,
|
|
"learning_rate": 1.2632080336330532e-06,
|
|
"loss": 0.10077807903289795,
|
|
"memory(GiB)": 32.82,
|
|
"step": 535,
|
|
"token_acc": 0.9664416709118696,
|
|
"train_speed(iter/s)": 0.137298
|
|
},
|
|
{
|
|
"epoch": 2.3188796121734447,
|
|
"grad_norm": 0.6956729888916016,
|
|
"learning_rate": 1.1891897243618184e-06,
|
|
"loss": 0.09205458760261535,
|
|
"memory(GiB)": 32.82,
|
|
"step": 540,
|
|
"token_acc": 0.9661544035506174,
|
|
"train_speed(iter/s)": 0.137456
|
|
},
|
|
{
|
|
"epoch": 2.3188796121734447,
|
|
"eval_loss": 0.23653477430343628,
|
|
"eval_runtime": 9.1444,
|
|
"eval_samples_per_second": 16.404,
|
|
"eval_steps_per_second": 4.156,
|
|
"eval_token_acc": 0.9273621167426477,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 2.3404255319148937,
|
|
"grad_norm": 0.7258738279342651,
|
|
"learning_rate": 1.1171123911028692e-06,
|
|
"loss": 0.08955551385879516,
|
|
"memory(GiB)": 32.82,
|
|
"step": 545,
|
|
"token_acc": 0.9569588438579956,
|
|
"train_speed(iter/s)": 0.13699
|
|
},
|
|
{
|
|
"epoch": 2.3619714516563426,
|
|
"grad_norm": 0.8112572431564331,
|
|
"learning_rate": 1.047012745307255e-06,
|
|
"loss": 0.08942890167236328,
|
|
"memory(GiB)": 32.82,
|
|
"step": 550,
|
|
"token_acc": 0.9669670722726794,
|
|
"train_speed(iter/s)": 0.13713
|
|
},
|
|
{
|
|
"epoch": 2.3835173713977915,
|
|
"grad_norm": 0.7578905820846558,
|
|
"learning_rate": 9.789264911221546e-07,
|
|
"loss": 0.0977406620979309,
|
|
"memory(GiB)": 32.82,
|
|
"step": 555,
|
|
"token_acc": 0.9592447966031011,
|
|
"train_speed(iter/s)": 0.137349
|
|
},
|
|
{
|
|
"epoch": 2.4050632911392404,
|
|
"grad_norm": 0.6269740462303162,
|
|
"learning_rate": 9.128883072055411e-07,
|
|
"loss": 0.09372045993804931,
|
|
"memory(GiB)": 32.82,
|
|
"step": 560,
|
|
"token_acc": 0.9644342269853612,
|
|
"train_speed(iter/s)": 0.137606
|
|
},
|
|
{
|
|
"epoch": 2.4050632911392404,
|
|
"eval_loss": 0.23624224960803986,
|
|
"eval_runtime": 9.1143,
|
|
"eval_samples_per_second": 16.458,
|
|
"eval_steps_per_second": 4.169,
|
|
"eval_token_acc": 0.92780906409225,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 2.4266092108806894,
|
|
"grad_norm": 0.7092756032943726,
|
|
"learning_rate": 8.489318290631454e-07,
|
|
"loss": 0.09312183260917664,
|
|
"memory(GiB)": 32.82,
|
|
"step": 565,
|
|
"token_acc": 0.9584454627003176,
|
|
"train_speed(iter/s)": 0.137151
|
|
},
|
|
{
|
|
"epoch": 2.4481551306221383,
|
|
"grad_norm": 0.6616098880767822,
|
|
"learning_rate": 7.870896319167548e-07,
|
|
"loss": 0.0871224045753479,
|
|
"memory(GiB)": 32.82,
|
|
"step": 570,
|
|
"token_acc": 0.9698027314112291,
|
|
"train_speed(iter/s)": 0.137275
|
|
},
|
|
{
|
|
"epoch": 2.4697010503635872,
|
|
"grad_norm": 0.7446200251579285,
|
|
"learning_rate": 7.273932141125256e-07,
|
|
"loss": 0.09404770135879517,
|
|
"memory(GiB)": 32.82,
|
|
"step": 575,
|
|
"token_acc": 0.9641330382318164,
|
|
"train_speed(iter/s)": 0.137433
|
|
},
|
|
{
|
|
"epoch": 2.4912469701050366,
|
|
"grad_norm": 0.7769956588745117,
|
|
"learning_rate": 6.698729810778065e-07,
|
|
"loss": 0.09552533030509949,
|
|
"memory(GiB)": 32.82,
|
|
"step": 580,
|
|
"token_acc": 0.9669471799462847,
|
|
"train_speed(iter/s)": 0.137553
|
|
},
|
|
{
|
|
"epoch": 2.4912469701050366,
|
|
"eval_loss": 0.2381029576063156,
|
|
"eval_runtime": 9.122,
|
|
"eval_samples_per_second": 16.444,
|
|
"eval_steps_per_second": 4.166,
|
|
"eval_token_acc": 0.9273084830606955,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 2.5127928898464855,
|
|
"grad_norm": 0.6479668021202087,
|
|
"learning_rate": 6.145582298346153e-07,
|
|
"loss": 0.08736968636512757,
|
|
"memory(GiB)": 32.82,
|
|
"step": 585,
|
|
"token_acc": 0.955453039981074,
|
|
"train_speed(iter/s)": 0.137018
|
|
},
|
|
{
|
|
"epoch": 2.5343388095879344,
|
|
"grad_norm": 0.7301574349403381,
|
|
"learning_rate": 5.614771340776559e-07,
|
|
"loss": 0.08440666198730469,
|
|
"memory(GiB)": 32.82,
|
|
"step": 590,
|
|
"token_acc": 0.9689230967409507,
|
|
"train_speed(iter/s)": 0.13714
|
|
},
|
|
{
|
|
"epoch": 2.5558847293293834,
|
|
"grad_norm": 0.850368082523346,
|
|
"learning_rate": 5.106567298245008e-07,
|
|
"loss": 0.09577755331993103,
|
|
"memory(GiB)": 32.82,
|
|
"step": 595,
|
|
"token_acc": 0.9682460066363145,
|
|
"train_speed(iter/s)": 0.137251
|
|
},
|
|
{
|
|
"epoch": 2.5774306490708323,
|
|
"grad_norm": 0.7679170966148376,
|
|
"learning_rate": 4.6212290164521554e-07,
|
|
"loss": 0.0867478609085083,
|
|
"memory(GiB)": 32.82,
|
|
"step": 600,
|
|
"token_acc": 0.9678065479442077,
|
|
"train_speed(iter/s)": 0.137365
|
|
},
|
|
{
|
|
"epoch": 2.5774306490708323,
|
|
"eval_loss": 0.23731745779514313,
|
|
"eval_runtime": 9.1263,
|
|
"eval_samples_per_second": 16.436,
|
|
"eval_steps_per_second": 4.164,
|
|
"eval_token_acc": 0.9279342093501386,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 2.5989765688122812,
|
|
"grad_norm": 0.7298761010169983,
|
|
"learning_rate": 4.159003694784647e-07,
|
|
"loss": 0.08937226533889771,
|
|
"memory(GiB)": 32.82,
|
|
"step": 605,
|
|
"token_acc": 0.9554988592596227,
|
|
"train_speed(iter/s)": 0.136957
|
|
},
|
|
{
|
|
"epoch": 2.62052248855373,
|
|
"grad_norm": 0.6361852288246155,
|
|
"learning_rate": 3.7201267604080436e-07,
|
|
"loss": 0.0883003294467926,
|
|
"memory(GiB)": 32.82,
|
|
"step": 610,
|
|
"token_acc": 0.9694297662657633,
|
|
"train_speed(iter/s)": 0.137058
|
|
},
|
|
{
|
|
"epoch": 2.642068408295179,
|
|
"grad_norm": 0.6873012781143188,
|
|
"learning_rate": 3.3048217483556743e-07,
|
|
"loss": 0.08733320236206055,
|
|
"memory(GiB)": 32.82,
|
|
"step": 615,
|
|
"token_acc": 0.9654447816950735,
|
|
"train_speed(iter/s)": 0.137163
|
|
},
|
|
{
|
|
"epoch": 2.663614328036628,
|
|
"grad_norm": 0.819065511226654,
|
|
"learning_rate": 2.9133001876746004e-07,
|
|
"loss": 0.09572435617446899,
|
|
"memory(GiB)": 32.82,
|
|
"step": 620,
|
|
"token_acc": 0.9705064194008559,
|
|
"train_speed(iter/s)": 0.137301
|
|
},
|
|
{
|
|
"epoch": 2.663614328036628,
|
|
"eval_loss": 0.23720206320285797,
|
|
"eval_runtime": 9.1052,
|
|
"eval_samples_per_second": 16.474,
|
|
"eval_steps_per_second": 4.173,
|
|
"eval_token_acc": 0.9275945293644409,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 2.685160247778077,
|
|
"grad_norm": 0.6891322731971741,
|
|
"learning_rate": 2.545761493686666e-07,
|
|
"loss": 0.0880233645439148,
|
|
"memory(GiB)": 32.82,
|
|
"step": 625,
|
|
"token_acc": 0.953374825625067,
|
|
"train_speed(iter/s)": 0.136783
|
|
},
|
|
{
|
|
"epoch": 2.706706167519526,
|
|
"grad_norm": 0.8032283782958984,
|
|
"learning_rate": 2.2023928664194229e-07,
|
|
"loss": 0.08428794145584106,
|
|
"memory(GiB)": 32.82,
|
|
"step": 630,
|
|
"token_acc": 0.9709327045726875,
|
|
"train_speed(iter/s)": 0.136943
|
|
},
|
|
{
|
|
"epoch": 2.728252087260975,
|
|
"grad_norm": 0.7355332970619202,
|
|
"learning_rate": 1.8833691952587829e-07,
|
|
"loss": 0.09050858616828919,
|
|
"memory(GiB)": 32.82,
|
|
"step": 635,
|
|
"token_acc": 0.965556864209589,
|
|
"train_speed(iter/s)": 0.13705
|
|
},
|
|
{
|
|
"epoch": 2.7497980070024237,
|
|
"grad_norm": 0.6962365508079529,
|
|
"learning_rate": 1.5888529698718347e-07,
|
|
"loss": 0.08718093633651733,
|
|
"memory(GiB)": 32.82,
|
|
"step": 640,
|
|
"token_acc": 0.9714373207872832,
|
|
"train_speed(iter/s)": 0.137146
|
|
},
|
|
{
|
|
"epoch": 2.7497980070024237,
|
|
"eval_loss": 0.23689626157283783,
|
|
"eval_runtime": 9.1808,
|
|
"eval_samples_per_second": 16.338,
|
|
"eval_steps_per_second": 4.139,
|
|
"eval_token_acc": 0.9280593546080271,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 2.7713439267438726,
|
|
"grad_norm": 0.6887531280517578,
|
|
"learning_rate": 1.3189941974453502e-07,
|
|
"loss": 0.09717867970466613,
|
|
"memory(GiB)": 32.82,
|
|
"step": 645,
|
|
"token_acc": 0.9555589965933725,
|
|
"train_speed(iter/s)": 0.136719
|
|
},
|
|
{
|
|
"epoch": 2.7928898464853216,
|
|
"grad_norm": 0.7766687273979187,
|
|
"learning_rate": 1.0739303262819301e-07,
|
|
"loss": 0.09383597373962402,
|
|
"memory(GiB)": 32.82,
|
|
"step": 650,
|
|
"token_acc": 0.9672413793103448,
|
|
"train_speed(iter/s)": 0.136821
|
|
},
|
|
{
|
|
"epoch": 2.814435766226771,
|
|
"grad_norm": 0.6897131204605103,
|
|
"learning_rate": 8.537861757929422e-08,
|
|
"loss": 0.09179171323776245,
|
|
"memory(GiB)": 32.82,
|
|
"step": 655,
|
|
"token_acc": 0.9684824536645028,
|
|
"train_speed(iter/s)": 0.136983
|
|
},
|
|
{
|
|
"epoch": 2.83598168596822,
|
|
"grad_norm": 0.7220735549926758,
|
|
"learning_rate": 6.58673872923693e-08,
|
|
"loss": 0.08836306929588318,
|
|
"memory(GiB)": 35.18,
|
|
"step": 660,
|
|
"token_acc": 0.9739326289291511,
|
|
"train_speed(iter/s)": 0.137137
|
|
},
|
|
{
|
|
"epoch": 2.83598168596822,
|
|
"eval_loss": 0.23682241141796112,
|
|
"eval_runtime": 9.1257,
|
|
"eval_samples_per_second": 16.437,
|
|
"eval_steps_per_second": 4.164,
|
|
"eval_token_acc": 0.9278805756681863,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 2.857527605709669,
|
|
"grad_norm": 0.7643243670463562,
|
|
"learning_rate": 4.88692795043344e-08,
|
|
"loss": 0.09028244614601136,
|
|
"memory(GiB)": 35.18,
|
|
"step": 665,
|
|
"token_acc": 0.9558236887466142,
|
|
"train_speed(iter/s)": 0.13671
|
|
},
|
|
{
|
|
"epoch": 2.8790735254511177,
|
|
"grad_norm": 0.8298976421356201,
|
|
"learning_rate": 3.439295193286174e-08,
|
|
"loss": 0.0949346661567688,
|
|
"memory(GiB)": 35.18,
|
|
"step": 670,
|
|
"token_acc": 0.9662216181643748,
|
|
"train_speed(iter/s)": 0.136882
|
|
},
|
|
{
|
|
"epoch": 2.9006194451925666,
|
|
"grad_norm": 0.7280375361442566,
|
|
"learning_rate": 2.2445777866709208e-08,
|
|
"loss": 0.08288905620574952,
|
|
"memory(GiB)": 35.18,
|
|
"step": 675,
|
|
"token_acc": 0.9704877806147827,
|
|
"train_speed(iter/s)": 0.136978
|
|
},
|
|
{
|
|
"epoch": 2.9221653649340156,
|
|
"grad_norm": 0.7563872337341309,
|
|
"learning_rate": 1.3033842410251074e-08,
|
|
"loss": 0.0895566999912262,
|
|
"memory(GiB)": 35.18,
|
|
"step": 680,
|
|
"token_acc": 0.9717285366203239,
|
|
"train_speed(iter/s)": 0.137088
|
|
},
|
|
{
|
|
"epoch": 2.9221653649340156,
|
|
"eval_loss": 0.23678408563137054,
|
|
"eval_runtime": 9.1203,
|
|
"eval_samples_per_second": 16.447,
|
|
"eval_steps_per_second": 4.167,
|
|
"eval_token_acc": 0.928023598820059,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 2.9437112846754645,
|
|
"grad_norm": 0.7505417466163635,
|
|
"learning_rate": 6.16193938412557e-09,
|
|
"loss": 0.09423564076423645,
|
|
"memory(GiB)": 35.18,
|
|
"step": 685,
|
|
"token_acc": 0.9571300622110334,
|
|
"train_speed(iter/s)": 0.136664
|
|
},
|
|
{
|
|
"epoch": 2.9652572044169134,
|
|
"grad_norm": 0.7332241535186768,
|
|
"learning_rate": 1.8335688835802169e-09,
|
|
"loss": 0.08703168034553528,
|
|
"memory(GiB)": 35.18,
|
|
"step": 690,
|
|
"token_acc": 0.9672596800717596,
|
|
"train_speed(iter/s)": 0.1368
|
|
},
|
|
{
|
|
"epoch": 2.9868031241583624,
|
|
"grad_norm": 0.7780314087867737,
|
|
"learning_rate": 5.093549575119205e-11,
|
|
"loss": 0.08567211627960206,
|
|
"memory(GiB)": 35.18,
|
|
"step": 695,
|
|
"token_acc": 0.9732243229432805,
|
|
"train_speed(iter/s)": 0.136919
|
|
},
|
|
{
|
|
"epoch": 2.9911123081066524,
|
|
"eval_loss": 0.23683081567287445,
|
|
"eval_runtime": 9.116,
|
|
"eval_samples_per_second": 16.455,
|
|
"eval_steps_per_second": 4.168,
|
|
"eval_token_acc": 0.9277911861982658,
|
|
"step": 696
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 696,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 7.596747044568433e+17,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|