26585 lines
694 KiB
JSON
26585 lines
694 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 3.0,
|
|
"eval_steps": 500,
|
|
"global_step": 2949,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.001017293997965412,
|
|
"grad_norm": 31.607948303222656,
|
|
"learning_rate": 0.0,
|
|
"loss": 1.2814,
|
|
"mean_token_accuracy": 0.7476553320884705,
|
|
"num_tokens": 314183.0,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.002034587995930824,
|
|
"grad_norm": 31.262537002563477,
|
|
"learning_rate": 3.389830508474576e-09,
|
|
"loss": 1.3075,
|
|
"mean_token_accuracy": 0.740410327911377,
|
|
"num_tokens": 627227.0,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.003051881993896236,
|
|
"grad_norm": 31.910720825195312,
|
|
"learning_rate": 6.779661016949152e-09,
|
|
"loss": 1.3239,
|
|
"mean_token_accuracy": 0.7391442656517029,
|
|
"num_tokens": 926658.0,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.004069175991861648,
|
|
"grad_norm": 31.123292922973633,
|
|
"learning_rate": 1.0169491525423728e-08,
|
|
"loss": 1.3076,
|
|
"mean_token_accuracy": 0.7411203384399414,
|
|
"num_tokens": 1253116.0,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.00508646998982706,
|
|
"grad_norm": 30.289949417114258,
|
|
"learning_rate": 1.3559322033898304e-08,
|
|
"loss": 1.2796,
|
|
"mean_token_accuracy": 0.7462595105171204,
|
|
"num_tokens": 1582136.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.006103763987792472,
|
|
"grad_norm": 31.040441513061523,
|
|
"learning_rate": 1.6949152542372882e-08,
|
|
"loss": 1.295,
|
|
"mean_token_accuracy": 0.7442319989204407,
|
|
"num_tokens": 1899325.0,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.007121057985757884,
|
|
"grad_norm": 30.435606002807617,
|
|
"learning_rate": 2.0338983050847456e-08,
|
|
"loss": 1.2849,
|
|
"mean_token_accuracy": 0.7433764338493347,
|
|
"num_tokens": 2225992.0,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.008138351983723296,
|
|
"grad_norm": 30.160449981689453,
|
|
"learning_rate": 2.3728813559322034e-08,
|
|
"loss": 1.281,
|
|
"mean_token_accuracy": 0.7444077730178833,
|
|
"num_tokens": 2564876.0,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.009155645981688708,
|
|
"grad_norm": 31.000755310058594,
|
|
"learning_rate": 2.7118644067796608e-08,
|
|
"loss": 1.2994,
|
|
"mean_token_accuracy": 0.7422164678573608,
|
|
"num_tokens": 2885094.0,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.01017293997965412,
|
|
"grad_norm": 32.12958908081055,
|
|
"learning_rate": 3.0508474576271186e-08,
|
|
"loss": 1.2746,
|
|
"mean_token_accuracy": 0.7504814863204956,
|
|
"num_tokens": 3200547.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.011190233977619531,
|
|
"grad_norm": 31.04578399658203,
|
|
"learning_rate": 3.3898305084745764e-08,
|
|
"loss": 1.3196,
|
|
"mean_token_accuracy": 0.7372593879699707,
|
|
"num_tokens": 3522299.0,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.012207527975584944,
|
|
"grad_norm": 30.585004806518555,
|
|
"learning_rate": 3.728813559322034e-08,
|
|
"loss": 1.3012,
|
|
"mean_token_accuracy": 0.7397406101226807,
|
|
"num_tokens": 3843052.0,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.013224821973550356,
|
|
"grad_norm": 30.747228622436523,
|
|
"learning_rate": 4.067796610169491e-08,
|
|
"loss": 1.2812,
|
|
"mean_token_accuracy": 0.745479941368103,
|
|
"num_tokens": 4163923.0,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.014242115971515769,
|
|
"grad_norm": 29.38701629638672,
|
|
"learning_rate": 4.406779661016949e-08,
|
|
"loss": 1.2879,
|
|
"mean_token_accuracy": 0.7403217554092407,
|
|
"num_tokens": 4501785.0,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.015259409969481181,
|
|
"grad_norm": 30.99683380126953,
|
|
"learning_rate": 4.745762711864407e-08,
|
|
"loss": 1.278,
|
|
"mean_token_accuracy": 0.7468803524971008,
|
|
"num_tokens": 4826248.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.01627670396744659,
|
|
"grad_norm": 30.700899124145508,
|
|
"learning_rate": 5.0847457627118645e-08,
|
|
"loss": 1.2839,
|
|
"mean_token_accuracy": 0.7428768873214722,
|
|
"num_tokens": 5159429.0,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.017293997965412006,
|
|
"grad_norm": 31.73674201965332,
|
|
"learning_rate": 5.4237288135593217e-08,
|
|
"loss": 1.3102,
|
|
"mean_token_accuracy": 0.7424096465110779,
|
|
"num_tokens": 5470920.0,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.018311291963377416,
|
|
"grad_norm": 31.500507354736328,
|
|
"learning_rate": 5.7627118644067794e-08,
|
|
"loss": 1.3163,
|
|
"mean_token_accuracy": 0.7393807172775269,
|
|
"num_tokens": 5783722.0,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.019328585961342827,
|
|
"grad_norm": 30.2214412689209,
|
|
"learning_rate": 6.101694915254237e-08,
|
|
"loss": 1.2991,
|
|
"mean_token_accuracy": 0.7394208312034607,
|
|
"num_tokens": 6105483.0,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.02034587995930824,
|
|
"grad_norm": 30.756328582763672,
|
|
"learning_rate": 6.440677966101695e-08,
|
|
"loss": 1.3097,
|
|
"mean_token_accuracy": 0.7406641840934753,
|
|
"num_tokens": 6423575.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.021363173957273652,
|
|
"grad_norm": 31.446762084960938,
|
|
"learning_rate": 6.779661016949153e-08,
|
|
"loss": 1.3054,
|
|
"mean_token_accuracy": 0.7429530024528503,
|
|
"num_tokens": 6740933.0,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.022380467955239063,
|
|
"grad_norm": 31.185230255126953,
|
|
"learning_rate": 7.11864406779661e-08,
|
|
"loss": 1.3002,
|
|
"mean_token_accuracy": 0.7414672374725342,
|
|
"num_tokens": 7053795.0,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.023397761953204477,
|
|
"grad_norm": 30.35567855834961,
|
|
"learning_rate": 7.457627118644068e-08,
|
|
"loss": 1.3191,
|
|
"mean_token_accuracy": 0.7355214357376099,
|
|
"num_tokens": 7375236.0,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.024415055951169887,
|
|
"grad_norm": 30.93867301940918,
|
|
"learning_rate": 7.796610169491526e-08,
|
|
"loss": 1.3039,
|
|
"mean_token_accuracy": 0.739065945148468,
|
|
"num_tokens": 7693804.0,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.0254323499491353,
|
|
"grad_norm": 29.841861724853516,
|
|
"learning_rate": 8.135593220338982e-08,
|
|
"loss": 1.2686,
|
|
"mean_token_accuracy": 0.7462300062179565,
|
|
"num_tokens": 8024175.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.026449643947100712,
|
|
"grad_norm": 30.483552932739258,
|
|
"learning_rate": 8.47457627118644e-08,
|
|
"loss": 1.2886,
|
|
"mean_token_accuracy": 0.7437601089477539,
|
|
"num_tokens": 8351946.0,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.027466937945066123,
|
|
"grad_norm": 30.007417678833008,
|
|
"learning_rate": 8.813559322033898e-08,
|
|
"loss": 1.2807,
|
|
"mean_token_accuracy": 0.7446364164352417,
|
|
"num_tokens": 8661017.0,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.028484231943031537,
|
|
"grad_norm": 30.700592041015625,
|
|
"learning_rate": 9.152542372881356e-08,
|
|
"loss": 1.2904,
|
|
"mean_token_accuracy": 0.7436801195144653,
|
|
"num_tokens": 8975390.0,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.029501525940996948,
|
|
"grad_norm": 31.300748825073242,
|
|
"learning_rate": 9.491525423728814e-08,
|
|
"loss": 1.3052,
|
|
"mean_token_accuracy": 0.7431154847145081,
|
|
"num_tokens": 9280736.0,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.030518819938962362,
|
|
"grad_norm": 30.187368392944336,
|
|
"learning_rate": 9.830508474576271e-08,
|
|
"loss": 1.2955,
|
|
"mean_token_accuracy": 0.7419048547744751,
|
|
"num_tokens": 9587259.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.03153611393692777,
|
|
"grad_norm": 30.064258575439453,
|
|
"learning_rate": 1.0169491525423729e-07,
|
|
"loss": 1.2773,
|
|
"mean_token_accuracy": 0.7429222464561462,
|
|
"num_tokens": 9903156.0,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.03255340793489318,
|
|
"grad_norm": 30.224626541137695,
|
|
"learning_rate": 1.0508474576271186e-07,
|
|
"loss": 1.2707,
|
|
"mean_token_accuracy": 0.7471416592597961,
|
|
"num_tokens": 10214147.0,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.0335707019328586,
|
|
"grad_norm": 30.359447479248047,
|
|
"learning_rate": 1.0847457627118643e-07,
|
|
"loss": 1.2772,
|
|
"mean_token_accuracy": 0.7445785403251648,
|
|
"num_tokens": 10528001.0,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.03458799593082401,
|
|
"grad_norm": 28.968889236450195,
|
|
"learning_rate": 1.1186440677966101e-07,
|
|
"loss": 1.3036,
|
|
"mean_token_accuracy": 0.7359491586685181,
|
|
"num_tokens": 10852213.0,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.03560528992878942,
|
|
"grad_norm": 28.165321350097656,
|
|
"learning_rate": 1.1525423728813559e-07,
|
|
"loss": 1.2836,
|
|
"mean_token_accuracy": 0.7375534176826477,
|
|
"num_tokens": 11178241.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.03662258392675483,
|
|
"grad_norm": 28.93723487854004,
|
|
"learning_rate": 1.1864406779661017e-07,
|
|
"loss": 1.262,
|
|
"mean_token_accuracy": 0.745404839515686,
|
|
"num_tokens": 11489548.0,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.03763987792472025,
|
|
"grad_norm": 27.576478958129883,
|
|
"learning_rate": 1.2203389830508474e-07,
|
|
"loss": 1.2396,
|
|
"mean_token_accuracy": 0.745377242565155,
|
|
"num_tokens": 11818390.0,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.038657171922685654,
|
|
"grad_norm": 27.26495933532715,
|
|
"learning_rate": 1.254237288135593e-07,
|
|
"loss": 1.276,
|
|
"mean_token_accuracy": 0.7369311451911926,
|
|
"num_tokens": 12131732.0,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.03967446592065107,
|
|
"grad_norm": 28.351390838623047,
|
|
"learning_rate": 1.288135593220339e-07,
|
|
"loss": 1.2852,
|
|
"mean_token_accuracy": 0.7390662431716919,
|
|
"num_tokens": 12436906.0,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.04069175991861648,
|
|
"grad_norm": 27.946863174438477,
|
|
"learning_rate": 1.3220338983050846e-07,
|
|
"loss": 1.2786,
|
|
"mean_token_accuracy": 0.7396658658981323,
|
|
"num_tokens": 12747689.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.04170905391658189,
|
|
"grad_norm": 27.03894805908203,
|
|
"learning_rate": 1.3559322033898305e-07,
|
|
"loss": 1.2435,
|
|
"mean_token_accuracy": 0.7437921762466431,
|
|
"num_tokens": 13081387.0,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.042726347914547304,
|
|
"grad_norm": 26.022869110107422,
|
|
"learning_rate": 1.3898305084745762e-07,
|
|
"loss": 1.2492,
|
|
"mean_token_accuracy": 0.7406527996063232,
|
|
"num_tokens": 13408622.0,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.04374364191251272,
|
|
"grad_norm": 28.38178253173828,
|
|
"learning_rate": 1.423728813559322e-07,
|
|
"loss": 1.2827,
|
|
"mean_token_accuracy": 0.7385092973709106,
|
|
"num_tokens": 13706016.0,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.044760935910478125,
|
|
"grad_norm": 27.15532112121582,
|
|
"learning_rate": 1.4576271186440677e-07,
|
|
"loss": 1.2311,
|
|
"mean_token_accuracy": 0.7478699684143066,
|
|
"num_tokens": 14029427.0,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.04577822990844354,
|
|
"grad_norm": 26.92449378967285,
|
|
"learning_rate": 1.4915254237288137e-07,
|
|
"loss": 1.2324,
|
|
"mean_token_accuracy": 0.7458136677742004,
|
|
"num_tokens": 14356828.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.04679552390640895,
|
|
"grad_norm": 27.326919555664062,
|
|
"learning_rate": 1.5254237288135593e-07,
|
|
"loss": 1.234,
|
|
"mean_token_accuracy": 0.7476122975349426,
|
|
"num_tokens": 14675485.0,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.04781281790437437,
|
|
"grad_norm": 28.272430419921875,
|
|
"learning_rate": 1.5593220338983052e-07,
|
|
"loss": 1.269,
|
|
"mean_token_accuracy": 0.741723358631134,
|
|
"num_tokens": 14983974.0,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.048830111902339775,
|
|
"grad_norm": 27.293691635131836,
|
|
"learning_rate": 1.5932203389830506e-07,
|
|
"loss": 1.2908,
|
|
"mean_token_accuracy": 0.7345436811447144,
|
|
"num_tokens": 15289033.0,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.04984740590030519,
|
|
"grad_norm": 25.178010940551758,
|
|
"learning_rate": 1.6271186440677965e-07,
|
|
"loss": 1.2383,
|
|
"mean_token_accuracy": 0.7389808893203735,
|
|
"num_tokens": 15594811.0,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.0508646998982706,
|
|
"grad_norm": 20.5047664642334,
|
|
"learning_rate": 1.6610169491525421e-07,
|
|
"loss": 1.1516,
|
|
"mean_token_accuracy": 0.7475082874298096,
|
|
"num_tokens": 15922340.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.05188199389623601,
|
|
"grad_norm": 19.705347061157227,
|
|
"learning_rate": 1.694915254237288e-07,
|
|
"loss": 1.1562,
|
|
"mean_token_accuracy": 0.7475390434265137,
|
|
"num_tokens": 16238658.0,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.052899287894201424,
|
|
"grad_norm": 20.425750732421875,
|
|
"learning_rate": 1.7288135593220337e-07,
|
|
"loss": 1.1638,
|
|
"mean_token_accuracy": 0.7460206747055054,
|
|
"num_tokens": 16546360.0,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.05391658189216684,
|
|
"grad_norm": 20.014739990234375,
|
|
"learning_rate": 1.7627118644067796e-07,
|
|
"loss": 1.1551,
|
|
"mean_token_accuracy": 0.7466144561767578,
|
|
"num_tokens": 16860358.0,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 0.054933875890132246,
|
|
"grad_norm": 19.32347869873047,
|
|
"learning_rate": 1.7966101694915252e-07,
|
|
"loss": 1.1763,
|
|
"mean_token_accuracy": 0.7402143478393555,
|
|
"num_tokens": 17181452.0,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 0.05595116988809766,
|
|
"grad_norm": 19.63179588317871,
|
|
"learning_rate": 1.8305084745762712e-07,
|
|
"loss": 1.1501,
|
|
"mean_token_accuracy": 0.7471902370452881,
|
|
"num_tokens": 17493379.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.056968463886063074,
|
|
"grad_norm": 19.4603328704834,
|
|
"learning_rate": 1.8644067796610168e-07,
|
|
"loss": 1.1404,
|
|
"mean_token_accuracy": 0.75003582239151,
|
|
"num_tokens": 17810924.0,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 0.05798575788402848,
|
|
"grad_norm": 18.85099983215332,
|
|
"learning_rate": 1.8983050847457627e-07,
|
|
"loss": 1.166,
|
|
"mean_token_accuracy": 0.7426574230194092,
|
|
"num_tokens": 18134320.0,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 0.059003051881993895,
|
|
"grad_norm": 20.299636840820312,
|
|
"learning_rate": 1.9322033898305084e-07,
|
|
"loss": 1.1747,
|
|
"mean_token_accuracy": 0.7439650297164917,
|
|
"num_tokens": 18426725.0,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 0.06002034587995931,
|
|
"grad_norm": 18.17243003845215,
|
|
"learning_rate": 1.9661016949152543e-07,
|
|
"loss": 1.1545,
|
|
"mean_token_accuracy": 0.7428985834121704,
|
|
"num_tokens": 18749574.0,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 0.061037639877924724,
|
|
"grad_norm": 17.998579025268555,
|
|
"learning_rate": 2e-07,
|
|
"loss": 1.1666,
|
|
"mean_token_accuracy": 0.7411248683929443,
|
|
"num_tokens": 19068637.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.06205493387589013,
|
|
"grad_norm": 18.67633056640625,
|
|
"learning_rate": 2.0338983050847458e-07,
|
|
"loss": 1.1499,
|
|
"mean_token_accuracy": 0.7433934211730957,
|
|
"num_tokens": 19367232.0,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 0.06307222787385554,
|
|
"grad_norm": 16.794296264648438,
|
|
"learning_rate": 2.0677966101694912e-07,
|
|
"loss": 1.1185,
|
|
"mean_token_accuracy": 0.7493900060653687,
|
|
"num_tokens": 19693698.0,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 0.06408952187182096,
|
|
"grad_norm": 17.70916748046875,
|
|
"learning_rate": 2.101694915254237e-07,
|
|
"loss": 1.1458,
|
|
"mean_token_accuracy": 0.7437374591827393,
|
|
"num_tokens": 20001680.0,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 0.06510681586978637,
|
|
"grad_norm": 16.543203353881836,
|
|
"learning_rate": 2.1355932203389828e-07,
|
|
"loss": 1.1166,
|
|
"mean_token_accuracy": 0.7477895617485046,
|
|
"num_tokens": 20326096.0,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 0.06612410986775177,
|
|
"grad_norm": 15.88288688659668,
|
|
"learning_rate": 2.1694915254237287e-07,
|
|
"loss": 1.1553,
|
|
"mean_token_accuracy": 0.7364981770515442,
|
|
"num_tokens": 20657596.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.0671414038657172,
|
|
"grad_norm": 15.658295631408691,
|
|
"learning_rate": 2.2033898305084743e-07,
|
|
"loss": 1.1007,
|
|
"mean_token_accuracy": 0.7487651705741882,
|
|
"num_tokens": 20978319.0,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 0.0681586978636826,
|
|
"grad_norm": 15.981629371643066,
|
|
"learning_rate": 2.2372881355932202e-07,
|
|
"loss": 1.0944,
|
|
"mean_token_accuracy": 0.7503189444541931,
|
|
"num_tokens": 21283777.0,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 0.06917599186164802,
|
|
"grad_norm": 12.884788513183594,
|
|
"learning_rate": 2.271186440677966e-07,
|
|
"loss": 1.0762,
|
|
"mean_token_accuracy": 0.7451905012130737,
|
|
"num_tokens": 21590885.0,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 0.07019328585961343,
|
|
"grad_norm": 9.129504203796387,
|
|
"learning_rate": 2.3050847457627118e-07,
|
|
"loss": 1.0612,
|
|
"mean_token_accuracy": 0.7446426153182983,
|
|
"num_tokens": 21898426.0,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 0.07121057985757884,
|
|
"grad_norm": 7.249643802642822,
|
|
"learning_rate": 2.3389830508474577e-07,
|
|
"loss": 1.0684,
|
|
"mean_token_accuracy": 0.7394939661026001,
|
|
"num_tokens": 22215818.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.07222787385554426,
|
|
"grad_norm": 6.1339192390441895,
|
|
"learning_rate": 2.3728813559322033e-07,
|
|
"loss": 1.0216,
|
|
"mean_token_accuracy": 0.7490743398666382,
|
|
"num_tokens": 22537203.0,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 0.07324516785350967,
|
|
"grad_norm": 6.061101913452148,
|
|
"learning_rate": 2.406779661016949e-07,
|
|
"loss": 1.0146,
|
|
"mean_token_accuracy": 0.7498236298561096,
|
|
"num_tokens": 22850630.0,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 0.07426246185147507,
|
|
"grad_norm": 5.744910717010498,
|
|
"learning_rate": 2.440677966101695e-07,
|
|
"loss": 1.0161,
|
|
"mean_token_accuracy": 0.7497204542160034,
|
|
"num_tokens": 23159234.0,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 0.0752797558494405,
|
|
"grad_norm": 5.69437313079834,
|
|
"learning_rate": 2.4745762711864405e-07,
|
|
"loss": 1.0004,
|
|
"mean_token_accuracy": 0.7524319887161255,
|
|
"num_tokens": 23473092.0,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 0.0762970498474059,
|
|
"grad_norm": 5.5687785148620605,
|
|
"learning_rate": 2.508474576271186e-07,
|
|
"loss": 0.9938,
|
|
"mean_token_accuracy": 0.7536816596984863,
|
|
"num_tokens": 23777307.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.07731434384537131,
|
|
"grad_norm": 5.6238203048706055,
|
|
"learning_rate": 2.542372881355932e-07,
|
|
"loss": 1.0231,
|
|
"mean_token_accuracy": 0.7456361055374146,
|
|
"num_tokens": 24087522.0,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 0.07833163784333673,
|
|
"grad_norm": 5.4180378913879395,
|
|
"learning_rate": 2.576271186440678e-07,
|
|
"loss": 1.0101,
|
|
"mean_token_accuracy": 0.7502592206001282,
|
|
"num_tokens": 24394212.0,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 0.07934893184130214,
|
|
"grad_norm": 5.385940074920654,
|
|
"learning_rate": 2.6101694915254236e-07,
|
|
"loss": 1.0241,
|
|
"mean_token_accuracy": 0.7468531131744385,
|
|
"num_tokens": 24703099.0,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 0.08036622583926754,
|
|
"grad_norm": 5.0685882568359375,
|
|
"learning_rate": 2.6440677966101693e-07,
|
|
"loss": 0.9893,
|
|
"mean_token_accuracy": 0.7535327672958374,
|
|
"num_tokens": 25018302.0,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 0.08138351983723296,
|
|
"grad_norm": 4.923753261566162,
|
|
"learning_rate": 2.677966101694915e-07,
|
|
"loss": 0.9959,
|
|
"mean_token_accuracy": 0.7521530389785767,
|
|
"num_tokens": 25345695.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.08240081383519837,
|
|
"grad_norm": 4.761390686035156,
|
|
"learning_rate": 2.711864406779661e-07,
|
|
"loss": 0.972,
|
|
"mean_token_accuracy": 0.7573574185371399,
|
|
"num_tokens": 25674463.0,
|
|
"step": 81
|
|
},
|
|
{
|
|
"epoch": 0.08341810783316378,
|
|
"grad_norm": 4.758273601531982,
|
|
"learning_rate": 2.745762711864407e-07,
|
|
"loss": 0.9888,
|
|
"mean_token_accuracy": 0.7523093223571777,
|
|
"num_tokens": 25991547.0,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 0.0844354018311292,
|
|
"grad_norm": 4.4657158851623535,
|
|
"learning_rate": 2.7796610169491524e-07,
|
|
"loss": 0.977,
|
|
"mean_token_accuracy": 0.7544059157371521,
|
|
"num_tokens": 26310637.0,
|
|
"step": 83
|
|
},
|
|
{
|
|
"epoch": 0.08545269582909461,
|
|
"grad_norm": 4.273565292358398,
|
|
"learning_rate": 2.813559322033898e-07,
|
|
"loss": 0.9622,
|
|
"mean_token_accuracy": 0.7580838799476624,
|
|
"num_tokens": 26632464.0,
|
|
"step": 84
|
|
},
|
|
{
|
|
"epoch": 0.08646998982706001,
|
|
"grad_norm": 4.093416690826416,
|
|
"learning_rate": 2.847457627118644e-07,
|
|
"loss": 0.9687,
|
|
"mean_token_accuracy": 0.7556474208831787,
|
|
"num_tokens": 26951965.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.08748728382502544,
|
|
"grad_norm": 3.6930601596832275,
|
|
"learning_rate": 2.88135593220339e-07,
|
|
"loss": 0.9338,
|
|
"mean_token_accuracy": 0.7628868818283081,
|
|
"num_tokens": 27279172.0,
|
|
"step": 86
|
|
},
|
|
{
|
|
"epoch": 0.08850457782299084,
|
|
"grad_norm": 3.5280673503875732,
|
|
"learning_rate": 2.9152542372881355e-07,
|
|
"loss": 0.9686,
|
|
"mean_token_accuracy": 0.754558801651001,
|
|
"num_tokens": 27595872.0,
|
|
"step": 87
|
|
},
|
|
{
|
|
"epoch": 0.08952187182095625,
|
|
"grad_norm": 3.2873504161834717,
|
|
"learning_rate": 2.949152542372881e-07,
|
|
"loss": 0.9341,
|
|
"mean_token_accuracy": 0.7624785304069519,
|
|
"num_tokens": 27923943.0,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 0.09053916581892167,
|
|
"grad_norm": 3.2077414989471436,
|
|
"learning_rate": 2.9830508474576273e-07,
|
|
"loss": 0.9196,
|
|
"mean_token_accuracy": 0.7646963000297546,
|
|
"num_tokens": 28243518.0,
|
|
"step": 89
|
|
},
|
|
{
|
|
"epoch": 0.09155645981688708,
|
|
"grad_norm": 3.1801390647888184,
|
|
"learning_rate": 3.016949152542373e-07,
|
|
"loss": 0.9401,
|
|
"mean_token_accuracy": 0.758694589138031,
|
|
"num_tokens": 28558441.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.09257375381485249,
|
|
"grad_norm": 3.2310826778411865,
|
|
"learning_rate": 3.0508474576271186e-07,
|
|
"loss": 0.9157,
|
|
"mean_token_accuracy": 0.7642312049865723,
|
|
"num_tokens": 28877502.0,
|
|
"step": 91
|
|
},
|
|
{
|
|
"epoch": 0.0935910478128179,
|
|
"grad_norm": 3.286752462387085,
|
|
"learning_rate": 3.084745762711864e-07,
|
|
"loss": 0.9419,
|
|
"mean_token_accuracy": 0.7571986317634583,
|
|
"num_tokens": 29189342.0,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 0.09460834181078331,
|
|
"grad_norm": 3.144559383392334,
|
|
"learning_rate": 3.1186440677966104e-07,
|
|
"loss": 0.8815,
|
|
"mean_token_accuracy": 0.7729621529579163,
|
|
"num_tokens": 29523929.0,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 0.09562563580874874,
|
|
"grad_norm": 3.3081698417663574,
|
|
"learning_rate": 3.152542372881356e-07,
|
|
"loss": 0.9071,
|
|
"mean_token_accuracy": 0.7662538290023804,
|
|
"num_tokens": 29839653.0,
|
|
"step": 94
|
|
},
|
|
{
|
|
"epoch": 0.09664292980671414,
|
|
"grad_norm": 3.4476613998413086,
|
|
"learning_rate": 3.186440677966101e-07,
|
|
"loss": 0.9231,
|
|
"mean_token_accuracy": 0.7608891129493713,
|
|
"num_tokens": 30163501.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.09766022380467955,
|
|
"grad_norm": 3.5375242233276367,
|
|
"learning_rate": 3.220338983050847e-07,
|
|
"loss": 0.9056,
|
|
"mean_token_accuracy": 0.764274001121521,
|
|
"num_tokens": 30492907.0,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 0.09867751780264497,
|
|
"grad_norm": 3.2265610694885254,
|
|
"learning_rate": 3.254237288135593e-07,
|
|
"loss": 0.8757,
|
|
"mean_token_accuracy": 0.7704571485519409,
|
|
"num_tokens": 30830951.0,
|
|
"step": 97
|
|
},
|
|
{
|
|
"epoch": 0.09969481180061038,
|
|
"grad_norm": 3.309645175933838,
|
|
"learning_rate": 3.2881355932203386e-07,
|
|
"loss": 0.8855,
|
|
"mean_token_accuracy": 0.7690130472183228,
|
|
"num_tokens": 31153089.0,
|
|
"step": 98
|
|
},
|
|
{
|
|
"epoch": 0.10071210579857579,
|
|
"grad_norm": 3.1601974964141846,
|
|
"learning_rate": 3.3220338983050843e-07,
|
|
"loss": 0.9156,
|
|
"mean_token_accuracy": 0.7623469829559326,
|
|
"num_tokens": 31462790.0,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 0.1017293997965412,
|
|
"grad_norm": 3.098494052886963,
|
|
"learning_rate": 3.35593220338983e-07,
|
|
"loss": 0.9097,
|
|
"mean_token_accuracy": 0.7615648508071899,
|
|
"num_tokens": 31757983.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.10274669379450661,
|
|
"grad_norm": 2.887308359146118,
|
|
"learning_rate": 3.389830508474576e-07,
|
|
"loss": 0.8915,
|
|
"mean_token_accuracy": 0.7654359340667725,
|
|
"num_tokens": 32072015.0,
|
|
"step": 101
|
|
},
|
|
{
|
|
"epoch": 0.10376398779247202,
|
|
"grad_norm": 2.7770333290100098,
|
|
"learning_rate": 3.423728813559322e-07,
|
|
"loss": 0.8763,
|
|
"mean_token_accuracy": 0.7692670822143555,
|
|
"num_tokens": 32401421.0,
|
|
"step": 102
|
|
},
|
|
{
|
|
"epoch": 0.10478128179043744,
|
|
"grad_norm": 2.542947769165039,
|
|
"learning_rate": 3.4576271186440674e-07,
|
|
"loss": 0.8768,
|
|
"mean_token_accuracy": 0.7691189050674438,
|
|
"num_tokens": 32710194.0,
|
|
"step": 103
|
|
},
|
|
{
|
|
"epoch": 0.10579857578840285,
|
|
"grad_norm": 2.471694231033325,
|
|
"learning_rate": 3.4915254237288136e-07,
|
|
"loss": 0.896,
|
|
"mean_token_accuracy": 0.7648525238037109,
|
|
"num_tokens": 33048273.0,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 0.10681586978636826,
|
|
"grad_norm": 2.376176357269287,
|
|
"learning_rate": 3.525423728813559e-07,
|
|
"loss": 0.8623,
|
|
"mean_token_accuracy": 0.7729743123054504,
|
|
"num_tokens": 33375803.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.10783316378433368,
|
|
"grad_norm": 2.270570755004883,
|
|
"learning_rate": 3.559322033898305e-07,
|
|
"loss": 0.903,
|
|
"mean_token_accuracy": 0.7616955637931824,
|
|
"num_tokens": 33675958.0,
|
|
"step": 106
|
|
},
|
|
{
|
|
"epoch": 0.10885045778229908,
|
|
"grad_norm": 2.171126127243042,
|
|
"learning_rate": 3.5932203389830505e-07,
|
|
"loss": 0.8738,
|
|
"mean_token_accuracy": 0.7685263156890869,
|
|
"num_tokens": 34013911.0,
|
|
"step": 107
|
|
},
|
|
{
|
|
"epoch": 0.10986775178026449,
|
|
"grad_norm": 2.0027692317962646,
|
|
"learning_rate": 3.6271186440677967e-07,
|
|
"loss": 0.8549,
|
|
"mean_token_accuracy": 0.7736266851425171,
|
|
"num_tokens": 34337717.0,
|
|
"step": 108
|
|
},
|
|
{
|
|
"epoch": 0.11088504577822991,
|
|
"grad_norm": 1.915738582611084,
|
|
"learning_rate": 3.6610169491525423e-07,
|
|
"loss": 0.8559,
|
|
"mean_token_accuracy": 0.7725293040275574,
|
|
"num_tokens": 34668424.0,
|
|
"step": 109
|
|
},
|
|
{
|
|
"epoch": 0.11190233977619532,
|
|
"grad_norm": 1.8464709520339966,
|
|
"learning_rate": 3.694915254237288e-07,
|
|
"loss": 0.864,
|
|
"mean_token_accuracy": 0.7720854878425598,
|
|
"num_tokens": 34986133.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.11291963377416073,
|
|
"grad_norm": 1.863933801651001,
|
|
"learning_rate": 3.7288135593220336e-07,
|
|
"loss": 0.8559,
|
|
"mean_token_accuracy": 0.7732508778572083,
|
|
"num_tokens": 35301888.0,
|
|
"step": 111
|
|
},
|
|
{
|
|
"epoch": 0.11393692777212615,
|
|
"grad_norm": 1.771524429321289,
|
|
"learning_rate": 3.76271186440678e-07,
|
|
"loss": 0.8498,
|
|
"mean_token_accuracy": 0.7744065523147583,
|
|
"num_tokens": 35621625.0,
|
|
"step": 112
|
|
},
|
|
{
|
|
"epoch": 0.11495422177009156,
|
|
"grad_norm": 1.7587084770202637,
|
|
"learning_rate": 3.7966101694915254e-07,
|
|
"loss": 0.8296,
|
|
"mean_token_accuracy": 0.7790131568908691,
|
|
"num_tokens": 35936577.0,
|
|
"step": 113
|
|
},
|
|
{
|
|
"epoch": 0.11597151576805696,
|
|
"grad_norm": 1.6823285818099976,
|
|
"learning_rate": 3.830508474576271e-07,
|
|
"loss": 0.8507,
|
|
"mean_token_accuracy": 0.7727391123771667,
|
|
"num_tokens": 36258492.0,
|
|
"step": 114
|
|
},
|
|
{
|
|
"epoch": 0.11698880976602238,
|
|
"grad_norm": 1.6039552688598633,
|
|
"learning_rate": 3.8644067796610167e-07,
|
|
"loss": 0.8337,
|
|
"mean_token_accuracy": 0.7773861289024353,
|
|
"num_tokens": 36569958.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.11800610376398779,
|
|
"grad_norm": 1.829788327217102,
|
|
"learning_rate": 3.898305084745763e-07,
|
|
"loss": 0.8552,
|
|
"mean_token_accuracy": 0.7726601362228394,
|
|
"num_tokens": 36886881.0,
|
|
"step": 116
|
|
},
|
|
{
|
|
"epoch": 0.1190233977619532,
|
|
"grad_norm": 1.4620957374572754,
|
|
"learning_rate": 3.9322033898305085e-07,
|
|
"loss": 0.8252,
|
|
"mean_token_accuracy": 0.7793046236038208,
|
|
"num_tokens": 37215889.0,
|
|
"step": 117
|
|
},
|
|
{
|
|
"epoch": 0.12004069175991862,
|
|
"grad_norm": 1.456753134727478,
|
|
"learning_rate": 3.966101694915254e-07,
|
|
"loss": 0.8254,
|
|
"mean_token_accuracy": 0.7786370515823364,
|
|
"num_tokens": 37528382.0,
|
|
"step": 118
|
|
},
|
|
{
|
|
"epoch": 0.12105798575788403,
|
|
"grad_norm": 1.4053311347961426,
|
|
"learning_rate": 4e-07,
|
|
"loss": 0.8491,
|
|
"mean_token_accuracy": 0.7720214128494263,
|
|
"num_tokens": 37848901.0,
|
|
"step": 119
|
|
},
|
|
{
|
|
"epoch": 0.12207527975584945,
|
|
"grad_norm": 1.395704984664917,
|
|
"learning_rate": 4.033898305084746e-07,
|
|
"loss": 0.8484,
|
|
"mean_token_accuracy": 0.7713293433189392,
|
|
"num_tokens": 38166048.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.12309257375381485,
|
|
"grad_norm": 1.3442366123199463,
|
|
"learning_rate": 4.0677966101694916e-07,
|
|
"loss": 0.7989,
|
|
"mean_token_accuracy": 0.7831647992134094,
|
|
"num_tokens": 38487350.0,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 0.12410986775178026,
|
|
"grad_norm": 1.3379216194152832,
|
|
"learning_rate": 4.101694915254237e-07,
|
|
"loss": 0.8444,
|
|
"mean_token_accuracy": 0.7723356485366821,
|
|
"num_tokens": 38803903.0,
|
|
"step": 122
|
|
},
|
|
{
|
|
"epoch": 0.12512716174974567,
|
|
"grad_norm": 1.4637871980667114,
|
|
"learning_rate": 4.1355932203389824e-07,
|
|
"loss": 0.806,
|
|
"mean_token_accuracy": 0.7808359861373901,
|
|
"num_tokens": 39109262.0,
|
|
"step": 123
|
|
},
|
|
{
|
|
"epoch": 0.12614445574771108,
|
|
"grad_norm": 1.468603491783142,
|
|
"learning_rate": 4.1694915254237286e-07,
|
|
"loss": 0.8097,
|
|
"mean_token_accuracy": 0.77936190366745,
|
|
"num_tokens": 39438988.0,
|
|
"step": 124
|
|
},
|
|
{
|
|
"epoch": 0.1271617497456765,
|
|
"grad_norm": 1.3074346780776978,
|
|
"learning_rate": 4.203389830508474e-07,
|
|
"loss": 0.8555,
|
|
"mean_token_accuracy": 0.7697174549102783,
|
|
"num_tokens": 39780179.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.12817904374364192,
|
|
"grad_norm": 1.2865965366363525,
|
|
"learning_rate": 4.23728813559322e-07,
|
|
"loss": 0.832,
|
|
"mean_token_accuracy": 0.7754062414169312,
|
|
"num_tokens": 40103824.0,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 0.12919633774160733,
|
|
"grad_norm": 1.2583401203155518,
|
|
"learning_rate": 4.2711864406779655e-07,
|
|
"loss": 0.8114,
|
|
"mean_token_accuracy": 0.7794609069824219,
|
|
"num_tokens": 40433373.0,
|
|
"step": 127
|
|
},
|
|
{
|
|
"epoch": 0.13021363173957273,
|
|
"grad_norm": 1.1636322736740112,
|
|
"learning_rate": 4.3050847457627117e-07,
|
|
"loss": 0.8085,
|
|
"mean_token_accuracy": 0.7795127630233765,
|
|
"num_tokens": 40743349.0,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 0.13123092573753814,
|
|
"grad_norm": 1.1396219730377197,
|
|
"learning_rate": 4.3389830508474573e-07,
|
|
"loss": 0.7826,
|
|
"mean_token_accuracy": 0.7847133278846741,
|
|
"num_tokens": 41064135.0,
|
|
"step": 129
|
|
},
|
|
{
|
|
"epoch": 0.13224821973550355,
|
|
"grad_norm": 1.1963047981262207,
|
|
"learning_rate": 4.372881355932203e-07,
|
|
"loss": 0.7915,
|
|
"mean_token_accuracy": 0.7845783233642578,
|
|
"num_tokens": 41373300.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.13326551373346898,
|
|
"grad_norm": 1.128556489944458,
|
|
"learning_rate": 4.4067796610169486e-07,
|
|
"loss": 0.824,
|
|
"mean_token_accuracy": 0.7758683562278748,
|
|
"num_tokens": 41699217.0,
|
|
"step": 131
|
|
},
|
|
{
|
|
"epoch": 0.1342828077314344,
|
|
"grad_norm": 1.1753361225128174,
|
|
"learning_rate": 4.440677966101695e-07,
|
|
"loss": 0.8166,
|
|
"mean_token_accuracy": 0.7765942811965942,
|
|
"num_tokens": 42029303.0,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 0.1353001017293998,
|
|
"grad_norm": 1.1195635795593262,
|
|
"learning_rate": 4.4745762711864404e-07,
|
|
"loss": 0.8082,
|
|
"mean_token_accuracy": 0.7790402173995972,
|
|
"num_tokens": 42349174.0,
|
|
"step": 133
|
|
},
|
|
{
|
|
"epoch": 0.1363173957273652,
|
|
"grad_norm": 1.065693974494934,
|
|
"learning_rate": 4.508474576271186e-07,
|
|
"loss": 0.7991,
|
|
"mean_token_accuracy": 0.7818088531494141,
|
|
"num_tokens": 42676569.0,
|
|
"step": 134
|
|
},
|
|
{
|
|
"epoch": 0.1373346897253306,
|
|
"grad_norm": 1.0510700941085815,
|
|
"learning_rate": 4.542372881355932e-07,
|
|
"loss": 0.8062,
|
|
"mean_token_accuracy": 0.7790409922599792,
|
|
"num_tokens": 42992073.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.13835198372329605,
|
|
"grad_norm": 1.014585018157959,
|
|
"learning_rate": 4.576271186440678e-07,
|
|
"loss": 0.7901,
|
|
"mean_token_accuracy": 0.7838433980941772,
|
|
"num_tokens": 43319905.0,
|
|
"step": 136
|
|
},
|
|
{
|
|
"epoch": 0.13936927772126145,
|
|
"grad_norm": 1.0587494373321533,
|
|
"learning_rate": 4.6101694915254235e-07,
|
|
"loss": 0.782,
|
|
"mean_token_accuracy": 0.7848911285400391,
|
|
"num_tokens": 43642021.0,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 0.14038657171922686,
|
|
"grad_norm": 1.0070114135742188,
|
|
"learning_rate": 4.644067796610169e-07,
|
|
"loss": 0.7956,
|
|
"mean_token_accuracy": 0.7806844711303711,
|
|
"num_tokens": 43948725.0,
|
|
"step": 138
|
|
},
|
|
{
|
|
"epoch": 0.14140386571719227,
|
|
"grad_norm": 1.0419402122497559,
|
|
"learning_rate": 4.6779661016949154e-07,
|
|
"loss": 0.7772,
|
|
"mean_token_accuracy": 0.7851964831352234,
|
|
"num_tokens": 44265793.0,
|
|
"step": 139
|
|
},
|
|
{
|
|
"epoch": 0.14242115971515767,
|
|
"grad_norm": 1.03512442111969,
|
|
"learning_rate": 4.711864406779661e-07,
|
|
"loss": 0.7913,
|
|
"mean_token_accuracy": 0.7820444107055664,
|
|
"num_tokens": 44585677.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.14343845371312308,
|
|
"grad_norm": 0.9669404029846191,
|
|
"learning_rate": 4.7457627118644066e-07,
|
|
"loss": 0.7981,
|
|
"mean_token_accuracy": 0.7805585265159607,
|
|
"num_tokens": 44896422.0,
|
|
"step": 141
|
|
},
|
|
{
|
|
"epoch": 0.14445574771108852,
|
|
"grad_norm": 0.9535713195800781,
|
|
"learning_rate": 4.779661016949152e-07,
|
|
"loss": 0.7657,
|
|
"mean_token_accuracy": 0.7876882553100586,
|
|
"num_tokens": 45222963.0,
|
|
"step": 142
|
|
},
|
|
{
|
|
"epoch": 0.14547304170905392,
|
|
"grad_norm": 0.9769468903541565,
|
|
"learning_rate": 4.813559322033898e-07,
|
|
"loss": 0.7582,
|
|
"mean_token_accuracy": 0.7912623882293701,
|
|
"num_tokens": 45525565.0,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 0.14649033570701933,
|
|
"grad_norm": 0.9299623370170593,
|
|
"learning_rate": 4.847457627118644e-07,
|
|
"loss": 0.7813,
|
|
"mean_token_accuracy": 0.7845970392227173,
|
|
"num_tokens": 45852191.0,
|
|
"step": 144
|
|
},
|
|
{
|
|
"epoch": 0.14750762970498474,
|
|
"grad_norm": 1.0642598867416382,
|
|
"learning_rate": 4.88135593220339e-07,
|
|
"loss": 0.8012,
|
|
"mean_token_accuracy": 0.781639575958252,
|
|
"num_tokens": 46163464.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.14852492370295015,
|
|
"grad_norm": 0.9342326521873474,
|
|
"learning_rate": 4.915254237288136e-07,
|
|
"loss": 0.7732,
|
|
"mean_token_accuracy": 0.7873122096061707,
|
|
"num_tokens": 46475098.0,
|
|
"step": 146
|
|
},
|
|
{
|
|
"epoch": 0.14954221770091555,
|
|
"grad_norm": 0.9148728251457214,
|
|
"learning_rate": 4.949152542372881e-07,
|
|
"loss": 0.7663,
|
|
"mean_token_accuracy": 0.7879598140716553,
|
|
"num_tokens": 46799760.0,
|
|
"step": 147
|
|
},
|
|
{
|
|
"epoch": 0.150559511698881,
|
|
"grad_norm": 0.8839486837387085,
|
|
"learning_rate": 4.983050847457627e-07,
|
|
"loss": 0.7478,
|
|
"mean_token_accuracy": 0.7923979759216309,
|
|
"num_tokens": 47128771.0,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 0.1515768056968464,
|
|
"grad_norm": 0.9131452441215515,
|
|
"learning_rate": 5.016949152542372e-07,
|
|
"loss": 0.7751,
|
|
"mean_token_accuracy": 0.7854825258255005,
|
|
"num_tokens": 47455167.0,
|
|
"step": 149
|
|
},
|
|
{
|
|
"epoch": 0.1525940996948118,
|
|
"grad_norm": 0.8893353343009949,
|
|
"learning_rate": 5.050847457627119e-07,
|
|
"loss": 0.7649,
|
|
"mean_token_accuracy": 0.7875303030014038,
|
|
"num_tokens": 47768491.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.1536113936927772,
|
|
"grad_norm": 0.9076464772224426,
|
|
"learning_rate": 5.084745762711864e-07,
|
|
"loss": 0.7347,
|
|
"mean_token_accuracy": 0.7955377101898193,
|
|
"num_tokens": 48076479.0,
|
|
"step": 151
|
|
},
|
|
{
|
|
"epoch": 0.15462868769074262,
|
|
"grad_norm": 0.9250063896179199,
|
|
"learning_rate": 5.11864406779661e-07,
|
|
"loss": 0.7281,
|
|
"mean_token_accuracy": 0.7981938123703003,
|
|
"num_tokens": 48390048.0,
|
|
"step": 152
|
|
},
|
|
{
|
|
"epoch": 0.15564598168870802,
|
|
"grad_norm": 0.86025470495224,
|
|
"learning_rate": 5.152542372881356e-07,
|
|
"loss": 0.7421,
|
|
"mean_token_accuracy": 0.793510913848877,
|
|
"num_tokens": 48712282.0,
|
|
"step": 153
|
|
},
|
|
{
|
|
"epoch": 0.15666327568667346,
|
|
"grad_norm": 0.8844801783561707,
|
|
"learning_rate": 5.186440677966102e-07,
|
|
"loss": 0.7886,
|
|
"mean_token_accuracy": 0.7832847237586975,
|
|
"num_tokens": 49032570.0,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 0.15768056968463887,
|
|
"grad_norm": 0.8823682069778442,
|
|
"learning_rate": 5.220338983050847e-07,
|
|
"loss": 0.7683,
|
|
"mean_token_accuracy": 0.7857640981674194,
|
|
"num_tokens": 49357658.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.15869786368260427,
|
|
"grad_norm": 0.8387380838394165,
|
|
"learning_rate": 5.254237288135593e-07,
|
|
"loss": 0.7383,
|
|
"mean_token_accuracy": 0.7938828468322754,
|
|
"num_tokens": 49678022.0,
|
|
"step": 156
|
|
},
|
|
{
|
|
"epoch": 0.15971515768056968,
|
|
"grad_norm": 0.8787022829055786,
|
|
"learning_rate": 5.288135593220339e-07,
|
|
"loss": 0.7387,
|
|
"mean_token_accuracy": 0.7946102023124695,
|
|
"num_tokens": 50003658.0,
|
|
"step": 157
|
|
},
|
|
{
|
|
"epoch": 0.1607324516785351,
|
|
"grad_norm": 0.8462948203086853,
|
|
"learning_rate": 5.322033898305085e-07,
|
|
"loss": 0.7505,
|
|
"mean_token_accuracy": 0.7912555932998657,
|
|
"num_tokens": 50337188.0,
|
|
"step": 158
|
|
},
|
|
{
|
|
"epoch": 0.1617497456765005,
|
|
"grad_norm": 1.0470606088638306,
|
|
"learning_rate": 5.35593220338983e-07,
|
|
"loss": 0.7759,
|
|
"mean_token_accuracy": 0.7850117087364197,
|
|
"num_tokens": 50649690.0,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 0.16276703967446593,
|
|
"grad_norm": 0.8577026128768921,
|
|
"learning_rate": 5.389830508474577e-07,
|
|
"loss": 0.7579,
|
|
"mean_token_accuracy": 0.7884396910667419,
|
|
"num_tokens": 50973651.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.16378433367243134,
|
|
"grad_norm": 0.8702759742736816,
|
|
"learning_rate": 5.423728813559322e-07,
|
|
"loss": 0.7647,
|
|
"mean_token_accuracy": 0.7875997424125671,
|
|
"num_tokens": 51302446.0,
|
|
"step": 161
|
|
},
|
|
{
|
|
"epoch": 0.16480162767039674,
|
|
"grad_norm": 0.8638347387313843,
|
|
"learning_rate": 5.457627118644067e-07,
|
|
"loss": 0.7555,
|
|
"mean_token_accuracy": 0.7890993356704712,
|
|
"num_tokens": 51613127.0,
|
|
"step": 162
|
|
},
|
|
{
|
|
"epoch": 0.16581892166836215,
|
|
"grad_norm": 0.8361454606056213,
|
|
"learning_rate": 5.491525423728813e-07,
|
|
"loss": 0.7335,
|
|
"mean_token_accuracy": 0.7952470779418945,
|
|
"num_tokens": 51935798.0,
|
|
"step": 163
|
|
},
|
|
{
|
|
"epoch": 0.16683621566632756,
|
|
"grad_norm": 0.8453940749168396,
|
|
"learning_rate": 5.525423728813559e-07,
|
|
"loss": 0.7525,
|
|
"mean_token_accuracy": 0.7894124984741211,
|
|
"num_tokens": 52241386.0,
|
|
"step": 164
|
|
},
|
|
{
|
|
"epoch": 0.167853509664293,
|
|
"grad_norm": 0.8433927297592163,
|
|
"learning_rate": 5.559322033898305e-07,
|
|
"loss": 0.7566,
|
|
"mean_token_accuracy": 0.7888467907905579,
|
|
"num_tokens": 52550468.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.1688708036622584,
|
|
"grad_norm": 0.890924870967865,
|
|
"learning_rate": 5.59322033898305e-07,
|
|
"loss": 0.7577,
|
|
"mean_token_accuracy": 0.7890304327011108,
|
|
"num_tokens": 52870603.0,
|
|
"step": 166
|
|
},
|
|
{
|
|
"epoch": 0.1698880976602238,
|
|
"grad_norm": 0.8495107293128967,
|
|
"learning_rate": 5.627118644067796e-07,
|
|
"loss": 0.7612,
|
|
"mean_token_accuracy": 0.7883809804916382,
|
|
"num_tokens": 53186596.0,
|
|
"step": 167
|
|
},
|
|
{
|
|
"epoch": 0.17090539165818922,
|
|
"grad_norm": 0.8580964207649231,
|
|
"learning_rate": 5.661016949152541e-07,
|
|
"loss": 0.7522,
|
|
"mean_token_accuracy": 0.7893248796463013,
|
|
"num_tokens": 53511107.0,
|
|
"step": 168
|
|
},
|
|
{
|
|
"epoch": 0.17192268565615462,
|
|
"grad_norm": 0.8602516055107117,
|
|
"learning_rate": 5.694915254237288e-07,
|
|
"loss": 0.7306,
|
|
"mean_token_accuracy": 0.7956889867782593,
|
|
"num_tokens": 53829950.0,
|
|
"step": 169
|
|
},
|
|
{
|
|
"epoch": 0.17293997965412003,
|
|
"grad_norm": 0.8338865637779236,
|
|
"learning_rate": 5.728813559322034e-07,
|
|
"loss": 0.7611,
|
|
"mean_token_accuracy": 0.7888447046279907,
|
|
"num_tokens": 54156414.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.17395727365208546,
|
|
"grad_norm": 0.8236330151557922,
|
|
"learning_rate": 5.76271186440678e-07,
|
|
"loss": 0.7451,
|
|
"mean_token_accuracy": 0.7912829518318176,
|
|
"num_tokens": 54487854.0,
|
|
"step": 171
|
|
},
|
|
{
|
|
"epoch": 0.17497456765005087,
|
|
"grad_norm": 0.825598418712616,
|
|
"learning_rate": 5.796610169491525e-07,
|
|
"loss": 0.7408,
|
|
"mean_token_accuracy": 0.7934503555297852,
|
|
"num_tokens": 54799217.0,
|
|
"step": 172
|
|
},
|
|
{
|
|
"epoch": 0.17599186164801628,
|
|
"grad_norm": 0.8386024832725525,
|
|
"learning_rate": 5.830508474576271e-07,
|
|
"loss": 0.7371,
|
|
"mean_token_accuracy": 0.7941863536834717,
|
|
"num_tokens": 55126417.0,
|
|
"step": 173
|
|
},
|
|
{
|
|
"epoch": 0.1770091556459817,
|
|
"grad_norm": 0.8538760542869568,
|
|
"learning_rate": 5.864406779661016e-07,
|
|
"loss": 0.752,
|
|
"mean_token_accuracy": 0.7901371717453003,
|
|
"num_tokens": 55457557.0,
|
|
"step": 174
|
|
},
|
|
{
|
|
"epoch": 0.1780264496439471,
|
|
"grad_norm": 0.8268012404441833,
|
|
"learning_rate": 5.898305084745762e-07,
|
|
"loss": 0.7468,
|
|
"mean_token_accuracy": 0.7909038066864014,
|
|
"num_tokens": 55780975.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.1790437436419125,
|
|
"grad_norm": 0.8469352126121521,
|
|
"learning_rate": 5.932203389830508e-07,
|
|
"loss": 0.7643,
|
|
"mean_token_accuracy": 0.7857322096824646,
|
|
"num_tokens": 56097949.0,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 0.18006103763987794,
|
|
"grad_norm": 0.8765305280685425,
|
|
"learning_rate": 5.966101694915255e-07,
|
|
"loss": 0.7732,
|
|
"mean_token_accuracy": 0.7842049598693848,
|
|
"num_tokens": 56427795.0,
|
|
"step": 177
|
|
},
|
|
{
|
|
"epoch": 0.18107833163784334,
|
|
"grad_norm": 0.9026130437850952,
|
|
"learning_rate": 6e-07,
|
|
"loss": 0.7455,
|
|
"mean_token_accuracy": 0.790698766708374,
|
|
"num_tokens": 56725887.0,
|
|
"step": 178
|
|
},
|
|
{
|
|
"epoch": 0.18209562563580875,
|
|
"grad_norm": 0.8640514016151428,
|
|
"learning_rate": 6.033898305084746e-07,
|
|
"loss": 0.7259,
|
|
"mean_token_accuracy": 0.795634388923645,
|
|
"num_tokens": 57066741.0,
|
|
"step": 179
|
|
},
|
|
{
|
|
"epoch": 0.18311291963377416,
|
|
"grad_norm": 0.8257157802581787,
|
|
"learning_rate": 6.067796610169491e-07,
|
|
"loss": 0.7334,
|
|
"mean_token_accuracy": 0.7942250967025757,
|
|
"num_tokens": 57382943.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.18413021363173956,
|
|
"grad_norm": 0.8310932517051697,
|
|
"learning_rate": 6.101694915254237e-07,
|
|
"loss": 0.7276,
|
|
"mean_token_accuracy": 0.7951348423957825,
|
|
"num_tokens": 57697387.0,
|
|
"step": 181
|
|
},
|
|
{
|
|
"epoch": 0.18514750762970497,
|
|
"grad_norm": 0.7885932326316833,
|
|
"learning_rate": 6.135593220338982e-07,
|
|
"loss": 0.7314,
|
|
"mean_token_accuracy": 0.7951779961585999,
|
|
"num_tokens": 58024333.0,
|
|
"step": 182
|
|
},
|
|
{
|
|
"epoch": 0.1861648016276704,
|
|
"grad_norm": 0.860317051410675,
|
|
"learning_rate": 6.169491525423728e-07,
|
|
"loss": 0.7316,
|
|
"mean_token_accuracy": 0.7940535545349121,
|
|
"num_tokens": 58338761.0,
|
|
"step": 183
|
|
},
|
|
{
|
|
"epoch": 0.1871820956256358,
|
|
"grad_norm": 0.8580062985420227,
|
|
"learning_rate": 6.203389830508475e-07,
|
|
"loss": 0.7538,
|
|
"mean_token_accuracy": 0.7890710234642029,
|
|
"num_tokens": 58643138.0,
|
|
"step": 184
|
|
},
|
|
{
|
|
"epoch": 0.18819938962360122,
|
|
"grad_norm": 0.8123632073402405,
|
|
"learning_rate": 6.237288135593221e-07,
|
|
"loss": 0.7133,
|
|
"mean_token_accuracy": 0.7988241314888,
|
|
"num_tokens": 58978624.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.18921668362156663,
|
|
"grad_norm": 0.8669635653495789,
|
|
"learning_rate": 6.271186440677966e-07,
|
|
"loss": 0.6936,
|
|
"mean_token_accuracy": 0.8056967854499817,
|
|
"num_tokens": 59307831.0,
|
|
"step": 186
|
|
},
|
|
{
|
|
"epoch": 0.19023397761953204,
|
|
"grad_norm": 0.8834171295166016,
|
|
"learning_rate": 6.305084745762712e-07,
|
|
"loss": 0.7359,
|
|
"mean_token_accuracy": 0.7936495542526245,
|
|
"num_tokens": 59612951.0,
|
|
"step": 187
|
|
},
|
|
{
|
|
"epoch": 0.19125127161749747,
|
|
"grad_norm": 0.80832839012146,
|
|
"learning_rate": 6.338983050847457e-07,
|
|
"loss": 0.6905,
|
|
"mean_token_accuracy": 0.8033033609390259,
|
|
"num_tokens": 59940444.0,
|
|
"step": 188
|
|
},
|
|
{
|
|
"epoch": 0.19226856561546288,
|
|
"grad_norm": 0.8026413321495056,
|
|
"learning_rate": 6.372881355932202e-07,
|
|
"loss": 0.726,
|
|
"mean_token_accuracy": 0.7942773103713989,
|
|
"num_tokens": 60270518.0,
|
|
"step": 189
|
|
},
|
|
{
|
|
"epoch": 0.19328585961342828,
|
|
"grad_norm": 0.9134705662727356,
|
|
"learning_rate": 6.406779661016949e-07,
|
|
"loss": 0.7269,
|
|
"mean_token_accuracy": 0.7943723201751709,
|
|
"num_tokens": 60587162.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.1943031536113937,
|
|
"grad_norm": 0.8130223751068115,
|
|
"learning_rate": 6.440677966101694e-07,
|
|
"loss": 0.7114,
|
|
"mean_token_accuracy": 0.7991458177566528,
|
|
"num_tokens": 60917294.0,
|
|
"step": 191
|
|
},
|
|
{
|
|
"epoch": 0.1953204476093591,
|
|
"grad_norm": 0.8199300169944763,
|
|
"learning_rate": 6.474576271186441e-07,
|
|
"loss": 0.7219,
|
|
"mean_token_accuracy": 0.7966375350952148,
|
|
"num_tokens": 61248579.0,
|
|
"step": 192
|
|
},
|
|
{
|
|
"epoch": 0.1963377416073245,
|
|
"grad_norm": 0.8786404132843018,
|
|
"learning_rate": 6.508474576271186e-07,
|
|
"loss": 0.7247,
|
|
"mean_token_accuracy": 0.7953713536262512,
|
|
"num_tokens": 61568050.0,
|
|
"step": 193
|
|
},
|
|
{
|
|
"epoch": 0.19735503560528994,
|
|
"grad_norm": 0.7589143514633179,
|
|
"learning_rate": 6.542372881355932e-07,
|
|
"loss": 0.7081,
|
|
"mean_token_accuracy": 0.8009048104286194,
|
|
"num_tokens": 61895500.0,
|
|
"step": 194
|
|
},
|
|
{
|
|
"epoch": 0.19837232960325535,
|
|
"grad_norm": 0.8833111524581909,
|
|
"learning_rate": 6.576271186440677e-07,
|
|
"loss": 0.7257,
|
|
"mean_token_accuracy": 0.7971071004867554,
|
|
"num_tokens": 62247115.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.19938962360122076,
|
|
"grad_norm": 0.902251660823822,
|
|
"learning_rate": 6.610169491525423e-07,
|
|
"loss": 0.7258,
|
|
"mean_token_accuracy": 0.7947437763214111,
|
|
"num_tokens": 62559441.0,
|
|
"step": 196
|
|
},
|
|
{
|
|
"epoch": 0.20040691759918616,
|
|
"grad_norm": 0.8570722341537476,
|
|
"learning_rate": 6.644067796610169e-07,
|
|
"loss": 0.7453,
|
|
"mean_token_accuracy": 0.7911810278892517,
|
|
"num_tokens": 62863338.0,
|
|
"step": 197
|
|
},
|
|
{
|
|
"epoch": 0.20142421159715157,
|
|
"grad_norm": 0.7847694158554077,
|
|
"learning_rate": 6.677966101694915e-07,
|
|
"loss": 0.6925,
|
|
"mean_token_accuracy": 0.802798330783844,
|
|
"num_tokens": 63189778.0,
|
|
"step": 198
|
|
},
|
|
{
|
|
"epoch": 0.20244150559511698,
|
|
"grad_norm": 0.8860125541687012,
|
|
"learning_rate": 6.71186440677966e-07,
|
|
"loss": 0.7432,
|
|
"mean_token_accuracy": 0.7898187637329102,
|
|
"num_tokens": 63513614.0,
|
|
"step": 199
|
|
},
|
|
{
|
|
"epoch": 0.2034587995930824,
|
|
"grad_norm": 0.8800017833709717,
|
|
"learning_rate": 6.745762711864407e-07,
|
|
"loss": 0.7245,
|
|
"mean_token_accuracy": 0.7962162494659424,
|
|
"num_tokens": 63810862.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.20447609359104782,
|
|
"grad_norm": 0.7929122447967529,
|
|
"learning_rate": 6.779661016949152e-07,
|
|
"loss": 0.7337,
|
|
"mean_token_accuracy": 0.7929904460906982,
|
|
"num_tokens": 64131809.0,
|
|
"step": 201
|
|
},
|
|
{
|
|
"epoch": 0.20549338758901323,
|
|
"grad_norm": 0.8461807370185852,
|
|
"learning_rate": 6.813559322033898e-07,
|
|
"loss": 0.726,
|
|
"mean_token_accuracy": 0.7958388328552246,
|
|
"num_tokens": 64453734.0,
|
|
"step": 202
|
|
},
|
|
{
|
|
"epoch": 0.20651068158697863,
|
|
"grad_norm": 0.7745478749275208,
|
|
"learning_rate": 6.847457627118643e-07,
|
|
"loss": 0.7154,
|
|
"mean_token_accuracy": 0.7977970838546753,
|
|
"num_tokens": 64783877.0,
|
|
"step": 203
|
|
},
|
|
{
|
|
"epoch": 0.20752797558494404,
|
|
"grad_norm": 0.8068340420722961,
|
|
"learning_rate": 6.88135593220339e-07,
|
|
"loss": 0.7194,
|
|
"mean_token_accuracy": 0.7968555688858032,
|
|
"num_tokens": 65105547.0,
|
|
"step": 204
|
|
},
|
|
{
|
|
"epoch": 0.20854526958290945,
|
|
"grad_norm": 0.8017715811729431,
|
|
"learning_rate": 6.915254237288135e-07,
|
|
"loss": 0.7199,
|
|
"mean_token_accuracy": 0.7974048256874084,
|
|
"num_tokens": 65425596.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.20956256358087488,
|
|
"grad_norm": 0.8070000410079956,
|
|
"learning_rate": 6.949152542372881e-07,
|
|
"loss": 0.7379,
|
|
"mean_token_accuracy": 0.7917187213897705,
|
|
"num_tokens": 65753543.0,
|
|
"step": 206
|
|
},
|
|
{
|
|
"epoch": 0.2105798575788403,
|
|
"grad_norm": 0.782502293586731,
|
|
"learning_rate": 6.983050847457627e-07,
|
|
"loss": 0.6966,
|
|
"mean_token_accuracy": 0.8027874231338501,
|
|
"num_tokens": 66079261.0,
|
|
"step": 207
|
|
},
|
|
{
|
|
"epoch": 0.2115971515768057,
|
|
"grad_norm": 0.8917709589004517,
|
|
"learning_rate": 7.016949152542373e-07,
|
|
"loss": 0.7335,
|
|
"mean_token_accuracy": 0.7919062376022339,
|
|
"num_tokens": 66410765.0,
|
|
"step": 208
|
|
},
|
|
{
|
|
"epoch": 0.2126144455747711,
|
|
"grad_norm": 0.7903974056243896,
|
|
"learning_rate": 7.050847457627118e-07,
|
|
"loss": 0.7222,
|
|
"mean_token_accuracy": 0.7964398860931396,
|
|
"num_tokens": 66737804.0,
|
|
"step": 209
|
|
},
|
|
{
|
|
"epoch": 0.2136317395727365,
|
|
"grad_norm": 0.7872498631477356,
|
|
"learning_rate": 7.084745762711865e-07,
|
|
"loss": 0.7237,
|
|
"mean_token_accuracy": 0.7953373193740845,
|
|
"num_tokens": 67056288.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.21464903357070192,
|
|
"grad_norm": 0.8102449774742126,
|
|
"learning_rate": 7.11864406779661e-07,
|
|
"loss": 0.7061,
|
|
"mean_token_accuracy": 0.7994955778121948,
|
|
"num_tokens": 67369756.0,
|
|
"step": 211
|
|
},
|
|
{
|
|
"epoch": 0.21566632756866735,
|
|
"grad_norm": 0.8323161005973816,
|
|
"learning_rate": 7.152542372881356e-07,
|
|
"loss": 0.7026,
|
|
"mean_token_accuracy": 0.7993268966674805,
|
|
"num_tokens": 67688042.0,
|
|
"step": 212
|
|
},
|
|
{
|
|
"epoch": 0.21668362156663276,
|
|
"grad_norm": 0.7897840738296509,
|
|
"learning_rate": 7.186440677966101e-07,
|
|
"loss": 0.7025,
|
|
"mean_token_accuracy": 0.8000015020370483,
|
|
"num_tokens": 68008533.0,
|
|
"step": 213
|
|
},
|
|
{
|
|
"epoch": 0.21770091556459817,
|
|
"grad_norm": 0.8085548877716064,
|
|
"learning_rate": 7.220338983050847e-07,
|
|
"loss": 0.6911,
|
|
"mean_token_accuracy": 0.8024193644523621,
|
|
"num_tokens": 68332401.0,
|
|
"step": 214
|
|
},
|
|
{
|
|
"epoch": 0.21871820956256358,
|
|
"grad_norm": 0.8432585000991821,
|
|
"learning_rate": 7.254237288135593e-07,
|
|
"loss": 0.7061,
|
|
"mean_token_accuracy": 0.7990224361419678,
|
|
"num_tokens": 68643444.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.21973550356052898,
|
|
"grad_norm": 0.8513753414154053,
|
|
"learning_rate": 7.288135593220338e-07,
|
|
"loss": 0.7054,
|
|
"mean_token_accuracy": 0.7992247343063354,
|
|
"num_tokens": 68976455.0,
|
|
"step": 216
|
|
},
|
|
{
|
|
"epoch": 0.22075279755849442,
|
|
"grad_norm": 0.7816675305366516,
|
|
"learning_rate": 7.322033898305085e-07,
|
|
"loss": 0.7403,
|
|
"mean_token_accuracy": 0.790706992149353,
|
|
"num_tokens": 69307832.0,
|
|
"step": 217
|
|
},
|
|
{
|
|
"epoch": 0.22177009155645983,
|
|
"grad_norm": 0.786855161190033,
|
|
"learning_rate": 7.35593220338983e-07,
|
|
"loss": 0.6763,
|
|
"mean_token_accuracy": 0.8071946501731873,
|
|
"num_tokens": 69619859.0,
|
|
"step": 218
|
|
},
|
|
{
|
|
"epoch": 0.22278738555442523,
|
|
"grad_norm": 0.8236937522888184,
|
|
"learning_rate": 7.389830508474576e-07,
|
|
"loss": 0.6995,
|
|
"mean_token_accuracy": 0.8011282682418823,
|
|
"num_tokens": 69924843.0,
|
|
"step": 219
|
|
},
|
|
{
|
|
"epoch": 0.22380467955239064,
|
|
"grad_norm": 0.8247690200805664,
|
|
"learning_rate": 7.423728813559321e-07,
|
|
"loss": 0.7059,
|
|
"mean_token_accuracy": 0.7996514439582825,
|
|
"num_tokens": 70244433.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.22482197355035605,
|
|
"grad_norm": 0.7740869522094727,
|
|
"learning_rate": 7.457627118644067e-07,
|
|
"loss": 0.7155,
|
|
"mean_token_accuracy": 0.7963728904724121,
|
|
"num_tokens": 70568120.0,
|
|
"step": 221
|
|
},
|
|
{
|
|
"epoch": 0.22583926754832145,
|
|
"grad_norm": 0.8260399699211121,
|
|
"learning_rate": 7.491525423728812e-07,
|
|
"loss": 0.7275,
|
|
"mean_token_accuracy": 0.7930858135223389,
|
|
"num_tokens": 70881374.0,
|
|
"step": 222
|
|
},
|
|
{
|
|
"epoch": 0.2268565615462869,
|
|
"grad_norm": 0.7941381335258484,
|
|
"learning_rate": 7.52542372881356e-07,
|
|
"loss": 0.7088,
|
|
"mean_token_accuracy": 0.7990828156471252,
|
|
"num_tokens": 71192310.0,
|
|
"step": 223
|
|
},
|
|
{
|
|
"epoch": 0.2278738555442523,
|
|
"grad_norm": 0.8065604567527771,
|
|
"learning_rate": 7.559322033898305e-07,
|
|
"loss": 0.7243,
|
|
"mean_token_accuracy": 0.7951697111129761,
|
|
"num_tokens": 71509580.0,
|
|
"step": 224
|
|
},
|
|
{
|
|
"epoch": 0.2288911495422177,
|
|
"grad_norm": 0.808077335357666,
|
|
"learning_rate": 7.593220338983051e-07,
|
|
"loss": 0.6918,
|
|
"mean_token_accuracy": 0.8035826683044434,
|
|
"num_tokens": 71825201.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.2299084435401831,
|
|
"grad_norm": 0.7790554761886597,
|
|
"learning_rate": 7.627118644067796e-07,
|
|
"loss": 0.6938,
|
|
"mean_token_accuracy": 0.8024296760559082,
|
|
"num_tokens": 72158603.0,
|
|
"step": 226
|
|
},
|
|
{
|
|
"epoch": 0.23092573753814852,
|
|
"grad_norm": 0.7746298313140869,
|
|
"learning_rate": 7.661016949152542e-07,
|
|
"loss": 0.6972,
|
|
"mean_token_accuracy": 0.8012113571166992,
|
|
"num_tokens": 72476307.0,
|
|
"step": 227
|
|
},
|
|
{
|
|
"epoch": 0.23194303153611392,
|
|
"grad_norm": 0.7807798385620117,
|
|
"learning_rate": 7.694915254237287e-07,
|
|
"loss": 0.6955,
|
|
"mean_token_accuracy": 0.8009060025215149,
|
|
"num_tokens": 72805014.0,
|
|
"step": 228
|
|
},
|
|
{
|
|
"epoch": 0.23296032553407936,
|
|
"grad_norm": 0.7983285784721375,
|
|
"learning_rate": 7.728813559322033e-07,
|
|
"loss": 0.6659,
|
|
"mean_token_accuracy": 0.8106253743171692,
|
|
"num_tokens": 73123811.0,
|
|
"step": 229
|
|
},
|
|
{
|
|
"epoch": 0.23397761953204477,
|
|
"grad_norm": 0.8004521131515503,
|
|
"learning_rate": 7.762711864406779e-07,
|
|
"loss": 0.6967,
|
|
"mean_token_accuracy": 0.8012536764144897,
|
|
"num_tokens": 73432474.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.23499491353001017,
|
|
"grad_norm": 0.7807230353355408,
|
|
"learning_rate": 7.796610169491526e-07,
|
|
"loss": 0.7013,
|
|
"mean_token_accuracy": 0.7990843057632446,
|
|
"num_tokens": 73759617.0,
|
|
"step": 231
|
|
},
|
|
{
|
|
"epoch": 0.23601220752797558,
|
|
"grad_norm": 0.7825319170951843,
|
|
"learning_rate": 7.830508474576271e-07,
|
|
"loss": 0.712,
|
|
"mean_token_accuracy": 0.7976258993148804,
|
|
"num_tokens": 74082923.0,
|
|
"step": 232
|
|
},
|
|
{
|
|
"epoch": 0.237029501525941,
|
|
"grad_norm": 0.7608177661895752,
|
|
"learning_rate": 7.864406779661017e-07,
|
|
"loss": 0.6872,
|
|
"mean_token_accuracy": 0.8039759397506714,
|
|
"num_tokens": 74410401.0,
|
|
"step": 233
|
|
},
|
|
{
|
|
"epoch": 0.2380467955239064,
|
|
"grad_norm": 0.7753943204879761,
|
|
"learning_rate": 7.898305084745762e-07,
|
|
"loss": 0.6987,
|
|
"mean_token_accuracy": 0.8019410371780396,
|
|
"num_tokens": 74727709.0,
|
|
"step": 234
|
|
},
|
|
{
|
|
"epoch": 0.23906408952187183,
|
|
"grad_norm": 0.7589889764785767,
|
|
"learning_rate": 7.932203389830508e-07,
|
|
"loss": 0.6697,
|
|
"mean_token_accuracy": 0.8082659244537354,
|
|
"num_tokens": 75056158.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.24008138351983724,
|
|
"grad_norm": 0.7917013168334961,
|
|
"learning_rate": 7.966101694915253e-07,
|
|
"loss": 0.6882,
|
|
"mean_token_accuracy": 0.803979218006134,
|
|
"num_tokens": 75380129.0,
|
|
"step": 236
|
|
},
|
|
{
|
|
"epoch": 0.24109867751780265,
|
|
"grad_norm": 0.8265038728713989,
|
|
"learning_rate": 8e-07,
|
|
"loss": 0.6652,
|
|
"mean_token_accuracy": 0.8091133832931519,
|
|
"num_tokens": 75692158.0,
|
|
"step": 237
|
|
},
|
|
{
|
|
"epoch": 0.24211597151576805,
|
|
"grad_norm": 0.8178495168685913,
|
|
"learning_rate": 8.033898305084746e-07,
|
|
"loss": 0.6695,
|
|
"mean_token_accuracy": 0.8072071075439453,
|
|
"num_tokens": 76015367.0,
|
|
"step": 238
|
|
},
|
|
{
|
|
"epoch": 0.24313326551373346,
|
|
"grad_norm": 0.8022916913032532,
|
|
"learning_rate": 8.067796610169492e-07,
|
|
"loss": 0.6941,
|
|
"mean_token_accuracy": 0.801773190498352,
|
|
"num_tokens": 76309618.0,
|
|
"step": 239
|
|
},
|
|
{
|
|
"epoch": 0.2441505595116989,
|
|
"grad_norm": 0.7696356177330017,
|
|
"learning_rate": 8.101694915254237e-07,
|
|
"loss": 0.6796,
|
|
"mean_token_accuracy": 0.8047494292259216,
|
|
"num_tokens": 76638347.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.2451678535096643,
|
|
"grad_norm": 0.7858201861381531,
|
|
"learning_rate": 8.135593220338983e-07,
|
|
"loss": 0.6956,
|
|
"mean_token_accuracy": 0.8011404275894165,
|
|
"num_tokens": 76960377.0,
|
|
"step": 241
|
|
},
|
|
{
|
|
"epoch": 0.2461851475076297,
|
|
"grad_norm": 0.8141928315162659,
|
|
"learning_rate": 8.169491525423728e-07,
|
|
"loss": 0.7296,
|
|
"mean_token_accuracy": 0.792807936668396,
|
|
"num_tokens": 77284277.0,
|
|
"step": 242
|
|
},
|
|
{
|
|
"epoch": 0.24720244150559512,
|
|
"grad_norm": 0.7566704750061035,
|
|
"learning_rate": 8.203389830508474e-07,
|
|
"loss": 0.6863,
|
|
"mean_token_accuracy": 0.8035607933998108,
|
|
"num_tokens": 77624014.0,
|
|
"step": 243
|
|
},
|
|
{
|
|
"epoch": 0.24821973550356052,
|
|
"grad_norm": 0.8007069230079651,
|
|
"learning_rate": 8.23728813559322e-07,
|
|
"loss": 0.7073,
|
|
"mean_token_accuracy": 0.7980139255523682,
|
|
"num_tokens": 77955105.0,
|
|
"step": 244
|
|
},
|
|
{
|
|
"epoch": 0.24923702950152593,
|
|
"grad_norm": 0.7965030670166016,
|
|
"learning_rate": 8.271186440677965e-07,
|
|
"loss": 0.6755,
|
|
"mean_token_accuracy": 0.8065483570098877,
|
|
"num_tokens": 78259934.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.25025432349949134,
|
|
"grad_norm": 0.7759296894073486,
|
|
"learning_rate": 8.305084745762712e-07,
|
|
"loss": 0.6885,
|
|
"mean_token_accuracy": 0.8040277361869812,
|
|
"num_tokens": 78575928.0,
|
|
"step": 246
|
|
},
|
|
{
|
|
"epoch": 0.25127161749745675,
|
|
"grad_norm": 0.8442168235778809,
|
|
"learning_rate": 8.338983050847457e-07,
|
|
"loss": 0.6945,
|
|
"mean_token_accuracy": 0.8002915382385254,
|
|
"num_tokens": 78897539.0,
|
|
"step": 247
|
|
},
|
|
{
|
|
"epoch": 0.25228891149542215,
|
|
"grad_norm": 0.7938070297241211,
|
|
"learning_rate": 8.372881355932203e-07,
|
|
"loss": 0.6755,
|
|
"mean_token_accuracy": 0.806767463684082,
|
|
"num_tokens": 79206412.0,
|
|
"step": 248
|
|
},
|
|
{
|
|
"epoch": 0.2533062054933876,
|
|
"grad_norm": 0.7777937650680542,
|
|
"learning_rate": 8.406779661016948e-07,
|
|
"loss": 0.6851,
|
|
"mean_token_accuracy": 0.8038220405578613,
|
|
"num_tokens": 79550225.0,
|
|
"step": 249
|
|
},
|
|
{
|
|
"epoch": 0.254323499491353,
|
|
"grad_norm": 0.7907181978225708,
|
|
"learning_rate": 8.440677966101695e-07,
|
|
"loss": 0.6769,
|
|
"mean_token_accuracy": 0.8062283992767334,
|
|
"num_tokens": 79861100.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.25534079348931843,
|
|
"grad_norm": 0.7875558137893677,
|
|
"learning_rate": 8.47457627118644e-07,
|
|
"loss": 0.6873,
|
|
"mean_token_accuracy": 0.8025171756744385,
|
|
"num_tokens": 80184953.0,
|
|
"step": 251
|
|
},
|
|
{
|
|
"epoch": 0.25635808748728384,
|
|
"grad_norm": 0.8327239751815796,
|
|
"learning_rate": 8.508474576271186e-07,
|
|
"loss": 0.6759,
|
|
"mean_token_accuracy": 0.8057800531387329,
|
|
"num_tokens": 80497644.0,
|
|
"step": 252
|
|
},
|
|
{
|
|
"epoch": 0.25737538148524924,
|
|
"grad_norm": 0.8272035121917725,
|
|
"learning_rate": 8.542372881355931e-07,
|
|
"loss": 0.7038,
|
|
"mean_token_accuracy": 0.7997850775718689,
|
|
"num_tokens": 80815501.0,
|
|
"step": 253
|
|
},
|
|
{
|
|
"epoch": 0.25839267548321465,
|
|
"grad_norm": 0.8474062085151672,
|
|
"learning_rate": 8.576271186440678e-07,
|
|
"loss": 0.705,
|
|
"mean_token_accuracy": 0.797932505607605,
|
|
"num_tokens": 81137163.0,
|
|
"step": 254
|
|
},
|
|
{
|
|
"epoch": 0.25940996948118006,
|
|
"grad_norm": 0.8098815679550171,
|
|
"learning_rate": 8.610169491525423e-07,
|
|
"loss": 0.6947,
|
|
"mean_token_accuracy": 0.8005977272987366,
|
|
"num_tokens": 81448192.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.26042726347914547,
|
|
"grad_norm": 0.7739518284797668,
|
|
"learning_rate": 8.64406779661017e-07,
|
|
"loss": 0.697,
|
|
"mean_token_accuracy": 0.7992587685585022,
|
|
"num_tokens": 81757788.0,
|
|
"step": 256
|
|
},
|
|
{
|
|
"epoch": 0.2614445574771109,
|
|
"grad_norm": 0.7883069515228271,
|
|
"learning_rate": 8.677966101694915e-07,
|
|
"loss": 0.6769,
|
|
"mean_token_accuracy": 0.8063681125640869,
|
|
"num_tokens": 82085431.0,
|
|
"step": 257
|
|
},
|
|
{
|
|
"epoch": 0.2624618514750763,
|
|
"grad_norm": 0.7693223357200623,
|
|
"learning_rate": 8.711864406779661e-07,
|
|
"loss": 0.6924,
|
|
"mean_token_accuracy": 0.8022792935371399,
|
|
"num_tokens": 82404647.0,
|
|
"step": 258
|
|
},
|
|
{
|
|
"epoch": 0.2634791454730417,
|
|
"grad_norm": 0.7538211941719055,
|
|
"learning_rate": 8.745762711864406e-07,
|
|
"loss": 0.6846,
|
|
"mean_token_accuracy": 0.8039592504501343,
|
|
"num_tokens": 82738083.0,
|
|
"step": 259
|
|
},
|
|
{
|
|
"epoch": 0.2644964394710071,
|
|
"grad_norm": 0.8052369356155396,
|
|
"learning_rate": 8.779661016949152e-07,
|
|
"loss": 0.6828,
|
|
"mean_token_accuracy": 0.8034741878509521,
|
|
"num_tokens": 83033584.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.26551373346897256,
|
|
"grad_norm": 0.7891734838485718,
|
|
"learning_rate": 8.813559322033897e-07,
|
|
"loss": 0.6932,
|
|
"mean_token_accuracy": 0.8006192445755005,
|
|
"num_tokens": 83363928.0,
|
|
"step": 261
|
|
},
|
|
{
|
|
"epoch": 0.26653102746693796,
|
|
"grad_norm": 0.8296700119972229,
|
|
"learning_rate": 8.847457627118644e-07,
|
|
"loss": 0.6739,
|
|
"mean_token_accuracy": 0.8050704598426819,
|
|
"num_tokens": 83680255.0,
|
|
"step": 262
|
|
},
|
|
{
|
|
"epoch": 0.26754832146490337,
|
|
"grad_norm": 0.7551729679107666,
|
|
"learning_rate": 8.88135593220339e-07,
|
|
"loss": 0.6909,
|
|
"mean_token_accuracy": 0.8014122843742371,
|
|
"num_tokens": 84014162.0,
|
|
"step": 263
|
|
},
|
|
{
|
|
"epoch": 0.2685656154628688,
|
|
"grad_norm": 0.752968966960907,
|
|
"learning_rate": 8.915254237288136e-07,
|
|
"loss": 0.6621,
|
|
"mean_token_accuracy": 0.8096007704734802,
|
|
"num_tokens": 84329460.0,
|
|
"step": 264
|
|
},
|
|
{
|
|
"epoch": 0.2695829094608342,
|
|
"grad_norm": 0.7830281257629395,
|
|
"learning_rate": 8.949152542372881e-07,
|
|
"loss": 0.6477,
|
|
"mean_token_accuracy": 0.813726544380188,
|
|
"num_tokens": 84643661.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.2706002034587996,
|
|
"grad_norm": 0.785149097442627,
|
|
"learning_rate": 8.983050847457627e-07,
|
|
"loss": 0.6637,
|
|
"mean_token_accuracy": 0.8084698915481567,
|
|
"num_tokens": 84968874.0,
|
|
"step": 266
|
|
},
|
|
{
|
|
"epoch": 0.271617497456765,
|
|
"grad_norm": 0.7704117298126221,
|
|
"learning_rate": 9.016949152542372e-07,
|
|
"loss": 0.6716,
|
|
"mean_token_accuracy": 0.8069436550140381,
|
|
"num_tokens": 85292100.0,
|
|
"step": 267
|
|
},
|
|
{
|
|
"epoch": 0.2726347914547304,
|
|
"grad_norm": 0.7994391918182373,
|
|
"learning_rate": 9.050847457627118e-07,
|
|
"loss": 0.667,
|
|
"mean_token_accuracy": 0.8093474507331848,
|
|
"num_tokens": 85605945.0,
|
|
"step": 268
|
|
},
|
|
{
|
|
"epoch": 0.2736520854526958,
|
|
"grad_norm": 0.7655627727508545,
|
|
"learning_rate": 9.084745762711864e-07,
|
|
"loss": 0.6726,
|
|
"mean_token_accuracy": 0.8057523965835571,
|
|
"num_tokens": 85924331.0,
|
|
"step": 269
|
|
},
|
|
{
|
|
"epoch": 0.2746693794506612,
|
|
"grad_norm": 0.8249563574790955,
|
|
"learning_rate": 9.11864406779661e-07,
|
|
"loss": 0.7019,
|
|
"mean_token_accuracy": 0.7991850972175598,
|
|
"num_tokens": 86231923.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.27568667344862663,
|
|
"grad_norm": 0.7793235182762146,
|
|
"learning_rate": 9.152542372881356e-07,
|
|
"loss": 0.6568,
|
|
"mean_token_accuracy": 0.8094903826713562,
|
|
"num_tokens": 86546912.0,
|
|
"step": 271
|
|
},
|
|
{
|
|
"epoch": 0.2767039674465921,
|
|
"grad_norm": 0.7628976702690125,
|
|
"learning_rate": 9.186440677966101e-07,
|
|
"loss": 0.716,
|
|
"mean_token_accuracy": 0.7952671051025391,
|
|
"num_tokens": 86872441.0,
|
|
"step": 272
|
|
},
|
|
{
|
|
"epoch": 0.2777212614445575,
|
|
"grad_norm": 0.7923837900161743,
|
|
"learning_rate": 9.220338983050847e-07,
|
|
"loss": 0.6864,
|
|
"mean_token_accuracy": 0.8018236756324768,
|
|
"num_tokens": 87204457.0,
|
|
"step": 273
|
|
},
|
|
{
|
|
"epoch": 0.2787385554425229,
|
|
"grad_norm": 0.7962261438369751,
|
|
"learning_rate": 9.254237288135592e-07,
|
|
"loss": 0.6931,
|
|
"mean_token_accuracy": 0.8012575507164001,
|
|
"num_tokens": 87501470.0,
|
|
"step": 274
|
|
},
|
|
{
|
|
"epoch": 0.2797558494404883,
|
|
"grad_norm": 0.8026267886161804,
|
|
"learning_rate": 9.288135593220338e-07,
|
|
"loss": 0.675,
|
|
"mean_token_accuracy": 0.8057051301002502,
|
|
"num_tokens": 87816423.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 0.2807731434384537,
|
|
"grad_norm": 0.8577560186386108,
|
|
"learning_rate": 9.322033898305083e-07,
|
|
"loss": 0.6871,
|
|
"mean_token_accuracy": 0.8022831678390503,
|
|
"num_tokens": 88105701.0,
|
|
"step": 276
|
|
},
|
|
{
|
|
"epoch": 0.28179043743641913,
|
|
"grad_norm": 0.8111786842346191,
|
|
"learning_rate": 9.355932203389831e-07,
|
|
"loss": 0.681,
|
|
"mean_token_accuracy": 0.8033446073532104,
|
|
"num_tokens": 88429993.0,
|
|
"step": 277
|
|
},
|
|
{
|
|
"epoch": 0.28280773143438453,
|
|
"grad_norm": 0.7656622529029846,
|
|
"learning_rate": 9.389830508474576e-07,
|
|
"loss": 0.685,
|
|
"mean_token_accuracy": 0.8025110960006714,
|
|
"num_tokens": 88754109.0,
|
|
"step": 278
|
|
},
|
|
{
|
|
"epoch": 0.28382502543234994,
|
|
"grad_norm": 0.8410552144050598,
|
|
"learning_rate": 9.423728813559322e-07,
|
|
"loss": 0.7007,
|
|
"mean_token_accuracy": 0.7993711829185486,
|
|
"num_tokens": 89075556.0,
|
|
"step": 279
|
|
},
|
|
{
|
|
"epoch": 0.28484231943031535,
|
|
"grad_norm": 0.8009824752807617,
|
|
"learning_rate": 9.457627118644067e-07,
|
|
"loss": 0.6852,
|
|
"mean_token_accuracy": 0.8035849332809448,
|
|
"num_tokens": 89394941.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.28585961342828076,
|
|
"grad_norm": 0.8217434287071228,
|
|
"learning_rate": 9.491525423728813e-07,
|
|
"loss": 0.6889,
|
|
"mean_token_accuracy": 0.80260169506073,
|
|
"num_tokens": 89726320.0,
|
|
"step": 281
|
|
},
|
|
{
|
|
"epoch": 0.28687690742624616,
|
|
"grad_norm": 0.7959878444671631,
|
|
"learning_rate": 9.525423728813558e-07,
|
|
"loss": 0.6847,
|
|
"mean_token_accuracy": 0.8041878938674927,
|
|
"num_tokens": 90046963.0,
|
|
"step": 282
|
|
},
|
|
{
|
|
"epoch": 0.28789420142421157,
|
|
"grad_norm": 0.8031529188156128,
|
|
"learning_rate": 9.559322033898305e-07,
|
|
"loss": 0.6705,
|
|
"mean_token_accuracy": 0.8066917657852173,
|
|
"num_tokens": 90360145.0,
|
|
"step": 283
|
|
},
|
|
{
|
|
"epoch": 0.28891149542217703,
|
|
"grad_norm": 0.8071709275245667,
|
|
"learning_rate": 9.59322033898305e-07,
|
|
"loss": 0.6762,
|
|
"mean_token_accuracy": 0.8048709034919739,
|
|
"num_tokens": 90666248.0,
|
|
"step": 284
|
|
},
|
|
{
|
|
"epoch": 0.28992878942014244,
|
|
"grad_norm": 0.7606756091117859,
|
|
"learning_rate": 9.627118644067797e-07,
|
|
"loss": 0.6748,
|
|
"mean_token_accuracy": 0.8058351278305054,
|
|
"num_tokens": 90990168.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 0.29094608341810785,
|
|
"grad_norm": 0.8229522705078125,
|
|
"learning_rate": 9.661016949152542e-07,
|
|
"loss": 0.668,
|
|
"mean_token_accuracy": 0.8086898326873779,
|
|
"num_tokens": 91287120.0,
|
|
"step": 286
|
|
},
|
|
{
|
|
"epoch": 0.29196337741607326,
|
|
"grad_norm": 0.7963361144065857,
|
|
"learning_rate": 9.694915254237287e-07,
|
|
"loss": 0.6602,
|
|
"mean_token_accuracy": 0.809562623500824,
|
|
"num_tokens": 91605888.0,
|
|
"step": 287
|
|
},
|
|
{
|
|
"epoch": 0.29298067141403866,
|
|
"grad_norm": 0.8544802665710449,
|
|
"learning_rate": 9.728813559322032e-07,
|
|
"loss": 0.7004,
|
|
"mean_token_accuracy": 0.79966139793396,
|
|
"num_tokens": 91923111.0,
|
|
"step": 288
|
|
},
|
|
{
|
|
"epoch": 0.29399796541200407,
|
|
"grad_norm": 0.7967947721481323,
|
|
"learning_rate": 9.76271186440678e-07,
|
|
"loss": 0.6634,
|
|
"mean_token_accuracy": 0.8083997964859009,
|
|
"num_tokens": 92233100.0,
|
|
"step": 289
|
|
},
|
|
{
|
|
"epoch": 0.2950152594099695,
|
|
"grad_norm": 0.8263453245162964,
|
|
"learning_rate": 9.796610169491525e-07,
|
|
"loss": 0.6741,
|
|
"mean_token_accuracy": 0.8063830733299255,
|
|
"num_tokens": 92538563.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.2960325534079349,
|
|
"grad_norm": 0.7387613654136658,
|
|
"learning_rate": 9.830508474576272e-07,
|
|
"loss": 0.6343,
|
|
"mean_token_accuracy": 0.8165593147277832,
|
|
"num_tokens": 92861897.0,
|
|
"step": 291
|
|
},
|
|
{
|
|
"epoch": 0.2970498474059003,
|
|
"grad_norm": 0.8126459717750549,
|
|
"learning_rate": 9.864406779661017e-07,
|
|
"loss": 0.6224,
|
|
"mean_token_accuracy": 0.8183848261833191,
|
|
"num_tokens": 93169971.0,
|
|
"step": 292
|
|
},
|
|
{
|
|
"epoch": 0.2980671414038657,
|
|
"grad_norm": 0.820056676864624,
|
|
"learning_rate": 9.898305084745762e-07,
|
|
"loss": 0.678,
|
|
"mean_token_accuracy": 0.8047983050346375,
|
|
"num_tokens": 93476129.0,
|
|
"step": 293
|
|
},
|
|
{
|
|
"epoch": 0.2990844354018311,
|
|
"grad_norm": 0.7961844801902771,
|
|
"learning_rate": 9.932203389830507e-07,
|
|
"loss": 0.6694,
|
|
"mean_token_accuracy": 0.8077576756477356,
|
|
"num_tokens": 93795891.0,
|
|
"step": 294
|
|
},
|
|
{
|
|
"epoch": 0.30010172939979657,
|
|
"grad_norm": 0.8246234059333801,
|
|
"learning_rate": 9.966101694915254e-07,
|
|
"loss": 0.6731,
|
|
"mean_token_accuracy": 0.8063099384307861,
|
|
"num_tokens": 94099353.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 0.301119023397762,
|
|
"grad_norm": 0.8062840104103088,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.7053,
|
|
"mean_token_accuracy": 0.7989630699157715,
|
|
"num_tokens": 94420660.0,
|
|
"step": 296
|
|
},
|
|
{
|
|
"epoch": 0.3021363173957274,
|
|
"grad_norm": 0.7715139389038086,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6706,
|
|
"mean_token_accuracy": 0.8074213266372681,
|
|
"num_tokens": 94745810.0,
|
|
"step": 297
|
|
},
|
|
{
|
|
"epoch": 0.3031536113936928,
|
|
"grad_norm": 0.7462009191513062,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6759,
|
|
"mean_token_accuracy": 0.8049613237380981,
|
|
"num_tokens": 95053065.0,
|
|
"step": 298
|
|
},
|
|
{
|
|
"epoch": 0.3041709053916582,
|
|
"grad_norm": 0.8057239055633545,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.671,
|
|
"mean_token_accuracy": 0.8050029873847961,
|
|
"num_tokens": 95365564.0,
|
|
"step": 299
|
|
},
|
|
{
|
|
"epoch": 0.3051881993896236,
|
|
"grad_norm": 0.7807227969169617,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6597,
|
|
"mean_token_accuracy": 0.8084917068481445,
|
|
"num_tokens": 95677617.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.306205493387589,
|
|
"grad_norm": 0.7490959167480469,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6935,
|
|
"mean_token_accuracy": 0.799102783203125,
|
|
"num_tokens": 96017692.0,
|
|
"step": 301
|
|
},
|
|
{
|
|
"epoch": 0.3072227873855544,
|
|
"grad_norm": 0.7807316184043884,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6705,
|
|
"mean_token_accuracy": 0.8059637546539307,
|
|
"num_tokens": 96337729.0,
|
|
"step": 302
|
|
},
|
|
{
|
|
"epoch": 0.3082400813835198,
|
|
"grad_norm": 0.8054029941558838,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6739,
|
|
"mean_token_accuracy": 0.804855465888977,
|
|
"num_tokens": 96644965.0,
|
|
"step": 303
|
|
},
|
|
{
|
|
"epoch": 0.30925737538148523,
|
|
"grad_norm": 0.7975510954856873,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6517,
|
|
"mean_token_accuracy": 0.8112246990203857,
|
|
"num_tokens": 96966380.0,
|
|
"step": 304
|
|
},
|
|
{
|
|
"epoch": 0.31027466937945064,
|
|
"grad_norm": 0.7594712376594543,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6646,
|
|
"mean_token_accuracy": 0.8069475889205933,
|
|
"num_tokens": 97279451.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 0.31129196337741605,
|
|
"grad_norm": 0.7637832760810852,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.678,
|
|
"mean_token_accuracy": 0.8046350479125977,
|
|
"num_tokens": 97595648.0,
|
|
"step": 306
|
|
},
|
|
{
|
|
"epoch": 0.3123092573753815,
|
|
"grad_norm": 0.8266690373420715,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6783,
|
|
"mean_token_accuracy": 0.8042589426040649,
|
|
"num_tokens": 97916479.0,
|
|
"step": 307
|
|
},
|
|
{
|
|
"epoch": 0.3133265513733469,
|
|
"grad_norm": 0.7826452255249023,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6799,
|
|
"mean_token_accuracy": 0.8030845522880554,
|
|
"num_tokens": 98234766.0,
|
|
"step": 308
|
|
},
|
|
{
|
|
"epoch": 0.3143438453713123,
|
|
"grad_norm": 1.1081233024597168,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6539,
|
|
"mean_token_accuracy": 0.8105137348175049,
|
|
"num_tokens": 98567715.0,
|
|
"step": 309
|
|
},
|
|
{
|
|
"epoch": 0.31536113936927773,
|
|
"grad_norm": 0.7553532123565674,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6828,
|
|
"mean_token_accuracy": 0.8033838272094727,
|
|
"num_tokens": 98906165.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.31637843336724314,
|
|
"grad_norm": 0.7629131078720093,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6519,
|
|
"mean_token_accuracy": 0.8110224008560181,
|
|
"num_tokens": 99211579.0,
|
|
"step": 311
|
|
},
|
|
{
|
|
"epoch": 0.31739572736520855,
|
|
"grad_norm": 0.7820941805839539,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6836,
|
|
"mean_token_accuracy": 0.8022605180740356,
|
|
"num_tokens": 99530349.0,
|
|
"step": 312
|
|
},
|
|
{
|
|
"epoch": 0.31841302136317395,
|
|
"grad_norm": 0.7474110126495361,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6843,
|
|
"mean_token_accuracy": 0.8024811744689941,
|
|
"num_tokens": 99867681.0,
|
|
"step": 313
|
|
},
|
|
{
|
|
"epoch": 0.31943031536113936,
|
|
"grad_norm": 0.7628995180130005,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6644,
|
|
"mean_token_accuracy": 0.8076979517936707,
|
|
"num_tokens": 100198041.0,
|
|
"step": 314
|
|
},
|
|
{
|
|
"epoch": 0.32044760935910477,
|
|
"grad_norm": 0.822460949420929,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6373,
|
|
"mean_token_accuracy": 0.8149458169937134,
|
|
"num_tokens": 100509537.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 0.3214649033570702,
|
|
"grad_norm": 0.7507079243659973,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.666,
|
|
"mean_token_accuracy": 0.8075241446495056,
|
|
"num_tokens": 100831452.0,
|
|
"step": 316
|
|
},
|
|
{
|
|
"epoch": 0.3224821973550356,
|
|
"grad_norm": 0.8105189800262451,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6727,
|
|
"mean_token_accuracy": 0.8060649633407593,
|
|
"num_tokens": 101131963.0,
|
|
"step": 317
|
|
},
|
|
{
|
|
"epoch": 0.323499491353001,
|
|
"grad_norm": 0.7931933999061584,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6643,
|
|
"mean_token_accuracy": 0.8087434768676758,
|
|
"num_tokens": 101448301.0,
|
|
"step": 318
|
|
},
|
|
{
|
|
"epoch": 0.32451678535096645,
|
|
"grad_norm": 0.8080085515975952,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6669,
|
|
"mean_token_accuracy": 0.8062810301780701,
|
|
"num_tokens": 101750506.0,
|
|
"step": 319
|
|
},
|
|
{
|
|
"epoch": 0.32553407934893186,
|
|
"grad_norm": 0.7895338535308838,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6769,
|
|
"mean_token_accuracy": 0.8053954243659973,
|
|
"num_tokens": 102079659.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.32655137334689727,
|
|
"grad_norm": 0.7637503743171692,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6416,
|
|
"mean_token_accuracy": 0.8128135800361633,
|
|
"num_tokens": 102405734.0,
|
|
"step": 321
|
|
},
|
|
{
|
|
"epoch": 0.3275686673448627,
|
|
"grad_norm": 0.7679542899131775,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6908,
|
|
"mean_token_accuracy": 0.8014698624610901,
|
|
"num_tokens": 102739870.0,
|
|
"step": 322
|
|
},
|
|
{
|
|
"epoch": 0.3285859613428281,
|
|
"grad_norm": 0.7717718482017517,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6621,
|
|
"mean_token_accuracy": 0.8089470267295837,
|
|
"num_tokens": 103055954.0,
|
|
"step": 323
|
|
},
|
|
{
|
|
"epoch": 0.3296032553407935,
|
|
"grad_norm": 0.885274350643158,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.667,
|
|
"mean_token_accuracy": 0.8072282075881958,
|
|
"num_tokens": 103373341.0,
|
|
"step": 324
|
|
},
|
|
{
|
|
"epoch": 0.3306205493387589,
|
|
"grad_norm": 0.7615958452224731,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6488,
|
|
"mean_token_accuracy": 0.8105665445327759,
|
|
"num_tokens": 103709672.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 0.3316378433367243,
|
|
"grad_norm": 0.7842453718185425,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6477,
|
|
"mean_token_accuracy": 0.8108562231063843,
|
|
"num_tokens": 104027661.0,
|
|
"step": 326
|
|
},
|
|
{
|
|
"epoch": 0.3326551373346897,
|
|
"grad_norm": 0.7681851983070374,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6755,
|
|
"mean_token_accuracy": 0.8049620389938354,
|
|
"num_tokens": 104353014.0,
|
|
"step": 327
|
|
},
|
|
{
|
|
"epoch": 0.3336724313326551,
|
|
"grad_norm": 0.7865566611289978,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6815,
|
|
"mean_token_accuracy": 0.8016282320022583,
|
|
"num_tokens": 104669203.0,
|
|
"step": 328
|
|
},
|
|
{
|
|
"epoch": 0.3346897253306205,
|
|
"grad_norm": 0.7729827761650085,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6893,
|
|
"mean_token_accuracy": 0.8000586032867432,
|
|
"num_tokens": 104980367.0,
|
|
"step": 329
|
|
},
|
|
{
|
|
"epoch": 0.335707019328586,
|
|
"grad_norm": 0.8236528038978577,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6737,
|
|
"mean_token_accuracy": 0.8063338994979858,
|
|
"num_tokens": 105298008.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.3367243133265514,
|
|
"grad_norm": 0.8104047775268555,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6687,
|
|
"mean_token_accuracy": 0.8056067228317261,
|
|
"num_tokens": 105619978.0,
|
|
"step": 331
|
|
},
|
|
{
|
|
"epoch": 0.3377416073245168,
|
|
"grad_norm": 0.7776243090629578,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6702,
|
|
"mean_token_accuracy": 0.8067533373832703,
|
|
"num_tokens": 105942503.0,
|
|
"step": 332
|
|
},
|
|
{
|
|
"epoch": 0.3387589013224822,
|
|
"grad_norm": 0.792806088924408,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6394,
|
|
"mean_token_accuracy": 0.8132205009460449,
|
|
"num_tokens": 106246812.0,
|
|
"step": 333
|
|
},
|
|
{
|
|
"epoch": 0.3397761953204476,
|
|
"grad_norm": 0.9993346333503723,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6655,
|
|
"mean_token_accuracy": 0.8066599369049072,
|
|
"num_tokens": 106556412.0,
|
|
"step": 334
|
|
},
|
|
{
|
|
"epoch": 0.340793489318413,
|
|
"grad_norm": 0.7874058485031128,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6694,
|
|
"mean_token_accuracy": 0.8050298690795898,
|
|
"num_tokens": 106869710.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 0.34181078331637843,
|
|
"grad_norm": 0.8449519276618958,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6597,
|
|
"mean_token_accuracy": 0.8095645308494568,
|
|
"num_tokens": 107197492.0,
|
|
"step": 336
|
|
},
|
|
{
|
|
"epoch": 0.34282807731434384,
|
|
"grad_norm": 0.7852010726928711,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6529,
|
|
"mean_token_accuracy": 0.8105023503303528,
|
|
"num_tokens": 107520176.0,
|
|
"step": 337
|
|
},
|
|
{
|
|
"epoch": 0.34384537131230924,
|
|
"grad_norm": 0.7831073999404907,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6523,
|
|
"mean_token_accuracy": 0.8103317022323608,
|
|
"num_tokens": 107829848.0,
|
|
"step": 338
|
|
},
|
|
{
|
|
"epoch": 0.34486266531027465,
|
|
"grad_norm": 0.7952174544334412,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6847,
|
|
"mean_token_accuracy": 0.8014416694641113,
|
|
"num_tokens": 108169162.0,
|
|
"step": 339
|
|
},
|
|
{
|
|
"epoch": 0.34587995930824006,
|
|
"grad_norm": 0.7554988861083984,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6382,
|
|
"mean_token_accuracy": 0.8142600059509277,
|
|
"num_tokens": 108481647.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.34689725330620547,
|
|
"grad_norm": 0.7773537635803223,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.648,
|
|
"mean_token_accuracy": 0.8115161657333374,
|
|
"num_tokens": 108795594.0,
|
|
"step": 341
|
|
},
|
|
{
|
|
"epoch": 0.34791454730417093,
|
|
"grad_norm": 0.7583483457565308,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6605,
|
|
"mean_token_accuracy": 0.8079638481140137,
|
|
"num_tokens": 109100848.0,
|
|
"step": 342
|
|
},
|
|
{
|
|
"epoch": 0.34893184130213634,
|
|
"grad_norm": 0.8855310678482056,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6703,
|
|
"mean_token_accuracy": 0.8064523935317993,
|
|
"num_tokens": 109425497.0,
|
|
"step": 343
|
|
},
|
|
{
|
|
"epoch": 0.34994913530010174,
|
|
"grad_norm": 0.708702027797699,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6432,
|
|
"mean_token_accuracy": 0.8125522136688232,
|
|
"num_tokens": 109752861.0,
|
|
"step": 344
|
|
},
|
|
{
|
|
"epoch": 0.35096642929806715,
|
|
"grad_norm": 0.7805309891700745,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6361,
|
|
"mean_token_accuracy": 0.8148272037506104,
|
|
"num_tokens": 110075837.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 0.35198372329603256,
|
|
"grad_norm": 1.29429030418396,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6647,
|
|
"mean_token_accuracy": 0.8079172968864441,
|
|
"num_tokens": 110388178.0,
|
|
"step": 346
|
|
},
|
|
{
|
|
"epoch": 0.35300101729399797,
|
|
"grad_norm": 0.7764292359352112,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6668,
|
|
"mean_token_accuracy": 0.8062443733215332,
|
|
"num_tokens": 110712545.0,
|
|
"step": 347
|
|
},
|
|
{
|
|
"epoch": 0.3540183112919634,
|
|
"grad_norm": 0.7958059906959534,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6552,
|
|
"mean_token_accuracy": 0.8096246719360352,
|
|
"num_tokens": 111014148.0,
|
|
"step": 348
|
|
},
|
|
{
|
|
"epoch": 0.3550356052899288,
|
|
"grad_norm": 0.8071582317352295,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6623,
|
|
"mean_token_accuracy": 0.8077216148376465,
|
|
"num_tokens": 111318802.0,
|
|
"step": 349
|
|
},
|
|
{
|
|
"epoch": 0.3560528992878942,
|
|
"grad_norm": 0.7708571553230286,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6537,
|
|
"mean_token_accuracy": 0.809117317199707,
|
|
"num_tokens": 111631108.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.3570701932858596,
|
|
"grad_norm": 0.8013575077056885,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6683,
|
|
"mean_token_accuracy": 0.8072644472122192,
|
|
"num_tokens": 111956492.0,
|
|
"step": 351
|
|
},
|
|
{
|
|
"epoch": 0.358087487283825,
|
|
"grad_norm": 0.7933052182197571,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6564,
|
|
"mean_token_accuracy": 0.8090948462486267,
|
|
"num_tokens": 112271716.0,
|
|
"step": 352
|
|
},
|
|
{
|
|
"epoch": 0.35910478128179046,
|
|
"grad_norm": 0.7996081709861755,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6228,
|
|
"mean_token_accuracy": 0.8179689645767212,
|
|
"num_tokens": 112595775.0,
|
|
"step": 353
|
|
},
|
|
{
|
|
"epoch": 0.36012207527975587,
|
|
"grad_norm": 0.7942414879798889,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6468,
|
|
"mean_token_accuracy": 0.8119409680366516,
|
|
"num_tokens": 112906425.0,
|
|
"step": 354
|
|
},
|
|
{
|
|
"epoch": 0.3611393692777213,
|
|
"grad_norm": 0.7331574559211731,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.657,
|
|
"mean_token_accuracy": 0.8085232377052307,
|
|
"num_tokens": 113237249.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 0.3621566632756867,
|
|
"grad_norm": 0.7615078091621399,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6561,
|
|
"mean_token_accuracy": 0.8098192811012268,
|
|
"num_tokens": 113555518.0,
|
|
"step": 356
|
|
},
|
|
{
|
|
"epoch": 0.3631739572736521,
|
|
"grad_norm": 0.7951853275299072,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6867,
|
|
"mean_token_accuracy": 0.8017234206199646,
|
|
"num_tokens": 113869795.0,
|
|
"step": 357
|
|
},
|
|
{
|
|
"epoch": 0.3641912512716175,
|
|
"grad_norm": 0.7761409282684326,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6536,
|
|
"mean_token_accuracy": 0.809200644493103,
|
|
"num_tokens": 114180922.0,
|
|
"step": 358
|
|
},
|
|
{
|
|
"epoch": 0.3652085452695829,
|
|
"grad_norm": 0.7533003091812134,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6368,
|
|
"mean_token_accuracy": 0.8134925365447998,
|
|
"num_tokens": 114503125.0,
|
|
"step": 359
|
|
},
|
|
{
|
|
"epoch": 0.3662258392675483,
|
|
"grad_norm": 0.8157519102096558,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.679,
|
|
"mean_token_accuracy": 0.8032369613647461,
|
|
"num_tokens": 114829230.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.3672431332655137,
|
|
"grad_norm": 0.7701749801635742,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6547,
|
|
"mean_token_accuracy": 0.8107021450996399,
|
|
"num_tokens": 115154558.0,
|
|
"step": 361
|
|
},
|
|
{
|
|
"epoch": 0.36826042726347913,
|
|
"grad_norm": 0.741514265537262,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6463,
|
|
"mean_token_accuracy": 0.8118999004364014,
|
|
"num_tokens": 115471150.0,
|
|
"step": 362
|
|
},
|
|
{
|
|
"epoch": 0.36927772126144454,
|
|
"grad_norm": 0.7648496627807617,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6666,
|
|
"mean_token_accuracy": 0.8067079782485962,
|
|
"num_tokens": 115802068.0,
|
|
"step": 363
|
|
},
|
|
{
|
|
"epoch": 0.37029501525940994,
|
|
"grad_norm": 0.7627708911895752,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6652,
|
|
"mean_token_accuracy": 0.8071906566619873,
|
|
"num_tokens": 116121996.0,
|
|
"step": 364
|
|
},
|
|
{
|
|
"epoch": 0.3713123092573754,
|
|
"grad_norm": 0.7299651503562927,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6472,
|
|
"mean_token_accuracy": 0.8119363784790039,
|
|
"num_tokens": 116456648.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 0.3723296032553408,
|
|
"grad_norm": 0.7837634086608887,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6781,
|
|
"mean_token_accuracy": 0.8045236468315125,
|
|
"num_tokens": 116787232.0,
|
|
"step": 366
|
|
},
|
|
{
|
|
"epoch": 0.3733468972533062,
|
|
"grad_norm": 0.865563690662384,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6947,
|
|
"mean_token_accuracy": 0.7989071607589722,
|
|
"num_tokens": 117122455.0,
|
|
"step": 367
|
|
},
|
|
{
|
|
"epoch": 0.3743641912512716,
|
|
"grad_norm": 0.8054040670394897,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6542,
|
|
"mean_token_accuracy": 0.8087784051895142,
|
|
"num_tokens": 117430802.0,
|
|
"step": 368
|
|
},
|
|
{
|
|
"epoch": 0.37538148524923703,
|
|
"grad_norm": 0.9044189453125,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6733,
|
|
"mean_token_accuracy": 0.8037705421447754,
|
|
"num_tokens": 117747701.0,
|
|
"step": 369
|
|
},
|
|
{
|
|
"epoch": 0.37639877924720244,
|
|
"grad_norm": 0.7625953555107117,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6367,
|
|
"mean_token_accuracy": 0.814502477645874,
|
|
"num_tokens": 118069631.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.37741607324516785,
|
|
"grad_norm": 0.7826732397079468,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6396,
|
|
"mean_token_accuracy": 0.8139146566390991,
|
|
"num_tokens": 118364107.0,
|
|
"step": 371
|
|
},
|
|
{
|
|
"epoch": 0.37843336724313326,
|
|
"grad_norm": 0.7956497669219971,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.665,
|
|
"mean_token_accuracy": 0.8056970834732056,
|
|
"num_tokens": 118689942.0,
|
|
"step": 372
|
|
},
|
|
{
|
|
"epoch": 0.37945066124109866,
|
|
"grad_norm": 0.8007025122642517,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6529,
|
|
"mean_token_accuracy": 0.809262752532959,
|
|
"num_tokens": 119027905.0,
|
|
"step": 373
|
|
},
|
|
{
|
|
"epoch": 0.38046795523906407,
|
|
"grad_norm": 0.8179851174354553,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6508,
|
|
"mean_token_accuracy": 0.8107311129570007,
|
|
"num_tokens": 119333549.0,
|
|
"step": 374
|
|
},
|
|
{
|
|
"epoch": 0.3814852492370295,
|
|
"grad_norm": 0.775359570980072,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6412,
|
|
"mean_token_accuracy": 0.8123606443405151,
|
|
"num_tokens": 119635566.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 0.38250254323499494,
|
|
"grad_norm": 0.7540283799171448,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6688,
|
|
"mean_token_accuracy": 0.8049556612968445,
|
|
"num_tokens": 119965071.0,
|
|
"step": 376
|
|
},
|
|
{
|
|
"epoch": 0.38351983723296035,
|
|
"grad_norm": 0.8038058280944824,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6568,
|
|
"mean_token_accuracy": 0.8087278604507446,
|
|
"num_tokens": 120275408.0,
|
|
"step": 377
|
|
},
|
|
{
|
|
"epoch": 0.38453713123092575,
|
|
"grad_norm": 0.794776439666748,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6659,
|
|
"mean_token_accuracy": 0.8071854114532471,
|
|
"num_tokens": 120600556.0,
|
|
"step": 378
|
|
},
|
|
{
|
|
"epoch": 0.38555442522889116,
|
|
"grad_norm": 0.7935164570808411,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6441,
|
|
"mean_token_accuracy": 0.8129609823226929,
|
|
"num_tokens": 120922100.0,
|
|
"step": 379
|
|
},
|
|
{
|
|
"epoch": 0.38657171922685657,
|
|
"grad_norm": 0.8230006694793701,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6497,
|
|
"mean_token_accuracy": 0.8110771179199219,
|
|
"num_tokens": 121235661.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.387589013224822,
|
|
"grad_norm": 0.8138030767440796,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6664,
|
|
"mean_token_accuracy": 0.8064550161361694,
|
|
"num_tokens": 121539889.0,
|
|
"step": 381
|
|
},
|
|
{
|
|
"epoch": 0.3886063072227874,
|
|
"grad_norm": 0.8121137022972107,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6274,
|
|
"mean_token_accuracy": 0.8163203001022339,
|
|
"num_tokens": 121844594.0,
|
|
"step": 382
|
|
},
|
|
{
|
|
"epoch": 0.3896236012207528,
|
|
"grad_norm": 0.8339466452598572,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6656,
|
|
"mean_token_accuracy": 0.80696702003479,
|
|
"num_tokens": 122181626.0,
|
|
"step": 383
|
|
},
|
|
{
|
|
"epoch": 0.3906408952187182,
|
|
"grad_norm": 0.8089752197265625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6304,
|
|
"mean_token_accuracy": 0.8159544467926025,
|
|
"num_tokens": 122486556.0,
|
|
"step": 384
|
|
},
|
|
{
|
|
"epoch": 0.3916581892166836,
|
|
"grad_norm": 0.7690979242324829,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6565,
|
|
"mean_token_accuracy": 0.8081955909729004,
|
|
"num_tokens": 122806517.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 0.392675483214649,
|
|
"grad_norm": 0.757345974445343,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6541,
|
|
"mean_token_accuracy": 0.8088266849517822,
|
|
"num_tokens": 123130884.0,
|
|
"step": 386
|
|
},
|
|
{
|
|
"epoch": 0.3936927772126144,
|
|
"grad_norm": 0.8054885864257812,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6224,
|
|
"mean_token_accuracy": 0.8190693855285645,
|
|
"num_tokens": 123442859.0,
|
|
"step": 387
|
|
},
|
|
{
|
|
"epoch": 0.3947100712105799,
|
|
"grad_norm": 0.8839951157569885,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6385,
|
|
"mean_token_accuracy": 0.8120584487915039,
|
|
"num_tokens": 123764752.0,
|
|
"step": 388
|
|
},
|
|
{
|
|
"epoch": 0.3957273652085453,
|
|
"grad_norm": 0.7563456296920776,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6492,
|
|
"mean_token_accuracy": 0.8099383115768433,
|
|
"num_tokens": 124083040.0,
|
|
"step": 389
|
|
},
|
|
{
|
|
"epoch": 0.3967446592065107,
|
|
"grad_norm": 0.7801865339279175,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6461,
|
|
"mean_token_accuracy": 0.8110958337783813,
|
|
"num_tokens": 124398859.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.3977619532044761,
|
|
"grad_norm": 0.8431596159934998,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6735,
|
|
"mean_token_accuracy": 0.8044856786727905,
|
|
"num_tokens": 124705300.0,
|
|
"step": 391
|
|
},
|
|
{
|
|
"epoch": 0.3987792472024415,
|
|
"grad_norm": 0.7838254570960999,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6659,
|
|
"mean_token_accuracy": 0.8059735298156738,
|
|
"num_tokens": 125017794.0,
|
|
"step": 392
|
|
},
|
|
{
|
|
"epoch": 0.3997965412004069,
|
|
"grad_norm": 0.8154076933860779,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6659,
|
|
"mean_token_accuracy": 0.8061184883117676,
|
|
"num_tokens": 125331610.0,
|
|
"step": 393
|
|
},
|
|
{
|
|
"epoch": 0.4008138351983723,
|
|
"grad_norm": 0.850584864616394,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6451,
|
|
"mean_token_accuracy": 0.8114930987358093,
|
|
"num_tokens": 125650464.0,
|
|
"step": 394
|
|
},
|
|
{
|
|
"epoch": 0.40183112919633773,
|
|
"grad_norm": 0.7639232873916626,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6382,
|
|
"mean_token_accuracy": 0.813078761100769,
|
|
"num_tokens": 125964354.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 0.40284842319430314,
|
|
"grad_norm": 0.8114677667617798,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6402,
|
|
"mean_token_accuracy": 0.8129217624664307,
|
|
"num_tokens": 126266332.0,
|
|
"step": 396
|
|
},
|
|
{
|
|
"epoch": 0.40386571719226855,
|
|
"grad_norm": 0.7987565398216248,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6422,
|
|
"mean_token_accuracy": 0.8125091195106506,
|
|
"num_tokens": 126582617.0,
|
|
"step": 397
|
|
},
|
|
{
|
|
"epoch": 0.40488301119023395,
|
|
"grad_norm": 0.8058961033821106,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.646,
|
|
"mean_token_accuracy": 0.8124598264694214,
|
|
"num_tokens": 126894951.0,
|
|
"step": 398
|
|
},
|
|
{
|
|
"epoch": 0.4059003051881994,
|
|
"grad_norm": 0.8132612109184265,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6779,
|
|
"mean_token_accuracy": 0.803368866443634,
|
|
"num_tokens": 127193010.0,
|
|
"step": 399
|
|
},
|
|
{
|
|
"epoch": 0.4069175991861648,
|
|
"grad_norm": 0.7651572227478027,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.645,
|
|
"mean_token_accuracy": 0.8124961256980896,
|
|
"num_tokens": 127530354.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.40793489318413023,
|
|
"grad_norm": 0.7767626047134399,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6385,
|
|
"mean_token_accuracy": 0.8131939768791199,
|
|
"num_tokens": 127833308.0,
|
|
"step": 401
|
|
},
|
|
{
|
|
"epoch": 0.40895218718209564,
|
|
"grad_norm": 0.7715514898300171,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.66,
|
|
"mean_token_accuracy": 0.8088132739067078,
|
|
"num_tokens": 128161302.0,
|
|
"step": 402
|
|
},
|
|
{
|
|
"epoch": 0.40996948118006105,
|
|
"grad_norm": 0.8009824156761169,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6558,
|
|
"mean_token_accuracy": 0.8080823421478271,
|
|
"num_tokens": 128485010.0,
|
|
"step": 403
|
|
},
|
|
{
|
|
"epoch": 0.41098677517802645,
|
|
"grad_norm": 0.8095982074737549,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6564,
|
|
"mean_token_accuracy": 0.8079812526702881,
|
|
"num_tokens": 128794807.0,
|
|
"step": 404
|
|
},
|
|
{
|
|
"epoch": 0.41200406917599186,
|
|
"grad_norm": 0.7956030964851379,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6398,
|
|
"mean_token_accuracy": 0.8127642869949341,
|
|
"num_tokens": 129117445.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 0.41302136317395727,
|
|
"grad_norm": 0.7819294333457947,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6583,
|
|
"mean_token_accuracy": 0.8086738586425781,
|
|
"num_tokens": 129465561.0,
|
|
"step": 406
|
|
},
|
|
{
|
|
"epoch": 0.4140386571719227,
|
|
"grad_norm": 0.727975070476532,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6235,
|
|
"mean_token_accuracy": 0.8174310326576233,
|
|
"num_tokens": 129789431.0,
|
|
"step": 407
|
|
},
|
|
{
|
|
"epoch": 0.4150559511698881,
|
|
"grad_norm": 0.8050909042358398,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6413,
|
|
"mean_token_accuracy": 0.8126672506332397,
|
|
"num_tokens": 130099262.0,
|
|
"step": 408
|
|
},
|
|
{
|
|
"epoch": 0.4160732451678535,
|
|
"grad_norm": 0.7858853936195374,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6572,
|
|
"mean_token_accuracy": 0.8087785840034485,
|
|
"num_tokens": 130412634.0,
|
|
"step": 409
|
|
},
|
|
{
|
|
"epoch": 0.4170905391658189,
|
|
"grad_norm": 0.7969059348106384,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6543,
|
|
"mean_token_accuracy": 0.8089858889579773,
|
|
"num_tokens": 130738545.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.41810783316378436,
|
|
"grad_norm": 0.7841689586639404,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6537,
|
|
"mean_token_accuracy": 0.8089855909347534,
|
|
"num_tokens": 131069889.0,
|
|
"step": 411
|
|
},
|
|
{
|
|
"epoch": 0.41912512716174977,
|
|
"grad_norm": 0.8327849507331848,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6557,
|
|
"mean_token_accuracy": 0.8091450929641724,
|
|
"num_tokens": 131374265.0,
|
|
"step": 412
|
|
},
|
|
{
|
|
"epoch": 0.4201424211597152,
|
|
"grad_norm": 0.77179354429245,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6566,
|
|
"mean_token_accuracy": 0.8080447316169739,
|
|
"num_tokens": 131702768.0,
|
|
"step": 413
|
|
},
|
|
{
|
|
"epoch": 0.4211597151576806,
|
|
"grad_norm": 0.8102238774299622,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6466,
|
|
"mean_token_accuracy": 0.8112056851387024,
|
|
"num_tokens": 132005394.0,
|
|
"step": 414
|
|
},
|
|
{
|
|
"epoch": 0.422177009155646,
|
|
"grad_norm": 0.9549462795257568,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6259,
|
|
"mean_token_accuracy": 0.815801739692688,
|
|
"num_tokens": 132318929.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 0.4231943031536114,
|
|
"grad_norm": 0.7793801426887512,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6206,
|
|
"mean_token_accuracy": 0.8180446624755859,
|
|
"num_tokens": 132639135.0,
|
|
"step": 416
|
|
},
|
|
{
|
|
"epoch": 0.4242115971515768,
|
|
"grad_norm": 0.7840235233306885,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6648,
|
|
"mean_token_accuracy": 0.8058639764785767,
|
|
"num_tokens": 132955884.0,
|
|
"step": 417
|
|
},
|
|
{
|
|
"epoch": 0.4252288911495422,
|
|
"grad_norm": 0.7213472127914429,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6164,
|
|
"mean_token_accuracy": 0.8190960884094238,
|
|
"num_tokens": 133281242.0,
|
|
"step": 418
|
|
},
|
|
{
|
|
"epoch": 0.4262461851475076,
|
|
"grad_norm": 0.7838244438171387,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.648,
|
|
"mean_token_accuracy": 0.8113455772399902,
|
|
"num_tokens": 133620766.0,
|
|
"step": 419
|
|
},
|
|
{
|
|
"epoch": 0.427263479145473,
|
|
"grad_norm": 0.7640904784202576,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6398,
|
|
"mean_token_accuracy": 0.8127099275588989,
|
|
"num_tokens": 133962681.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.42828077314343843,
|
|
"grad_norm": 0.7636725306510925,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6727,
|
|
"mean_token_accuracy": 0.8043913841247559,
|
|
"num_tokens": 134299862.0,
|
|
"step": 421
|
|
},
|
|
{
|
|
"epoch": 0.42929806714140384,
|
|
"grad_norm": 0.7917993664741516,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6346,
|
|
"mean_token_accuracy": 0.8139828443527222,
|
|
"num_tokens": 134603411.0,
|
|
"step": 422
|
|
},
|
|
{
|
|
"epoch": 0.4303153611393693,
|
|
"grad_norm": 0.836502194404602,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6397,
|
|
"mean_token_accuracy": 0.8130548596382141,
|
|
"num_tokens": 134910477.0,
|
|
"step": 423
|
|
},
|
|
{
|
|
"epoch": 0.4313326551373347,
|
|
"grad_norm": 0.8161730766296387,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6692,
|
|
"mean_token_accuracy": 0.8051292896270752,
|
|
"num_tokens": 135215377.0,
|
|
"step": 424
|
|
},
|
|
{
|
|
"epoch": 0.4323499491353001,
|
|
"grad_norm": 0.7963213324546814,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6504,
|
|
"mean_token_accuracy": 0.8107162714004517,
|
|
"num_tokens": 135530087.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 0.4333672431332655,
|
|
"grad_norm": 0.7756025195121765,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6241,
|
|
"mean_token_accuracy": 0.8174505233764648,
|
|
"num_tokens": 135852811.0,
|
|
"step": 426
|
|
},
|
|
{
|
|
"epoch": 0.43438453713123093,
|
|
"grad_norm": 0.7596142888069153,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6345,
|
|
"mean_token_accuracy": 0.8148894906044006,
|
|
"num_tokens": 136177024.0,
|
|
"step": 427
|
|
},
|
|
{
|
|
"epoch": 0.43540183112919634,
|
|
"grad_norm": 0.8031265139579773,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6412,
|
|
"mean_token_accuracy": 0.8117724657058716,
|
|
"num_tokens": 136482739.0,
|
|
"step": 428
|
|
},
|
|
{
|
|
"epoch": 0.43641912512716174,
|
|
"grad_norm": 0.7338760495185852,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6227,
|
|
"mean_token_accuracy": 0.8177262544631958,
|
|
"num_tokens": 136822688.0,
|
|
"step": 429
|
|
},
|
|
{
|
|
"epoch": 0.43743641912512715,
|
|
"grad_norm": 0.7699108719825745,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6176,
|
|
"mean_token_accuracy": 0.8188862800598145,
|
|
"num_tokens": 137137295.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.43845371312309256,
|
|
"grad_norm": 0.8056550025939941,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.637,
|
|
"mean_token_accuracy": 0.8134770393371582,
|
|
"num_tokens": 137449013.0,
|
|
"step": 431
|
|
},
|
|
{
|
|
"epoch": 0.43947100712105797,
|
|
"grad_norm": 0.7790605425834656,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.636,
|
|
"mean_token_accuracy": 0.8130491971969604,
|
|
"num_tokens": 137767680.0,
|
|
"step": 432
|
|
},
|
|
{
|
|
"epoch": 0.4404883011190234,
|
|
"grad_norm": 0.7934316992759705,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6459,
|
|
"mean_token_accuracy": 0.8120208978652954,
|
|
"num_tokens": 138066689.0,
|
|
"step": 433
|
|
},
|
|
{
|
|
"epoch": 0.44150559511698884,
|
|
"grad_norm": 0.7886009812355042,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6802,
|
|
"mean_token_accuracy": 0.8030209541320801,
|
|
"num_tokens": 138390463.0,
|
|
"step": 434
|
|
},
|
|
{
|
|
"epoch": 0.44252288911495424,
|
|
"grad_norm": 0.7820769548416138,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.651,
|
|
"mean_token_accuracy": 0.8096523880958557,
|
|
"num_tokens": 138712726.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 0.44354018311291965,
|
|
"grad_norm": 0.7809168100357056,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6393,
|
|
"mean_token_accuracy": 0.8129587173461914,
|
|
"num_tokens": 139056494.0,
|
|
"step": 436
|
|
},
|
|
{
|
|
"epoch": 0.44455747711088506,
|
|
"grad_norm": 0.7615790367126465,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6534,
|
|
"mean_token_accuracy": 0.8090481758117676,
|
|
"num_tokens": 139362971.0,
|
|
"step": 437
|
|
},
|
|
{
|
|
"epoch": 0.44557477110885046,
|
|
"grad_norm": 0.8051952719688416,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6359,
|
|
"mean_token_accuracy": 0.8134530186653137,
|
|
"num_tokens": 139661330.0,
|
|
"step": 438
|
|
},
|
|
{
|
|
"epoch": 0.44659206510681587,
|
|
"grad_norm": 0.7830991744995117,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6544,
|
|
"mean_token_accuracy": 0.8091185092926025,
|
|
"num_tokens": 139996848.0,
|
|
"step": 439
|
|
},
|
|
{
|
|
"epoch": 0.4476093591047813,
|
|
"grad_norm": 0.7350242733955383,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.642,
|
|
"mean_token_accuracy": 0.8117263317108154,
|
|
"num_tokens": 140315404.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.4486266531027467,
|
|
"grad_norm": 0.763858437538147,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6558,
|
|
"mean_token_accuracy": 0.8085876703262329,
|
|
"num_tokens": 140632686.0,
|
|
"step": 441
|
|
},
|
|
{
|
|
"epoch": 0.4496439471007121,
|
|
"grad_norm": 0.722997784614563,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6282,
|
|
"mean_token_accuracy": 0.8143622875213623,
|
|
"num_tokens": 140945277.0,
|
|
"step": 442
|
|
},
|
|
{
|
|
"epoch": 0.4506612410986775,
|
|
"grad_norm": 0.7560616135597229,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6451,
|
|
"mean_token_accuracy": 0.8108248114585876,
|
|
"num_tokens": 141261262.0,
|
|
"step": 443
|
|
},
|
|
{
|
|
"epoch": 0.4516785350966429,
|
|
"grad_norm": 0.7908616065979004,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6362,
|
|
"mean_token_accuracy": 0.813894510269165,
|
|
"num_tokens": 141584264.0,
|
|
"step": 444
|
|
},
|
|
{
|
|
"epoch": 0.4526958290946083,
|
|
"grad_norm": 0.7412334680557251,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6186,
|
|
"mean_token_accuracy": 0.8175861835479736,
|
|
"num_tokens": 141902690.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 0.4537131230925738,
|
|
"grad_norm": 0.8058040142059326,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6469,
|
|
"mean_token_accuracy": 0.809880256652832,
|
|
"num_tokens": 142212848.0,
|
|
"step": 446
|
|
},
|
|
{
|
|
"epoch": 0.4547304170905392,
|
|
"grad_norm": 0.7759329676628113,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6561,
|
|
"mean_token_accuracy": 0.8088507056236267,
|
|
"num_tokens": 142533419.0,
|
|
"step": 447
|
|
},
|
|
{
|
|
"epoch": 0.4557477110885046,
|
|
"grad_norm": 0.772881269454956,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6447,
|
|
"mean_token_accuracy": 0.8117966055870056,
|
|
"num_tokens": 142834121.0,
|
|
"step": 448
|
|
},
|
|
{
|
|
"epoch": 0.45676500508647,
|
|
"grad_norm": 0.814788281917572,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6384,
|
|
"mean_token_accuracy": 0.8127095103263855,
|
|
"num_tokens": 143165685.0,
|
|
"step": 449
|
|
},
|
|
{
|
|
"epoch": 0.4577822990844354,
|
|
"grad_norm": 0.8127166628837585,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6213,
|
|
"mean_token_accuracy": 0.8171497583389282,
|
|
"num_tokens": 143485496.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.4587995930824008,
|
|
"grad_norm": 0.7722148299217224,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6591,
|
|
"mean_token_accuracy": 0.8063204288482666,
|
|
"num_tokens": 143805905.0,
|
|
"step": 451
|
|
},
|
|
{
|
|
"epoch": 0.4598168870803662,
|
|
"grad_norm": 0.8112136721611023,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6339,
|
|
"mean_token_accuracy": 0.813546895980835,
|
|
"num_tokens": 144107657.0,
|
|
"step": 452
|
|
},
|
|
{
|
|
"epoch": 0.46083418107833163,
|
|
"grad_norm": 0.795857310295105,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6535,
|
|
"mean_token_accuracy": 0.8090566396713257,
|
|
"num_tokens": 144426788.0,
|
|
"step": 453
|
|
},
|
|
{
|
|
"epoch": 0.46185147507629704,
|
|
"grad_norm": 0.8356524109840393,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6234,
|
|
"mean_token_accuracy": 0.816721498966217,
|
|
"num_tokens": 144742302.0,
|
|
"step": 454
|
|
},
|
|
{
|
|
"epoch": 0.46286876907426244,
|
|
"grad_norm": 0.8701412677764893,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6432,
|
|
"mean_token_accuracy": 0.8105349540710449,
|
|
"num_tokens": 145041480.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 0.46388606307222785,
|
|
"grad_norm": 0.7511188387870789,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6307,
|
|
"mean_token_accuracy": 0.8154951333999634,
|
|
"num_tokens": 145365776.0,
|
|
"step": 456
|
|
},
|
|
{
|
|
"epoch": 0.4649033570701933,
|
|
"grad_norm": 0.8432009816169739,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6296,
|
|
"mean_token_accuracy": 0.8161913156509399,
|
|
"num_tokens": 145686829.0,
|
|
"step": 457
|
|
},
|
|
{
|
|
"epoch": 0.4659206510681587,
|
|
"grad_norm": 0.8876609802246094,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6484,
|
|
"mean_token_accuracy": 0.8092880249023438,
|
|
"num_tokens": 146031134.0,
|
|
"step": 458
|
|
},
|
|
{
|
|
"epoch": 0.4669379450661241,
|
|
"grad_norm": 0.8064795732498169,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6724,
|
|
"mean_token_accuracy": 0.805739164352417,
|
|
"num_tokens": 146342004.0,
|
|
"step": 459
|
|
},
|
|
{
|
|
"epoch": 0.46795523906408953,
|
|
"grad_norm": 0.7816082835197449,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6462,
|
|
"mean_token_accuracy": 0.8109795451164246,
|
|
"num_tokens": 146660600.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.46897253306205494,
|
|
"grad_norm": 0.7669892311096191,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6927,
|
|
"mean_token_accuracy": 0.8000683784484863,
|
|
"num_tokens": 146980645.0,
|
|
"step": 461
|
|
},
|
|
{
|
|
"epoch": 0.46998982706002035,
|
|
"grad_norm": 0.8293817639350891,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6595,
|
|
"mean_token_accuracy": 0.8065913915634155,
|
|
"num_tokens": 147298752.0,
|
|
"step": 462
|
|
},
|
|
{
|
|
"epoch": 0.47100712105798576,
|
|
"grad_norm": 0.8262353539466858,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6165,
|
|
"mean_token_accuracy": 0.818239688873291,
|
|
"num_tokens": 147606035.0,
|
|
"step": 463
|
|
},
|
|
{
|
|
"epoch": 0.47202441505595116,
|
|
"grad_norm": 0.7733154296875,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6074,
|
|
"mean_token_accuracy": 0.8200194835662842,
|
|
"num_tokens": 147900796.0,
|
|
"step": 464
|
|
},
|
|
{
|
|
"epoch": 0.47304170905391657,
|
|
"grad_norm": 0.812021791934967,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6167,
|
|
"mean_token_accuracy": 0.8169869184494019,
|
|
"num_tokens": 148213700.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 0.474059003051882,
|
|
"grad_norm": 0.7849969863891602,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6406,
|
|
"mean_token_accuracy": 0.8115862607955933,
|
|
"num_tokens": 148519502.0,
|
|
"step": 466
|
|
},
|
|
{
|
|
"epoch": 0.4750762970498474,
|
|
"grad_norm": 0.8655486106872559,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6592,
|
|
"mean_token_accuracy": 0.8073008060455322,
|
|
"num_tokens": 148837050.0,
|
|
"step": 467
|
|
},
|
|
{
|
|
"epoch": 0.4760935910478128,
|
|
"grad_norm": 0.7930212616920471,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6423,
|
|
"mean_token_accuracy": 0.8117002248764038,
|
|
"num_tokens": 149158085.0,
|
|
"step": 468
|
|
},
|
|
{
|
|
"epoch": 0.47711088504577825,
|
|
"grad_norm": 0.8016993999481201,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6499,
|
|
"mean_token_accuracy": 0.8118265271186829,
|
|
"num_tokens": 149470715.0,
|
|
"step": 469
|
|
},
|
|
{
|
|
"epoch": 0.47812817904374366,
|
|
"grad_norm": 0.7912672162055969,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6527,
|
|
"mean_token_accuracy": 0.8092522025108337,
|
|
"num_tokens": 149779243.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.47914547304170907,
|
|
"grad_norm": 0.7673670649528503,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6641,
|
|
"mean_token_accuracy": 0.8060247898101807,
|
|
"num_tokens": 150086890.0,
|
|
"step": 471
|
|
},
|
|
{
|
|
"epoch": 0.4801627670396745,
|
|
"grad_norm": 0.7772818207740784,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6151,
|
|
"mean_token_accuracy": 0.8201066851615906,
|
|
"num_tokens": 150414091.0,
|
|
"step": 472
|
|
},
|
|
{
|
|
"epoch": 0.4811800610376399,
|
|
"grad_norm": 0.7861338257789612,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6536,
|
|
"mean_token_accuracy": 0.8087126016616821,
|
|
"num_tokens": 150749759.0,
|
|
"step": 473
|
|
},
|
|
{
|
|
"epoch": 0.4821973550356053,
|
|
"grad_norm": 0.7545295357704163,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6377,
|
|
"mean_token_accuracy": 0.813199520111084,
|
|
"num_tokens": 151078517.0,
|
|
"step": 474
|
|
},
|
|
{
|
|
"epoch": 0.4832146490335707,
|
|
"grad_norm": 0.7383038401603699,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6368,
|
|
"mean_token_accuracy": 0.8135240077972412,
|
|
"num_tokens": 151404760.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 0.4842319430315361,
|
|
"grad_norm": 0.8081879019737244,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6106,
|
|
"mean_token_accuracy": 0.8206827044487,
|
|
"num_tokens": 151714374.0,
|
|
"step": 476
|
|
},
|
|
{
|
|
"epoch": 0.4852492370295015,
|
|
"grad_norm": 0.7860760688781738,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.624,
|
|
"mean_token_accuracy": 0.817007303237915,
|
|
"num_tokens": 152034628.0,
|
|
"step": 477
|
|
},
|
|
{
|
|
"epoch": 0.4862665310274669,
|
|
"grad_norm": 0.8067706227302551,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6373,
|
|
"mean_token_accuracy": 0.8134366273880005,
|
|
"num_tokens": 152336970.0,
|
|
"step": 478
|
|
},
|
|
{
|
|
"epoch": 0.4872838250254323,
|
|
"grad_norm": 0.7861080765724182,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6464,
|
|
"mean_token_accuracy": 0.8105477094650269,
|
|
"num_tokens": 152684663.0,
|
|
"step": 479
|
|
},
|
|
{
|
|
"epoch": 0.4883011190233978,
|
|
"grad_norm": 0.7935335636138916,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6551,
|
|
"mean_token_accuracy": 0.808892011642456,
|
|
"num_tokens": 153010247.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.4893184130213632,
|
|
"grad_norm": 0.81971275806427,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6115,
|
|
"mean_token_accuracy": 0.8200415372848511,
|
|
"num_tokens": 153337306.0,
|
|
"step": 481
|
|
},
|
|
{
|
|
"epoch": 0.4903357070193286,
|
|
"grad_norm": 0.7935061454772949,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.635,
|
|
"mean_token_accuracy": 0.8127099871635437,
|
|
"num_tokens": 153652759.0,
|
|
"step": 482
|
|
},
|
|
{
|
|
"epoch": 0.491353001017294,
|
|
"grad_norm": 0.8456212878227234,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6201,
|
|
"mean_token_accuracy": 0.8179374933242798,
|
|
"num_tokens": 153982904.0,
|
|
"step": 483
|
|
},
|
|
{
|
|
"epoch": 0.4923702950152594,
|
|
"grad_norm": 0.7553489804267883,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.658,
|
|
"mean_token_accuracy": 0.8084520101547241,
|
|
"num_tokens": 154318454.0,
|
|
"step": 484
|
|
},
|
|
{
|
|
"epoch": 0.4933875890132248,
|
|
"grad_norm": 0.7678808569908142,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6568,
|
|
"mean_token_accuracy": 0.8083710670471191,
|
|
"num_tokens": 154646655.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 0.49440488301119023,
|
|
"grad_norm": 0.7790781259536743,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.637,
|
|
"mean_token_accuracy": 0.813983678817749,
|
|
"num_tokens": 154961491.0,
|
|
"step": 486
|
|
},
|
|
{
|
|
"epoch": 0.49542217700915564,
|
|
"grad_norm": 0.8041170835494995,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6285,
|
|
"mean_token_accuracy": 0.814818263053894,
|
|
"num_tokens": 155273562.0,
|
|
"step": 487
|
|
},
|
|
{
|
|
"epoch": 0.49643947100712105,
|
|
"grad_norm": 0.8097386956214905,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6087,
|
|
"mean_token_accuracy": 0.8204355239868164,
|
|
"num_tokens": 155577059.0,
|
|
"step": 488
|
|
},
|
|
{
|
|
"epoch": 0.49745676500508645,
|
|
"grad_norm": 0.7783612608909607,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6495,
|
|
"mean_token_accuracy": 0.809700608253479,
|
|
"num_tokens": 155896969.0,
|
|
"step": 489
|
|
},
|
|
{
|
|
"epoch": 0.49847405900305186,
|
|
"grad_norm": 0.8291912078857422,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6444,
|
|
"mean_token_accuracy": 0.8116985559463501,
|
|
"num_tokens": 156221661.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.49949135300101727,
|
|
"grad_norm": 0.8157835602760315,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6503,
|
|
"mean_token_accuracy": 0.8086612820625305,
|
|
"num_tokens": 156538559.0,
|
|
"step": 491
|
|
},
|
|
{
|
|
"epoch": 0.5005086469989827,
|
|
"grad_norm": 0.7264803051948547,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6531,
|
|
"mean_token_accuracy": 0.8084179162979126,
|
|
"num_tokens": 156879118.0,
|
|
"step": 492
|
|
},
|
|
{
|
|
"epoch": 0.5015259409969481,
|
|
"grad_norm": 0.7339447736740112,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6112,
|
|
"mean_token_accuracy": 0.8194094896316528,
|
|
"num_tokens": 157198178.0,
|
|
"step": 493
|
|
},
|
|
{
|
|
"epoch": 0.5025432349949135,
|
|
"grad_norm": 0.7389786839485168,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6306,
|
|
"mean_token_accuracy": 0.8140756487846375,
|
|
"num_tokens": 157519093.0,
|
|
"step": 494
|
|
},
|
|
{
|
|
"epoch": 0.503560528992879,
|
|
"grad_norm": 0.869776725769043,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6312,
|
|
"mean_token_accuracy": 0.8145843744277954,
|
|
"num_tokens": 157826508.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 0.5045778229908443,
|
|
"grad_norm": 0.7960457801818848,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6465,
|
|
"mean_token_accuracy": 0.8094485998153687,
|
|
"num_tokens": 158153856.0,
|
|
"step": 496
|
|
},
|
|
{
|
|
"epoch": 0.5055951169888098,
|
|
"grad_norm": 0.7548427581787109,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6424,
|
|
"mean_token_accuracy": 0.8124884366989136,
|
|
"num_tokens": 158468860.0,
|
|
"step": 497
|
|
},
|
|
{
|
|
"epoch": 0.5066124109867752,
|
|
"grad_norm": 0.7868984341621399,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6378,
|
|
"mean_token_accuracy": 0.8119939565658569,
|
|
"num_tokens": 158789067.0,
|
|
"step": 498
|
|
},
|
|
{
|
|
"epoch": 0.5076297049847406,
|
|
"grad_norm": 0.7604773640632629,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6417,
|
|
"mean_token_accuracy": 0.8112584948539734,
|
|
"num_tokens": 159115006.0,
|
|
"step": 499
|
|
},
|
|
{
|
|
"epoch": 0.508646998982706,
|
|
"grad_norm": 0.7894480228424072,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6192,
|
|
"mean_token_accuracy": 0.8169581890106201,
|
|
"num_tokens": 159430845.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.5096642929806714,
|
|
"grad_norm": 0.7751943469047546,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.643,
|
|
"mean_token_accuracy": 0.8119826316833496,
|
|
"num_tokens": 159766028.0,
|
|
"step": 501
|
|
},
|
|
{
|
|
"epoch": 0.5106815869786369,
|
|
"grad_norm": 0.7243176102638245,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6145,
|
|
"mean_token_accuracy": 0.8198132514953613,
|
|
"num_tokens": 160093084.0,
|
|
"step": 502
|
|
},
|
|
{
|
|
"epoch": 0.5116988809766022,
|
|
"grad_norm": 0.7752160429954529,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6371,
|
|
"mean_token_accuracy": 0.8124977350234985,
|
|
"num_tokens": 160414551.0,
|
|
"step": 503
|
|
},
|
|
{
|
|
"epoch": 0.5127161749745677,
|
|
"grad_norm": 0.7632299661636353,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.635,
|
|
"mean_token_accuracy": 0.8139313459396362,
|
|
"num_tokens": 160732492.0,
|
|
"step": 504
|
|
},
|
|
{
|
|
"epoch": 0.513733468972533,
|
|
"grad_norm": 0.8722953796386719,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6399,
|
|
"mean_token_accuracy": 0.8128257989883423,
|
|
"num_tokens": 161063180.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 0.5147507629704985,
|
|
"grad_norm": 0.7689654231071472,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6473,
|
|
"mean_token_accuracy": 0.8114692568778992,
|
|
"num_tokens": 161375840.0,
|
|
"step": 506
|
|
},
|
|
{
|
|
"epoch": 0.5157680569684638,
|
|
"grad_norm": 0.7574338912963867,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6455,
|
|
"mean_token_accuracy": 0.8104807734489441,
|
|
"num_tokens": 161712451.0,
|
|
"step": 507
|
|
},
|
|
{
|
|
"epoch": 0.5167853509664293,
|
|
"grad_norm": 0.721907377243042,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6275,
|
|
"mean_token_accuracy": 0.8147372007369995,
|
|
"num_tokens": 162053742.0,
|
|
"step": 508
|
|
},
|
|
{
|
|
"epoch": 0.5178026449643948,
|
|
"grad_norm": 0.792888879776001,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6462,
|
|
"mean_token_accuracy": 0.8106032609939575,
|
|
"num_tokens": 162383502.0,
|
|
"step": 509
|
|
},
|
|
{
|
|
"epoch": 0.5188199389623601,
|
|
"grad_norm": 0.7896516919136047,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6273,
|
|
"mean_token_accuracy": 0.8150818347930908,
|
|
"num_tokens": 162721800.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.5198372329603256,
|
|
"grad_norm": 0.814561128616333,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6347,
|
|
"mean_token_accuracy": 0.8130834102630615,
|
|
"num_tokens": 163040860.0,
|
|
"step": 511
|
|
},
|
|
{
|
|
"epoch": 0.5208545269582909,
|
|
"grad_norm": 0.7659908533096313,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6295,
|
|
"mean_token_accuracy": 0.8147145509719849,
|
|
"num_tokens": 163354092.0,
|
|
"step": 512
|
|
},
|
|
{
|
|
"epoch": 0.5218718209562564,
|
|
"grad_norm": 0.7354118824005127,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6353,
|
|
"mean_token_accuracy": 0.8131240606307983,
|
|
"num_tokens": 163677302.0,
|
|
"step": 513
|
|
},
|
|
{
|
|
"epoch": 0.5228891149542217,
|
|
"grad_norm": 0.7778656482696533,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.617,
|
|
"mean_token_accuracy": 0.8165751099586487,
|
|
"num_tokens": 163993198.0,
|
|
"step": 514
|
|
},
|
|
{
|
|
"epoch": 0.5239064089521872,
|
|
"grad_norm": 0.773829460144043,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6491,
|
|
"mean_token_accuracy": 0.8093839883804321,
|
|
"num_tokens": 164325603.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 0.5249237029501526,
|
|
"grad_norm": 0.7734475135803223,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6156,
|
|
"mean_token_accuracy": 0.818503737449646,
|
|
"num_tokens": 164634955.0,
|
|
"step": 516
|
|
},
|
|
{
|
|
"epoch": 0.525940996948118,
|
|
"grad_norm": 0.7791919112205505,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6157,
|
|
"mean_token_accuracy": 0.8177918195724487,
|
|
"num_tokens": 164949736.0,
|
|
"step": 517
|
|
},
|
|
{
|
|
"epoch": 0.5269582909460834,
|
|
"grad_norm": 0.8125184774398804,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6312,
|
|
"mean_token_accuracy": 0.8144404888153076,
|
|
"num_tokens": 165268681.0,
|
|
"step": 518
|
|
},
|
|
{
|
|
"epoch": 0.5279755849440488,
|
|
"grad_norm": 0.7059145569801331,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6018,
|
|
"mean_token_accuracy": 0.8215758800506592,
|
|
"num_tokens": 165611395.0,
|
|
"step": 519
|
|
},
|
|
{
|
|
"epoch": 0.5289928789420142,
|
|
"grad_norm": 0.7735430598258972,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6619,
|
|
"mean_token_accuracy": 0.8062798380851746,
|
|
"num_tokens": 165930541.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.5300101729399797,
|
|
"grad_norm": 0.8060302734375,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6271,
|
|
"mean_token_accuracy": 0.8155234456062317,
|
|
"num_tokens": 166253912.0,
|
|
"step": 521
|
|
},
|
|
{
|
|
"epoch": 0.5310274669379451,
|
|
"grad_norm": 0.7790757417678833,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6217,
|
|
"mean_token_accuracy": 0.8167293667793274,
|
|
"num_tokens": 166570026.0,
|
|
"step": 522
|
|
},
|
|
{
|
|
"epoch": 0.5320447609359105,
|
|
"grad_norm": 0.7891453504562378,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6385,
|
|
"mean_token_accuracy": 0.8134738206863403,
|
|
"num_tokens": 166891414.0,
|
|
"step": 523
|
|
},
|
|
{
|
|
"epoch": 0.5330620549338759,
|
|
"grad_norm": 0.7717955708503723,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6457,
|
|
"mean_token_accuracy": 0.8105074167251587,
|
|
"num_tokens": 167199448.0,
|
|
"step": 524
|
|
},
|
|
{
|
|
"epoch": 0.5340793489318413,
|
|
"grad_norm": 0.7495227456092834,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6305,
|
|
"mean_token_accuracy": 0.8140999674797058,
|
|
"num_tokens": 167508836.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 0.5350966429298067,
|
|
"grad_norm": 0.7756131887435913,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6392,
|
|
"mean_token_accuracy": 0.8117661476135254,
|
|
"num_tokens": 167821540.0,
|
|
"step": 526
|
|
},
|
|
{
|
|
"epoch": 0.5361139369277721,
|
|
"grad_norm": 0.7604585886001587,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6174,
|
|
"mean_token_accuracy": 0.8179171085357666,
|
|
"num_tokens": 168163810.0,
|
|
"step": 527
|
|
},
|
|
{
|
|
"epoch": 0.5371312309257376,
|
|
"grad_norm": 0.7594449520111084,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6101,
|
|
"mean_token_accuracy": 0.8211343288421631,
|
|
"num_tokens": 168465144.0,
|
|
"step": 528
|
|
},
|
|
{
|
|
"epoch": 0.5381485249237029,
|
|
"grad_norm": 0.7848928570747375,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6356,
|
|
"mean_token_accuracy": 0.812796413898468,
|
|
"num_tokens": 168772296.0,
|
|
"step": 529
|
|
},
|
|
{
|
|
"epoch": 0.5391658189216684,
|
|
"grad_norm": 0.8153516054153442,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6366,
|
|
"mean_token_accuracy": 0.8122652769088745,
|
|
"num_tokens": 169084941.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.5401831129196337,
|
|
"grad_norm": 0.8312289714813232,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6125,
|
|
"mean_token_accuracy": 0.8194169998168945,
|
|
"num_tokens": 169398931.0,
|
|
"step": 531
|
|
},
|
|
{
|
|
"epoch": 0.5412004069175992,
|
|
"grad_norm": 0.8274112939834595,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6204,
|
|
"mean_token_accuracy": 0.8165071606636047,
|
|
"num_tokens": 169713890.0,
|
|
"step": 532
|
|
},
|
|
{
|
|
"epoch": 0.5422177009155646,
|
|
"grad_norm": 0.7571256756782532,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6399,
|
|
"mean_token_accuracy": 0.8139414191246033,
|
|
"num_tokens": 170042021.0,
|
|
"step": 533
|
|
},
|
|
{
|
|
"epoch": 0.54323499491353,
|
|
"grad_norm": 0.8405437469482422,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6168,
|
|
"mean_token_accuracy": 0.8171428442001343,
|
|
"num_tokens": 170344681.0,
|
|
"step": 534
|
|
},
|
|
{
|
|
"epoch": 0.5442522889114955,
|
|
"grad_norm": 0.7748994827270508,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6345,
|
|
"mean_token_accuracy": 0.814400315284729,
|
|
"num_tokens": 170662002.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 0.5452695829094608,
|
|
"grad_norm": 0.7834451198577881,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.64,
|
|
"mean_token_accuracy": 0.8116738796234131,
|
|
"num_tokens": 170965381.0,
|
|
"step": 536
|
|
},
|
|
{
|
|
"epoch": 0.5462868769074263,
|
|
"grad_norm": 0.7792726159095764,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6654,
|
|
"mean_token_accuracy": 0.8080847263336182,
|
|
"num_tokens": 171297534.0,
|
|
"step": 537
|
|
},
|
|
{
|
|
"epoch": 0.5473041709053916,
|
|
"grad_norm": 0.7707831859588623,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6332,
|
|
"mean_token_accuracy": 0.8134000897407532,
|
|
"num_tokens": 171615049.0,
|
|
"step": 538
|
|
},
|
|
{
|
|
"epoch": 0.5483214649033571,
|
|
"grad_norm": 0.8062372803688049,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6126,
|
|
"mean_token_accuracy": 0.8191686868667603,
|
|
"num_tokens": 171931276.0,
|
|
"step": 539
|
|
},
|
|
{
|
|
"epoch": 0.5493387589013224,
|
|
"grad_norm": 0.8075754046440125,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6238,
|
|
"mean_token_accuracy": 0.8167059421539307,
|
|
"num_tokens": 172259334.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.5503560528992879,
|
|
"grad_norm": 0.778333306312561,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.618,
|
|
"mean_token_accuracy": 0.8180431127548218,
|
|
"num_tokens": 172580086.0,
|
|
"step": 541
|
|
},
|
|
{
|
|
"epoch": 0.5513733468972533,
|
|
"grad_norm": 0.7373014688491821,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6378,
|
|
"mean_token_accuracy": 0.8125739097595215,
|
|
"num_tokens": 172895819.0,
|
|
"step": 542
|
|
},
|
|
{
|
|
"epoch": 0.5523906408952187,
|
|
"grad_norm": 0.7623785734176636,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6158,
|
|
"mean_token_accuracy": 0.8183821439743042,
|
|
"num_tokens": 173207899.0,
|
|
"step": 543
|
|
},
|
|
{
|
|
"epoch": 0.5534079348931842,
|
|
"grad_norm": 0.7864964604377747,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6314,
|
|
"mean_token_accuracy": 0.8134520053863525,
|
|
"num_tokens": 173547290.0,
|
|
"step": 544
|
|
},
|
|
{
|
|
"epoch": 0.5544252288911495,
|
|
"grad_norm": 0.8133346438407898,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6467,
|
|
"mean_token_accuracy": 0.811774492263794,
|
|
"num_tokens": 173872148.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 0.555442522889115,
|
|
"grad_norm": 0.828346848487854,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6425,
|
|
"mean_token_accuracy": 0.8110955953598022,
|
|
"num_tokens": 174174594.0,
|
|
"step": 546
|
|
},
|
|
{
|
|
"epoch": 0.5564598168870803,
|
|
"grad_norm": 0.7909507751464844,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6289,
|
|
"mean_token_accuracy": 0.8137595057487488,
|
|
"num_tokens": 174483118.0,
|
|
"step": 547
|
|
},
|
|
{
|
|
"epoch": 0.5574771108850458,
|
|
"grad_norm": 0.7661901712417603,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6233,
|
|
"mean_token_accuracy": 0.8151018619537354,
|
|
"num_tokens": 174798555.0,
|
|
"step": 548
|
|
},
|
|
{
|
|
"epoch": 0.5584944048830112,
|
|
"grad_norm": 0.7890106439590454,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6207,
|
|
"mean_token_accuracy": 0.8171303868293762,
|
|
"num_tokens": 175130207.0,
|
|
"step": 549
|
|
},
|
|
{
|
|
"epoch": 0.5595116988809766,
|
|
"grad_norm": 0.7659645676612854,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6083,
|
|
"mean_token_accuracy": 0.8203459978103638,
|
|
"num_tokens": 175445505.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.560528992878942,
|
|
"grad_norm": 0.7757659554481506,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6364,
|
|
"mean_token_accuracy": 0.8122877478599548,
|
|
"num_tokens": 175767713.0,
|
|
"step": 551
|
|
},
|
|
{
|
|
"epoch": 0.5615462868769074,
|
|
"grad_norm": 0.7445358037948608,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.646,
|
|
"mean_token_accuracy": 0.8095182180404663,
|
|
"num_tokens": 176094548.0,
|
|
"step": 552
|
|
},
|
|
{
|
|
"epoch": 0.5625635808748728,
|
|
"grad_norm": 0.7649447321891785,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6224,
|
|
"mean_token_accuracy": 0.8160684704780579,
|
|
"num_tokens": 176419770.0,
|
|
"step": 553
|
|
},
|
|
{
|
|
"epoch": 0.5635808748728383,
|
|
"grad_norm": 0.78580242395401,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6275,
|
|
"mean_token_accuracy": 0.814705491065979,
|
|
"num_tokens": 176731375.0,
|
|
"step": 554
|
|
},
|
|
{
|
|
"epoch": 0.5645981688708036,
|
|
"grad_norm": 0.7920994162559509,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6115,
|
|
"mean_token_accuracy": 0.8193442821502686,
|
|
"num_tokens": 177037040.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 0.5656154628687691,
|
|
"grad_norm": 0.7603932023048401,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6411,
|
|
"mean_token_accuracy": 0.8114141225814819,
|
|
"num_tokens": 177353869.0,
|
|
"step": 556
|
|
},
|
|
{
|
|
"epoch": 0.5666327568667345,
|
|
"grad_norm": 0.8625222444534302,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6059,
|
|
"mean_token_accuracy": 0.8205225467681885,
|
|
"num_tokens": 177663239.0,
|
|
"step": 557
|
|
},
|
|
{
|
|
"epoch": 0.5676500508646999,
|
|
"grad_norm": 0.732420027256012,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6178,
|
|
"mean_token_accuracy": 0.8171700239181519,
|
|
"num_tokens": 177989713.0,
|
|
"step": 558
|
|
},
|
|
{
|
|
"epoch": 0.5686673448626653,
|
|
"grad_norm": 0.7737362384796143,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6318,
|
|
"mean_token_accuracy": 0.8142409324645996,
|
|
"num_tokens": 178311252.0,
|
|
"step": 559
|
|
},
|
|
{
|
|
"epoch": 0.5696846388606307,
|
|
"grad_norm": 0.8172044157981873,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6552,
|
|
"mean_token_accuracy": 0.807847261428833,
|
|
"num_tokens": 178635634.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.5707019328585962,
|
|
"grad_norm": 0.844097375869751,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6421,
|
|
"mean_token_accuracy": 0.8111605048179626,
|
|
"num_tokens": 178948373.0,
|
|
"step": 561
|
|
},
|
|
{
|
|
"epoch": 0.5717192268565615,
|
|
"grad_norm": 0.7378110885620117,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6098,
|
|
"mean_token_accuracy": 0.8191424012184143,
|
|
"num_tokens": 179265598.0,
|
|
"step": 562
|
|
},
|
|
{
|
|
"epoch": 0.572736520854527,
|
|
"grad_norm": 0.779130756855011,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6243,
|
|
"mean_token_accuracy": 0.8169468641281128,
|
|
"num_tokens": 179577100.0,
|
|
"step": 563
|
|
},
|
|
{
|
|
"epoch": 0.5737538148524923,
|
|
"grad_norm": 0.9463231563568115,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6387,
|
|
"mean_token_accuracy": 0.8117104768753052,
|
|
"num_tokens": 179892505.0,
|
|
"step": 564
|
|
},
|
|
{
|
|
"epoch": 0.5747711088504578,
|
|
"grad_norm": 0.7623103857040405,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.635,
|
|
"mean_token_accuracy": 0.8120288252830505,
|
|
"num_tokens": 180223161.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 0.5757884028484231,
|
|
"grad_norm": 0.8463129997253418,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6403,
|
|
"mean_token_accuracy": 0.8121374249458313,
|
|
"num_tokens": 180529292.0,
|
|
"step": 566
|
|
},
|
|
{
|
|
"epoch": 0.5768056968463886,
|
|
"grad_norm": 0.7645483613014221,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6161,
|
|
"mean_token_accuracy": 0.8175876140594482,
|
|
"num_tokens": 180831990.0,
|
|
"step": 567
|
|
},
|
|
{
|
|
"epoch": 0.5778229908443541,
|
|
"grad_norm": 0.7431308627128601,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6373,
|
|
"mean_token_accuracy": 0.8129169940948486,
|
|
"num_tokens": 181158179.0,
|
|
"step": 568
|
|
},
|
|
{
|
|
"epoch": 0.5788402848423194,
|
|
"grad_norm": 0.7826768159866333,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6067,
|
|
"mean_token_accuracy": 0.8198922276496887,
|
|
"num_tokens": 181465876.0,
|
|
"step": 569
|
|
},
|
|
{
|
|
"epoch": 0.5798575788402849,
|
|
"grad_norm": 0.8153584599494934,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6368,
|
|
"mean_token_accuracy": 0.811568558216095,
|
|
"num_tokens": 181772486.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.5808748728382502,
|
|
"grad_norm": 1.32663094997406,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6331,
|
|
"mean_token_accuracy": 0.8144712448120117,
|
|
"num_tokens": 182092553.0,
|
|
"step": 571
|
|
},
|
|
{
|
|
"epoch": 0.5818921668362157,
|
|
"grad_norm": 0.7690805792808533,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6395,
|
|
"mean_token_accuracy": 0.8114436268806458,
|
|
"num_tokens": 182399236.0,
|
|
"step": 572
|
|
},
|
|
{
|
|
"epoch": 0.582909460834181,
|
|
"grad_norm": 0.8244120478630066,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6425,
|
|
"mean_token_accuracy": 0.810742974281311,
|
|
"num_tokens": 182711095.0,
|
|
"step": 573
|
|
},
|
|
{
|
|
"epoch": 0.5839267548321465,
|
|
"grad_norm": 0.7508371472358704,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.656,
|
|
"mean_token_accuracy": 0.8080297112464905,
|
|
"num_tokens": 183045279.0,
|
|
"step": 574
|
|
},
|
|
{
|
|
"epoch": 0.5849440488301119,
|
|
"grad_norm": 0.7884571552276611,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6173,
|
|
"mean_token_accuracy": 0.817246675491333,
|
|
"num_tokens": 183377751.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 0.5859613428280773,
|
|
"grad_norm": 0.828568696975708,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6211,
|
|
"mean_token_accuracy": 0.8165144920349121,
|
|
"num_tokens": 183676743.0,
|
|
"step": 576
|
|
},
|
|
{
|
|
"epoch": 0.5869786368260427,
|
|
"grad_norm": 0.7600045800209045,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6187,
|
|
"mean_token_accuracy": 0.817339301109314,
|
|
"num_tokens": 184006491.0,
|
|
"step": 577
|
|
},
|
|
{
|
|
"epoch": 0.5879959308240081,
|
|
"grad_norm": 0.7350272536277771,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6152,
|
|
"mean_token_accuracy": 0.8175899386405945,
|
|
"num_tokens": 184334536.0,
|
|
"step": 578
|
|
},
|
|
{
|
|
"epoch": 0.5890132248219736,
|
|
"grad_norm": 0.7769359350204468,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6265,
|
|
"mean_token_accuracy": 0.8150594830513,
|
|
"num_tokens": 184644555.0,
|
|
"step": 579
|
|
},
|
|
{
|
|
"epoch": 0.590030518819939,
|
|
"grad_norm": 0.791875422000885,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.629,
|
|
"mean_token_accuracy": 0.8136811256408691,
|
|
"num_tokens": 184956764.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.5910478128179044,
|
|
"grad_norm": 0.7764705419540405,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6242,
|
|
"mean_token_accuracy": 0.8154654502868652,
|
|
"num_tokens": 185278874.0,
|
|
"step": 581
|
|
},
|
|
{
|
|
"epoch": 0.5920651068158698,
|
|
"grad_norm": 0.7930245995521545,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6141,
|
|
"mean_token_accuracy": 0.8177323341369629,
|
|
"num_tokens": 185603559.0,
|
|
"step": 582
|
|
},
|
|
{
|
|
"epoch": 0.5930824008138352,
|
|
"grad_norm": 0.7742440700531006,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6298,
|
|
"mean_token_accuracy": 0.8144494295120239,
|
|
"num_tokens": 185904158.0,
|
|
"step": 583
|
|
},
|
|
{
|
|
"epoch": 0.5940996948118006,
|
|
"grad_norm": 0.8462097644805908,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6132,
|
|
"mean_token_accuracy": 0.8187432885169983,
|
|
"num_tokens": 186208990.0,
|
|
"step": 584
|
|
},
|
|
{
|
|
"epoch": 0.595116988809766,
|
|
"grad_norm": 0.7964716553688049,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6499,
|
|
"mean_token_accuracy": 0.8092055320739746,
|
|
"num_tokens": 186527391.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 0.5961342828077314,
|
|
"grad_norm": 0.8105505108833313,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6131,
|
|
"mean_token_accuracy": 0.8204156756401062,
|
|
"num_tokens": 186851711.0,
|
|
"step": 586
|
|
},
|
|
{
|
|
"epoch": 0.5971515768056969,
|
|
"grad_norm": 0.7496419548988342,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6317,
|
|
"mean_token_accuracy": 0.8129917979240417,
|
|
"num_tokens": 187158410.0,
|
|
"step": 587
|
|
},
|
|
{
|
|
"epoch": 0.5981688708036622,
|
|
"grad_norm": 0.7854833602905273,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6025,
|
|
"mean_token_accuracy": 0.8220297694206238,
|
|
"num_tokens": 187492387.0,
|
|
"step": 588
|
|
},
|
|
{
|
|
"epoch": 0.5991861648016277,
|
|
"grad_norm": 0.854580819606781,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6447,
|
|
"mean_token_accuracy": 0.8100663423538208,
|
|
"num_tokens": 187815200.0,
|
|
"step": 589
|
|
},
|
|
{
|
|
"epoch": 0.6002034587995931,
|
|
"grad_norm": 0.7688719034194946,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6021,
|
|
"mean_token_accuracy": 0.8207713961601257,
|
|
"num_tokens": 188141681.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.6012207527975585,
|
|
"grad_norm": 0.7543858885765076,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6387,
|
|
"mean_token_accuracy": 0.8124486207962036,
|
|
"num_tokens": 188459816.0,
|
|
"step": 591
|
|
},
|
|
{
|
|
"epoch": 0.602238046795524,
|
|
"grad_norm": 0.7601234912872314,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.645,
|
|
"mean_token_accuracy": 0.8103123307228088,
|
|
"num_tokens": 188793110.0,
|
|
"step": 592
|
|
},
|
|
{
|
|
"epoch": 0.6032553407934893,
|
|
"grad_norm": 0.7299609780311584,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6081,
|
|
"mean_token_accuracy": 0.8188794851303101,
|
|
"num_tokens": 189123138.0,
|
|
"step": 593
|
|
},
|
|
{
|
|
"epoch": 0.6042726347914548,
|
|
"grad_norm": 0.7960324883460999,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6144,
|
|
"mean_token_accuracy": 0.8179804086685181,
|
|
"num_tokens": 189448908.0,
|
|
"step": 594
|
|
},
|
|
{
|
|
"epoch": 0.6052899287894201,
|
|
"grad_norm": 0.799144983291626,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6413,
|
|
"mean_token_accuracy": 0.8109234571456909,
|
|
"num_tokens": 189760563.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 0.6063072227873856,
|
|
"grad_norm": 0.7431643605232239,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6339,
|
|
"mean_token_accuracy": 0.8132368326187134,
|
|
"num_tokens": 190067307.0,
|
|
"step": 596
|
|
},
|
|
{
|
|
"epoch": 0.6073245167853509,
|
|
"grad_norm": 0.7618584632873535,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6206,
|
|
"mean_token_accuracy": 0.8160717487335205,
|
|
"num_tokens": 190372698.0,
|
|
"step": 597
|
|
},
|
|
{
|
|
"epoch": 0.6083418107833164,
|
|
"grad_norm": 0.7846772074699402,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6288,
|
|
"mean_token_accuracy": 0.8136916756629944,
|
|
"num_tokens": 190679521.0,
|
|
"step": 598
|
|
},
|
|
{
|
|
"epoch": 0.6093591047812817,
|
|
"grad_norm": 0.7604075074195862,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6257,
|
|
"mean_token_accuracy": 0.8160173892974854,
|
|
"num_tokens": 190990350.0,
|
|
"step": 599
|
|
},
|
|
{
|
|
"epoch": 0.6103763987792472,
|
|
"grad_norm": 0.8070312142372131,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6373,
|
|
"mean_token_accuracy": 0.812640905380249,
|
|
"num_tokens": 191298356.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.6113936927772126,
|
|
"grad_norm": 0.7849087119102478,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5958,
|
|
"mean_token_accuracy": 0.8239523768424988,
|
|
"num_tokens": 191607061.0,
|
|
"step": 601
|
|
},
|
|
{
|
|
"epoch": 0.612410986775178,
|
|
"grad_norm": 0.7357282042503357,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6175,
|
|
"mean_token_accuracy": 0.8173874616622925,
|
|
"num_tokens": 191920982.0,
|
|
"step": 602
|
|
},
|
|
{
|
|
"epoch": 0.6134282807731435,
|
|
"grad_norm": 0.7360347509384155,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6203,
|
|
"mean_token_accuracy": 0.816581130027771,
|
|
"num_tokens": 192244544.0,
|
|
"step": 603
|
|
},
|
|
{
|
|
"epoch": 0.6144455747711088,
|
|
"grad_norm": 0.7564297318458557,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6332,
|
|
"mean_token_accuracy": 0.8155977129936218,
|
|
"num_tokens": 192568952.0,
|
|
"step": 604
|
|
},
|
|
{
|
|
"epoch": 0.6154628687690743,
|
|
"grad_norm": 0.7610177993774414,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6161,
|
|
"mean_token_accuracy": 0.818304717540741,
|
|
"num_tokens": 192871340.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 0.6164801627670397,
|
|
"grad_norm": 0.7376631498336792,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6146,
|
|
"mean_token_accuracy": 0.8194389343261719,
|
|
"num_tokens": 193194025.0,
|
|
"step": 606
|
|
},
|
|
{
|
|
"epoch": 0.6174974567650051,
|
|
"grad_norm": 0.8029021620750427,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6021,
|
|
"mean_token_accuracy": 0.8219469785690308,
|
|
"num_tokens": 193505599.0,
|
|
"step": 607
|
|
},
|
|
{
|
|
"epoch": 0.6185147507629705,
|
|
"grad_norm": 0.746333122253418,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6551,
|
|
"mean_token_accuracy": 0.8087726831436157,
|
|
"num_tokens": 193846107.0,
|
|
"step": 608
|
|
},
|
|
{
|
|
"epoch": 0.6195320447609359,
|
|
"grad_norm": 0.7344440221786499,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6024,
|
|
"mean_token_accuracy": 0.8207103610038757,
|
|
"num_tokens": 194162440.0,
|
|
"step": 609
|
|
},
|
|
{
|
|
"epoch": 0.6205493387589013,
|
|
"grad_norm": 0.7551791071891785,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6188,
|
|
"mean_token_accuracy": 0.8163305521011353,
|
|
"num_tokens": 194471827.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.6215666327568667,
|
|
"grad_norm": 0.7398301959037781,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6224,
|
|
"mean_token_accuracy": 0.8153471946716309,
|
|
"num_tokens": 194798176.0,
|
|
"step": 611
|
|
},
|
|
{
|
|
"epoch": 0.6225839267548321,
|
|
"grad_norm": 0.794724702835083,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6403,
|
|
"mean_token_accuracy": 0.810512900352478,
|
|
"num_tokens": 195115010.0,
|
|
"step": 612
|
|
},
|
|
{
|
|
"epoch": 0.6236012207527976,
|
|
"grad_norm": 0.7791882753372192,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6012,
|
|
"mean_token_accuracy": 0.822370171546936,
|
|
"num_tokens": 195442166.0,
|
|
"step": 613
|
|
},
|
|
{
|
|
"epoch": 0.624618514750763,
|
|
"grad_norm": 0.7617864608764648,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5974,
|
|
"mean_token_accuracy": 0.8227953314781189,
|
|
"num_tokens": 195755945.0,
|
|
"step": 614
|
|
},
|
|
{
|
|
"epoch": 0.6256358087487284,
|
|
"grad_norm": 0.757507860660553,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6,
|
|
"mean_token_accuracy": 0.8223221302032471,
|
|
"num_tokens": 196076642.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 0.6266531027466938,
|
|
"grad_norm": 0.7687931656837463,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.632,
|
|
"mean_token_accuracy": 0.8135921955108643,
|
|
"num_tokens": 196384640.0,
|
|
"step": 616
|
|
},
|
|
{
|
|
"epoch": 0.6276703967446592,
|
|
"grad_norm": 0.7535430192947388,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6511,
|
|
"mean_token_accuracy": 0.8093615770339966,
|
|
"num_tokens": 196709416.0,
|
|
"step": 617
|
|
},
|
|
{
|
|
"epoch": 0.6286876907426246,
|
|
"grad_norm": 0.7329970598220825,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6154,
|
|
"mean_token_accuracy": 0.8180556297302246,
|
|
"num_tokens": 197038688.0,
|
|
"step": 618
|
|
},
|
|
{
|
|
"epoch": 0.62970498474059,
|
|
"grad_norm": 0.7846882343292236,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.65,
|
|
"mean_token_accuracy": 0.8091417551040649,
|
|
"num_tokens": 197361082.0,
|
|
"step": 619
|
|
},
|
|
{
|
|
"epoch": 0.6307222787385555,
|
|
"grad_norm": 0.7763864398002625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.608,
|
|
"mean_token_accuracy": 0.8196543455123901,
|
|
"num_tokens": 197654672.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.6317395727365208,
|
|
"grad_norm": 0.7358604669570923,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6206,
|
|
"mean_token_accuracy": 0.8170638084411621,
|
|
"num_tokens": 197972971.0,
|
|
"step": 621
|
|
},
|
|
{
|
|
"epoch": 0.6327568667344863,
|
|
"grad_norm": 0.7396625876426697,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5936,
|
|
"mean_token_accuracy": 0.8225463628768921,
|
|
"num_tokens": 198288270.0,
|
|
"step": 622
|
|
},
|
|
{
|
|
"epoch": 0.6337741607324516,
|
|
"grad_norm": 0.7535018920898438,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6097,
|
|
"mean_token_accuracy": 0.818221390247345,
|
|
"num_tokens": 198600564.0,
|
|
"step": 623
|
|
},
|
|
{
|
|
"epoch": 0.6347914547304171,
|
|
"grad_norm": 0.771710991859436,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5982,
|
|
"mean_token_accuracy": 0.8219149708747864,
|
|
"num_tokens": 198919816.0,
|
|
"step": 624
|
|
},
|
|
{
|
|
"epoch": 0.6358087487283826,
|
|
"grad_norm": 0.7761126160621643,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6438,
|
|
"mean_token_accuracy": 0.8097991943359375,
|
|
"num_tokens": 199234621.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 0.6368260427263479,
|
|
"grad_norm": 0.7397571206092834,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6168,
|
|
"mean_token_accuracy": 0.8178320527076721,
|
|
"num_tokens": 199581880.0,
|
|
"step": 626
|
|
},
|
|
{
|
|
"epoch": 0.6378433367243134,
|
|
"grad_norm": 0.8031889200210571,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6385,
|
|
"mean_token_accuracy": 0.8121150732040405,
|
|
"num_tokens": 199887035.0,
|
|
"step": 627
|
|
},
|
|
{
|
|
"epoch": 0.6388606307222787,
|
|
"grad_norm": 0.743266761302948,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6191,
|
|
"mean_token_accuracy": 0.8165163397789001,
|
|
"num_tokens": 200218153.0,
|
|
"step": 628
|
|
},
|
|
{
|
|
"epoch": 0.6398779247202442,
|
|
"grad_norm": 0.789598286151886,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6343,
|
|
"mean_token_accuracy": 0.814096212387085,
|
|
"num_tokens": 200538925.0,
|
|
"step": 629
|
|
},
|
|
{
|
|
"epoch": 0.6408952187182095,
|
|
"grad_norm": 0.7879233360290527,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6081,
|
|
"mean_token_accuracy": 0.8194499015808105,
|
|
"num_tokens": 200852605.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.641912512716175,
|
|
"grad_norm": 0.7725964784622192,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6239,
|
|
"mean_token_accuracy": 0.8154667615890503,
|
|
"num_tokens": 201171353.0,
|
|
"step": 631
|
|
},
|
|
{
|
|
"epoch": 0.6429298067141404,
|
|
"grad_norm": 0.7540669441223145,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6222,
|
|
"mean_token_accuracy": 0.8165876865386963,
|
|
"num_tokens": 201496911.0,
|
|
"step": 632
|
|
},
|
|
{
|
|
"epoch": 0.6439471007121058,
|
|
"grad_norm": 0.7851372957229614,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6429,
|
|
"mean_token_accuracy": 0.8111737370491028,
|
|
"num_tokens": 201822744.0,
|
|
"step": 633
|
|
},
|
|
{
|
|
"epoch": 0.6449643947100712,
|
|
"grad_norm": 0.7377258539199829,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6028,
|
|
"mean_token_accuracy": 0.8218101859092712,
|
|
"num_tokens": 202163172.0,
|
|
"step": 634
|
|
},
|
|
{
|
|
"epoch": 0.6459816887080366,
|
|
"grad_norm": 0.8279411792755127,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.621,
|
|
"mean_token_accuracy": 0.8161522150039673,
|
|
"num_tokens": 202477704.0,
|
|
"step": 635
|
|
},
|
|
{
|
|
"epoch": 0.646998982706002,
|
|
"grad_norm": 0.7455507516860962,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6541,
|
|
"mean_token_accuracy": 0.8080745935440063,
|
|
"num_tokens": 202799169.0,
|
|
"step": 636
|
|
},
|
|
{
|
|
"epoch": 0.6480162767039674,
|
|
"grad_norm": 0.7769946455955505,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6183,
|
|
"mean_token_accuracy": 0.8177330493927002,
|
|
"num_tokens": 203103019.0,
|
|
"step": 637
|
|
},
|
|
{
|
|
"epoch": 0.6490335707019329,
|
|
"grad_norm": 0.7767770290374756,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6348,
|
|
"mean_token_accuracy": 0.8127952218055725,
|
|
"num_tokens": 203417686.0,
|
|
"step": 638
|
|
},
|
|
{
|
|
"epoch": 0.6500508646998983,
|
|
"grad_norm": 0.7546711564064026,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6318,
|
|
"mean_token_accuracy": 0.813634991645813,
|
|
"num_tokens": 203737547.0,
|
|
"step": 639
|
|
},
|
|
{
|
|
"epoch": 0.6510681586978637,
|
|
"grad_norm": 0.7748486995697021,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6217,
|
|
"mean_token_accuracy": 0.8149056434631348,
|
|
"num_tokens": 204040525.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.6520854526958291,
|
|
"grad_norm": 0.7774174213409424,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6058,
|
|
"mean_token_accuracy": 0.8201233744621277,
|
|
"num_tokens": 204371623.0,
|
|
"step": 641
|
|
},
|
|
{
|
|
"epoch": 0.6531027466937945,
|
|
"grad_norm": 0.8860936164855957,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6045,
|
|
"mean_token_accuracy": 0.8205939531326294,
|
|
"num_tokens": 204695860.0,
|
|
"step": 642
|
|
},
|
|
{
|
|
"epoch": 0.6541200406917599,
|
|
"grad_norm": 0.7313113212585449,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5968,
|
|
"mean_token_accuracy": 0.8227372169494629,
|
|
"num_tokens": 205022471.0,
|
|
"step": 643
|
|
},
|
|
{
|
|
"epoch": 0.6551373346897253,
|
|
"grad_norm": 0.8307287096977234,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6163,
|
|
"mean_token_accuracy": 0.8170334100723267,
|
|
"num_tokens": 205338248.0,
|
|
"step": 644
|
|
},
|
|
{
|
|
"epoch": 0.6561546286876907,
|
|
"grad_norm": 0.7945787310600281,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6379,
|
|
"mean_token_accuracy": 0.810707688331604,
|
|
"num_tokens": 205654689.0,
|
|
"step": 645
|
|
},
|
|
{
|
|
"epoch": 0.6571719226856562,
|
|
"grad_norm": 0.7601882219314575,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6534,
|
|
"mean_token_accuracy": 0.8086690902709961,
|
|
"num_tokens": 205984399.0,
|
|
"step": 646
|
|
},
|
|
{
|
|
"epoch": 0.6581892166836215,
|
|
"grad_norm": 0.7996160387992859,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6397,
|
|
"mean_token_accuracy": 0.8113598227500916,
|
|
"num_tokens": 206308245.0,
|
|
"step": 647
|
|
},
|
|
{
|
|
"epoch": 0.659206510681587,
|
|
"grad_norm": 0.7633128762245178,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6068,
|
|
"mean_token_accuracy": 0.8195191621780396,
|
|
"num_tokens": 206613022.0,
|
|
"step": 648
|
|
},
|
|
{
|
|
"epoch": 0.6602238046795524,
|
|
"grad_norm": 0.7742624878883362,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6328,
|
|
"mean_token_accuracy": 0.8122345209121704,
|
|
"num_tokens": 206935916.0,
|
|
"step": 649
|
|
},
|
|
{
|
|
"epoch": 0.6612410986775178,
|
|
"grad_norm": 0.7525912523269653,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5972,
|
|
"mean_token_accuracy": 0.823207437992096,
|
|
"num_tokens": 207249857.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.6622583926754833,
|
|
"grad_norm": 0.7381570935249329,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6392,
|
|
"mean_token_accuracy": 0.8117334842681885,
|
|
"num_tokens": 207585946.0,
|
|
"step": 651
|
|
},
|
|
{
|
|
"epoch": 0.6632756866734486,
|
|
"grad_norm": 0.7533280849456787,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5878,
|
|
"mean_token_accuracy": 0.8249242901802063,
|
|
"num_tokens": 207905907.0,
|
|
"step": 652
|
|
},
|
|
{
|
|
"epoch": 0.6642929806714141,
|
|
"grad_norm": 0.7347438931465149,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5935,
|
|
"mean_token_accuracy": 0.8232535123825073,
|
|
"num_tokens": 208221915.0,
|
|
"step": 653
|
|
},
|
|
{
|
|
"epoch": 0.6653102746693794,
|
|
"grad_norm": 0.8024598360061646,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6038,
|
|
"mean_token_accuracy": 0.8214566707611084,
|
|
"num_tokens": 208541953.0,
|
|
"step": 654
|
|
},
|
|
{
|
|
"epoch": 0.6663275686673449,
|
|
"grad_norm": 0.7610267400741577,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6102,
|
|
"mean_token_accuracy": 0.819033145904541,
|
|
"num_tokens": 208855549.0,
|
|
"step": 655
|
|
},
|
|
{
|
|
"epoch": 0.6673448626653102,
|
|
"grad_norm": 0.7803341746330261,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6346,
|
|
"mean_token_accuracy": 0.8129167556762695,
|
|
"num_tokens": 209167815.0,
|
|
"step": 656
|
|
},
|
|
{
|
|
"epoch": 0.6683621566632757,
|
|
"grad_norm": 0.9436068534851074,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6464,
|
|
"mean_token_accuracy": 0.8097710013389587,
|
|
"num_tokens": 209506826.0,
|
|
"step": 657
|
|
},
|
|
{
|
|
"epoch": 0.669379450661241,
|
|
"grad_norm": 0.7793200612068176,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5991,
|
|
"mean_token_accuracy": 0.8215120434761047,
|
|
"num_tokens": 209810131.0,
|
|
"step": 658
|
|
},
|
|
{
|
|
"epoch": 0.6703967446592065,
|
|
"grad_norm": 0.7526964545249939,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6149,
|
|
"mean_token_accuracy": 0.8179228901863098,
|
|
"num_tokens": 210123131.0,
|
|
"step": 659
|
|
},
|
|
{
|
|
"epoch": 0.671414038657172,
|
|
"grad_norm": 0.7783557772636414,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6143,
|
|
"mean_token_accuracy": 0.8184784650802612,
|
|
"num_tokens": 210444300.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.6724313326551373,
|
|
"grad_norm": 0.7479385733604431,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6148,
|
|
"mean_token_accuracy": 0.816991925239563,
|
|
"num_tokens": 210771386.0,
|
|
"step": 661
|
|
},
|
|
{
|
|
"epoch": 0.6734486266531028,
|
|
"grad_norm": 0.7698660492897034,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6198,
|
|
"mean_token_accuracy": 0.8170925974845886,
|
|
"num_tokens": 211073799.0,
|
|
"step": 662
|
|
},
|
|
{
|
|
"epoch": 0.6744659206510681,
|
|
"grad_norm": 0.7108475565910339,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6239,
|
|
"mean_token_accuracy": 0.816068172454834,
|
|
"num_tokens": 211407370.0,
|
|
"step": 663
|
|
},
|
|
{
|
|
"epoch": 0.6754832146490336,
|
|
"grad_norm": 0.780982255935669,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6107,
|
|
"mean_token_accuracy": 0.8190521001815796,
|
|
"num_tokens": 211739189.0,
|
|
"step": 664
|
|
},
|
|
{
|
|
"epoch": 0.676500508646999,
|
|
"grad_norm": 0.7685346007347107,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6075,
|
|
"mean_token_accuracy": 0.8201148509979248,
|
|
"num_tokens": 212047699.0,
|
|
"step": 665
|
|
},
|
|
{
|
|
"epoch": 0.6775178026449644,
|
|
"grad_norm": 0.7723684310913086,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6275,
|
|
"mean_token_accuracy": 0.8142590522766113,
|
|
"num_tokens": 212350768.0,
|
|
"step": 666
|
|
},
|
|
{
|
|
"epoch": 0.6785350966429298,
|
|
"grad_norm": 0.8147526979446411,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6257,
|
|
"mean_token_accuracy": 0.8146811723709106,
|
|
"num_tokens": 212656414.0,
|
|
"step": 667
|
|
},
|
|
{
|
|
"epoch": 0.6795523906408952,
|
|
"grad_norm": 0.7926346659660339,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6154,
|
|
"mean_token_accuracy": 0.8165136575698853,
|
|
"num_tokens": 212970630.0,
|
|
"step": 668
|
|
},
|
|
{
|
|
"epoch": 0.6805696846388606,
|
|
"grad_norm": 0.7599271535873413,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6187,
|
|
"mean_token_accuracy": 0.8165913820266724,
|
|
"num_tokens": 213296625.0,
|
|
"step": 669
|
|
},
|
|
{
|
|
"epoch": 0.681586978636826,
|
|
"grad_norm": 0.7710536122322083,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5976,
|
|
"mean_token_accuracy": 0.8237712383270264,
|
|
"num_tokens": 213608058.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.6826042726347915,
|
|
"grad_norm": 0.754758894443512,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6074,
|
|
"mean_token_accuracy": 0.8195925951004028,
|
|
"num_tokens": 213919287.0,
|
|
"step": 671
|
|
},
|
|
{
|
|
"epoch": 0.6836215666327569,
|
|
"grad_norm": 0.7790162563323975,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6104,
|
|
"mean_token_accuracy": 0.8195450305938721,
|
|
"num_tokens": 214248836.0,
|
|
"step": 672
|
|
},
|
|
{
|
|
"epoch": 0.6846388606307223,
|
|
"grad_norm": 0.754189133644104,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6124,
|
|
"mean_token_accuracy": 0.8181887865066528,
|
|
"num_tokens": 214578559.0,
|
|
"step": 673
|
|
},
|
|
{
|
|
"epoch": 0.6856561546286877,
|
|
"grad_norm": 0.7615801692008972,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6231,
|
|
"mean_token_accuracy": 0.8142949342727661,
|
|
"num_tokens": 214902909.0,
|
|
"step": 674
|
|
},
|
|
{
|
|
"epoch": 0.6866734486266531,
|
|
"grad_norm": 0.8176745772361755,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6138,
|
|
"mean_token_accuracy": 0.817600667476654,
|
|
"num_tokens": 215220787.0,
|
|
"step": 675
|
|
},
|
|
{
|
|
"epoch": 0.6876907426246185,
|
|
"grad_norm": 0.7809671759605408,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.633,
|
|
"mean_token_accuracy": 0.8143289089202881,
|
|
"num_tokens": 215532596.0,
|
|
"step": 676
|
|
},
|
|
{
|
|
"epoch": 0.688708036622584,
|
|
"grad_norm": 0.7892170548439026,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6267,
|
|
"mean_token_accuracy": 0.8146532773971558,
|
|
"num_tokens": 215854411.0,
|
|
"step": 677
|
|
},
|
|
{
|
|
"epoch": 0.6897253306205493,
|
|
"grad_norm": 0.7844973802566528,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6125,
|
|
"mean_token_accuracy": 0.8172857761383057,
|
|
"num_tokens": 216176025.0,
|
|
"step": 678
|
|
},
|
|
{
|
|
"epoch": 0.6907426246185148,
|
|
"grad_norm": 0.7520744204521179,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6075,
|
|
"mean_token_accuracy": 0.819256603717804,
|
|
"num_tokens": 216490560.0,
|
|
"step": 679
|
|
},
|
|
{
|
|
"epoch": 0.6917599186164801,
|
|
"grad_norm": 0.8263868689537048,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6214,
|
|
"mean_token_accuracy": 0.8156100511550903,
|
|
"num_tokens": 216788892.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.6927772126144456,
|
|
"grad_norm": 0.7588439583778381,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6057,
|
|
"mean_token_accuracy": 0.8199942111968994,
|
|
"num_tokens": 217091789.0,
|
|
"step": 681
|
|
},
|
|
{
|
|
"epoch": 0.6937945066124109,
|
|
"grad_norm": 0.7323412299156189,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6175,
|
|
"mean_token_accuracy": 0.8157312273979187,
|
|
"num_tokens": 217391649.0,
|
|
"step": 682
|
|
},
|
|
{
|
|
"epoch": 0.6948118006103764,
|
|
"grad_norm": 0.8044806122779846,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6027,
|
|
"mean_token_accuracy": 0.8197875022888184,
|
|
"num_tokens": 217705676.0,
|
|
"step": 683
|
|
},
|
|
{
|
|
"epoch": 0.6958290946083419,
|
|
"grad_norm": 0.7340426445007324,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6376,
|
|
"mean_token_accuracy": 0.8123747706413269,
|
|
"num_tokens": 218034192.0,
|
|
"step": 684
|
|
},
|
|
{
|
|
"epoch": 0.6968463886063072,
|
|
"grad_norm": 0.7277271747589111,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6173,
|
|
"mean_token_accuracy": 0.8182982206344604,
|
|
"num_tokens": 218361880.0,
|
|
"step": 685
|
|
},
|
|
{
|
|
"epoch": 0.6978636826042727,
|
|
"grad_norm": 0.7453147172927856,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6026,
|
|
"mean_token_accuracy": 0.8207399845123291,
|
|
"num_tokens": 218677170.0,
|
|
"step": 686
|
|
},
|
|
{
|
|
"epoch": 0.698880976602238,
|
|
"grad_norm": 0.7955684661865234,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6152,
|
|
"mean_token_accuracy": 0.8173407316207886,
|
|
"num_tokens": 218990542.0,
|
|
"step": 687
|
|
},
|
|
{
|
|
"epoch": 0.6998982706002035,
|
|
"grad_norm": 0.7468408346176147,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6128,
|
|
"mean_token_accuracy": 0.8189893960952759,
|
|
"num_tokens": 219320664.0,
|
|
"step": 688
|
|
},
|
|
{
|
|
"epoch": 0.7009155645981688,
|
|
"grad_norm": 0.7514234781265259,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6228,
|
|
"mean_token_accuracy": 0.8156855702400208,
|
|
"num_tokens": 219643460.0,
|
|
"step": 689
|
|
},
|
|
{
|
|
"epoch": 0.7019328585961343,
|
|
"grad_norm": 0.7620576024055481,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6174,
|
|
"mean_token_accuracy": 0.8172132968902588,
|
|
"num_tokens": 219953874.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.7029501525940997,
|
|
"grad_norm": 0.754711389541626,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5951,
|
|
"mean_token_accuracy": 0.8232397437095642,
|
|
"num_tokens": 220272230.0,
|
|
"step": 691
|
|
},
|
|
{
|
|
"epoch": 0.7039674465920651,
|
|
"grad_norm": 0.8804978728294373,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.617,
|
|
"mean_token_accuracy": 0.8175148963928223,
|
|
"num_tokens": 220588939.0,
|
|
"step": 692
|
|
},
|
|
{
|
|
"epoch": 0.7049847405900305,
|
|
"grad_norm": 0.776063859462738,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.604,
|
|
"mean_token_accuracy": 0.8212346434593201,
|
|
"num_tokens": 220916843.0,
|
|
"step": 693
|
|
},
|
|
{
|
|
"epoch": 0.7060020345879959,
|
|
"grad_norm": 0.7388089299201965,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6415,
|
|
"mean_token_accuracy": 0.8113117218017578,
|
|
"num_tokens": 221258649.0,
|
|
"step": 694
|
|
},
|
|
{
|
|
"epoch": 0.7070193285859614,
|
|
"grad_norm": 0.7917898893356323,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.627,
|
|
"mean_token_accuracy": 0.8141858577728271,
|
|
"num_tokens": 221583215.0,
|
|
"step": 695
|
|
},
|
|
{
|
|
"epoch": 0.7080366225839267,
|
|
"grad_norm": 0.7290518283843994,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6346,
|
|
"mean_token_accuracy": 0.8122216463088989,
|
|
"num_tokens": 221923372.0,
|
|
"step": 696
|
|
},
|
|
{
|
|
"epoch": 0.7090539165818922,
|
|
"grad_norm": 0.7753685116767883,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6005,
|
|
"mean_token_accuracy": 0.8205819129943848,
|
|
"num_tokens": 222233287.0,
|
|
"step": 697
|
|
},
|
|
{
|
|
"epoch": 0.7100712105798576,
|
|
"grad_norm": 0.7622568607330322,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.628,
|
|
"mean_token_accuracy": 0.8145500421524048,
|
|
"num_tokens": 222549595.0,
|
|
"step": 698
|
|
},
|
|
{
|
|
"epoch": 0.711088504577823,
|
|
"grad_norm": 0.7664331793785095,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6203,
|
|
"mean_token_accuracy": 0.8161635398864746,
|
|
"num_tokens": 222873104.0,
|
|
"step": 699
|
|
},
|
|
{
|
|
"epoch": 0.7121057985757884,
|
|
"grad_norm": 0.7780277729034424,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6132,
|
|
"mean_token_accuracy": 0.8182451725006104,
|
|
"num_tokens": 223175189.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.7131230925737538,
|
|
"grad_norm": 0.7432042956352234,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6133,
|
|
"mean_token_accuracy": 0.8169593811035156,
|
|
"num_tokens": 223499553.0,
|
|
"step": 701
|
|
},
|
|
{
|
|
"epoch": 0.7141403865717192,
|
|
"grad_norm": 0.7784262299537659,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6085,
|
|
"mean_token_accuracy": 0.8182696104049683,
|
|
"num_tokens": 223808765.0,
|
|
"step": 702
|
|
},
|
|
{
|
|
"epoch": 0.7151576805696847,
|
|
"grad_norm": 0.7821851372718811,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5882,
|
|
"mean_token_accuracy": 0.8244631290435791,
|
|
"num_tokens": 224133481.0,
|
|
"step": 703
|
|
},
|
|
{
|
|
"epoch": 0.71617497456765,
|
|
"grad_norm": 0.7383291125297546,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6189,
|
|
"mean_token_accuracy": 0.8167239427566528,
|
|
"num_tokens": 224447298.0,
|
|
"step": 704
|
|
},
|
|
{
|
|
"epoch": 0.7171922685656155,
|
|
"grad_norm": 0.7307918071746826,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6119,
|
|
"mean_token_accuracy": 0.8202638626098633,
|
|
"num_tokens": 224757199.0,
|
|
"step": 705
|
|
},
|
|
{
|
|
"epoch": 0.7182095625635809,
|
|
"grad_norm": 0.779835045337677,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6112,
|
|
"mean_token_accuracy": 0.8192306756973267,
|
|
"num_tokens": 225078638.0,
|
|
"step": 706
|
|
},
|
|
{
|
|
"epoch": 0.7192268565615463,
|
|
"grad_norm": 0.7415493130683899,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6086,
|
|
"mean_token_accuracy": 0.8193663954734802,
|
|
"num_tokens": 225402143.0,
|
|
"step": 707
|
|
},
|
|
{
|
|
"epoch": 0.7202441505595117,
|
|
"grad_norm": 0.7614285349845886,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6148,
|
|
"mean_token_accuracy": 0.8191728591918945,
|
|
"num_tokens": 225720513.0,
|
|
"step": 708
|
|
},
|
|
{
|
|
"epoch": 0.7212614445574771,
|
|
"grad_norm": 0.7944474220275879,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6213,
|
|
"mean_token_accuracy": 0.8146188259124756,
|
|
"num_tokens": 226039357.0,
|
|
"step": 709
|
|
},
|
|
{
|
|
"epoch": 0.7222787385554426,
|
|
"grad_norm": 0.8491767644882202,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6069,
|
|
"mean_token_accuracy": 0.8203014135360718,
|
|
"num_tokens": 226335920.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.7232960325534079,
|
|
"grad_norm": 0.8166708946228027,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6039,
|
|
"mean_token_accuracy": 0.8206759691238403,
|
|
"num_tokens": 226657995.0,
|
|
"step": 711
|
|
},
|
|
{
|
|
"epoch": 0.7243133265513734,
|
|
"grad_norm": 0.7617036700248718,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6233,
|
|
"mean_token_accuracy": 0.8158437013626099,
|
|
"num_tokens": 226982248.0,
|
|
"step": 712
|
|
},
|
|
{
|
|
"epoch": 0.7253306205493387,
|
|
"grad_norm": 0.8162404894828796,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6104,
|
|
"mean_token_accuracy": 0.8187158703804016,
|
|
"num_tokens": 227274613.0,
|
|
"step": 713
|
|
},
|
|
{
|
|
"epoch": 0.7263479145473042,
|
|
"grad_norm": 0.9296181797981262,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6193,
|
|
"mean_token_accuracy": 0.816686749458313,
|
|
"num_tokens": 227605134.0,
|
|
"step": 714
|
|
},
|
|
{
|
|
"epoch": 0.7273652085452695,
|
|
"grad_norm": 0.8144844770431519,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6344,
|
|
"mean_token_accuracy": 0.8128677606582642,
|
|
"num_tokens": 227922627.0,
|
|
"step": 715
|
|
},
|
|
{
|
|
"epoch": 0.728382502543235,
|
|
"grad_norm": 0.7691407799720764,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6137,
|
|
"mean_token_accuracy": 0.8177823424339294,
|
|
"num_tokens": 228242141.0,
|
|
"step": 716
|
|
},
|
|
{
|
|
"epoch": 0.7293997965412004,
|
|
"grad_norm": 0.839131236076355,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6028,
|
|
"mean_token_accuracy": 0.8203939199447632,
|
|
"num_tokens": 228545473.0,
|
|
"step": 717
|
|
},
|
|
{
|
|
"epoch": 0.7304170905391658,
|
|
"grad_norm": 0.7813650965690613,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6324,
|
|
"mean_token_accuracy": 0.813715934753418,
|
|
"num_tokens": 228884216.0,
|
|
"step": 718
|
|
},
|
|
{
|
|
"epoch": 0.7314343845371313,
|
|
"grad_norm": 0.7858477234840393,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6444,
|
|
"mean_token_accuracy": 0.8113440275192261,
|
|
"num_tokens": 229181920.0,
|
|
"step": 719
|
|
},
|
|
{
|
|
"epoch": 0.7324516785350966,
|
|
"grad_norm": 0.7855584025382996,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6175,
|
|
"mean_token_accuracy": 0.8171951770782471,
|
|
"num_tokens": 229505884.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.7334689725330621,
|
|
"grad_norm": 0.7636620998382568,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.608,
|
|
"mean_token_accuracy": 0.8192106485366821,
|
|
"num_tokens": 229844345.0,
|
|
"step": 721
|
|
},
|
|
{
|
|
"epoch": 0.7344862665310274,
|
|
"grad_norm": 0.7640539407730103,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6283,
|
|
"mean_token_accuracy": 0.8148088455200195,
|
|
"num_tokens": 230160760.0,
|
|
"step": 722
|
|
},
|
|
{
|
|
"epoch": 0.7355035605289929,
|
|
"grad_norm": 0.7512305378913879,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6107,
|
|
"mean_token_accuracy": 0.8187347650527954,
|
|
"num_tokens": 230480542.0,
|
|
"step": 723
|
|
},
|
|
{
|
|
"epoch": 0.7365208545269583,
|
|
"grad_norm": 0.769347608089447,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6095,
|
|
"mean_token_accuracy": 0.8190985918045044,
|
|
"num_tokens": 230793950.0,
|
|
"step": 724
|
|
},
|
|
{
|
|
"epoch": 0.7375381485249237,
|
|
"grad_norm": 0.7874099016189575,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6312,
|
|
"mean_token_accuracy": 0.8138229250907898,
|
|
"num_tokens": 231116995.0,
|
|
"step": 725
|
|
},
|
|
{
|
|
"epoch": 0.7385554425228891,
|
|
"grad_norm": 0.7738003730773926,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5944,
|
|
"mean_token_accuracy": 0.8237497806549072,
|
|
"num_tokens": 231446504.0,
|
|
"step": 726
|
|
},
|
|
{
|
|
"epoch": 0.7395727365208545,
|
|
"grad_norm": 0.7526485919952393,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6249,
|
|
"mean_token_accuracy": 0.815697968006134,
|
|
"num_tokens": 231772874.0,
|
|
"step": 727
|
|
},
|
|
{
|
|
"epoch": 0.7405900305188199,
|
|
"grad_norm": 0.76657634973526,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6153,
|
|
"mean_token_accuracy": 0.8176850080490112,
|
|
"num_tokens": 232087611.0,
|
|
"step": 728
|
|
},
|
|
{
|
|
"epoch": 0.7416073245167853,
|
|
"grad_norm": 0.794198215007782,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6284,
|
|
"mean_token_accuracy": 0.8136752843856812,
|
|
"num_tokens": 232405181.0,
|
|
"step": 729
|
|
},
|
|
{
|
|
"epoch": 0.7426246185147508,
|
|
"grad_norm": 0.7804276347160339,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6273,
|
|
"mean_token_accuracy": 0.8142085075378418,
|
|
"num_tokens": 232730814.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.7436419125127162,
|
|
"grad_norm": 0.7470871210098267,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6036,
|
|
"mean_token_accuracy": 0.8203924894332886,
|
|
"num_tokens": 233062413.0,
|
|
"step": 731
|
|
},
|
|
{
|
|
"epoch": 0.7446592065106816,
|
|
"grad_norm": 0.7899121642112732,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6003,
|
|
"mean_token_accuracy": 0.8206931352615356,
|
|
"num_tokens": 233382314.0,
|
|
"step": 732
|
|
},
|
|
{
|
|
"epoch": 0.745676500508647,
|
|
"grad_norm": 0.7328281402587891,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6071,
|
|
"mean_token_accuracy": 0.8203123807907104,
|
|
"num_tokens": 233714244.0,
|
|
"step": 733
|
|
},
|
|
{
|
|
"epoch": 0.7466937945066124,
|
|
"grad_norm": 0.7478933930397034,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5988,
|
|
"mean_token_accuracy": 0.822023868560791,
|
|
"num_tokens": 234028924.0,
|
|
"step": 734
|
|
},
|
|
{
|
|
"epoch": 0.7477110885045778,
|
|
"grad_norm": 0.7476601600646973,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5942,
|
|
"mean_token_accuracy": 0.8236653804779053,
|
|
"num_tokens": 234352052.0,
|
|
"step": 735
|
|
},
|
|
{
|
|
"epoch": 0.7487283825025433,
|
|
"grad_norm": 0.7471521496772766,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6122,
|
|
"mean_token_accuracy": 0.8166206479072571,
|
|
"num_tokens": 234685164.0,
|
|
"step": 736
|
|
},
|
|
{
|
|
"epoch": 0.7497456765005086,
|
|
"grad_norm": 0.7760440111160278,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5881,
|
|
"mean_token_accuracy": 0.82477867603302,
|
|
"num_tokens": 234999779.0,
|
|
"step": 737
|
|
},
|
|
{
|
|
"epoch": 0.7507629704984741,
|
|
"grad_norm": 0.7630813717842102,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6705,
|
|
"mean_token_accuracy": 0.803142786026001,
|
|
"num_tokens": 235319603.0,
|
|
"step": 738
|
|
},
|
|
{
|
|
"epoch": 0.7517802644964394,
|
|
"grad_norm": 0.7749915719032288,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6098,
|
|
"mean_token_accuracy": 0.8185386657714844,
|
|
"num_tokens": 235635662.0,
|
|
"step": 739
|
|
},
|
|
{
|
|
"epoch": 0.7527975584944049,
|
|
"grad_norm": 0.8760622143745422,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6097,
|
|
"mean_token_accuracy": 0.8190468549728394,
|
|
"num_tokens": 235935563.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.7538148524923703,
|
|
"grad_norm": 0.7899062633514404,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6127,
|
|
"mean_token_accuracy": 0.8181654810905457,
|
|
"num_tokens": 236247865.0,
|
|
"step": 741
|
|
},
|
|
{
|
|
"epoch": 0.7548321464903357,
|
|
"grad_norm": 0.7470571994781494,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6005,
|
|
"mean_token_accuracy": 0.822014570236206,
|
|
"num_tokens": 236548474.0,
|
|
"step": 742
|
|
},
|
|
{
|
|
"epoch": 0.7558494404883012,
|
|
"grad_norm": 0.8113415837287903,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6059,
|
|
"mean_token_accuracy": 0.8192257881164551,
|
|
"num_tokens": 236859308.0,
|
|
"step": 743
|
|
},
|
|
{
|
|
"epoch": 0.7568667344862665,
|
|
"grad_norm": 0.7829393744468689,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6062,
|
|
"mean_token_accuracy": 0.8189871907234192,
|
|
"num_tokens": 237170473.0,
|
|
"step": 744
|
|
},
|
|
{
|
|
"epoch": 0.757884028484232,
|
|
"grad_norm": 0.8226320147514343,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5902,
|
|
"mean_token_accuracy": 0.8234495520591736,
|
|
"num_tokens": 237483382.0,
|
|
"step": 745
|
|
},
|
|
{
|
|
"epoch": 0.7589013224821973,
|
|
"grad_norm": 0.7683728933334351,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.605,
|
|
"mean_token_accuracy": 0.8195022940635681,
|
|
"num_tokens": 237809071.0,
|
|
"step": 746
|
|
},
|
|
{
|
|
"epoch": 0.7599186164801628,
|
|
"grad_norm": 0.7355079054832458,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6232,
|
|
"mean_token_accuracy": 0.8160339593887329,
|
|
"num_tokens": 238146507.0,
|
|
"step": 747
|
|
},
|
|
{
|
|
"epoch": 0.7609359104781281,
|
|
"grad_norm": 0.7261055111885071,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5939,
|
|
"mean_token_accuracy": 0.8235647678375244,
|
|
"num_tokens": 238483591.0,
|
|
"step": 748
|
|
},
|
|
{
|
|
"epoch": 0.7619532044760936,
|
|
"grad_norm": 0.8172787427902222,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5934,
|
|
"mean_token_accuracy": 0.8236143589019775,
|
|
"num_tokens": 238793523.0,
|
|
"step": 749
|
|
},
|
|
{
|
|
"epoch": 0.762970498474059,
|
|
"grad_norm": 0.7747340202331543,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5968,
|
|
"mean_token_accuracy": 0.8225404024124146,
|
|
"num_tokens": 239112968.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.7639877924720244,
|
|
"grad_norm": 0.7910871505737305,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6176,
|
|
"mean_token_accuracy": 0.8168953657150269,
|
|
"num_tokens": 239429305.0,
|
|
"step": 751
|
|
},
|
|
{
|
|
"epoch": 0.7650050864699899,
|
|
"grad_norm": 0.7941386699676514,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6074,
|
|
"mean_token_accuracy": 0.8191172480583191,
|
|
"num_tokens": 239745054.0,
|
|
"step": 752
|
|
},
|
|
{
|
|
"epoch": 0.7660223804679552,
|
|
"grad_norm": 0.7600555419921875,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6005,
|
|
"mean_token_accuracy": 0.8215411901473999,
|
|
"num_tokens": 240051112.0,
|
|
"step": 753
|
|
},
|
|
{
|
|
"epoch": 0.7670396744659207,
|
|
"grad_norm": 0.7864240407943726,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5883,
|
|
"mean_token_accuracy": 0.8241608142852783,
|
|
"num_tokens": 240373170.0,
|
|
"step": 754
|
|
},
|
|
{
|
|
"epoch": 0.768056968463886,
|
|
"grad_norm": 0.7831670641899109,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6217,
|
|
"mean_token_accuracy": 0.8154398798942566,
|
|
"num_tokens": 240668912.0,
|
|
"step": 755
|
|
},
|
|
{
|
|
"epoch": 0.7690742624618515,
|
|
"grad_norm": 0.7375035881996155,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5965,
|
|
"mean_token_accuracy": 0.822466254234314,
|
|
"num_tokens": 240996115.0,
|
|
"step": 756
|
|
},
|
|
{
|
|
"epoch": 0.7700915564598169,
|
|
"grad_norm": 0.7203482985496521,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5924,
|
|
"mean_token_accuracy": 0.8243117332458496,
|
|
"num_tokens": 241313175.0,
|
|
"step": 757
|
|
},
|
|
{
|
|
"epoch": 0.7711088504577823,
|
|
"grad_norm": 0.7840688228607178,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6212,
|
|
"mean_token_accuracy": 0.8159429430961609,
|
|
"num_tokens": 241631176.0,
|
|
"step": 758
|
|
},
|
|
{
|
|
"epoch": 0.7721261444557477,
|
|
"grad_norm": 0.7749730944633484,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6167,
|
|
"mean_token_accuracy": 0.8183140158653259,
|
|
"num_tokens": 241954029.0,
|
|
"step": 759
|
|
},
|
|
{
|
|
"epoch": 0.7731434384537131,
|
|
"grad_norm": 0.7695093154907227,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6401,
|
|
"mean_token_accuracy": 0.8105282783508301,
|
|
"num_tokens": 242276624.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.7741607324516785,
|
|
"grad_norm": 0.8038228750228882,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6018,
|
|
"mean_token_accuracy": 0.8202804327011108,
|
|
"num_tokens": 242576931.0,
|
|
"step": 761
|
|
},
|
|
{
|
|
"epoch": 0.775178026449644,
|
|
"grad_norm": 0.7450258135795593,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6079,
|
|
"mean_token_accuracy": 0.8187384605407715,
|
|
"num_tokens": 242882978.0,
|
|
"step": 762
|
|
},
|
|
{
|
|
"epoch": 0.7761953204476093,
|
|
"grad_norm": 0.7412680387496948,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6064,
|
|
"mean_token_accuracy": 0.8196582198143005,
|
|
"num_tokens": 243206785.0,
|
|
"step": 763
|
|
},
|
|
{
|
|
"epoch": 0.7772126144455748,
|
|
"grad_norm": 0.7654978036880493,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5988,
|
|
"mean_token_accuracy": 0.8214221000671387,
|
|
"num_tokens": 243512602.0,
|
|
"step": 764
|
|
},
|
|
{
|
|
"epoch": 0.7782299084435402,
|
|
"grad_norm": 0.7713125348091125,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.603,
|
|
"mean_token_accuracy": 0.8209126591682434,
|
|
"num_tokens": 243845125.0,
|
|
"step": 765
|
|
},
|
|
{
|
|
"epoch": 0.7792472024415056,
|
|
"grad_norm": 0.7866131663322449,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5972,
|
|
"mean_token_accuracy": 0.8228663206100464,
|
|
"num_tokens": 244139431.0,
|
|
"step": 766
|
|
},
|
|
{
|
|
"epoch": 0.780264496439471,
|
|
"grad_norm": 0.7617781162261963,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.616,
|
|
"mean_token_accuracy": 0.8174664378166199,
|
|
"num_tokens": 244456341.0,
|
|
"step": 767
|
|
},
|
|
{
|
|
"epoch": 0.7812817904374364,
|
|
"grad_norm": 0.7536523938179016,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6066,
|
|
"mean_token_accuracy": 0.819066047668457,
|
|
"num_tokens": 244763392.0,
|
|
"step": 768
|
|
},
|
|
{
|
|
"epoch": 0.7822990844354019,
|
|
"grad_norm": 0.739930272102356,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6127,
|
|
"mean_token_accuracy": 0.8170568943023682,
|
|
"num_tokens": 245082597.0,
|
|
"step": 769
|
|
},
|
|
{
|
|
"epoch": 0.7833163784333672,
|
|
"grad_norm": 0.7585535049438477,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6179,
|
|
"mean_token_accuracy": 0.815958559513092,
|
|
"num_tokens": 245379953.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.7843336724313327,
|
|
"grad_norm": 0.7480993866920471,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5959,
|
|
"mean_token_accuracy": 0.8213130235671997,
|
|
"num_tokens": 245696677.0,
|
|
"step": 771
|
|
},
|
|
{
|
|
"epoch": 0.785350966429298,
|
|
"grad_norm": 0.7671734690666199,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6148,
|
|
"mean_token_accuracy": 0.8170229196548462,
|
|
"num_tokens": 246030786.0,
|
|
"step": 772
|
|
},
|
|
{
|
|
"epoch": 0.7863682604272635,
|
|
"grad_norm": 0.7891989350318909,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.59,
|
|
"mean_token_accuracy": 0.8221583366394043,
|
|
"num_tokens": 246332311.0,
|
|
"step": 773
|
|
},
|
|
{
|
|
"epoch": 0.7873855544252288,
|
|
"grad_norm": 0.7767341136932373,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6052,
|
|
"mean_token_accuracy": 0.8198896646499634,
|
|
"num_tokens": 246646398.0,
|
|
"step": 774
|
|
},
|
|
{
|
|
"epoch": 0.7884028484231943,
|
|
"grad_norm": 0.7895208597183228,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5876,
|
|
"mean_token_accuracy": 0.8229328989982605,
|
|
"num_tokens": 246957754.0,
|
|
"step": 775
|
|
},
|
|
{
|
|
"epoch": 0.7894201424211598,
|
|
"grad_norm": 0.7392787337303162,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5642,
|
|
"mean_token_accuracy": 0.831958532333374,
|
|
"num_tokens": 247284678.0,
|
|
"step": 776
|
|
},
|
|
{
|
|
"epoch": 0.7904374364191251,
|
|
"grad_norm": 0.748408317565918,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6179,
|
|
"mean_token_accuracy": 0.8162950277328491,
|
|
"num_tokens": 247601438.0,
|
|
"step": 777
|
|
},
|
|
{
|
|
"epoch": 0.7914547304170906,
|
|
"grad_norm": 0.7726233601570129,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6052,
|
|
"mean_token_accuracy": 0.8199659585952759,
|
|
"num_tokens": 247935788.0,
|
|
"step": 778
|
|
},
|
|
{
|
|
"epoch": 0.7924720244150559,
|
|
"grad_norm": 0.7829846143722534,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5839,
|
|
"mean_token_accuracy": 0.8276870250701904,
|
|
"num_tokens": 248252748.0,
|
|
"step": 779
|
|
},
|
|
{
|
|
"epoch": 0.7934893184130214,
|
|
"grad_norm": 0.7898768782615662,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5974,
|
|
"mean_token_accuracy": 0.8218032121658325,
|
|
"num_tokens": 248562629.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.7945066124109867,
|
|
"grad_norm": 0.779306948184967,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6201,
|
|
"mean_token_accuracy": 0.8156956434249878,
|
|
"num_tokens": 248884711.0,
|
|
"step": 781
|
|
},
|
|
{
|
|
"epoch": 0.7955239064089522,
|
|
"grad_norm": 0.7667821645736694,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6118,
|
|
"mean_token_accuracy": 0.8179029226303101,
|
|
"num_tokens": 249198668.0,
|
|
"step": 782
|
|
},
|
|
{
|
|
"epoch": 0.7965412004069176,
|
|
"grad_norm": 0.810929000377655,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6143,
|
|
"mean_token_accuracy": 0.8167812824249268,
|
|
"num_tokens": 249506726.0,
|
|
"step": 783
|
|
},
|
|
{
|
|
"epoch": 0.797558494404883,
|
|
"grad_norm": 0.7606924176216125,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6124,
|
|
"mean_token_accuracy": 0.8178728818893433,
|
|
"num_tokens": 249815595.0,
|
|
"step": 784
|
|
},
|
|
{
|
|
"epoch": 0.7985757884028484,
|
|
"grad_norm": 0.7567647099494934,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6092,
|
|
"mean_token_accuracy": 0.8186475038528442,
|
|
"num_tokens": 250145992.0,
|
|
"step": 785
|
|
},
|
|
{
|
|
"epoch": 0.7995930824008138,
|
|
"grad_norm": 0.7598842978477478,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6132,
|
|
"mean_token_accuracy": 0.8182504177093506,
|
|
"num_tokens": 250443792.0,
|
|
"step": 786
|
|
},
|
|
{
|
|
"epoch": 0.8006103763987793,
|
|
"grad_norm": 0.802496075630188,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6203,
|
|
"mean_token_accuracy": 0.8158894777297974,
|
|
"num_tokens": 250747877.0,
|
|
"step": 787
|
|
},
|
|
{
|
|
"epoch": 0.8016276703967447,
|
|
"grad_norm": 0.8037290573120117,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6054,
|
|
"mean_token_accuracy": 0.8193314075469971,
|
|
"num_tokens": 251052502.0,
|
|
"step": 788
|
|
},
|
|
{
|
|
"epoch": 0.8026449643947101,
|
|
"grad_norm": 0.8257606625556946,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.593,
|
|
"mean_token_accuracy": 0.8240830302238464,
|
|
"num_tokens": 251352775.0,
|
|
"step": 789
|
|
},
|
|
{
|
|
"epoch": 0.8036622583926755,
|
|
"grad_norm": 0.7704517841339111,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6032,
|
|
"mean_token_accuracy": 0.8215023279190063,
|
|
"num_tokens": 251677652.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.8046795523906409,
|
|
"grad_norm": 0.7924766540527344,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6454,
|
|
"mean_token_accuracy": 0.8102990388870239,
|
|
"num_tokens": 251995232.0,
|
|
"step": 791
|
|
},
|
|
{
|
|
"epoch": 0.8056968463886063,
|
|
"grad_norm": 0.7513923645019531,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6143,
|
|
"mean_token_accuracy": 0.8175941705703735,
|
|
"num_tokens": 252318139.0,
|
|
"step": 792
|
|
},
|
|
{
|
|
"epoch": 0.8067141403865717,
|
|
"grad_norm": 0.7312906980514526,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6006,
|
|
"mean_token_accuracy": 0.8224383592605591,
|
|
"num_tokens": 252655015.0,
|
|
"step": 793
|
|
},
|
|
{
|
|
"epoch": 0.8077314343845371,
|
|
"grad_norm": 0.7360526323318481,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6034,
|
|
"mean_token_accuracy": 0.8211574554443359,
|
|
"num_tokens": 252975872.0,
|
|
"step": 794
|
|
},
|
|
{
|
|
"epoch": 0.8087487283825026,
|
|
"grad_norm": 0.8279215693473816,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6156,
|
|
"mean_token_accuracy": 0.8176878690719604,
|
|
"num_tokens": 253298562.0,
|
|
"step": 795
|
|
},
|
|
{
|
|
"epoch": 0.8097660223804679,
|
|
"grad_norm": 0.7401262521743774,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5835,
|
|
"mean_token_accuracy": 0.8262432813644409,
|
|
"num_tokens": 253623711.0,
|
|
"step": 796
|
|
},
|
|
{
|
|
"epoch": 0.8107833163784334,
|
|
"grad_norm": 0.7732365131378174,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6123,
|
|
"mean_token_accuracy": 0.8178532123565674,
|
|
"num_tokens": 253951728.0,
|
|
"step": 797
|
|
},
|
|
{
|
|
"epoch": 0.8118006103763988,
|
|
"grad_norm": 0.8180770874023438,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6155,
|
|
"mean_token_accuracy": 0.816921591758728,
|
|
"num_tokens": 254256769.0,
|
|
"step": 798
|
|
},
|
|
{
|
|
"epoch": 0.8128179043743642,
|
|
"grad_norm": 0.7186532020568848,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6257,
|
|
"mean_token_accuracy": 0.8152790069580078,
|
|
"num_tokens": 254592698.0,
|
|
"step": 799
|
|
},
|
|
{
|
|
"epoch": 0.8138351983723296,
|
|
"grad_norm": 0.7515096664428711,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6123,
|
|
"mean_token_accuracy": 0.8181686997413635,
|
|
"num_tokens": 254907582.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.814852492370295,
|
|
"grad_norm": 0.7662571668624878,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5882,
|
|
"mean_token_accuracy": 0.8246775269508362,
|
|
"num_tokens": 255220612.0,
|
|
"step": 801
|
|
},
|
|
{
|
|
"epoch": 0.8158697863682605,
|
|
"grad_norm": 0.7624213099479675,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6251,
|
|
"mean_token_accuracy": 0.8160996437072754,
|
|
"num_tokens": 255530491.0,
|
|
"step": 802
|
|
},
|
|
{
|
|
"epoch": 0.8168870803662258,
|
|
"grad_norm": 0.723014771938324,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6105,
|
|
"mean_token_accuracy": 0.8190559148788452,
|
|
"num_tokens": 255861693.0,
|
|
"step": 803
|
|
},
|
|
{
|
|
"epoch": 0.8179043743641913,
|
|
"grad_norm": 0.7166092395782471,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5867,
|
|
"mean_token_accuracy": 0.8237287998199463,
|
|
"num_tokens": 256184434.0,
|
|
"step": 804
|
|
},
|
|
{
|
|
"epoch": 0.8189216683621566,
|
|
"grad_norm": 0.7516783475875854,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5941,
|
|
"mean_token_accuracy": 0.8218938708305359,
|
|
"num_tokens": 256517073.0,
|
|
"step": 805
|
|
},
|
|
{
|
|
"epoch": 0.8199389623601221,
|
|
"grad_norm": 0.7856842279434204,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6056,
|
|
"mean_token_accuracy": 0.81956946849823,
|
|
"num_tokens": 256848111.0,
|
|
"step": 806
|
|
},
|
|
{
|
|
"epoch": 0.8209562563580874,
|
|
"grad_norm": 0.9904868602752686,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.61,
|
|
"mean_token_accuracy": 0.8192522525787354,
|
|
"num_tokens": 257182035.0,
|
|
"step": 807
|
|
},
|
|
{
|
|
"epoch": 0.8219735503560529,
|
|
"grad_norm": 0.7695131301879883,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.628,
|
|
"mean_token_accuracy": 0.8135746717453003,
|
|
"num_tokens": 257508833.0,
|
|
"step": 808
|
|
},
|
|
{
|
|
"epoch": 0.8229908443540183,
|
|
"grad_norm": 0.7403817176818848,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6003,
|
|
"mean_token_accuracy": 0.8210437893867493,
|
|
"num_tokens": 257832236.0,
|
|
"step": 809
|
|
},
|
|
{
|
|
"epoch": 0.8240081383519837,
|
|
"grad_norm": 0.7729365825653076,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5929,
|
|
"mean_token_accuracy": 0.82197505235672,
|
|
"num_tokens": 258156268.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.8250254323499492,
|
|
"grad_norm": 0.7479487657546997,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6124,
|
|
"mean_token_accuracy": 0.8176651000976562,
|
|
"num_tokens": 258495850.0,
|
|
"step": 811
|
|
},
|
|
{
|
|
"epoch": 0.8260427263479145,
|
|
"grad_norm": 0.7545899152755737,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6104,
|
|
"mean_token_accuracy": 0.8191924095153809,
|
|
"num_tokens": 258823978.0,
|
|
"step": 812
|
|
},
|
|
{
|
|
"epoch": 0.82706002034588,
|
|
"grad_norm": 0.7741188406944275,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6089,
|
|
"mean_token_accuracy": 0.8178560733795166,
|
|
"num_tokens": 259143758.0,
|
|
"step": 813
|
|
},
|
|
{
|
|
"epoch": 0.8280773143438453,
|
|
"grad_norm": 0.8673993349075317,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6152,
|
|
"mean_token_accuracy": 0.8175423741340637,
|
|
"num_tokens": 259459034.0,
|
|
"step": 814
|
|
},
|
|
{
|
|
"epoch": 0.8290946083418108,
|
|
"grad_norm": 0.8135408759117126,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5933,
|
|
"mean_token_accuracy": 0.8214139938354492,
|
|
"num_tokens": 259769628.0,
|
|
"step": 815
|
|
},
|
|
{
|
|
"epoch": 0.8301119023397762,
|
|
"grad_norm": 0.7409703731536865,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.624,
|
|
"mean_token_accuracy": 0.815783679485321,
|
|
"num_tokens": 260091045.0,
|
|
"step": 816
|
|
},
|
|
{
|
|
"epoch": 0.8311291963377416,
|
|
"grad_norm": 0.7929558753967285,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6012,
|
|
"mean_token_accuracy": 0.8218845129013062,
|
|
"num_tokens": 260414955.0,
|
|
"step": 817
|
|
},
|
|
{
|
|
"epoch": 0.832146490335707,
|
|
"grad_norm": 0.7872368693351746,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5748,
|
|
"mean_token_accuracy": 0.8281674385070801,
|
|
"num_tokens": 260738547.0,
|
|
"step": 818
|
|
},
|
|
{
|
|
"epoch": 0.8331637843336724,
|
|
"grad_norm": 0.802636444568634,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6101,
|
|
"mean_token_accuracy": 0.8184858560562134,
|
|
"num_tokens": 261051308.0,
|
|
"step": 819
|
|
},
|
|
{
|
|
"epoch": 0.8341810783316378,
|
|
"grad_norm": 0.7677578926086426,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5977,
|
|
"mean_token_accuracy": 0.8210058212280273,
|
|
"num_tokens": 261358402.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.8351983723296033,
|
|
"grad_norm": 0.7571184039115906,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6149,
|
|
"mean_token_accuracy": 0.8174717426300049,
|
|
"num_tokens": 261687233.0,
|
|
"step": 821
|
|
},
|
|
{
|
|
"epoch": 0.8362156663275687,
|
|
"grad_norm": 0.7534165978431702,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6163,
|
|
"mean_token_accuracy": 0.8172184228897095,
|
|
"num_tokens": 262026963.0,
|
|
"step": 822
|
|
},
|
|
{
|
|
"epoch": 0.8372329603255341,
|
|
"grad_norm": 1.1013829708099365,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5999,
|
|
"mean_token_accuracy": 0.8210376501083374,
|
|
"num_tokens": 262365347.0,
|
|
"step": 823
|
|
},
|
|
{
|
|
"epoch": 0.8382502543234995,
|
|
"grad_norm": 0.7831832766532898,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6017,
|
|
"mean_token_accuracy": 0.8204481601715088,
|
|
"num_tokens": 262673650.0,
|
|
"step": 824
|
|
},
|
|
{
|
|
"epoch": 0.8392675483214649,
|
|
"grad_norm": 0.8020746111869812,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6233,
|
|
"mean_token_accuracy": 0.8149141073226929,
|
|
"num_tokens": 262970304.0,
|
|
"step": 825
|
|
},
|
|
{
|
|
"epoch": 0.8402848423194303,
|
|
"grad_norm": 0.7812209129333496,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6197,
|
|
"mean_token_accuracy": 0.8161625266075134,
|
|
"num_tokens": 263293537.0,
|
|
"step": 826
|
|
},
|
|
{
|
|
"epoch": 0.8413021363173957,
|
|
"grad_norm": 0.7618210315704346,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5971,
|
|
"mean_token_accuracy": 0.8219529986381531,
|
|
"num_tokens": 263615516.0,
|
|
"step": 827
|
|
},
|
|
{
|
|
"epoch": 0.8423194303153612,
|
|
"grad_norm": 0.8371942639350891,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5918,
|
|
"mean_token_accuracy": 0.8239431381225586,
|
|
"num_tokens": 263924035.0,
|
|
"step": 828
|
|
},
|
|
{
|
|
"epoch": 0.8433367243133265,
|
|
"grad_norm": 0.7650486826896667,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6399,
|
|
"mean_token_accuracy": 0.8103851675987244,
|
|
"num_tokens": 264253124.0,
|
|
"step": 829
|
|
},
|
|
{
|
|
"epoch": 0.844354018311292,
|
|
"grad_norm": 0.7482921481132507,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6014,
|
|
"mean_token_accuracy": 0.8206346035003662,
|
|
"num_tokens": 264578027.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.8453713123092573,
|
|
"grad_norm": 0.7574668526649475,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6143,
|
|
"mean_token_accuracy": 0.8168084621429443,
|
|
"num_tokens": 264888054.0,
|
|
"step": 831
|
|
},
|
|
{
|
|
"epoch": 0.8463886063072228,
|
|
"grad_norm": 2.012491226196289,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6089,
|
|
"mean_token_accuracy": 0.8190878033638,
|
|
"num_tokens": 265191225.0,
|
|
"step": 832
|
|
},
|
|
{
|
|
"epoch": 0.8474059003051883,
|
|
"grad_norm": 0.7282354235649109,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.611,
|
|
"mean_token_accuracy": 0.8182060718536377,
|
|
"num_tokens": 265529823.0,
|
|
"step": 833
|
|
},
|
|
{
|
|
"epoch": 0.8484231943031536,
|
|
"grad_norm": 0.7751017212867737,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5908,
|
|
"mean_token_accuracy": 0.8238675594329834,
|
|
"num_tokens": 265845965.0,
|
|
"step": 834
|
|
},
|
|
{
|
|
"epoch": 0.8494404883011191,
|
|
"grad_norm": 0.7922971248626709,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.622,
|
|
"mean_token_accuracy": 0.8150073289871216,
|
|
"num_tokens": 266161615.0,
|
|
"step": 835
|
|
},
|
|
{
|
|
"epoch": 0.8504577822990844,
|
|
"grad_norm": 0.761900007724762,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6141,
|
|
"mean_token_accuracy": 0.8176765441894531,
|
|
"num_tokens": 266462436.0,
|
|
"step": 836
|
|
},
|
|
{
|
|
"epoch": 0.8514750762970499,
|
|
"grad_norm": 0.7576068043708801,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6041,
|
|
"mean_token_accuracy": 0.8197664618492126,
|
|
"num_tokens": 266773293.0,
|
|
"step": 837
|
|
},
|
|
{
|
|
"epoch": 0.8524923702950152,
|
|
"grad_norm": 0.7256014943122864,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6158,
|
|
"mean_token_accuracy": 0.8169959783554077,
|
|
"num_tokens": 267100828.0,
|
|
"step": 838
|
|
},
|
|
{
|
|
"epoch": 0.8535096642929807,
|
|
"grad_norm": 0.8366914987564087,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5976,
|
|
"mean_token_accuracy": 0.821556568145752,
|
|
"num_tokens": 267429972.0,
|
|
"step": 839
|
|
},
|
|
{
|
|
"epoch": 0.854526958290946,
|
|
"grad_norm": 0.7880463600158691,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5851,
|
|
"mean_token_accuracy": 0.8247042894363403,
|
|
"num_tokens": 267748631.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.8555442522889115,
|
|
"grad_norm": 0.7307940721511841,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5981,
|
|
"mean_token_accuracy": 0.8206034898757935,
|
|
"num_tokens": 268058483.0,
|
|
"step": 841
|
|
},
|
|
{
|
|
"epoch": 0.8565615462868769,
|
|
"grad_norm": 0.764660120010376,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6033,
|
|
"mean_token_accuracy": 0.8215179443359375,
|
|
"num_tokens": 268382188.0,
|
|
"step": 842
|
|
},
|
|
{
|
|
"epoch": 0.8575788402848423,
|
|
"grad_norm": 0.779708981513977,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5991,
|
|
"mean_token_accuracy": 0.8204516172409058,
|
|
"num_tokens": 268701052.0,
|
|
"step": 843
|
|
},
|
|
{
|
|
"epoch": 0.8585961342828077,
|
|
"grad_norm": 0.7856629490852356,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5784,
|
|
"mean_token_accuracy": 0.8267958164215088,
|
|
"num_tokens": 268996708.0,
|
|
"step": 844
|
|
},
|
|
{
|
|
"epoch": 0.8596134282807731,
|
|
"grad_norm": 0.7329586744308472,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.599,
|
|
"mean_token_accuracy": 0.820576548576355,
|
|
"num_tokens": 269311951.0,
|
|
"step": 845
|
|
},
|
|
{
|
|
"epoch": 0.8606307222787386,
|
|
"grad_norm": 0.8273540139198303,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5917,
|
|
"mean_token_accuracy": 0.8222566246986389,
|
|
"num_tokens": 269598930.0,
|
|
"step": 846
|
|
},
|
|
{
|
|
"epoch": 0.861648016276704,
|
|
"grad_norm": 0.7287344932556152,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6259,
|
|
"mean_token_accuracy": 0.814606785774231,
|
|
"num_tokens": 269921692.0,
|
|
"step": 847
|
|
},
|
|
{
|
|
"epoch": 0.8626653102746694,
|
|
"grad_norm": 0.755750834941864,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6048,
|
|
"mean_token_accuracy": 0.8209663033485413,
|
|
"num_tokens": 270232620.0,
|
|
"step": 848
|
|
},
|
|
{
|
|
"epoch": 0.8636826042726348,
|
|
"grad_norm": 0.7512905597686768,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.599,
|
|
"mean_token_accuracy": 0.8223528265953064,
|
|
"num_tokens": 270560619.0,
|
|
"step": 849
|
|
},
|
|
{
|
|
"epoch": 0.8646998982706002,
|
|
"grad_norm": 0.8476002216339111,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6071,
|
|
"mean_token_accuracy": 0.8203320503234863,
|
|
"num_tokens": 270874728.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.8657171922685656,
|
|
"grad_norm": 0.7668102383613586,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5996,
|
|
"mean_token_accuracy": 0.8214831352233887,
|
|
"num_tokens": 271202782.0,
|
|
"step": 851
|
|
},
|
|
{
|
|
"epoch": 0.866734486266531,
|
|
"grad_norm": 0.7954599857330322,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6064,
|
|
"mean_token_accuracy": 0.820402979850769,
|
|
"num_tokens": 271529123.0,
|
|
"step": 852
|
|
},
|
|
{
|
|
"epoch": 0.8677517802644964,
|
|
"grad_norm": 0.7808477282524109,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5886,
|
|
"mean_token_accuracy": 0.8243553638458252,
|
|
"num_tokens": 271828281.0,
|
|
"step": 853
|
|
},
|
|
{
|
|
"epoch": 0.8687690742624619,
|
|
"grad_norm": 0.7486982941627502,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6037,
|
|
"mean_token_accuracy": 0.8210355043411255,
|
|
"num_tokens": 272152291.0,
|
|
"step": 854
|
|
},
|
|
{
|
|
"epoch": 0.8697863682604272,
|
|
"grad_norm": 0.7671699523925781,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6264,
|
|
"mean_token_accuracy": 0.8134573698043823,
|
|
"num_tokens": 272469049.0,
|
|
"step": 855
|
|
},
|
|
{
|
|
"epoch": 0.8708036622583927,
|
|
"grad_norm": 0.8027375340461731,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6341,
|
|
"mean_token_accuracy": 0.8117994070053101,
|
|
"num_tokens": 272768408.0,
|
|
"step": 856
|
|
},
|
|
{
|
|
"epoch": 0.8718209562563581,
|
|
"grad_norm": 0.7631490230560303,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5992,
|
|
"mean_token_accuracy": 0.8212313652038574,
|
|
"num_tokens": 273089732.0,
|
|
"step": 857
|
|
},
|
|
{
|
|
"epoch": 0.8728382502543235,
|
|
"grad_norm": 0.7767783403396606,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.592,
|
|
"mean_token_accuracy": 0.8243077397346497,
|
|
"num_tokens": 273407433.0,
|
|
"step": 858
|
|
},
|
|
{
|
|
"epoch": 0.873855544252289,
|
|
"grad_norm": 0.7670124769210815,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6168,
|
|
"mean_token_accuracy": 0.81707364320755,
|
|
"num_tokens": 273726324.0,
|
|
"step": 859
|
|
},
|
|
{
|
|
"epoch": 0.8748728382502543,
|
|
"grad_norm": 0.8127790093421936,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5947,
|
|
"mean_token_accuracy": 0.8218735456466675,
|
|
"num_tokens": 274048187.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.8758901322482198,
|
|
"grad_norm": 0.7514941096305847,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5947,
|
|
"mean_token_accuracy": 0.8224079012870789,
|
|
"num_tokens": 274377833.0,
|
|
"step": 861
|
|
},
|
|
{
|
|
"epoch": 0.8769074262461851,
|
|
"grad_norm": 0.7778471112251282,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6303,
|
|
"mean_token_accuracy": 0.8132891654968262,
|
|
"num_tokens": 274705006.0,
|
|
"step": 862
|
|
},
|
|
{
|
|
"epoch": 0.8779247202441506,
|
|
"grad_norm": 0.7269313931465149,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5905,
|
|
"mean_token_accuracy": 0.8242048025131226,
|
|
"num_tokens": 275014059.0,
|
|
"step": 863
|
|
},
|
|
{
|
|
"epoch": 0.8789420142421159,
|
|
"grad_norm": 0.7438430190086365,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6238,
|
|
"mean_token_accuracy": 0.8166732788085938,
|
|
"num_tokens": 275338869.0,
|
|
"step": 864
|
|
},
|
|
{
|
|
"epoch": 0.8799593082400814,
|
|
"grad_norm": 0.8035140037536621,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.614,
|
|
"mean_token_accuracy": 0.8169056177139282,
|
|
"num_tokens": 275653401.0,
|
|
"step": 865
|
|
},
|
|
{
|
|
"epoch": 0.8809766022380467,
|
|
"grad_norm": 0.7698527574539185,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5935,
|
|
"mean_token_accuracy": 0.8229260444641113,
|
|
"num_tokens": 275962373.0,
|
|
"step": 866
|
|
},
|
|
{
|
|
"epoch": 0.8819938962360122,
|
|
"grad_norm": 0.7348355054855347,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6071,
|
|
"mean_token_accuracy": 0.8198667764663696,
|
|
"num_tokens": 276287064.0,
|
|
"step": 867
|
|
},
|
|
{
|
|
"epoch": 0.8830111902339777,
|
|
"grad_norm": 0.7421631813049316,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6031,
|
|
"mean_token_accuracy": 0.8199079036712646,
|
|
"num_tokens": 276610916.0,
|
|
"step": 868
|
|
},
|
|
{
|
|
"epoch": 0.884028484231943,
|
|
"grad_norm": 0.7622741460800171,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6201,
|
|
"mean_token_accuracy": 0.8169533610343933,
|
|
"num_tokens": 276928368.0,
|
|
"step": 869
|
|
},
|
|
{
|
|
"epoch": 0.8850457782299085,
|
|
"grad_norm": 0.7316154837608337,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5846,
|
|
"mean_token_accuracy": 0.824696958065033,
|
|
"num_tokens": 277246859.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.8860630722278738,
|
|
"grad_norm": 0.7528466582298279,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6049,
|
|
"mean_token_accuracy": 0.8203611373901367,
|
|
"num_tokens": 277573295.0,
|
|
"step": 871
|
|
},
|
|
{
|
|
"epoch": 0.8870803662258393,
|
|
"grad_norm": 0.7816317677497864,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5988,
|
|
"mean_token_accuracy": 0.8214170336723328,
|
|
"num_tokens": 277877252.0,
|
|
"step": 872
|
|
},
|
|
{
|
|
"epoch": 0.8880976602238047,
|
|
"grad_norm": 0.7750648856163025,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5982,
|
|
"mean_token_accuracy": 0.8203676342964172,
|
|
"num_tokens": 278194339.0,
|
|
"step": 873
|
|
},
|
|
{
|
|
"epoch": 0.8891149542217701,
|
|
"grad_norm": 0.7397667765617371,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6133,
|
|
"mean_token_accuracy": 0.8168438076972961,
|
|
"num_tokens": 278516575.0,
|
|
"step": 874
|
|
},
|
|
{
|
|
"epoch": 0.8901322482197355,
|
|
"grad_norm": 0.7264861464500427,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6212,
|
|
"mean_token_accuracy": 0.8153183460235596,
|
|
"num_tokens": 278844290.0,
|
|
"step": 875
|
|
},
|
|
{
|
|
"epoch": 0.8911495422177009,
|
|
"grad_norm": 0.7935214042663574,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6162,
|
|
"mean_token_accuracy": 0.8169482350349426,
|
|
"num_tokens": 279161918.0,
|
|
"step": 876
|
|
},
|
|
{
|
|
"epoch": 0.8921668362156663,
|
|
"grad_norm": 0.7562498450279236,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6243,
|
|
"mean_token_accuracy": 0.815357506275177,
|
|
"num_tokens": 279481295.0,
|
|
"step": 877
|
|
},
|
|
{
|
|
"epoch": 0.8931841302136317,
|
|
"grad_norm": 0.7284414768218994,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5926,
|
|
"mean_token_accuracy": 0.8232520818710327,
|
|
"num_tokens": 279800329.0,
|
|
"step": 878
|
|
},
|
|
{
|
|
"epoch": 0.8942014242115972,
|
|
"grad_norm": 0.7370333671569824,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5821,
|
|
"mean_token_accuracy": 0.8256187438964844,
|
|
"num_tokens": 280111037.0,
|
|
"step": 879
|
|
},
|
|
{
|
|
"epoch": 0.8952187182095626,
|
|
"grad_norm": 0.7799801826477051,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5985,
|
|
"mean_token_accuracy": 0.8217278718948364,
|
|
"num_tokens": 280405852.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.896236012207528,
|
|
"grad_norm": 0.7867871522903442,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6103,
|
|
"mean_token_accuracy": 0.8187962770462036,
|
|
"num_tokens": 280727104.0,
|
|
"step": 881
|
|
},
|
|
{
|
|
"epoch": 0.8972533062054934,
|
|
"grad_norm": 0.8229040503501892,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5896,
|
|
"mean_token_accuracy": 0.8244612812995911,
|
|
"num_tokens": 281043942.0,
|
|
"step": 882
|
|
},
|
|
{
|
|
"epoch": 0.8982706002034588,
|
|
"grad_norm": 0.773310124874115,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6063,
|
|
"mean_token_accuracy": 0.8205931186676025,
|
|
"num_tokens": 281374849.0,
|
|
"step": 883
|
|
},
|
|
{
|
|
"epoch": 0.8992878942014242,
|
|
"grad_norm": 0.7774438858032227,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5967,
|
|
"mean_token_accuracy": 0.8223268985748291,
|
|
"num_tokens": 281681982.0,
|
|
"step": 884
|
|
},
|
|
{
|
|
"epoch": 0.9003051881993896,
|
|
"grad_norm": 0.7470009922981262,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6149,
|
|
"mean_token_accuracy": 0.816724956035614,
|
|
"num_tokens": 282019096.0,
|
|
"step": 885
|
|
},
|
|
{
|
|
"epoch": 0.901322482197355,
|
|
"grad_norm": 0.7227064371109009,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5956,
|
|
"mean_token_accuracy": 0.8233357667922974,
|
|
"num_tokens": 282360553.0,
|
|
"step": 886
|
|
},
|
|
{
|
|
"epoch": 0.9023397761953205,
|
|
"grad_norm": 0.7855974435806274,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6083,
|
|
"mean_token_accuracy": 0.8194457292556763,
|
|
"num_tokens": 282674995.0,
|
|
"step": 887
|
|
},
|
|
{
|
|
"epoch": 0.9033570701932858,
|
|
"grad_norm": 0.7482305765151978,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5932,
|
|
"mean_token_accuracy": 0.8232486248016357,
|
|
"num_tokens": 282997592.0,
|
|
"step": 888
|
|
},
|
|
{
|
|
"epoch": 0.9043743641912513,
|
|
"grad_norm": 0.7636599540710449,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5822,
|
|
"mean_token_accuracy": 0.8257430195808411,
|
|
"num_tokens": 283307527.0,
|
|
"step": 889
|
|
},
|
|
{
|
|
"epoch": 0.9053916581892166,
|
|
"grad_norm": 0.7545581459999084,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6215,
|
|
"mean_token_accuracy": 0.8150089979171753,
|
|
"num_tokens": 283620377.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.9064089521871821,
|
|
"grad_norm": 0.8514378070831299,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5901,
|
|
"mean_token_accuracy": 0.8237584233283997,
|
|
"num_tokens": 283931044.0,
|
|
"step": 891
|
|
},
|
|
{
|
|
"epoch": 0.9074262461851476,
|
|
"grad_norm": 0.7739594578742981,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6076,
|
|
"mean_token_accuracy": 0.8195616006851196,
|
|
"num_tokens": 284248921.0,
|
|
"step": 892
|
|
},
|
|
{
|
|
"epoch": 0.9084435401831129,
|
|
"grad_norm": 0.71832275390625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5799,
|
|
"mean_token_accuracy": 0.8258185386657715,
|
|
"num_tokens": 284570062.0,
|
|
"step": 893
|
|
},
|
|
{
|
|
"epoch": 0.9094608341810784,
|
|
"grad_norm": 0.7497410178184509,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6099,
|
|
"mean_token_accuracy": 0.8188490867614746,
|
|
"num_tokens": 284892788.0,
|
|
"step": 894
|
|
},
|
|
{
|
|
"epoch": 0.9104781281790437,
|
|
"grad_norm": 0.7797565460205078,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6173,
|
|
"mean_token_accuracy": 0.8166163563728333,
|
|
"num_tokens": 285203700.0,
|
|
"step": 895
|
|
},
|
|
{
|
|
"epoch": 0.9114954221770092,
|
|
"grad_norm": 0.7578243017196655,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6033,
|
|
"mean_token_accuracy": 0.8208448886871338,
|
|
"num_tokens": 285550546.0,
|
|
"step": 896
|
|
},
|
|
{
|
|
"epoch": 0.9125127161749745,
|
|
"grad_norm": 0.8120006918907166,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6093,
|
|
"mean_token_accuracy": 0.8185294270515442,
|
|
"num_tokens": 285845604.0,
|
|
"step": 897
|
|
},
|
|
{
|
|
"epoch": 0.91353001017294,
|
|
"grad_norm": 0.829473614692688,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6071,
|
|
"mean_token_accuracy": 0.8186506628990173,
|
|
"num_tokens": 286144853.0,
|
|
"step": 898
|
|
},
|
|
{
|
|
"epoch": 0.9145473041709054,
|
|
"grad_norm": 0.7547066807746887,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6046,
|
|
"mean_token_accuracy": 0.8188067674636841,
|
|
"num_tokens": 286458777.0,
|
|
"step": 899
|
|
},
|
|
{
|
|
"epoch": 0.9155645981688708,
|
|
"grad_norm": 0.8693905472755432,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6052,
|
|
"mean_token_accuracy": 0.8198082447052002,
|
|
"num_tokens": 286774374.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.9165818921668362,
|
|
"grad_norm": 0.774172306060791,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6096,
|
|
"mean_token_accuracy": 0.8185197710990906,
|
|
"num_tokens": 287085779.0,
|
|
"step": 901
|
|
},
|
|
{
|
|
"epoch": 0.9175991861648016,
|
|
"grad_norm": 0.7755725979804993,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6104,
|
|
"mean_token_accuracy": 0.8177812099456787,
|
|
"num_tokens": 287405700.0,
|
|
"step": 902
|
|
},
|
|
{
|
|
"epoch": 0.9186164801627671,
|
|
"grad_norm": 0.7723372578620911,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5859,
|
|
"mean_token_accuracy": 0.8251550197601318,
|
|
"num_tokens": 287716716.0,
|
|
"step": 903
|
|
},
|
|
{
|
|
"epoch": 0.9196337741607324,
|
|
"grad_norm": 0.7204782962799072,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6104,
|
|
"mean_token_accuracy": 0.8186980485916138,
|
|
"num_tokens": 288040043.0,
|
|
"step": 904
|
|
},
|
|
{
|
|
"epoch": 0.9206510681586979,
|
|
"grad_norm": 0.7748673558235168,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6074,
|
|
"mean_token_accuracy": 0.8196672201156616,
|
|
"num_tokens": 288352383.0,
|
|
"step": 905
|
|
},
|
|
{
|
|
"epoch": 0.9216683621566633,
|
|
"grad_norm": 0.7603813409805298,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5872,
|
|
"mean_token_accuracy": 0.8238770365715027,
|
|
"num_tokens": 288681122.0,
|
|
"step": 906
|
|
},
|
|
{
|
|
"epoch": 0.9226856561546287,
|
|
"grad_norm": 0.736601710319519,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6236,
|
|
"mean_token_accuracy": 0.8154827356338501,
|
|
"num_tokens": 289007588.0,
|
|
"step": 907
|
|
},
|
|
{
|
|
"epoch": 0.9237029501525941,
|
|
"grad_norm": 0.7555220723152161,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5979,
|
|
"mean_token_accuracy": 0.8206131458282471,
|
|
"num_tokens": 289338174.0,
|
|
"step": 908
|
|
},
|
|
{
|
|
"epoch": 0.9247202441505595,
|
|
"grad_norm": 0.7680867314338684,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5923,
|
|
"mean_token_accuracy": 0.8236669301986694,
|
|
"num_tokens": 289657315.0,
|
|
"step": 909
|
|
},
|
|
{
|
|
"epoch": 0.9257375381485249,
|
|
"grad_norm": 0.7650969624519348,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.598,
|
|
"mean_token_accuracy": 0.8214796781539917,
|
|
"num_tokens": 289982984.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.9267548321464903,
|
|
"grad_norm": 0.7762976288795471,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5928,
|
|
"mean_token_accuracy": 0.8228493332862854,
|
|
"num_tokens": 290298649.0,
|
|
"step": 911
|
|
},
|
|
{
|
|
"epoch": 0.9277721261444557,
|
|
"grad_norm": 0.7978520393371582,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5979,
|
|
"mean_token_accuracy": 0.8216226100921631,
|
|
"num_tokens": 290617255.0,
|
|
"step": 912
|
|
},
|
|
{
|
|
"epoch": 0.9287894201424212,
|
|
"grad_norm": 0.7277722954750061,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5907,
|
|
"mean_token_accuracy": 0.8223746418952942,
|
|
"num_tokens": 290937706.0,
|
|
"step": 913
|
|
},
|
|
{
|
|
"epoch": 0.9298067141403866,
|
|
"grad_norm": 0.7542223930358887,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6075,
|
|
"mean_token_accuracy": 0.8198230862617493,
|
|
"num_tokens": 291248241.0,
|
|
"step": 914
|
|
},
|
|
{
|
|
"epoch": 0.930824008138352,
|
|
"grad_norm": 0.7750515341758728,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6099,
|
|
"mean_token_accuracy": 0.8176894783973694,
|
|
"num_tokens": 291586275.0,
|
|
"step": 915
|
|
},
|
|
{
|
|
"epoch": 0.9318413021363174,
|
|
"grad_norm": 0.7996515035629272,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5835,
|
|
"mean_token_accuracy": 0.8253262042999268,
|
|
"num_tokens": 291904561.0,
|
|
"step": 916
|
|
},
|
|
{
|
|
"epoch": 0.9328585961342828,
|
|
"grad_norm": 0.7990328669548035,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.603,
|
|
"mean_token_accuracy": 0.8202857971191406,
|
|
"num_tokens": 292214056.0,
|
|
"step": 917
|
|
},
|
|
{
|
|
"epoch": 0.9338758901322483,
|
|
"grad_norm": 0.7909140586853027,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6053,
|
|
"mean_token_accuracy": 0.8177269101142883,
|
|
"num_tokens": 292505385.0,
|
|
"step": 918
|
|
},
|
|
{
|
|
"epoch": 0.9348931841302136,
|
|
"grad_norm": 0.7725432515144348,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6114,
|
|
"mean_token_accuracy": 0.8192976713180542,
|
|
"num_tokens": 292829795.0,
|
|
"step": 919
|
|
},
|
|
{
|
|
"epoch": 0.9359104781281791,
|
|
"grad_norm": 0.8053933382034302,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5869,
|
|
"mean_token_accuracy": 0.8256614804267883,
|
|
"num_tokens": 293144553.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.9369277721261444,
|
|
"grad_norm": 0.7472183108329773,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6104,
|
|
"mean_token_accuracy": 0.8184031248092651,
|
|
"num_tokens": 293463953.0,
|
|
"step": 921
|
|
},
|
|
{
|
|
"epoch": 0.9379450661241099,
|
|
"grad_norm": 0.7906295657157898,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.572,
|
|
"mean_token_accuracy": 0.8287568688392639,
|
|
"num_tokens": 293773864.0,
|
|
"step": 922
|
|
},
|
|
{
|
|
"epoch": 0.9389623601220752,
|
|
"grad_norm": 0.8005516529083252,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5817,
|
|
"mean_token_accuracy": 0.8264212012290955,
|
|
"num_tokens": 294103688.0,
|
|
"step": 923
|
|
},
|
|
{
|
|
"epoch": 0.9399796541200407,
|
|
"grad_norm": 0.7889928221702576,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5709,
|
|
"mean_token_accuracy": 0.8293386101722717,
|
|
"num_tokens": 294425536.0,
|
|
"step": 924
|
|
},
|
|
{
|
|
"epoch": 0.940996948118006,
|
|
"grad_norm": 0.7291927933692932,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6044,
|
|
"mean_token_accuracy": 0.8204392194747925,
|
|
"num_tokens": 294743889.0,
|
|
"step": 925
|
|
},
|
|
{
|
|
"epoch": 0.9420142421159715,
|
|
"grad_norm": 0.7222439646720886,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6224,
|
|
"mean_token_accuracy": 0.8163737058639526,
|
|
"num_tokens": 295078964.0,
|
|
"step": 926
|
|
},
|
|
{
|
|
"epoch": 0.943031536113937,
|
|
"grad_norm": 0.7308232188224792,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5772,
|
|
"mean_token_accuracy": 0.8282566070556641,
|
|
"num_tokens": 295382594.0,
|
|
"step": 927
|
|
},
|
|
{
|
|
"epoch": 0.9440488301119023,
|
|
"grad_norm": 0.7753663063049316,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6013,
|
|
"mean_token_accuracy": 0.8208799362182617,
|
|
"num_tokens": 295694723.0,
|
|
"step": 928
|
|
},
|
|
{
|
|
"epoch": 0.9450661241098678,
|
|
"grad_norm": 0.8650704622268677,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5855,
|
|
"mean_token_accuracy": 0.8251305222511292,
|
|
"num_tokens": 295998034.0,
|
|
"step": 929
|
|
},
|
|
{
|
|
"epoch": 0.9460834181078331,
|
|
"grad_norm": 0.7788504958152771,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6017,
|
|
"mean_token_accuracy": 0.8198367357254028,
|
|
"num_tokens": 296290081.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.9471007121057986,
|
|
"grad_norm": 0.76593017578125,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5911,
|
|
"mean_token_accuracy": 0.8246276378631592,
|
|
"num_tokens": 296598043.0,
|
|
"step": 931
|
|
},
|
|
{
|
|
"epoch": 0.948118006103764,
|
|
"grad_norm": 0.7856141328811646,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5922,
|
|
"mean_token_accuracy": 0.8231593370437622,
|
|
"num_tokens": 296907598.0,
|
|
"step": 932
|
|
},
|
|
{
|
|
"epoch": 0.9491353001017294,
|
|
"grad_norm": 1.8626831769943237,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5952,
|
|
"mean_token_accuracy": 0.8226741552352905,
|
|
"num_tokens": 297229538.0,
|
|
"step": 933
|
|
},
|
|
{
|
|
"epoch": 0.9501525940996948,
|
|
"grad_norm": 0.753101646900177,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6198,
|
|
"mean_token_accuracy": 0.8162423372268677,
|
|
"num_tokens": 297545849.0,
|
|
"step": 934
|
|
},
|
|
{
|
|
"epoch": 0.9511698880976602,
|
|
"grad_norm": 0.7566388249397278,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6118,
|
|
"mean_token_accuracy": 0.8176044225692749,
|
|
"num_tokens": 297857385.0,
|
|
"step": 935
|
|
},
|
|
{
|
|
"epoch": 0.9521871820956256,
|
|
"grad_norm": 0.7380351424217224,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5952,
|
|
"mean_token_accuracy": 0.8211214542388916,
|
|
"num_tokens": 298178437.0,
|
|
"step": 936
|
|
},
|
|
{
|
|
"epoch": 0.953204476093591,
|
|
"grad_norm": 0.7686142325401306,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5836,
|
|
"mean_token_accuracy": 0.8248642086982727,
|
|
"num_tokens": 298502544.0,
|
|
"step": 937
|
|
},
|
|
{
|
|
"epoch": 0.9542217700915565,
|
|
"grad_norm": 0.7463445067405701,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6089,
|
|
"mean_token_accuracy": 0.8178445100784302,
|
|
"num_tokens": 298813928.0,
|
|
"step": 938
|
|
},
|
|
{
|
|
"epoch": 0.9552390640895219,
|
|
"grad_norm": 0.7960903644561768,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5806,
|
|
"mean_token_accuracy": 0.8256028294563293,
|
|
"num_tokens": 299145054.0,
|
|
"step": 939
|
|
},
|
|
{
|
|
"epoch": 0.9562563580874873,
|
|
"grad_norm": 0.708060622215271,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5984,
|
|
"mean_token_accuracy": 0.8219953775405884,
|
|
"num_tokens": 299484449.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.9572736520854527,
|
|
"grad_norm": 0.7917901277542114,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6106,
|
|
"mean_token_accuracy": 0.8174960613250732,
|
|
"num_tokens": 299821866.0,
|
|
"step": 941
|
|
},
|
|
{
|
|
"epoch": 0.9582909460834181,
|
|
"grad_norm": 0.7996972799301147,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6084,
|
|
"mean_token_accuracy": 0.8205087184906006,
|
|
"num_tokens": 300122149.0,
|
|
"step": 942
|
|
},
|
|
{
|
|
"epoch": 0.9593082400813835,
|
|
"grad_norm": 0.7606682777404785,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.607,
|
|
"mean_token_accuracy": 0.8185034990310669,
|
|
"num_tokens": 300434761.0,
|
|
"step": 943
|
|
},
|
|
{
|
|
"epoch": 0.960325534079349,
|
|
"grad_norm": 0.7440558075904846,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6108,
|
|
"mean_token_accuracy": 0.8182252645492554,
|
|
"num_tokens": 300761476.0,
|
|
"step": 944
|
|
},
|
|
{
|
|
"epoch": 0.9613428280773143,
|
|
"grad_norm": 0.760749101638794,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.58,
|
|
"mean_token_accuracy": 0.8271734714508057,
|
|
"num_tokens": 301078714.0,
|
|
"step": 945
|
|
},
|
|
{
|
|
"epoch": 0.9623601220752798,
|
|
"grad_norm": 0.8019530773162842,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5965,
|
|
"mean_token_accuracy": 0.8223601579666138,
|
|
"num_tokens": 301390887.0,
|
|
"step": 946
|
|
},
|
|
{
|
|
"epoch": 0.9633774160732451,
|
|
"grad_norm": 0.7818415760993958,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6072,
|
|
"mean_token_accuracy": 0.8200531005859375,
|
|
"num_tokens": 301709240.0,
|
|
"step": 947
|
|
},
|
|
{
|
|
"epoch": 0.9643947100712106,
|
|
"grad_norm": 0.7479201555252075,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.61,
|
|
"mean_token_accuracy": 0.8179908990859985,
|
|
"num_tokens": 302020968.0,
|
|
"step": 948
|
|
},
|
|
{
|
|
"epoch": 0.965412004069176,
|
|
"grad_norm": 0.7905112504959106,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6384,
|
|
"mean_token_accuracy": 0.8104801774024963,
|
|
"num_tokens": 302349196.0,
|
|
"step": 949
|
|
},
|
|
{
|
|
"epoch": 0.9664292980671414,
|
|
"grad_norm": 0.743297815322876,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6185,
|
|
"mean_token_accuracy": 0.8159749507904053,
|
|
"num_tokens": 302661848.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.9674465920651069,
|
|
"grad_norm": 0.7662633657455444,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6067,
|
|
"mean_token_accuracy": 0.8191525936126709,
|
|
"num_tokens": 302985335.0,
|
|
"step": 951
|
|
},
|
|
{
|
|
"epoch": 0.9684638860630722,
|
|
"grad_norm": 0.739526093006134,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5799,
|
|
"mean_token_accuracy": 0.8264555931091309,
|
|
"num_tokens": 303312885.0,
|
|
"step": 952
|
|
},
|
|
{
|
|
"epoch": 0.9694811800610377,
|
|
"grad_norm": 0.7629678845405579,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6088,
|
|
"mean_token_accuracy": 0.8180946111679077,
|
|
"num_tokens": 303632397.0,
|
|
"step": 953
|
|
},
|
|
{
|
|
"epoch": 0.970498474059003,
|
|
"grad_norm": 0.7920119166374207,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6134,
|
|
"mean_token_accuracy": 0.8164322376251221,
|
|
"num_tokens": 303954402.0,
|
|
"step": 954
|
|
},
|
|
{
|
|
"epoch": 0.9715157680569685,
|
|
"grad_norm": 0.7756142616271973,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6094,
|
|
"mean_token_accuracy": 0.819657564163208,
|
|
"num_tokens": 304285608.0,
|
|
"step": 955
|
|
},
|
|
{
|
|
"epoch": 0.9725330620549338,
|
|
"grad_norm": 0.7486905455589294,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5684,
|
|
"mean_token_accuracy": 0.8285978436470032,
|
|
"num_tokens": 304604968.0,
|
|
"step": 956
|
|
},
|
|
{
|
|
"epoch": 0.9735503560528993,
|
|
"grad_norm": 0.7525914907455444,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5843,
|
|
"mean_token_accuracy": 0.8244077563285828,
|
|
"num_tokens": 304919739.0,
|
|
"step": 957
|
|
},
|
|
{
|
|
"epoch": 0.9745676500508647,
|
|
"grad_norm": 0.7886344194412231,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6057,
|
|
"mean_token_accuracy": 0.8192185759544373,
|
|
"num_tokens": 305245025.0,
|
|
"step": 958
|
|
},
|
|
{
|
|
"epoch": 0.9755849440488301,
|
|
"grad_norm": 0.7626937627792358,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5983,
|
|
"mean_token_accuracy": 0.8203018307685852,
|
|
"num_tokens": 305554045.0,
|
|
"step": 959
|
|
},
|
|
{
|
|
"epoch": 0.9766022380467956,
|
|
"grad_norm": 0.7469524145126343,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5814,
|
|
"mean_token_accuracy": 0.8262737989425659,
|
|
"num_tokens": 305880937.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.9776195320447609,
|
|
"grad_norm": 0.7448502779006958,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5921,
|
|
"mean_token_accuracy": 0.8239433765411377,
|
|
"num_tokens": 306205585.0,
|
|
"step": 961
|
|
},
|
|
{
|
|
"epoch": 0.9786368260427264,
|
|
"grad_norm": 0.8299108147621155,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5972,
|
|
"mean_token_accuracy": 0.8226567506790161,
|
|
"num_tokens": 306521564.0,
|
|
"step": 962
|
|
},
|
|
{
|
|
"epoch": 0.9796541200406917,
|
|
"grad_norm": 0.7663608193397522,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6119,
|
|
"mean_token_accuracy": 0.81708824634552,
|
|
"num_tokens": 306832682.0,
|
|
"step": 963
|
|
},
|
|
{
|
|
"epoch": 0.9806714140386572,
|
|
"grad_norm": 0.7992986440658569,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.577,
|
|
"mean_token_accuracy": 0.8270239233970642,
|
|
"num_tokens": 307155638.0,
|
|
"step": 964
|
|
},
|
|
{
|
|
"epoch": 0.9816887080366226,
|
|
"grad_norm": 0.7808005213737488,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5832,
|
|
"mean_token_accuracy": 0.8254443407058716,
|
|
"num_tokens": 307461975.0,
|
|
"step": 965
|
|
},
|
|
{
|
|
"epoch": 0.982706002034588,
|
|
"grad_norm": 0.7484624981880188,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6133,
|
|
"mean_token_accuracy": 0.816098690032959,
|
|
"num_tokens": 307787106.0,
|
|
"step": 966
|
|
},
|
|
{
|
|
"epoch": 0.9837232960325534,
|
|
"grad_norm": 0.7650085091590881,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6086,
|
|
"mean_token_accuracy": 0.8195856809616089,
|
|
"num_tokens": 308114670.0,
|
|
"step": 967
|
|
},
|
|
{
|
|
"epoch": 0.9847405900305188,
|
|
"grad_norm": 0.7795204520225525,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6082,
|
|
"mean_token_accuracy": 0.8185417056083679,
|
|
"num_tokens": 308438717.0,
|
|
"step": 968
|
|
},
|
|
{
|
|
"epoch": 0.9857578840284842,
|
|
"grad_norm": 0.7380291819572449,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5781,
|
|
"mean_token_accuracy": 0.82597416639328,
|
|
"num_tokens": 308770389.0,
|
|
"step": 969
|
|
},
|
|
{
|
|
"epoch": 0.9867751780264497,
|
|
"grad_norm": 0.7749535441398621,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5905,
|
|
"mean_token_accuracy": 0.8239677548408508,
|
|
"num_tokens": 309098467.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.987792472024415,
|
|
"grad_norm": 0.8078527450561523,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6127,
|
|
"mean_token_accuracy": 0.8165298700332642,
|
|
"num_tokens": 309430767.0,
|
|
"step": 971
|
|
},
|
|
{
|
|
"epoch": 0.9888097660223805,
|
|
"grad_norm": 0.8138400316238403,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6065,
|
|
"mean_token_accuracy": 0.8182769417762756,
|
|
"num_tokens": 309743337.0,
|
|
"step": 972
|
|
},
|
|
{
|
|
"epoch": 0.9898270600203459,
|
|
"grad_norm": 0.7963126301765442,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6171,
|
|
"mean_token_accuracy": 0.816325306892395,
|
|
"num_tokens": 310051593.0,
|
|
"step": 973
|
|
},
|
|
{
|
|
"epoch": 0.9908443540183113,
|
|
"grad_norm": 0.7809932231903076,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5941,
|
|
"mean_token_accuracy": 0.8219630718231201,
|
|
"num_tokens": 310376369.0,
|
|
"step": 974
|
|
},
|
|
{
|
|
"epoch": 0.9918616480162767,
|
|
"grad_norm": 0.7698544263839722,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6147,
|
|
"mean_token_accuracy": 0.8174964189529419,
|
|
"num_tokens": 310705795.0,
|
|
"step": 975
|
|
},
|
|
{
|
|
"epoch": 0.9928789420142421,
|
|
"grad_norm": 0.8088345527648926,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5998,
|
|
"mean_token_accuracy": 0.8205104470252991,
|
|
"num_tokens": 311023220.0,
|
|
"step": 976
|
|
},
|
|
{
|
|
"epoch": 0.9938962360122076,
|
|
"grad_norm": 0.8139130473136902,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.616,
|
|
"mean_token_accuracy": 0.8165026903152466,
|
|
"num_tokens": 311361294.0,
|
|
"step": 977
|
|
},
|
|
{
|
|
"epoch": 0.9949135300101729,
|
|
"grad_norm": 0.7873548269271851,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6413,
|
|
"mean_token_accuracy": 0.8094485402107239,
|
|
"num_tokens": 311682933.0,
|
|
"step": 978
|
|
},
|
|
{
|
|
"epoch": 0.9959308240081384,
|
|
"grad_norm": 0.7542492747306824,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6026,
|
|
"mean_token_accuracy": 0.820980429649353,
|
|
"num_tokens": 311997757.0,
|
|
"step": 979
|
|
},
|
|
{
|
|
"epoch": 0.9969481180061037,
|
|
"grad_norm": 0.8661382794380188,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5982,
|
|
"mean_token_accuracy": 0.8212977051734924,
|
|
"num_tokens": 312300232.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.9979654120040692,
|
|
"grad_norm": 0.805185079574585,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5992,
|
|
"mean_token_accuracy": 0.8209521770477295,
|
|
"num_tokens": 312617272.0,
|
|
"step": 981
|
|
},
|
|
{
|
|
"epoch": 0.9989827060020345,
|
|
"grad_norm": 0.7660844922065735,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6015,
|
|
"mean_token_accuracy": 0.821892261505127,
|
|
"num_tokens": 312940535.0,
|
|
"step": 982
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.7682638168334961,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6044,
|
|
"mean_token_accuracy": 0.8197771906852722,
|
|
"num_tokens": 313255150.0,
|
|
"step": 983
|
|
},
|
|
{
|
|
"epoch": 1.0010172939979654,
|
|
"grad_norm": 0.7689452171325684,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.582,
|
|
"mean_token_accuracy": 0.8259244561195374,
|
|
"num_tokens": 313576480.0,
|
|
"step": 984
|
|
},
|
|
{
|
|
"epoch": 1.002034587995931,
|
|
"grad_norm": 0.820095419883728,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6209,
|
|
"mean_token_accuracy": 0.814440906047821,
|
|
"num_tokens": 313894362.0,
|
|
"step": 985
|
|
},
|
|
{
|
|
"epoch": 1.0030518819938963,
|
|
"grad_norm": 0.7715836763381958,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.601,
|
|
"mean_token_accuracy": 0.8188121914863586,
|
|
"num_tokens": 314199033.0,
|
|
"step": 986
|
|
},
|
|
{
|
|
"epoch": 1.0040691759918616,
|
|
"grad_norm": 0.733830988407135,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5837,
|
|
"mean_token_accuracy": 0.8249683976173401,
|
|
"num_tokens": 314529806.0,
|
|
"step": 987
|
|
},
|
|
{
|
|
"epoch": 1.005086469989827,
|
|
"grad_norm": 0.8194170594215393,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5945,
|
|
"mean_token_accuracy": 0.822218120098114,
|
|
"num_tokens": 314836980.0,
|
|
"step": 988
|
|
},
|
|
{
|
|
"epoch": 1.0061037639877926,
|
|
"grad_norm": 0.7957749366760254,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6269,
|
|
"mean_token_accuracy": 0.8144529461860657,
|
|
"num_tokens": 315155851.0,
|
|
"step": 989
|
|
},
|
|
{
|
|
"epoch": 1.007121057985758,
|
|
"grad_norm": 0.7519081234931946,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5834,
|
|
"mean_token_accuracy": 0.8247135877609253,
|
|
"num_tokens": 315477553.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 1.0081383519837233,
|
|
"grad_norm": 0.8048916459083557,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5977,
|
|
"mean_token_accuracy": 0.820995032787323,
|
|
"num_tokens": 315789205.0,
|
|
"step": 991
|
|
},
|
|
{
|
|
"epoch": 1.0091556459816886,
|
|
"grad_norm": 0.756081223487854,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6002,
|
|
"mean_token_accuracy": 0.820772111415863,
|
|
"num_tokens": 316108041.0,
|
|
"step": 992
|
|
},
|
|
{
|
|
"epoch": 1.0101729399796542,
|
|
"grad_norm": 0.7621251344680786,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5943,
|
|
"mean_token_accuracy": 0.8217575550079346,
|
|
"num_tokens": 316433915.0,
|
|
"step": 993
|
|
},
|
|
{
|
|
"epoch": 1.0111902339776195,
|
|
"grad_norm": 0.7687362432479858,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5982,
|
|
"mean_token_accuracy": 0.8215383291244507,
|
|
"num_tokens": 316764032.0,
|
|
"step": 994
|
|
},
|
|
{
|
|
"epoch": 1.0122075279755849,
|
|
"grad_norm": 0.7695761919021606,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6062,
|
|
"mean_token_accuracy": 0.8193082213401794,
|
|
"num_tokens": 317068693.0,
|
|
"step": 995
|
|
},
|
|
{
|
|
"epoch": 1.0132248219735505,
|
|
"grad_norm": 0.7918261885643005,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.592,
|
|
"mean_token_accuracy": 0.8233439922332764,
|
|
"num_tokens": 317399513.0,
|
|
"step": 996
|
|
},
|
|
{
|
|
"epoch": 1.0142421159715158,
|
|
"grad_norm": 0.755750834941864,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5862,
|
|
"mean_token_accuracy": 0.8246799111366272,
|
|
"num_tokens": 317722640.0,
|
|
"step": 997
|
|
},
|
|
{
|
|
"epoch": 1.0152594099694812,
|
|
"grad_norm": 0.7813019752502441,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5833,
|
|
"mean_token_accuracy": 0.8261044025421143,
|
|
"num_tokens": 318033418.0,
|
|
"step": 998
|
|
},
|
|
{
|
|
"epoch": 1.0162767039674465,
|
|
"grad_norm": 0.7828826308250427,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6031,
|
|
"mean_token_accuracy": 0.8192239999771118,
|
|
"num_tokens": 318339258.0,
|
|
"step": 999
|
|
},
|
|
{
|
|
"epoch": 1.017293997965412,
|
|
"grad_norm": 0.7472410798072815,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5792,
|
|
"mean_token_accuracy": 0.8274403810501099,
|
|
"num_tokens": 318648663.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 1.0183112919633774,
|
|
"grad_norm": 0.795993983745575,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.598,
|
|
"mean_token_accuracy": 0.820968747138977,
|
|
"num_tokens": 318960576.0,
|
|
"step": 1001
|
|
},
|
|
{
|
|
"epoch": 1.0193285859613428,
|
|
"grad_norm": 0.7590869665145874,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5855,
|
|
"mean_token_accuracy": 0.8260129690170288,
|
|
"num_tokens": 319276087.0,
|
|
"step": 1002
|
|
},
|
|
{
|
|
"epoch": 1.0203458799593081,
|
|
"grad_norm": 0.7744798064231873,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5849,
|
|
"mean_token_accuracy": 0.8254764676094055,
|
|
"num_tokens": 319576787.0,
|
|
"step": 1003
|
|
},
|
|
{
|
|
"epoch": 1.0213631739572737,
|
|
"grad_norm": 0.7490171194076538,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5948,
|
|
"mean_token_accuracy": 0.8221466541290283,
|
|
"num_tokens": 319902904.0,
|
|
"step": 1004
|
|
},
|
|
{
|
|
"epoch": 1.022380467955239,
|
|
"grad_norm": 0.738842248916626,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5995,
|
|
"mean_token_accuracy": 0.8217558264732361,
|
|
"num_tokens": 320212284.0,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"epoch": 1.0233977619532044,
|
|
"grad_norm": 0.7941291332244873,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.604,
|
|
"mean_token_accuracy": 0.8189691305160522,
|
|
"num_tokens": 320529125.0,
|
|
"step": 1006
|
|
},
|
|
{
|
|
"epoch": 1.02441505595117,
|
|
"grad_norm": 0.7658363580703735,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5914,
|
|
"mean_token_accuracy": 0.8234286308288574,
|
|
"num_tokens": 320834351.0,
|
|
"step": 1007
|
|
},
|
|
{
|
|
"epoch": 1.0254323499491353,
|
|
"grad_norm": 0.7774712443351746,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5955,
|
|
"mean_token_accuracy": 0.8215651512145996,
|
|
"num_tokens": 321155407.0,
|
|
"step": 1008
|
|
},
|
|
{
|
|
"epoch": 1.0264496439471007,
|
|
"grad_norm": 0.8093451857566833,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6158,
|
|
"mean_token_accuracy": 0.816012978553772,
|
|
"num_tokens": 321466786.0,
|
|
"step": 1009
|
|
},
|
|
{
|
|
"epoch": 1.027466937945066,
|
|
"grad_norm": 0.7412152290344238,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5734,
|
|
"mean_token_accuracy": 0.8279814124107361,
|
|
"num_tokens": 321794738.0,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 1.0284842319430316,
|
|
"grad_norm": 0.7972337603569031,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6011,
|
|
"mean_token_accuracy": 0.8213940262794495,
|
|
"num_tokens": 322105270.0,
|
|
"step": 1011
|
|
},
|
|
{
|
|
"epoch": 1.029501525940997,
|
|
"grad_norm": 0.7478787899017334,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5958,
|
|
"mean_token_accuracy": 0.8207067251205444,
|
|
"num_tokens": 322430631.0,
|
|
"step": 1012
|
|
},
|
|
{
|
|
"epoch": 1.0305188199389623,
|
|
"grad_norm": 0.7256690859794617,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5709,
|
|
"mean_token_accuracy": 0.8281009197235107,
|
|
"num_tokens": 322762321.0,
|
|
"step": 1013
|
|
},
|
|
{
|
|
"epoch": 1.0315361139369277,
|
|
"grad_norm": 0.8089573979377747,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.601,
|
|
"mean_token_accuracy": 0.8199090957641602,
|
|
"num_tokens": 323079150.0,
|
|
"step": 1014
|
|
},
|
|
{
|
|
"epoch": 1.0325534079348933,
|
|
"grad_norm": 0.7496512532234192,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.601,
|
|
"mean_token_accuracy": 0.8193944692611694,
|
|
"num_tokens": 323409267.0,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"epoch": 1.0335707019328586,
|
|
"grad_norm": 0.7756277322769165,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6172,
|
|
"mean_token_accuracy": 0.8161155581474304,
|
|
"num_tokens": 323717438.0,
|
|
"step": 1016
|
|
},
|
|
{
|
|
"epoch": 1.034587995930824,
|
|
"grad_norm": 0.7738403081893921,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6066,
|
|
"mean_token_accuracy": 0.8196358680725098,
|
|
"num_tokens": 324033814.0,
|
|
"step": 1017
|
|
},
|
|
{
|
|
"epoch": 1.0356052899287893,
|
|
"grad_norm": 0.807456910610199,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6016,
|
|
"mean_token_accuracy": 0.8201157450675964,
|
|
"num_tokens": 324339938.0,
|
|
"step": 1018
|
|
},
|
|
{
|
|
"epoch": 1.0366225839267549,
|
|
"grad_norm": 0.8129794001579285,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5997,
|
|
"mean_token_accuracy": 0.8207684755325317,
|
|
"num_tokens": 324649242.0,
|
|
"step": 1019
|
|
},
|
|
{
|
|
"epoch": 1.0376398779247202,
|
|
"grad_norm": 0.747581958770752,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5973,
|
|
"mean_token_accuracy": 0.8210676908493042,
|
|
"num_tokens": 324973812.0,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 1.0386571719226856,
|
|
"grad_norm": 0.7553842067718506,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6006,
|
|
"mean_token_accuracy": 0.8205328583717346,
|
|
"num_tokens": 325293984.0,
|
|
"step": 1021
|
|
},
|
|
{
|
|
"epoch": 1.0396744659206512,
|
|
"grad_norm": 0.7708786725997925,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6125,
|
|
"mean_token_accuracy": 0.8168396949768066,
|
|
"num_tokens": 325602970.0,
|
|
"step": 1022
|
|
},
|
|
{
|
|
"epoch": 1.0406917599186165,
|
|
"grad_norm": 0.7987100481987,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5998,
|
|
"mean_token_accuracy": 0.8218681812286377,
|
|
"num_tokens": 325898114.0,
|
|
"step": 1023
|
|
},
|
|
{
|
|
"epoch": 1.0417090539165819,
|
|
"grad_norm": 0.8853242993354797,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6021,
|
|
"mean_token_accuracy": 0.8202185034751892,
|
|
"num_tokens": 326210039.0,
|
|
"step": 1024
|
|
},
|
|
{
|
|
"epoch": 1.0427263479145472,
|
|
"grad_norm": 0.7377429008483887,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6201,
|
|
"mean_token_accuracy": 0.8149514198303223,
|
|
"num_tokens": 326539486.0,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"epoch": 1.0437436419125128,
|
|
"grad_norm": 0.7787431478500366,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.594,
|
|
"mean_token_accuracy": 0.82254958152771,
|
|
"num_tokens": 326855696.0,
|
|
"step": 1026
|
|
},
|
|
{
|
|
"epoch": 1.0447609359104781,
|
|
"grad_norm": 0.7603839635848999,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6047,
|
|
"mean_token_accuracy": 0.8195805549621582,
|
|
"num_tokens": 327187580.0,
|
|
"step": 1027
|
|
},
|
|
{
|
|
"epoch": 1.0457782299084435,
|
|
"grad_norm": 0.753252387046814,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6052,
|
|
"mean_token_accuracy": 0.8188362121582031,
|
|
"num_tokens": 327507342.0,
|
|
"step": 1028
|
|
},
|
|
{
|
|
"epoch": 1.0467955239064088,
|
|
"grad_norm": 0.7527574300765991,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6086,
|
|
"mean_token_accuracy": 0.8182722926139832,
|
|
"num_tokens": 327836633.0,
|
|
"step": 1029
|
|
},
|
|
{
|
|
"epoch": 1.0478128179043744,
|
|
"grad_norm": 0.784575879573822,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5906,
|
|
"mean_token_accuracy": 0.822380006313324,
|
|
"num_tokens": 328133022.0,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 1.0488301119023398,
|
|
"grad_norm": 0.8077467083930969,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6029,
|
|
"mean_token_accuracy": 0.8195399045944214,
|
|
"num_tokens": 328442910.0,
|
|
"step": 1031
|
|
},
|
|
{
|
|
"epoch": 1.0498474059003051,
|
|
"grad_norm": 0.7402403950691223,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5968,
|
|
"mean_token_accuracy": 0.8225548267364502,
|
|
"num_tokens": 328766960.0,
|
|
"step": 1032
|
|
},
|
|
{
|
|
"epoch": 1.0508646998982707,
|
|
"grad_norm": 0.7321717739105225,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6137,
|
|
"mean_token_accuracy": 0.8174277544021606,
|
|
"num_tokens": 329079757.0,
|
|
"step": 1033
|
|
},
|
|
{
|
|
"epoch": 1.051881993896236,
|
|
"grad_norm": 0.8269543647766113,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.617,
|
|
"mean_token_accuracy": 0.8168300986289978,
|
|
"num_tokens": 329396255.0,
|
|
"step": 1034
|
|
},
|
|
{
|
|
"epoch": 1.0528992878942014,
|
|
"grad_norm": 0.8111584186553955,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.566,
|
|
"mean_token_accuracy": 0.8299026489257812,
|
|
"num_tokens": 329691299.0,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"epoch": 1.0539165818921667,
|
|
"grad_norm": 0.7589960694313049,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5891,
|
|
"mean_token_accuracy": 0.8235443830490112,
|
|
"num_tokens": 330014371.0,
|
|
"step": 1036
|
|
},
|
|
{
|
|
"epoch": 1.0549338758901323,
|
|
"grad_norm": 0.7154140472412109,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5974,
|
|
"mean_token_accuracy": 0.822191596031189,
|
|
"num_tokens": 330363604.0,
|
|
"step": 1037
|
|
},
|
|
{
|
|
"epoch": 1.0559511698880977,
|
|
"grad_norm": 0.7874635457992554,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5685,
|
|
"mean_token_accuracy": 0.8294987082481384,
|
|
"num_tokens": 330698211.0,
|
|
"step": 1038
|
|
},
|
|
{
|
|
"epoch": 1.056968463886063,
|
|
"grad_norm": 0.741408109664917,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5952,
|
|
"mean_token_accuracy": 0.8210251927375793,
|
|
"num_tokens": 331039546.0,
|
|
"step": 1039
|
|
},
|
|
{
|
|
"epoch": 1.0579857578840284,
|
|
"grad_norm": 0.7418220043182373,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5898,
|
|
"mean_token_accuracy": 0.8246545791625977,
|
|
"num_tokens": 331349908.0,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 1.059003051881994,
|
|
"grad_norm": 0.7552366852760315,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5981,
|
|
"mean_token_accuracy": 0.8202588558197021,
|
|
"num_tokens": 331685842.0,
|
|
"step": 1041
|
|
},
|
|
{
|
|
"epoch": 1.0600203458799593,
|
|
"grad_norm": 0.7713178396224976,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.584,
|
|
"mean_token_accuracy": 0.825634777545929,
|
|
"num_tokens": 332002207.0,
|
|
"step": 1042
|
|
},
|
|
{
|
|
"epoch": 1.0610376398779247,
|
|
"grad_norm": 0.7259511351585388,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5722,
|
|
"mean_token_accuracy": 0.8279554843902588,
|
|
"num_tokens": 332313613.0,
|
|
"step": 1043
|
|
},
|
|
{
|
|
"epoch": 1.0620549338758902,
|
|
"grad_norm": 0.7211476564407349,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5923,
|
|
"mean_token_accuracy": 0.8233799934387207,
|
|
"num_tokens": 332638608.0,
|
|
"step": 1044
|
|
},
|
|
{
|
|
"epoch": 1.0630722278738556,
|
|
"grad_norm": 0.794397234916687,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5808,
|
|
"mean_token_accuracy": 0.8254649639129639,
|
|
"num_tokens": 332951297.0,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"epoch": 1.064089521871821,
|
|
"grad_norm": 0.7747459411621094,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6075,
|
|
"mean_token_accuracy": 0.8188462257385254,
|
|
"num_tokens": 333249289.0,
|
|
"step": 1046
|
|
},
|
|
{
|
|
"epoch": 1.0651068158697863,
|
|
"grad_norm": 0.7619302868843079,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5583,
|
|
"mean_token_accuracy": 0.8311722278594971,
|
|
"num_tokens": 333565896.0,
|
|
"step": 1047
|
|
},
|
|
{
|
|
"epoch": 1.0661241098677519,
|
|
"grad_norm": 0.7596496939659119,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5967,
|
|
"mean_token_accuracy": 0.8213114738464355,
|
|
"num_tokens": 333906023.0,
|
|
"step": 1048
|
|
},
|
|
{
|
|
"epoch": 1.0671414038657172,
|
|
"grad_norm": 0.7507315278053284,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.613,
|
|
"mean_token_accuracy": 0.8168172240257263,
|
|
"num_tokens": 334238798.0,
|
|
"step": 1049
|
|
},
|
|
{
|
|
"epoch": 1.0681586978636826,
|
|
"grad_norm": 0.7711279988288879,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6209,
|
|
"mean_token_accuracy": 0.8149176836013794,
|
|
"num_tokens": 334559804.0,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 1.069175991861648,
|
|
"grad_norm": 0.8109130263328552,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5891,
|
|
"mean_token_accuracy": 0.8220605254173279,
|
|
"num_tokens": 334867946.0,
|
|
"step": 1051
|
|
},
|
|
{
|
|
"epoch": 1.0701932858596135,
|
|
"grad_norm": 0.7828776240348816,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5962,
|
|
"mean_token_accuracy": 0.8221853971481323,
|
|
"num_tokens": 335181587.0,
|
|
"step": 1052
|
|
},
|
|
{
|
|
"epoch": 1.0712105798575788,
|
|
"grad_norm": 0.7226819396018982,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5974,
|
|
"mean_token_accuracy": 0.8222191333770752,
|
|
"num_tokens": 335509021.0,
|
|
"step": 1053
|
|
},
|
|
{
|
|
"epoch": 1.0722278738555442,
|
|
"grad_norm": 0.7938230037689209,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5934,
|
|
"mean_token_accuracy": 0.822351336479187,
|
|
"num_tokens": 335809158.0,
|
|
"step": 1054
|
|
},
|
|
{
|
|
"epoch": 1.0732451678535098,
|
|
"grad_norm": 0.7728512287139893,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5964,
|
|
"mean_token_accuracy": 0.8209540843963623,
|
|
"num_tokens": 336119431.0,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"epoch": 1.0742624618514751,
|
|
"grad_norm": 0.7619753479957581,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6004,
|
|
"mean_token_accuracy": 0.8202822804450989,
|
|
"num_tokens": 336442944.0,
|
|
"step": 1056
|
|
},
|
|
{
|
|
"epoch": 1.0752797558494405,
|
|
"grad_norm": 0.7428168654441833,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5695,
|
|
"mean_token_accuracy": 0.8283674120903015,
|
|
"num_tokens": 336743733.0,
|
|
"step": 1057
|
|
},
|
|
{
|
|
"epoch": 1.0762970498474058,
|
|
"grad_norm": 0.7476255893707275,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6108,
|
|
"mean_token_accuracy": 0.8177371621131897,
|
|
"num_tokens": 337080544.0,
|
|
"step": 1058
|
|
},
|
|
{
|
|
"epoch": 1.0773143438453714,
|
|
"grad_norm": 0.7603205442428589,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6001,
|
|
"mean_token_accuracy": 0.8195057511329651,
|
|
"num_tokens": 337392278.0,
|
|
"step": 1059
|
|
},
|
|
{
|
|
"epoch": 1.0783316378433367,
|
|
"grad_norm": 0.7620111703872681,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5659,
|
|
"mean_token_accuracy": 0.8295987844467163,
|
|
"num_tokens": 337706603.0,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 1.079348931841302,
|
|
"grad_norm": 0.7455660104751587,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5823,
|
|
"mean_token_accuracy": 0.8240776062011719,
|
|
"num_tokens": 338026298.0,
|
|
"step": 1061
|
|
},
|
|
{
|
|
"epoch": 1.0803662258392674,
|
|
"grad_norm": 0.809394121170044,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5731,
|
|
"mean_token_accuracy": 0.8286706209182739,
|
|
"num_tokens": 338365205.0,
|
|
"step": 1062
|
|
},
|
|
{
|
|
"epoch": 1.081383519837233,
|
|
"grad_norm": 0.8363968729972839,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6057,
|
|
"mean_token_accuracy": 0.8185662627220154,
|
|
"num_tokens": 338668185.0,
|
|
"step": 1063
|
|
},
|
|
{
|
|
"epoch": 1.0824008138351984,
|
|
"grad_norm": 0.7580261826515198,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.593,
|
|
"mean_token_accuracy": 0.8213092088699341,
|
|
"num_tokens": 338990507.0,
|
|
"step": 1064
|
|
},
|
|
{
|
|
"epoch": 1.0834181078331637,
|
|
"grad_norm": 0.7513942122459412,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5915,
|
|
"mean_token_accuracy": 0.8225909471511841,
|
|
"num_tokens": 339326912.0,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"epoch": 1.0844354018311293,
|
|
"grad_norm": 0.7773120999336243,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6044,
|
|
"mean_token_accuracy": 0.8196120262145996,
|
|
"num_tokens": 339637655.0,
|
|
"step": 1066
|
|
},
|
|
{
|
|
"epoch": 1.0854526958290946,
|
|
"grad_norm": 0.77564537525177,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5672,
|
|
"mean_token_accuracy": 0.8285472989082336,
|
|
"num_tokens": 339944200.0,
|
|
"step": 1067
|
|
},
|
|
{
|
|
"epoch": 1.08646998982706,
|
|
"grad_norm": 0.763636589050293,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6021,
|
|
"mean_token_accuracy": 0.8201035261154175,
|
|
"num_tokens": 340252788.0,
|
|
"step": 1068
|
|
},
|
|
{
|
|
"epoch": 1.0874872838250254,
|
|
"grad_norm": 0.827096700668335,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.595,
|
|
"mean_token_accuracy": 0.8229066133499146,
|
|
"num_tokens": 340571484.0,
|
|
"step": 1069
|
|
},
|
|
{
|
|
"epoch": 1.088504577822991,
|
|
"grad_norm": 0.7606673240661621,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.592,
|
|
"mean_token_accuracy": 0.8228622674942017,
|
|
"num_tokens": 340886244.0,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 1.0895218718209563,
|
|
"grad_norm": 0.7811551094055176,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5916,
|
|
"mean_token_accuracy": 0.8230052590370178,
|
|
"num_tokens": 341199202.0,
|
|
"step": 1071
|
|
},
|
|
{
|
|
"epoch": 1.0905391658189216,
|
|
"grad_norm": 0.7782750129699707,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.596,
|
|
"mean_token_accuracy": 0.8207575082778931,
|
|
"num_tokens": 341532738.0,
|
|
"step": 1072
|
|
},
|
|
{
|
|
"epoch": 1.091556459816887,
|
|
"grad_norm": 0.797938346862793,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5933,
|
|
"mean_token_accuracy": 0.8228356838226318,
|
|
"num_tokens": 341847416.0,
|
|
"step": 1073
|
|
},
|
|
{
|
|
"epoch": 1.0925737538148526,
|
|
"grad_norm": 0.7769486308097839,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5859,
|
|
"mean_token_accuracy": 0.8257655501365662,
|
|
"num_tokens": 342175374.0,
|
|
"step": 1074
|
|
},
|
|
{
|
|
"epoch": 1.093591047812818,
|
|
"grad_norm": 0.7653997540473938,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6121,
|
|
"mean_token_accuracy": 0.8170241713523865,
|
|
"num_tokens": 342506392.0,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"epoch": 1.0946083418107833,
|
|
"grad_norm": 0.8226693868637085,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6029,
|
|
"mean_token_accuracy": 0.8193036317825317,
|
|
"num_tokens": 342830993.0,
|
|
"step": 1076
|
|
},
|
|
{
|
|
"epoch": 1.0956256358087488,
|
|
"grad_norm": 0.7961437106132507,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5906,
|
|
"mean_token_accuracy": 0.8232593536376953,
|
|
"num_tokens": 343141065.0,
|
|
"step": 1077
|
|
},
|
|
{
|
|
"epoch": 1.0966429298067142,
|
|
"grad_norm": 0.7740011215209961,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.623,
|
|
"mean_token_accuracy": 0.8138188719749451,
|
|
"num_tokens": 343447392.0,
|
|
"step": 1078
|
|
},
|
|
{
|
|
"epoch": 1.0976602238046795,
|
|
"grad_norm": 0.7739623188972473,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5849,
|
|
"mean_token_accuracy": 0.8247867822647095,
|
|
"num_tokens": 343757619.0,
|
|
"step": 1079
|
|
},
|
|
{
|
|
"epoch": 1.0986775178026449,
|
|
"grad_norm": 0.7831132411956787,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5934,
|
|
"mean_token_accuracy": 0.8220590949058533,
|
|
"num_tokens": 344073814.0,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 1.0996948118006105,
|
|
"grad_norm": 0.854607880115509,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5891,
|
|
"mean_token_accuracy": 0.8229290246963501,
|
|
"num_tokens": 344391441.0,
|
|
"step": 1081
|
|
},
|
|
{
|
|
"epoch": 1.1007121057985758,
|
|
"grad_norm": 0.749823808670044,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5955,
|
|
"mean_token_accuracy": 0.8226028680801392,
|
|
"num_tokens": 344712984.0,
|
|
"step": 1082
|
|
},
|
|
{
|
|
"epoch": 1.1017293997965412,
|
|
"grad_norm": 0.7485523819923401,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6069,
|
|
"mean_token_accuracy": 0.8197723627090454,
|
|
"num_tokens": 345036641.0,
|
|
"step": 1083
|
|
},
|
|
{
|
|
"epoch": 1.1027466937945065,
|
|
"grad_norm": 0.7741460800170898,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5889,
|
|
"mean_token_accuracy": 0.8218555450439453,
|
|
"num_tokens": 345369577.0,
|
|
"step": 1084
|
|
},
|
|
{
|
|
"epoch": 1.103763987792472,
|
|
"grad_norm": 0.7494634389877319,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5755,
|
|
"mean_token_accuracy": 0.8278588056564331,
|
|
"num_tokens": 345683367.0,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"epoch": 1.1047812817904374,
|
|
"grad_norm": 0.7270079851150513,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6191,
|
|
"mean_token_accuracy": 0.816685676574707,
|
|
"num_tokens": 346003193.0,
|
|
"step": 1086
|
|
},
|
|
{
|
|
"epoch": 1.1057985757884028,
|
|
"grad_norm": 0.7581070065498352,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5947,
|
|
"mean_token_accuracy": 0.8216639757156372,
|
|
"num_tokens": 346330561.0,
|
|
"step": 1087
|
|
},
|
|
{
|
|
"epoch": 1.1068158697863684,
|
|
"grad_norm": 0.7528476715087891,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6004,
|
|
"mean_token_accuracy": 0.8204837441444397,
|
|
"num_tokens": 346660957.0,
|
|
"step": 1088
|
|
},
|
|
{
|
|
"epoch": 1.1078331637843337,
|
|
"grad_norm": 0.759814977645874,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5687,
|
|
"mean_token_accuracy": 0.8286617398262024,
|
|
"num_tokens": 346990623.0,
|
|
"step": 1089
|
|
},
|
|
{
|
|
"epoch": 1.108850457782299,
|
|
"grad_norm": 0.7554075121879578,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.597,
|
|
"mean_token_accuracy": 0.8215787410736084,
|
|
"num_tokens": 347324012.0,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 1.1098677517802644,
|
|
"grad_norm": 0.7750098705291748,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6022,
|
|
"mean_token_accuracy": 0.8182422518730164,
|
|
"num_tokens": 347632344.0,
|
|
"step": 1091
|
|
},
|
|
{
|
|
"epoch": 1.11088504577823,
|
|
"grad_norm": 0.8283107280731201,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.608,
|
|
"mean_token_accuracy": 0.816761314868927,
|
|
"num_tokens": 347925739.0,
|
|
"step": 1092
|
|
},
|
|
{
|
|
"epoch": 1.1119023397761953,
|
|
"grad_norm": 0.7343229055404663,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5687,
|
|
"mean_token_accuracy": 0.8283286094665527,
|
|
"num_tokens": 348240499.0,
|
|
"step": 1093
|
|
},
|
|
{
|
|
"epoch": 1.1129196337741607,
|
|
"grad_norm": 0.7427061796188354,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5767,
|
|
"mean_token_accuracy": 0.8270567655563354,
|
|
"num_tokens": 348555987.0,
|
|
"step": 1094
|
|
},
|
|
{
|
|
"epoch": 1.113936927772126,
|
|
"grad_norm": 0.7688320279121399,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5914,
|
|
"mean_token_accuracy": 0.8231028318405151,
|
|
"num_tokens": 348866347.0,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"epoch": 1.1149542217700916,
|
|
"grad_norm": 0.7548975944519043,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5717,
|
|
"mean_token_accuracy": 0.8277769088745117,
|
|
"num_tokens": 349194754.0,
|
|
"step": 1096
|
|
},
|
|
{
|
|
"epoch": 1.115971515768057,
|
|
"grad_norm": 0.7943713665008545,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5838,
|
|
"mean_token_accuracy": 0.8240779638290405,
|
|
"num_tokens": 349501086.0,
|
|
"step": 1097
|
|
},
|
|
{
|
|
"epoch": 1.1169888097660223,
|
|
"grad_norm": 0.7876549363136292,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5811,
|
|
"mean_token_accuracy": 0.826018214225769,
|
|
"num_tokens": 349818869.0,
|
|
"step": 1098
|
|
},
|
|
{
|
|
"epoch": 1.118006103763988,
|
|
"grad_norm": 0.7462044954299927,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5821,
|
|
"mean_token_accuracy": 0.8253153562545776,
|
|
"num_tokens": 350142280.0,
|
|
"step": 1099
|
|
},
|
|
{
|
|
"epoch": 1.1190233977619533,
|
|
"grad_norm": 0.7265391945838928,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5832,
|
|
"mean_token_accuracy": 0.8255075216293335,
|
|
"num_tokens": 350469513.0,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 1.1200406917599186,
|
|
"grad_norm": 0.7540880441665649,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.579,
|
|
"mean_token_accuracy": 0.8258617520332336,
|
|
"num_tokens": 350784696.0,
|
|
"step": 1101
|
|
},
|
|
{
|
|
"epoch": 1.121057985757884,
|
|
"grad_norm": 0.722629964351654,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5561,
|
|
"mean_token_accuracy": 0.832967221736908,
|
|
"num_tokens": 351109045.0,
|
|
"step": 1102
|
|
},
|
|
{
|
|
"epoch": 1.1220752797558495,
|
|
"grad_norm": 0.7430449724197388,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5703,
|
|
"mean_token_accuracy": 0.8284896612167358,
|
|
"num_tokens": 351433256.0,
|
|
"step": 1103
|
|
},
|
|
{
|
|
"epoch": 1.1230925737538149,
|
|
"grad_norm": 0.7905130982398987,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6125,
|
|
"mean_token_accuracy": 0.8168379664421082,
|
|
"num_tokens": 351734672.0,
|
|
"step": 1104
|
|
},
|
|
{
|
|
"epoch": 1.1241098677517802,
|
|
"grad_norm": 0.8053008317947388,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5882,
|
|
"mean_token_accuracy": 0.823253870010376,
|
|
"num_tokens": 352042249.0,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"epoch": 1.1251271617497456,
|
|
"grad_norm": 0.7345619797706604,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5926,
|
|
"mean_token_accuracy": 0.8202496767044067,
|
|
"num_tokens": 352358733.0,
|
|
"step": 1106
|
|
},
|
|
{
|
|
"epoch": 1.1261444557477112,
|
|
"grad_norm": 0.7487010955810547,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.596,
|
|
"mean_token_accuracy": 0.8215106725692749,
|
|
"num_tokens": 352667221.0,
|
|
"step": 1107
|
|
},
|
|
{
|
|
"epoch": 1.1271617497456765,
|
|
"grad_norm": 0.7505720853805542,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5795,
|
|
"mean_token_accuracy": 0.8260778784751892,
|
|
"num_tokens": 352982322.0,
|
|
"step": 1108
|
|
},
|
|
{
|
|
"epoch": 1.1281790437436419,
|
|
"grad_norm": 0.7573304176330566,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5895,
|
|
"mean_token_accuracy": 0.8235150575637817,
|
|
"num_tokens": 353294238.0,
|
|
"step": 1109
|
|
},
|
|
{
|
|
"epoch": 1.1291963377416074,
|
|
"grad_norm": 0.7672238349914551,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6029,
|
|
"mean_token_accuracy": 0.820686936378479,
|
|
"num_tokens": 353617488.0,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 1.1302136317395728,
|
|
"grad_norm": 0.7361064553260803,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5867,
|
|
"mean_token_accuracy": 0.823577880859375,
|
|
"num_tokens": 353942973.0,
|
|
"step": 1111
|
|
},
|
|
{
|
|
"epoch": 1.1312309257375381,
|
|
"grad_norm": 0.757715106010437,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.616,
|
|
"mean_token_accuracy": 0.8163123726844788,
|
|
"num_tokens": 354241839.0,
|
|
"step": 1112
|
|
},
|
|
{
|
|
"epoch": 1.1322482197355035,
|
|
"grad_norm": 0.7401450872421265,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6027,
|
|
"mean_token_accuracy": 0.8200206160545349,
|
|
"num_tokens": 354575014.0,
|
|
"step": 1113
|
|
},
|
|
{
|
|
"epoch": 1.133265513733469,
|
|
"grad_norm": 0.8054563999176025,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5895,
|
|
"mean_token_accuracy": 0.8230452537536621,
|
|
"num_tokens": 354915712.0,
|
|
"step": 1114
|
|
},
|
|
{
|
|
"epoch": 1.1342828077314344,
|
|
"grad_norm": 0.7494530081748962,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6118,
|
|
"mean_token_accuracy": 0.8168686032295227,
|
|
"num_tokens": 355237653.0,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"epoch": 1.1353001017293998,
|
|
"grad_norm": 0.7508543133735657,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5861,
|
|
"mean_token_accuracy": 0.8246638774871826,
|
|
"num_tokens": 355582832.0,
|
|
"step": 1116
|
|
},
|
|
{
|
|
"epoch": 1.1363173957273651,
|
|
"grad_norm": 0.788264811038971,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.599,
|
|
"mean_token_accuracy": 0.8205045461654663,
|
|
"num_tokens": 355882190.0,
|
|
"step": 1117
|
|
},
|
|
{
|
|
"epoch": 1.1373346897253307,
|
|
"grad_norm": 0.7862522006034851,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5765,
|
|
"mean_token_accuracy": 0.8265331983566284,
|
|
"num_tokens": 356192221.0,
|
|
"step": 1118
|
|
},
|
|
{
|
|
"epoch": 1.138351983723296,
|
|
"grad_norm": 0.7364363074302673,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6191,
|
|
"mean_token_accuracy": 0.8141418099403381,
|
|
"num_tokens": 356515011.0,
|
|
"step": 1119
|
|
},
|
|
{
|
|
"epoch": 1.1393692777212614,
|
|
"grad_norm": 0.7794057726860046,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5846,
|
|
"mean_token_accuracy": 0.8244831562042236,
|
|
"num_tokens": 356840072.0,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 1.140386571719227,
|
|
"grad_norm": 0.7552722692489624,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6027,
|
|
"mean_token_accuracy": 0.8201184272766113,
|
|
"num_tokens": 357164387.0,
|
|
"step": 1121
|
|
},
|
|
{
|
|
"epoch": 1.1414038657171923,
|
|
"grad_norm": 0.7549647092819214,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5699,
|
|
"mean_token_accuracy": 0.8276758193969727,
|
|
"num_tokens": 357480123.0,
|
|
"step": 1122
|
|
},
|
|
{
|
|
"epoch": 1.1424211597151577,
|
|
"grad_norm": 0.8171870112419128,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5673,
|
|
"mean_token_accuracy": 0.8288565874099731,
|
|
"num_tokens": 357799511.0,
|
|
"step": 1123
|
|
},
|
|
{
|
|
"epoch": 1.143438453713123,
|
|
"grad_norm": 0.7676132321357727,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6055,
|
|
"mean_token_accuracy": 0.8197697997093201,
|
|
"num_tokens": 358095805.0,
|
|
"step": 1124
|
|
},
|
|
{
|
|
"epoch": 1.1444557477110886,
|
|
"grad_norm": 0.7719829082489014,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5827,
|
|
"mean_token_accuracy": 0.8251679539680481,
|
|
"num_tokens": 358405765.0,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"epoch": 1.145473041709054,
|
|
"grad_norm": 0.7176077365875244,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6303,
|
|
"mean_token_accuracy": 0.8127435445785522,
|
|
"num_tokens": 358741998.0,
|
|
"step": 1126
|
|
},
|
|
{
|
|
"epoch": 1.1464903357070193,
|
|
"grad_norm": 0.778710126876831,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5636,
|
|
"mean_token_accuracy": 0.8297712802886963,
|
|
"num_tokens": 359053024.0,
|
|
"step": 1127
|
|
},
|
|
{
|
|
"epoch": 1.1475076297049847,
|
|
"grad_norm": 0.7981055378913879,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5856,
|
|
"mean_token_accuracy": 0.8239826560020447,
|
|
"num_tokens": 359375453.0,
|
|
"step": 1128
|
|
},
|
|
{
|
|
"epoch": 1.1485249237029502,
|
|
"grad_norm": 0.7974837422370911,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5672,
|
|
"mean_token_accuracy": 0.8285097479820251,
|
|
"num_tokens": 359692458.0,
|
|
"step": 1129
|
|
},
|
|
{
|
|
"epoch": 1.1495422177009156,
|
|
"grad_norm": 0.7408716082572937,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5945,
|
|
"mean_token_accuracy": 0.8209438323974609,
|
|
"num_tokens": 360008843.0,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 1.150559511698881,
|
|
"grad_norm": 0.7501970529556274,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5763,
|
|
"mean_token_accuracy": 0.8257219791412354,
|
|
"num_tokens": 360333592.0,
|
|
"step": 1131
|
|
},
|
|
{
|
|
"epoch": 1.1515768056968465,
|
|
"grad_norm": 0.7652875781059265,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5455,
|
|
"mean_token_accuracy": 0.8355057835578918,
|
|
"num_tokens": 360646033.0,
|
|
"step": 1132
|
|
},
|
|
{
|
|
"epoch": 1.1525940996948119,
|
|
"grad_norm": 0.7454234957695007,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6028,
|
|
"mean_token_accuracy": 0.8191195726394653,
|
|
"num_tokens": 360980106.0,
|
|
"step": 1133
|
|
},
|
|
{
|
|
"epoch": 1.1536113936927772,
|
|
"grad_norm": 0.7779508829116821,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.583,
|
|
"mean_token_accuracy": 0.8256955742835999,
|
|
"num_tokens": 361296869.0,
|
|
"step": 1134
|
|
},
|
|
{
|
|
"epoch": 1.1546286876907426,
|
|
"grad_norm": 0.7592409253120422,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5898,
|
|
"mean_token_accuracy": 0.8217758536338806,
|
|
"num_tokens": 361597141.0,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"epoch": 1.155645981688708,
|
|
"grad_norm": 0.7225300073623657,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5861,
|
|
"mean_token_accuracy": 0.8229520320892334,
|
|
"num_tokens": 361931768.0,
|
|
"step": 1136
|
|
},
|
|
{
|
|
"epoch": 1.1566632756866735,
|
|
"grad_norm": 0.7548590302467346,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5571,
|
|
"mean_token_accuracy": 0.8324354887008667,
|
|
"num_tokens": 362246811.0,
|
|
"step": 1137
|
|
},
|
|
{
|
|
"epoch": 1.1576805696846388,
|
|
"grad_norm": 0.7888747453689575,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5868,
|
|
"mean_token_accuracy": 0.8239426612854004,
|
|
"num_tokens": 362555139.0,
|
|
"step": 1138
|
|
},
|
|
{
|
|
"epoch": 1.1586978636826042,
|
|
"grad_norm": 0.7801425457000732,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5734,
|
|
"mean_token_accuracy": 0.827373743057251,
|
|
"num_tokens": 362881363.0,
|
|
"step": 1139
|
|
},
|
|
{
|
|
"epoch": 1.1597151576805698,
|
|
"grad_norm": 0.7782330513000488,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5955,
|
|
"mean_token_accuracy": 0.8224520683288574,
|
|
"num_tokens": 363207013.0,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 1.1607324516785351,
|
|
"grad_norm": 0.7746996283531189,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5777,
|
|
"mean_token_accuracy": 0.8252719640731812,
|
|
"num_tokens": 363501174.0,
|
|
"step": 1141
|
|
},
|
|
{
|
|
"epoch": 1.1617497456765005,
|
|
"grad_norm": 0.7277313470840454,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5823,
|
|
"mean_token_accuracy": 0.8253401517868042,
|
|
"num_tokens": 363831350.0,
|
|
"step": 1142
|
|
},
|
|
{
|
|
"epoch": 1.162767039674466,
|
|
"grad_norm": 0.7452883124351501,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5823,
|
|
"mean_token_accuracy": 0.8252953290939331,
|
|
"num_tokens": 364161537.0,
|
|
"step": 1143
|
|
},
|
|
{
|
|
"epoch": 1.1637843336724314,
|
|
"grad_norm": 0.7810433506965637,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5834,
|
|
"mean_token_accuracy": 0.8254508972167969,
|
|
"num_tokens": 364468701.0,
|
|
"step": 1144
|
|
},
|
|
{
|
|
"epoch": 1.1648016276703967,
|
|
"grad_norm": 0.7638463377952576,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5852,
|
|
"mean_token_accuracy": 0.8235447406768799,
|
|
"num_tokens": 364793011.0,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"epoch": 1.165818921668362,
|
|
"grad_norm": 0.8116283416748047,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5818,
|
|
"mean_token_accuracy": 0.8259240984916687,
|
|
"num_tokens": 365125068.0,
|
|
"step": 1146
|
|
},
|
|
{
|
|
"epoch": 1.1668362156663274,
|
|
"grad_norm": 0.7863451242446899,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6043,
|
|
"mean_token_accuracy": 0.8196288347244263,
|
|
"num_tokens": 365447819.0,
|
|
"step": 1147
|
|
},
|
|
{
|
|
"epoch": 1.167853509664293,
|
|
"grad_norm": 0.7274298071861267,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5945,
|
|
"mean_token_accuracy": 0.8230388760566711,
|
|
"num_tokens": 365766838.0,
|
|
"step": 1148
|
|
},
|
|
{
|
|
"epoch": 1.1688708036622584,
|
|
"grad_norm": 0.7949771285057068,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5732,
|
|
"mean_token_accuracy": 0.8283715844154358,
|
|
"num_tokens": 366101878.0,
|
|
"step": 1149
|
|
},
|
|
{
|
|
"epoch": 1.1698880976602237,
|
|
"grad_norm": 0.7356235384941101,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5785,
|
|
"mean_token_accuracy": 0.8254322409629822,
|
|
"num_tokens": 366439188.0,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 1.1709053916581893,
|
|
"grad_norm": 0.7703425884246826,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5998,
|
|
"mean_token_accuracy": 0.8203462362289429,
|
|
"num_tokens": 366769572.0,
|
|
"step": 1151
|
|
},
|
|
{
|
|
"epoch": 1.1719226856561547,
|
|
"grad_norm": 0.7374350428581238,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5702,
|
|
"mean_token_accuracy": 0.8288112878799438,
|
|
"num_tokens": 367097208.0,
|
|
"step": 1152
|
|
},
|
|
{
|
|
"epoch": 1.17293997965412,
|
|
"grad_norm": 0.7855170369148254,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.616,
|
|
"mean_token_accuracy": 0.8170003890991211,
|
|
"num_tokens": 367424929.0,
|
|
"step": 1153
|
|
},
|
|
{
|
|
"epoch": 1.1739572736520856,
|
|
"grad_norm": 0.7788858413696289,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5843,
|
|
"mean_token_accuracy": 0.8242186903953552,
|
|
"num_tokens": 367753127.0,
|
|
"step": 1154
|
|
},
|
|
{
|
|
"epoch": 1.174974567650051,
|
|
"grad_norm": 0.7844034433364868,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5919,
|
|
"mean_token_accuracy": 0.8226374387741089,
|
|
"num_tokens": 368073652.0,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"epoch": 1.1759918616480163,
|
|
"grad_norm": 0.7838830947875977,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5805,
|
|
"mean_token_accuracy": 0.8258245587348938,
|
|
"num_tokens": 368389034.0,
|
|
"step": 1156
|
|
},
|
|
{
|
|
"epoch": 1.1770091556459816,
|
|
"grad_norm": 0.7512738704681396,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6029,
|
|
"mean_token_accuracy": 0.820223867893219,
|
|
"num_tokens": 368721238.0,
|
|
"step": 1157
|
|
},
|
|
{
|
|
"epoch": 1.178026449643947,
|
|
"grad_norm": 0.7941290140151978,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5941,
|
|
"mean_token_accuracy": 0.8232244253158569,
|
|
"num_tokens": 369039106.0,
|
|
"step": 1158
|
|
},
|
|
{
|
|
"epoch": 1.1790437436419126,
|
|
"grad_norm": 0.7485936284065247,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5718,
|
|
"mean_token_accuracy": 0.8281660079956055,
|
|
"num_tokens": 369375622.0,
|
|
"step": 1159
|
|
},
|
|
{
|
|
"epoch": 1.180061037639878,
|
|
"grad_norm": 0.7521687150001526,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.605,
|
|
"mean_token_accuracy": 0.8194353580474854,
|
|
"num_tokens": 369688789.0,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 1.1810783316378433,
|
|
"grad_norm": 0.775185227394104,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6054,
|
|
"mean_token_accuracy": 0.8191851377487183,
|
|
"num_tokens": 369998967.0,
|
|
"step": 1161
|
|
},
|
|
{
|
|
"epoch": 1.1820956256358088,
|
|
"grad_norm": 0.7778295278549194,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5954,
|
|
"mean_token_accuracy": 0.8214291334152222,
|
|
"num_tokens": 370333473.0,
|
|
"step": 1162
|
|
},
|
|
{
|
|
"epoch": 1.1831129196337742,
|
|
"grad_norm": 0.7511019706726074,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5936,
|
|
"mean_token_accuracy": 0.82322096824646,
|
|
"num_tokens": 370660466.0,
|
|
"step": 1163
|
|
},
|
|
{
|
|
"epoch": 1.1841302136317395,
|
|
"grad_norm": 0.780828058719635,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.614,
|
|
"mean_token_accuracy": 0.8165101408958435,
|
|
"num_tokens": 370998386.0,
|
|
"step": 1164
|
|
},
|
|
{
|
|
"epoch": 1.1851475076297049,
|
|
"grad_norm": 0.7184352278709412,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5826,
|
|
"mean_token_accuracy": 0.8255602717399597,
|
|
"num_tokens": 371332751.0,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"epoch": 1.1861648016276705,
|
|
"grad_norm": 0.7472013831138611,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5973,
|
|
"mean_token_accuracy": 0.8212395906448364,
|
|
"num_tokens": 371662573.0,
|
|
"step": 1166
|
|
},
|
|
{
|
|
"epoch": 1.1871820956256358,
|
|
"grad_norm": 0.7376736998558044,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5599,
|
|
"mean_token_accuracy": 0.8313839435577393,
|
|
"num_tokens": 371986831.0,
|
|
"step": 1167
|
|
},
|
|
{
|
|
"epoch": 1.1881993896236012,
|
|
"grad_norm": 0.8291728496551514,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.625,
|
|
"mean_token_accuracy": 0.8143845200538635,
|
|
"num_tokens": 372315615.0,
|
|
"step": 1168
|
|
},
|
|
{
|
|
"epoch": 1.1892166836215665,
|
|
"grad_norm": 0.7506025433540344,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5823,
|
|
"mean_token_accuracy": 0.8245192766189575,
|
|
"num_tokens": 372638897.0,
|
|
"step": 1169
|
|
},
|
|
{
|
|
"epoch": 1.190233977619532,
|
|
"grad_norm": 0.7672327160835266,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5818,
|
|
"mean_token_accuracy": 0.8251274824142456,
|
|
"num_tokens": 372956645.0,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 1.1912512716174974,
|
|
"grad_norm": 1.3121378421783447,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.566,
|
|
"mean_token_accuracy": 0.8294004201889038,
|
|
"num_tokens": 373265980.0,
|
|
"step": 1171
|
|
},
|
|
{
|
|
"epoch": 1.1922685656154628,
|
|
"grad_norm": 0.7915225028991699,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5998,
|
|
"mean_token_accuracy": 0.821526825428009,
|
|
"num_tokens": 373575839.0,
|
|
"step": 1172
|
|
},
|
|
{
|
|
"epoch": 1.1932858596134284,
|
|
"grad_norm": 0.7794643044471741,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5788,
|
|
"mean_token_accuracy": 0.8254324793815613,
|
|
"num_tokens": 373896838.0,
|
|
"step": 1173
|
|
},
|
|
{
|
|
"epoch": 1.1943031536113937,
|
|
"grad_norm": 0.7948567271232605,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5828,
|
|
"mean_token_accuracy": 0.8252392411231995,
|
|
"num_tokens": 374210420.0,
|
|
"step": 1174
|
|
},
|
|
{
|
|
"epoch": 1.195320447609359,
|
|
"grad_norm": 0.7596521973609924,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5487,
|
|
"mean_token_accuracy": 0.8344128727912903,
|
|
"num_tokens": 374539265.0,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"epoch": 1.1963377416073244,
|
|
"grad_norm": 0.8085728287696838,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5841,
|
|
"mean_token_accuracy": 0.8254034519195557,
|
|
"num_tokens": 374846358.0,
|
|
"step": 1176
|
|
},
|
|
{
|
|
"epoch": 1.19735503560529,
|
|
"grad_norm": 0.8072423338890076,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5699,
|
|
"mean_token_accuracy": 0.8295195698738098,
|
|
"num_tokens": 375157542.0,
|
|
"step": 1177
|
|
},
|
|
{
|
|
"epoch": 1.1983723296032553,
|
|
"grad_norm": 0.7838948965072632,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5819,
|
|
"mean_token_accuracy": 0.825675368309021,
|
|
"num_tokens": 375472279.0,
|
|
"step": 1178
|
|
},
|
|
{
|
|
"epoch": 1.1993896236012207,
|
|
"grad_norm": 0.8102052807807922,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5754,
|
|
"mean_token_accuracy": 0.8277257680892944,
|
|
"num_tokens": 375785251.0,
|
|
"step": 1179
|
|
},
|
|
{
|
|
"epoch": 1.200406917599186,
|
|
"grad_norm": 0.7381642460823059,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5677,
|
|
"mean_token_accuracy": 0.828418493270874,
|
|
"num_tokens": 376099013.0,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 1.2014242115971516,
|
|
"grad_norm": 0.7651596665382385,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.589,
|
|
"mean_token_accuracy": 0.8231362700462341,
|
|
"num_tokens": 376414415.0,
|
|
"step": 1181
|
|
},
|
|
{
|
|
"epoch": 1.202441505595117,
|
|
"grad_norm": 0.8105350136756897,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5973,
|
|
"mean_token_accuracy": 0.8211319446563721,
|
|
"num_tokens": 376731071.0,
|
|
"step": 1182
|
|
},
|
|
{
|
|
"epoch": 1.2034587995930823,
|
|
"grad_norm": 0.8020378947257996,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5978,
|
|
"mean_token_accuracy": 0.8202518820762634,
|
|
"num_tokens": 377061505.0,
|
|
"step": 1183
|
|
},
|
|
{
|
|
"epoch": 1.204476093591048,
|
|
"grad_norm": 0.7923497557640076,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5976,
|
|
"mean_token_accuracy": 0.8226475119590759,
|
|
"num_tokens": 377377739.0,
|
|
"step": 1184
|
|
},
|
|
{
|
|
"epoch": 1.2054933875890133,
|
|
"grad_norm": 0.7810748219490051,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.602,
|
|
"mean_token_accuracy": 0.8202524185180664,
|
|
"num_tokens": 377694342.0,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"epoch": 1.2065106815869786,
|
|
"grad_norm": 0.7620840668678284,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5758,
|
|
"mean_token_accuracy": 0.8272120952606201,
|
|
"num_tokens": 378013523.0,
|
|
"step": 1186
|
|
},
|
|
{
|
|
"epoch": 1.207527975584944,
|
|
"grad_norm": 0.8037322759628296,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5737,
|
|
"mean_token_accuracy": 0.8277493715286255,
|
|
"num_tokens": 378315219.0,
|
|
"step": 1187
|
|
},
|
|
{
|
|
"epoch": 1.2085452695829095,
|
|
"grad_norm": 0.7999018430709839,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5714,
|
|
"mean_token_accuracy": 0.8289440870285034,
|
|
"num_tokens": 378622003.0,
|
|
"step": 1188
|
|
},
|
|
{
|
|
"epoch": 1.2095625635808749,
|
|
"grad_norm": 0.7701825499534607,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5691,
|
|
"mean_token_accuracy": 0.8280704617500305,
|
|
"num_tokens": 378932580.0,
|
|
"step": 1189
|
|
},
|
|
{
|
|
"epoch": 1.2105798575788402,
|
|
"grad_norm": 0.7750142216682434,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6029,
|
|
"mean_token_accuracy": 0.8203139305114746,
|
|
"num_tokens": 379261815.0,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 1.2115971515768056,
|
|
"grad_norm": 0.7240380644798279,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.592,
|
|
"mean_token_accuracy": 0.8214856386184692,
|
|
"num_tokens": 379589270.0,
|
|
"step": 1191
|
|
},
|
|
{
|
|
"epoch": 1.2126144455747712,
|
|
"grad_norm": 0.7719489336013794,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5922,
|
|
"mean_token_accuracy": 0.8215484023094177,
|
|
"num_tokens": 379905958.0,
|
|
"step": 1192
|
|
},
|
|
{
|
|
"epoch": 1.2136317395727365,
|
|
"grad_norm": 0.7766221761703491,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5864,
|
|
"mean_token_accuracy": 0.8235345482826233,
|
|
"num_tokens": 380215696.0,
|
|
"step": 1193
|
|
},
|
|
{
|
|
"epoch": 1.2146490335707019,
|
|
"grad_norm": 0.7447682023048401,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5661,
|
|
"mean_token_accuracy": 0.8295247554779053,
|
|
"num_tokens": 380552451.0,
|
|
"step": 1194
|
|
},
|
|
{
|
|
"epoch": 1.2156663275686674,
|
|
"grad_norm": 0.7715691924095154,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5812,
|
|
"mean_token_accuracy": 0.826104462146759,
|
|
"num_tokens": 380862532.0,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"epoch": 1.2166836215666328,
|
|
"grad_norm": 0.7756348252296448,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5795,
|
|
"mean_token_accuracy": 0.8258826732635498,
|
|
"num_tokens": 381181297.0,
|
|
"step": 1196
|
|
},
|
|
{
|
|
"epoch": 1.2177009155645981,
|
|
"grad_norm": 0.7937551140785217,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6209,
|
|
"mean_token_accuracy": 0.8146166205406189,
|
|
"num_tokens": 381492918.0,
|
|
"step": 1197
|
|
},
|
|
{
|
|
"epoch": 1.2187182095625635,
|
|
"grad_norm": 0.7768622040748596,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6017,
|
|
"mean_token_accuracy": 0.8206217288970947,
|
|
"num_tokens": 381810610.0,
|
|
"step": 1198
|
|
},
|
|
{
|
|
"epoch": 1.219735503560529,
|
|
"grad_norm": 0.7419608235359192,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5715,
|
|
"mean_token_accuracy": 0.8271506428718567,
|
|
"num_tokens": 382127627.0,
|
|
"step": 1199
|
|
},
|
|
{
|
|
"epoch": 1.2207527975584944,
|
|
"grad_norm": 0.7592210173606873,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5842,
|
|
"mean_token_accuracy": 0.824863076210022,
|
|
"num_tokens": 382453657.0,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 1.2217700915564598,
|
|
"grad_norm": 0.8112965226173401,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5736,
|
|
"mean_token_accuracy": 0.8280565738677979,
|
|
"num_tokens": 382781691.0,
|
|
"step": 1201
|
|
},
|
|
{
|
|
"epoch": 1.2227873855544251,
|
|
"grad_norm": 0.7668476700782776,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5805,
|
|
"mean_token_accuracy": 0.8258017897605896,
|
|
"num_tokens": 383107927.0,
|
|
"step": 1202
|
|
},
|
|
{
|
|
"epoch": 1.2238046795523907,
|
|
"grad_norm": 1.771498680114746,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5896,
|
|
"mean_token_accuracy": 0.8235318660736084,
|
|
"num_tokens": 383429967.0,
|
|
"step": 1203
|
|
},
|
|
{
|
|
"epoch": 1.224821973550356,
|
|
"grad_norm": 0.7665342688560486,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5858,
|
|
"mean_token_accuracy": 0.8229868412017822,
|
|
"num_tokens": 383744521.0,
|
|
"step": 1204
|
|
},
|
|
{
|
|
"epoch": 1.2258392675483214,
|
|
"grad_norm": 0.7280278205871582,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5783,
|
|
"mean_token_accuracy": 0.8258658647537231,
|
|
"num_tokens": 384070051.0,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"epoch": 1.226856561546287,
|
|
"grad_norm": 0.7722219824790955,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5996,
|
|
"mean_token_accuracy": 0.8209736347198486,
|
|
"num_tokens": 384381225.0,
|
|
"step": 1206
|
|
},
|
|
{
|
|
"epoch": 1.2278738555442523,
|
|
"grad_norm": 0.7422972917556763,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5985,
|
|
"mean_token_accuracy": 0.8206053376197815,
|
|
"num_tokens": 384710145.0,
|
|
"step": 1207
|
|
},
|
|
{
|
|
"epoch": 1.2288911495422177,
|
|
"grad_norm": 0.7677599191665649,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6128,
|
|
"mean_token_accuracy": 0.817360520362854,
|
|
"num_tokens": 385030375.0,
|
|
"step": 1208
|
|
},
|
|
{
|
|
"epoch": 1.229908443540183,
|
|
"grad_norm": 0.7416399717330933,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6077,
|
|
"mean_token_accuracy": 0.8186507225036621,
|
|
"num_tokens": 385361429.0,
|
|
"step": 1209
|
|
},
|
|
{
|
|
"epoch": 1.2309257375381486,
|
|
"grad_norm": 0.7370989918708801,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5935,
|
|
"mean_token_accuracy": 0.8210272789001465,
|
|
"num_tokens": 385690394.0,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 1.231943031536114,
|
|
"grad_norm": 0.7769769430160522,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5777,
|
|
"mean_token_accuracy": 0.8265715837478638,
|
|
"num_tokens": 386012537.0,
|
|
"step": 1211
|
|
},
|
|
{
|
|
"epoch": 1.2329603255340793,
|
|
"grad_norm": 0.7492892146110535,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5886,
|
|
"mean_token_accuracy": 0.8234988451004028,
|
|
"num_tokens": 386338087.0,
|
|
"step": 1212
|
|
},
|
|
{
|
|
"epoch": 1.2339776195320447,
|
|
"grad_norm": 0.7639126181602478,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5804,
|
|
"mean_token_accuracy": 0.8244913816452026,
|
|
"num_tokens": 386663700.0,
|
|
"step": 1213
|
|
},
|
|
{
|
|
"epoch": 1.2349949135300102,
|
|
"grad_norm": 0.743516743183136,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5934,
|
|
"mean_token_accuracy": 0.8219711184501648,
|
|
"num_tokens": 386979867.0,
|
|
"step": 1214
|
|
},
|
|
{
|
|
"epoch": 1.2360122075279756,
|
|
"grad_norm": 0.7583151459693909,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5757,
|
|
"mean_token_accuracy": 0.8271145820617676,
|
|
"num_tokens": 387308229.0,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"epoch": 1.237029501525941,
|
|
"grad_norm": 0.7758545279502869,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6084,
|
|
"mean_token_accuracy": 0.8180948495864868,
|
|
"num_tokens": 387633541.0,
|
|
"step": 1216
|
|
},
|
|
{
|
|
"epoch": 1.2380467955239065,
|
|
"grad_norm": 0.7462221384048462,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5699,
|
|
"mean_token_accuracy": 0.8287074565887451,
|
|
"num_tokens": 387945149.0,
|
|
"step": 1217
|
|
},
|
|
{
|
|
"epoch": 1.2390640895218719,
|
|
"grad_norm": 0.7475136518478394,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5678,
|
|
"mean_token_accuracy": 0.8280231952667236,
|
|
"num_tokens": 388262139.0,
|
|
"step": 1218
|
|
},
|
|
{
|
|
"epoch": 1.2400813835198372,
|
|
"grad_norm": 0.8114516139030457,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5855,
|
|
"mean_token_accuracy": 0.8244044780731201,
|
|
"num_tokens": 388567660.0,
|
|
"step": 1219
|
|
},
|
|
{
|
|
"epoch": 1.2410986775178026,
|
|
"grad_norm": 0.8436469435691833,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5751,
|
|
"mean_token_accuracy": 0.8259292840957642,
|
|
"num_tokens": 388862732.0,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 1.2421159715157681,
|
|
"grad_norm": 0.7752360105514526,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5798,
|
|
"mean_token_accuracy": 0.8258293867111206,
|
|
"num_tokens": 389171597.0,
|
|
"step": 1221
|
|
},
|
|
{
|
|
"epoch": 1.2431332655137335,
|
|
"grad_norm": 0.7781957983970642,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5993,
|
|
"mean_token_accuracy": 0.8197643756866455,
|
|
"num_tokens": 389471070.0,
|
|
"step": 1222
|
|
},
|
|
{
|
|
"epoch": 1.2441505595116988,
|
|
"grad_norm": 0.8264309763908386,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5934,
|
|
"mean_token_accuracy": 0.8217254877090454,
|
|
"num_tokens": 389778197.0,
|
|
"step": 1223
|
|
},
|
|
{
|
|
"epoch": 1.2451678535096642,
|
|
"grad_norm": 0.8131759762763977,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5843,
|
|
"mean_token_accuracy": 0.8253227472305298,
|
|
"num_tokens": 390102274.0,
|
|
"step": 1224
|
|
},
|
|
{
|
|
"epoch": 1.2461851475076298,
|
|
"grad_norm": 0.7956682443618774,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6047,
|
|
"mean_token_accuracy": 0.8182318806648254,
|
|
"num_tokens": 390422450.0,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"epoch": 1.2472024415055951,
|
|
"grad_norm": 0.737737238407135,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5726,
|
|
"mean_token_accuracy": 0.8269497752189636,
|
|
"num_tokens": 390752194.0,
|
|
"step": 1226
|
|
},
|
|
{
|
|
"epoch": 1.2482197355035605,
|
|
"grad_norm": 0.7784290909767151,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5681,
|
|
"mean_token_accuracy": 0.8291661739349365,
|
|
"num_tokens": 391076755.0,
|
|
"step": 1227
|
|
},
|
|
{
|
|
"epoch": 1.249237029501526,
|
|
"grad_norm": 0.8793082237243652,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5796,
|
|
"mean_token_accuracy": 0.8253451585769653,
|
|
"num_tokens": 391386429.0,
|
|
"step": 1228
|
|
},
|
|
{
|
|
"epoch": 1.2502543234994914,
|
|
"grad_norm": 0.7870639562606812,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5929,
|
|
"mean_token_accuracy": 0.8213008642196655,
|
|
"num_tokens": 391709893.0,
|
|
"step": 1229
|
|
},
|
|
{
|
|
"epoch": 1.2512716174974567,
|
|
"grad_norm": 0.7400197982788086,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5913,
|
|
"mean_token_accuracy": 0.8226692080497742,
|
|
"num_tokens": 392024517.0,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 1.252288911495422,
|
|
"grad_norm": 0.7613899111747742,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5752,
|
|
"mean_token_accuracy": 0.825670063495636,
|
|
"num_tokens": 392336854.0,
|
|
"step": 1231
|
|
},
|
|
{
|
|
"epoch": 1.2533062054933877,
|
|
"grad_norm": 0.8779903650283813,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5956,
|
|
"mean_token_accuracy": 0.8210177421569824,
|
|
"num_tokens": 392656687.0,
|
|
"step": 1232
|
|
},
|
|
{
|
|
"epoch": 1.254323499491353,
|
|
"grad_norm": 0.9043331742286682,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6362,
|
|
"mean_token_accuracy": 0.8101489543914795,
|
|
"num_tokens": 392972527.0,
|
|
"step": 1233
|
|
},
|
|
{
|
|
"epoch": 1.2553407934893184,
|
|
"grad_norm": 0.8443557024002075,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5845,
|
|
"mean_token_accuracy": 0.8232437372207642,
|
|
"num_tokens": 393296349.0,
|
|
"step": 1234
|
|
},
|
|
{
|
|
"epoch": 1.2563580874872837,
|
|
"grad_norm": 0.7689527869224548,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5515,
|
|
"mean_token_accuracy": 0.8333369493484497,
|
|
"num_tokens": 393622385.0,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"epoch": 1.2573753814852493,
|
|
"grad_norm": 0.7692641615867615,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.577,
|
|
"mean_token_accuracy": 0.8272202610969543,
|
|
"num_tokens": 393940347.0,
|
|
"step": 1236
|
|
},
|
|
{
|
|
"epoch": 1.2583926754832147,
|
|
"grad_norm": 0.8014664649963379,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.596,
|
|
"mean_token_accuracy": 0.8211422562599182,
|
|
"num_tokens": 394274909.0,
|
|
"step": 1237
|
|
},
|
|
{
|
|
"epoch": 1.25940996948118,
|
|
"grad_norm": 0.9445344805717468,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.566,
|
|
"mean_token_accuracy": 0.8287132382392883,
|
|
"num_tokens": 394572404.0,
|
|
"step": 1238
|
|
},
|
|
{
|
|
"epoch": 1.2604272634791456,
|
|
"grad_norm": 0.887416660785675,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6244,
|
|
"mean_token_accuracy": 0.8148610591888428,
|
|
"num_tokens": 394904157.0,
|
|
"step": 1239
|
|
},
|
|
{
|
|
"epoch": 1.261444557477111,
|
|
"grad_norm": 0.7784750461578369,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5839,
|
|
"mean_token_accuracy": 0.8244732022285461,
|
|
"num_tokens": 395205744.0,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 1.2624618514750763,
|
|
"grad_norm": 0.7089627385139465,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5735,
|
|
"mean_token_accuracy": 0.8267613649368286,
|
|
"num_tokens": 395535591.0,
|
|
"step": 1241
|
|
},
|
|
{
|
|
"epoch": 1.2634791454730416,
|
|
"grad_norm": 0.7316235303878784,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5897,
|
|
"mean_token_accuracy": 0.8229551911354065,
|
|
"num_tokens": 395887109.0,
|
|
"step": 1242
|
|
},
|
|
{
|
|
"epoch": 1.264496439471007,
|
|
"grad_norm": 0.7902207970619202,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6094,
|
|
"mean_token_accuracy": 0.8176922798156738,
|
|
"num_tokens": 396215248.0,
|
|
"step": 1243
|
|
},
|
|
{
|
|
"epoch": 1.2655137334689726,
|
|
"grad_norm": 0.8321451544761658,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6082,
|
|
"mean_token_accuracy": 0.8179523944854736,
|
|
"num_tokens": 396538633.0,
|
|
"step": 1244
|
|
},
|
|
{
|
|
"epoch": 1.266531027466938,
|
|
"grad_norm": 0.7029138207435608,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.575,
|
|
"mean_token_accuracy": 0.8264862298965454,
|
|
"num_tokens": 396867308.0,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"epoch": 1.2675483214649033,
|
|
"grad_norm": 0.7796322107315063,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5952,
|
|
"mean_token_accuracy": 0.8210832476615906,
|
|
"num_tokens": 397176821.0,
|
|
"step": 1246
|
|
},
|
|
{
|
|
"epoch": 1.2685656154628688,
|
|
"grad_norm": 0.7757667899131775,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5633,
|
|
"mean_token_accuracy": 0.8296244740486145,
|
|
"num_tokens": 397507718.0,
|
|
"step": 1247
|
|
},
|
|
{
|
|
"epoch": 1.2695829094608342,
|
|
"grad_norm": 0.7379530072212219,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5693,
|
|
"mean_token_accuracy": 0.8277691602706909,
|
|
"num_tokens": 397832455.0,
|
|
"step": 1248
|
|
},
|
|
{
|
|
"epoch": 1.2706002034587995,
|
|
"grad_norm": 0.7905023694038391,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5891,
|
|
"mean_token_accuracy": 0.8222687244415283,
|
|
"num_tokens": 398143376.0,
|
|
"step": 1249
|
|
},
|
|
{
|
|
"epoch": 1.2716174974567651,
|
|
"grad_norm": 0.7380933165550232,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5748,
|
|
"mean_token_accuracy": 0.8268264532089233,
|
|
"num_tokens": 398461934.0,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 1.2726347914547305,
|
|
"grad_norm": 0.7449954152107239,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5873,
|
|
"mean_token_accuracy": 0.8227959275245667,
|
|
"num_tokens": 398775458.0,
|
|
"step": 1251
|
|
},
|
|
{
|
|
"epoch": 1.2736520854526958,
|
|
"grad_norm": 0.7715222239494324,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6095,
|
|
"mean_token_accuracy": 0.8172944188117981,
|
|
"num_tokens": 399094210.0,
|
|
"step": 1252
|
|
},
|
|
{
|
|
"epoch": 1.2746693794506612,
|
|
"grad_norm": 0.7808869481086731,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5787,
|
|
"mean_token_accuracy": 0.8260170221328735,
|
|
"num_tokens": 399408637.0,
|
|
"step": 1253
|
|
},
|
|
{
|
|
"epoch": 1.2756866734486265,
|
|
"grad_norm": 0.7651352286338806,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5838,
|
|
"mean_token_accuracy": 0.824914813041687,
|
|
"num_tokens": 399726159.0,
|
|
"step": 1254
|
|
},
|
|
{
|
|
"epoch": 1.276703967446592,
|
|
"grad_norm": 0.7173874974250793,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5716,
|
|
"mean_token_accuracy": 0.8283831477165222,
|
|
"num_tokens": 400058221.0,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"epoch": 1.2777212614445574,
|
|
"grad_norm": 0.7614690065383911,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.581,
|
|
"mean_token_accuracy": 0.8249248266220093,
|
|
"num_tokens": 400381507.0,
|
|
"step": 1256
|
|
},
|
|
{
|
|
"epoch": 1.2787385554425228,
|
|
"grad_norm": 0.7964950203895569,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5803,
|
|
"mean_token_accuracy": 0.8261619806289673,
|
|
"num_tokens": 400704230.0,
|
|
"step": 1257
|
|
},
|
|
{
|
|
"epoch": 1.2797558494404884,
|
|
"grad_norm": 0.797745406627655,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5818,
|
|
"mean_token_accuracy": 0.8254767656326294,
|
|
"num_tokens": 401018958.0,
|
|
"step": 1258
|
|
},
|
|
{
|
|
"epoch": 1.2807731434384537,
|
|
"grad_norm": 0.7655918598175049,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5891,
|
|
"mean_token_accuracy": 0.8233509063720703,
|
|
"num_tokens": 401325913.0,
|
|
"step": 1259
|
|
},
|
|
{
|
|
"epoch": 1.281790437436419,
|
|
"grad_norm": 0.776522696018219,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6031,
|
|
"mean_token_accuracy": 0.8195526599884033,
|
|
"num_tokens": 401650213.0,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 1.2828077314343846,
|
|
"grad_norm": 0.7863470315933228,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.58,
|
|
"mean_token_accuracy": 0.8247014880180359,
|
|
"num_tokens": 401952668.0,
|
|
"step": 1261
|
|
},
|
|
{
|
|
"epoch": 1.28382502543235,
|
|
"grad_norm": 0.7836190462112427,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5688,
|
|
"mean_token_accuracy": 0.8274684548377991,
|
|
"num_tokens": 402266408.0,
|
|
"step": 1262
|
|
},
|
|
{
|
|
"epoch": 1.2848423194303153,
|
|
"grad_norm": 0.7265501618385315,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.612,
|
|
"mean_token_accuracy": 0.8172202110290527,
|
|
"num_tokens": 402595965.0,
|
|
"step": 1263
|
|
},
|
|
{
|
|
"epoch": 1.2858596134282807,
|
|
"grad_norm": 0.7719841003417969,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6108,
|
|
"mean_token_accuracy": 0.815332293510437,
|
|
"num_tokens": 402919556.0,
|
|
"step": 1264
|
|
},
|
|
{
|
|
"epoch": 1.286876907426246,
|
|
"grad_norm": 0.7617841362953186,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5865,
|
|
"mean_token_accuracy": 0.8226585388183594,
|
|
"num_tokens": 403234129.0,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"epoch": 1.2878942014242116,
|
|
"grad_norm": 3.889275312423706,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6004,
|
|
"mean_token_accuracy": 0.8203097581863403,
|
|
"num_tokens": 403545378.0,
|
|
"step": 1266
|
|
},
|
|
{
|
|
"epoch": 1.288911495422177,
|
|
"grad_norm": 0.8040711879730225,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6019,
|
|
"mean_token_accuracy": 0.8193329572677612,
|
|
"num_tokens": 403879895.0,
|
|
"step": 1267
|
|
},
|
|
{
|
|
"epoch": 1.2899287894201423,
|
|
"grad_norm": 0.8139612078666687,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5681,
|
|
"mean_token_accuracy": 0.8277958631515503,
|
|
"num_tokens": 404194940.0,
|
|
"step": 1268
|
|
},
|
|
{
|
|
"epoch": 1.290946083418108,
|
|
"grad_norm": 0.7611631155014038,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6054,
|
|
"mean_token_accuracy": 0.8179622888565063,
|
|
"num_tokens": 404516058.0,
|
|
"step": 1269
|
|
},
|
|
{
|
|
"epoch": 1.2919633774160733,
|
|
"grad_norm": 0.8590632677078247,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6001,
|
|
"mean_token_accuracy": 0.8203931450843811,
|
|
"num_tokens": 404823463.0,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 1.2929806714140386,
|
|
"grad_norm": 0.7872003316879272,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5952,
|
|
"mean_token_accuracy": 0.8210145831108093,
|
|
"num_tokens": 405133209.0,
|
|
"step": 1271
|
|
},
|
|
{
|
|
"epoch": 1.2939979654120042,
|
|
"grad_norm": 0.762592077255249,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5907,
|
|
"mean_token_accuracy": 0.8206639885902405,
|
|
"num_tokens": 405447583.0,
|
|
"step": 1272
|
|
},
|
|
{
|
|
"epoch": 1.2950152594099695,
|
|
"grad_norm": 0.7649151682853699,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5651,
|
|
"mean_token_accuracy": 0.8295191526412964,
|
|
"num_tokens": 405772749.0,
|
|
"step": 1273
|
|
},
|
|
{
|
|
"epoch": 1.2960325534079349,
|
|
"grad_norm": 0.7442457675933838,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5732,
|
|
"mean_token_accuracy": 0.8269622325897217,
|
|
"num_tokens": 406102126.0,
|
|
"step": 1274
|
|
},
|
|
{
|
|
"epoch": 1.2970498474059002,
|
|
"grad_norm": 0.8645238280296326,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5825,
|
|
"mean_token_accuracy": 0.8254821300506592,
|
|
"num_tokens": 406423086.0,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"epoch": 1.2980671414038656,
|
|
"grad_norm": 0.7339827418327332,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5608,
|
|
"mean_token_accuracy": 0.8312309384346008,
|
|
"num_tokens": 406744290.0,
|
|
"step": 1276
|
|
},
|
|
{
|
|
"epoch": 1.2990844354018312,
|
|
"grad_norm": 0.8143907189369202,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5899,
|
|
"mean_token_accuracy": 0.8225513696670532,
|
|
"num_tokens": 407056075.0,
|
|
"step": 1277
|
|
},
|
|
{
|
|
"epoch": 1.3001017293997965,
|
|
"grad_norm": 0.7675595879554749,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5971,
|
|
"mean_token_accuracy": 0.8220341801643372,
|
|
"num_tokens": 407364405.0,
|
|
"step": 1278
|
|
},
|
|
{
|
|
"epoch": 1.3011190233977619,
|
|
"grad_norm": 0.7461718916893005,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5769,
|
|
"mean_token_accuracy": 0.8260219097137451,
|
|
"num_tokens": 407689878.0,
|
|
"step": 1279
|
|
},
|
|
{
|
|
"epoch": 1.3021363173957274,
|
|
"grad_norm": 0.7368419170379639,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5809,
|
|
"mean_token_accuracy": 0.8259373307228088,
|
|
"num_tokens": 408019653.0,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 1.3031536113936928,
|
|
"grad_norm": 0.7481878399848938,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5784,
|
|
"mean_token_accuracy": 0.8262139558792114,
|
|
"num_tokens": 408343205.0,
|
|
"step": 1281
|
|
},
|
|
{
|
|
"epoch": 1.3041709053916581,
|
|
"grad_norm": 0.7598819136619568,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5879,
|
|
"mean_token_accuracy": 0.8240863084793091,
|
|
"num_tokens": 408666686.0,
|
|
"step": 1282
|
|
},
|
|
{
|
|
"epoch": 1.3051881993896237,
|
|
"grad_norm": 0.753930389881134,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5968,
|
|
"mean_token_accuracy": 0.8200889825820923,
|
|
"num_tokens": 408989130.0,
|
|
"step": 1283
|
|
},
|
|
{
|
|
"epoch": 1.306205493387589,
|
|
"grad_norm": 0.7269991040229797,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5751,
|
|
"mean_token_accuracy": 0.8264918923377991,
|
|
"num_tokens": 409319697.0,
|
|
"step": 1284
|
|
},
|
|
{
|
|
"epoch": 1.3072227873855544,
|
|
"grad_norm": 0.7664880156517029,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5839,
|
|
"mean_token_accuracy": 0.8235412836074829,
|
|
"num_tokens": 409638569.0,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"epoch": 1.3082400813835198,
|
|
"grad_norm": 0.7126306295394897,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5911,
|
|
"mean_token_accuracy": 0.821363091468811,
|
|
"num_tokens": 409963322.0,
|
|
"step": 1286
|
|
},
|
|
{
|
|
"epoch": 1.3092573753814851,
|
|
"grad_norm": 0.7578443288803101,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.551,
|
|
"mean_token_accuracy": 0.8326585292816162,
|
|
"num_tokens": 410289774.0,
|
|
"step": 1287
|
|
},
|
|
{
|
|
"epoch": 1.3102746693794507,
|
|
"grad_norm": 0.7516607046127319,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5506,
|
|
"mean_token_accuracy": 0.8333381414413452,
|
|
"num_tokens": 410603142.0,
|
|
"step": 1288
|
|
},
|
|
{
|
|
"epoch": 1.311291963377416,
|
|
"grad_norm": 0.7146863341331482,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.596,
|
|
"mean_token_accuracy": 0.8209625482559204,
|
|
"num_tokens": 410934281.0,
|
|
"step": 1289
|
|
},
|
|
{
|
|
"epoch": 1.3123092573753814,
|
|
"grad_norm": 0.7467058300971985,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5953,
|
|
"mean_token_accuracy": 0.8207160234451294,
|
|
"num_tokens": 411262888.0,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 1.313326551373347,
|
|
"grad_norm": 0.8217064142227173,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5749,
|
|
"mean_token_accuracy": 0.8268510103225708,
|
|
"num_tokens": 411567594.0,
|
|
"step": 1291
|
|
},
|
|
{
|
|
"epoch": 1.3143438453713123,
|
|
"grad_norm": 0.7733748555183411,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6042,
|
|
"mean_token_accuracy": 0.8187816143035889,
|
|
"num_tokens": 411893042.0,
|
|
"step": 1292
|
|
},
|
|
{
|
|
"epoch": 1.3153611393692777,
|
|
"grad_norm": 0.7679750919342041,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5756,
|
|
"mean_token_accuracy": 0.8263243436813354,
|
|
"num_tokens": 412215931.0,
|
|
"step": 1293
|
|
},
|
|
{
|
|
"epoch": 1.3163784333672433,
|
|
"grad_norm": 0.7660022377967834,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5826,
|
|
"mean_token_accuracy": 0.8236187100410461,
|
|
"num_tokens": 412552715.0,
|
|
"step": 1294
|
|
},
|
|
{
|
|
"epoch": 1.3173957273652086,
|
|
"grad_norm": 0.7751847505569458,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5957,
|
|
"mean_token_accuracy": 0.8215477466583252,
|
|
"num_tokens": 412869340.0,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"epoch": 1.318413021363174,
|
|
"grad_norm": 0.7322462797164917,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5679,
|
|
"mean_token_accuracy": 0.8294398784637451,
|
|
"num_tokens": 413187434.0,
|
|
"step": 1296
|
|
},
|
|
{
|
|
"epoch": 1.3194303153611393,
|
|
"grad_norm": 0.7586652040481567,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.605,
|
|
"mean_token_accuracy": 0.8181220293045044,
|
|
"num_tokens": 413523255.0,
|
|
"step": 1297
|
|
},
|
|
{
|
|
"epoch": 1.3204476093591047,
|
|
"grad_norm": 0.7863152027130127,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5837,
|
|
"mean_token_accuracy": 0.824659526348114,
|
|
"num_tokens": 413836107.0,
|
|
"step": 1298
|
|
},
|
|
{
|
|
"epoch": 1.3214649033570702,
|
|
"grad_norm": 0.8461436629295349,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5571,
|
|
"mean_token_accuracy": 0.8312619924545288,
|
|
"num_tokens": 414156938.0,
|
|
"step": 1299
|
|
},
|
|
{
|
|
"epoch": 1.3224821973550356,
|
|
"grad_norm": 0.7358280420303345,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6032,
|
|
"mean_token_accuracy": 0.8195974826812744,
|
|
"num_tokens": 414482061.0,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 1.323499491353001,
|
|
"grad_norm": 0.742594838142395,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5687,
|
|
"mean_token_accuracy": 0.8274781703948975,
|
|
"num_tokens": 414806350.0,
|
|
"step": 1301
|
|
},
|
|
{
|
|
"epoch": 1.3245167853509665,
|
|
"grad_norm": 0.7661194801330566,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6115,
|
|
"mean_token_accuracy": 0.8172125816345215,
|
|
"num_tokens": 415151776.0,
|
|
"step": 1302
|
|
},
|
|
{
|
|
"epoch": 1.3255340793489319,
|
|
"grad_norm": 0.7671038508415222,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5909,
|
|
"mean_token_accuracy": 0.8222029805183411,
|
|
"num_tokens": 415475717.0,
|
|
"step": 1303
|
|
},
|
|
{
|
|
"epoch": 1.3265513733468972,
|
|
"grad_norm": 0.7653346657752991,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5719,
|
|
"mean_token_accuracy": 0.8281009197235107,
|
|
"num_tokens": 415782339.0,
|
|
"step": 1304
|
|
},
|
|
{
|
|
"epoch": 1.3275686673448628,
|
|
"grad_norm": 0.7612704634666443,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5821,
|
|
"mean_token_accuracy": 0.8244327306747437,
|
|
"num_tokens": 416091384.0,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"epoch": 1.3285859613428281,
|
|
"grad_norm": 0.7379361391067505,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6002,
|
|
"mean_token_accuracy": 0.8189595937728882,
|
|
"num_tokens": 416408034.0,
|
|
"step": 1306
|
|
},
|
|
{
|
|
"epoch": 1.3296032553407935,
|
|
"grad_norm": 0.7884662747383118,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.592,
|
|
"mean_token_accuracy": 0.8236218690872192,
|
|
"num_tokens": 416715317.0,
|
|
"step": 1307
|
|
},
|
|
{
|
|
"epoch": 1.3306205493387588,
|
|
"grad_norm": 0.8206257820129395,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5838,
|
|
"mean_token_accuracy": 0.8237992525100708,
|
|
"num_tokens": 417017357.0,
|
|
"step": 1308
|
|
},
|
|
{
|
|
"epoch": 1.3316378433367242,
|
|
"grad_norm": 0.7628151178359985,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5625,
|
|
"mean_token_accuracy": 0.8287310600280762,
|
|
"num_tokens": 417324031.0,
|
|
"step": 1309
|
|
},
|
|
{
|
|
"epoch": 1.3326551373346898,
|
|
"grad_norm": 0.7664852142333984,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5819,
|
|
"mean_token_accuracy": 0.8257479667663574,
|
|
"num_tokens": 417635848.0,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 1.3336724313326551,
|
|
"grad_norm": 0.717381477355957,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5486,
|
|
"mean_token_accuracy": 0.8340250253677368,
|
|
"num_tokens": 417948063.0,
|
|
"step": 1311
|
|
},
|
|
{
|
|
"epoch": 1.3346897253306205,
|
|
"grad_norm": 0.7889302372932434,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.614,
|
|
"mean_token_accuracy": 0.8164811134338379,
|
|
"num_tokens": 418263839.0,
|
|
"step": 1312
|
|
},
|
|
{
|
|
"epoch": 1.335707019328586,
|
|
"grad_norm": 0.7360092997550964,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5587,
|
|
"mean_token_accuracy": 0.831147313117981,
|
|
"num_tokens": 418582350.0,
|
|
"step": 1313
|
|
},
|
|
{
|
|
"epoch": 1.3367243133265514,
|
|
"grad_norm": 0.8149121999740601,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5672,
|
|
"mean_token_accuracy": 0.8293501138687134,
|
|
"num_tokens": 418893429.0,
|
|
"step": 1314
|
|
},
|
|
{
|
|
"epoch": 1.3377416073245167,
|
|
"grad_norm": 0.7356194257736206,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5821,
|
|
"mean_token_accuracy": 0.8250120878219604,
|
|
"num_tokens": 419228134.0,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"epoch": 1.3387589013224823,
|
|
"grad_norm": 0.7707953453063965,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6203,
|
|
"mean_token_accuracy": 0.8141721487045288,
|
|
"num_tokens": 419546274.0,
|
|
"step": 1316
|
|
},
|
|
{
|
|
"epoch": 1.3397761953204477,
|
|
"grad_norm": 0.8126129508018494,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5977,
|
|
"mean_token_accuracy": 0.8189851641654968,
|
|
"num_tokens": 419861881.0,
|
|
"step": 1317
|
|
},
|
|
{
|
|
"epoch": 1.340793489318413,
|
|
"grad_norm": 0.7575045228004456,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5706,
|
|
"mean_token_accuracy": 0.8282990455627441,
|
|
"num_tokens": 420174883.0,
|
|
"step": 1318
|
|
},
|
|
{
|
|
"epoch": 1.3418107833163784,
|
|
"grad_norm": 0.768957257270813,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5964,
|
|
"mean_token_accuracy": 0.8211398124694824,
|
|
"num_tokens": 420490175.0,
|
|
"step": 1319
|
|
},
|
|
{
|
|
"epoch": 1.3428280773143437,
|
|
"grad_norm": 0.7741740942001343,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5533,
|
|
"mean_token_accuracy": 0.8324936032295227,
|
|
"num_tokens": 420818659.0,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 1.3438453713123093,
|
|
"grad_norm": 0.8340362310409546,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5953,
|
|
"mean_token_accuracy": 0.8220871686935425,
|
|
"num_tokens": 421147515.0,
|
|
"step": 1321
|
|
},
|
|
{
|
|
"epoch": 1.3448626653102747,
|
|
"grad_norm": 0.7541214823722839,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5842,
|
|
"mean_token_accuracy": 0.8246658444404602,
|
|
"num_tokens": 421474359.0,
|
|
"step": 1322
|
|
},
|
|
{
|
|
"epoch": 1.34587995930824,
|
|
"grad_norm": 0.7496918439865112,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.582,
|
|
"mean_token_accuracy": 0.8251877427101135,
|
|
"num_tokens": 421795778.0,
|
|
"step": 1323
|
|
},
|
|
{
|
|
"epoch": 1.3468972533062056,
|
|
"grad_norm": 0.7333158254623413,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5899,
|
|
"mean_token_accuracy": 0.8225975036621094,
|
|
"num_tokens": 422127600.0,
|
|
"step": 1324
|
|
},
|
|
{
|
|
"epoch": 1.347914547304171,
|
|
"grad_norm": 0.7470682263374329,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5993,
|
|
"mean_token_accuracy": 0.8197658658027649,
|
|
"num_tokens": 422441520.0,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"epoch": 1.3489318413021363,
|
|
"grad_norm": 0.8310449123382568,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6106,
|
|
"mean_token_accuracy": 0.8172779083251953,
|
|
"num_tokens": 422758073.0,
|
|
"step": 1326
|
|
},
|
|
{
|
|
"epoch": 1.3499491353001019,
|
|
"grad_norm": 0.8071318864822388,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5813,
|
|
"mean_token_accuracy": 0.8248072862625122,
|
|
"num_tokens": 423072900.0,
|
|
"step": 1327
|
|
},
|
|
{
|
|
"epoch": 1.3509664292980672,
|
|
"grad_norm": 0.7378259301185608,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5628,
|
|
"mean_token_accuracy": 0.8296499252319336,
|
|
"num_tokens": 423399159.0,
|
|
"step": 1328
|
|
},
|
|
{
|
|
"epoch": 1.3519837232960326,
|
|
"grad_norm": 0.7724500298500061,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5937,
|
|
"mean_token_accuracy": 0.8213194608688354,
|
|
"num_tokens": 423723160.0,
|
|
"step": 1329
|
|
},
|
|
{
|
|
"epoch": 1.353001017293998,
|
|
"grad_norm": 0.7197055816650391,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.534,
|
|
"mean_token_accuracy": 0.837218165397644,
|
|
"num_tokens": 424042895.0,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 1.3540183112919633,
|
|
"grad_norm": 0.7320886850357056,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5708,
|
|
"mean_token_accuracy": 0.8275708556175232,
|
|
"num_tokens": 424359175.0,
|
|
"step": 1331
|
|
},
|
|
{
|
|
"epoch": 1.3550356052899288,
|
|
"grad_norm": 0.7555741667747498,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5847,
|
|
"mean_token_accuracy": 0.8243857026100159,
|
|
"num_tokens": 424688740.0,
|
|
"step": 1332
|
|
},
|
|
{
|
|
"epoch": 1.3560528992878942,
|
|
"grad_norm": 0.7789037823677063,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5808,
|
|
"mean_token_accuracy": 0.8249820470809937,
|
|
"num_tokens": 425017780.0,
|
|
"step": 1333
|
|
},
|
|
{
|
|
"epoch": 1.3570701932858595,
|
|
"grad_norm": 0.7404174208641052,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5878,
|
|
"mean_token_accuracy": 0.8227372169494629,
|
|
"num_tokens": 425345218.0,
|
|
"step": 1334
|
|
},
|
|
{
|
|
"epoch": 1.3580874872838251,
|
|
"grad_norm": 0.7573344707489014,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5895,
|
|
"mean_token_accuracy": 0.8235394358634949,
|
|
"num_tokens": 425660450.0,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"epoch": 1.3591047812817905,
|
|
"grad_norm": 0.8044156432151794,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.587,
|
|
"mean_token_accuracy": 0.8230143785476685,
|
|
"num_tokens": 425961495.0,
|
|
"step": 1336
|
|
},
|
|
{
|
|
"epoch": 1.3601220752797558,
|
|
"grad_norm": 0.7424970865249634,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5713,
|
|
"mean_token_accuracy": 0.826804518699646,
|
|
"num_tokens": 426290019.0,
|
|
"step": 1337
|
|
},
|
|
{
|
|
"epoch": 1.3611393692777214,
|
|
"grad_norm": 0.7788193225860596,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5573,
|
|
"mean_token_accuracy": 0.8316465020179749,
|
|
"num_tokens": 426600712.0,
|
|
"step": 1338
|
|
},
|
|
{
|
|
"epoch": 1.3621566632756867,
|
|
"grad_norm": 0.7808539867401123,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5879,
|
|
"mean_token_accuracy": 0.8237320780754089,
|
|
"num_tokens": 426930227.0,
|
|
"step": 1339
|
|
},
|
|
{
|
|
"epoch": 1.363173957273652,
|
|
"grad_norm": 0.7088400721549988,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5641,
|
|
"mean_token_accuracy": 0.8300147652626038,
|
|
"num_tokens": 427263929.0,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 1.3641912512716174,
|
|
"grad_norm": 0.8230143189430237,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5907,
|
|
"mean_token_accuracy": 0.8224456310272217,
|
|
"num_tokens": 427581870.0,
|
|
"step": 1341
|
|
},
|
|
{
|
|
"epoch": 1.3652085452695828,
|
|
"grad_norm": 0.7356793880462646,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5686,
|
|
"mean_token_accuracy": 0.828331470489502,
|
|
"num_tokens": 427890146.0,
|
|
"step": 1342
|
|
},
|
|
{
|
|
"epoch": 1.3662258392675484,
|
|
"grad_norm": 0.7339310050010681,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5848,
|
|
"mean_token_accuracy": 0.8255443572998047,
|
|
"num_tokens": 428210521.0,
|
|
"step": 1343
|
|
},
|
|
{
|
|
"epoch": 1.3672431332655137,
|
|
"grad_norm": 0.7392643094062805,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5806,
|
|
"mean_token_accuracy": 0.8248897790908813,
|
|
"num_tokens": 428539565.0,
|
|
"step": 1344
|
|
},
|
|
{
|
|
"epoch": 1.368260427263479,
|
|
"grad_norm": 0.7387965321540833,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5974,
|
|
"mean_token_accuracy": 0.8200353980064392,
|
|
"num_tokens": 428868507.0,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"epoch": 1.3692777212614446,
|
|
"grad_norm": 0.7944315075874329,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.588,
|
|
"mean_token_accuracy": 0.8235206007957458,
|
|
"num_tokens": 429192002.0,
|
|
"step": 1346
|
|
},
|
|
{
|
|
"epoch": 1.37029501525941,
|
|
"grad_norm": 0.7915907502174377,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5819,
|
|
"mean_token_accuracy": 0.82527095079422,
|
|
"num_tokens": 429499970.0,
|
|
"step": 1347
|
|
},
|
|
{
|
|
"epoch": 1.3713123092573754,
|
|
"grad_norm": 0.7329350709915161,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5781,
|
|
"mean_token_accuracy": 0.8260283470153809,
|
|
"num_tokens": 429822566.0,
|
|
"step": 1348
|
|
},
|
|
{
|
|
"epoch": 1.372329603255341,
|
|
"grad_norm": 0.7127922773361206,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5584,
|
|
"mean_token_accuracy": 0.8313673734664917,
|
|
"num_tokens": 430138798.0,
|
|
"step": 1349
|
|
},
|
|
{
|
|
"epoch": 1.3733468972533063,
|
|
"grad_norm": 0.7566921710968018,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5522,
|
|
"mean_token_accuracy": 0.8332539796829224,
|
|
"num_tokens": 430457184.0,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 1.3743641912512716,
|
|
"grad_norm": 0.7381209135055542,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5956,
|
|
"mean_token_accuracy": 0.821380078792572,
|
|
"num_tokens": 430782101.0,
|
|
"step": 1351
|
|
},
|
|
{
|
|
"epoch": 1.375381485249237,
|
|
"grad_norm": 0.7444591522216797,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5606,
|
|
"mean_token_accuracy": 0.8303647637367249,
|
|
"num_tokens": 431093719.0,
|
|
"step": 1352
|
|
},
|
|
{
|
|
"epoch": 1.3763987792472023,
|
|
"grad_norm": 0.7629130482673645,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5879,
|
|
"mean_token_accuracy": 0.8233219981193542,
|
|
"num_tokens": 431420888.0,
|
|
"step": 1353
|
|
},
|
|
{
|
|
"epoch": 1.377416073245168,
|
|
"grad_norm": 0.7519823312759399,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5875,
|
|
"mean_token_accuracy": 0.8230313062667847,
|
|
"num_tokens": 431747572.0,
|
|
"step": 1354
|
|
},
|
|
{
|
|
"epoch": 1.3784333672431333,
|
|
"grad_norm": 0.7806484699249268,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5992,
|
|
"mean_token_accuracy": 0.8208594918251038,
|
|
"num_tokens": 432053072.0,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"epoch": 1.3794506612410986,
|
|
"grad_norm": 0.806029200553894,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5516,
|
|
"mean_token_accuracy": 0.8318723440170288,
|
|
"num_tokens": 432356553.0,
|
|
"step": 1356
|
|
},
|
|
{
|
|
"epoch": 1.3804679552390642,
|
|
"grad_norm": 0.7654630541801453,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5951,
|
|
"mean_token_accuracy": 0.8207912445068359,
|
|
"num_tokens": 432670085.0,
|
|
"step": 1357
|
|
},
|
|
{
|
|
"epoch": 1.3814852492370295,
|
|
"grad_norm": 0.7566870450973511,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6115,
|
|
"mean_token_accuracy": 0.8170797824859619,
|
|
"num_tokens": 433004996.0,
|
|
"step": 1358
|
|
},
|
|
{
|
|
"epoch": 1.3825025432349949,
|
|
"grad_norm": 0.7837706208229065,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5617,
|
|
"mean_token_accuracy": 0.830281674861908,
|
|
"num_tokens": 433309340.0,
|
|
"step": 1359
|
|
},
|
|
{
|
|
"epoch": 1.3835198372329605,
|
|
"grad_norm": 0.7306072115898132,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5549,
|
|
"mean_token_accuracy": 0.8319963216781616,
|
|
"num_tokens": 433655158.0,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 1.3845371312309258,
|
|
"grad_norm": 0.7636180520057678,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5538,
|
|
"mean_token_accuracy": 0.8329065442085266,
|
|
"num_tokens": 433968624.0,
|
|
"step": 1361
|
|
},
|
|
{
|
|
"epoch": 1.3855544252288912,
|
|
"grad_norm": 0.7410780787467957,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.578,
|
|
"mean_token_accuracy": 0.8241591453552246,
|
|
"num_tokens": 434295363.0,
|
|
"step": 1362
|
|
},
|
|
{
|
|
"epoch": 1.3865717192268565,
|
|
"grad_norm": 0.7491535544395447,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5877,
|
|
"mean_token_accuracy": 0.8248358964920044,
|
|
"num_tokens": 434613231.0,
|
|
"step": 1363
|
|
},
|
|
{
|
|
"epoch": 1.3875890132248219,
|
|
"grad_norm": 0.7772935628890991,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5973,
|
|
"mean_token_accuracy": 0.8203805685043335,
|
|
"num_tokens": 434917053.0,
|
|
"step": 1364
|
|
},
|
|
{
|
|
"epoch": 1.3886063072227874,
|
|
"grad_norm": 0.7721388936042786,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6069,
|
|
"mean_token_accuracy": 0.818004310131073,
|
|
"num_tokens": 435226320.0,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"epoch": 1.3896236012207528,
|
|
"grad_norm": 0.9722421169281006,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5575,
|
|
"mean_token_accuracy": 0.8311303853988647,
|
|
"num_tokens": 435536990.0,
|
|
"step": 1366
|
|
},
|
|
{
|
|
"epoch": 1.3906408952187181,
|
|
"grad_norm": 0.7618489265441895,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5866,
|
|
"mean_token_accuracy": 0.8225003480911255,
|
|
"num_tokens": 435847048.0,
|
|
"step": 1367
|
|
},
|
|
{
|
|
"epoch": 1.3916581892166837,
|
|
"grad_norm": 0.7776749134063721,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6094,
|
|
"mean_token_accuracy": 0.8165792226791382,
|
|
"num_tokens": 436161792.0,
|
|
"step": 1368
|
|
},
|
|
{
|
|
"epoch": 1.392675483214649,
|
|
"grad_norm": 0.7927550673484802,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6115,
|
|
"mean_token_accuracy": 0.8167214393615723,
|
|
"num_tokens": 436497752.0,
|
|
"step": 1369
|
|
},
|
|
{
|
|
"epoch": 1.3936927772126144,
|
|
"grad_norm": 0.7556279897689819,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5662,
|
|
"mean_token_accuracy": 0.8288087844848633,
|
|
"num_tokens": 436827912.0,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 1.39471007121058,
|
|
"grad_norm": 0.7612072229385376,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5793,
|
|
"mean_token_accuracy": 0.8257796168327332,
|
|
"num_tokens": 437161113.0,
|
|
"step": 1371
|
|
},
|
|
{
|
|
"epoch": 1.3957273652085453,
|
|
"grad_norm": 0.8022080063819885,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6024,
|
|
"mean_token_accuracy": 0.8193477392196655,
|
|
"num_tokens": 437453477.0,
|
|
"step": 1372
|
|
},
|
|
{
|
|
"epoch": 1.3967446592065107,
|
|
"grad_norm": 0.8560948967933655,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5981,
|
|
"mean_token_accuracy": 0.8201726675033569,
|
|
"num_tokens": 437749330.0,
|
|
"step": 1373
|
|
},
|
|
{
|
|
"epoch": 1.397761953204476,
|
|
"grad_norm": 0.7637492418289185,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5565,
|
|
"mean_token_accuracy": 0.8309266567230225,
|
|
"num_tokens": 438064905.0,
|
|
"step": 1374
|
|
},
|
|
{
|
|
"epoch": 1.3987792472024414,
|
|
"grad_norm": 0.76103675365448,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5685,
|
|
"mean_token_accuracy": 0.8285423517227173,
|
|
"num_tokens": 438374191.0,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"epoch": 1.399796541200407,
|
|
"grad_norm": 0.7823554873466492,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5636,
|
|
"mean_token_accuracy": 0.8292416930198669,
|
|
"num_tokens": 438672284.0,
|
|
"step": 1376
|
|
},
|
|
{
|
|
"epoch": 1.4008138351983723,
|
|
"grad_norm": 0.7935237884521484,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5882,
|
|
"mean_token_accuracy": 0.8235949277877808,
|
|
"num_tokens": 438980241.0,
|
|
"step": 1377
|
|
},
|
|
{
|
|
"epoch": 1.4018311291963377,
|
|
"grad_norm": 0.7849200367927551,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5714,
|
|
"mean_token_accuracy": 0.8282375931739807,
|
|
"num_tokens": 439300154.0,
|
|
"step": 1378
|
|
},
|
|
{
|
|
"epoch": 1.4028484231943033,
|
|
"grad_norm": 0.7585959434509277,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5462,
|
|
"mean_token_accuracy": 0.8329802751541138,
|
|
"num_tokens": 439612805.0,
|
|
"step": 1379
|
|
},
|
|
{
|
|
"epoch": 1.4038657171922686,
|
|
"grad_norm": 0.7586443424224854,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5755,
|
|
"mean_token_accuracy": 0.8274024724960327,
|
|
"num_tokens": 439927068.0,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 1.404883011190234,
|
|
"grad_norm": 0.7680992484092712,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5891,
|
|
"mean_token_accuracy": 0.8220770359039307,
|
|
"num_tokens": 440231502.0,
|
|
"step": 1381
|
|
},
|
|
{
|
|
"epoch": 1.4059003051881995,
|
|
"grad_norm": 0.7801609635353088,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5859,
|
|
"mean_token_accuracy": 0.8232927322387695,
|
|
"num_tokens": 440550613.0,
|
|
"step": 1382
|
|
},
|
|
{
|
|
"epoch": 1.4069175991861649,
|
|
"grad_norm": 0.7973816990852356,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5755,
|
|
"mean_token_accuracy": 0.8271034955978394,
|
|
"num_tokens": 440878494.0,
|
|
"step": 1383
|
|
},
|
|
{
|
|
"epoch": 1.4079348931841302,
|
|
"grad_norm": 0.808786153793335,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5749,
|
|
"mean_token_accuracy": 0.8254208564758301,
|
|
"num_tokens": 441175256.0,
|
|
"step": 1384
|
|
},
|
|
{
|
|
"epoch": 1.4089521871820956,
|
|
"grad_norm": 0.7621607184410095,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5789,
|
|
"mean_token_accuracy": 0.8252738118171692,
|
|
"num_tokens": 441490539.0,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"epoch": 1.409969481180061,
|
|
"grad_norm": 0.7777722477912903,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.576,
|
|
"mean_token_accuracy": 0.8258455991744995,
|
|
"num_tokens": 441811314.0,
|
|
"step": 1386
|
|
},
|
|
{
|
|
"epoch": 1.4109867751780265,
|
|
"grad_norm": 0.7797698974609375,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5694,
|
|
"mean_token_accuracy": 0.8287750482559204,
|
|
"num_tokens": 442127917.0,
|
|
"step": 1387
|
|
},
|
|
{
|
|
"epoch": 1.4120040691759919,
|
|
"grad_norm": 0.777352511882782,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5979,
|
|
"mean_token_accuracy": 0.8221017122268677,
|
|
"num_tokens": 442435182.0,
|
|
"step": 1388
|
|
},
|
|
{
|
|
"epoch": 1.4130213631739572,
|
|
"grad_norm": 0.74081951379776,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5596,
|
|
"mean_token_accuracy": 0.830318033695221,
|
|
"num_tokens": 442769476.0,
|
|
"step": 1389
|
|
},
|
|
{
|
|
"epoch": 1.4140386571719228,
|
|
"grad_norm": 0.7134003639221191,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5812,
|
|
"mean_token_accuracy": 0.8246288299560547,
|
|
"num_tokens": 443108364.0,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 1.4150559511698881,
|
|
"grad_norm": 0.7414463758468628,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5665,
|
|
"mean_token_accuracy": 0.8285084962844849,
|
|
"num_tokens": 443434228.0,
|
|
"step": 1391
|
|
},
|
|
{
|
|
"epoch": 1.4160732451678535,
|
|
"grad_norm": 0.7754774689674377,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5916,
|
|
"mean_token_accuracy": 0.8222627639770508,
|
|
"num_tokens": 443753663.0,
|
|
"step": 1392
|
|
},
|
|
{
|
|
"epoch": 1.4170905391658188,
|
|
"grad_norm": 0.7414329051971436,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5522,
|
|
"mean_token_accuracy": 0.831973135471344,
|
|
"num_tokens": 444064404.0,
|
|
"step": 1393
|
|
},
|
|
{
|
|
"epoch": 1.4181078331637844,
|
|
"grad_norm": 0.7357895970344543,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5848,
|
|
"mean_token_accuracy": 0.8242871761322021,
|
|
"num_tokens": 444385872.0,
|
|
"step": 1394
|
|
},
|
|
{
|
|
"epoch": 1.4191251271617498,
|
|
"grad_norm": 0.7833715081214905,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5823,
|
|
"mean_token_accuracy": 0.8255316019058228,
|
|
"num_tokens": 444686912.0,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"epoch": 1.4201424211597151,
|
|
"grad_norm": 0.7233708500862122,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5659,
|
|
"mean_token_accuracy": 0.8281997442245483,
|
|
"num_tokens": 445012426.0,
|
|
"step": 1396
|
|
},
|
|
{
|
|
"epoch": 1.4211597151576805,
|
|
"grad_norm": 0.6995869874954224,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5877,
|
|
"mean_token_accuracy": 0.8228816986083984,
|
|
"num_tokens": 445343179.0,
|
|
"step": 1397
|
|
},
|
|
{
|
|
"epoch": 1.422177009155646,
|
|
"grad_norm": 0.7607414722442627,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5555,
|
|
"mean_token_accuracy": 0.8319629430770874,
|
|
"num_tokens": 445669852.0,
|
|
"step": 1398
|
|
},
|
|
{
|
|
"epoch": 1.4231943031536114,
|
|
"grad_norm": 0.7545032501220703,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5778,
|
|
"mean_token_accuracy": 0.8259891271591187,
|
|
"num_tokens": 445986396.0,
|
|
"step": 1399
|
|
},
|
|
{
|
|
"epoch": 1.4242115971515767,
|
|
"grad_norm": 0.7849803566932678,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5805,
|
|
"mean_token_accuracy": 0.8252159357070923,
|
|
"num_tokens": 446284635.0,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 1.4252288911495423,
|
|
"grad_norm": 0.7250545620918274,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5797,
|
|
"mean_token_accuracy": 0.8250802755355835,
|
|
"num_tokens": 446599329.0,
|
|
"step": 1401
|
|
},
|
|
{
|
|
"epoch": 1.4262461851475077,
|
|
"grad_norm": 0.7383725047111511,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5784,
|
|
"mean_token_accuracy": 0.8266249895095825,
|
|
"num_tokens": 446921393.0,
|
|
"step": 1402
|
|
},
|
|
{
|
|
"epoch": 1.427263479145473,
|
|
"grad_norm": 0.7174665927886963,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.571,
|
|
"mean_token_accuracy": 0.8278155326843262,
|
|
"num_tokens": 447257986.0,
|
|
"step": 1403
|
|
},
|
|
{
|
|
"epoch": 1.4282807731434384,
|
|
"grad_norm": 0.7408458590507507,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5496,
|
|
"mean_token_accuracy": 0.8329218626022339,
|
|
"num_tokens": 447568033.0,
|
|
"step": 1404
|
|
},
|
|
{
|
|
"epoch": 1.4292980671414037,
|
|
"grad_norm": 0.771271288394928,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5659,
|
|
"mean_token_accuracy": 0.828801155090332,
|
|
"num_tokens": 447885294.0,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"epoch": 1.4303153611393693,
|
|
"grad_norm": 0.7759033441543579,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5889,
|
|
"mean_token_accuracy": 0.8226232528686523,
|
|
"num_tokens": 448201132.0,
|
|
"step": 1406
|
|
},
|
|
{
|
|
"epoch": 1.4313326551373347,
|
|
"grad_norm": 0.8168438076972961,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5863,
|
|
"mean_token_accuracy": 0.8233036398887634,
|
|
"num_tokens": 448516015.0,
|
|
"step": 1407
|
|
},
|
|
{
|
|
"epoch": 1.4323499491353,
|
|
"grad_norm": 0.786461591720581,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5849,
|
|
"mean_token_accuracy": 0.823851466178894,
|
|
"num_tokens": 448828846.0,
|
|
"step": 1408
|
|
},
|
|
{
|
|
"epoch": 1.4333672431332656,
|
|
"grad_norm": 0.7642593383789062,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6012,
|
|
"mean_token_accuracy": 0.820085883140564,
|
|
"num_tokens": 449153221.0,
|
|
"step": 1409
|
|
},
|
|
{
|
|
"epoch": 1.434384537131231,
|
|
"grad_norm": 0.7777250409126282,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5988,
|
|
"mean_token_accuracy": 0.8208210468292236,
|
|
"num_tokens": 449462904.0,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 1.4354018311291963,
|
|
"grad_norm": 0.7578172087669373,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5743,
|
|
"mean_token_accuracy": 0.8272188901901245,
|
|
"num_tokens": 449779209.0,
|
|
"step": 1411
|
|
},
|
|
{
|
|
"epoch": 1.4364191251271619,
|
|
"grad_norm": 0.8341385126113892,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6022,
|
|
"mean_token_accuracy": 0.8189631700515747,
|
|
"num_tokens": 450077259.0,
|
|
"step": 1412
|
|
},
|
|
{
|
|
"epoch": 1.4374364191251272,
|
|
"grad_norm": 0.763503909111023,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.583,
|
|
"mean_token_accuracy": 0.8248583078384399,
|
|
"num_tokens": 450408833.0,
|
|
"step": 1413
|
|
},
|
|
{
|
|
"epoch": 1.4384537131230926,
|
|
"grad_norm": 0.7525264620780945,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5721,
|
|
"mean_token_accuracy": 0.8266898989677429,
|
|
"num_tokens": 450736983.0,
|
|
"step": 1414
|
|
},
|
|
{
|
|
"epoch": 1.439471007121058,
|
|
"grad_norm": 0.7664144039154053,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5592,
|
|
"mean_token_accuracy": 0.8303419351577759,
|
|
"num_tokens": 451046227.0,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"epoch": 1.4404883011190233,
|
|
"grad_norm": 0.7939390540122986,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5837,
|
|
"mean_token_accuracy": 0.825028657913208,
|
|
"num_tokens": 451349176.0,
|
|
"step": 1416
|
|
},
|
|
{
|
|
"epoch": 1.4415055951169888,
|
|
"grad_norm": 0.7194551229476929,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5816,
|
|
"mean_token_accuracy": 0.824691891670227,
|
|
"num_tokens": 451667384.0,
|
|
"step": 1417
|
|
},
|
|
{
|
|
"epoch": 1.4425228891149542,
|
|
"grad_norm": 0.7452390193939209,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.588,
|
|
"mean_token_accuracy": 0.8228015303611755,
|
|
"num_tokens": 451978435.0,
|
|
"step": 1418
|
|
},
|
|
{
|
|
"epoch": 1.4435401831129195,
|
|
"grad_norm": 0.8130427002906799,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5829,
|
|
"mean_token_accuracy": 0.8237577080726624,
|
|
"num_tokens": 452287174.0,
|
|
"step": 1419
|
|
},
|
|
{
|
|
"epoch": 1.4445574771108851,
|
|
"grad_norm": 0.7440806031227112,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5896,
|
|
"mean_token_accuracy": 0.821694016456604,
|
|
"num_tokens": 452595925.0,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 1.4455747711088505,
|
|
"grad_norm": 0.8715015649795532,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5605,
|
|
"mean_token_accuracy": 0.8313428163528442,
|
|
"num_tokens": 452902299.0,
|
|
"step": 1421
|
|
},
|
|
{
|
|
"epoch": 1.4465920651068158,
|
|
"grad_norm": 0.7727867364883423,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5882,
|
|
"mean_token_accuracy": 0.8233987092971802,
|
|
"num_tokens": 453219512.0,
|
|
"step": 1422
|
|
},
|
|
{
|
|
"epoch": 1.4476093591047814,
|
|
"grad_norm": 0.7598720192909241,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5744,
|
|
"mean_token_accuracy": 0.8273491859436035,
|
|
"num_tokens": 453528683.0,
|
|
"step": 1423
|
|
},
|
|
{
|
|
"epoch": 1.4486266531027467,
|
|
"grad_norm": 0.7164946794509888,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5766,
|
|
"mean_token_accuracy": 0.826592206954956,
|
|
"num_tokens": 453856172.0,
|
|
"step": 1424
|
|
},
|
|
{
|
|
"epoch": 1.449643947100712,
|
|
"grad_norm": 0.7681942582130432,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5483,
|
|
"mean_token_accuracy": 0.8334194421768188,
|
|
"num_tokens": 454181489.0,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"epoch": 1.4506612410986774,
|
|
"grad_norm": 0.7888099551200867,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5877,
|
|
"mean_token_accuracy": 0.8240190744400024,
|
|
"num_tokens": 454494258.0,
|
|
"step": 1426
|
|
},
|
|
{
|
|
"epoch": 1.4516785350966428,
|
|
"grad_norm": 0.7697499990463257,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5479,
|
|
"mean_token_accuracy": 0.8328573107719421,
|
|
"num_tokens": 454794488.0,
|
|
"step": 1427
|
|
},
|
|
{
|
|
"epoch": 1.4526958290946084,
|
|
"grad_norm": 0.7287480235099792,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.601,
|
|
"mean_token_accuracy": 0.8192815184593201,
|
|
"num_tokens": 455107150.0,
|
|
"step": 1428
|
|
},
|
|
{
|
|
"epoch": 1.4537131230925737,
|
|
"grad_norm": 0.7731139063835144,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5696,
|
|
"mean_token_accuracy": 0.8271017074584961,
|
|
"num_tokens": 455429921.0,
|
|
"step": 1429
|
|
},
|
|
{
|
|
"epoch": 1.454730417090539,
|
|
"grad_norm": 0.7933812141418457,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5848,
|
|
"mean_token_accuracy": 0.8243111371994019,
|
|
"num_tokens": 455754141.0,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 1.4557477110885046,
|
|
"grad_norm": 0.7570680975914001,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5713,
|
|
"mean_token_accuracy": 0.828052818775177,
|
|
"num_tokens": 456076223.0,
|
|
"step": 1431
|
|
},
|
|
{
|
|
"epoch": 1.45676500508647,
|
|
"grad_norm": 0.7283380627632141,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5755,
|
|
"mean_token_accuracy": 0.8255959749221802,
|
|
"num_tokens": 456396940.0,
|
|
"step": 1432
|
|
},
|
|
{
|
|
"epoch": 1.4577822990844354,
|
|
"grad_norm": 0.7613392472267151,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5603,
|
|
"mean_token_accuracy": 0.8309558629989624,
|
|
"num_tokens": 456705322.0,
|
|
"step": 1433
|
|
},
|
|
{
|
|
"epoch": 1.458799593082401,
|
|
"grad_norm": 0.8308210968971252,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5866,
|
|
"mean_token_accuracy": 0.8235096335411072,
|
|
"num_tokens": 457034824.0,
|
|
"step": 1434
|
|
},
|
|
{
|
|
"epoch": 1.4598168870803663,
|
|
"grad_norm": 0.7514590620994568,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6032,
|
|
"mean_token_accuracy": 0.8205565214157104,
|
|
"num_tokens": 457358314.0,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"epoch": 1.4608341810783316,
|
|
"grad_norm": 0.7398003339767456,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5647,
|
|
"mean_token_accuracy": 0.829264760017395,
|
|
"num_tokens": 457677702.0,
|
|
"step": 1436
|
|
},
|
|
{
|
|
"epoch": 1.461851475076297,
|
|
"grad_norm": 0.7642903923988342,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5879,
|
|
"mean_token_accuracy": 0.8233004808425903,
|
|
"num_tokens": 458004068.0,
|
|
"step": 1437
|
|
},
|
|
{
|
|
"epoch": 1.4628687690742623,
|
|
"grad_norm": 0.7861106991767883,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5935,
|
|
"mean_token_accuracy": 0.8219960927963257,
|
|
"num_tokens": 458327986.0,
|
|
"step": 1438
|
|
},
|
|
{
|
|
"epoch": 1.463886063072228,
|
|
"grad_norm": 0.7153347134590149,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5716,
|
|
"mean_token_accuracy": 0.827376127243042,
|
|
"num_tokens": 458641744.0,
|
|
"step": 1439
|
|
},
|
|
{
|
|
"epoch": 1.4649033570701933,
|
|
"grad_norm": 0.7323182821273804,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5684,
|
|
"mean_token_accuracy": 0.8284355401992798,
|
|
"num_tokens": 458963974.0,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 1.4659206510681586,
|
|
"grad_norm": 0.8875779509544373,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5795,
|
|
"mean_token_accuracy": 0.8269001245498657,
|
|
"num_tokens": 459273431.0,
|
|
"step": 1441
|
|
},
|
|
{
|
|
"epoch": 1.4669379450661242,
|
|
"grad_norm": 0.7622124552726746,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5758,
|
|
"mean_token_accuracy": 0.8258703351020813,
|
|
"num_tokens": 459590197.0,
|
|
"step": 1442
|
|
},
|
|
{
|
|
"epoch": 1.4679552390640895,
|
|
"grad_norm": 0.7376556396484375,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5796,
|
|
"mean_token_accuracy": 0.825239896774292,
|
|
"num_tokens": 459898577.0,
|
|
"step": 1443
|
|
},
|
|
{
|
|
"epoch": 1.4689725330620549,
|
|
"grad_norm": 0.7311775088310242,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5639,
|
|
"mean_token_accuracy": 0.8291976451873779,
|
|
"num_tokens": 460209370.0,
|
|
"step": 1444
|
|
},
|
|
{
|
|
"epoch": 1.4699898270600205,
|
|
"grad_norm": 0.7577133178710938,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5527,
|
|
"mean_token_accuracy": 0.8328637480735779,
|
|
"num_tokens": 460538263.0,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"epoch": 1.4710071210579858,
|
|
"grad_norm": 0.7706095576286316,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5716,
|
|
"mean_token_accuracy": 0.8277468681335449,
|
|
"num_tokens": 460866578.0,
|
|
"step": 1446
|
|
},
|
|
{
|
|
"epoch": 1.4720244150559512,
|
|
"grad_norm": 0.8589508533477783,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5949,
|
|
"mean_token_accuracy": 0.8217111825942993,
|
|
"num_tokens": 461171630.0,
|
|
"step": 1447
|
|
},
|
|
{
|
|
"epoch": 1.4730417090539165,
|
|
"grad_norm": 0.7346833348274231,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.573,
|
|
"mean_token_accuracy": 0.8265662789344788,
|
|
"num_tokens": 461506509.0,
|
|
"step": 1448
|
|
},
|
|
{
|
|
"epoch": 1.4740590030518819,
|
|
"grad_norm": 0.75523841381073,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5329,
|
|
"mean_token_accuracy": 0.8384934663772583,
|
|
"num_tokens": 461835821.0,
|
|
"step": 1449
|
|
},
|
|
{
|
|
"epoch": 1.4750762970498474,
|
|
"grad_norm": 0.7664790749549866,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5588,
|
|
"mean_token_accuracy": 0.8306655883789062,
|
|
"num_tokens": 462141478.0,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 1.4760935910478128,
|
|
"grad_norm": 0.8109428882598877,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5556,
|
|
"mean_token_accuracy": 0.831562876701355,
|
|
"num_tokens": 462469829.0,
|
|
"step": 1451
|
|
},
|
|
{
|
|
"epoch": 1.4771108850457781,
|
|
"grad_norm": 0.726762592792511,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5665,
|
|
"mean_token_accuracy": 0.8273746967315674,
|
|
"num_tokens": 462805295.0,
|
|
"step": 1452
|
|
},
|
|
{
|
|
"epoch": 1.4781281790437437,
|
|
"grad_norm": 0.7275935411453247,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5592,
|
|
"mean_token_accuracy": 0.8319987058639526,
|
|
"num_tokens": 463137500.0,
|
|
"step": 1453
|
|
},
|
|
{
|
|
"epoch": 1.479145473041709,
|
|
"grad_norm": 0.7638391256332397,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.593,
|
|
"mean_token_accuracy": 0.823004424571991,
|
|
"num_tokens": 463448939.0,
|
|
"step": 1454
|
|
},
|
|
{
|
|
"epoch": 1.4801627670396744,
|
|
"grad_norm": 0.7693440318107605,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5908,
|
|
"mean_token_accuracy": 0.8211554288864136,
|
|
"num_tokens": 463759585.0,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"epoch": 1.48118006103764,
|
|
"grad_norm": 0.7615060210227966,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5734,
|
|
"mean_token_accuracy": 0.8256450891494751,
|
|
"num_tokens": 464082388.0,
|
|
"step": 1456
|
|
},
|
|
{
|
|
"epoch": 1.4821973550356053,
|
|
"grad_norm": 0.713785707950592,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5819,
|
|
"mean_token_accuracy": 0.8260461688041687,
|
|
"num_tokens": 464417202.0,
|
|
"step": 1457
|
|
},
|
|
{
|
|
"epoch": 1.4832146490335707,
|
|
"grad_norm": 0.7960901856422424,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5682,
|
|
"mean_token_accuracy": 0.8279980421066284,
|
|
"num_tokens": 464725276.0,
|
|
"step": 1458
|
|
},
|
|
{
|
|
"epoch": 1.484231943031536,
|
|
"grad_norm": 0.795560359954834,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.556,
|
|
"mean_token_accuracy": 0.8323945999145508,
|
|
"num_tokens": 465058613.0,
|
|
"step": 1459
|
|
},
|
|
{
|
|
"epoch": 1.4852492370295014,
|
|
"grad_norm": 0.7435919642448425,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.562,
|
|
"mean_token_accuracy": 0.8298157453536987,
|
|
"num_tokens": 465376097.0,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 1.486266531027467,
|
|
"grad_norm": 0.7551680207252502,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.606,
|
|
"mean_token_accuracy": 0.817765474319458,
|
|
"num_tokens": 465702748.0,
|
|
"step": 1461
|
|
},
|
|
{
|
|
"epoch": 1.4872838250254323,
|
|
"grad_norm": 0.7732077836990356,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5458,
|
|
"mean_token_accuracy": 0.8342182636260986,
|
|
"num_tokens": 466007913.0,
|
|
"step": 1462
|
|
},
|
|
{
|
|
"epoch": 1.4883011190233977,
|
|
"grad_norm": 0.7947700619697571,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.586,
|
|
"mean_token_accuracy": 0.8237362504005432,
|
|
"num_tokens": 466327362.0,
|
|
"step": 1463
|
|
},
|
|
{
|
|
"epoch": 1.4893184130213633,
|
|
"grad_norm": 0.7750511765480042,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5541,
|
|
"mean_token_accuracy": 0.8315537571907043,
|
|
"num_tokens": 466638743.0,
|
|
"step": 1464
|
|
},
|
|
{
|
|
"epoch": 1.4903357070193286,
|
|
"grad_norm": 0.7739797234535217,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5749,
|
|
"mean_token_accuracy": 0.8269380927085876,
|
|
"num_tokens": 466958501.0,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"epoch": 1.491353001017294,
|
|
"grad_norm": 0.7865405082702637,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5894,
|
|
"mean_token_accuracy": 0.8227080702781677,
|
|
"num_tokens": 467271547.0,
|
|
"step": 1466
|
|
},
|
|
{
|
|
"epoch": 1.4923702950152595,
|
|
"grad_norm": 0.7469372749328613,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5635,
|
|
"mean_token_accuracy": 0.8284210562705994,
|
|
"num_tokens": 467597696.0,
|
|
"step": 1467
|
|
},
|
|
{
|
|
"epoch": 1.4933875890132249,
|
|
"grad_norm": 0.7437664866447449,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5782,
|
|
"mean_token_accuracy": 0.8262543082237244,
|
|
"num_tokens": 467925187.0,
|
|
"step": 1468
|
|
},
|
|
{
|
|
"epoch": 1.4944048830111902,
|
|
"grad_norm": 0.8482660055160522,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.598,
|
|
"mean_token_accuracy": 0.8197759985923767,
|
|
"num_tokens": 468226973.0,
|
|
"step": 1469
|
|
},
|
|
{
|
|
"epoch": 1.4954221770091556,
|
|
"grad_norm": 0.7456377744674683,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5536,
|
|
"mean_token_accuracy": 0.8341470956802368,
|
|
"num_tokens": 468534657.0,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 1.496439471007121,
|
|
"grad_norm": 0.7405015230178833,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5834,
|
|
"mean_token_accuracy": 0.8242038488388062,
|
|
"num_tokens": 468863037.0,
|
|
"step": 1471
|
|
},
|
|
{
|
|
"epoch": 1.4974567650050865,
|
|
"grad_norm": 0.7640259265899658,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5837,
|
|
"mean_token_accuracy": 0.8241820335388184,
|
|
"num_tokens": 469175522.0,
|
|
"step": 1472
|
|
},
|
|
{
|
|
"epoch": 1.4984740590030519,
|
|
"grad_norm": 0.7306478023529053,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5568,
|
|
"mean_token_accuracy": 0.832508385181427,
|
|
"num_tokens": 469489877.0,
|
|
"step": 1473
|
|
},
|
|
{
|
|
"epoch": 1.4994913530010172,
|
|
"grad_norm": 0.7525316476821899,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5823,
|
|
"mean_token_accuracy": 0.8232234716415405,
|
|
"num_tokens": 469823003.0,
|
|
"step": 1474
|
|
},
|
|
{
|
|
"epoch": 1.5005086469989828,
|
|
"grad_norm": 0.728570282459259,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5793,
|
|
"mean_token_accuracy": 0.8248062133789062,
|
|
"num_tokens": 470149249.0,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"epoch": 1.5015259409969481,
|
|
"grad_norm": 0.7569950222969055,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6093,
|
|
"mean_token_accuracy": 0.8198111653327942,
|
|
"num_tokens": 470472689.0,
|
|
"step": 1476
|
|
},
|
|
{
|
|
"epoch": 1.5025432349949135,
|
|
"grad_norm": 0.7983245253562927,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5828,
|
|
"mean_token_accuracy": 0.8236798048019409,
|
|
"num_tokens": 470771648.0,
|
|
"step": 1477
|
|
},
|
|
{
|
|
"epoch": 1.503560528992879,
|
|
"grad_norm": 0.7313922047615051,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5743,
|
|
"mean_token_accuracy": 0.826393723487854,
|
|
"num_tokens": 471089333.0,
|
|
"step": 1478
|
|
},
|
|
{
|
|
"epoch": 1.5045778229908442,
|
|
"grad_norm": 0.739508867263794,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5782,
|
|
"mean_token_accuracy": 0.8257728815078735,
|
|
"num_tokens": 471422317.0,
|
|
"step": 1479
|
|
},
|
|
{
|
|
"epoch": 1.5055951169888098,
|
|
"grad_norm": 0.8040635585784912,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5718,
|
|
"mean_token_accuracy": 0.8284851908683777,
|
|
"num_tokens": 471730047.0,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 1.5066124109867753,
|
|
"grad_norm": 0.7771586775779724,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5927,
|
|
"mean_token_accuracy": 0.8217354416847229,
|
|
"num_tokens": 472056255.0,
|
|
"step": 1481
|
|
},
|
|
{
|
|
"epoch": 1.5076297049847405,
|
|
"grad_norm": 0.7523510456085205,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5751,
|
|
"mean_token_accuracy": 0.8259488344192505,
|
|
"num_tokens": 472385581.0,
|
|
"step": 1482
|
|
},
|
|
{
|
|
"epoch": 1.508646998982706,
|
|
"grad_norm": 0.8081842660903931,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5586,
|
|
"mean_token_accuracy": 0.8305275440216064,
|
|
"num_tokens": 472697254.0,
|
|
"step": 1483
|
|
},
|
|
{
|
|
"epoch": 1.5096642929806714,
|
|
"grad_norm": 0.7643046975135803,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5967,
|
|
"mean_token_accuracy": 0.8213087320327759,
|
|
"num_tokens": 473029719.0,
|
|
"step": 1484
|
|
},
|
|
{
|
|
"epoch": 1.5106815869786367,
|
|
"grad_norm": 0.74493807554245,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5824,
|
|
"mean_token_accuracy": 0.8234145641326904,
|
|
"num_tokens": 473348305.0,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"epoch": 1.5116988809766023,
|
|
"grad_norm": 0.771102786064148,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5798,
|
|
"mean_token_accuracy": 0.825602650642395,
|
|
"num_tokens": 473666571.0,
|
|
"step": 1486
|
|
},
|
|
{
|
|
"epoch": 1.5127161749745677,
|
|
"grad_norm": 0.7674226760864258,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5509,
|
|
"mean_token_accuracy": 0.8334149718284607,
|
|
"num_tokens": 473976615.0,
|
|
"step": 1487
|
|
},
|
|
{
|
|
"epoch": 1.513733468972533,
|
|
"grad_norm": 0.7604283690452576,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5644,
|
|
"mean_token_accuracy": 0.8303079009056091,
|
|
"num_tokens": 474309906.0,
|
|
"step": 1488
|
|
},
|
|
{
|
|
"epoch": 1.5147507629704986,
|
|
"grad_norm": 0.7903773784637451,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5905,
|
|
"mean_token_accuracy": 0.8222702741622925,
|
|
"num_tokens": 474620054.0,
|
|
"step": 1489
|
|
},
|
|
{
|
|
"epoch": 1.5157680569684637,
|
|
"grad_norm": 0.791972815990448,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5782,
|
|
"mean_token_accuracy": 0.8252575397491455,
|
|
"num_tokens": 474940167.0,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 1.5167853509664293,
|
|
"grad_norm": 0.7404340505599976,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.562,
|
|
"mean_token_accuracy": 0.8309566974639893,
|
|
"num_tokens": 475258659.0,
|
|
"step": 1491
|
|
},
|
|
{
|
|
"epoch": 1.5178026449643949,
|
|
"grad_norm": 0.7210290431976318,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5687,
|
|
"mean_token_accuracy": 0.8279041051864624,
|
|
"num_tokens": 475577873.0,
|
|
"step": 1492
|
|
},
|
|
{
|
|
"epoch": 1.51881993896236,
|
|
"grad_norm": 0.737732470035553,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5885,
|
|
"mean_token_accuracy": 0.8217068314552307,
|
|
"num_tokens": 475909298.0,
|
|
"step": 1493
|
|
},
|
|
{
|
|
"epoch": 1.5198372329603256,
|
|
"grad_norm": 0.8280147910118103,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5878,
|
|
"mean_token_accuracy": 0.8225430250167847,
|
|
"num_tokens": 476240775.0,
|
|
"step": 1494
|
|
},
|
|
{
|
|
"epoch": 1.520854526958291,
|
|
"grad_norm": 0.7968106269836426,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5777,
|
|
"mean_token_accuracy": 0.8245846629142761,
|
|
"num_tokens": 476556580.0,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"epoch": 1.5218718209562563,
|
|
"grad_norm": 0.7490198016166687,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5663,
|
|
"mean_token_accuracy": 0.8279181718826294,
|
|
"num_tokens": 476855526.0,
|
|
"step": 1496
|
|
},
|
|
{
|
|
"epoch": 1.5228891149542219,
|
|
"grad_norm": 0.7276293635368347,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.583,
|
|
"mean_token_accuracy": 0.8238677978515625,
|
|
"num_tokens": 477177840.0,
|
|
"step": 1497
|
|
},
|
|
{
|
|
"epoch": 1.5239064089521872,
|
|
"grad_norm": 0.7779879570007324,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5591,
|
|
"mean_token_accuracy": 0.830930769443512,
|
|
"num_tokens": 477502383.0,
|
|
"step": 1498
|
|
},
|
|
{
|
|
"epoch": 1.5249237029501526,
|
|
"grad_norm": 0.8194068670272827,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5671,
|
|
"mean_token_accuracy": 0.8289549946784973,
|
|
"num_tokens": 477826492.0,
|
|
"step": 1499
|
|
},
|
|
{
|
|
"epoch": 1.5259409969481181,
|
|
"grad_norm": 0.7614669799804688,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5509,
|
|
"mean_token_accuracy": 0.8331164717674255,
|
|
"num_tokens": 478124029.0,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 1.5269582909460833,
|
|
"grad_norm": 0.7624503970146179,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5638,
|
|
"mean_token_accuracy": 0.8300232291221619,
|
|
"num_tokens": 478451961.0,
|
|
"step": 1501
|
|
},
|
|
{
|
|
"epoch": 1.5279755849440488,
|
|
"grad_norm": 0.7618391513824463,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5923,
|
|
"mean_token_accuracy": 0.8214845657348633,
|
|
"num_tokens": 478777570.0,
|
|
"step": 1502
|
|
},
|
|
{
|
|
"epoch": 1.5289928789420142,
|
|
"grad_norm": 0.8302142024040222,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5674,
|
|
"mean_token_accuracy": 0.8292617201805115,
|
|
"num_tokens": 479091694.0,
|
|
"step": 1503
|
|
},
|
|
{
|
|
"epoch": 1.5300101729399795,
|
|
"grad_norm": 0.8318211436271667,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5786,
|
|
"mean_token_accuracy": 0.8252511024475098,
|
|
"num_tokens": 479395630.0,
|
|
"step": 1504
|
|
},
|
|
{
|
|
"epoch": 1.5310274669379451,
|
|
"grad_norm": 0.81647789478302,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5428,
|
|
"mean_token_accuracy": 0.8355250358581543,
|
|
"num_tokens": 479705902.0,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"epoch": 1.5320447609359105,
|
|
"grad_norm": 0.7545574307441711,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.57,
|
|
"mean_token_accuracy": 0.826928436756134,
|
|
"num_tokens": 480016304.0,
|
|
"step": 1506
|
|
},
|
|
{
|
|
"epoch": 1.5330620549338758,
|
|
"grad_norm": 0.7784783840179443,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5615,
|
|
"mean_token_accuracy": 0.8297576904296875,
|
|
"num_tokens": 480346413.0,
|
|
"step": 1507
|
|
},
|
|
{
|
|
"epoch": 1.5340793489318414,
|
|
"grad_norm": 0.8040642142295837,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5697,
|
|
"mean_token_accuracy": 0.828713059425354,
|
|
"num_tokens": 480658234.0,
|
|
"step": 1508
|
|
},
|
|
{
|
|
"epoch": 1.5350966429298067,
|
|
"grad_norm": 0.7709273099899292,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5734,
|
|
"mean_token_accuracy": 0.8270728588104248,
|
|
"num_tokens": 480980014.0,
|
|
"step": 1509
|
|
},
|
|
{
|
|
"epoch": 1.536113936927772,
|
|
"grad_norm": 0.8043686747550964,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6068,
|
|
"mean_token_accuracy": 0.818744957447052,
|
|
"num_tokens": 481310267.0,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 1.5371312309257377,
|
|
"grad_norm": 0.7916233539581299,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6269,
|
|
"mean_token_accuracy": 0.8126989603042603,
|
|
"num_tokens": 481623100.0,
|
|
"step": 1511
|
|
},
|
|
{
|
|
"epoch": 1.5381485249237028,
|
|
"grad_norm": 0.8328151702880859,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5682,
|
|
"mean_token_accuracy": 0.8281493186950684,
|
|
"num_tokens": 481941546.0,
|
|
"step": 1512
|
|
},
|
|
{
|
|
"epoch": 1.5391658189216684,
|
|
"grad_norm": 0.8383995294570923,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6069,
|
|
"mean_token_accuracy": 0.8183453679084778,
|
|
"num_tokens": 482261954.0,
|
|
"step": 1513
|
|
},
|
|
{
|
|
"epoch": 1.5401831129196337,
|
|
"grad_norm": 0.7694042921066284,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.574,
|
|
"mean_token_accuracy": 0.8272018432617188,
|
|
"num_tokens": 482583861.0,
|
|
"step": 1514
|
|
},
|
|
{
|
|
"epoch": 1.541200406917599,
|
|
"grad_norm": 0.7397472858428955,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5686,
|
|
"mean_token_accuracy": 0.8283990621566772,
|
|
"num_tokens": 482917251.0,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"epoch": 1.5422177009155646,
|
|
"grad_norm": 0.7571448683738708,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5698,
|
|
"mean_token_accuracy": 0.8278690576553345,
|
|
"num_tokens": 483235136.0,
|
|
"step": 1516
|
|
},
|
|
{
|
|
"epoch": 1.54323499491353,
|
|
"grad_norm": 0.764622151851654,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5663,
|
|
"mean_token_accuracy": 0.8284850120544434,
|
|
"num_tokens": 483551953.0,
|
|
"step": 1517
|
|
},
|
|
{
|
|
"epoch": 1.5442522889114954,
|
|
"grad_norm": 0.7923704385757446,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5705,
|
|
"mean_token_accuracy": 0.8279635906219482,
|
|
"num_tokens": 483861059.0,
|
|
"step": 1518
|
|
},
|
|
{
|
|
"epoch": 1.545269582909461,
|
|
"grad_norm": 0.7625948190689087,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5633,
|
|
"mean_token_accuracy": 0.8296431303024292,
|
|
"num_tokens": 484173960.0,
|
|
"step": 1519
|
|
},
|
|
{
|
|
"epoch": 1.5462868769074263,
|
|
"grad_norm": 0.7725732922554016,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5984,
|
|
"mean_token_accuracy": 0.8205369710922241,
|
|
"num_tokens": 484486741.0,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 1.5473041709053916,
|
|
"grad_norm": 0.7608553171157837,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5832,
|
|
"mean_token_accuracy": 0.8246746063232422,
|
|
"num_tokens": 484829318.0,
|
|
"step": 1521
|
|
},
|
|
{
|
|
"epoch": 1.5483214649033572,
|
|
"grad_norm": 0.7753155827522278,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5807,
|
|
"mean_token_accuracy": 0.8245500326156616,
|
|
"num_tokens": 485146681.0,
|
|
"step": 1522
|
|
},
|
|
{
|
|
"epoch": 1.5493387589013223,
|
|
"grad_norm": 0.7314413785934448,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5763,
|
|
"mean_token_accuracy": 0.8249948024749756,
|
|
"num_tokens": 485461388.0,
|
|
"step": 1523
|
|
},
|
|
{
|
|
"epoch": 1.550356052899288,
|
|
"grad_norm": 0.7906734943389893,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.567,
|
|
"mean_token_accuracy": 0.8281234502792358,
|
|
"num_tokens": 485774911.0,
|
|
"step": 1524
|
|
},
|
|
{
|
|
"epoch": 1.5513733468972533,
|
|
"grad_norm": 0.7759382724761963,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5864,
|
|
"mean_token_accuracy": 0.8234958052635193,
|
|
"num_tokens": 486099339.0,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"epoch": 1.5523906408952186,
|
|
"grad_norm": 0.7591975927352905,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5704,
|
|
"mean_token_accuracy": 0.826912522315979,
|
|
"num_tokens": 486412016.0,
|
|
"step": 1526
|
|
},
|
|
{
|
|
"epoch": 1.5534079348931842,
|
|
"grad_norm": 0.7712595462799072,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5534,
|
|
"mean_token_accuracy": 0.8311680555343628,
|
|
"num_tokens": 486710623.0,
|
|
"step": 1527
|
|
},
|
|
{
|
|
"epoch": 1.5544252288911495,
|
|
"grad_norm": 0.777800977230072,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5913,
|
|
"mean_token_accuracy": 0.8223469257354736,
|
|
"num_tokens": 487025165.0,
|
|
"step": 1528
|
|
},
|
|
{
|
|
"epoch": 1.5554425228891149,
|
|
"grad_norm": 0.7843142747879028,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.59,
|
|
"mean_token_accuracy": 0.8229750990867615,
|
|
"num_tokens": 487349819.0,
|
|
"step": 1529
|
|
},
|
|
{
|
|
"epoch": 1.5564598168870805,
|
|
"grad_norm": 0.7695160508155823,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5892,
|
|
"mean_token_accuracy": 0.8223654627799988,
|
|
"num_tokens": 487662393.0,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 1.5574771108850458,
|
|
"grad_norm": 0.7649616599082947,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5945,
|
|
"mean_token_accuracy": 0.8219040632247925,
|
|
"num_tokens": 487985726.0,
|
|
"step": 1531
|
|
},
|
|
{
|
|
"epoch": 1.5584944048830112,
|
|
"grad_norm": 0.7629556059837341,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5554,
|
|
"mean_token_accuracy": 0.8320173025131226,
|
|
"num_tokens": 488319919.0,
|
|
"step": 1532
|
|
},
|
|
{
|
|
"epoch": 1.5595116988809767,
|
|
"grad_norm": 0.7568076848983765,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.569,
|
|
"mean_token_accuracy": 0.8286150097846985,
|
|
"num_tokens": 488630649.0,
|
|
"step": 1533
|
|
},
|
|
{
|
|
"epoch": 1.5605289928789419,
|
|
"grad_norm": 0.7621586322784424,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5747,
|
|
"mean_token_accuracy": 0.8262230157852173,
|
|
"num_tokens": 488932053.0,
|
|
"step": 1534
|
|
},
|
|
{
|
|
"epoch": 1.5615462868769074,
|
|
"grad_norm": 0.788788378238678,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5751,
|
|
"mean_token_accuracy": 0.8261498808860779,
|
|
"num_tokens": 489250757.0,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"epoch": 1.5625635808748728,
|
|
"grad_norm": 0.7831841111183167,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5866,
|
|
"mean_token_accuracy": 0.8225564360618591,
|
|
"num_tokens": 489567664.0,
|
|
"step": 1536
|
|
},
|
|
{
|
|
"epoch": 1.5635808748728381,
|
|
"grad_norm": 0.7453765273094177,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5648,
|
|
"mean_token_accuracy": 0.8284498453140259,
|
|
"num_tokens": 489874451.0,
|
|
"step": 1537
|
|
},
|
|
{
|
|
"epoch": 1.5645981688708037,
|
|
"grad_norm": 0.7343994379043579,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5627,
|
|
"mean_token_accuracy": 0.8306755423545837,
|
|
"num_tokens": 490204909.0,
|
|
"step": 1538
|
|
},
|
|
{
|
|
"epoch": 1.565615462868769,
|
|
"grad_norm": 0.7722320556640625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5886,
|
|
"mean_token_accuracy": 0.8224450349807739,
|
|
"num_tokens": 490514531.0,
|
|
"step": 1539
|
|
},
|
|
{
|
|
"epoch": 1.5666327568667344,
|
|
"grad_norm": 0.743450403213501,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5734,
|
|
"mean_token_accuracy": 0.8267664313316345,
|
|
"num_tokens": 490845055.0,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 1.5676500508647,
|
|
"grad_norm": 0.8455149531364441,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6086,
|
|
"mean_token_accuracy": 0.817818820476532,
|
|
"num_tokens": 491152661.0,
|
|
"step": 1541
|
|
},
|
|
{
|
|
"epoch": 1.5686673448626653,
|
|
"grad_norm": 0.7722828388214111,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5735,
|
|
"mean_token_accuracy": 0.8273724317550659,
|
|
"num_tokens": 491455713.0,
|
|
"step": 1542
|
|
},
|
|
{
|
|
"epoch": 1.5696846388606307,
|
|
"grad_norm": 0.7712371945381165,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5581,
|
|
"mean_token_accuracy": 0.831794798374176,
|
|
"num_tokens": 491772168.0,
|
|
"step": 1543
|
|
},
|
|
{
|
|
"epoch": 1.5707019328585963,
|
|
"grad_norm": 0.7466083765029907,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5678,
|
|
"mean_token_accuracy": 0.82890784740448,
|
|
"num_tokens": 492088615.0,
|
|
"step": 1544
|
|
},
|
|
{
|
|
"epoch": 1.5717192268565614,
|
|
"grad_norm": 0.821121871471405,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5468,
|
|
"mean_token_accuracy": 0.8341401815414429,
|
|
"num_tokens": 492397349.0,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"epoch": 1.572736520854527,
|
|
"grad_norm": 0.7918075323104858,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5696,
|
|
"mean_token_accuracy": 0.8286417126655579,
|
|
"num_tokens": 492713338.0,
|
|
"step": 1546
|
|
},
|
|
{
|
|
"epoch": 1.5737538148524923,
|
|
"grad_norm": 0.7808675169944763,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5669,
|
|
"mean_token_accuracy": 0.8277544975280762,
|
|
"num_tokens": 493019564.0,
|
|
"step": 1547
|
|
},
|
|
{
|
|
"epoch": 1.5747711088504577,
|
|
"grad_norm": 0.7513705492019653,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5739,
|
|
"mean_token_accuracy": 0.8273752331733704,
|
|
"num_tokens": 493335981.0,
|
|
"step": 1548
|
|
},
|
|
{
|
|
"epoch": 1.5757884028484233,
|
|
"grad_norm": 0.7467897534370422,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.55,
|
|
"mean_token_accuracy": 0.8333742618560791,
|
|
"num_tokens": 493652983.0,
|
|
"step": 1549
|
|
},
|
|
{
|
|
"epoch": 1.5768056968463886,
|
|
"grad_norm": 0.7320238351821899,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5605,
|
|
"mean_token_accuracy": 0.8308363556861877,
|
|
"num_tokens": 493983683.0,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 1.577822990844354,
|
|
"grad_norm": 0.7745316028594971,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5946,
|
|
"mean_token_accuracy": 0.8205184936523438,
|
|
"num_tokens": 494296649.0,
|
|
"step": 1551
|
|
},
|
|
{
|
|
"epoch": 1.5788402848423195,
|
|
"grad_norm": 0.7687009572982788,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5788,
|
|
"mean_token_accuracy": 0.8262377977371216,
|
|
"num_tokens": 494612800.0,
|
|
"step": 1552
|
|
},
|
|
{
|
|
"epoch": 1.5798575788402849,
|
|
"grad_norm": 0.7544592618942261,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5699,
|
|
"mean_token_accuracy": 0.8294517993927002,
|
|
"num_tokens": 494920941.0,
|
|
"step": 1553
|
|
},
|
|
{
|
|
"epoch": 1.5808748728382502,
|
|
"grad_norm": 0.73783278465271,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5529,
|
|
"mean_token_accuracy": 0.8330302238464355,
|
|
"num_tokens": 495234703.0,
|
|
"step": 1554
|
|
},
|
|
{
|
|
"epoch": 1.5818921668362158,
|
|
"grad_norm": 0.7729275822639465,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5641,
|
|
"mean_token_accuracy": 0.8295111060142517,
|
|
"num_tokens": 495546408.0,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"epoch": 1.582909460834181,
|
|
"grad_norm": 0.8193879723548889,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5933,
|
|
"mean_token_accuracy": 0.8216261863708496,
|
|
"num_tokens": 495873689.0,
|
|
"step": 1556
|
|
},
|
|
{
|
|
"epoch": 1.5839267548321465,
|
|
"grad_norm": 0.7715921998023987,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5652,
|
|
"mean_token_accuracy": 0.8296725749969482,
|
|
"num_tokens": 496200993.0,
|
|
"step": 1557
|
|
},
|
|
{
|
|
"epoch": 1.5849440488301119,
|
|
"grad_norm": 0.7675861120223999,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5726,
|
|
"mean_token_accuracy": 0.8267984986305237,
|
|
"num_tokens": 496528159.0,
|
|
"step": 1558
|
|
},
|
|
{
|
|
"epoch": 1.5859613428280772,
|
|
"grad_norm": 0.7609422206878662,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.573,
|
|
"mean_token_accuracy": 0.8270455002784729,
|
|
"num_tokens": 496842977.0,
|
|
"step": 1559
|
|
},
|
|
{
|
|
"epoch": 1.5869786368260428,
|
|
"grad_norm": 0.798121452331543,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5806,
|
|
"mean_token_accuracy": 0.8252966403961182,
|
|
"num_tokens": 497141795.0,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 1.5879959308240081,
|
|
"grad_norm": 0.7561322450637817,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.574,
|
|
"mean_token_accuracy": 0.8259497880935669,
|
|
"num_tokens": 497453652.0,
|
|
"step": 1561
|
|
},
|
|
{
|
|
"epoch": 1.5890132248219735,
|
|
"grad_norm": 0.7411714196205139,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5763,
|
|
"mean_token_accuracy": 0.8266175985336304,
|
|
"num_tokens": 497784575.0,
|
|
"step": 1562
|
|
},
|
|
{
|
|
"epoch": 1.590030518819939,
|
|
"grad_norm": 0.7651845812797546,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5647,
|
|
"mean_token_accuracy": 0.8290013074874878,
|
|
"num_tokens": 498096276.0,
|
|
"step": 1563
|
|
},
|
|
{
|
|
"epoch": 1.5910478128179044,
|
|
"grad_norm": 0.7341787219047546,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5714,
|
|
"mean_token_accuracy": 0.8271242380142212,
|
|
"num_tokens": 498422502.0,
|
|
"step": 1564
|
|
},
|
|
{
|
|
"epoch": 1.5920651068158698,
|
|
"grad_norm": 0.7575334906578064,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5588,
|
|
"mean_token_accuracy": 0.8300933837890625,
|
|
"num_tokens": 498743572.0,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"epoch": 1.5930824008138353,
|
|
"grad_norm": 0.7907820343971252,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5921,
|
|
"mean_token_accuracy": 0.8232530355453491,
|
|
"num_tokens": 499051577.0,
|
|
"step": 1566
|
|
},
|
|
{
|
|
"epoch": 1.5940996948118005,
|
|
"grad_norm": 0.7722137570381165,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5795,
|
|
"mean_token_accuracy": 0.824563205242157,
|
|
"num_tokens": 499360910.0,
|
|
"step": 1567
|
|
},
|
|
{
|
|
"epoch": 1.595116988809766,
|
|
"grad_norm": 0.7802767753601074,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5627,
|
|
"mean_token_accuracy": 0.8294447660446167,
|
|
"num_tokens": 499675969.0,
|
|
"step": 1568
|
|
},
|
|
{
|
|
"epoch": 1.5961342828077314,
|
|
"grad_norm": 0.7496720552444458,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5596,
|
|
"mean_token_accuracy": 0.8307507634162903,
|
|
"num_tokens": 499988817.0,
|
|
"step": 1569
|
|
},
|
|
{
|
|
"epoch": 1.5971515768056967,
|
|
"grad_norm": 0.7167335152626038,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5875,
|
|
"mean_token_accuracy": 0.8224383592605591,
|
|
"num_tokens": 500312578.0,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 1.5981688708036623,
|
|
"grad_norm": 0.7863753437995911,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5633,
|
|
"mean_token_accuracy": 0.8292855620384216,
|
|
"num_tokens": 500631973.0,
|
|
"step": 1571
|
|
},
|
|
{
|
|
"epoch": 1.5991861648016277,
|
|
"grad_norm": 0.7296916246414185,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5703,
|
|
"mean_token_accuracy": 0.8279212117195129,
|
|
"num_tokens": 500955095.0,
|
|
"step": 1572
|
|
},
|
|
{
|
|
"epoch": 1.600203458799593,
|
|
"grad_norm": 0.7697763442993164,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5797,
|
|
"mean_token_accuracy": 0.8253737688064575,
|
|
"num_tokens": 501271088.0,
|
|
"step": 1573
|
|
},
|
|
{
|
|
"epoch": 1.6012207527975586,
|
|
"grad_norm": 0.8204194903373718,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.564,
|
|
"mean_token_accuracy": 0.8297313451766968,
|
|
"num_tokens": 501573475.0,
|
|
"step": 1574
|
|
},
|
|
{
|
|
"epoch": 1.602238046795524,
|
|
"grad_norm": 0.8012133836746216,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5654,
|
|
"mean_token_accuracy": 0.828351616859436,
|
|
"num_tokens": 501887647.0,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"epoch": 1.6032553407934893,
|
|
"grad_norm": 0.7700391411781311,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5569,
|
|
"mean_token_accuracy": 0.8309698104858398,
|
|
"num_tokens": 502192904.0,
|
|
"step": 1576
|
|
},
|
|
{
|
|
"epoch": 1.6042726347914549,
|
|
"grad_norm": 0.7932988405227661,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5667,
|
|
"mean_token_accuracy": 0.8294327855110168,
|
|
"num_tokens": 502505274.0,
|
|
"step": 1577
|
|
},
|
|
{
|
|
"epoch": 1.60528992878942,
|
|
"grad_norm": 0.7901273369789124,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5926,
|
|
"mean_token_accuracy": 0.8220197558403015,
|
|
"num_tokens": 502817024.0,
|
|
"step": 1578
|
|
},
|
|
{
|
|
"epoch": 1.6063072227873856,
|
|
"grad_norm": 0.8021532297134399,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.602,
|
|
"mean_token_accuracy": 0.820052981376648,
|
|
"num_tokens": 503156030.0,
|
|
"step": 1579
|
|
},
|
|
{
|
|
"epoch": 1.607324516785351,
|
|
"grad_norm": 0.7981246709823608,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5564,
|
|
"mean_token_accuracy": 0.830917239189148,
|
|
"num_tokens": 503459669.0,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 1.6083418107833163,
|
|
"grad_norm": 0.7735322117805481,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5672,
|
|
"mean_token_accuracy": 0.8284094929695129,
|
|
"num_tokens": 503766380.0,
|
|
"step": 1581
|
|
},
|
|
{
|
|
"epoch": 1.6093591047812819,
|
|
"grad_norm": 0.8570979833602905,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5986,
|
|
"mean_token_accuracy": 0.8192204236984253,
|
|
"num_tokens": 504077350.0,
|
|
"step": 1582
|
|
},
|
|
{
|
|
"epoch": 1.6103763987792472,
|
|
"grad_norm": 0.8010280132293701,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5567,
|
|
"mean_token_accuracy": 0.8301876783370972,
|
|
"num_tokens": 504396848.0,
|
|
"step": 1583
|
|
},
|
|
{
|
|
"epoch": 1.6113936927772126,
|
|
"grad_norm": 0.8592894077301025,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5526,
|
|
"mean_token_accuracy": 0.8320919275283813,
|
|
"num_tokens": 504709088.0,
|
|
"step": 1584
|
|
},
|
|
{
|
|
"epoch": 1.6124109867751781,
|
|
"grad_norm": 0.7861720323562622,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5799,
|
|
"mean_token_accuracy": 0.8243271112442017,
|
|
"num_tokens": 505040673.0,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"epoch": 1.6134282807731435,
|
|
"grad_norm": 0.7919045090675354,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5678,
|
|
"mean_token_accuracy": 0.8284953236579895,
|
|
"num_tokens": 505369618.0,
|
|
"step": 1586
|
|
},
|
|
{
|
|
"epoch": 1.6144455747711088,
|
|
"grad_norm": 0.9178990125656128,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5537,
|
|
"mean_token_accuracy": 0.8316802382469177,
|
|
"num_tokens": 505672795.0,
|
|
"step": 1587
|
|
},
|
|
{
|
|
"epoch": 1.6154628687690744,
|
|
"grad_norm": 0.7910969853401184,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5695,
|
|
"mean_token_accuracy": 0.829032301902771,
|
|
"num_tokens": 506006904.0,
|
|
"step": 1588
|
|
},
|
|
{
|
|
"epoch": 1.6164801627670395,
|
|
"grad_norm": 0.7462960481643677,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5434,
|
|
"mean_token_accuracy": 0.8354339599609375,
|
|
"num_tokens": 506328403.0,
|
|
"step": 1589
|
|
},
|
|
{
|
|
"epoch": 1.6174974567650051,
|
|
"grad_norm": 0.8121559619903564,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5617,
|
|
"mean_token_accuracy": 0.8301266431808472,
|
|
"num_tokens": 506646322.0,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 1.6185147507629705,
|
|
"grad_norm": 0.8175883293151855,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.542,
|
|
"mean_token_accuracy": 0.8349389433860779,
|
|
"num_tokens": 506963590.0,
|
|
"step": 1591
|
|
},
|
|
{
|
|
"epoch": 1.6195320447609358,
|
|
"grad_norm": 0.7866964936256409,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5547,
|
|
"mean_token_accuracy": 0.8321659564971924,
|
|
"num_tokens": 507261633.0,
|
|
"step": 1592
|
|
},
|
|
{
|
|
"epoch": 1.6205493387589014,
|
|
"grad_norm": 0.772517204284668,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5664,
|
|
"mean_token_accuracy": 0.8289494514465332,
|
|
"num_tokens": 507586130.0,
|
|
"step": 1593
|
|
},
|
|
{
|
|
"epoch": 1.6215666327568667,
|
|
"grad_norm": 0.7570743560791016,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5737,
|
|
"mean_token_accuracy": 0.8264689445495605,
|
|
"num_tokens": 507895824.0,
|
|
"step": 1594
|
|
},
|
|
{
|
|
"epoch": 1.622583926754832,
|
|
"grad_norm": 0.7695471048355103,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5895,
|
|
"mean_token_accuracy": 0.8220570087432861,
|
|
"num_tokens": 508213579.0,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"epoch": 1.6236012207527977,
|
|
"grad_norm": 0.7480154633522034,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5662,
|
|
"mean_token_accuracy": 0.8287339210510254,
|
|
"num_tokens": 508527614.0,
|
|
"step": 1596
|
|
},
|
|
{
|
|
"epoch": 1.624618514750763,
|
|
"grad_norm": 0.7738633751869202,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5739,
|
|
"mean_token_accuracy": 0.8266138434410095,
|
|
"num_tokens": 508844166.0,
|
|
"step": 1597
|
|
},
|
|
{
|
|
"epoch": 1.6256358087487284,
|
|
"grad_norm": 0.7983934283256531,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5595,
|
|
"mean_token_accuracy": 0.8295655250549316,
|
|
"num_tokens": 509153388.0,
|
|
"step": 1598
|
|
},
|
|
{
|
|
"epoch": 1.626653102746694,
|
|
"grad_norm": 0.7699052095413208,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5726,
|
|
"mean_token_accuracy": 0.8266280889511108,
|
|
"num_tokens": 509478651.0,
|
|
"step": 1599
|
|
},
|
|
{
|
|
"epoch": 1.627670396744659,
|
|
"grad_norm": 0.8170141577720642,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.573,
|
|
"mean_token_accuracy": 0.8266683220863342,
|
|
"num_tokens": 509789533.0,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 1.6286876907426246,
|
|
"grad_norm": 0.7319942712783813,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5648,
|
|
"mean_token_accuracy": 0.8287757635116577,
|
|
"num_tokens": 510127406.0,
|
|
"step": 1601
|
|
},
|
|
{
|
|
"epoch": 1.62970498474059,
|
|
"grad_norm": 0.7690290808677673,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5465,
|
|
"mean_token_accuracy": 0.833433985710144,
|
|
"num_tokens": 510431387.0,
|
|
"step": 1602
|
|
},
|
|
{
|
|
"epoch": 1.6307222787385554,
|
|
"grad_norm": 0.8090844750404358,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5853,
|
|
"mean_token_accuracy": 0.8240605592727661,
|
|
"num_tokens": 510762803.0,
|
|
"step": 1603
|
|
},
|
|
{
|
|
"epoch": 1.631739572736521,
|
|
"grad_norm": 0.7850157022476196,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5857,
|
|
"mean_token_accuracy": 0.822102427482605,
|
|
"num_tokens": 511087806.0,
|
|
"step": 1604
|
|
},
|
|
{
|
|
"epoch": 1.6327568667344863,
|
|
"grad_norm": 0.8082931041717529,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5882,
|
|
"mean_token_accuracy": 0.8232232332229614,
|
|
"num_tokens": 511413057.0,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"epoch": 1.6337741607324516,
|
|
"grad_norm": 0.7694851756095886,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5359,
|
|
"mean_token_accuracy": 0.8365007042884827,
|
|
"num_tokens": 511722415.0,
|
|
"step": 1606
|
|
},
|
|
{
|
|
"epoch": 1.6347914547304172,
|
|
"grad_norm": 0.7512543201446533,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5473,
|
|
"mean_token_accuracy": 0.8339874744415283,
|
|
"num_tokens": 512040496.0,
|
|
"step": 1607
|
|
},
|
|
{
|
|
"epoch": 1.6358087487283826,
|
|
"grad_norm": 0.8409536480903625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.581,
|
|
"mean_token_accuracy": 0.823777437210083,
|
|
"num_tokens": 512348538.0,
|
|
"step": 1608
|
|
},
|
|
{
|
|
"epoch": 1.636826042726348,
|
|
"grad_norm": 0.8341800570487976,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5731,
|
|
"mean_token_accuracy": 0.825890302658081,
|
|
"num_tokens": 512672571.0,
|
|
"step": 1609
|
|
},
|
|
{
|
|
"epoch": 1.6378433367243135,
|
|
"grad_norm": 0.7653716206550598,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5727,
|
|
"mean_token_accuracy": 0.827292799949646,
|
|
"num_tokens": 512983469.0,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 1.6388606307222786,
|
|
"grad_norm": 0.7501400113105774,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5589,
|
|
"mean_token_accuracy": 0.8306208848953247,
|
|
"num_tokens": 513304550.0,
|
|
"step": 1611
|
|
},
|
|
{
|
|
"epoch": 1.6398779247202442,
|
|
"grad_norm": 0.7226465344429016,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.58,
|
|
"mean_token_accuracy": 0.8239978551864624,
|
|
"num_tokens": 513639647.0,
|
|
"step": 1612
|
|
},
|
|
{
|
|
"epoch": 1.6408952187182095,
|
|
"grad_norm": 0.8217362761497498,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5711,
|
|
"mean_token_accuracy": 0.8272889256477356,
|
|
"num_tokens": 513955189.0,
|
|
"step": 1613
|
|
},
|
|
{
|
|
"epoch": 1.6419125127161749,
|
|
"grad_norm": 0.8021873235702515,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5678,
|
|
"mean_token_accuracy": 0.8280093669891357,
|
|
"num_tokens": 514262184.0,
|
|
"step": 1614
|
|
},
|
|
{
|
|
"epoch": 1.6429298067141405,
|
|
"grad_norm": 0.7563522458076477,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5583,
|
|
"mean_token_accuracy": 0.83150315284729,
|
|
"num_tokens": 514577426.0,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"epoch": 1.6439471007121058,
|
|
"grad_norm": 0.7791472673416138,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5676,
|
|
"mean_token_accuracy": 0.8287179470062256,
|
|
"num_tokens": 514882566.0,
|
|
"step": 1616
|
|
},
|
|
{
|
|
"epoch": 1.6449643947100712,
|
|
"grad_norm": 0.742529571056366,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5687,
|
|
"mean_token_accuracy": 0.8295882940292358,
|
|
"num_tokens": 515212612.0,
|
|
"step": 1617
|
|
},
|
|
{
|
|
"epoch": 1.6459816887080367,
|
|
"grad_norm": 0.8474096655845642,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5765,
|
|
"mean_token_accuracy": 0.8254484534263611,
|
|
"num_tokens": 515525982.0,
|
|
"step": 1618
|
|
},
|
|
{
|
|
"epoch": 1.6469989827060019,
|
|
"grad_norm": 0.7607150673866272,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5563,
|
|
"mean_token_accuracy": 0.8320398330688477,
|
|
"num_tokens": 515853344.0,
|
|
"step": 1619
|
|
},
|
|
{
|
|
"epoch": 1.6480162767039674,
|
|
"grad_norm": 0.7506897449493408,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5618,
|
|
"mean_token_accuracy": 0.828422486782074,
|
|
"num_tokens": 516159473.0,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 1.649033570701933,
|
|
"grad_norm": 0.7465788125991821,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5822,
|
|
"mean_token_accuracy": 0.8236411213874817,
|
|
"num_tokens": 516492142.0,
|
|
"step": 1621
|
|
},
|
|
{
|
|
"epoch": 1.6500508646998981,
|
|
"grad_norm": 0.8063442707061768,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5684,
|
|
"mean_token_accuracy": 0.8278873562812805,
|
|
"num_tokens": 516798136.0,
|
|
"step": 1622
|
|
},
|
|
{
|
|
"epoch": 1.6510681586978637,
|
|
"grad_norm": 0.7778916954994202,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5767,
|
|
"mean_token_accuracy": 0.8266802430152893,
|
|
"num_tokens": 517108993.0,
|
|
"step": 1623
|
|
},
|
|
{
|
|
"epoch": 1.652085452695829,
|
|
"grad_norm": 1.6380517482757568,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5589,
|
|
"mean_token_accuracy": 0.8306084871292114,
|
|
"num_tokens": 517439575.0,
|
|
"step": 1624
|
|
},
|
|
{
|
|
"epoch": 1.6531027466937944,
|
|
"grad_norm": 0.770706295967102,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5776,
|
|
"mean_token_accuracy": 0.8241666555404663,
|
|
"num_tokens": 517744319.0,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"epoch": 1.65412004069176,
|
|
"grad_norm": 0.7348789572715759,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5823,
|
|
"mean_token_accuracy": 0.8243869543075562,
|
|
"num_tokens": 518071406.0,
|
|
"step": 1626
|
|
},
|
|
{
|
|
"epoch": 1.6551373346897253,
|
|
"grad_norm": 0.7872225046157837,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5839,
|
|
"mean_token_accuracy": 0.8239084482192993,
|
|
"num_tokens": 518389903.0,
|
|
"step": 1627
|
|
},
|
|
{
|
|
"epoch": 1.6561546286876907,
|
|
"grad_norm": 0.7517799735069275,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5686,
|
|
"mean_token_accuracy": 0.827627420425415,
|
|
"num_tokens": 518689792.0,
|
|
"step": 1628
|
|
},
|
|
{
|
|
"epoch": 1.6571719226856563,
|
|
"grad_norm": 0.7697553634643555,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5583,
|
|
"mean_token_accuracy": 0.8302412033081055,
|
|
"num_tokens": 518982145.0,
|
|
"step": 1629
|
|
},
|
|
{
|
|
"epoch": 1.6581892166836214,
|
|
"grad_norm": 0.7755339741706848,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5926,
|
|
"mean_token_accuracy": 0.8219212293624878,
|
|
"num_tokens": 519307621.0,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 1.659206510681587,
|
|
"grad_norm": 0.8022763133049011,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5743,
|
|
"mean_token_accuracy": 0.8267483711242676,
|
|
"num_tokens": 519626545.0,
|
|
"step": 1631
|
|
},
|
|
{
|
|
"epoch": 1.6602238046795526,
|
|
"grad_norm": 0.8232947587966919,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5723,
|
|
"mean_token_accuracy": 0.8260178565979004,
|
|
"num_tokens": 519945118.0,
|
|
"step": 1632
|
|
},
|
|
{
|
|
"epoch": 1.6612410986775177,
|
|
"grad_norm": 0.8284922242164612,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6118,
|
|
"mean_token_accuracy": 0.8165832757949829,
|
|
"num_tokens": 520263275.0,
|
|
"step": 1633
|
|
},
|
|
{
|
|
"epoch": 1.6622583926754833,
|
|
"grad_norm": 0.7653334140777588,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5865,
|
|
"mean_token_accuracy": 0.8228817582130432,
|
|
"num_tokens": 520587306.0,
|
|
"step": 1634
|
|
},
|
|
{
|
|
"epoch": 1.6632756866734486,
|
|
"grad_norm": 0.8112199306488037,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5627,
|
|
"mean_token_accuracy": 0.8297948837280273,
|
|
"num_tokens": 520922639.0,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"epoch": 1.664292980671414,
|
|
"grad_norm": 0.8225566744804382,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5926,
|
|
"mean_token_accuracy": 0.8226288557052612,
|
|
"num_tokens": 521246669.0,
|
|
"step": 1636
|
|
},
|
|
{
|
|
"epoch": 1.6653102746693795,
|
|
"grad_norm": 0.7728680372238159,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5517,
|
|
"mean_token_accuracy": 0.8317882418632507,
|
|
"num_tokens": 521552038.0,
|
|
"step": 1637
|
|
},
|
|
{
|
|
"epoch": 1.6663275686673449,
|
|
"grad_norm": 0.7860717177391052,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5762,
|
|
"mean_token_accuracy": 0.8248538970947266,
|
|
"num_tokens": 521871204.0,
|
|
"step": 1638
|
|
},
|
|
{
|
|
"epoch": 1.6673448626653102,
|
|
"grad_norm": 0.7809126973152161,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5793,
|
|
"mean_token_accuracy": 0.8258426189422607,
|
|
"num_tokens": 522180569.0,
|
|
"step": 1639
|
|
},
|
|
{
|
|
"epoch": 1.6683621566632758,
|
|
"grad_norm": 0.8085176348686218,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5711,
|
|
"mean_token_accuracy": 0.8277914524078369,
|
|
"num_tokens": 522491069.0,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 1.669379450661241,
|
|
"grad_norm": 0.777393102645874,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5678,
|
|
"mean_token_accuracy": 0.8285313844680786,
|
|
"num_tokens": 522813823.0,
|
|
"step": 1641
|
|
},
|
|
{
|
|
"epoch": 1.6703967446592065,
|
|
"grad_norm": 0.7664257884025574,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6021,
|
|
"mean_token_accuracy": 0.8185603618621826,
|
|
"num_tokens": 523142670.0,
|
|
"step": 1642
|
|
},
|
|
{
|
|
"epoch": 1.671414038657172,
|
|
"grad_norm": 0.7619096040725708,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5888,
|
|
"mean_token_accuracy": 0.8226163983345032,
|
|
"num_tokens": 523461987.0,
|
|
"step": 1643
|
|
},
|
|
{
|
|
"epoch": 1.6724313326551372,
|
|
"grad_norm": 0.7851698398590088,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5638,
|
|
"mean_token_accuracy": 0.8290607929229736,
|
|
"num_tokens": 523794689.0,
|
|
"step": 1644
|
|
},
|
|
{
|
|
"epoch": 1.6734486266531028,
|
|
"grad_norm": 0.77797532081604,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.575,
|
|
"mean_token_accuracy": 0.8254382014274597,
|
|
"num_tokens": 524128171.0,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"epoch": 1.6744659206510681,
|
|
"grad_norm": 0.765530526638031,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5906,
|
|
"mean_token_accuracy": 0.8212093114852905,
|
|
"num_tokens": 524455562.0,
|
|
"step": 1646
|
|
},
|
|
{
|
|
"epoch": 1.6754832146490335,
|
|
"grad_norm": 0.7552871108055115,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5965,
|
|
"mean_token_accuracy": 0.8209508657455444,
|
|
"num_tokens": 524788549.0,
|
|
"step": 1647
|
|
},
|
|
{
|
|
"epoch": 1.676500508646999,
|
|
"grad_norm": 0.7702638506889343,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5877,
|
|
"mean_token_accuracy": 0.822458803653717,
|
|
"num_tokens": 525108018.0,
|
|
"step": 1648
|
|
},
|
|
{
|
|
"epoch": 1.6775178026449644,
|
|
"grad_norm": 0.7635334730148315,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5549,
|
|
"mean_token_accuracy": 0.8315030336380005,
|
|
"num_tokens": 525429783.0,
|
|
"step": 1649
|
|
},
|
|
{
|
|
"epoch": 1.6785350966429298,
|
|
"grad_norm": 0.7596954107284546,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5957,
|
|
"mean_token_accuracy": 0.821110725402832,
|
|
"num_tokens": 525756468.0,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 1.6795523906408953,
|
|
"grad_norm": 0.7331662774085999,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.554,
|
|
"mean_token_accuracy": 0.8301153182983398,
|
|
"num_tokens": 526072332.0,
|
|
"step": 1651
|
|
},
|
|
{
|
|
"epoch": 1.6805696846388605,
|
|
"grad_norm": 0.7317625880241394,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5743,
|
|
"mean_token_accuracy": 0.8263423442840576,
|
|
"num_tokens": 526390503.0,
|
|
"step": 1652
|
|
},
|
|
{
|
|
"epoch": 1.681586978636826,
|
|
"grad_norm": 0.8702585697174072,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5949,
|
|
"mean_token_accuracy": 0.821205735206604,
|
|
"num_tokens": 526702092.0,
|
|
"step": 1653
|
|
},
|
|
{
|
|
"epoch": 1.6826042726347916,
|
|
"grad_norm": 0.7284979820251465,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5651,
|
|
"mean_token_accuracy": 0.8286755681037903,
|
|
"num_tokens": 527035121.0,
|
|
"step": 1654
|
|
},
|
|
{
|
|
"epoch": 1.6836215666327567,
|
|
"grad_norm": 0.7543615698814392,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5723,
|
|
"mean_token_accuracy": 0.8284416198730469,
|
|
"num_tokens": 527350842.0,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"epoch": 1.6846388606307223,
|
|
"grad_norm": 0.7683477997779846,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5824,
|
|
"mean_token_accuracy": 0.8239773511886597,
|
|
"num_tokens": 527685407.0,
|
|
"step": 1656
|
|
},
|
|
{
|
|
"epoch": 1.6856561546286877,
|
|
"grad_norm": 0.732888400554657,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5556,
|
|
"mean_token_accuracy": 0.8312336206436157,
|
|
"num_tokens": 528011456.0,
|
|
"step": 1657
|
|
},
|
|
{
|
|
"epoch": 1.686673448626653,
|
|
"grad_norm": 0.7869269847869873,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5812,
|
|
"mean_token_accuracy": 0.8261852860450745,
|
|
"num_tokens": 528319045.0,
|
|
"step": 1658
|
|
},
|
|
{
|
|
"epoch": 1.6876907426246186,
|
|
"grad_norm": 0.7795509696006775,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5798,
|
|
"mean_token_accuracy": 0.8243482708930969,
|
|
"num_tokens": 528635401.0,
|
|
"step": 1659
|
|
},
|
|
{
|
|
"epoch": 1.688708036622584,
|
|
"grad_norm": 0.76092928647995,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5517,
|
|
"mean_token_accuracy": 0.8321963548660278,
|
|
"num_tokens": 528936587.0,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 1.6897253306205493,
|
|
"grad_norm": 0.7945722341537476,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5869,
|
|
"mean_token_accuracy": 0.8236092329025269,
|
|
"num_tokens": 529263800.0,
|
|
"step": 1661
|
|
},
|
|
{
|
|
"epoch": 1.6907426246185149,
|
|
"grad_norm": 0.7325130701065063,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5375,
|
|
"mean_token_accuracy": 0.8348582983016968,
|
|
"num_tokens": 529579783.0,
|
|
"step": 1662
|
|
},
|
|
{
|
|
"epoch": 1.69175991861648,
|
|
"grad_norm": 0.7459843158721924,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5683,
|
|
"mean_token_accuracy": 0.82767653465271,
|
|
"num_tokens": 529922657.0,
|
|
"step": 1663
|
|
},
|
|
{
|
|
"epoch": 1.6927772126144456,
|
|
"grad_norm": 0.7394583821296692,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5387,
|
|
"mean_token_accuracy": 0.8359782695770264,
|
|
"num_tokens": 530237269.0,
|
|
"step": 1664
|
|
},
|
|
{
|
|
"epoch": 1.693794506612411,
|
|
"grad_norm": 0.7484853267669678,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5883,
|
|
"mean_token_accuracy": 0.8226250410079956,
|
|
"num_tokens": 530563601.0,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"epoch": 1.6948118006103763,
|
|
"grad_norm": 0.7826150059700012,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5874,
|
|
"mean_token_accuracy": 0.8226125240325928,
|
|
"num_tokens": 530875466.0,
|
|
"step": 1666
|
|
},
|
|
{
|
|
"epoch": 1.6958290946083419,
|
|
"grad_norm": 0.7448320984840393,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5661,
|
|
"mean_token_accuracy": 0.8283286094665527,
|
|
"num_tokens": 531199906.0,
|
|
"step": 1667
|
|
},
|
|
{
|
|
"epoch": 1.6968463886063072,
|
|
"grad_norm": 0.7585152387619019,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5721,
|
|
"mean_token_accuracy": 0.8263843655586243,
|
|
"num_tokens": 531509317.0,
|
|
"step": 1668
|
|
},
|
|
{
|
|
"epoch": 1.6978636826042726,
|
|
"grad_norm": 0.758061408996582,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5666,
|
|
"mean_token_accuracy": 0.8299506306648254,
|
|
"num_tokens": 531821184.0,
|
|
"step": 1669
|
|
},
|
|
{
|
|
"epoch": 1.6988809766022381,
|
|
"grad_norm": 0.7765728235244751,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5588,
|
|
"mean_token_accuracy": 0.8297423720359802,
|
|
"num_tokens": 532137310.0,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 1.6998982706002035,
|
|
"grad_norm": 0.7671675682067871,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.574,
|
|
"mean_token_accuracy": 0.8257801532745361,
|
|
"num_tokens": 532440662.0,
|
|
"step": 1671
|
|
},
|
|
{
|
|
"epoch": 1.7009155645981688,
|
|
"grad_norm": 0.7835381627082825,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.564,
|
|
"mean_token_accuracy": 0.8288462162017822,
|
|
"num_tokens": 532750623.0,
|
|
"step": 1672
|
|
},
|
|
{
|
|
"epoch": 1.7019328585961344,
|
|
"grad_norm": 0.7619048357009888,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5559,
|
|
"mean_token_accuracy": 0.8313275575637817,
|
|
"num_tokens": 533079069.0,
|
|
"step": 1673
|
|
},
|
|
{
|
|
"epoch": 1.7029501525940995,
|
|
"grad_norm": 0.7300487756729126,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5345,
|
|
"mean_token_accuracy": 0.8356854915618896,
|
|
"num_tokens": 533404987.0,
|
|
"step": 1674
|
|
},
|
|
{
|
|
"epoch": 1.7039674465920651,
|
|
"grad_norm": 0.7767015695571899,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5717,
|
|
"mean_token_accuracy": 0.8279440402984619,
|
|
"num_tokens": 533715865.0,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"epoch": 1.7049847405900305,
|
|
"grad_norm": 0.858773946762085,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5801,
|
|
"mean_token_accuracy": 0.824346661567688,
|
|
"num_tokens": 534023360.0,
|
|
"step": 1676
|
|
},
|
|
{
|
|
"epoch": 1.7060020345879958,
|
|
"grad_norm": 0.7402467131614685,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5553,
|
|
"mean_token_accuracy": 0.8312146663665771,
|
|
"num_tokens": 534338342.0,
|
|
"step": 1677
|
|
},
|
|
{
|
|
"epoch": 1.7070193285859614,
|
|
"grad_norm": 0.7947079539299011,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5725,
|
|
"mean_token_accuracy": 0.8272440433502197,
|
|
"num_tokens": 534643540.0,
|
|
"step": 1678
|
|
},
|
|
{
|
|
"epoch": 1.7080366225839267,
|
|
"grad_norm": 0.7758183479309082,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5651,
|
|
"mean_token_accuracy": 0.8284360766410828,
|
|
"num_tokens": 534946089.0,
|
|
"step": 1679
|
|
},
|
|
{
|
|
"epoch": 1.709053916581892,
|
|
"grad_norm": 0.7562273144721985,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5861,
|
|
"mean_token_accuracy": 0.8222533464431763,
|
|
"num_tokens": 535248651.0,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 1.7100712105798577,
|
|
"grad_norm": 0.8158243894577026,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5913,
|
|
"mean_token_accuracy": 0.8221856951713562,
|
|
"num_tokens": 535572330.0,
|
|
"step": 1681
|
|
},
|
|
{
|
|
"epoch": 1.711088504577823,
|
|
"grad_norm": 0.7543163895606995,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.573,
|
|
"mean_token_accuracy": 0.827721357345581,
|
|
"num_tokens": 535900015.0,
|
|
"step": 1682
|
|
},
|
|
{
|
|
"epoch": 1.7121057985757884,
|
|
"grad_norm": 0.7882978916168213,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5829,
|
|
"mean_token_accuracy": 0.8247331976890564,
|
|
"num_tokens": 536213601.0,
|
|
"step": 1683
|
|
},
|
|
{
|
|
"epoch": 1.713123092573754,
|
|
"grad_norm": 0.756375253200531,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5871,
|
|
"mean_token_accuracy": 0.822330892086029,
|
|
"num_tokens": 536546179.0,
|
|
"step": 1684
|
|
},
|
|
{
|
|
"epoch": 1.714140386571719,
|
|
"grad_norm": 0.8251622915267944,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5919,
|
|
"mean_token_accuracy": 0.8222557902336121,
|
|
"num_tokens": 536873341.0,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"epoch": 1.7151576805696847,
|
|
"grad_norm": 0.7793753743171692,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5752,
|
|
"mean_token_accuracy": 0.8255091309547424,
|
|
"num_tokens": 537182989.0,
|
|
"step": 1686
|
|
},
|
|
{
|
|
"epoch": 1.71617497456765,
|
|
"grad_norm": 0.7966047525405884,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.574,
|
|
"mean_token_accuracy": 0.825797975063324,
|
|
"num_tokens": 537519346.0,
|
|
"step": 1687
|
|
},
|
|
{
|
|
"epoch": 1.7171922685656154,
|
|
"grad_norm": 0.7807252407073975,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.554,
|
|
"mean_token_accuracy": 0.8319574594497681,
|
|
"num_tokens": 537849156.0,
|
|
"step": 1688
|
|
},
|
|
{
|
|
"epoch": 1.718209562563581,
|
|
"grad_norm": 0.821812093257904,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5929,
|
|
"mean_token_accuracy": 0.8211857676506042,
|
|
"num_tokens": 538167004.0,
|
|
"step": 1689
|
|
},
|
|
{
|
|
"epoch": 1.7192268565615463,
|
|
"grad_norm": 0.83882737159729,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5569,
|
|
"mean_token_accuracy": 0.8303667902946472,
|
|
"num_tokens": 538455354.0,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 1.7202441505595116,
|
|
"grad_norm": 0.7454236149787903,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5624,
|
|
"mean_token_accuracy": 0.8291758298873901,
|
|
"num_tokens": 538776413.0,
|
|
"step": 1691
|
|
},
|
|
{
|
|
"epoch": 1.7212614445574772,
|
|
"grad_norm": 0.7784879207611084,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5707,
|
|
"mean_token_accuracy": 0.8272480368614197,
|
|
"num_tokens": 539089207.0,
|
|
"step": 1692
|
|
},
|
|
{
|
|
"epoch": 1.7222787385554426,
|
|
"grad_norm": 0.8132408261299133,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.563,
|
|
"mean_token_accuracy": 0.8302366733551025,
|
|
"num_tokens": 539409402.0,
|
|
"step": 1693
|
|
},
|
|
{
|
|
"epoch": 1.723296032553408,
|
|
"grad_norm": 0.7897824048995972,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5866,
|
|
"mean_token_accuracy": 0.8230624198913574,
|
|
"num_tokens": 539724109.0,
|
|
"step": 1694
|
|
},
|
|
{
|
|
"epoch": 1.7243133265513735,
|
|
"grad_norm": 0.7500278949737549,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5681,
|
|
"mean_token_accuracy": 0.828041672706604,
|
|
"num_tokens": 540055954.0,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"epoch": 1.7253306205493386,
|
|
"grad_norm": 0.7602378726005554,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5968,
|
|
"mean_token_accuracy": 0.8209776878356934,
|
|
"num_tokens": 540380282.0,
|
|
"step": 1696
|
|
},
|
|
{
|
|
"epoch": 1.7263479145473042,
|
|
"grad_norm": 0.7624279260635376,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5878,
|
|
"mean_token_accuracy": 0.8228939175605774,
|
|
"num_tokens": 540703596.0,
|
|
"step": 1697
|
|
},
|
|
{
|
|
"epoch": 1.7273652085452695,
|
|
"grad_norm": 0.7722674012184143,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5853,
|
|
"mean_token_accuracy": 0.8232327103614807,
|
|
"num_tokens": 541024110.0,
|
|
"step": 1698
|
|
},
|
|
{
|
|
"epoch": 1.7283825025432349,
|
|
"grad_norm": 0.8067665696144104,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5693,
|
|
"mean_token_accuracy": 0.828144907951355,
|
|
"num_tokens": 541343999.0,
|
|
"step": 1699
|
|
},
|
|
{
|
|
"epoch": 1.7293997965412005,
|
|
"grad_norm": 0.8109105229377747,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5775,
|
|
"mean_token_accuracy": 0.8250290751457214,
|
|
"num_tokens": 541666906.0,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 1.7304170905391658,
|
|
"grad_norm": 0.753727376461029,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5812,
|
|
"mean_token_accuracy": 0.8250855803489685,
|
|
"num_tokens": 541998847.0,
|
|
"step": 1701
|
|
},
|
|
{
|
|
"epoch": 1.7314343845371312,
|
|
"grad_norm": 0.7683834433555603,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5695,
|
|
"mean_token_accuracy": 0.8281385898590088,
|
|
"num_tokens": 542330954.0,
|
|
"step": 1702
|
|
},
|
|
{
|
|
"epoch": 1.7324516785350967,
|
|
"grad_norm": 0.7327966094017029,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5627,
|
|
"mean_token_accuracy": 0.8294649124145508,
|
|
"num_tokens": 542649706.0,
|
|
"step": 1703
|
|
},
|
|
{
|
|
"epoch": 1.733468972533062,
|
|
"grad_norm": 0.754997193813324,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6051,
|
|
"mean_token_accuracy": 0.8174710869789124,
|
|
"num_tokens": 542971174.0,
|
|
"step": 1704
|
|
},
|
|
{
|
|
"epoch": 1.7344862665310274,
|
|
"grad_norm": 0.7789589762687683,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5708,
|
|
"mean_token_accuracy": 0.826871395111084,
|
|
"num_tokens": 543286863.0,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"epoch": 1.735503560528993,
|
|
"grad_norm": 0.7593998312950134,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6006,
|
|
"mean_token_accuracy": 0.8190546035766602,
|
|
"num_tokens": 543603599.0,
|
|
"step": 1706
|
|
},
|
|
{
|
|
"epoch": 1.7365208545269581,
|
|
"grad_norm": 0.7631123065948486,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5806,
|
|
"mean_token_accuracy": 0.8233776092529297,
|
|
"num_tokens": 543929074.0,
|
|
"step": 1707
|
|
},
|
|
{
|
|
"epoch": 1.7375381485249237,
|
|
"grad_norm": 0.7596994042396545,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5957,
|
|
"mean_token_accuracy": 0.8212376832962036,
|
|
"num_tokens": 544254565.0,
|
|
"step": 1708
|
|
},
|
|
{
|
|
"epoch": 1.738555442522889,
|
|
"grad_norm": 0.7288892865180969,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.557,
|
|
"mean_token_accuracy": 0.8310983777046204,
|
|
"num_tokens": 544594001.0,
|
|
"step": 1709
|
|
},
|
|
{
|
|
"epoch": 1.7395727365208544,
|
|
"grad_norm": 0.7478421330451965,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.589,
|
|
"mean_token_accuracy": 0.8226866126060486,
|
|
"num_tokens": 544905906.0,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 1.74059003051882,
|
|
"grad_norm": 0.7765371203422546,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5549,
|
|
"mean_token_accuracy": 0.8321041464805603,
|
|
"num_tokens": 545218275.0,
|
|
"step": 1711
|
|
},
|
|
{
|
|
"epoch": 1.7416073245167853,
|
|
"grad_norm": 0.7736310958862305,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5802,
|
|
"mean_token_accuracy": 0.8248491883277893,
|
|
"num_tokens": 545540433.0,
|
|
"step": 1712
|
|
},
|
|
{
|
|
"epoch": 1.7426246185147507,
|
|
"grad_norm": 0.7796671390533447,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5671,
|
|
"mean_token_accuracy": 0.8282781839370728,
|
|
"num_tokens": 545853945.0,
|
|
"step": 1713
|
|
},
|
|
{
|
|
"epoch": 1.7436419125127163,
|
|
"grad_norm": 0.7976074814796448,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5508,
|
|
"mean_token_accuracy": 0.8329964280128479,
|
|
"num_tokens": 546163829.0,
|
|
"step": 1714
|
|
},
|
|
{
|
|
"epoch": 1.7446592065106816,
|
|
"grad_norm": 0.7982625365257263,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.57,
|
|
"mean_token_accuracy": 0.8288213014602661,
|
|
"num_tokens": 546476551.0,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"epoch": 1.745676500508647,
|
|
"grad_norm": 0.733306348323822,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5584,
|
|
"mean_token_accuracy": 0.8309009075164795,
|
|
"num_tokens": 546808037.0,
|
|
"step": 1716
|
|
},
|
|
{
|
|
"epoch": 1.7466937945066126,
|
|
"grad_norm": 0.7403319478034973,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5506,
|
|
"mean_token_accuracy": 0.8330470323562622,
|
|
"num_tokens": 547138121.0,
|
|
"step": 1717
|
|
},
|
|
{
|
|
"epoch": 1.7477110885045777,
|
|
"grad_norm": 0.7419624924659729,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5672,
|
|
"mean_token_accuracy": 0.8281609416007996,
|
|
"num_tokens": 547455414.0,
|
|
"step": 1718
|
|
},
|
|
{
|
|
"epoch": 1.7487283825025433,
|
|
"grad_norm": 0.7999053001403809,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5567,
|
|
"mean_token_accuracy": 0.8309951424598694,
|
|
"num_tokens": 547758420.0,
|
|
"step": 1719
|
|
},
|
|
{
|
|
"epoch": 1.7497456765005086,
|
|
"grad_norm": 0.7836387157440186,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5605,
|
|
"mean_token_accuracy": 0.8303165435791016,
|
|
"num_tokens": 548059321.0,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 1.750762970498474,
|
|
"grad_norm": 0.7795581221580505,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5964,
|
|
"mean_token_accuracy": 0.8205006122589111,
|
|
"num_tokens": 548363768.0,
|
|
"step": 1721
|
|
},
|
|
{
|
|
"epoch": 1.7517802644964395,
|
|
"grad_norm": 0.7558625340461731,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5736,
|
|
"mean_token_accuracy": 0.8264075517654419,
|
|
"num_tokens": 548670132.0,
|
|
"step": 1722
|
|
},
|
|
{
|
|
"epoch": 1.7527975584944049,
|
|
"grad_norm": 0.7679370641708374,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5653,
|
|
"mean_token_accuracy": 0.8297629952430725,
|
|
"num_tokens": 548986310.0,
|
|
"step": 1723
|
|
},
|
|
{
|
|
"epoch": 1.7538148524923702,
|
|
"grad_norm": 0.7437918186187744,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5679,
|
|
"mean_token_accuracy": 0.8282225131988525,
|
|
"num_tokens": 549294340.0,
|
|
"step": 1724
|
|
},
|
|
{
|
|
"epoch": 1.7548321464903358,
|
|
"grad_norm": 0.7457557320594788,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5409,
|
|
"mean_token_accuracy": 0.8366795778274536,
|
|
"num_tokens": 549610886.0,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"epoch": 1.7558494404883012,
|
|
"grad_norm": 0.7415375113487244,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5706,
|
|
"mean_token_accuracy": 0.8265110850334167,
|
|
"num_tokens": 549926561.0,
|
|
"step": 1726
|
|
},
|
|
{
|
|
"epoch": 1.7568667344862665,
|
|
"grad_norm": 0.7230045199394226,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5628,
|
|
"mean_token_accuracy": 0.8289594054222107,
|
|
"num_tokens": 550244848.0,
|
|
"step": 1727
|
|
},
|
|
{
|
|
"epoch": 1.757884028484232,
|
|
"grad_norm": 0.7837753891944885,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5463,
|
|
"mean_token_accuracy": 0.8344599604606628,
|
|
"num_tokens": 550565225.0,
|
|
"step": 1728
|
|
},
|
|
{
|
|
"epoch": 1.7589013224821972,
|
|
"grad_norm": 0.7353631854057312,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5645,
|
|
"mean_token_accuracy": 0.828995943069458,
|
|
"num_tokens": 550869158.0,
|
|
"step": 1729
|
|
},
|
|
{
|
|
"epoch": 1.7599186164801628,
|
|
"grad_norm": 0.7451027631759644,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5774,
|
|
"mean_token_accuracy": 0.8255603909492493,
|
|
"num_tokens": 551189311.0,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 1.7609359104781281,
|
|
"grad_norm": 0.7759789228439331,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.582,
|
|
"mean_token_accuracy": 0.8234883546829224,
|
|
"num_tokens": 551500042.0,
|
|
"step": 1731
|
|
},
|
|
{
|
|
"epoch": 1.7619532044760935,
|
|
"grad_norm": 0.7609255313873291,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5641,
|
|
"mean_token_accuracy": 0.8300365209579468,
|
|
"num_tokens": 551811847.0,
|
|
"step": 1732
|
|
},
|
|
{
|
|
"epoch": 1.762970498474059,
|
|
"grad_norm": 0.733325719833374,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5579,
|
|
"mean_token_accuracy": 0.8318478465080261,
|
|
"num_tokens": 552131236.0,
|
|
"step": 1733
|
|
},
|
|
{
|
|
"epoch": 1.7639877924720244,
|
|
"grad_norm": 0.8272982239723206,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5665,
|
|
"mean_token_accuracy": 0.8292044401168823,
|
|
"num_tokens": 552456755.0,
|
|
"step": 1734
|
|
},
|
|
{
|
|
"epoch": 1.7650050864699898,
|
|
"grad_norm": 0.7635635137557983,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5425,
|
|
"mean_token_accuracy": 0.8342831134796143,
|
|
"num_tokens": 552777207.0,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"epoch": 1.7660223804679553,
|
|
"grad_norm": 0.7395186424255371,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5713,
|
|
"mean_token_accuracy": 0.8267441987991333,
|
|
"num_tokens": 553087376.0,
|
|
"step": 1736
|
|
},
|
|
{
|
|
"epoch": 1.7670396744659207,
|
|
"grad_norm": 0.7357089519500732,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5727,
|
|
"mean_token_accuracy": 0.8273993730545044,
|
|
"num_tokens": 553406997.0,
|
|
"step": 1737
|
|
},
|
|
{
|
|
"epoch": 1.768056968463886,
|
|
"grad_norm": 0.7930406332015991,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6024,
|
|
"mean_token_accuracy": 0.8186691403388977,
|
|
"num_tokens": 553726380.0,
|
|
"step": 1738
|
|
},
|
|
{
|
|
"epoch": 1.7690742624618516,
|
|
"grad_norm": 0.7484753727912903,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5834,
|
|
"mean_token_accuracy": 0.8241056799888611,
|
|
"num_tokens": 554060617.0,
|
|
"step": 1739
|
|
},
|
|
{
|
|
"epoch": 1.7700915564598168,
|
|
"grad_norm": 0.7562209367752075,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5706,
|
|
"mean_token_accuracy": 0.8281306028366089,
|
|
"num_tokens": 554385731.0,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 1.7711088504577823,
|
|
"grad_norm": 0.7572125792503357,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.565,
|
|
"mean_token_accuracy": 0.8297789096832275,
|
|
"num_tokens": 554714277.0,
|
|
"step": 1741
|
|
},
|
|
{
|
|
"epoch": 1.7721261444557477,
|
|
"grad_norm": 0.8044959902763367,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5804,
|
|
"mean_token_accuracy": 0.824925422668457,
|
|
"num_tokens": 555033920.0,
|
|
"step": 1742
|
|
},
|
|
{
|
|
"epoch": 1.773143438453713,
|
|
"grad_norm": 0.7124162912368774,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5581,
|
|
"mean_token_accuracy": 0.8302481770515442,
|
|
"num_tokens": 555371365.0,
|
|
"step": 1743
|
|
},
|
|
{
|
|
"epoch": 1.7741607324516786,
|
|
"grad_norm": 0.723034679889679,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5573,
|
|
"mean_token_accuracy": 0.830976128578186,
|
|
"num_tokens": 555695553.0,
|
|
"step": 1744
|
|
},
|
|
{
|
|
"epoch": 1.775178026449644,
|
|
"grad_norm": 0.7885095477104187,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5889,
|
|
"mean_token_accuracy": 0.8223779797554016,
|
|
"num_tokens": 556006070.0,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"epoch": 1.7761953204476093,
|
|
"grad_norm": 0.7051622271537781,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5434,
|
|
"mean_token_accuracy": 0.835534930229187,
|
|
"num_tokens": 556324330.0,
|
|
"step": 1746
|
|
},
|
|
{
|
|
"epoch": 1.7772126144455749,
|
|
"grad_norm": 0.777579128742218,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5565,
|
|
"mean_token_accuracy": 0.8295488357543945,
|
|
"num_tokens": 556639463.0,
|
|
"step": 1747
|
|
},
|
|
{
|
|
"epoch": 1.7782299084435402,
|
|
"grad_norm": 0.7351852059364319,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5782,
|
|
"mean_token_accuracy": 0.825910210609436,
|
|
"num_tokens": 556964733.0,
|
|
"step": 1748
|
|
},
|
|
{
|
|
"epoch": 1.7792472024415056,
|
|
"grad_norm": 0.7478289008140564,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5818,
|
|
"mean_token_accuracy": 0.8244177103042603,
|
|
"num_tokens": 557293193.0,
|
|
"step": 1749
|
|
},
|
|
{
|
|
"epoch": 1.7802644964394712,
|
|
"grad_norm": 0.7599475383758545,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5575,
|
|
"mean_token_accuracy": 0.8303159475326538,
|
|
"num_tokens": 557609634.0,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 1.7812817904374363,
|
|
"grad_norm": 0.7593867182731628,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5516,
|
|
"mean_token_accuracy": 0.8315073251724243,
|
|
"num_tokens": 557933267.0,
|
|
"step": 1751
|
|
},
|
|
{
|
|
"epoch": 1.7822990844354019,
|
|
"grad_norm": 1.2834893465042114,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6067,
|
|
"mean_token_accuracy": 0.8185144066810608,
|
|
"num_tokens": 558238439.0,
|
|
"step": 1752
|
|
},
|
|
{
|
|
"epoch": 1.7833163784333672,
|
|
"grad_norm": 0.7640964984893799,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5796,
|
|
"mean_token_accuracy": 0.8251737356185913,
|
|
"num_tokens": 558564521.0,
|
|
"step": 1753
|
|
},
|
|
{
|
|
"epoch": 1.7843336724313326,
|
|
"grad_norm": 0.781453013420105,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5994,
|
|
"mean_token_accuracy": 0.8211246132850647,
|
|
"num_tokens": 558870711.0,
|
|
"step": 1754
|
|
},
|
|
{
|
|
"epoch": 1.7853509664292981,
|
|
"grad_norm": 0.7851511836051941,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5679,
|
|
"mean_token_accuracy": 0.8274396657943726,
|
|
"num_tokens": 559184486.0,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"epoch": 1.7863682604272635,
|
|
"grad_norm": 0.7635732293128967,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5656,
|
|
"mean_token_accuracy": 0.8281428813934326,
|
|
"num_tokens": 559487831.0,
|
|
"step": 1756
|
|
},
|
|
{
|
|
"epoch": 1.7873855544252288,
|
|
"grad_norm": 0.7452448010444641,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5538,
|
|
"mean_token_accuracy": 0.8321632146835327,
|
|
"num_tokens": 559805689.0,
|
|
"step": 1757
|
|
},
|
|
{
|
|
"epoch": 1.7884028484231944,
|
|
"grad_norm": 0.7915704250335693,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5424,
|
|
"mean_token_accuracy": 0.8353804349899292,
|
|
"num_tokens": 560117177.0,
|
|
"step": 1758
|
|
},
|
|
{
|
|
"epoch": 1.7894201424211598,
|
|
"grad_norm": 0.7450749278068542,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.574,
|
|
"mean_token_accuracy": 0.8259989023208618,
|
|
"num_tokens": 560434600.0,
|
|
"step": 1759
|
|
},
|
|
{
|
|
"epoch": 1.7904374364191251,
|
|
"grad_norm": 0.7560845613479614,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5673,
|
|
"mean_token_accuracy": 0.8280336856842041,
|
|
"num_tokens": 560752929.0,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 1.7914547304170907,
|
|
"grad_norm": 0.7767955660820007,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5741,
|
|
"mean_token_accuracy": 0.8263304233551025,
|
|
"num_tokens": 561057758.0,
|
|
"step": 1761
|
|
},
|
|
{
|
|
"epoch": 1.7924720244150558,
|
|
"grad_norm": 0.7627075910568237,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5576,
|
|
"mean_token_accuracy": 0.8305634260177612,
|
|
"num_tokens": 561364341.0,
|
|
"step": 1762
|
|
},
|
|
{
|
|
"epoch": 1.7934893184130214,
|
|
"grad_norm": 0.734595537185669,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5632,
|
|
"mean_token_accuracy": 0.8293797969818115,
|
|
"num_tokens": 561682192.0,
|
|
"step": 1763
|
|
},
|
|
{
|
|
"epoch": 1.7945066124109867,
|
|
"grad_norm": 0.7812824845314026,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5973,
|
|
"mean_token_accuracy": 0.8202382922172546,
|
|
"num_tokens": 561984366.0,
|
|
"step": 1764
|
|
},
|
|
{
|
|
"epoch": 1.795523906408952,
|
|
"grad_norm": 0.7395841479301453,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5685,
|
|
"mean_token_accuracy": 0.8286502361297607,
|
|
"num_tokens": 562319171.0,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"epoch": 1.7965412004069177,
|
|
"grad_norm": 0.7442046999931335,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5919,
|
|
"mean_token_accuracy": 0.822655200958252,
|
|
"num_tokens": 562649718.0,
|
|
"step": 1766
|
|
},
|
|
{
|
|
"epoch": 1.797558494404883,
|
|
"grad_norm": 0.7415618896484375,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5635,
|
|
"mean_token_accuracy": 0.8303205966949463,
|
|
"num_tokens": 562962769.0,
|
|
"step": 1767
|
|
},
|
|
{
|
|
"epoch": 1.7985757884028484,
|
|
"grad_norm": 0.8440918922424316,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5739,
|
|
"mean_token_accuracy": 0.8259419202804565,
|
|
"num_tokens": 563281657.0,
|
|
"step": 1768
|
|
},
|
|
{
|
|
"epoch": 1.799593082400814,
|
|
"grad_norm": 0.7686070203781128,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5739,
|
|
"mean_token_accuracy": 0.8270139098167419,
|
|
"num_tokens": 563599442.0,
|
|
"step": 1769
|
|
},
|
|
{
|
|
"epoch": 1.8006103763987793,
|
|
"grad_norm": 0.7371310591697693,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5726,
|
|
"mean_token_accuracy": 0.8267232179641724,
|
|
"num_tokens": 563938038.0,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 1.8016276703967447,
|
|
"grad_norm": 0.7386311292648315,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5484,
|
|
"mean_token_accuracy": 0.8327062129974365,
|
|
"num_tokens": 564249675.0,
|
|
"step": 1771
|
|
},
|
|
{
|
|
"epoch": 1.8026449643947102,
|
|
"grad_norm": 0.7869567275047302,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5705,
|
|
"mean_token_accuracy": 0.8284811973571777,
|
|
"num_tokens": 564549442.0,
|
|
"step": 1772
|
|
},
|
|
{
|
|
"epoch": 1.8036622583926754,
|
|
"grad_norm": 0.8040668964385986,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.557,
|
|
"mean_token_accuracy": 0.8295624256134033,
|
|
"num_tokens": 564879892.0,
|
|
"step": 1773
|
|
},
|
|
{
|
|
"epoch": 1.804679552390641,
|
|
"grad_norm": 0.8081727027893066,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5659,
|
|
"mean_token_accuracy": 0.8280688524246216,
|
|
"num_tokens": 565210219.0,
|
|
"step": 1774
|
|
},
|
|
{
|
|
"epoch": 1.8056968463886063,
|
|
"grad_norm": 0.761573314666748,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5601,
|
|
"mean_token_accuracy": 0.8316044807434082,
|
|
"num_tokens": 565526228.0,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"epoch": 1.8067141403865716,
|
|
"grad_norm": 0.7849445343017578,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.591,
|
|
"mean_token_accuracy": 0.8230304718017578,
|
|
"num_tokens": 565852900.0,
|
|
"step": 1776
|
|
},
|
|
{
|
|
"epoch": 1.8077314343845372,
|
|
"grad_norm": 0.7502607703208923,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5597,
|
|
"mean_token_accuracy": 0.831209123134613,
|
|
"num_tokens": 566180405.0,
|
|
"step": 1777
|
|
},
|
|
{
|
|
"epoch": 1.8087487283825026,
|
|
"grad_norm": 0.797438383102417,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5824,
|
|
"mean_token_accuracy": 0.8235186338424683,
|
|
"num_tokens": 566504302.0,
|
|
"step": 1778
|
|
},
|
|
{
|
|
"epoch": 1.809766022380468,
|
|
"grad_norm": 0.7596687078475952,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5836,
|
|
"mean_token_accuracy": 0.8234046697616577,
|
|
"num_tokens": 566830753.0,
|
|
"step": 1779
|
|
},
|
|
{
|
|
"epoch": 1.8107833163784335,
|
|
"grad_norm": 0.8487532734870911,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5573,
|
|
"mean_token_accuracy": 0.8312756419181824,
|
|
"num_tokens": 567125149.0,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 1.8118006103763988,
|
|
"grad_norm": 0.72871994972229,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5792,
|
|
"mean_token_accuracy": 0.8263726234436035,
|
|
"num_tokens": 567439219.0,
|
|
"step": 1781
|
|
},
|
|
{
|
|
"epoch": 1.8128179043743642,
|
|
"grad_norm": 0.7636623978614807,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5693,
|
|
"mean_token_accuracy": 0.8279685974121094,
|
|
"num_tokens": 567757245.0,
|
|
"step": 1782
|
|
},
|
|
{
|
|
"epoch": 1.8138351983723298,
|
|
"grad_norm": 0.7923188805580139,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5551,
|
|
"mean_token_accuracy": 0.8303213119506836,
|
|
"num_tokens": 568063203.0,
|
|
"step": 1783
|
|
},
|
|
{
|
|
"epoch": 1.814852492370295,
|
|
"grad_norm": 0.784536600112915,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5724,
|
|
"mean_token_accuracy": 0.8268868923187256,
|
|
"num_tokens": 568375485.0,
|
|
"step": 1784
|
|
},
|
|
{
|
|
"epoch": 1.8158697863682605,
|
|
"grad_norm": 0.7456648945808411,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5706,
|
|
"mean_token_accuracy": 0.8270926475524902,
|
|
"num_tokens": 568704776.0,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"epoch": 1.8168870803662258,
|
|
"grad_norm": 0.7257611155509949,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5567,
|
|
"mean_token_accuracy": 0.8301023244857788,
|
|
"num_tokens": 569029468.0,
|
|
"step": 1786
|
|
},
|
|
{
|
|
"epoch": 1.8179043743641912,
|
|
"grad_norm": 0.7755439877510071,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5728,
|
|
"mean_token_accuracy": 0.8267319202423096,
|
|
"num_tokens": 569332534.0,
|
|
"step": 1787
|
|
},
|
|
{
|
|
"epoch": 1.8189216683621567,
|
|
"grad_norm": 0.7320640683174133,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5541,
|
|
"mean_token_accuracy": 0.8308744430541992,
|
|
"num_tokens": 569647573.0,
|
|
"step": 1788
|
|
},
|
|
{
|
|
"epoch": 1.819938962360122,
|
|
"grad_norm": 0.7781921625137329,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5658,
|
|
"mean_token_accuracy": 0.8281724452972412,
|
|
"num_tokens": 569960390.0,
|
|
"step": 1789
|
|
},
|
|
{
|
|
"epoch": 1.8209562563580874,
|
|
"grad_norm": 0.7738130688667297,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5804,
|
|
"mean_token_accuracy": 0.8257182240486145,
|
|
"num_tokens": 570276992.0,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 1.821973550356053,
|
|
"grad_norm": 0.7794972062110901,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5572,
|
|
"mean_token_accuracy": 0.8308176398277283,
|
|
"num_tokens": 570601038.0,
|
|
"step": 1791
|
|
},
|
|
{
|
|
"epoch": 1.8229908443540181,
|
|
"grad_norm": 0.7800111770629883,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5713,
|
|
"mean_token_accuracy": 0.8281744718551636,
|
|
"num_tokens": 570927501.0,
|
|
"step": 1792
|
|
},
|
|
{
|
|
"epoch": 1.8240081383519837,
|
|
"grad_norm": 0.7633543610572815,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5757,
|
|
"mean_token_accuracy": 0.8264333009719849,
|
|
"num_tokens": 571225760.0,
|
|
"step": 1793
|
|
},
|
|
{
|
|
"epoch": 1.8250254323499493,
|
|
"grad_norm": 0.7642391920089722,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5715,
|
|
"mean_token_accuracy": 0.8267236948013306,
|
|
"num_tokens": 571536763.0,
|
|
"step": 1794
|
|
},
|
|
{
|
|
"epoch": 1.8260427263479144,
|
|
"grad_norm": 0.7453933358192444,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5486,
|
|
"mean_token_accuracy": 0.8335306644439697,
|
|
"num_tokens": 571865425.0,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"epoch": 1.82706002034588,
|
|
"grad_norm": 1.2367335557937622,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.566,
|
|
"mean_token_accuracy": 0.8284416198730469,
|
|
"num_tokens": 572184872.0,
|
|
"step": 1796
|
|
},
|
|
{
|
|
"epoch": 1.8280773143438453,
|
|
"grad_norm": 0.7603802680969238,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5696,
|
|
"mean_token_accuracy": 0.8286498785018921,
|
|
"num_tokens": 572506248.0,
|
|
"step": 1797
|
|
},
|
|
{
|
|
"epoch": 1.8290946083418107,
|
|
"grad_norm": 0.7347100377082825,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5901,
|
|
"mean_token_accuracy": 0.8210036754608154,
|
|
"num_tokens": 572820670.0,
|
|
"step": 1798
|
|
},
|
|
{
|
|
"epoch": 1.8301119023397763,
|
|
"grad_norm": 1.4764974117279053,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5429,
|
|
"mean_token_accuracy": 0.8353832960128784,
|
|
"num_tokens": 573129521.0,
|
|
"step": 1799
|
|
},
|
|
{
|
|
"epoch": 1.8311291963377416,
|
|
"grad_norm": 0.7458595037460327,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5801,
|
|
"mean_token_accuracy": 0.825110137462616,
|
|
"num_tokens": 573458787.0,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 1.832146490335707,
|
|
"grad_norm": 0.7468324899673462,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5871,
|
|
"mean_token_accuracy": 0.8235799074172974,
|
|
"num_tokens": 573794196.0,
|
|
"step": 1801
|
|
},
|
|
{
|
|
"epoch": 1.8331637843336726,
|
|
"grad_norm": 0.7786433696746826,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5685,
|
|
"mean_token_accuracy": 0.8281745910644531,
|
|
"num_tokens": 574118594.0,
|
|
"step": 1802
|
|
},
|
|
{
|
|
"epoch": 1.8341810783316377,
|
|
"grad_norm": 0.7661991119384766,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.551,
|
|
"mean_token_accuracy": 0.8331655263900757,
|
|
"num_tokens": 574429910.0,
|
|
"step": 1803
|
|
},
|
|
{
|
|
"epoch": 1.8351983723296033,
|
|
"grad_norm": 1.088448405265808,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5532,
|
|
"mean_token_accuracy": 0.8323421478271484,
|
|
"num_tokens": 574750525.0,
|
|
"step": 1804
|
|
},
|
|
{
|
|
"epoch": 1.8362156663275688,
|
|
"grad_norm": 0.7912493348121643,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5709,
|
|
"mean_token_accuracy": 0.8276345729827881,
|
|
"num_tokens": 575086614.0,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"epoch": 1.837232960325534,
|
|
"grad_norm": 0.7702261209487915,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5511,
|
|
"mean_token_accuracy": 0.8321763277053833,
|
|
"num_tokens": 575407175.0,
|
|
"step": 1806
|
|
},
|
|
{
|
|
"epoch": 1.8382502543234995,
|
|
"grad_norm": 0.8245096206665039,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5498,
|
|
"mean_token_accuracy": 0.8336420655250549,
|
|
"num_tokens": 575734331.0,
|
|
"step": 1807
|
|
},
|
|
{
|
|
"epoch": 1.8392675483214649,
|
|
"grad_norm": 0.7476911544799805,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.583,
|
|
"mean_token_accuracy": 0.8227272629737854,
|
|
"num_tokens": 576048212.0,
|
|
"step": 1808
|
|
},
|
|
{
|
|
"epoch": 1.8402848423194302,
|
|
"grad_norm": 0.7333124279975891,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5817,
|
|
"mean_token_accuracy": 0.8251782059669495,
|
|
"num_tokens": 576379541.0,
|
|
"step": 1809
|
|
},
|
|
{
|
|
"epoch": 1.8413021363173958,
|
|
"grad_norm": 0.7403326630592346,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5666,
|
|
"mean_token_accuracy": 0.8277183771133423,
|
|
"num_tokens": 576693938.0,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 1.8423194303153612,
|
|
"grad_norm": 0.8351613879203796,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.564,
|
|
"mean_token_accuracy": 0.8285555243492126,
|
|
"num_tokens": 576996432.0,
|
|
"step": 1811
|
|
},
|
|
{
|
|
"epoch": 1.8433367243133265,
|
|
"grad_norm": 0.7698614597320557,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5847,
|
|
"mean_token_accuracy": 0.8226888179779053,
|
|
"num_tokens": 577320221.0,
|
|
"step": 1812
|
|
},
|
|
{
|
|
"epoch": 1.844354018311292,
|
|
"grad_norm": 0.7601704597473145,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5726,
|
|
"mean_token_accuracy": 0.8269021511077881,
|
|
"num_tokens": 577643317.0,
|
|
"step": 1813
|
|
},
|
|
{
|
|
"epoch": 1.8453713123092572,
|
|
"grad_norm": 0.7298431396484375,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5588,
|
|
"mean_token_accuracy": 0.8303334712982178,
|
|
"num_tokens": 577962353.0,
|
|
"step": 1814
|
|
},
|
|
{
|
|
"epoch": 1.8463886063072228,
|
|
"grad_norm": 0.7406944036483765,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5833,
|
|
"mean_token_accuracy": 0.8237319588661194,
|
|
"num_tokens": 578280116.0,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"epoch": 1.8474059003051884,
|
|
"grad_norm": 0.8527024984359741,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5646,
|
|
"mean_token_accuracy": 0.8289049863815308,
|
|
"num_tokens": 578587724.0,
|
|
"step": 1816
|
|
},
|
|
{
|
|
"epoch": 1.8484231943031535,
|
|
"grad_norm": 0.7803925275802612,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5785,
|
|
"mean_token_accuracy": 0.82612144947052,
|
|
"num_tokens": 578907271.0,
|
|
"step": 1817
|
|
},
|
|
{
|
|
"epoch": 1.849440488301119,
|
|
"grad_norm": 0.7724500298500061,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5751,
|
|
"mean_token_accuracy": 0.8265190720558167,
|
|
"num_tokens": 579229124.0,
|
|
"step": 1818
|
|
},
|
|
{
|
|
"epoch": 1.8504577822990844,
|
|
"grad_norm": 0.747200608253479,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5405,
|
|
"mean_token_accuracy": 0.8350002765655518,
|
|
"num_tokens": 579541087.0,
|
|
"step": 1819
|
|
},
|
|
{
|
|
"epoch": 1.8514750762970498,
|
|
"grad_norm": 0.775551438331604,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5762,
|
|
"mean_token_accuracy": 0.8248890042304993,
|
|
"num_tokens": 579853781.0,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 1.8524923702950153,
|
|
"grad_norm": 0.7513517141342163,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5592,
|
|
"mean_token_accuracy": 0.8300427794456482,
|
|
"num_tokens": 580175507.0,
|
|
"step": 1821
|
|
},
|
|
{
|
|
"epoch": 1.8535096642929807,
|
|
"grad_norm": 0.8104873895645142,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5747,
|
|
"mean_token_accuracy": 0.8259702324867249,
|
|
"num_tokens": 580499293.0,
|
|
"step": 1822
|
|
},
|
|
{
|
|
"epoch": 1.854526958290946,
|
|
"grad_norm": 0.785900890827179,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5775,
|
|
"mean_token_accuracy": 0.8257489800453186,
|
|
"num_tokens": 580805832.0,
|
|
"step": 1823
|
|
},
|
|
{
|
|
"epoch": 1.8555442522889116,
|
|
"grad_norm": 0.7971948981285095,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5377,
|
|
"mean_token_accuracy": 0.8351184725761414,
|
|
"num_tokens": 581107989.0,
|
|
"step": 1824
|
|
},
|
|
{
|
|
"epoch": 1.8565615462868768,
|
|
"grad_norm": 0.7580695748329163,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5602,
|
|
"mean_token_accuracy": 0.8308185935020447,
|
|
"num_tokens": 581423977.0,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"epoch": 1.8575788402848423,
|
|
"grad_norm": 0.7906219363212585,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.59,
|
|
"mean_token_accuracy": 0.8221039772033691,
|
|
"num_tokens": 581742263.0,
|
|
"step": 1826
|
|
},
|
|
{
|
|
"epoch": 1.8585961342828077,
|
|
"grad_norm": 0.7569301128387451,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5736,
|
|
"mean_token_accuracy": 0.8262839317321777,
|
|
"num_tokens": 582060741.0,
|
|
"step": 1827
|
|
},
|
|
{
|
|
"epoch": 1.859613428280773,
|
|
"grad_norm": 0.7414655685424805,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5637,
|
|
"mean_token_accuracy": 0.8287638425827026,
|
|
"num_tokens": 582383233.0,
|
|
"step": 1828
|
|
},
|
|
{
|
|
"epoch": 1.8606307222787386,
|
|
"grad_norm": 0.7988961338996887,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5931,
|
|
"mean_token_accuracy": 0.8204246759414673,
|
|
"num_tokens": 582700928.0,
|
|
"step": 1829
|
|
},
|
|
{
|
|
"epoch": 1.861648016276704,
|
|
"grad_norm": 0.754478931427002,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5597,
|
|
"mean_token_accuracy": 0.8302208781242371,
|
|
"num_tokens": 583019968.0,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 1.8626653102746693,
|
|
"grad_norm": 0.811531126499176,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5701,
|
|
"mean_token_accuracy": 0.8267852067947388,
|
|
"num_tokens": 583318374.0,
|
|
"step": 1831
|
|
},
|
|
{
|
|
"epoch": 1.8636826042726349,
|
|
"grad_norm": 0.7797427177429199,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5535,
|
|
"mean_token_accuracy": 0.8314744234085083,
|
|
"num_tokens": 583646646.0,
|
|
"step": 1832
|
|
},
|
|
{
|
|
"epoch": 1.8646998982706002,
|
|
"grad_norm": 0.757041871547699,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5613,
|
|
"mean_token_accuracy": 0.8293916583061218,
|
|
"num_tokens": 583961908.0,
|
|
"step": 1833
|
|
},
|
|
{
|
|
"epoch": 1.8657171922685656,
|
|
"grad_norm": 0.7979153394699097,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5591,
|
|
"mean_token_accuracy": 0.8318359851837158,
|
|
"num_tokens": 584274112.0,
|
|
"step": 1834
|
|
},
|
|
{
|
|
"epoch": 1.8667344862665312,
|
|
"grad_norm": 0.7214437127113342,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5403,
|
|
"mean_token_accuracy": 0.8350898027420044,
|
|
"num_tokens": 584604753.0,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"epoch": 1.8677517802644963,
|
|
"grad_norm": 0.7353912591934204,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5704,
|
|
"mean_token_accuracy": 0.8271667957305908,
|
|
"num_tokens": 584922779.0,
|
|
"step": 1836
|
|
},
|
|
{
|
|
"epoch": 1.8687690742624619,
|
|
"grad_norm": 0.7849698662757874,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5892,
|
|
"mean_token_accuracy": 0.8227310180664062,
|
|
"num_tokens": 585258146.0,
|
|
"step": 1837
|
|
},
|
|
{
|
|
"epoch": 1.8697863682604272,
|
|
"grad_norm": 0.7752400040626526,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5484,
|
|
"mean_token_accuracy": 0.8339313864707947,
|
|
"num_tokens": 585575451.0,
|
|
"step": 1838
|
|
},
|
|
{
|
|
"epoch": 1.8708036622583926,
|
|
"grad_norm": 0.7641316652297974,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5751,
|
|
"mean_token_accuracy": 0.825273871421814,
|
|
"num_tokens": 585907402.0,
|
|
"step": 1839
|
|
},
|
|
{
|
|
"epoch": 1.8718209562563581,
|
|
"grad_norm": 0.7349004745483398,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5783,
|
|
"mean_token_accuracy": 0.8251415491104126,
|
|
"num_tokens": 586248102.0,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 1.8728382502543235,
|
|
"grad_norm": 0.7678957581520081,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5885,
|
|
"mean_token_accuracy": 0.8224442005157471,
|
|
"num_tokens": 586569377.0,
|
|
"step": 1841
|
|
},
|
|
{
|
|
"epoch": 1.8738555442522888,
|
|
"grad_norm": 0.7082367539405823,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5586,
|
|
"mean_token_accuracy": 0.8304846286773682,
|
|
"num_tokens": 586891543.0,
|
|
"step": 1842
|
|
},
|
|
{
|
|
"epoch": 1.8748728382502544,
|
|
"grad_norm": 0.7639223337173462,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5677,
|
|
"mean_token_accuracy": 0.826633870601654,
|
|
"num_tokens": 587214972.0,
|
|
"step": 1843
|
|
},
|
|
{
|
|
"epoch": 1.8758901322482198,
|
|
"grad_norm": 0.7483731508255005,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5892,
|
|
"mean_token_accuracy": 0.8220580816268921,
|
|
"num_tokens": 587541589.0,
|
|
"step": 1844
|
|
},
|
|
{
|
|
"epoch": 1.8769074262461851,
|
|
"grad_norm": 0.78640216588974,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5908,
|
|
"mean_token_accuracy": 0.8221210241317749,
|
|
"num_tokens": 587860976.0,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"epoch": 1.8779247202441507,
|
|
"grad_norm": 0.7742619514465332,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.572,
|
|
"mean_token_accuracy": 0.8282708525657654,
|
|
"num_tokens": 588168649.0,
|
|
"step": 1846
|
|
},
|
|
{
|
|
"epoch": 1.8789420142421158,
|
|
"grad_norm": 0.7555391192436218,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5417,
|
|
"mean_token_accuracy": 0.8348156213760376,
|
|
"num_tokens": 588491275.0,
|
|
"step": 1847
|
|
},
|
|
{
|
|
"epoch": 1.8799593082400814,
|
|
"grad_norm": 0.7500118613243103,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5613,
|
|
"mean_token_accuracy": 0.8297252058982849,
|
|
"num_tokens": 588817168.0,
|
|
"step": 1848
|
|
},
|
|
{
|
|
"epoch": 1.8809766022380467,
|
|
"grad_norm": 0.7919448018074036,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5579,
|
|
"mean_token_accuracy": 0.8308275938034058,
|
|
"num_tokens": 589135465.0,
|
|
"step": 1849
|
|
},
|
|
{
|
|
"epoch": 1.881993896236012,
|
|
"grad_norm": 0.7840650677680969,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5927,
|
|
"mean_token_accuracy": 0.8216804265975952,
|
|
"num_tokens": 589459066.0,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 1.8830111902339777,
|
|
"grad_norm": 0.7872382998466492,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5668,
|
|
"mean_token_accuracy": 0.8276762962341309,
|
|
"num_tokens": 589787765.0,
|
|
"step": 1851
|
|
},
|
|
{
|
|
"epoch": 1.884028484231943,
|
|
"grad_norm": 0.8268197178840637,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5687,
|
|
"mean_token_accuracy": 0.8270741701126099,
|
|
"num_tokens": 590119500.0,
|
|
"step": 1852
|
|
},
|
|
{
|
|
"epoch": 1.8850457782299084,
|
|
"grad_norm": 0.7454878091812134,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5877,
|
|
"mean_token_accuracy": 0.8230266571044922,
|
|
"num_tokens": 590427085.0,
|
|
"step": 1853
|
|
},
|
|
{
|
|
"epoch": 1.886063072227874,
|
|
"grad_norm": 0.7611258029937744,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5649,
|
|
"mean_token_accuracy": 0.8284235000610352,
|
|
"num_tokens": 590750486.0,
|
|
"step": 1854
|
|
},
|
|
{
|
|
"epoch": 1.8870803662258393,
|
|
"grad_norm": 0.8493229746818542,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5651,
|
|
"mean_token_accuracy": 0.8279353380203247,
|
|
"num_tokens": 591063252.0,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"epoch": 1.8880976602238047,
|
|
"grad_norm": 0.7734782099723816,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5608,
|
|
"mean_token_accuracy": 0.8284351825714111,
|
|
"num_tokens": 591367281.0,
|
|
"step": 1856
|
|
},
|
|
{
|
|
"epoch": 1.8891149542217702,
|
|
"grad_norm": 0.7691129446029663,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5937,
|
|
"mean_token_accuracy": 0.8218074440956116,
|
|
"num_tokens": 591689473.0,
|
|
"step": 1857
|
|
},
|
|
{
|
|
"epoch": 1.8901322482197354,
|
|
"grad_norm": 0.7397372126579285,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5608,
|
|
"mean_token_accuracy": 0.8286991119384766,
|
|
"num_tokens": 592022697.0,
|
|
"step": 1858
|
|
},
|
|
{
|
|
"epoch": 1.891149542217701,
|
|
"grad_norm": 0.7259275317192078,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5742,
|
|
"mean_token_accuracy": 0.8260259628295898,
|
|
"num_tokens": 592341817.0,
|
|
"step": 1859
|
|
},
|
|
{
|
|
"epoch": 1.8921668362156663,
|
|
"grad_norm": 0.8009742498397827,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5707,
|
|
"mean_token_accuracy": 0.8263646364212036,
|
|
"num_tokens": 592667852.0,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 1.8931841302136316,
|
|
"grad_norm": 0.7932537198066711,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5784,
|
|
"mean_token_accuracy": 0.8248840570449829,
|
|
"num_tokens": 592990364.0,
|
|
"step": 1861
|
|
},
|
|
{
|
|
"epoch": 1.8942014242115972,
|
|
"grad_norm": 0.7673304080963135,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.576,
|
|
"mean_token_accuracy": 0.8271982669830322,
|
|
"num_tokens": 593313033.0,
|
|
"step": 1862
|
|
},
|
|
{
|
|
"epoch": 1.8952187182095626,
|
|
"grad_norm": 0.7386042475700378,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5653,
|
|
"mean_token_accuracy": 0.8284578323364258,
|
|
"num_tokens": 593629565.0,
|
|
"step": 1863
|
|
},
|
|
{
|
|
"epoch": 1.896236012207528,
|
|
"grad_norm": 0.7839403748512268,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5799,
|
|
"mean_token_accuracy": 0.8257097005844116,
|
|
"num_tokens": 593932811.0,
|
|
"step": 1864
|
|
},
|
|
{
|
|
"epoch": 1.8972533062054935,
|
|
"grad_norm": 0.8242468237876892,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5535,
|
|
"mean_token_accuracy": 0.8315895795822144,
|
|
"num_tokens": 594251800.0,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"epoch": 1.8982706002034588,
|
|
"grad_norm": 0.7358497381210327,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5798,
|
|
"mean_token_accuracy": 0.8254359364509583,
|
|
"num_tokens": 594566525.0,
|
|
"step": 1866
|
|
},
|
|
{
|
|
"epoch": 1.8992878942014242,
|
|
"grad_norm": 0.7373138070106506,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5652,
|
|
"mean_token_accuracy": 0.8289642333984375,
|
|
"num_tokens": 594903927.0,
|
|
"step": 1867
|
|
},
|
|
{
|
|
"epoch": 1.9003051881993898,
|
|
"grad_norm": 0.8773072957992554,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5628,
|
|
"mean_token_accuracy": 0.8294283151626587,
|
|
"num_tokens": 595203411.0,
|
|
"step": 1868
|
|
},
|
|
{
|
|
"epoch": 1.901322482197355,
|
|
"grad_norm": 0.8440640568733215,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5942,
|
|
"mean_token_accuracy": 0.8205792903900146,
|
|
"num_tokens": 595523298.0,
|
|
"step": 1869
|
|
},
|
|
{
|
|
"epoch": 1.9023397761953205,
|
|
"grad_norm": 0.843758761882782,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5486,
|
|
"mean_token_accuracy": 0.8344087600708008,
|
|
"num_tokens": 595851295.0,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 1.9033570701932858,
|
|
"grad_norm": 0.8056725859642029,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5745,
|
|
"mean_token_accuracy": 0.8255429267883301,
|
|
"num_tokens": 596165520.0,
|
|
"step": 1871
|
|
},
|
|
{
|
|
"epoch": 1.9043743641912512,
|
|
"grad_norm": 0.7543884515762329,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5701,
|
|
"mean_token_accuracy": 0.8281387686729431,
|
|
"num_tokens": 596492724.0,
|
|
"step": 1872
|
|
},
|
|
{
|
|
"epoch": 1.9053916581892167,
|
|
"grad_norm": 0.8288529515266418,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.544,
|
|
"mean_token_accuracy": 0.8341200351715088,
|
|
"num_tokens": 596815498.0,
|
|
"step": 1873
|
|
},
|
|
{
|
|
"epoch": 1.906408952187182,
|
|
"grad_norm": 0.7765027284622192,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.591,
|
|
"mean_token_accuracy": 0.82168048620224,
|
|
"num_tokens": 597137505.0,
|
|
"step": 1874
|
|
},
|
|
{
|
|
"epoch": 1.9074262461851474,
|
|
"grad_norm": 0.7864693999290466,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5741,
|
|
"mean_token_accuracy": 0.8255944848060608,
|
|
"num_tokens": 597439964.0,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"epoch": 1.908443540183113,
|
|
"grad_norm": 0.7858473062515259,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5782,
|
|
"mean_token_accuracy": 0.8250025510787964,
|
|
"num_tokens": 597762524.0,
|
|
"step": 1876
|
|
},
|
|
{
|
|
"epoch": 1.9094608341810784,
|
|
"grad_norm": 0.8264142274856567,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5861,
|
|
"mean_token_accuracy": 0.8224828839302063,
|
|
"num_tokens": 598088815.0,
|
|
"step": 1877
|
|
},
|
|
{
|
|
"epoch": 1.9104781281790437,
|
|
"grad_norm": 0.8210098743438721,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5627,
|
|
"mean_token_accuracy": 0.8275391459465027,
|
|
"num_tokens": 598408435.0,
|
|
"step": 1878
|
|
},
|
|
{
|
|
"epoch": 1.9114954221770093,
|
|
"grad_norm": 0.7526994943618774,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5619,
|
|
"mean_token_accuracy": 0.829414963722229,
|
|
"num_tokens": 598703467.0,
|
|
"step": 1879
|
|
},
|
|
{
|
|
"epoch": 1.9125127161749744,
|
|
"grad_norm": 0.7536560893058777,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5724,
|
|
"mean_token_accuracy": 0.8269705772399902,
|
|
"num_tokens": 599047533.0,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 1.91353001017294,
|
|
"grad_norm": 0.8106531500816345,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5646,
|
|
"mean_token_accuracy": 0.8292891383171082,
|
|
"num_tokens": 599369815.0,
|
|
"step": 1881
|
|
},
|
|
{
|
|
"epoch": 1.9145473041709054,
|
|
"grad_norm": 0.7695590853691101,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5395,
|
|
"mean_token_accuracy": 0.8354257345199585,
|
|
"num_tokens": 599687509.0,
|
|
"step": 1882
|
|
},
|
|
{
|
|
"epoch": 1.9155645981688707,
|
|
"grad_norm": 0.7978789210319519,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5412,
|
|
"mean_token_accuracy": 0.8345689177513123,
|
|
"num_tokens": 599989016.0,
|
|
"step": 1883
|
|
},
|
|
{
|
|
"epoch": 1.9165818921668363,
|
|
"grad_norm": 0.7534072995185852,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5797,
|
|
"mean_token_accuracy": 0.8252373933792114,
|
|
"num_tokens": 600296480.0,
|
|
"step": 1884
|
|
},
|
|
{
|
|
"epoch": 1.9175991861648016,
|
|
"grad_norm": 0.7588997483253479,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5507,
|
|
"mean_token_accuracy": 0.8319488167762756,
|
|
"num_tokens": 600617011.0,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"epoch": 1.918616480162767,
|
|
"grad_norm": 0.8035694360733032,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5921,
|
|
"mean_token_accuracy": 0.820303201675415,
|
|
"num_tokens": 600942695.0,
|
|
"step": 1886
|
|
},
|
|
{
|
|
"epoch": 1.9196337741607326,
|
|
"grad_norm": 0.8165997862815857,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5808,
|
|
"mean_token_accuracy": 0.8230639696121216,
|
|
"num_tokens": 601254189.0,
|
|
"step": 1887
|
|
},
|
|
{
|
|
"epoch": 1.920651068158698,
|
|
"grad_norm": 0.777153730392456,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5693,
|
|
"mean_token_accuracy": 0.8262922167778015,
|
|
"num_tokens": 601571439.0,
|
|
"step": 1888
|
|
},
|
|
{
|
|
"epoch": 1.9216683621566633,
|
|
"grad_norm": 0.7819951176643372,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5671,
|
|
"mean_token_accuracy": 0.8283118009567261,
|
|
"num_tokens": 601896520.0,
|
|
"step": 1889
|
|
},
|
|
{
|
|
"epoch": 1.9226856561546288,
|
|
"grad_norm": 0.8021113276481628,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5679,
|
|
"mean_token_accuracy": 0.8276652693748474,
|
|
"num_tokens": 602203498.0,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 1.923702950152594,
|
|
"grad_norm": 0.8063667416572571,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.565,
|
|
"mean_token_accuracy": 0.827467143535614,
|
|
"num_tokens": 602510131.0,
|
|
"step": 1891
|
|
},
|
|
{
|
|
"epoch": 1.9247202441505595,
|
|
"grad_norm": 0.7440853714942932,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5621,
|
|
"mean_token_accuracy": 0.8289334177970886,
|
|
"num_tokens": 602835451.0,
|
|
"step": 1892
|
|
},
|
|
{
|
|
"epoch": 1.9257375381485249,
|
|
"grad_norm": 0.7506888508796692,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.579,
|
|
"mean_token_accuracy": 0.8250542283058167,
|
|
"num_tokens": 603141191.0,
|
|
"step": 1893
|
|
},
|
|
{
|
|
"epoch": 1.9267548321464902,
|
|
"grad_norm": 0.7600045800209045,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5537,
|
|
"mean_token_accuracy": 0.831433892250061,
|
|
"num_tokens": 603470478.0,
|
|
"step": 1894
|
|
},
|
|
{
|
|
"epoch": 1.9277721261444558,
|
|
"grad_norm": 0.8485811948776245,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5563,
|
|
"mean_token_accuracy": 0.83091139793396,
|
|
"num_tokens": 603791455.0,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"epoch": 1.9287894201424212,
|
|
"grad_norm": 0.789547324180603,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5737,
|
|
"mean_token_accuracy": 0.8254702091217041,
|
|
"num_tokens": 604109105.0,
|
|
"step": 1896
|
|
},
|
|
{
|
|
"epoch": 1.9298067141403865,
|
|
"grad_norm": 0.7782526612281799,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5795,
|
|
"mean_token_accuracy": 0.8261871337890625,
|
|
"num_tokens": 604425281.0,
|
|
"step": 1897
|
|
},
|
|
{
|
|
"epoch": 1.930824008138352,
|
|
"grad_norm": 0.7582260370254517,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5955,
|
|
"mean_token_accuracy": 0.821397066116333,
|
|
"num_tokens": 604760916.0,
|
|
"step": 1898
|
|
},
|
|
{
|
|
"epoch": 1.9318413021363174,
|
|
"grad_norm": 0.8307642936706543,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5757,
|
|
"mean_token_accuracy": 0.8259291052818298,
|
|
"num_tokens": 605069512.0,
|
|
"step": 1899
|
|
},
|
|
{
|
|
"epoch": 1.9328585961342828,
|
|
"grad_norm": 0.7825501561164856,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5742,
|
|
"mean_token_accuracy": 0.8273696899414062,
|
|
"num_tokens": 605388317.0,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 1.9338758901322484,
|
|
"grad_norm": 0.7845165133476257,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5714,
|
|
"mean_token_accuracy": 0.8265910744667053,
|
|
"num_tokens": 605720362.0,
|
|
"step": 1901
|
|
},
|
|
{
|
|
"epoch": 1.9348931841302135,
|
|
"grad_norm": 0.7711207866668701,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5867,
|
|
"mean_token_accuracy": 0.823683500289917,
|
|
"num_tokens": 606043288.0,
|
|
"step": 1902
|
|
},
|
|
{
|
|
"epoch": 1.935910478128179,
|
|
"grad_norm": 0.7821605205535889,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5765,
|
|
"mean_token_accuracy": 0.8253616094589233,
|
|
"num_tokens": 606348632.0,
|
|
"step": 1903
|
|
},
|
|
{
|
|
"epoch": 1.9369277721261444,
|
|
"grad_norm": 0.7327739596366882,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5372,
|
|
"mean_token_accuracy": 0.8361741304397583,
|
|
"num_tokens": 606676157.0,
|
|
"step": 1904
|
|
},
|
|
{
|
|
"epoch": 1.9379450661241098,
|
|
"grad_norm": 0.787828803062439,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5723,
|
|
"mean_token_accuracy": 0.8274844884872437,
|
|
"num_tokens": 607011481.0,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"epoch": 1.9389623601220753,
|
|
"grad_norm": 0.7983585596084595,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.583,
|
|
"mean_token_accuracy": 0.8234202861785889,
|
|
"num_tokens": 607334595.0,
|
|
"step": 1906
|
|
},
|
|
{
|
|
"epoch": 1.9399796541200407,
|
|
"grad_norm": 0.7567639946937561,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5497,
|
|
"mean_token_accuracy": 0.8337772488594055,
|
|
"num_tokens": 607647463.0,
|
|
"step": 1907
|
|
},
|
|
{
|
|
"epoch": 1.940996948118006,
|
|
"grad_norm": 0.7706727385520935,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5633,
|
|
"mean_token_accuracy": 0.8285652995109558,
|
|
"num_tokens": 607969176.0,
|
|
"step": 1908
|
|
},
|
|
{
|
|
"epoch": 1.9420142421159716,
|
|
"grad_norm": 0.85313880443573,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5861,
|
|
"mean_token_accuracy": 0.8226011991500854,
|
|
"num_tokens": 608294081.0,
|
|
"step": 1909
|
|
},
|
|
{
|
|
"epoch": 1.943031536113937,
|
|
"grad_norm": 0.8062782883644104,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5814,
|
|
"mean_token_accuracy": 0.823600172996521,
|
|
"num_tokens": 608626842.0,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 1.9440488301119023,
|
|
"grad_norm": 0.7171719074249268,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5604,
|
|
"mean_token_accuracy": 0.8306635618209839,
|
|
"num_tokens": 608969123.0,
|
|
"step": 1911
|
|
},
|
|
{
|
|
"epoch": 1.945066124109868,
|
|
"grad_norm": 0.8143373727798462,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5602,
|
|
"mean_token_accuracy": 0.8298995494842529,
|
|
"num_tokens": 609267308.0,
|
|
"step": 1912
|
|
},
|
|
{
|
|
"epoch": 1.946083418107833,
|
|
"grad_norm": 1.0744192600250244,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5756,
|
|
"mean_token_accuracy": 0.8253097534179688,
|
|
"num_tokens": 609605979.0,
|
|
"step": 1913
|
|
},
|
|
{
|
|
"epoch": 1.9471007121057986,
|
|
"grad_norm": 0.7887747287750244,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.565,
|
|
"mean_token_accuracy": 0.8280919790267944,
|
|
"num_tokens": 609905194.0,
|
|
"step": 1914
|
|
},
|
|
{
|
|
"epoch": 1.948118006103764,
|
|
"grad_norm": 0.7372229695320129,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.58,
|
|
"mean_token_accuracy": 0.8253764510154724,
|
|
"num_tokens": 610232914.0,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"epoch": 1.9491353001017293,
|
|
"grad_norm": 0.8306304812431335,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5561,
|
|
"mean_token_accuracy": 0.8301711678504944,
|
|
"num_tokens": 610524356.0,
|
|
"step": 1916
|
|
},
|
|
{
|
|
"epoch": 1.9501525940996949,
|
|
"grad_norm": 0.7157770395278931,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.559,
|
|
"mean_token_accuracy": 0.8307013511657715,
|
|
"num_tokens": 610848911.0,
|
|
"step": 1917
|
|
},
|
|
{
|
|
"epoch": 1.9511698880976602,
|
|
"grad_norm": 0.7406263947486877,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5518,
|
|
"mean_token_accuracy": 0.832368791103363,
|
|
"num_tokens": 611151896.0,
|
|
"step": 1918
|
|
},
|
|
{
|
|
"epoch": 1.9521871820956256,
|
|
"grad_norm": 0.7178313732147217,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5453,
|
|
"mean_token_accuracy": 0.8341268301010132,
|
|
"num_tokens": 611477707.0,
|
|
"step": 1919
|
|
},
|
|
{
|
|
"epoch": 1.9532044760935912,
|
|
"grad_norm": 0.8264352679252625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5801,
|
|
"mean_token_accuracy": 0.8236597776412964,
|
|
"num_tokens": 611786310.0,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 1.9542217700915565,
|
|
"grad_norm": 0.790361225605011,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5642,
|
|
"mean_token_accuracy": 0.8291520476341248,
|
|
"num_tokens": 612106242.0,
|
|
"step": 1921
|
|
},
|
|
{
|
|
"epoch": 1.9552390640895219,
|
|
"grad_norm": 0.7451375722885132,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5506,
|
|
"mean_token_accuracy": 0.8325362205505371,
|
|
"num_tokens": 612408694.0,
|
|
"step": 1922
|
|
},
|
|
{
|
|
"epoch": 1.9562563580874874,
|
|
"grad_norm": 0.7814817428588867,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5616,
|
|
"mean_token_accuracy": 0.8296904563903809,
|
|
"num_tokens": 612723003.0,
|
|
"step": 1923
|
|
},
|
|
{
|
|
"epoch": 1.9572736520854526,
|
|
"grad_norm": 0.7812781929969788,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5636,
|
|
"mean_token_accuracy": 0.8298949003219604,
|
|
"num_tokens": 613044117.0,
|
|
"step": 1924
|
|
},
|
|
{
|
|
"epoch": 1.9582909460834181,
|
|
"grad_norm": 0.8395456671714783,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5421,
|
|
"mean_token_accuracy": 0.8346281051635742,
|
|
"num_tokens": 613322625.0,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"epoch": 1.9593082400813835,
|
|
"grad_norm": 0.7608759999275208,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5626,
|
|
"mean_token_accuracy": 0.828799307346344,
|
|
"num_tokens": 613652844.0,
|
|
"step": 1926
|
|
},
|
|
{
|
|
"epoch": 1.9603255340793488,
|
|
"grad_norm": 0.7218765616416931,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5551,
|
|
"mean_token_accuracy": 0.8309551477432251,
|
|
"num_tokens": 613969421.0,
|
|
"step": 1927
|
|
},
|
|
{
|
|
"epoch": 1.9613428280773144,
|
|
"grad_norm": 0.7457222938537598,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5452,
|
|
"mean_token_accuracy": 0.8331649899482727,
|
|
"num_tokens": 614295637.0,
|
|
"step": 1928
|
|
},
|
|
{
|
|
"epoch": 1.9623601220752798,
|
|
"grad_norm": 0.7531415224075317,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5681,
|
|
"mean_token_accuracy": 0.828516960144043,
|
|
"num_tokens": 614612191.0,
|
|
"step": 1929
|
|
},
|
|
{
|
|
"epoch": 1.9633774160732451,
|
|
"grad_norm": 0.8161703944206238,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5715,
|
|
"mean_token_accuracy": 0.8255881667137146,
|
|
"num_tokens": 614941005.0,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 1.9643947100712107,
|
|
"grad_norm": 0.7939525842666626,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5472,
|
|
"mean_token_accuracy": 0.8334426879882812,
|
|
"num_tokens": 615260901.0,
|
|
"step": 1931
|
|
},
|
|
{
|
|
"epoch": 1.965412004069176,
|
|
"grad_norm": 0.8395957946777344,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5674,
|
|
"mean_token_accuracy": 0.8290640711784363,
|
|
"num_tokens": 615576002.0,
|
|
"step": 1932
|
|
},
|
|
{
|
|
"epoch": 1.9664292980671414,
|
|
"grad_norm": 0.7337270975112915,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.557,
|
|
"mean_token_accuracy": 0.8317199349403381,
|
|
"num_tokens": 615906641.0,
|
|
"step": 1933
|
|
},
|
|
{
|
|
"epoch": 1.967446592065107,
|
|
"grad_norm": 0.7723780870437622,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6038,
|
|
"mean_token_accuracy": 0.8182454705238342,
|
|
"num_tokens": 616248968.0,
|
|
"step": 1934
|
|
},
|
|
{
|
|
"epoch": 1.968463886063072,
|
|
"grad_norm": 0.7680447697639465,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5563,
|
|
"mean_token_accuracy": 0.8295483589172363,
|
|
"num_tokens": 616578475.0,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"epoch": 1.9694811800610377,
|
|
"grad_norm": 0.7666056156158447,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5757,
|
|
"mean_token_accuracy": 0.8256507515907288,
|
|
"num_tokens": 616899976.0,
|
|
"step": 1936
|
|
},
|
|
{
|
|
"epoch": 1.970498474059003,
|
|
"grad_norm": 0.7603473663330078,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5544,
|
|
"mean_token_accuracy": 0.8319006562232971,
|
|
"num_tokens": 617214839.0,
|
|
"step": 1937
|
|
},
|
|
{
|
|
"epoch": 1.9715157680569684,
|
|
"grad_norm": 0.7682324051856995,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5856,
|
|
"mean_token_accuracy": 0.823277473449707,
|
|
"num_tokens": 617512120.0,
|
|
"step": 1938
|
|
},
|
|
{
|
|
"epoch": 1.972533062054934,
|
|
"grad_norm": 0.7841208577156067,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5632,
|
|
"mean_token_accuracy": 0.829049825668335,
|
|
"num_tokens": 617823554.0,
|
|
"step": 1939
|
|
},
|
|
{
|
|
"epoch": 1.9735503560528993,
|
|
"grad_norm": 0.7929815053939819,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5629,
|
|
"mean_token_accuracy": 0.8287888765335083,
|
|
"num_tokens": 618137243.0,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 1.9745676500508647,
|
|
"grad_norm": 0.8292960524559021,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5805,
|
|
"mean_token_accuracy": 0.825197160243988,
|
|
"num_tokens": 618473609.0,
|
|
"step": 1941
|
|
},
|
|
{
|
|
"epoch": 1.9755849440488302,
|
|
"grad_norm": 0.7060987949371338,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5489,
|
|
"mean_token_accuracy": 0.8338747024536133,
|
|
"num_tokens": 618796427.0,
|
|
"step": 1942
|
|
},
|
|
{
|
|
"epoch": 1.9766022380467956,
|
|
"grad_norm": 0.7885839343070984,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.56,
|
|
"mean_token_accuracy": 0.8296661972999573,
|
|
"num_tokens": 619108709.0,
|
|
"step": 1943
|
|
},
|
|
{
|
|
"epoch": 1.977619532044761,
|
|
"grad_norm": 0.7611109614372253,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5684,
|
|
"mean_token_accuracy": 0.8274158239364624,
|
|
"num_tokens": 619435220.0,
|
|
"step": 1944
|
|
},
|
|
{
|
|
"epoch": 1.9786368260427265,
|
|
"grad_norm": 0.7665042877197266,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6039,
|
|
"mean_token_accuracy": 0.8177695274353027,
|
|
"num_tokens": 619759819.0,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"epoch": 1.9796541200406916,
|
|
"grad_norm": 0.781718373298645,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5596,
|
|
"mean_token_accuracy": 0.8298677802085876,
|
|
"num_tokens": 620070498.0,
|
|
"step": 1946
|
|
},
|
|
{
|
|
"epoch": 1.9806714140386572,
|
|
"grad_norm": 0.7552787661552429,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5578,
|
|
"mean_token_accuracy": 0.8303831815719604,
|
|
"num_tokens": 620393836.0,
|
|
"step": 1947
|
|
},
|
|
{
|
|
"epoch": 1.9816887080366226,
|
|
"grad_norm": 0.7667186260223389,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5595,
|
|
"mean_token_accuracy": 0.8307204842567444,
|
|
"num_tokens": 620719684.0,
|
|
"step": 1948
|
|
},
|
|
{
|
|
"epoch": 1.982706002034588,
|
|
"grad_norm": 0.7332197427749634,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5749,
|
|
"mean_token_accuracy": 0.825315535068512,
|
|
"num_tokens": 621050520.0,
|
|
"step": 1949
|
|
},
|
|
{
|
|
"epoch": 1.9837232960325535,
|
|
"grad_norm": 0.8660725355148315,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.563,
|
|
"mean_token_accuracy": 0.8284949064254761,
|
|
"num_tokens": 621363543.0,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 1.9847405900305188,
|
|
"grad_norm": 0.7703171968460083,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.555,
|
|
"mean_token_accuracy": 0.8312957286834717,
|
|
"num_tokens": 621674098.0,
|
|
"step": 1951
|
|
},
|
|
{
|
|
"epoch": 1.9857578840284842,
|
|
"grad_norm": 0.7844569683074951,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5575,
|
|
"mean_token_accuracy": 0.830365777015686,
|
|
"num_tokens": 621989155.0,
|
|
"step": 1952
|
|
},
|
|
{
|
|
"epoch": 1.9867751780264498,
|
|
"grad_norm": 0.7418847680091858,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5731,
|
|
"mean_token_accuracy": 0.8281689882278442,
|
|
"num_tokens": 622317261.0,
|
|
"step": 1953
|
|
},
|
|
{
|
|
"epoch": 1.987792472024415,
|
|
"grad_norm": 1.7465903759002686,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5766,
|
|
"mean_token_accuracy": 0.825128972530365,
|
|
"num_tokens": 622647732.0,
|
|
"step": 1954
|
|
},
|
|
{
|
|
"epoch": 1.9888097660223805,
|
|
"grad_norm": 0.7512000203132629,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5497,
|
|
"mean_token_accuracy": 0.8317239284515381,
|
|
"num_tokens": 622984287.0,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"epoch": 1.989827060020346,
|
|
"grad_norm": 0.7754637598991394,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5357,
|
|
"mean_token_accuracy": 0.8367646932601929,
|
|
"num_tokens": 623279377.0,
|
|
"step": 1956
|
|
},
|
|
{
|
|
"epoch": 1.9908443540183112,
|
|
"grad_norm": 0.7074607014656067,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5645,
|
|
"mean_token_accuracy": 0.8279786109924316,
|
|
"num_tokens": 623617093.0,
|
|
"step": 1957
|
|
},
|
|
{
|
|
"epoch": 1.9918616480162767,
|
|
"grad_norm": 0.7336138486862183,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5825,
|
|
"mean_token_accuracy": 0.8236699104309082,
|
|
"num_tokens": 623935367.0,
|
|
"step": 1958
|
|
},
|
|
{
|
|
"epoch": 1.992878942014242,
|
|
"grad_norm": 0.7565160989761353,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.573,
|
|
"mean_token_accuracy": 0.8255666494369507,
|
|
"num_tokens": 624268903.0,
|
|
"step": 1959
|
|
},
|
|
{
|
|
"epoch": 1.9938962360122074,
|
|
"grad_norm": 0.7770515084266663,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5711,
|
|
"mean_token_accuracy": 0.8275524377822876,
|
|
"num_tokens": 624585621.0,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 1.994913530010173,
|
|
"grad_norm": 0.7109097242355347,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5888,
|
|
"mean_token_accuracy": 0.822210431098938,
|
|
"num_tokens": 624919983.0,
|
|
"step": 1961
|
|
},
|
|
{
|
|
"epoch": 1.9959308240081384,
|
|
"grad_norm": 0.8382838368415833,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6082,
|
|
"mean_token_accuracy": 0.8173540830612183,
|
|
"num_tokens": 625244861.0,
|
|
"step": 1962
|
|
},
|
|
{
|
|
"epoch": 1.9969481180061037,
|
|
"grad_norm": 0.7442500591278076,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5502,
|
|
"mean_token_accuracy": 0.8312058448791504,
|
|
"num_tokens": 625553633.0,
|
|
"step": 1963
|
|
},
|
|
{
|
|
"epoch": 1.9979654120040693,
|
|
"grad_norm": 0.7596532106399536,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5796,
|
|
"mean_token_accuracy": 0.8254216909408569,
|
|
"num_tokens": 625879310.0,
|
|
"step": 1964
|
|
},
|
|
{
|
|
"epoch": 1.9989827060020344,
|
|
"grad_norm": 0.7299008369445801,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5523,
|
|
"mean_token_accuracy": 0.8325172662734985,
|
|
"num_tokens": 626194056.0,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 0.769568681716919,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5655,
|
|
"mean_token_accuracy": 0.8282358050346375,
|
|
"num_tokens": 626514393.0,
|
|
"step": 1966
|
|
},
|
|
{
|
|
"epoch": 2.0010172939979656,
|
|
"grad_norm": 0.793281078338623,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5497,
|
|
"mean_token_accuracy": 0.8329326510429382,
|
|
"num_tokens": 626843728.0,
|
|
"step": 1967
|
|
},
|
|
{
|
|
"epoch": 2.0020345879959307,
|
|
"grad_norm": 0.7408942580223083,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5692,
|
|
"mean_token_accuracy": 0.8274793028831482,
|
|
"num_tokens": 627179506.0,
|
|
"step": 1968
|
|
},
|
|
{
|
|
"epoch": 2.0030518819938963,
|
|
"grad_norm": 0.7407317757606506,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5528,
|
|
"mean_token_accuracy": 0.8320581316947937,
|
|
"num_tokens": 627483549.0,
|
|
"step": 1969
|
|
},
|
|
{
|
|
"epoch": 2.004069175991862,
|
|
"grad_norm": 0.8348525762557983,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5616,
|
|
"mean_token_accuracy": 0.828995406627655,
|
|
"num_tokens": 627813505.0,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 2.005086469989827,
|
|
"grad_norm": 0.7179054021835327,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5711,
|
|
"mean_token_accuracy": 0.8263566493988037,
|
|
"num_tokens": 628136119.0,
|
|
"step": 1971
|
|
},
|
|
{
|
|
"epoch": 2.0061037639877926,
|
|
"grad_norm": 0.8164916038513184,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.549,
|
|
"mean_token_accuracy": 0.8339582085609436,
|
|
"num_tokens": 628450141.0,
|
|
"step": 1972
|
|
},
|
|
{
|
|
"epoch": 2.0071210579857577,
|
|
"grad_norm": 0.7368289828300476,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5472,
|
|
"mean_token_accuracy": 0.8332412242889404,
|
|
"num_tokens": 628770893.0,
|
|
"step": 1973
|
|
},
|
|
{
|
|
"epoch": 2.0081383519837233,
|
|
"grad_norm": 0.7819622159004211,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5681,
|
|
"mean_token_accuracy": 0.8278636932373047,
|
|
"num_tokens": 629091061.0,
|
|
"step": 1974
|
|
},
|
|
{
|
|
"epoch": 2.009155645981689,
|
|
"grad_norm": 0.7813390493392944,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5337,
|
|
"mean_token_accuracy": 0.8363057971000671,
|
|
"num_tokens": 629412815.0,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"epoch": 2.010172939979654,
|
|
"grad_norm": 0.8105148077011108,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5717,
|
|
"mean_token_accuracy": 0.8274290561676025,
|
|
"num_tokens": 629722640.0,
|
|
"step": 1976
|
|
},
|
|
{
|
|
"epoch": 2.0111902339776195,
|
|
"grad_norm": 0.7819290161132812,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5698,
|
|
"mean_token_accuracy": 0.8274918794631958,
|
|
"num_tokens": 630044682.0,
|
|
"step": 1977
|
|
},
|
|
{
|
|
"epoch": 2.012207527975585,
|
|
"grad_norm": 0.8088401556015015,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5973,
|
|
"mean_token_accuracy": 0.8194631338119507,
|
|
"num_tokens": 630333856.0,
|
|
"step": 1978
|
|
},
|
|
{
|
|
"epoch": 2.0132248219735502,
|
|
"grad_norm": 0.7477482557296753,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5852,
|
|
"mean_token_accuracy": 0.8224201202392578,
|
|
"num_tokens": 630642252.0,
|
|
"step": 1979
|
|
},
|
|
{
|
|
"epoch": 2.014242115971516,
|
|
"grad_norm": 0.7271042466163635,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5666,
|
|
"mean_token_accuracy": 0.8269437551498413,
|
|
"num_tokens": 630967182.0,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 2.0152594099694814,
|
|
"grad_norm": 0.7491155862808228,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5189,
|
|
"mean_token_accuracy": 0.841214656829834,
|
|
"num_tokens": 631277861.0,
|
|
"step": 1981
|
|
},
|
|
{
|
|
"epoch": 2.0162767039674465,
|
|
"grad_norm": 0.7819457054138184,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5229,
|
|
"mean_token_accuracy": 0.8392908573150635,
|
|
"num_tokens": 631581074.0,
|
|
"step": 1982
|
|
},
|
|
{
|
|
"epoch": 2.017293997965412,
|
|
"grad_norm": 0.7867006659507751,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5549,
|
|
"mean_token_accuracy": 0.8317956328392029,
|
|
"num_tokens": 631908085.0,
|
|
"step": 1983
|
|
},
|
|
{
|
|
"epoch": 2.018311291963377,
|
|
"grad_norm": 0.7748855948448181,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.57,
|
|
"mean_token_accuracy": 0.8271198272705078,
|
|
"num_tokens": 632228951.0,
|
|
"step": 1984
|
|
},
|
|
{
|
|
"epoch": 2.019328585961343,
|
|
"grad_norm": 0.7708137631416321,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5517,
|
|
"mean_token_accuracy": 0.8322231769561768,
|
|
"num_tokens": 632557630.0,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"epoch": 2.0203458799593084,
|
|
"grad_norm": 0.734464168548584,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5709,
|
|
"mean_token_accuracy": 0.8276263475418091,
|
|
"num_tokens": 632888435.0,
|
|
"step": 1986
|
|
},
|
|
{
|
|
"epoch": 2.0213631739572735,
|
|
"grad_norm": 0.7887408137321472,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5376,
|
|
"mean_token_accuracy": 0.8354251384735107,
|
|
"num_tokens": 633213083.0,
|
|
"step": 1987
|
|
},
|
|
{
|
|
"epoch": 2.022380467955239,
|
|
"grad_norm": 0.7359147071838379,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5578,
|
|
"mean_token_accuracy": 0.8312580585479736,
|
|
"num_tokens": 633521507.0,
|
|
"step": 1988
|
|
},
|
|
{
|
|
"epoch": 2.0233977619532046,
|
|
"grad_norm": 0.7471558451652527,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5619,
|
|
"mean_token_accuracy": 0.8286077976226807,
|
|
"num_tokens": 633851293.0,
|
|
"step": 1989
|
|
},
|
|
{
|
|
"epoch": 2.0244150559511698,
|
|
"grad_norm": 0.7837832570075989,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5756,
|
|
"mean_token_accuracy": 0.8252357840538025,
|
|
"num_tokens": 634182314.0,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 2.0254323499491353,
|
|
"grad_norm": 0.7467847466468811,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5566,
|
|
"mean_token_accuracy": 0.8307093977928162,
|
|
"num_tokens": 634503716.0,
|
|
"step": 1991
|
|
},
|
|
{
|
|
"epoch": 2.026449643947101,
|
|
"grad_norm": 0.7618154883384705,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5526,
|
|
"mean_token_accuracy": 0.8307061195373535,
|
|
"num_tokens": 634812750.0,
|
|
"step": 1992
|
|
},
|
|
{
|
|
"epoch": 2.027466937945066,
|
|
"grad_norm": 0.7568649053573608,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.563,
|
|
"mean_token_accuracy": 0.8288283348083496,
|
|
"num_tokens": 635124979.0,
|
|
"step": 1993
|
|
},
|
|
{
|
|
"epoch": 2.0284842319430316,
|
|
"grad_norm": 0.8403826951980591,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.556,
|
|
"mean_token_accuracy": 0.8298501372337341,
|
|
"num_tokens": 635455215.0,
|
|
"step": 1994
|
|
},
|
|
{
|
|
"epoch": 2.0295015259409968,
|
|
"grad_norm": 0.7417284250259399,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5618,
|
|
"mean_token_accuracy": 0.8286497592926025,
|
|
"num_tokens": 635778771.0,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"epoch": 2.0305188199389623,
|
|
"grad_norm": 0.7632890939712524,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5784,
|
|
"mean_token_accuracy": 0.8259048461914062,
|
|
"num_tokens": 636091977.0,
|
|
"step": 1996
|
|
},
|
|
{
|
|
"epoch": 2.031536113936928,
|
|
"grad_norm": 0.7756537199020386,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5425,
|
|
"mean_token_accuracy": 0.8347346782684326,
|
|
"num_tokens": 636426068.0,
|
|
"step": 1997
|
|
},
|
|
{
|
|
"epoch": 2.032553407934893,
|
|
"grad_norm": 0.7656052708625793,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5518,
|
|
"mean_token_accuracy": 0.8321259021759033,
|
|
"num_tokens": 636728987.0,
|
|
"step": 1998
|
|
},
|
|
{
|
|
"epoch": 2.0335707019328586,
|
|
"grad_norm": 0.7594374418258667,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5797,
|
|
"mean_token_accuracy": 0.8242126703262329,
|
|
"num_tokens": 637048170.0,
|
|
"step": 1999
|
|
},
|
|
{
|
|
"epoch": 2.034587995930824,
|
|
"grad_norm": 0.7326385378837585,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5658,
|
|
"mean_token_accuracy": 0.8288227319717407,
|
|
"num_tokens": 637376842.0,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 2.0356052899287893,
|
|
"grad_norm": 0.7957135438919067,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5831,
|
|
"mean_token_accuracy": 0.8231464624404907,
|
|
"num_tokens": 637708776.0,
|
|
"step": 2001
|
|
},
|
|
{
|
|
"epoch": 2.036622583926755,
|
|
"grad_norm": 0.7293063402175903,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5671,
|
|
"mean_token_accuracy": 0.8277890086174011,
|
|
"num_tokens": 638026761.0,
|
|
"step": 2002
|
|
},
|
|
{
|
|
"epoch": 2.0376398779247205,
|
|
"grad_norm": 0.7880427241325378,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5709,
|
|
"mean_token_accuracy": 0.828234851360321,
|
|
"num_tokens": 638337687.0,
|
|
"step": 2003
|
|
},
|
|
{
|
|
"epoch": 2.0386571719226856,
|
|
"grad_norm": 0.7594811320304871,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.531,
|
|
"mean_token_accuracy": 0.8365904688835144,
|
|
"num_tokens": 638663203.0,
|
|
"step": 2004
|
|
},
|
|
{
|
|
"epoch": 2.039674465920651,
|
|
"grad_norm": 0.7555544376373291,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5711,
|
|
"mean_token_accuracy": 0.8272154331207275,
|
|
"num_tokens": 638985653.0,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"epoch": 2.0406917599186163,
|
|
"grad_norm": 0.857343852519989,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5771,
|
|
"mean_token_accuracy": 0.8249437212944031,
|
|
"num_tokens": 639296652.0,
|
|
"step": 2006
|
|
},
|
|
{
|
|
"epoch": 2.041709053916582,
|
|
"grad_norm": 0.7711302042007446,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5737,
|
|
"mean_token_accuracy": 0.825864315032959,
|
|
"num_tokens": 639629812.0,
|
|
"step": 2007
|
|
},
|
|
{
|
|
"epoch": 2.0427263479145474,
|
|
"grad_norm": 1.0842350721359253,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5384,
|
|
"mean_token_accuracy": 0.8358830809593201,
|
|
"num_tokens": 639953979.0,
|
|
"step": 2008
|
|
},
|
|
{
|
|
"epoch": 2.0437436419125126,
|
|
"grad_norm": 0.7909249067306519,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5484,
|
|
"mean_token_accuracy": 0.8325679898262024,
|
|
"num_tokens": 640271057.0,
|
|
"step": 2009
|
|
},
|
|
{
|
|
"epoch": 2.044760935910478,
|
|
"grad_norm": 0.730385422706604,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5391,
|
|
"mean_token_accuracy": 0.8343998789787292,
|
|
"num_tokens": 640574916.0,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 2.0457782299084437,
|
|
"grad_norm": 0.7266718745231628,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5466,
|
|
"mean_token_accuracy": 0.8327420949935913,
|
|
"num_tokens": 640899252.0,
|
|
"step": 2011
|
|
},
|
|
{
|
|
"epoch": 2.046795523906409,
|
|
"grad_norm": 0.7553056478500366,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5489,
|
|
"mean_token_accuracy": 0.8326089978218079,
|
|
"num_tokens": 641225765.0,
|
|
"step": 2012
|
|
},
|
|
{
|
|
"epoch": 2.0478128179043744,
|
|
"grad_norm": 0.7447772026062012,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5453,
|
|
"mean_token_accuracy": 0.8331438899040222,
|
|
"num_tokens": 641553952.0,
|
|
"step": 2013
|
|
},
|
|
{
|
|
"epoch": 2.04883011190234,
|
|
"grad_norm": 0.7557468414306641,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5743,
|
|
"mean_token_accuracy": 0.825654149055481,
|
|
"num_tokens": 641871688.0,
|
|
"step": 2014
|
|
},
|
|
{
|
|
"epoch": 2.049847405900305,
|
|
"grad_norm": 0.7843445539474487,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5864,
|
|
"mean_token_accuracy": 0.8237768411636353,
|
|
"num_tokens": 642182839.0,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"epoch": 2.0508646998982707,
|
|
"grad_norm": 0.8298075199127197,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5714,
|
|
"mean_token_accuracy": 0.8268923759460449,
|
|
"num_tokens": 642490309.0,
|
|
"step": 2016
|
|
},
|
|
{
|
|
"epoch": 2.051881993896236,
|
|
"grad_norm": 0.7977005243301392,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5431,
|
|
"mean_token_accuracy": 0.8326629996299744,
|
|
"num_tokens": 642789777.0,
|
|
"step": 2017
|
|
},
|
|
{
|
|
"epoch": 2.0528992878942014,
|
|
"grad_norm": 0.7488961815834045,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5653,
|
|
"mean_token_accuracy": 0.8282648324966431,
|
|
"num_tokens": 643121797.0,
|
|
"step": 2018
|
|
},
|
|
{
|
|
"epoch": 2.053916581892167,
|
|
"grad_norm": 0.7535355687141418,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5735,
|
|
"mean_token_accuracy": 0.8259459137916565,
|
|
"num_tokens": 643442233.0,
|
|
"step": 2019
|
|
},
|
|
{
|
|
"epoch": 2.054933875890132,
|
|
"grad_norm": 0.752249538898468,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5527,
|
|
"mean_token_accuracy": 0.8324294686317444,
|
|
"num_tokens": 643770297.0,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 2.0559511698880977,
|
|
"grad_norm": 0.7770043015480042,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5632,
|
|
"mean_token_accuracy": 0.828926146030426,
|
|
"num_tokens": 644077898.0,
|
|
"step": 2021
|
|
},
|
|
{
|
|
"epoch": 2.0569684638860632,
|
|
"grad_norm": 0.7959228754043579,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5657,
|
|
"mean_token_accuracy": 0.8270010948181152,
|
|
"num_tokens": 644396823.0,
|
|
"step": 2022
|
|
},
|
|
{
|
|
"epoch": 2.0579857578840284,
|
|
"grad_norm": 0.7729719281196594,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5597,
|
|
"mean_token_accuracy": 0.8291889429092407,
|
|
"num_tokens": 644715765.0,
|
|
"step": 2023
|
|
},
|
|
{
|
|
"epoch": 2.059003051881994,
|
|
"grad_norm": 0.7595959305763245,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5681,
|
|
"mean_token_accuracy": 0.8271936178207397,
|
|
"num_tokens": 645039148.0,
|
|
"step": 2024
|
|
},
|
|
{
|
|
"epoch": 2.0600203458799595,
|
|
"grad_norm": 0.8012211322784424,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5639,
|
|
"mean_token_accuracy": 0.828609049320221,
|
|
"num_tokens": 645359940.0,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"epoch": 2.0610376398779247,
|
|
"grad_norm": 0.7749910354614258,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5497,
|
|
"mean_token_accuracy": 0.8325744867324829,
|
|
"num_tokens": 645695713.0,
|
|
"step": 2026
|
|
},
|
|
{
|
|
"epoch": 2.0620549338758902,
|
|
"grad_norm": 0.8015928864479065,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5473,
|
|
"mean_token_accuracy": 0.8329358100891113,
|
|
"num_tokens": 646007060.0,
|
|
"step": 2027
|
|
},
|
|
{
|
|
"epoch": 2.0630722278738554,
|
|
"grad_norm": 0.7851582169532776,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5627,
|
|
"mean_token_accuracy": 0.8285136222839355,
|
|
"num_tokens": 646310789.0,
|
|
"step": 2028
|
|
},
|
|
{
|
|
"epoch": 2.064089521871821,
|
|
"grad_norm": 0.7077644467353821,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5313,
|
|
"mean_token_accuracy": 0.836718738079071,
|
|
"num_tokens": 646651450.0,
|
|
"step": 2029
|
|
},
|
|
{
|
|
"epoch": 2.0651068158697865,
|
|
"grad_norm": 0.8080658316612244,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5681,
|
|
"mean_token_accuracy": 0.827492356300354,
|
|
"num_tokens": 646974846.0,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 2.0661241098677516,
|
|
"grad_norm": 0.8135608434677124,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5439,
|
|
"mean_token_accuracy": 0.8345105648040771,
|
|
"num_tokens": 647294337.0,
|
|
"step": 2031
|
|
},
|
|
{
|
|
"epoch": 2.067141403865717,
|
|
"grad_norm": 0.7688538432121277,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5596,
|
|
"mean_token_accuracy": 0.8303366899490356,
|
|
"num_tokens": 647614724.0,
|
|
"step": 2032
|
|
},
|
|
{
|
|
"epoch": 2.068158697863683,
|
|
"grad_norm": 0.7426178455352783,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5489,
|
|
"mean_token_accuracy": 0.833173930644989,
|
|
"num_tokens": 647933285.0,
|
|
"step": 2033
|
|
},
|
|
{
|
|
"epoch": 2.069175991861648,
|
|
"grad_norm": 0.7545600533485413,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5456,
|
|
"mean_token_accuracy": 0.8328017592430115,
|
|
"num_tokens": 648251173.0,
|
|
"step": 2034
|
|
},
|
|
{
|
|
"epoch": 2.0701932858596135,
|
|
"grad_norm": 0.8664435148239136,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5865,
|
|
"mean_token_accuracy": 0.8208303451538086,
|
|
"num_tokens": 648551725.0,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"epoch": 2.0712105798575786,
|
|
"grad_norm": 0.7247047424316406,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5523,
|
|
"mean_token_accuracy": 0.8322950601577759,
|
|
"num_tokens": 648863508.0,
|
|
"step": 2036
|
|
},
|
|
{
|
|
"epoch": 2.072227873855544,
|
|
"grad_norm": 0.7342517971992493,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5376,
|
|
"mean_token_accuracy": 0.8353379964828491,
|
|
"num_tokens": 649176493.0,
|
|
"step": 2037
|
|
},
|
|
{
|
|
"epoch": 2.0732451678535098,
|
|
"grad_norm": 0.7122109532356262,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5281,
|
|
"mean_token_accuracy": 0.8390113711357117,
|
|
"num_tokens": 649497977.0,
|
|
"step": 2038
|
|
},
|
|
{
|
|
"epoch": 2.074262461851475,
|
|
"grad_norm": 0.7776530981063843,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5653,
|
|
"mean_token_accuracy": 0.8287674784660339,
|
|
"num_tokens": 649826804.0,
|
|
"step": 2039
|
|
},
|
|
{
|
|
"epoch": 2.0752797558494405,
|
|
"grad_norm": 0.9624866843223572,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5421,
|
|
"mean_token_accuracy": 0.8347163200378418,
|
|
"num_tokens": 650125023.0,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 2.076297049847406,
|
|
"grad_norm": 1.2730292081832886,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5489,
|
|
"mean_token_accuracy": 0.8335597515106201,
|
|
"num_tokens": 650457502.0,
|
|
"step": 2041
|
|
},
|
|
{
|
|
"epoch": 2.077314343845371,
|
|
"grad_norm": 0.7527278065681458,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5576,
|
|
"mean_token_accuracy": 0.8305301666259766,
|
|
"num_tokens": 650774229.0,
|
|
"step": 2042
|
|
},
|
|
{
|
|
"epoch": 2.0783316378433367,
|
|
"grad_norm": 0.7566222548484802,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5683,
|
|
"mean_token_accuracy": 0.8274224996566772,
|
|
"num_tokens": 651094235.0,
|
|
"step": 2043
|
|
},
|
|
{
|
|
"epoch": 2.0793489318413023,
|
|
"grad_norm": 0.8623789548873901,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.572,
|
|
"mean_token_accuracy": 0.8269400596618652,
|
|
"num_tokens": 651403530.0,
|
|
"step": 2044
|
|
},
|
|
{
|
|
"epoch": 2.0803662258392674,
|
|
"grad_norm": 0.8023196458816528,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5701,
|
|
"mean_token_accuracy": 0.8259271383285522,
|
|
"num_tokens": 651737149.0,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"epoch": 2.081383519837233,
|
|
"grad_norm": 0.7700502276420593,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5688,
|
|
"mean_token_accuracy": 0.8281545639038086,
|
|
"num_tokens": 652057532.0,
|
|
"step": 2046
|
|
},
|
|
{
|
|
"epoch": 2.082400813835198,
|
|
"grad_norm": 0.7362005114555359,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5619,
|
|
"mean_token_accuracy": 0.8289458751678467,
|
|
"num_tokens": 652364933.0,
|
|
"step": 2047
|
|
},
|
|
{
|
|
"epoch": 2.0834181078331637,
|
|
"grad_norm": 0.7432454228401184,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5616,
|
|
"mean_token_accuracy": 0.8287770748138428,
|
|
"num_tokens": 652693646.0,
|
|
"step": 2048
|
|
},
|
|
{
|
|
"epoch": 2.0844354018311293,
|
|
"grad_norm": 0.7227328419685364,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5353,
|
|
"mean_token_accuracy": 0.8361355662345886,
|
|
"num_tokens": 653007977.0,
|
|
"step": 2049
|
|
},
|
|
{
|
|
"epoch": 2.0854526958290944,
|
|
"grad_norm": 0.7496834993362427,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5425,
|
|
"mean_token_accuracy": 0.8348923921585083,
|
|
"num_tokens": 653315197.0,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 2.08646998982706,
|
|
"grad_norm": 0.7300513386726379,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.551,
|
|
"mean_token_accuracy": 0.8325375914573669,
|
|
"num_tokens": 653634130.0,
|
|
"step": 2051
|
|
},
|
|
{
|
|
"epoch": 2.0874872838250256,
|
|
"grad_norm": 0.7461215853691101,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5721,
|
|
"mean_token_accuracy": 0.8253625631332397,
|
|
"num_tokens": 653956123.0,
|
|
"step": 2052
|
|
},
|
|
{
|
|
"epoch": 2.0885045778229907,
|
|
"grad_norm": 0.7695837020874023,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5776,
|
|
"mean_token_accuracy": 0.8242179155349731,
|
|
"num_tokens": 654284883.0,
|
|
"step": 2053
|
|
},
|
|
{
|
|
"epoch": 2.0895218718209563,
|
|
"grad_norm": 0.7454880475997925,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5942,
|
|
"mean_token_accuracy": 0.8213703632354736,
|
|
"num_tokens": 654620877.0,
|
|
"step": 2054
|
|
},
|
|
{
|
|
"epoch": 2.090539165818922,
|
|
"grad_norm": 0.7532869577407837,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5838,
|
|
"mean_token_accuracy": 0.8228363394737244,
|
|
"num_tokens": 654955083.0,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"epoch": 2.091556459816887,
|
|
"grad_norm": 0.7616458535194397,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5358,
|
|
"mean_token_accuracy": 0.8365463614463806,
|
|
"num_tokens": 655262947.0,
|
|
"step": 2056
|
|
},
|
|
{
|
|
"epoch": 2.0925737538148526,
|
|
"grad_norm": 0.7663776874542236,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5438,
|
|
"mean_token_accuracy": 0.8342633843421936,
|
|
"num_tokens": 655582982.0,
|
|
"step": 2057
|
|
},
|
|
{
|
|
"epoch": 2.0935910478128177,
|
|
"grad_norm": 0.748302161693573,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5581,
|
|
"mean_token_accuracy": 0.8303278684616089,
|
|
"num_tokens": 655892296.0,
|
|
"step": 2058
|
|
},
|
|
{
|
|
"epoch": 2.0946083418107833,
|
|
"grad_norm": 0.7514637112617493,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5444,
|
|
"mean_token_accuracy": 0.8340238332748413,
|
|
"num_tokens": 656199760.0,
|
|
"step": 2059
|
|
},
|
|
{
|
|
"epoch": 2.095625635808749,
|
|
"grad_norm": 0.7614337205886841,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5746,
|
|
"mean_token_accuracy": 0.8253083825111389,
|
|
"num_tokens": 656514419.0,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 2.096642929806714,
|
|
"grad_norm": 0.7688518762588501,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5146,
|
|
"mean_token_accuracy": 0.8416045308113098,
|
|
"num_tokens": 656815650.0,
|
|
"step": 2061
|
|
},
|
|
{
|
|
"epoch": 2.0976602238046795,
|
|
"grad_norm": 0.742057204246521,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5512,
|
|
"mean_token_accuracy": 0.8324074745178223,
|
|
"num_tokens": 657128763.0,
|
|
"step": 2062
|
|
},
|
|
{
|
|
"epoch": 2.098677517802645,
|
|
"grad_norm": 0.7600764036178589,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5699,
|
|
"mean_token_accuracy": 0.8270496129989624,
|
|
"num_tokens": 657442965.0,
|
|
"step": 2063
|
|
},
|
|
{
|
|
"epoch": 2.0996948118006102,
|
|
"grad_norm": 0.7068182229995728,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5496,
|
|
"mean_token_accuracy": 0.8338534235954285,
|
|
"num_tokens": 657787301.0,
|
|
"step": 2064
|
|
},
|
|
{
|
|
"epoch": 2.100712105798576,
|
|
"grad_norm": 0.7744857668876648,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5541,
|
|
"mean_token_accuracy": 0.831404447555542,
|
|
"num_tokens": 658086444.0,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"epoch": 2.1017293997965414,
|
|
"grad_norm": 0.7384071350097656,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5685,
|
|
"mean_token_accuracy": 0.8269665837287903,
|
|
"num_tokens": 658417852.0,
|
|
"step": 2066
|
|
},
|
|
{
|
|
"epoch": 2.1027466937945065,
|
|
"grad_norm": 0.7803059816360474,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5549,
|
|
"mean_token_accuracy": 0.829984188079834,
|
|
"num_tokens": 658727698.0,
|
|
"step": 2067
|
|
},
|
|
{
|
|
"epoch": 2.103763987792472,
|
|
"grad_norm": 0.7474600672721863,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5506,
|
|
"mean_token_accuracy": 0.8319146037101746,
|
|
"num_tokens": 659036589.0,
|
|
"step": 2068
|
|
},
|
|
{
|
|
"epoch": 2.104781281790437,
|
|
"grad_norm": 0.765299916267395,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5721,
|
|
"mean_token_accuracy": 0.8267961740493774,
|
|
"num_tokens": 659356387.0,
|
|
"step": 2069
|
|
},
|
|
{
|
|
"epoch": 2.105798575788403,
|
|
"grad_norm": 0.7543993592262268,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5595,
|
|
"mean_token_accuracy": 0.8287253379821777,
|
|
"num_tokens": 659665573.0,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 2.1068158697863684,
|
|
"grad_norm": 0.7225178480148315,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5595,
|
|
"mean_token_accuracy": 0.8303307294845581,
|
|
"num_tokens": 659986298.0,
|
|
"step": 2071
|
|
},
|
|
{
|
|
"epoch": 2.1078331637843335,
|
|
"grad_norm": 0.7693639993667603,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5399,
|
|
"mean_token_accuracy": 0.8354504108428955,
|
|
"num_tokens": 660295004.0,
|
|
"step": 2072
|
|
},
|
|
{
|
|
"epoch": 2.108850457782299,
|
|
"grad_norm": 0.7996508479118347,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5418,
|
|
"mean_token_accuracy": 0.8350417613983154,
|
|
"num_tokens": 660607411.0,
|
|
"step": 2073
|
|
},
|
|
{
|
|
"epoch": 2.1098677517802646,
|
|
"grad_norm": 0.7588698267936707,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5427,
|
|
"mean_token_accuracy": 0.8349910378456116,
|
|
"num_tokens": 660921469.0,
|
|
"step": 2074
|
|
},
|
|
{
|
|
"epoch": 2.1108850457782298,
|
|
"grad_norm": 0.7550652623176575,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5959,
|
|
"mean_token_accuracy": 0.8192981481552124,
|
|
"num_tokens": 661237468.0,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"epoch": 2.1119023397761953,
|
|
"grad_norm": 0.7587368488311768,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5487,
|
|
"mean_token_accuracy": 0.8338797688484192,
|
|
"num_tokens": 661553561.0,
|
|
"step": 2076
|
|
},
|
|
{
|
|
"epoch": 2.112919633774161,
|
|
"grad_norm": 0.7872341275215149,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5666,
|
|
"mean_token_accuracy": 0.8278385996818542,
|
|
"num_tokens": 661865294.0,
|
|
"step": 2077
|
|
},
|
|
{
|
|
"epoch": 2.113936927772126,
|
|
"grad_norm": 0.7376534342765808,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5628,
|
|
"mean_token_accuracy": 0.8293165564537048,
|
|
"num_tokens": 662192533.0,
|
|
"step": 2078
|
|
},
|
|
{
|
|
"epoch": 2.1149542217700916,
|
|
"grad_norm": 0.7513229846954346,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5717,
|
|
"mean_token_accuracy": 0.8273903131484985,
|
|
"num_tokens": 662502572.0,
|
|
"step": 2079
|
|
},
|
|
{
|
|
"epoch": 2.1159715157680568,
|
|
"grad_norm": 0.7460830211639404,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5772,
|
|
"mean_token_accuracy": 0.8257139325141907,
|
|
"num_tokens": 662821554.0,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 2.1169888097660223,
|
|
"grad_norm": 0.766185998916626,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5617,
|
|
"mean_token_accuracy": 0.828972578048706,
|
|
"num_tokens": 663123259.0,
|
|
"step": 2081
|
|
},
|
|
{
|
|
"epoch": 2.118006103763988,
|
|
"grad_norm": 0.7785876989364624,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5392,
|
|
"mean_token_accuracy": 0.8344416618347168,
|
|
"num_tokens": 663445600.0,
|
|
"step": 2082
|
|
},
|
|
{
|
|
"epoch": 2.119023397761953,
|
|
"grad_norm": 0.7833909392356873,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5665,
|
|
"mean_token_accuracy": 0.8279260396957397,
|
|
"num_tokens": 663747652.0,
|
|
"step": 2083
|
|
},
|
|
{
|
|
"epoch": 2.1200406917599186,
|
|
"grad_norm": 0.7774428725242615,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.6028,
|
|
"mean_token_accuracy": 0.8186205625534058,
|
|
"num_tokens": 664069904.0,
|
|
"step": 2084
|
|
},
|
|
{
|
|
"epoch": 2.121057985757884,
|
|
"grad_norm": 0.7461346387863159,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5647,
|
|
"mean_token_accuracy": 0.8285263776779175,
|
|
"num_tokens": 664386511.0,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"epoch": 2.1220752797558493,
|
|
"grad_norm": 0.748580276966095,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5536,
|
|
"mean_token_accuracy": 0.8327457904815674,
|
|
"num_tokens": 664712795.0,
|
|
"step": 2086
|
|
},
|
|
{
|
|
"epoch": 2.123092573753815,
|
|
"grad_norm": 0.8052957057952881,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5517,
|
|
"mean_token_accuracy": 0.8318384885787964,
|
|
"num_tokens": 665036489.0,
|
|
"step": 2087
|
|
},
|
|
{
|
|
"epoch": 2.1241098677517805,
|
|
"grad_norm": 0.7805037498474121,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.543,
|
|
"mean_token_accuracy": 0.835271954536438,
|
|
"num_tokens": 665345764.0,
|
|
"step": 2088
|
|
},
|
|
{
|
|
"epoch": 2.1251271617497456,
|
|
"grad_norm": 0.7457467913627625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5573,
|
|
"mean_token_accuracy": 0.8313297033309937,
|
|
"num_tokens": 665650495.0,
|
|
"step": 2089
|
|
},
|
|
{
|
|
"epoch": 2.126144455747711,
|
|
"grad_norm": 0.7864099740982056,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5698,
|
|
"mean_token_accuracy": 0.8272011280059814,
|
|
"num_tokens": 665976759.0,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 2.1271617497456763,
|
|
"grad_norm": 0.7976198196411133,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5643,
|
|
"mean_token_accuracy": 0.8287394046783447,
|
|
"num_tokens": 666274229.0,
|
|
"step": 2091
|
|
},
|
|
{
|
|
"epoch": 2.128179043743642,
|
|
"grad_norm": 0.718885064125061,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5729,
|
|
"mean_token_accuracy": 0.8265447616577148,
|
|
"num_tokens": 666617768.0,
|
|
"step": 2092
|
|
},
|
|
{
|
|
"epoch": 2.1291963377416074,
|
|
"grad_norm": 0.758816123008728,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5666,
|
|
"mean_token_accuracy": 0.8275684118270874,
|
|
"num_tokens": 666935081.0,
|
|
"step": 2093
|
|
},
|
|
{
|
|
"epoch": 2.1302136317395726,
|
|
"grad_norm": 0.7571752667427063,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5585,
|
|
"mean_token_accuracy": 0.831500232219696,
|
|
"num_tokens": 667248835.0,
|
|
"step": 2094
|
|
},
|
|
{
|
|
"epoch": 2.131230925737538,
|
|
"grad_norm": 0.7851269245147705,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5735,
|
|
"mean_token_accuracy": 0.8253339529037476,
|
|
"num_tokens": 667556405.0,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"epoch": 2.1322482197355037,
|
|
"grad_norm": 0.7476552724838257,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5702,
|
|
"mean_token_accuracy": 0.8259379267692566,
|
|
"num_tokens": 667894932.0,
|
|
"step": 2096
|
|
},
|
|
{
|
|
"epoch": 2.133265513733469,
|
|
"grad_norm": 0.7466548085212708,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5502,
|
|
"mean_token_accuracy": 0.8320664167404175,
|
|
"num_tokens": 668202295.0,
|
|
"step": 2097
|
|
},
|
|
{
|
|
"epoch": 2.1342828077314344,
|
|
"grad_norm": 0.7157275676727295,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5669,
|
|
"mean_token_accuracy": 0.8280497789382935,
|
|
"num_tokens": 668537919.0,
|
|
"step": 2098
|
|
},
|
|
{
|
|
"epoch": 2.1353001017294,
|
|
"grad_norm": 0.7510685920715332,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5524,
|
|
"mean_token_accuracy": 0.8300042748451233,
|
|
"num_tokens": 668854088.0,
|
|
"step": 2099
|
|
},
|
|
{
|
|
"epoch": 2.136317395727365,
|
|
"grad_norm": 0.7428464889526367,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5401,
|
|
"mean_token_accuracy": 0.8350558280944824,
|
|
"num_tokens": 669192028.0,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 2.1373346897253307,
|
|
"grad_norm": 0.794187605381012,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.548,
|
|
"mean_token_accuracy": 0.833488941192627,
|
|
"num_tokens": 669516310.0,
|
|
"step": 2101
|
|
},
|
|
{
|
|
"epoch": 2.138351983723296,
|
|
"grad_norm": 0.6934726238250732,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5666,
|
|
"mean_token_accuracy": 0.8282812833786011,
|
|
"num_tokens": 669843789.0,
|
|
"step": 2102
|
|
},
|
|
{
|
|
"epoch": 2.1393692777212614,
|
|
"grad_norm": 0.7874788045883179,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5568,
|
|
"mean_token_accuracy": 0.8306584358215332,
|
|
"num_tokens": 670161044.0,
|
|
"step": 2103
|
|
},
|
|
{
|
|
"epoch": 2.140386571719227,
|
|
"grad_norm": 0.7843090295791626,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5603,
|
|
"mean_token_accuracy": 0.8288133144378662,
|
|
"num_tokens": 670470095.0,
|
|
"step": 2104
|
|
},
|
|
{
|
|
"epoch": 2.141403865717192,
|
|
"grad_norm": 0.7777359485626221,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.566,
|
|
"mean_token_accuracy": 0.8294340968132019,
|
|
"num_tokens": 670784785.0,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"epoch": 2.1424211597151577,
|
|
"grad_norm": 0.7022597193717957,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.541,
|
|
"mean_token_accuracy": 0.833615779876709,
|
|
"num_tokens": 671102909.0,
|
|
"step": 2106
|
|
},
|
|
{
|
|
"epoch": 2.1434384537131232,
|
|
"grad_norm": 0.7523261904716492,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5577,
|
|
"mean_token_accuracy": 0.8297773599624634,
|
|
"num_tokens": 671421218.0,
|
|
"step": 2107
|
|
},
|
|
{
|
|
"epoch": 2.1444557477110884,
|
|
"grad_norm": 0.7527234554290771,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5618,
|
|
"mean_token_accuracy": 0.8291486501693726,
|
|
"num_tokens": 671755061.0,
|
|
"step": 2108
|
|
},
|
|
{
|
|
"epoch": 2.145473041709054,
|
|
"grad_norm": 0.7956981658935547,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5286,
|
|
"mean_token_accuracy": 0.8387909531593323,
|
|
"num_tokens": 672085682.0,
|
|
"step": 2109
|
|
},
|
|
{
|
|
"epoch": 2.1464903357070195,
|
|
"grad_norm": 0.7895192503929138,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5866,
|
|
"mean_token_accuracy": 0.8217060565948486,
|
|
"num_tokens": 672406808.0,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 2.1475076297049847,
|
|
"grad_norm": 0.7586142420768738,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.586,
|
|
"mean_token_accuracy": 0.8228213787078857,
|
|
"num_tokens": 672742756.0,
|
|
"step": 2111
|
|
},
|
|
{
|
|
"epoch": 2.1485249237029502,
|
|
"grad_norm": 0.6966933608055115,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5344,
|
|
"mean_token_accuracy": 0.8360923528671265,
|
|
"num_tokens": 673076346.0,
|
|
"step": 2112
|
|
},
|
|
{
|
|
"epoch": 2.1495422177009154,
|
|
"grad_norm": 0.7788679003715515,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.577,
|
|
"mean_token_accuracy": 0.8248082399368286,
|
|
"num_tokens": 673390482.0,
|
|
"step": 2113
|
|
},
|
|
{
|
|
"epoch": 2.150559511698881,
|
|
"grad_norm": 0.7420569658279419,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5677,
|
|
"mean_token_accuracy": 0.8269627094268799,
|
|
"num_tokens": 673724830.0,
|
|
"step": 2114
|
|
},
|
|
{
|
|
"epoch": 2.1515768056968465,
|
|
"grad_norm": 0.9678299427032471,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5727,
|
|
"mean_token_accuracy": 0.8264451026916504,
|
|
"num_tokens": 674041915.0,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"epoch": 2.1525940996948116,
|
|
"grad_norm": 0.7400864958763123,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5764,
|
|
"mean_token_accuracy": 0.8259016275405884,
|
|
"num_tokens": 674376067.0,
|
|
"step": 2116
|
|
},
|
|
{
|
|
"epoch": 2.153611393692777,
|
|
"grad_norm": 0.7815296649932861,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5552,
|
|
"mean_token_accuracy": 0.8299500346183777,
|
|
"num_tokens": 674683835.0,
|
|
"step": 2117
|
|
},
|
|
{
|
|
"epoch": 2.154628687690743,
|
|
"grad_norm": 0.7235947251319885,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5696,
|
|
"mean_token_accuracy": 0.8269928693771362,
|
|
"num_tokens": 675021716.0,
|
|
"step": 2118
|
|
},
|
|
{
|
|
"epoch": 2.155645981688708,
|
|
"grad_norm": 0.7602812647819519,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.559,
|
|
"mean_token_accuracy": 0.8294414281845093,
|
|
"num_tokens": 675339134.0,
|
|
"step": 2119
|
|
},
|
|
{
|
|
"epoch": 2.1566632756866735,
|
|
"grad_norm": 0.7821603417396545,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5734,
|
|
"mean_token_accuracy": 0.8252530097961426,
|
|
"num_tokens": 675650088.0,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 2.157680569684639,
|
|
"grad_norm": 0.7559882402420044,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5455,
|
|
"mean_token_accuracy": 0.8339157700538635,
|
|
"num_tokens": 675976634.0,
|
|
"step": 2121
|
|
},
|
|
{
|
|
"epoch": 2.158697863682604,
|
|
"grad_norm": 0.7382848858833313,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5823,
|
|
"mean_token_accuracy": 0.823092520236969,
|
|
"num_tokens": 676301382.0,
|
|
"step": 2122
|
|
},
|
|
{
|
|
"epoch": 2.1597151576805698,
|
|
"grad_norm": 0.7315267324447632,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5612,
|
|
"mean_token_accuracy": 0.8290728330612183,
|
|
"num_tokens": 676630331.0,
|
|
"step": 2123
|
|
},
|
|
{
|
|
"epoch": 2.160732451678535,
|
|
"grad_norm": 0.7366230487823486,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5811,
|
|
"mean_token_accuracy": 0.8245151042938232,
|
|
"num_tokens": 676971441.0,
|
|
"step": 2124
|
|
},
|
|
{
|
|
"epoch": 2.1617497456765005,
|
|
"grad_norm": 0.7743238806724548,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5559,
|
|
"mean_token_accuracy": 0.8322397470474243,
|
|
"num_tokens": 677292539.0,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"epoch": 2.162767039674466,
|
|
"grad_norm": 0.8578828573226929,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5271,
|
|
"mean_token_accuracy": 0.8391723036766052,
|
|
"num_tokens": 677609509.0,
|
|
"step": 2126
|
|
},
|
|
{
|
|
"epoch": 2.163784333672431,
|
|
"grad_norm": 0.7956092953681946,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5627,
|
|
"mean_token_accuracy": 0.8298279643058777,
|
|
"num_tokens": 677937681.0,
|
|
"step": 2127
|
|
},
|
|
{
|
|
"epoch": 2.1648016276703967,
|
|
"grad_norm": 0.7627188563346863,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5287,
|
|
"mean_token_accuracy": 0.8378783464431763,
|
|
"num_tokens": 678236943.0,
|
|
"step": 2128
|
|
},
|
|
{
|
|
"epoch": 2.1658189216683623,
|
|
"grad_norm": 0.7750257253646851,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5808,
|
|
"mean_token_accuracy": 0.8241315484046936,
|
|
"num_tokens": 678533114.0,
|
|
"step": 2129
|
|
},
|
|
{
|
|
"epoch": 2.1668362156663274,
|
|
"grad_norm": 0.8628514409065247,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5381,
|
|
"mean_token_accuracy": 0.8356536626815796,
|
|
"num_tokens": 678854315.0,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 2.167853509664293,
|
|
"grad_norm": 0.7474110126495361,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5445,
|
|
"mean_token_accuracy": 0.8338069915771484,
|
|
"num_tokens": 679170763.0,
|
|
"step": 2131
|
|
},
|
|
{
|
|
"epoch": 2.1688708036622586,
|
|
"grad_norm": 0.7617558836936951,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5691,
|
|
"mean_token_accuracy": 0.8265351057052612,
|
|
"num_tokens": 679480751.0,
|
|
"step": 2132
|
|
},
|
|
{
|
|
"epoch": 2.1698880976602237,
|
|
"grad_norm": 0.7319729924201965,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5572,
|
|
"mean_token_accuracy": 0.8302922248840332,
|
|
"num_tokens": 679799542.0,
|
|
"step": 2133
|
|
},
|
|
{
|
|
"epoch": 2.1709053916581893,
|
|
"grad_norm": 0.7312279939651489,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5447,
|
|
"mean_token_accuracy": 0.8329917788505554,
|
|
"num_tokens": 680126987.0,
|
|
"step": 2134
|
|
},
|
|
{
|
|
"epoch": 2.1719226856561544,
|
|
"grad_norm": 0.7618582844734192,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5603,
|
|
"mean_token_accuracy": 0.8301103711128235,
|
|
"num_tokens": 680445654.0,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"epoch": 2.17293997965412,
|
|
"grad_norm": 0.7430460453033447,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5734,
|
|
"mean_token_accuracy": 0.8269782662391663,
|
|
"num_tokens": 680766149.0,
|
|
"step": 2136
|
|
},
|
|
{
|
|
"epoch": 2.1739572736520856,
|
|
"grad_norm": 0.7346223592758179,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5406,
|
|
"mean_token_accuracy": 0.8346844911575317,
|
|
"num_tokens": 681081193.0,
|
|
"step": 2137
|
|
},
|
|
{
|
|
"epoch": 2.1749745676500507,
|
|
"grad_norm": 0.987307071685791,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5702,
|
|
"mean_token_accuracy": 0.8271561861038208,
|
|
"num_tokens": 681389768.0,
|
|
"step": 2138
|
|
},
|
|
{
|
|
"epoch": 2.1759918616480163,
|
|
"grad_norm": 0.7216676473617554,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.548,
|
|
"mean_token_accuracy": 0.8329777717590332,
|
|
"num_tokens": 681716150.0,
|
|
"step": 2139
|
|
},
|
|
{
|
|
"epoch": 2.177009155645982,
|
|
"grad_norm": 0.7814753651618958,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5815,
|
|
"mean_token_accuracy": 0.8237301111221313,
|
|
"num_tokens": 682034982.0,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 2.178026449643947,
|
|
"grad_norm": 0.7712879776954651,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.55,
|
|
"mean_token_accuracy": 0.8320897817611694,
|
|
"num_tokens": 682360761.0,
|
|
"step": 2141
|
|
},
|
|
{
|
|
"epoch": 2.1790437436419126,
|
|
"grad_norm": 0.7365331053733826,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5333,
|
|
"mean_token_accuracy": 0.8374788761138916,
|
|
"num_tokens": 682683740.0,
|
|
"step": 2142
|
|
},
|
|
{
|
|
"epoch": 2.180061037639878,
|
|
"grad_norm": 0.7548322677612305,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5458,
|
|
"mean_token_accuracy": 0.8324960470199585,
|
|
"num_tokens": 682998758.0,
|
|
"step": 2143
|
|
},
|
|
{
|
|
"epoch": 2.1810783316378433,
|
|
"grad_norm": 0.7492295503616333,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5524,
|
|
"mean_token_accuracy": 0.8332866430282593,
|
|
"num_tokens": 683315540.0,
|
|
"step": 2144
|
|
},
|
|
{
|
|
"epoch": 2.182095625635809,
|
|
"grad_norm": 0.7494568824768066,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5597,
|
|
"mean_token_accuracy": 0.8304259777069092,
|
|
"num_tokens": 683639710.0,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"epoch": 2.183112919633774,
|
|
"grad_norm": 0.7964203953742981,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5846,
|
|
"mean_token_accuracy": 0.8240746259689331,
|
|
"num_tokens": 683962253.0,
|
|
"step": 2146
|
|
},
|
|
{
|
|
"epoch": 2.1841302136317395,
|
|
"grad_norm": 0.7324085831642151,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5454,
|
|
"mean_token_accuracy": 0.832410991191864,
|
|
"num_tokens": 684276805.0,
|
|
"step": 2147
|
|
},
|
|
{
|
|
"epoch": 2.185147507629705,
|
|
"grad_norm": 0.7728395462036133,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.536,
|
|
"mean_token_accuracy": 0.8357415795326233,
|
|
"num_tokens": 684600612.0,
|
|
"step": 2148
|
|
},
|
|
{
|
|
"epoch": 2.1861648016276702,
|
|
"grad_norm": 0.8084637522697449,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5621,
|
|
"mean_token_accuracy": 0.8287630081176758,
|
|
"num_tokens": 684915465.0,
|
|
"step": 2149
|
|
},
|
|
{
|
|
"epoch": 2.187182095625636,
|
|
"grad_norm": 0.7585083246231079,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5704,
|
|
"mean_token_accuracy": 0.8277579545974731,
|
|
"num_tokens": 685243003.0,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 2.1881993896236014,
|
|
"grad_norm": 0.7352972030639648,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5522,
|
|
"mean_token_accuracy": 0.830999493598938,
|
|
"num_tokens": 685586437.0,
|
|
"step": 2151
|
|
},
|
|
{
|
|
"epoch": 2.1892166836215665,
|
|
"grad_norm": 0.7726067900657654,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5449,
|
|
"mean_token_accuracy": 0.8336653709411621,
|
|
"num_tokens": 685883071.0,
|
|
"step": 2152
|
|
},
|
|
{
|
|
"epoch": 2.190233977619532,
|
|
"grad_norm": 0.7544667720794678,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5436,
|
|
"mean_token_accuracy": 0.833609402179718,
|
|
"num_tokens": 686203095.0,
|
|
"step": 2153
|
|
},
|
|
{
|
|
"epoch": 2.1912512716174977,
|
|
"grad_norm": 0.7761970162391663,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5417,
|
|
"mean_token_accuracy": 0.8342102766036987,
|
|
"num_tokens": 686515166.0,
|
|
"step": 2154
|
|
},
|
|
{
|
|
"epoch": 2.192268565615463,
|
|
"grad_norm": 0.7551308870315552,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.568,
|
|
"mean_token_accuracy": 0.8281980752944946,
|
|
"num_tokens": 686847911.0,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"epoch": 2.1932858596134284,
|
|
"grad_norm": 0.7399230003356934,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.559,
|
|
"mean_token_accuracy": 0.8297072649002075,
|
|
"num_tokens": 687161028.0,
|
|
"step": 2156
|
|
},
|
|
{
|
|
"epoch": 2.1943031536113935,
|
|
"grad_norm": 0.7779721617698669,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5564,
|
|
"mean_token_accuracy": 0.8314580917358398,
|
|
"num_tokens": 687462776.0,
|
|
"step": 2157
|
|
},
|
|
{
|
|
"epoch": 2.195320447609359,
|
|
"grad_norm": 0.7697399854660034,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5456,
|
|
"mean_token_accuracy": 0.8339000344276428,
|
|
"num_tokens": 687785512.0,
|
|
"step": 2158
|
|
},
|
|
{
|
|
"epoch": 2.1963377416073246,
|
|
"grad_norm": 0.7364392876625061,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5449,
|
|
"mean_token_accuracy": 0.8337985873222351,
|
|
"num_tokens": 688112149.0,
|
|
"step": 2159
|
|
},
|
|
{
|
|
"epoch": 2.1973550356052898,
|
|
"grad_norm": 0.8275453448295593,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5319,
|
|
"mean_token_accuracy": 0.8366191387176514,
|
|
"num_tokens": 688432958.0,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 2.1983723296032553,
|
|
"grad_norm": 0.8337928652763367,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5687,
|
|
"mean_token_accuracy": 0.8279739618301392,
|
|
"num_tokens": 688745272.0,
|
|
"step": 2161
|
|
},
|
|
{
|
|
"epoch": 2.199389623601221,
|
|
"grad_norm": 0.7610588073730469,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5637,
|
|
"mean_token_accuracy": 0.8283995985984802,
|
|
"num_tokens": 689058958.0,
|
|
"step": 2162
|
|
},
|
|
{
|
|
"epoch": 2.200406917599186,
|
|
"grad_norm": 0.7536100149154663,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5589,
|
|
"mean_token_accuracy": 0.8297035694122314,
|
|
"num_tokens": 689387117.0,
|
|
"step": 2163
|
|
},
|
|
{
|
|
"epoch": 2.2014242115971516,
|
|
"grad_norm": 0.819729745388031,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5589,
|
|
"mean_token_accuracy": 0.8299574255943298,
|
|
"num_tokens": 689703523.0,
|
|
"step": 2164
|
|
},
|
|
{
|
|
"epoch": 2.202441505595117,
|
|
"grad_norm": 0.8181864619255066,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5764,
|
|
"mean_token_accuracy": 0.8252584338188171,
|
|
"num_tokens": 690011320.0,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"epoch": 2.2034587995930823,
|
|
"grad_norm": 0.7794902920722961,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5372,
|
|
"mean_token_accuracy": 0.8352136611938477,
|
|
"num_tokens": 690312474.0,
|
|
"step": 2166
|
|
},
|
|
{
|
|
"epoch": 2.204476093591048,
|
|
"grad_norm": 0.7757052183151245,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5479,
|
|
"mean_token_accuracy": 0.8337260484695435,
|
|
"num_tokens": 690629913.0,
|
|
"step": 2167
|
|
},
|
|
{
|
|
"epoch": 2.205493387589013,
|
|
"grad_norm": 0.7456837892532349,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5291,
|
|
"mean_token_accuracy": 0.8383486270904541,
|
|
"num_tokens": 690946910.0,
|
|
"step": 2168
|
|
},
|
|
{
|
|
"epoch": 2.2065106815869786,
|
|
"grad_norm": 0.765623152256012,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5747,
|
|
"mean_token_accuracy": 0.8256553411483765,
|
|
"num_tokens": 691287105.0,
|
|
"step": 2169
|
|
},
|
|
{
|
|
"epoch": 2.207527975584944,
|
|
"grad_norm": 0.8190492987632751,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5537,
|
|
"mean_token_accuracy": 0.830524742603302,
|
|
"num_tokens": 691590813.0,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 2.2085452695829093,
|
|
"grad_norm": 0.7249578833580017,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5574,
|
|
"mean_token_accuracy": 0.8298805952072144,
|
|
"num_tokens": 691912782.0,
|
|
"step": 2171
|
|
},
|
|
{
|
|
"epoch": 2.209562563580875,
|
|
"grad_norm": 0.7654576301574707,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5612,
|
|
"mean_token_accuracy": 0.8291027545928955,
|
|
"num_tokens": 692242294.0,
|
|
"step": 2172
|
|
},
|
|
{
|
|
"epoch": 2.2105798575788405,
|
|
"grad_norm": 0.7766432762145996,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5801,
|
|
"mean_token_accuracy": 0.8241997957229614,
|
|
"num_tokens": 692562541.0,
|
|
"step": 2173
|
|
},
|
|
{
|
|
"epoch": 2.2115971515768056,
|
|
"grad_norm": 0.7897323369979858,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5469,
|
|
"mean_token_accuracy": 0.8327312469482422,
|
|
"num_tokens": 692863692.0,
|
|
"step": 2174
|
|
},
|
|
{
|
|
"epoch": 2.212614445574771,
|
|
"grad_norm": 0.776065468788147,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5496,
|
|
"mean_token_accuracy": 0.8322169780731201,
|
|
"num_tokens": 693172309.0,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"epoch": 2.2136317395727367,
|
|
"grad_norm": 0.8281997442245483,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5655,
|
|
"mean_token_accuracy": 0.8276952505111694,
|
|
"num_tokens": 693494304.0,
|
|
"step": 2176
|
|
},
|
|
{
|
|
"epoch": 2.214649033570702,
|
|
"grad_norm": 0.7335885763168335,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5502,
|
|
"mean_token_accuracy": 0.8332297205924988,
|
|
"num_tokens": 693827204.0,
|
|
"step": 2177
|
|
},
|
|
{
|
|
"epoch": 2.2156663275686674,
|
|
"grad_norm": 0.7174627780914307,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5768,
|
|
"mean_token_accuracy": 0.826240599155426,
|
|
"num_tokens": 694172486.0,
|
|
"step": 2178
|
|
},
|
|
{
|
|
"epoch": 2.2166836215666326,
|
|
"grad_norm": 0.7426119446754456,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5932,
|
|
"mean_token_accuracy": 0.8206398487091064,
|
|
"num_tokens": 694504560.0,
|
|
"step": 2179
|
|
},
|
|
{
|
|
"epoch": 2.217700915564598,
|
|
"grad_norm": 0.7555232048034668,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5824,
|
|
"mean_token_accuracy": 0.8234156966209412,
|
|
"num_tokens": 694817430.0,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 2.2187182095625637,
|
|
"grad_norm": 0.7455035448074341,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5468,
|
|
"mean_token_accuracy": 0.8338122963905334,
|
|
"num_tokens": 695139610.0,
|
|
"step": 2181
|
|
},
|
|
{
|
|
"epoch": 2.219735503560529,
|
|
"grad_norm": 0.7357603311538696,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5474,
|
|
"mean_token_accuracy": 0.8327358961105347,
|
|
"num_tokens": 695455043.0,
|
|
"step": 2182
|
|
},
|
|
{
|
|
"epoch": 2.2207527975584944,
|
|
"grad_norm": 0.7902182936668396,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.568,
|
|
"mean_token_accuracy": 0.8277439475059509,
|
|
"num_tokens": 695788464.0,
|
|
"step": 2183
|
|
},
|
|
{
|
|
"epoch": 2.22177009155646,
|
|
"grad_norm": 0.7493472099304199,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5596,
|
|
"mean_token_accuracy": 0.8297045230865479,
|
|
"num_tokens": 696110054.0,
|
|
"step": 2184
|
|
},
|
|
{
|
|
"epoch": 2.222787385554425,
|
|
"grad_norm": 0.7690534591674805,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.549,
|
|
"mean_token_accuracy": 0.83198481798172,
|
|
"num_tokens": 696419384.0,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"epoch": 2.2238046795523907,
|
|
"grad_norm": 0.7142757177352905,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5398,
|
|
"mean_token_accuracy": 0.834639847278595,
|
|
"num_tokens": 696754841.0,
|
|
"step": 2186
|
|
},
|
|
{
|
|
"epoch": 2.2248219735503563,
|
|
"grad_norm": 0.7481052279472351,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.571,
|
|
"mean_token_accuracy": 0.8269023895263672,
|
|
"num_tokens": 697066242.0,
|
|
"step": 2187
|
|
},
|
|
{
|
|
"epoch": 2.2258392675483214,
|
|
"grad_norm": 0.754620373249054,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5602,
|
|
"mean_token_accuracy": 0.8295397758483887,
|
|
"num_tokens": 697384006.0,
|
|
"step": 2188
|
|
},
|
|
{
|
|
"epoch": 2.226856561546287,
|
|
"grad_norm": 0.8104036450386047,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.588,
|
|
"mean_token_accuracy": 0.8218930959701538,
|
|
"num_tokens": 697695648.0,
|
|
"step": 2189
|
|
},
|
|
{
|
|
"epoch": 2.227873855544252,
|
|
"grad_norm": 0.7504926919937134,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5531,
|
|
"mean_token_accuracy": 0.8311076164245605,
|
|
"num_tokens": 698019018.0,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 2.2288911495422177,
|
|
"grad_norm": 0.7394097447395325,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5661,
|
|
"mean_token_accuracy": 0.8285304307937622,
|
|
"num_tokens": 698339787.0,
|
|
"step": 2191
|
|
},
|
|
{
|
|
"epoch": 2.2299084435401832,
|
|
"grad_norm": 0.7734787464141846,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5569,
|
|
"mean_token_accuracy": 0.8304460048675537,
|
|
"num_tokens": 698644406.0,
|
|
"step": 2192
|
|
},
|
|
{
|
|
"epoch": 2.2309257375381484,
|
|
"grad_norm": 0.7935941815376282,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5445,
|
|
"mean_token_accuracy": 0.8333359956741333,
|
|
"num_tokens": 698949448.0,
|
|
"step": 2193
|
|
},
|
|
{
|
|
"epoch": 2.231943031536114,
|
|
"grad_norm": 0.7313924431800842,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5664,
|
|
"mean_token_accuracy": 0.8283383250236511,
|
|
"num_tokens": 699276343.0,
|
|
"step": 2194
|
|
},
|
|
{
|
|
"epoch": 2.2329603255340795,
|
|
"grad_norm": 0.7338786721229553,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5668,
|
|
"mean_token_accuracy": 0.8266017436981201,
|
|
"num_tokens": 699600680.0,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"epoch": 2.2339776195320447,
|
|
"grad_norm": 0.7657689452171326,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5416,
|
|
"mean_token_accuracy": 0.8337849378585815,
|
|
"num_tokens": 699920412.0,
|
|
"step": 2196
|
|
},
|
|
{
|
|
"epoch": 2.2349949135300102,
|
|
"grad_norm": 0.7452083826065063,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5584,
|
|
"mean_token_accuracy": 0.8301029205322266,
|
|
"num_tokens": 700246268.0,
|
|
"step": 2197
|
|
},
|
|
{
|
|
"epoch": 2.236012207527976,
|
|
"grad_norm": 0.7452763319015503,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5515,
|
|
"mean_token_accuracy": 0.8316724896430969,
|
|
"num_tokens": 700561439.0,
|
|
"step": 2198
|
|
},
|
|
{
|
|
"epoch": 2.237029501525941,
|
|
"grad_norm": 0.7289541959762573,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5429,
|
|
"mean_token_accuracy": 0.8343749046325684,
|
|
"num_tokens": 700904533.0,
|
|
"step": 2199
|
|
},
|
|
{
|
|
"epoch": 2.2380467955239065,
|
|
"grad_norm": 0.7585221529006958,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5777,
|
|
"mean_token_accuracy": 0.8261143565177917,
|
|
"num_tokens": 701220420.0,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 2.2390640895218716,
|
|
"grad_norm": 0.7407345175743103,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5516,
|
|
"mean_token_accuracy": 0.8327363729476929,
|
|
"num_tokens": 701544168.0,
|
|
"step": 2201
|
|
},
|
|
{
|
|
"epoch": 2.240081383519837,
|
|
"grad_norm": 0.8011090159416199,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5686,
|
|
"mean_token_accuracy": 0.8249886631965637,
|
|
"num_tokens": 701866354.0,
|
|
"step": 2202
|
|
},
|
|
{
|
|
"epoch": 2.241098677517803,
|
|
"grad_norm": 0.8079937696456909,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5558,
|
|
"mean_token_accuracy": 0.8320120573043823,
|
|
"num_tokens": 702180586.0,
|
|
"step": 2203
|
|
},
|
|
{
|
|
"epoch": 2.242115971515768,
|
|
"grad_norm": 0.7229354977607727,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5588,
|
|
"mean_token_accuracy": 0.8307468295097351,
|
|
"num_tokens": 702507782.0,
|
|
"step": 2204
|
|
},
|
|
{
|
|
"epoch": 2.2431332655137335,
|
|
"grad_norm": 0.6952229142189026,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5762,
|
|
"mean_token_accuracy": 0.8240323066711426,
|
|
"num_tokens": 702834861.0,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"epoch": 2.244150559511699,
|
|
"grad_norm": 0.7500696182250977,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5647,
|
|
"mean_token_accuracy": 0.8281593322753906,
|
|
"num_tokens": 703173572.0,
|
|
"step": 2206
|
|
},
|
|
{
|
|
"epoch": 2.245167853509664,
|
|
"grad_norm": 0.7257187962532043,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.55,
|
|
"mean_token_accuracy": 0.8321106433868408,
|
|
"num_tokens": 703498507.0,
|
|
"step": 2207
|
|
},
|
|
{
|
|
"epoch": 2.2461851475076298,
|
|
"grad_norm": 0.7724878191947937,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5602,
|
|
"mean_token_accuracy": 0.8294163346290588,
|
|
"num_tokens": 703823540.0,
|
|
"step": 2208
|
|
},
|
|
{
|
|
"epoch": 2.2472024415055953,
|
|
"grad_norm": 0.816226065158844,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5618,
|
|
"mean_token_accuracy": 0.8298070430755615,
|
|
"num_tokens": 704156689.0,
|
|
"step": 2209
|
|
},
|
|
{
|
|
"epoch": 2.2482197355035605,
|
|
"grad_norm": 0.7845411896705627,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5501,
|
|
"mean_token_accuracy": 0.8310840725898743,
|
|
"num_tokens": 704467781.0,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 2.249237029501526,
|
|
"grad_norm": 0.7413820624351501,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5589,
|
|
"mean_token_accuracy": 0.8295588493347168,
|
|
"num_tokens": 704805338.0,
|
|
"step": 2211
|
|
},
|
|
{
|
|
"epoch": 2.250254323499491,
|
|
"grad_norm": 0.7710822224617004,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5594,
|
|
"mean_token_accuracy": 0.8290252089500427,
|
|
"num_tokens": 705135627.0,
|
|
"step": 2212
|
|
},
|
|
{
|
|
"epoch": 2.2512716174974567,
|
|
"grad_norm": 0.7354344129562378,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5683,
|
|
"mean_token_accuracy": 0.8267947435379028,
|
|
"num_tokens": 705443703.0,
|
|
"step": 2213
|
|
},
|
|
{
|
|
"epoch": 2.2522889114954223,
|
|
"grad_norm": 0.7690240740776062,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5453,
|
|
"mean_token_accuracy": 0.8332376480102539,
|
|
"num_tokens": 705767052.0,
|
|
"step": 2214
|
|
},
|
|
{
|
|
"epoch": 2.2533062054933874,
|
|
"grad_norm": 0.7489812970161438,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5578,
|
|
"mean_token_accuracy": 0.8299515247344971,
|
|
"num_tokens": 706101095.0,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"epoch": 2.254323499491353,
|
|
"grad_norm": 0.745998740196228,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5548,
|
|
"mean_token_accuracy": 0.8307999968528748,
|
|
"num_tokens": 706423567.0,
|
|
"step": 2216
|
|
},
|
|
{
|
|
"epoch": 2.2553407934893186,
|
|
"grad_norm": 0.7590211629867554,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.555,
|
|
"mean_token_accuracy": 0.8313041925430298,
|
|
"num_tokens": 706747002.0,
|
|
"step": 2217
|
|
},
|
|
{
|
|
"epoch": 2.2563580874872837,
|
|
"grad_norm": 0.7307959794998169,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5516,
|
|
"mean_token_accuracy": 0.8323990106582642,
|
|
"num_tokens": 707083769.0,
|
|
"step": 2218
|
|
},
|
|
{
|
|
"epoch": 2.2573753814852493,
|
|
"grad_norm": 0.7042139172554016,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5486,
|
|
"mean_token_accuracy": 0.8334348797798157,
|
|
"num_tokens": 707407475.0,
|
|
"step": 2219
|
|
},
|
|
{
|
|
"epoch": 2.258392675483215,
|
|
"grad_norm": 0.7372254729270935,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5702,
|
|
"mean_token_accuracy": 0.8272490501403809,
|
|
"num_tokens": 707731659.0,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 2.25940996948118,
|
|
"grad_norm": 0.7876691222190857,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5387,
|
|
"mean_token_accuracy": 0.835241436958313,
|
|
"num_tokens": 708049516.0,
|
|
"step": 2221
|
|
},
|
|
{
|
|
"epoch": 2.2604272634791456,
|
|
"grad_norm": 0.7591989040374756,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5491,
|
|
"mean_token_accuracy": 0.8333436846733093,
|
|
"num_tokens": 708365608.0,
|
|
"step": 2222
|
|
},
|
|
{
|
|
"epoch": 2.2614445574771107,
|
|
"grad_norm": 0.7744635939598083,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5323,
|
|
"mean_token_accuracy": 0.8359612822532654,
|
|
"num_tokens": 708668919.0,
|
|
"step": 2223
|
|
},
|
|
{
|
|
"epoch": 2.2624618514750763,
|
|
"grad_norm": 0.788546621799469,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5134,
|
|
"mean_token_accuracy": 0.8418200016021729,
|
|
"num_tokens": 708978992.0,
|
|
"step": 2224
|
|
},
|
|
{
|
|
"epoch": 2.263479145473042,
|
|
"grad_norm": 0.7489559650421143,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5389,
|
|
"mean_token_accuracy": 0.835007905960083,
|
|
"num_tokens": 709289990.0,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"epoch": 2.264496439471007,
|
|
"grad_norm": 0.7431305050849915,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5828,
|
|
"mean_token_accuracy": 0.8231770992279053,
|
|
"num_tokens": 709620493.0,
|
|
"step": 2226
|
|
},
|
|
{
|
|
"epoch": 2.2655137334689726,
|
|
"grad_norm": 0.9410425424575806,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5563,
|
|
"mean_token_accuracy": 0.8315439224243164,
|
|
"num_tokens": 709945354.0,
|
|
"step": 2227
|
|
},
|
|
{
|
|
"epoch": 2.266531027466938,
|
|
"grad_norm": 0.7582634091377258,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5329,
|
|
"mean_token_accuracy": 0.8364298343658447,
|
|
"num_tokens": 710261263.0,
|
|
"step": 2228
|
|
},
|
|
{
|
|
"epoch": 2.2675483214649033,
|
|
"grad_norm": 0.7781898975372314,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5248,
|
|
"mean_token_accuracy": 0.8395653367042542,
|
|
"num_tokens": 710579962.0,
|
|
"step": 2229
|
|
},
|
|
{
|
|
"epoch": 2.268565615462869,
|
|
"grad_norm": 0.7267823815345764,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5575,
|
|
"mean_token_accuracy": 0.8301853537559509,
|
|
"num_tokens": 710918771.0,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 2.2695829094608344,
|
|
"grad_norm": 0.7535921931266785,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5492,
|
|
"mean_token_accuracy": 0.8331706523895264,
|
|
"num_tokens": 711248181.0,
|
|
"step": 2231
|
|
},
|
|
{
|
|
"epoch": 2.2706002034587995,
|
|
"grad_norm": 0.7851473093032837,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5497,
|
|
"mean_token_accuracy": 0.8320990800857544,
|
|
"num_tokens": 711562515.0,
|
|
"step": 2232
|
|
},
|
|
{
|
|
"epoch": 2.271617497456765,
|
|
"grad_norm": 0.7720229625701904,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5521,
|
|
"mean_token_accuracy": 0.8314772248268127,
|
|
"num_tokens": 711864605.0,
|
|
"step": 2233
|
|
},
|
|
{
|
|
"epoch": 2.2726347914547302,
|
|
"grad_norm": 0.757318913936615,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5887,
|
|
"mean_token_accuracy": 0.8224295377731323,
|
|
"num_tokens": 712202336.0,
|
|
"step": 2234
|
|
},
|
|
{
|
|
"epoch": 2.273652085452696,
|
|
"grad_norm": 0.8308467864990234,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5768,
|
|
"mean_token_accuracy": 0.8255714178085327,
|
|
"num_tokens": 712510845.0,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"epoch": 2.2746693794506614,
|
|
"grad_norm": 0.7796366810798645,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5361,
|
|
"mean_token_accuracy": 0.8357149362564087,
|
|
"num_tokens": 712815226.0,
|
|
"step": 2236
|
|
},
|
|
{
|
|
"epoch": 2.2756866734486265,
|
|
"grad_norm": 0.796631395816803,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.553,
|
|
"mean_token_accuracy": 0.8310263156890869,
|
|
"num_tokens": 713136477.0,
|
|
"step": 2237
|
|
},
|
|
{
|
|
"epoch": 2.276703967446592,
|
|
"grad_norm": 0.816248893737793,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.541,
|
|
"mean_token_accuracy": 0.8342663645744324,
|
|
"num_tokens": 713456971.0,
|
|
"step": 2238
|
|
},
|
|
{
|
|
"epoch": 2.2777212614445577,
|
|
"grad_norm": 0.7735084891319275,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5706,
|
|
"mean_token_accuracy": 0.8262051939964294,
|
|
"num_tokens": 713779256.0,
|
|
"step": 2239
|
|
},
|
|
{
|
|
"epoch": 2.278738555442523,
|
|
"grad_norm": 0.7764680981636047,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5647,
|
|
"mean_token_accuracy": 0.8285849094390869,
|
|
"num_tokens": 714098535.0,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 2.2797558494404884,
|
|
"grad_norm": 0.8629921078681946,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.566,
|
|
"mean_token_accuracy": 0.8270991444587708,
|
|
"num_tokens": 714415903.0,
|
|
"step": 2241
|
|
},
|
|
{
|
|
"epoch": 2.280773143438454,
|
|
"grad_norm": 0.7582558393478394,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5654,
|
|
"mean_token_accuracy": 0.8276588916778564,
|
|
"num_tokens": 714721963.0,
|
|
"step": 2242
|
|
},
|
|
{
|
|
"epoch": 2.281790437436419,
|
|
"grad_norm": 0.7453955411911011,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5606,
|
|
"mean_token_accuracy": 0.829574704170227,
|
|
"num_tokens": 715036265.0,
|
|
"step": 2243
|
|
},
|
|
{
|
|
"epoch": 2.2828077314343846,
|
|
"grad_norm": 0.7843350768089294,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5634,
|
|
"mean_token_accuracy": 0.8280022144317627,
|
|
"num_tokens": 715351692.0,
|
|
"step": 2244
|
|
},
|
|
{
|
|
"epoch": 2.2838250254323498,
|
|
"grad_norm": 0.8674906492233276,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5625,
|
|
"mean_token_accuracy": 0.8287839889526367,
|
|
"num_tokens": 715667028.0,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"epoch": 2.2848423194303153,
|
|
"grad_norm": 0.8587877154350281,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5519,
|
|
"mean_token_accuracy": 0.8311952948570251,
|
|
"num_tokens": 715978519.0,
|
|
"step": 2246
|
|
},
|
|
{
|
|
"epoch": 2.285859613428281,
|
|
"grad_norm": 0.7901305556297302,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.575,
|
|
"mean_token_accuracy": 0.8253798484802246,
|
|
"num_tokens": 716294789.0,
|
|
"step": 2247
|
|
},
|
|
{
|
|
"epoch": 2.286876907426246,
|
|
"grad_norm": 0.7929912209510803,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5469,
|
|
"mean_token_accuracy": 0.834429144859314,
|
|
"num_tokens": 716601971.0,
|
|
"step": 2248
|
|
},
|
|
{
|
|
"epoch": 2.2878942014242116,
|
|
"grad_norm": 0.7639389038085938,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5503,
|
|
"mean_token_accuracy": 0.8323620557785034,
|
|
"num_tokens": 716916644.0,
|
|
"step": 2249
|
|
},
|
|
{
|
|
"epoch": 2.288911495422177,
|
|
"grad_norm": 0.8073030114173889,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5402,
|
|
"mean_token_accuracy": 0.8344812989234924,
|
|
"num_tokens": 717218108.0,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 2.2899287894201423,
|
|
"grad_norm": 0.7753877639770508,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5504,
|
|
"mean_token_accuracy": 0.8317732810974121,
|
|
"num_tokens": 717517090.0,
|
|
"step": 2251
|
|
},
|
|
{
|
|
"epoch": 2.290946083418108,
|
|
"grad_norm": 0.7521124482154846,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5636,
|
|
"mean_token_accuracy": 0.8278107643127441,
|
|
"num_tokens": 717846642.0,
|
|
"step": 2252
|
|
},
|
|
{
|
|
"epoch": 2.2919633774160735,
|
|
"grad_norm": 0.8184477090835571,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5436,
|
|
"mean_token_accuracy": 0.8333908319473267,
|
|
"num_tokens": 718173696.0,
|
|
"step": 2253
|
|
},
|
|
{
|
|
"epoch": 2.2929806714140386,
|
|
"grad_norm": 0.7861326336860657,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5818,
|
|
"mean_token_accuracy": 0.8234462141990662,
|
|
"num_tokens": 718494164.0,
|
|
"step": 2254
|
|
},
|
|
{
|
|
"epoch": 2.293997965412004,
|
|
"grad_norm": 0.7486854195594788,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5568,
|
|
"mean_token_accuracy": 0.8295444846153259,
|
|
"num_tokens": 718815468.0,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"epoch": 2.2950152594099693,
|
|
"grad_norm": 0.7647976875305176,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5812,
|
|
"mean_token_accuracy": 0.8253200650215149,
|
|
"num_tokens": 719136494.0,
|
|
"step": 2256
|
|
},
|
|
{
|
|
"epoch": 2.296032553407935,
|
|
"grad_norm": 0.7757217288017273,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5685,
|
|
"mean_token_accuracy": 0.8283142447471619,
|
|
"num_tokens": 719474769.0,
|
|
"step": 2257
|
|
},
|
|
{
|
|
"epoch": 2.2970498474059005,
|
|
"grad_norm": 0.7896122932434082,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5735,
|
|
"mean_token_accuracy": 0.8248286843299866,
|
|
"num_tokens": 719775158.0,
|
|
"step": 2258
|
|
},
|
|
{
|
|
"epoch": 2.2980671414038656,
|
|
"grad_norm": 0.7767871022224426,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5604,
|
|
"mean_token_accuracy": 0.8307970762252808,
|
|
"num_tokens": 720075628.0,
|
|
"step": 2259
|
|
},
|
|
{
|
|
"epoch": 2.299084435401831,
|
|
"grad_norm": 0.7573894262313843,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5805,
|
|
"mean_token_accuracy": 0.8246124982833862,
|
|
"num_tokens": 720390860.0,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 2.3001017293997967,
|
|
"grad_norm": 0.8404419422149658,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5603,
|
|
"mean_token_accuracy": 0.8311671018600464,
|
|
"num_tokens": 720717089.0,
|
|
"step": 2261
|
|
},
|
|
{
|
|
"epoch": 2.301119023397762,
|
|
"grad_norm": 0.7757552266120911,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5428,
|
|
"mean_token_accuracy": 0.833970308303833,
|
|
"num_tokens": 721053638.0,
|
|
"step": 2262
|
|
},
|
|
{
|
|
"epoch": 2.3021363173957274,
|
|
"grad_norm": 0.7333695888519287,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5527,
|
|
"mean_token_accuracy": 0.8311830759048462,
|
|
"num_tokens": 721379440.0,
|
|
"step": 2263
|
|
},
|
|
{
|
|
"epoch": 2.303153611393693,
|
|
"grad_norm": 0.7551787495613098,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5587,
|
|
"mean_token_accuracy": 0.8288209438323975,
|
|
"num_tokens": 721691506.0,
|
|
"step": 2264
|
|
},
|
|
{
|
|
"epoch": 2.304170905391658,
|
|
"grad_norm": 0.797990083694458,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5562,
|
|
"mean_token_accuracy": 0.8305119872093201,
|
|
"num_tokens": 721991138.0,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"epoch": 2.3051881993896237,
|
|
"grad_norm": 0.7845953702926636,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5655,
|
|
"mean_token_accuracy": 0.8282233476638794,
|
|
"num_tokens": 722325029.0,
|
|
"step": 2266
|
|
},
|
|
{
|
|
"epoch": 2.306205493387589,
|
|
"grad_norm": 0.7707501649856567,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5498,
|
|
"mean_token_accuracy": 0.8319345116615295,
|
|
"num_tokens": 722639326.0,
|
|
"step": 2267
|
|
},
|
|
{
|
|
"epoch": 2.3072227873855544,
|
|
"grad_norm": 0.7780898213386536,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.573,
|
|
"mean_token_accuracy": 0.8259189128875732,
|
|
"num_tokens": 722945492.0,
|
|
"step": 2268
|
|
},
|
|
{
|
|
"epoch": 2.30824008138352,
|
|
"grad_norm": 0.8189555406570435,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5916,
|
|
"mean_token_accuracy": 0.8206167817115784,
|
|
"num_tokens": 723262253.0,
|
|
"step": 2269
|
|
},
|
|
{
|
|
"epoch": 2.309257375381485,
|
|
"grad_norm": 0.7804981470108032,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5813,
|
|
"mean_token_accuracy": 0.8242859244346619,
|
|
"num_tokens": 723575991.0,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 2.3102746693794507,
|
|
"grad_norm": 0.736781656742096,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5803,
|
|
"mean_token_accuracy": 0.8251981735229492,
|
|
"num_tokens": 723892353.0,
|
|
"step": 2271
|
|
},
|
|
{
|
|
"epoch": 2.311291963377416,
|
|
"grad_norm": 0.7337729334831238,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5471,
|
|
"mean_token_accuracy": 0.8330903053283691,
|
|
"num_tokens": 724220225.0,
|
|
"step": 2272
|
|
},
|
|
{
|
|
"epoch": 2.3123092573753814,
|
|
"grad_norm": 0.7931007742881775,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.567,
|
|
"mean_token_accuracy": 0.8291524648666382,
|
|
"num_tokens": 724542617.0,
|
|
"step": 2273
|
|
},
|
|
{
|
|
"epoch": 2.313326551373347,
|
|
"grad_norm": 0.9265283346176147,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5478,
|
|
"mean_token_accuracy": 0.8326225876808167,
|
|
"num_tokens": 724863471.0,
|
|
"step": 2274
|
|
},
|
|
{
|
|
"epoch": 2.3143438453713125,
|
|
"grad_norm": 0.7294421195983887,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5711,
|
|
"mean_token_accuracy": 0.8275586366653442,
|
|
"num_tokens": 725205147.0,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"epoch": 2.3153611393692777,
|
|
"grad_norm": 0.7400902509689331,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5549,
|
|
"mean_token_accuracy": 0.8316060304641724,
|
|
"num_tokens": 725544049.0,
|
|
"step": 2276
|
|
},
|
|
{
|
|
"epoch": 2.3163784333672433,
|
|
"grad_norm": 0.7395307421684265,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5586,
|
|
"mean_token_accuracy": 0.8303009271621704,
|
|
"num_tokens": 725863765.0,
|
|
"step": 2277
|
|
},
|
|
{
|
|
"epoch": 2.3173957273652084,
|
|
"grad_norm": 0.7711753845214844,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5668,
|
|
"mean_token_accuracy": 0.8282003998756409,
|
|
"num_tokens": 726172877.0,
|
|
"step": 2278
|
|
},
|
|
{
|
|
"epoch": 2.318413021363174,
|
|
"grad_norm": 0.7608178853988647,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5586,
|
|
"mean_token_accuracy": 0.830217182636261,
|
|
"num_tokens": 726475413.0,
|
|
"step": 2279
|
|
},
|
|
{
|
|
"epoch": 2.3194303153611395,
|
|
"grad_norm": 0.7279531359672546,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.58,
|
|
"mean_token_accuracy": 0.8248142600059509,
|
|
"num_tokens": 726809779.0,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 2.3204476093591047,
|
|
"grad_norm": 0.7574766874313354,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5486,
|
|
"mean_token_accuracy": 0.8328359127044678,
|
|
"num_tokens": 727125760.0,
|
|
"step": 2281
|
|
},
|
|
{
|
|
"epoch": 2.3214649033570702,
|
|
"grad_norm": 0.7688292860984802,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5599,
|
|
"mean_token_accuracy": 0.829384446144104,
|
|
"num_tokens": 727438075.0,
|
|
"step": 2282
|
|
},
|
|
{
|
|
"epoch": 2.3224821973550354,
|
|
"grad_norm": 0.7651029229164124,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5641,
|
|
"mean_token_accuracy": 0.8305647969245911,
|
|
"num_tokens": 727764115.0,
|
|
"step": 2283
|
|
},
|
|
{
|
|
"epoch": 2.323499491353001,
|
|
"grad_norm": 0.7398386597633362,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.557,
|
|
"mean_token_accuracy": 0.8299177289009094,
|
|
"num_tokens": 728092211.0,
|
|
"step": 2284
|
|
},
|
|
{
|
|
"epoch": 2.3245167853509665,
|
|
"grad_norm": 0.7717942595481873,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5659,
|
|
"mean_token_accuracy": 0.8286029100418091,
|
|
"num_tokens": 728425475.0,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"epoch": 2.325534079348932,
|
|
"grad_norm": 0.8052406907081604,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5704,
|
|
"mean_token_accuracy": 0.826826274394989,
|
|
"num_tokens": 728738679.0,
|
|
"step": 2286
|
|
},
|
|
{
|
|
"epoch": 2.326551373346897,
|
|
"grad_norm": 0.8091646432876587,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5878,
|
|
"mean_token_accuracy": 0.8217065930366516,
|
|
"num_tokens": 729077322.0,
|
|
"step": 2287
|
|
},
|
|
{
|
|
"epoch": 2.327568667344863,
|
|
"grad_norm": 0.7941763401031494,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5498,
|
|
"mean_token_accuracy": 0.8312726616859436,
|
|
"num_tokens": 729376355.0,
|
|
"step": 2288
|
|
},
|
|
{
|
|
"epoch": 2.328585961342828,
|
|
"grad_norm": 0.7231628894805908,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5562,
|
|
"mean_token_accuracy": 0.830712080001831,
|
|
"num_tokens": 729697233.0,
|
|
"step": 2289
|
|
},
|
|
{
|
|
"epoch": 2.3296032553407935,
|
|
"grad_norm": 0.7471105456352234,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5708,
|
|
"mean_token_accuracy": 0.8266077041625977,
|
|
"num_tokens": 730009711.0,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 2.330620549338759,
|
|
"grad_norm": 0.7873372435569763,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.583,
|
|
"mean_token_accuracy": 0.8235731720924377,
|
|
"num_tokens": 730315273.0,
|
|
"step": 2291
|
|
},
|
|
{
|
|
"epoch": 2.331637843336724,
|
|
"grad_norm": 0.7682083249092102,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5409,
|
|
"mean_token_accuracy": 0.8349224328994751,
|
|
"num_tokens": 730622310.0,
|
|
"step": 2292
|
|
},
|
|
{
|
|
"epoch": 2.3326551373346898,
|
|
"grad_norm": 0.7594954967498779,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5683,
|
|
"mean_token_accuracy": 0.8270331621170044,
|
|
"num_tokens": 730938482.0,
|
|
"step": 2293
|
|
},
|
|
{
|
|
"epoch": 2.333672431332655,
|
|
"grad_norm": 0.7214847803115845,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5796,
|
|
"mean_token_accuracy": 0.8242443799972534,
|
|
"num_tokens": 731256654.0,
|
|
"step": 2294
|
|
},
|
|
{
|
|
"epoch": 2.3346897253306205,
|
|
"grad_norm": 0.7712514400482178,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5403,
|
|
"mean_token_accuracy": 0.8341904282569885,
|
|
"num_tokens": 731560273.0,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"epoch": 2.335707019328586,
|
|
"grad_norm": 0.7314186096191406,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5248,
|
|
"mean_token_accuracy": 0.8387254476547241,
|
|
"num_tokens": 731882252.0,
|
|
"step": 2296
|
|
},
|
|
{
|
|
"epoch": 2.3367243133265516,
|
|
"grad_norm": 0.7417098879814148,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5524,
|
|
"mean_token_accuracy": 0.8319731950759888,
|
|
"num_tokens": 732205142.0,
|
|
"step": 2297
|
|
},
|
|
{
|
|
"epoch": 2.3377416073245167,
|
|
"grad_norm": 0.7539238333702087,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5726,
|
|
"mean_token_accuracy": 0.8251879215240479,
|
|
"num_tokens": 732523658.0,
|
|
"step": 2298
|
|
},
|
|
{
|
|
"epoch": 2.3387589013224823,
|
|
"grad_norm": 0.7640252113342285,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5674,
|
|
"mean_token_accuracy": 0.8272615671157837,
|
|
"num_tokens": 732842797.0,
|
|
"step": 2299
|
|
},
|
|
{
|
|
"epoch": 2.3397761953204474,
|
|
"grad_norm": 0.7650970816612244,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5835,
|
|
"mean_token_accuracy": 0.8229362964630127,
|
|
"num_tokens": 733168024.0,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 2.340793489318413,
|
|
"grad_norm": 0.733159601688385,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5373,
|
|
"mean_token_accuracy": 0.8358616828918457,
|
|
"num_tokens": 733495169.0,
|
|
"step": 2301
|
|
},
|
|
{
|
|
"epoch": 2.3418107833163786,
|
|
"grad_norm": 0.7942311763763428,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5528,
|
|
"mean_token_accuracy": 0.8320826888084412,
|
|
"num_tokens": 733783519.0,
|
|
"step": 2302
|
|
},
|
|
{
|
|
"epoch": 2.3428280773143437,
|
|
"grad_norm": 0.771440863609314,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5461,
|
|
"mean_token_accuracy": 0.8328754901885986,
|
|
"num_tokens": 734114541.0,
|
|
"step": 2303
|
|
},
|
|
{
|
|
"epoch": 2.3438453713123093,
|
|
"grad_norm": 0.7965013980865479,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5776,
|
|
"mean_token_accuracy": 0.824408233165741,
|
|
"num_tokens": 734444293.0,
|
|
"step": 2304
|
|
},
|
|
{
|
|
"epoch": 2.3448626653102744,
|
|
"grad_norm": 0.7346433401107788,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5424,
|
|
"mean_token_accuracy": 0.8340154886245728,
|
|
"num_tokens": 734747917.0,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"epoch": 2.34587995930824,
|
|
"grad_norm": 0.7440131902694702,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5341,
|
|
"mean_token_accuracy": 0.8372299075126648,
|
|
"num_tokens": 735069410.0,
|
|
"step": 2306
|
|
},
|
|
{
|
|
"epoch": 2.3468972533062056,
|
|
"grad_norm": 0.7549577355384827,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5331,
|
|
"mean_token_accuracy": 0.836683452129364,
|
|
"num_tokens": 735380532.0,
|
|
"step": 2307
|
|
},
|
|
{
|
|
"epoch": 2.347914547304171,
|
|
"grad_norm": 0.7508228421211243,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5642,
|
|
"mean_token_accuracy": 0.828955888748169,
|
|
"num_tokens": 735689763.0,
|
|
"step": 2308
|
|
},
|
|
{
|
|
"epoch": 2.3489318413021363,
|
|
"grad_norm": 0.7547523975372314,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5164,
|
|
"mean_token_accuracy": 0.8409914374351501,
|
|
"num_tokens": 736000433.0,
|
|
"step": 2309
|
|
},
|
|
{
|
|
"epoch": 2.349949135300102,
|
|
"grad_norm": 0.7216244339942932,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5446,
|
|
"mean_token_accuracy": 0.8328282833099365,
|
|
"num_tokens": 736329409.0,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 2.350966429298067,
|
|
"grad_norm": 0.7756535410881042,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5586,
|
|
"mean_token_accuracy": 0.8296158313751221,
|
|
"num_tokens": 736642712.0,
|
|
"step": 2311
|
|
},
|
|
{
|
|
"epoch": 2.3519837232960326,
|
|
"grad_norm": 0.7554481625556946,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5572,
|
|
"mean_token_accuracy": 0.8292926549911499,
|
|
"num_tokens": 736956462.0,
|
|
"step": 2312
|
|
},
|
|
{
|
|
"epoch": 2.353001017293998,
|
|
"grad_norm": 0.6937792897224426,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5458,
|
|
"mean_token_accuracy": 0.8327944874763489,
|
|
"num_tokens": 737279301.0,
|
|
"step": 2313
|
|
},
|
|
{
|
|
"epoch": 2.3540183112919633,
|
|
"grad_norm": 0.7226637005805969,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5526,
|
|
"mean_token_accuracy": 0.8313299417495728,
|
|
"num_tokens": 737617291.0,
|
|
"step": 2314
|
|
},
|
|
{
|
|
"epoch": 2.355035605289929,
|
|
"grad_norm": 0.7673088312149048,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5528,
|
|
"mean_token_accuracy": 0.8323171138763428,
|
|
"num_tokens": 737938881.0,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"epoch": 2.356052899287894,
|
|
"grad_norm": 0.7496564984321594,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5543,
|
|
"mean_token_accuracy": 0.8294878005981445,
|
|
"num_tokens": 738254476.0,
|
|
"step": 2316
|
|
},
|
|
{
|
|
"epoch": 2.3570701932858595,
|
|
"grad_norm": 0.7185983657836914,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5477,
|
|
"mean_token_accuracy": 0.83353590965271,
|
|
"num_tokens": 738587235.0,
|
|
"step": 2317
|
|
},
|
|
{
|
|
"epoch": 2.358087487283825,
|
|
"grad_norm": 0.7741244435310364,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5403,
|
|
"mean_token_accuracy": 0.8346359729766846,
|
|
"num_tokens": 738903821.0,
|
|
"step": 2318
|
|
},
|
|
{
|
|
"epoch": 2.3591047812817907,
|
|
"grad_norm": 0.7434374094009399,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5672,
|
|
"mean_token_accuracy": 0.8271213173866272,
|
|
"num_tokens": 739208932.0,
|
|
"step": 2319
|
|
},
|
|
{
|
|
"epoch": 2.360122075279756,
|
|
"grad_norm": 0.768464207649231,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5539,
|
|
"mean_token_accuracy": 0.8318808078765869,
|
|
"num_tokens": 739529429.0,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 2.3611393692777214,
|
|
"grad_norm": 0.8427668809890747,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.543,
|
|
"mean_token_accuracy": 0.8341370224952698,
|
|
"num_tokens": 739826514.0,
|
|
"step": 2321
|
|
},
|
|
{
|
|
"epoch": 2.3621566632756865,
|
|
"grad_norm": 0.7625221610069275,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5649,
|
|
"mean_token_accuracy": 0.8287997245788574,
|
|
"num_tokens": 740139289.0,
|
|
"step": 2322
|
|
},
|
|
{
|
|
"epoch": 2.363173957273652,
|
|
"grad_norm": 0.7757455706596375,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5473,
|
|
"mean_token_accuracy": 0.8334740400314331,
|
|
"num_tokens": 740449970.0,
|
|
"step": 2323
|
|
},
|
|
{
|
|
"epoch": 2.3641912512716177,
|
|
"grad_norm": 0.7737842798233032,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5559,
|
|
"mean_token_accuracy": 0.8300380110740662,
|
|
"num_tokens": 740754163.0,
|
|
"step": 2324
|
|
},
|
|
{
|
|
"epoch": 2.365208545269583,
|
|
"grad_norm": 0.7863881587982178,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5445,
|
|
"mean_token_accuracy": 0.8350942730903625,
|
|
"num_tokens": 741074703.0,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"epoch": 2.3662258392675484,
|
|
"grad_norm": 0.7454284429550171,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5426,
|
|
"mean_token_accuracy": 0.8342845439910889,
|
|
"num_tokens": 741403525.0,
|
|
"step": 2326
|
|
},
|
|
{
|
|
"epoch": 2.3672431332655135,
|
|
"grad_norm": 0.7511443495750427,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5245,
|
|
"mean_token_accuracy": 0.8395842909812927,
|
|
"num_tokens": 741728453.0,
|
|
"step": 2327
|
|
},
|
|
{
|
|
"epoch": 2.368260427263479,
|
|
"grad_norm": 0.7700026035308838,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5472,
|
|
"mean_token_accuracy": 0.8323467373847961,
|
|
"num_tokens": 742045389.0,
|
|
"step": 2328
|
|
},
|
|
{
|
|
"epoch": 2.3692777212614446,
|
|
"grad_norm": 0.7246688604354858,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5457,
|
|
"mean_token_accuracy": 0.8339344263076782,
|
|
"num_tokens": 742355845.0,
|
|
"step": 2329
|
|
},
|
|
{
|
|
"epoch": 2.3702950152594098,
|
|
"grad_norm": 0.739668071269989,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.569,
|
|
"mean_token_accuracy": 0.8269882202148438,
|
|
"num_tokens": 742671682.0,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 2.3713123092573754,
|
|
"grad_norm": 0.7679663300514221,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5705,
|
|
"mean_token_accuracy": 0.8265938758850098,
|
|
"num_tokens": 742999389.0,
|
|
"step": 2331
|
|
},
|
|
{
|
|
"epoch": 2.372329603255341,
|
|
"grad_norm": 0.7775615453720093,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5657,
|
|
"mean_token_accuracy": 0.8285351395606995,
|
|
"num_tokens": 743328436.0,
|
|
"step": 2332
|
|
},
|
|
{
|
|
"epoch": 2.373346897253306,
|
|
"grad_norm": 0.7559525966644287,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5585,
|
|
"mean_token_accuracy": 0.8302074670791626,
|
|
"num_tokens": 743642776.0,
|
|
"step": 2333
|
|
},
|
|
{
|
|
"epoch": 2.3743641912512716,
|
|
"grad_norm": 0.7330087423324585,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5466,
|
|
"mean_token_accuracy": 0.8330814838409424,
|
|
"num_tokens": 743962306.0,
|
|
"step": 2334
|
|
},
|
|
{
|
|
"epoch": 2.375381485249237,
|
|
"grad_norm": 0.779496431350708,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5759,
|
|
"mean_token_accuracy": 0.8268112540245056,
|
|
"num_tokens": 744277189.0,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"epoch": 2.3763987792472023,
|
|
"grad_norm": 0.7355311512947083,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5501,
|
|
"mean_token_accuracy": 0.8319157361984253,
|
|
"num_tokens": 744602450.0,
|
|
"step": 2336
|
|
},
|
|
{
|
|
"epoch": 2.377416073245168,
|
|
"grad_norm": 0.7220591306686401,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5505,
|
|
"mean_token_accuracy": 0.83145672082901,
|
|
"num_tokens": 744941184.0,
|
|
"step": 2337
|
|
},
|
|
{
|
|
"epoch": 2.378433367243133,
|
|
"grad_norm": 0.7432832717895508,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5545,
|
|
"mean_token_accuracy": 0.8296507596969604,
|
|
"num_tokens": 745273816.0,
|
|
"step": 2338
|
|
},
|
|
{
|
|
"epoch": 2.3794506612410986,
|
|
"grad_norm": 0.8230165243148804,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5739,
|
|
"mean_token_accuracy": 0.8256447315216064,
|
|
"num_tokens": 745600889.0,
|
|
"step": 2339
|
|
},
|
|
{
|
|
"epoch": 2.380467955239064,
|
|
"grad_norm": 0.760030210018158,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5286,
|
|
"mean_token_accuracy": 0.8372802734375,
|
|
"num_tokens": 745931100.0,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 2.3814852492370293,
|
|
"grad_norm": 0.7500247359275818,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5551,
|
|
"mean_token_accuracy": 0.830858588218689,
|
|
"num_tokens": 746255957.0,
|
|
"step": 2341
|
|
},
|
|
{
|
|
"epoch": 2.382502543234995,
|
|
"grad_norm": 0.7601770758628845,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5581,
|
|
"mean_token_accuracy": 0.8307530879974365,
|
|
"num_tokens": 746589548.0,
|
|
"step": 2342
|
|
},
|
|
{
|
|
"epoch": 2.3835198372329605,
|
|
"grad_norm": 0.7895753979682922,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5461,
|
|
"mean_token_accuracy": 0.8327901363372803,
|
|
"num_tokens": 746894866.0,
|
|
"step": 2343
|
|
},
|
|
{
|
|
"epoch": 2.3845371312309256,
|
|
"grad_norm": 0.763584554195404,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5408,
|
|
"mean_token_accuracy": 0.8359324336051941,
|
|
"num_tokens": 747223779.0,
|
|
"step": 2344
|
|
},
|
|
{
|
|
"epoch": 2.385554425228891,
|
|
"grad_norm": 0.7605777978897095,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5353,
|
|
"mean_token_accuracy": 0.8366625308990479,
|
|
"num_tokens": 747533998.0,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"epoch": 2.3865717192268567,
|
|
"grad_norm": 0.8445054888725281,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5972,
|
|
"mean_token_accuracy": 0.819532573223114,
|
|
"num_tokens": 747850725.0,
|
|
"step": 2346
|
|
},
|
|
{
|
|
"epoch": 2.387589013224822,
|
|
"grad_norm": 0.8237773776054382,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5464,
|
|
"mean_token_accuracy": 0.8330358862876892,
|
|
"num_tokens": 748172245.0,
|
|
"step": 2347
|
|
},
|
|
{
|
|
"epoch": 2.3886063072227874,
|
|
"grad_norm": 0.9143764972686768,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5466,
|
|
"mean_token_accuracy": 0.8333683013916016,
|
|
"num_tokens": 748476090.0,
|
|
"step": 2348
|
|
},
|
|
{
|
|
"epoch": 2.3896236012207526,
|
|
"grad_norm": 0.7322075963020325,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5673,
|
|
"mean_token_accuracy": 0.827389121055603,
|
|
"num_tokens": 748802917.0,
|
|
"step": 2349
|
|
},
|
|
{
|
|
"epoch": 2.390640895218718,
|
|
"grad_norm": 0.7762848138809204,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5828,
|
|
"mean_token_accuracy": 0.824276328086853,
|
|
"num_tokens": 749117584.0,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 2.3916581892166837,
|
|
"grad_norm": 0.7897521257400513,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5688,
|
|
"mean_token_accuracy": 0.8273171186447144,
|
|
"num_tokens": 749440693.0,
|
|
"step": 2351
|
|
},
|
|
{
|
|
"epoch": 2.392675483214649,
|
|
"grad_norm": 0.8838954567909241,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5376,
|
|
"mean_token_accuracy": 0.8362421989440918,
|
|
"num_tokens": 749746118.0,
|
|
"step": 2352
|
|
},
|
|
{
|
|
"epoch": 2.3936927772126144,
|
|
"grad_norm": 0.7455595135688782,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5442,
|
|
"mean_token_accuracy": 0.8335433602333069,
|
|
"num_tokens": 750064386.0,
|
|
"step": 2353
|
|
},
|
|
{
|
|
"epoch": 2.39471007121058,
|
|
"grad_norm": 0.7266263365745544,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.555,
|
|
"mean_token_accuracy": 0.8309916257858276,
|
|
"num_tokens": 750392408.0,
|
|
"step": 2354
|
|
},
|
|
{
|
|
"epoch": 2.395727365208545,
|
|
"grad_norm": 0.7415468692779541,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5512,
|
|
"mean_token_accuracy": 0.8322256803512573,
|
|
"num_tokens": 750725594.0,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"epoch": 2.3967446592065107,
|
|
"grad_norm": 0.7751864194869995,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5499,
|
|
"mean_token_accuracy": 0.8317053914070129,
|
|
"num_tokens": 751019674.0,
|
|
"step": 2356
|
|
},
|
|
{
|
|
"epoch": 2.3977619532044763,
|
|
"grad_norm": 0.7352653741836548,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5382,
|
|
"mean_token_accuracy": 0.8348796963691711,
|
|
"num_tokens": 751341611.0,
|
|
"step": 2357
|
|
},
|
|
{
|
|
"epoch": 2.3987792472024414,
|
|
"grad_norm": 0.7456854581832886,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5476,
|
|
"mean_token_accuracy": 0.8336353898048401,
|
|
"num_tokens": 751658357.0,
|
|
"step": 2358
|
|
},
|
|
{
|
|
"epoch": 2.399796541200407,
|
|
"grad_norm": 0.7456473708152771,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5523,
|
|
"mean_token_accuracy": 0.8315349817276001,
|
|
"num_tokens": 751985925.0,
|
|
"step": 2359
|
|
},
|
|
{
|
|
"epoch": 2.400813835198372,
|
|
"grad_norm": 0.7698581218719482,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5625,
|
|
"mean_token_accuracy": 0.827774167060852,
|
|
"num_tokens": 752297184.0,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 2.4018311291963377,
|
|
"grad_norm": 0.8498561978340149,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5565,
|
|
"mean_token_accuracy": 0.8302951455116272,
|
|
"num_tokens": 752623708.0,
|
|
"step": 2361
|
|
},
|
|
{
|
|
"epoch": 2.4028484231943033,
|
|
"grad_norm": 0.9180802702903748,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5542,
|
|
"mean_token_accuracy": 0.8314825296401978,
|
|
"num_tokens": 752932908.0,
|
|
"step": 2362
|
|
},
|
|
{
|
|
"epoch": 2.4038657171922684,
|
|
"grad_norm": 0.7461639642715454,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5574,
|
|
"mean_token_accuracy": 0.8299914598464966,
|
|
"num_tokens": 753256285.0,
|
|
"step": 2363
|
|
},
|
|
{
|
|
"epoch": 2.404883011190234,
|
|
"grad_norm": 0.7700833678245544,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5431,
|
|
"mean_token_accuracy": 0.8337959051132202,
|
|
"num_tokens": 753554563.0,
|
|
"step": 2364
|
|
},
|
|
{
|
|
"epoch": 2.4059003051881995,
|
|
"grad_norm": 0.7168130874633789,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.545,
|
|
"mean_token_accuracy": 0.833526611328125,
|
|
"num_tokens": 753883153.0,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"epoch": 2.4069175991861647,
|
|
"grad_norm": 0.8115290999412537,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5658,
|
|
"mean_token_accuracy": 0.8273876905441284,
|
|
"num_tokens": 754197393.0,
|
|
"step": 2366
|
|
},
|
|
{
|
|
"epoch": 2.4079348931841302,
|
|
"grad_norm": 0.7849256992340088,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5652,
|
|
"mean_token_accuracy": 0.8283255696296692,
|
|
"num_tokens": 754512562.0,
|
|
"step": 2367
|
|
},
|
|
{
|
|
"epoch": 2.408952187182096,
|
|
"grad_norm": 0.7660202383995056,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5685,
|
|
"mean_token_accuracy": 0.8274675607681274,
|
|
"num_tokens": 754823935.0,
|
|
"step": 2368
|
|
},
|
|
{
|
|
"epoch": 2.409969481180061,
|
|
"grad_norm": 0.7553797960281372,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5565,
|
|
"mean_token_accuracy": 0.8311089277267456,
|
|
"num_tokens": 755155364.0,
|
|
"step": 2369
|
|
},
|
|
{
|
|
"epoch": 2.4109867751780265,
|
|
"grad_norm": 0.7647053599357605,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5537,
|
|
"mean_token_accuracy": 0.8304147124290466,
|
|
"num_tokens": 755467594.0,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 2.4120040691759916,
|
|
"grad_norm": 0.8000221848487854,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5325,
|
|
"mean_token_accuracy": 0.837266206741333,
|
|
"num_tokens": 755776980.0,
|
|
"step": 2371
|
|
},
|
|
{
|
|
"epoch": 2.413021363173957,
|
|
"grad_norm": 0.794994056224823,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5457,
|
|
"mean_token_accuracy": 0.8333130478858948,
|
|
"num_tokens": 756086651.0,
|
|
"step": 2372
|
|
},
|
|
{
|
|
"epoch": 2.414038657171923,
|
|
"grad_norm": 0.8113973140716553,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5658,
|
|
"mean_token_accuracy": 0.8280282020568848,
|
|
"num_tokens": 756399704.0,
|
|
"step": 2373
|
|
},
|
|
{
|
|
"epoch": 2.415055951169888,
|
|
"grad_norm": 0.7954896092414856,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5503,
|
|
"mean_token_accuracy": 0.8311757445335388,
|
|
"num_tokens": 756711634.0,
|
|
"step": 2374
|
|
},
|
|
{
|
|
"epoch": 2.4160732451678535,
|
|
"grad_norm": 0.7626403570175171,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5372,
|
|
"mean_token_accuracy": 0.8354580998420715,
|
|
"num_tokens": 757032898.0,
|
|
"step": 2375
|
|
},
|
|
{
|
|
"epoch": 2.417090539165819,
|
|
"grad_norm": 0.7603203654289246,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5441,
|
|
"mean_token_accuracy": 0.8340161442756653,
|
|
"num_tokens": 757357972.0,
|
|
"step": 2376
|
|
},
|
|
{
|
|
"epoch": 2.418107833163784,
|
|
"grad_norm": 0.7872731685638428,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5467,
|
|
"mean_token_accuracy": 0.831898033618927,
|
|
"num_tokens": 757677296.0,
|
|
"step": 2377
|
|
},
|
|
{
|
|
"epoch": 2.4191251271617498,
|
|
"grad_norm": 0.7660987973213196,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5541,
|
|
"mean_token_accuracy": 0.8300293684005737,
|
|
"num_tokens": 757995955.0,
|
|
"step": 2378
|
|
},
|
|
{
|
|
"epoch": 2.4201424211597153,
|
|
"grad_norm": 0.7972212433815002,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5632,
|
|
"mean_token_accuracy": 0.8283858299255371,
|
|
"num_tokens": 758312347.0,
|
|
"step": 2379
|
|
},
|
|
{
|
|
"epoch": 2.4211597151576805,
|
|
"grad_norm": 0.8009522557258606,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5538,
|
|
"mean_token_accuracy": 0.831788957118988,
|
|
"num_tokens": 758639550.0,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 2.422177009155646,
|
|
"grad_norm": 0.7857116460800171,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5528,
|
|
"mean_token_accuracy": 0.8312411308288574,
|
|
"num_tokens": 758958913.0,
|
|
"step": 2381
|
|
},
|
|
{
|
|
"epoch": 2.423194303153611,
|
|
"grad_norm": 0.7693212628364563,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5665,
|
|
"mean_token_accuracy": 0.829216718673706,
|
|
"num_tokens": 759270507.0,
|
|
"step": 2382
|
|
},
|
|
{
|
|
"epoch": 2.4242115971515767,
|
|
"grad_norm": 0.7448201775550842,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5197,
|
|
"mean_token_accuracy": 0.8389084339141846,
|
|
"num_tokens": 759579946.0,
|
|
"step": 2383
|
|
},
|
|
{
|
|
"epoch": 2.4252288911495423,
|
|
"grad_norm": 0.7547968029975891,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5495,
|
|
"mean_token_accuracy": 0.8323776125907898,
|
|
"num_tokens": 759919471.0,
|
|
"step": 2384
|
|
},
|
|
{
|
|
"epoch": 2.4262461851475075,
|
|
"grad_norm": 0.7595186233520508,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5233,
|
|
"mean_token_accuracy": 0.8392271995544434,
|
|
"num_tokens": 760224828.0,
|
|
"step": 2385
|
|
},
|
|
{
|
|
"epoch": 2.427263479145473,
|
|
"grad_norm": 0.7325853705406189,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5393,
|
|
"mean_token_accuracy": 0.8344419598579407,
|
|
"num_tokens": 760536153.0,
|
|
"step": 2386
|
|
},
|
|
{
|
|
"epoch": 2.4282807731434386,
|
|
"grad_norm": 0.7919133901596069,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5612,
|
|
"mean_token_accuracy": 0.8291533589363098,
|
|
"num_tokens": 760838181.0,
|
|
"step": 2387
|
|
},
|
|
{
|
|
"epoch": 2.4292980671414037,
|
|
"grad_norm": 0.7144816517829895,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.56,
|
|
"mean_token_accuracy": 0.8297811150550842,
|
|
"num_tokens": 761170210.0,
|
|
"step": 2388
|
|
},
|
|
{
|
|
"epoch": 2.4303153611393693,
|
|
"grad_norm": 0.7816941738128662,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5593,
|
|
"mean_token_accuracy": 0.8292558789253235,
|
|
"num_tokens": 761491740.0,
|
|
"step": 2389
|
|
},
|
|
{
|
|
"epoch": 2.431332655137335,
|
|
"grad_norm": 0.78831547498703,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5537,
|
|
"mean_token_accuracy": 0.8312612175941467,
|
|
"num_tokens": 761813341.0,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 2.4323499491353,
|
|
"grad_norm": 0.7110333442687988,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5415,
|
|
"mean_token_accuracy": 0.8343943357467651,
|
|
"num_tokens": 762139354.0,
|
|
"step": 2391
|
|
},
|
|
{
|
|
"epoch": 2.4333672431332656,
|
|
"grad_norm": 0.7214698195457458,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5404,
|
|
"mean_token_accuracy": 0.8357784748077393,
|
|
"num_tokens": 762460461.0,
|
|
"step": 2392
|
|
},
|
|
{
|
|
"epoch": 2.4343845371312307,
|
|
"grad_norm": 0.817523181438446,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5516,
|
|
"mean_token_accuracy": 0.8315002918243408,
|
|
"num_tokens": 762803568.0,
|
|
"step": 2393
|
|
},
|
|
{
|
|
"epoch": 2.4354018311291963,
|
|
"grad_norm": 0.8252072334289551,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5457,
|
|
"mean_token_accuracy": 0.8329967260360718,
|
|
"num_tokens": 763121607.0,
|
|
"step": 2394
|
|
},
|
|
{
|
|
"epoch": 2.436419125127162,
|
|
"grad_norm": 0.8103281855583191,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5705,
|
|
"mean_token_accuracy": 0.8263031244277954,
|
|
"num_tokens": 763440200.0,
|
|
"step": 2395
|
|
},
|
|
{
|
|
"epoch": 2.437436419125127,
|
|
"grad_norm": 0.7681299448013306,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5532,
|
|
"mean_token_accuracy": 0.8315169215202332,
|
|
"num_tokens": 763765662.0,
|
|
"step": 2396
|
|
},
|
|
{
|
|
"epoch": 2.4384537131230926,
|
|
"grad_norm": 0.8423113226890564,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5395,
|
|
"mean_token_accuracy": 0.835465669631958,
|
|
"num_tokens": 764082350.0,
|
|
"step": 2397
|
|
},
|
|
{
|
|
"epoch": 2.439471007121058,
|
|
"grad_norm": 0.7529908418655396,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5587,
|
|
"mean_token_accuracy": 0.8306792974472046,
|
|
"num_tokens": 764395519.0,
|
|
"step": 2398
|
|
},
|
|
{
|
|
"epoch": 2.4404883011190233,
|
|
"grad_norm": 0.7986095547676086,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5635,
|
|
"mean_token_accuracy": 0.8285139799118042,
|
|
"num_tokens": 764720615.0,
|
|
"step": 2399
|
|
},
|
|
{
|
|
"epoch": 2.441505595116989,
|
|
"grad_norm": 0.7806645035743713,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5776,
|
|
"mean_token_accuracy": 0.826043426990509,
|
|
"num_tokens": 765064375.0,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 2.4425228891149544,
|
|
"grad_norm": 0.7634555101394653,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5829,
|
|
"mean_token_accuracy": 0.8234164714813232,
|
|
"num_tokens": 765405280.0,
|
|
"step": 2401
|
|
},
|
|
{
|
|
"epoch": 2.4435401831129195,
|
|
"grad_norm": 0.8364507555961609,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5527,
|
|
"mean_token_accuracy": 0.8306921720504761,
|
|
"num_tokens": 765704128.0,
|
|
"step": 2402
|
|
},
|
|
{
|
|
"epoch": 2.444557477110885,
|
|
"grad_norm": 0.803433358669281,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5576,
|
|
"mean_token_accuracy": 0.8307260274887085,
|
|
"num_tokens": 766021963.0,
|
|
"step": 2403
|
|
},
|
|
{
|
|
"epoch": 2.4455747711088502,
|
|
"grad_norm": 0.7807213068008423,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5625,
|
|
"mean_token_accuracy": 0.8288591504096985,
|
|
"num_tokens": 766348235.0,
|
|
"step": 2404
|
|
},
|
|
{
|
|
"epoch": 2.446592065106816,
|
|
"grad_norm": 0.8111218214035034,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5334,
|
|
"mean_token_accuracy": 0.8356228470802307,
|
|
"num_tokens": 766664722.0,
|
|
"step": 2405
|
|
},
|
|
{
|
|
"epoch": 2.4476093591047814,
|
|
"grad_norm": 0.7768924236297607,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5336,
|
|
"mean_token_accuracy": 0.8374151587486267,
|
|
"num_tokens": 766973521.0,
|
|
"step": 2406
|
|
},
|
|
{
|
|
"epoch": 2.4486266531027465,
|
|
"grad_norm": 0.7840931415557861,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.543,
|
|
"mean_token_accuracy": 0.8322388529777527,
|
|
"num_tokens": 767301821.0,
|
|
"step": 2407
|
|
},
|
|
{
|
|
"epoch": 2.449643947100712,
|
|
"grad_norm": 0.7447266578674316,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5468,
|
|
"mean_token_accuracy": 0.833337664604187,
|
|
"num_tokens": 767630278.0,
|
|
"step": 2408
|
|
},
|
|
{
|
|
"epoch": 2.4506612410986777,
|
|
"grad_norm": 1.020521879196167,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5433,
|
|
"mean_token_accuracy": 0.8331669569015503,
|
|
"num_tokens": 767961963.0,
|
|
"step": 2409
|
|
},
|
|
{
|
|
"epoch": 2.451678535096643,
|
|
"grad_norm": 0.7774084210395813,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.52,
|
|
"mean_token_accuracy": 0.840404748916626,
|
|
"num_tokens": 768274598.0,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 2.4526958290946084,
|
|
"grad_norm": 0.770845353603363,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5768,
|
|
"mean_token_accuracy": 0.8253146409988403,
|
|
"num_tokens": 768596525.0,
|
|
"step": 2411
|
|
},
|
|
{
|
|
"epoch": 2.453713123092574,
|
|
"grad_norm": 0.752336859703064,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5302,
|
|
"mean_token_accuracy": 0.8379479646682739,
|
|
"num_tokens": 768931098.0,
|
|
"step": 2412
|
|
},
|
|
{
|
|
"epoch": 2.454730417090539,
|
|
"grad_norm": 0.7862922549247742,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5608,
|
|
"mean_token_accuracy": 0.8290746212005615,
|
|
"num_tokens": 769236726.0,
|
|
"step": 2413
|
|
},
|
|
{
|
|
"epoch": 2.4557477110885046,
|
|
"grad_norm": 0.7337533235549927,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5532,
|
|
"mean_token_accuracy": 0.8312757015228271,
|
|
"num_tokens": 769546860.0,
|
|
"step": 2414
|
|
},
|
|
{
|
|
"epoch": 2.4567650050864698,
|
|
"grad_norm": 0.7539020776748657,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5284,
|
|
"mean_token_accuracy": 0.8379347324371338,
|
|
"num_tokens": 769852870.0,
|
|
"step": 2415
|
|
},
|
|
{
|
|
"epoch": 2.4577822990844354,
|
|
"grad_norm": 0.7792903184890747,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.57,
|
|
"mean_token_accuracy": 0.827686607837677,
|
|
"num_tokens": 770157889.0,
|
|
"step": 2416
|
|
},
|
|
{
|
|
"epoch": 2.458799593082401,
|
|
"grad_norm": 0.7698293924331665,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5431,
|
|
"mean_token_accuracy": 0.8331127166748047,
|
|
"num_tokens": 770495718.0,
|
|
"step": 2417
|
|
},
|
|
{
|
|
"epoch": 2.459816887080366,
|
|
"grad_norm": 0.7606080174446106,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5551,
|
|
"mean_token_accuracy": 0.8306302428245544,
|
|
"num_tokens": 770807568.0,
|
|
"step": 2418
|
|
},
|
|
{
|
|
"epoch": 2.4608341810783316,
|
|
"grad_norm": 0.771902322769165,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5324,
|
|
"mean_token_accuracy": 0.8367187976837158,
|
|
"num_tokens": 771115947.0,
|
|
"step": 2419
|
|
},
|
|
{
|
|
"epoch": 2.461851475076297,
|
|
"grad_norm": 0.7755984663963318,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5723,
|
|
"mean_token_accuracy": 0.8261435031890869,
|
|
"num_tokens": 771443575.0,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 2.4628687690742623,
|
|
"grad_norm": 0.7729278206825256,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5865,
|
|
"mean_token_accuracy": 0.8226972818374634,
|
|
"num_tokens": 771758233.0,
|
|
"step": 2421
|
|
},
|
|
{
|
|
"epoch": 2.463886063072228,
|
|
"grad_norm": 0.7166008353233337,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5544,
|
|
"mean_token_accuracy": 0.8314967155456543,
|
|
"num_tokens": 772093055.0,
|
|
"step": 2422
|
|
},
|
|
{
|
|
"epoch": 2.4649033570701935,
|
|
"grad_norm": 0.7218311429023743,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5321,
|
|
"mean_token_accuracy": 0.8374101519584656,
|
|
"num_tokens": 772411962.0,
|
|
"step": 2423
|
|
},
|
|
{
|
|
"epoch": 2.4659206510681586,
|
|
"grad_norm": 0.7513918280601501,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5575,
|
|
"mean_token_accuracy": 0.8277909755706787,
|
|
"num_tokens": 772734084.0,
|
|
"step": 2424
|
|
},
|
|
{
|
|
"epoch": 2.466937945066124,
|
|
"grad_norm": 0.7552136778831482,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5301,
|
|
"mean_token_accuracy": 0.8380136489868164,
|
|
"num_tokens": 773069483.0,
|
|
"step": 2425
|
|
},
|
|
{
|
|
"epoch": 2.4679552390640893,
|
|
"grad_norm": 0.8175935745239258,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5701,
|
|
"mean_token_accuracy": 0.8256757855415344,
|
|
"num_tokens": 773388030.0,
|
|
"step": 2426
|
|
},
|
|
{
|
|
"epoch": 2.468972533062055,
|
|
"grad_norm": 0.8159247636795044,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5539,
|
|
"mean_token_accuracy": 0.8304073214530945,
|
|
"num_tokens": 773708733.0,
|
|
"step": 2427
|
|
},
|
|
{
|
|
"epoch": 2.4699898270600205,
|
|
"grad_norm": 0.7622677087783813,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5392,
|
|
"mean_token_accuracy": 0.835042417049408,
|
|
"num_tokens": 774027412.0,
|
|
"step": 2428
|
|
},
|
|
{
|
|
"epoch": 2.4710071210579856,
|
|
"grad_norm": 0.7640722393989563,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5416,
|
|
"mean_token_accuracy": 0.8352609872817993,
|
|
"num_tokens": 774354417.0,
|
|
"step": 2429
|
|
},
|
|
{
|
|
"epoch": 2.472024415055951,
|
|
"grad_norm": 0.8168275952339172,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5437,
|
|
"mean_token_accuracy": 0.8326980471611023,
|
|
"num_tokens": 774658382.0,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 2.4730417090539167,
|
|
"grad_norm": 0.7612686157226562,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.558,
|
|
"mean_token_accuracy": 0.8303864598274231,
|
|
"num_tokens": 774978306.0,
|
|
"step": 2431
|
|
},
|
|
{
|
|
"epoch": 2.474059003051882,
|
|
"grad_norm": 0.781137228012085,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5499,
|
|
"mean_token_accuracy": 0.8313940763473511,
|
|
"num_tokens": 775301131.0,
|
|
"step": 2432
|
|
},
|
|
{
|
|
"epoch": 2.4750762970498474,
|
|
"grad_norm": 0.8054970502853394,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5573,
|
|
"mean_token_accuracy": 0.8306020498275757,
|
|
"num_tokens": 775609674.0,
|
|
"step": 2433
|
|
},
|
|
{
|
|
"epoch": 2.476093591047813,
|
|
"grad_norm": 0.7589148283004761,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5718,
|
|
"mean_token_accuracy": 0.8268976211547852,
|
|
"num_tokens": 775937278.0,
|
|
"step": 2434
|
|
},
|
|
{
|
|
"epoch": 2.477110885045778,
|
|
"grad_norm": 0.8487206101417542,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5339,
|
|
"mean_token_accuracy": 0.8378028869628906,
|
|
"num_tokens": 776256891.0,
|
|
"step": 2435
|
|
},
|
|
{
|
|
"epoch": 2.4781281790437437,
|
|
"grad_norm": 0.7576294541358948,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5388,
|
|
"mean_token_accuracy": 0.8346007466316223,
|
|
"num_tokens": 776580024.0,
|
|
"step": 2436
|
|
},
|
|
{
|
|
"epoch": 2.479145473041709,
|
|
"grad_norm": 0.7526679039001465,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5601,
|
|
"mean_token_accuracy": 0.8290330171585083,
|
|
"num_tokens": 776881190.0,
|
|
"step": 2437
|
|
},
|
|
{
|
|
"epoch": 2.4801627670396744,
|
|
"grad_norm": 0.7862757444381714,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5578,
|
|
"mean_token_accuracy": 0.8304464221000671,
|
|
"num_tokens": 777210571.0,
|
|
"step": 2438
|
|
},
|
|
{
|
|
"epoch": 2.48118006103764,
|
|
"grad_norm": 0.790669858455658,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5652,
|
|
"mean_token_accuracy": 0.8283755779266357,
|
|
"num_tokens": 777541394.0,
|
|
"step": 2439
|
|
},
|
|
{
|
|
"epoch": 2.482197355035605,
|
|
"grad_norm": 0.7850418090820312,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5455,
|
|
"mean_token_accuracy": 0.8332133889198303,
|
|
"num_tokens": 777839792.0,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 2.4832146490335707,
|
|
"grad_norm": 0.7663286328315735,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.546,
|
|
"mean_token_accuracy": 0.8340730667114258,
|
|
"num_tokens": 778138868.0,
|
|
"step": 2441
|
|
},
|
|
{
|
|
"epoch": 2.4842319430315363,
|
|
"grad_norm": 0.7318602800369263,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5373,
|
|
"mean_token_accuracy": 0.8366137146949768,
|
|
"num_tokens": 778464246.0,
|
|
"step": 2442
|
|
},
|
|
{
|
|
"epoch": 2.4852492370295014,
|
|
"grad_norm": 0.8146898150444031,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5956,
|
|
"mean_token_accuracy": 0.8201758861541748,
|
|
"num_tokens": 778782743.0,
|
|
"step": 2443
|
|
},
|
|
{
|
|
"epoch": 2.486266531027467,
|
|
"grad_norm": 0.7857292890548706,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5343,
|
|
"mean_token_accuracy": 0.8359119296073914,
|
|
"num_tokens": 779106415.0,
|
|
"step": 2444
|
|
},
|
|
{
|
|
"epoch": 2.4872838250254325,
|
|
"grad_norm": 0.7584625482559204,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5549,
|
|
"mean_token_accuracy": 0.8299977779388428,
|
|
"num_tokens": 779428841.0,
|
|
"step": 2445
|
|
},
|
|
{
|
|
"epoch": 2.4883011190233977,
|
|
"grad_norm": 0.7550146579742432,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5454,
|
|
"mean_token_accuracy": 0.833655834197998,
|
|
"num_tokens": 779752096.0,
|
|
"step": 2446
|
|
},
|
|
{
|
|
"epoch": 2.4893184130213633,
|
|
"grad_norm": 0.802813708782196,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5549,
|
|
"mean_token_accuracy": 0.8306933641433716,
|
|
"num_tokens": 780066182.0,
|
|
"step": 2447
|
|
},
|
|
{
|
|
"epoch": 2.4903357070193284,
|
|
"grad_norm": 0.7713908553123474,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5494,
|
|
"mean_token_accuracy": 0.8302340507507324,
|
|
"num_tokens": 780379379.0,
|
|
"step": 2448
|
|
},
|
|
{
|
|
"epoch": 2.491353001017294,
|
|
"grad_norm": 0.7197617292404175,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5614,
|
|
"mean_token_accuracy": 0.8294973373413086,
|
|
"num_tokens": 780709618.0,
|
|
"step": 2449
|
|
},
|
|
{
|
|
"epoch": 2.4923702950152595,
|
|
"grad_norm": 0.7505249977111816,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5928,
|
|
"mean_token_accuracy": 0.8199161291122437,
|
|
"num_tokens": 781038418.0,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 2.4933875890132247,
|
|
"grad_norm": 0.8106674551963806,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5531,
|
|
"mean_token_accuracy": 0.830228328704834,
|
|
"num_tokens": 781342015.0,
|
|
"step": 2451
|
|
},
|
|
{
|
|
"epoch": 2.4944048830111902,
|
|
"grad_norm": 0.747441828250885,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5444,
|
|
"mean_token_accuracy": 0.8331936597824097,
|
|
"num_tokens": 781654668.0,
|
|
"step": 2452
|
|
},
|
|
{
|
|
"epoch": 2.495422177009156,
|
|
"grad_norm": 0.762697696685791,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5736,
|
|
"mean_token_accuracy": 0.8264614939689636,
|
|
"num_tokens": 781972092.0,
|
|
"step": 2453
|
|
},
|
|
{
|
|
"epoch": 2.496439471007121,
|
|
"grad_norm": 0.764396071434021,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.598,
|
|
"mean_token_accuracy": 0.8198715448379517,
|
|
"num_tokens": 782293763.0,
|
|
"step": 2454
|
|
},
|
|
{
|
|
"epoch": 2.4974567650050865,
|
|
"grad_norm": 0.7979555130004883,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5732,
|
|
"mean_token_accuracy": 0.8260531425476074,
|
|
"num_tokens": 782606788.0,
|
|
"step": 2455
|
|
},
|
|
{
|
|
"epoch": 2.498474059003052,
|
|
"grad_norm": 0.7844521999359131,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5479,
|
|
"mean_token_accuracy": 0.8317605257034302,
|
|
"num_tokens": 782909993.0,
|
|
"step": 2456
|
|
},
|
|
{
|
|
"epoch": 2.499491353001017,
|
|
"grad_norm": 0.8027681708335876,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5572,
|
|
"mean_token_accuracy": 0.8310152888298035,
|
|
"num_tokens": 783215610.0,
|
|
"step": 2457
|
|
},
|
|
{
|
|
"epoch": 2.500508646998983,
|
|
"grad_norm": 0.7083740234375,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.549,
|
|
"mean_token_accuracy": 0.8324770927429199,
|
|
"num_tokens": 783555679.0,
|
|
"step": 2458
|
|
},
|
|
{
|
|
"epoch": 2.501525940996948,
|
|
"grad_norm": 0.8292408585548401,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5604,
|
|
"mean_token_accuracy": 0.8293014764785767,
|
|
"num_tokens": 783880886.0,
|
|
"step": 2459
|
|
},
|
|
{
|
|
"epoch": 2.5025432349949135,
|
|
"grad_norm": 0.7347398400306702,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.536,
|
|
"mean_token_accuracy": 0.8344863653182983,
|
|
"num_tokens": 784198844.0,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 2.503560528992879,
|
|
"grad_norm": 1.7198132276535034,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5602,
|
|
"mean_token_accuracy": 0.8289639949798584,
|
|
"num_tokens": 784524245.0,
|
|
"step": 2461
|
|
},
|
|
{
|
|
"epoch": 2.504577822990844,
|
|
"grad_norm": 0.7936400771141052,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5547,
|
|
"mean_token_accuracy": 0.8311157822608948,
|
|
"num_tokens": 784850952.0,
|
|
"step": 2462
|
|
},
|
|
{
|
|
"epoch": 2.5055951169888098,
|
|
"grad_norm": 0.7660007476806641,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5561,
|
|
"mean_token_accuracy": 0.8293622732162476,
|
|
"num_tokens": 785151045.0,
|
|
"step": 2463
|
|
},
|
|
{
|
|
"epoch": 2.5066124109867753,
|
|
"grad_norm": 0.7893761396408081,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5541,
|
|
"mean_token_accuracy": 0.8309811353683472,
|
|
"num_tokens": 785470010.0,
|
|
"step": 2464
|
|
},
|
|
{
|
|
"epoch": 2.5076297049847405,
|
|
"grad_norm": 0.7528966069221497,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5774,
|
|
"mean_token_accuracy": 0.8246625661849976,
|
|
"num_tokens": 785801195.0,
|
|
"step": 2465
|
|
},
|
|
{
|
|
"epoch": 2.508646998982706,
|
|
"grad_norm": 0.7794021368026733,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5706,
|
|
"mean_token_accuracy": 0.8252547383308411,
|
|
"num_tokens": 786115203.0,
|
|
"step": 2466
|
|
},
|
|
{
|
|
"epoch": 2.5096642929806716,
|
|
"grad_norm": 0.8330888152122498,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5356,
|
|
"mean_token_accuracy": 0.8368071913719177,
|
|
"num_tokens": 786424319.0,
|
|
"step": 2467
|
|
},
|
|
{
|
|
"epoch": 2.5106815869786367,
|
|
"grad_norm": 0.751885175704956,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5682,
|
|
"mean_token_accuracy": 0.8278042078018188,
|
|
"num_tokens": 786732176.0,
|
|
"step": 2468
|
|
},
|
|
{
|
|
"epoch": 2.5116988809766023,
|
|
"grad_norm": 0.7629954814910889,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.548,
|
|
"mean_token_accuracy": 0.8319458961486816,
|
|
"num_tokens": 787060121.0,
|
|
"step": 2469
|
|
},
|
|
{
|
|
"epoch": 2.5127161749745675,
|
|
"grad_norm": 0.7450445294380188,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.573,
|
|
"mean_token_accuracy": 0.8261040449142456,
|
|
"num_tokens": 787389440.0,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 2.513733468972533,
|
|
"grad_norm": 0.7563230395317078,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5624,
|
|
"mean_token_accuracy": 0.8288929462432861,
|
|
"num_tokens": 787711904.0,
|
|
"step": 2471
|
|
},
|
|
{
|
|
"epoch": 2.5147507629704986,
|
|
"grad_norm": 0.7394628524780273,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5453,
|
|
"mean_token_accuracy": 0.8336602449417114,
|
|
"num_tokens": 788040834.0,
|
|
"step": 2472
|
|
},
|
|
{
|
|
"epoch": 2.5157680569684637,
|
|
"grad_norm": 0.7385982871055603,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.566,
|
|
"mean_token_accuracy": 0.8274520635604858,
|
|
"num_tokens": 788365031.0,
|
|
"step": 2473
|
|
},
|
|
{
|
|
"epoch": 2.5167853509664293,
|
|
"grad_norm": 0.7424465417861938,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5306,
|
|
"mean_token_accuracy": 0.837134599685669,
|
|
"num_tokens": 788684184.0,
|
|
"step": 2474
|
|
},
|
|
{
|
|
"epoch": 2.517802644964395,
|
|
"grad_norm": 0.7463836669921875,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5476,
|
|
"mean_token_accuracy": 0.8347026705741882,
|
|
"num_tokens": 789007564.0,
|
|
"step": 2475
|
|
},
|
|
{
|
|
"epoch": 2.51881993896236,
|
|
"grad_norm": 0.7344636917114258,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5705,
|
|
"mean_token_accuracy": 0.8264937996864319,
|
|
"num_tokens": 789338711.0,
|
|
"step": 2476
|
|
},
|
|
{
|
|
"epoch": 2.5198372329603256,
|
|
"grad_norm": 0.7780519723892212,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5473,
|
|
"mean_token_accuracy": 0.8339964151382446,
|
|
"num_tokens": 789651554.0,
|
|
"step": 2477
|
|
},
|
|
{
|
|
"epoch": 2.520854526958291,
|
|
"grad_norm": 0.763392448425293,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5694,
|
|
"mean_token_accuracy": 0.8263593912124634,
|
|
"num_tokens": 789980144.0,
|
|
"step": 2478
|
|
},
|
|
{
|
|
"epoch": 2.5218718209562563,
|
|
"grad_norm": 0.7752723693847656,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5359,
|
|
"mean_token_accuracy": 0.8367195725440979,
|
|
"num_tokens": 790289686.0,
|
|
"step": 2479
|
|
},
|
|
{
|
|
"epoch": 2.522889114954222,
|
|
"grad_norm": 0.780409574508667,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5785,
|
|
"mean_token_accuracy": 0.8243266344070435,
|
|
"num_tokens": 790617809.0,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 2.523906408952187,
|
|
"grad_norm": 0.793233335018158,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5965,
|
|
"mean_token_accuracy": 0.8192570209503174,
|
|
"num_tokens": 790931867.0,
|
|
"step": 2481
|
|
},
|
|
{
|
|
"epoch": 2.5249237029501526,
|
|
"grad_norm": 0.7812409996986389,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5594,
|
|
"mean_token_accuracy": 0.8291711807250977,
|
|
"num_tokens": 791257431.0,
|
|
"step": 2482
|
|
},
|
|
{
|
|
"epoch": 2.525940996948118,
|
|
"grad_norm": 0.7788034081459045,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5502,
|
|
"mean_token_accuracy": 0.8330932855606079,
|
|
"num_tokens": 791571096.0,
|
|
"step": 2483
|
|
},
|
|
{
|
|
"epoch": 2.5269582909460833,
|
|
"grad_norm": 0.7682954668998718,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5687,
|
|
"mean_token_accuracy": 0.8256453275680542,
|
|
"num_tokens": 791889891.0,
|
|
"step": 2484
|
|
},
|
|
{
|
|
"epoch": 2.527975584944049,
|
|
"grad_norm": 0.733441174030304,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5465,
|
|
"mean_token_accuracy": 0.8320663571357727,
|
|
"num_tokens": 792214100.0,
|
|
"step": 2485
|
|
},
|
|
{
|
|
"epoch": 2.528992878942014,
|
|
"grad_norm": 0.7833328247070312,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5357,
|
|
"mean_token_accuracy": 0.8362271189689636,
|
|
"num_tokens": 792530725.0,
|
|
"step": 2486
|
|
},
|
|
{
|
|
"epoch": 2.5300101729399795,
|
|
"grad_norm": 0.7879757881164551,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5648,
|
|
"mean_token_accuracy": 0.8272276520729065,
|
|
"num_tokens": 792847903.0,
|
|
"step": 2487
|
|
},
|
|
{
|
|
"epoch": 2.531027466937945,
|
|
"grad_norm": 0.7867984771728516,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5422,
|
|
"mean_token_accuracy": 0.833976149559021,
|
|
"num_tokens": 793151668.0,
|
|
"step": 2488
|
|
},
|
|
{
|
|
"epoch": 2.5320447609359107,
|
|
"grad_norm": 0.783227801322937,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5577,
|
|
"mean_token_accuracy": 0.8294700384140015,
|
|
"num_tokens": 793460024.0,
|
|
"step": 2489
|
|
},
|
|
{
|
|
"epoch": 2.533062054933876,
|
|
"grad_norm": 0.7828521728515625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5384,
|
|
"mean_token_accuracy": 0.8348932266235352,
|
|
"num_tokens": 793777015.0,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 2.5340793489318414,
|
|
"grad_norm": 0.7644302845001221,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5757,
|
|
"mean_token_accuracy": 0.8249564170837402,
|
|
"num_tokens": 794106369.0,
|
|
"step": 2491
|
|
},
|
|
{
|
|
"epoch": 2.5350966429298065,
|
|
"grad_norm": 0.7875006794929504,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5464,
|
|
"mean_token_accuracy": 0.8332185745239258,
|
|
"num_tokens": 794419893.0,
|
|
"step": 2492
|
|
},
|
|
{
|
|
"epoch": 2.536113936927772,
|
|
"grad_norm": 0.8249663710594177,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.534,
|
|
"mean_token_accuracy": 0.8363446593284607,
|
|
"num_tokens": 794725898.0,
|
|
"step": 2493
|
|
},
|
|
{
|
|
"epoch": 2.5371312309257377,
|
|
"grad_norm": 0.7706735730171204,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5686,
|
|
"mean_token_accuracy": 0.8266034126281738,
|
|
"num_tokens": 795034685.0,
|
|
"step": 2494
|
|
},
|
|
{
|
|
"epoch": 2.538148524923703,
|
|
"grad_norm": 0.7650234699249268,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5469,
|
|
"mean_token_accuracy": 0.8322057127952576,
|
|
"num_tokens": 795359866.0,
|
|
"step": 2495
|
|
},
|
|
{
|
|
"epoch": 2.5391658189216684,
|
|
"grad_norm": 0.7540287971496582,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5435,
|
|
"mean_token_accuracy": 0.8335233926773071,
|
|
"num_tokens": 795665057.0,
|
|
"step": 2496
|
|
},
|
|
{
|
|
"epoch": 2.5401831129196335,
|
|
"grad_norm": 1.013059377670288,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5506,
|
|
"mean_token_accuracy": 0.8318750858306885,
|
|
"num_tokens": 795976783.0,
|
|
"step": 2497
|
|
},
|
|
{
|
|
"epoch": 2.541200406917599,
|
|
"grad_norm": 0.8115147352218628,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5501,
|
|
"mean_token_accuracy": 0.8316807746887207,
|
|
"num_tokens": 796287301.0,
|
|
"step": 2498
|
|
},
|
|
{
|
|
"epoch": 2.5422177009155646,
|
|
"grad_norm": 0.763339102268219,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5426,
|
|
"mean_token_accuracy": 0.8338295221328735,
|
|
"num_tokens": 796600291.0,
|
|
"step": 2499
|
|
},
|
|
{
|
|
"epoch": 2.5432349949135302,
|
|
"grad_norm": 0.7467638254165649,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5407,
|
|
"mean_token_accuracy": 0.8346984386444092,
|
|
"num_tokens": 796904278.0,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 2.5442522889114954,
|
|
"grad_norm": 0.7442284226417542,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5691,
|
|
"mean_token_accuracy": 0.8270628452301025,
|
|
"num_tokens": 797229543.0,
|
|
"step": 2501
|
|
},
|
|
{
|
|
"epoch": 2.545269582909461,
|
|
"grad_norm": 0.7795024514198303,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5653,
|
|
"mean_token_accuracy": 0.8265025019645691,
|
|
"num_tokens": 797536387.0,
|
|
"step": 2502
|
|
},
|
|
{
|
|
"epoch": 2.546286876907426,
|
|
"grad_norm": 0.7624030113220215,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.562,
|
|
"mean_token_accuracy": 0.828598141670227,
|
|
"num_tokens": 797843165.0,
|
|
"step": 2503
|
|
},
|
|
{
|
|
"epoch": 2.5473041709053916,
|
|
"grad_norm": 0.7822015881538391,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5513,
|
|
"mean_token_accuracy": 0.8312956094741821,
|
|
"num_tokens": 798149467.0,
|
|
"step": 2504
|
|
},
|
|
{
|
|
"epoch": 2.548321464903357,
|
|
"grad_norm": 0.7261184453964233,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5492,
|
|
"mean_token_accuracy": 0.8313003778457642,
|
|
"num_tokens": 798476152.0,
|
|
"step": 2505
|
|
},
|
|
{
|
|
"epoch": 2.5493387589013223,
|
|
"grad_norm": 0.7039114236831665,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.551,
|
|
"mean_token_accuracy": 0.8312034606933594,
|
|
"num_tokens": 798807326.0,
|
|
"step": 2506
|
|
},
|
|
{
|
|
"epoch": 2.550356052899288,
|
|
"grad_norm": 0.7534456849098206,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5915,
|
|
"mean_token_accuracy": 0.8210437893867493,
|
|
"num_tokens": 799134326.0,
|
|
"step": 2507
|
|
},
|
|
{
|
|
"epoch": 2.551373346897253,
|
|
"grad_norm": 0.7393434047698975,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5355,
|
|
"mean_token_accuracy": 0.8362290263175964,
|
|
"num_tokens": 799444712.0,
|
|
"step": 2508
|
|
},
|
|
{
|
|
"epoch": 2.5523906408952186,
|
|
"grad_norm": 0.7687445282936096,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5571,
|
|
"mean_token_accuracy": 0.8302136659622192,
|
|
"num_tokens": 799761918.0,
|
|
"step": 2509
|
|
},
|
|
{
|
|
"epoch": 2.553407934893184,
|
|
"grad_norm": 0.723220705986023,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5286,
|
|
"mean_token_accuracy": 0.8383730053901672,
|
|
"num_tokens": 800086313.0,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"epoch": 2.5544252288911498,
|
|
"grad_norm": 0.7691549062728882,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.569,
|
|
"mean_token_accuracy": 0.826926589012146,
|
|
"num_tokens": 800392700.0,
|
|
"step": 2511
|
|
},
|
|
{
|
|
"epoch": 2.555442522889115,
|
|
"grad_norm": 0.7230547666549683,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5428,
|
|
"mean_token_accuracy": 0.8337690234184265,
|
|
"num_tokens": 800723148.0,
|
|
"step": 2512
|
|
},
|
|
{
|
|
"epoch": 2.5564598168870805,
|
|
"grad_norm": 0.7520925998687744,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5513,
|
|
"mean_token_accuracy": 0.8333572149276733,
|
|
"num_tokens": 801037154.0,
|
|
"step": 2513
|
|
},
|
|
{
|
|
"epoch": 2.5574771108850456,
|
|
"grad_norm": 0.7694193124771118,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5812,
|
|
"mean_token_accuracy": 0.8236919045448303,
|
|
"num_tokens": 801343393.0,
|
|
"step": 2514
|
|
},
|
|
{
|
|
"epoch": 2.558494404883011,
|
|
"grad_norm": 0.7773657441139221,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5613,
|
|
"mean_token_accuracy": 0.8282038569450378,
|
|
"num_tokens": 801652284.0,
|
|
"step": 2515
|
|
},
|
|
{
|
|
"epoch": 2.5595116988809767,
|
|
"grad_norm": 0.7484728693962097,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5544,
|
|
"mean_token_accuracy": 0.8301058411598206,
|
|
"num_tokens": 801967943.0,
|
|
"step": 2516
|
|
},
|
|
{
|
|
"epoch": 2.560528992878942,
|
|
"grad_norm": 0.731221616268158,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.538,
|
|
"mean_token_accuracy": 0.8355869054794312,
|
|
"num_tokens": 802277825.0,
|
|
"step": 2517
|
|
},
|
|
{
|
|
"epoch": 2.5615462868769074,
|
|
"grad_norm": 0.7734962105751038,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5505,
|
|
"mean_token_accuracy": 0.8329318165779114,
|
|
"num_tokens": 802590185.0,
|
|
"step": 2518
|
|
},
|
|
{
|
|
"epoch": 2.5625635808748726,
|
|
"grad_norm": 0.7789332270622253,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5133,
|
|
"mean_token_accuracy": 0.842313289642334,
|
|
"num_tokens": 802896699.0,
|
|
"step": 2519
|
|
},
|
|
{
|
|
"epoch": 2.563580874872838,
|
|
"grad_norm": 0.7538076639175415,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5489,
|
|
"mean_token_accuracy": 0.8314604759216309,
|
|
"num_tokens": 803217224.0,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 2.5645981688708037,
|
|
"grad_norm": 0.7312325239181519,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5668,
|
|
"mean_token_accuracy": 0.8282783627510071,
|
|
"num_tokens": 803552881.0,
|
|
"step": 2521
|
|
},
|
|
{
|
|
"epoch": 2.5656154628687693,
|
|
"grad_norm": 0.7468303442001343,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.536,
|
|
"mean_token_accuracy": 0.8361011743545532,
|
|
"num_tokens": 803876358.0,
|
|
"step": 2522
|
|
},
|
|
{
|
|
"epoch": 2.5666327568667344,
|
|
"grad_norm": 0.8178204298019409,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5468,
|
|
"mean_token_accuracy": 0.8333477973937988,
|
|
"num_tokens": 804192458.0,
|
|
"step": 2523
|
|
},
|
|
{
|
|
"epoch": 2.5676500508647,
|
|
"grad_norm": 0.7961925864219666,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5465,
|
|
"mean_token_accuracy": 0.8332005739212036,
|
|
"num_tokens": 804511634.0,
|
|
"step": 2524
|
|
},
|
|
{
|
|
"epoch": 2.568667344862665,
|
|
"grad_norm": 0.981375515460968,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5615,
|
|
"mean_token_accuracy": 0.8280132412910461,
|
|
"num_tokens": 804828982.0,
|
|
"step": 2525
|
|
},
|
|
{
|
|
"epoch": 2.5696846388606307,
|
|
"grad_norm": 0.7216664552688599,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5728,
|
|
"mean_token_accuracy": 0.8246604800224304,
|
|
"num_tokens": 805159595.0,
|
|
"step": 2526
|
|
},
|
|
{
|
|
"epoch": 2.5707019328585963,
|
|
"grad_norm": 0.7868689894676208,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5572,
|
|
"mean_token_accuracy": 0.8302907943725586,
|
|
"num_tokens": 805467391.0,
|
|
"step": 2527
|
|
},
|
|
{
|
|
"epoch": 2.5717192268565614,
|
|
"grad_norm": 0.7492839694023132,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5552,
|
|
"mean_token_accuracy": 0.8306524157524109,
|
|
"num_tokens": 805767777.0,
|
|
"step": 2528
|
|
},
|
|
{
|
|
"epoch": 2.572736520854527,
|
|
"grad_norm": 0.748909056186676,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5477,
|
|
"mean_token_accuracy": 0.8326543569564819,
|
|
"num_tokens": 806083011.0,
|
|
"step": 2529
|
|
},
|
|
{
|
|
"epoch": 2.573753814852492,
|
|
"grad_norm": 0.7458982467651367,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5407,
|
|
"mean_token_accuracy": 0.833580732345581,
|
|
"num_tokens": 806389989.0,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"epoch": 2.5747711088504577,
|
|
"grad_norm": 0.7254387736320496,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5501,
|
|
"mean_token_accuracy": 0.8330187797546387,
|
|
"num_tokens": 806724467.0,
|
|
"step": 2531
|
|
},
|
|
{
|
|
"epoch": 2.5757884028484233,
|
|
"grad_norm": 0.7179408669471741,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5311,
|
|
"mean_token_accuracy": 0.8376388549804688,
|
|
"num_tokens": 807048509.0,
|
|
"step": 2532
|
|
},
|
|
{
|
|
"epoch": 2.576805696846389,
|
|
"grad_norm": 0.773971438407898,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5416,
|
|
"mean_token_accuracy": 0.8336687088012695,
|
|
"num_tokens": 807355916.0,
|
|
"step": 2533
|
|
},
|
|
{
|
|
"epoch": 2.577822990844354,
|
|
"grad_norm": 0.7327409982681274,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5436,
|
|
"mean_token_accuracy": 0.8345814943313599,
|
|
"num_tokens": 807664154.0,
|
|
"step": 2534
|
|
},
|
|
{
|
|
"epoch": 2.5788402848423195,
|
|
"grad_norm": 0.7396280169487,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5201,
|
|
"mean_token_accuracy": 0.8391358852386475,
|
|
"num_tokens": 807982521.0,
|
|
"step": 2535
|
|
},
|
|
{
|
|
"epoch": 2.5798575788402847,
|
|
"grad_norm": 0.745638906955719,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5591,
|
|
"mean_token_accuracy": 0.8290165662765503,
|
|
"num_tokens": 808310119.0,
|
|
"step": 2536
|
|
},
|
|
{
|
|
"epoch": 2.5808748728382502,
|
|
"grad_norm": 0.750477135181427,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5449,
|
|
"mean_token_accuracy": 0.8326935172080994,
|
|
"num_tokens": 808637880.0,
|
|
"step": 2537
|
|
},
|
|
{
|
|
"epoch": 2.581892166836216,
|
|
"grad_norm": 0.724315345287323,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5436,
|
|
"mean_token_accuracy": 0.83362877368927,
|
|
"num_tokens": 808973018.0,
|
|
"step": 2538
|
|
},
|
|
{
|
|
"epoch": 2.582909460834181,
|
|
"grad_norm": 0.732577919960022,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.552,
|
|
"mean_token_accuracy": 0.8321558833122253,
|
|
"num_tokens": 809288153.0,
|
|
"step": 2539
|
|
},
|
|
{
|
|
"epoch": 2.5839267548321465,
|
|
"grad_norm": 0.7791313529014587,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.537,
|
|
"mean_token_accuracy": 0.834289014339447,
|
|
"num_tokens": 809594813.0,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 2.5849440488301116,
|
|
"grad_norm": 0.7388265132904053,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5486,
|
|
"mean_token_accuracy": 0.8334192037582397,
|
|
"num_tokens": 809914564.0,
|
|
"step": 2541
|
|
},
|
|
{
|
|
"epoch": 2.585961342828077,
|
|
"grad_norm": 0.7179327607154846,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5714,
|
|
"mean_token_accuracy": 0.8271803855895996,
|
|
"num_tokens": 810235007.0,
|
|
"step": 2542
|
|
},
|
|
{
|
|
"epoch": 2.586978636826043,
|
|
"grad_norm": 0.763087272644043,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.552,
|
|
"mean_token_accuracy": 0.8310995101928711,
|
|
"num_tokens": 810562179.0,
|
|
"step": 2543
|
|
},
|
|
{
|
|
"epoch": 2.5879959308240084,
|
|
"grad_norm": 0.7702552080154419,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5447,
|
|
"mean_token_accuracy": 0.8343356847763062,
|
|
"num_tokens": 810890208.0,
|
|
"step": 2544
|
|
},
|
|
{
|
|
"epoch": 2.5890132248219735,
|
|
"grad_norm": 0.7129185199737549,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5521,
|
|
"mean_token_accuracy": 0.8318526744842529,
|
|
"num_tokens": 811223836.0,
|
|
"step": 2545
|
|
},
|
|
{
|
|
"epoch": 2.590030518819939,
|
|
"grad_norm": 0.735282838344574,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5631,
|
|
"mean_token_accuracy": 0.8289995789527893,
|
|
"num_tokens": 811552000.0,
|
|
"step": 2546
|
|
},
|
|
{
|
|
"epoch": 2.591047812817904,
|
|
"grad_norm": 0.7413491010665894,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5565,
|
|
"mean_token_accuracy": 0.8306357860565186,
|
|
"num_tokens": 811879361.0,
|
|
"step": 2547
|
|
},
|
|
{
|
|
"epoch": 2.5920651068158698,
|
|
"grad_norm": 0.8138635754585266,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5382,
|
|
"mean_token_accuracy": 0.8339513540267944,
|
|
"num_tokens": 812201258.0,
|
|
"step": 2548
|
|
},
|
|
{
|
|
"epoch": 2.5930824008138353,
|
|
"grad_norm": 0.7724733352661133,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5717,
|
|
"mean_token_accuracy": 0.8256838321685791,
|
|
"num_tokens": 812522643.0,
|
|
"step": 2549
|
|
},
|
|
{
|
|
"epoch": 2.5940996948118005,
|
|
"grad_norm": 0.7342690229415894,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5378,
|
|
"mean_token_accuracy": 0.8354153633117676,
|
|
"num_tokens": 812857034.0,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"epoch": 2.595116988809766,
|
|
"grad_norm": 1.2330756187438965,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5622,
|
|
"mean_token_accuracy": 0.8277719020843506,
|
|
"num_tokens": 813171705.0,
|
|
"step": 2551
|
|
},
|
|
{
|
|
"epoch": 2.596134282807731,
|
|
"grad_norm": 0.8125199675559998,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5628,
|
|
"mean_token_accuracy": 0.8276410698890686,
|
|
"num_tokens": 813478790.0,
|
|
"step": 2552
|
|
},
|
|
{
|
|
"epoch": 2.5971515768056967,
|
|
"grad_norm": 0.7920307517051697,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5571,
|
|
"mean_token_accuracy": 0.830203115940094,
|
|
"num_tokens": 813795523.0,
|
|
"step": 2553
|
|
},
|
|
{
|
|
"epoch": 2.5981688708036623,
|
|
"grad_norm": 0.7683935165405273,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5489,
|
|
"mean_token_accuracy": 0.8320221304893494,
|
|
"num_tokens": 814113911.0,
|
|
"step": 2554
|
|
},
|
|
{
|
|
"epoch": 2.599186164801628,
|
|
"grad_norm": 0.7362149357795715,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5526,
|
|
"mean_token_accuracy": 0.8310428857803345,
|
|
"num_tokens": 814435363.0,
|
|
"step": 2555
|
|
},
|
|
{
|
|
"epoch": 2.600203458799593,
|
|
"grad_norm": 0.7711970210075378,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5632,
|
|
"mean_token_accuracy": 0.8286822438240051,
|
|
"num_tokens": 814760052.0,
|
|
"step": 2556
|
|
},
|
|
{
|
|
"epoch": 2.6012207527975586,
|
|
"grad_norm": 0.7560073137283325,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5448,
|
|
"mean_token_accuracy": 0.8328245878219604,
|
|
"num_tokens": 815081519.0,
|
|
"step": 2557
|
|
},
|
|
{
|
|
"epoch": 2.6022380467955237,
|
|
"grad_norm": 0.7367380857467651,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5649,
|
|
"mean_token_accuracy": 0.8287874460220337,
|
|
"num_tokens": 815404054.0,
|
|
"step": 2558
|
|
},
|
|
{
|
|
"epoch": 2.6032553407934893,
|
|
"grad_norm": 0.7321335077285767,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5502,
|
|
"mean_token_accuracy": 0.8326501846313477,
|
|
"num_tokens": 815726850.0,
|
|
"step": 2559
|
|
},
|
|
{
|
|
"epoch": 2.604272634791455,
|
|
"grad_norm": 0.797602653503418,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5809,
|
|
"mean_token_accuracy": 0.8246169090270996,
|
|
"num_tokens": 816039389.0,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 2.60528992878942,
|
|
"grad_norm": 0.7175667881965637,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5537,
|
|
"mean_token_accuracy": 0.8312521576881409,
|
|
"num_tokens": 816369450.0,
|
|
"step": 2561
|
|
},
|
|
{
|
|
"epoch": 2.6063072227873856,
|
|
"grad_norm": 0.7842046022415161,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5483,
|
|
"mean_token_accuracy": 0.8326440453529358,
|
|
"num_tokens": 816691307.0,
|
|
"step": 2562
|
|
},
|
|
{
|
|
"epoch": 2.6073245167853507,
|
|
"grad_norm": 0.7255687117576599,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5419,
|
|
"mean_token_accuracy": 0.8336960673332214,
|
|
"num_tokens": 817014438.0,
|
|
"step": 2563
|
|
},
|
|
{
|
|
"epoch": 2.6083418107833163,
|
|
"grad_norm": 0.7649070620536804,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5485,
|
|
"mean_token_accuracy": 0.8324762582778931,
|
|
"num_tokens": 817342242.0,
|
|
"step": 2564
|
|
},
|
|
{
|
|
"epoch": 2.609359104781282,
|
|
"grad_norm": 0.8680239915847778,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5605,
|
|
"mean_token_accuracy": 0.8290839195251465,
|
|
"num_tokens": 817643303.0,
|
|
"step": 2565
|
|
},
|
|
{
|
|
"epoch": 2.6103763987792474,
|
|
"grad_norm": 0.74476158618927,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.512,
|
|
"mean_token_accuracy": 0.8423797488212585,
|
|
"num_tokens": 817952612.0,
|
|
"step": 2566
|
|
},
|
|
{
|
|
"epoch": 2.6113936927772126,
|
|
"grad_norm": 0.7176665663719177,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5739,
|
|
"mean_token_accuracy": 0.8251190781593323,
|
|
"num_tokens": 818287143.0,
|
|
"step": 2567
|
|
},
|
|
{
|
|
"epoch": 2.612410986775178,
|
|
"grad_norm": 0.726041316986084,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5398,
|
|
"mean_token_accuracy": 0.833194375038147,
|
|
"num_tokens": 818608748.0,
|
|
"step": 2568
|
|
},
|
|
{
|
|
"epoch": 2.6134282807731433,
|
|
"grad_norm": 0.7279813885688782,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5412,
|
|
"mean_token_accuracy": 0.8337301015853882,
|
|
"num_tokens": 818924573.0,
|
|
"step": 2569
|
|
},
|
|
{
|
|
"epoch": 2.614445574771109,
|
|
"grad_norm": 0.8289151191711426,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5279,
|
|
"mean_token_accuracy": 0.8384712934494019,
|
|
"num_tokens": 819253050.0,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"epoch": 2.6154628687690744,
|
|
"grad_norm": 0.7946772575378418,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5413,
|
|
"mean_token_accuracy": 0.8344959020614624,
|
|
"num_tokens": 819585374.0,
|
|
"step": 2571
|
|
},
|
|
{
|
|
"epoch": 2.6164801627670395,
|
|
"grad_norm": 0.7746971845626831,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.548,
|
|
"mean_token_accuracy": 0.8323616981506348,
|
|
"num_tokens": 819902977.0,
|
|
"step": 2572
|
|
},
|
|
{
|
|
"epoch": 2.617497456765005,
|
|
"grad_norm": 0.735686719417572,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5563,
|
|
"mean_token_accuracy": 0.8307164907455444,
|
|
"num_tokens": 820219660.0,
|
|
"step": 2573
|
|
},
|
|
{
|
|
"epoch": 2.6185147507629702,
|
|
"grad_norm": 0.9371272325515747,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5736,
|
|
"mean_token_accuracy": 0.8262836933135986,
|
|
"num_tokens": 820542132.0,
|
|
"step": 2574
|
|
},
|
|
{
|
|
"epoch": 2.619532044760936,
|
|
"grad_norm": 0.745701253414154,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5258,
|
|
"mean_token_accuracy": 0.8384919166564941,
|
|
"num_tokens": 820860446.0,
|
|
"step": 2575
|
|
},
|
|
{
|
|
"epoch": 2.6205493387589014,
|
|
"grad_norm": 0.734398603439331,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5811,
|
|
"mean_token_accuracy": 0.8236312866210938,
|
|
"num_tokens": 821181422.0,
|
|
"step": 2576
|
|
},
|
|
{
|
|
"epoch": 2.621566632756867,
|
|
"grad_norm": 0.7533614039421082,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5474,
|
|
"mean_token_accuracy": 0.8329817652702332,
|
|
"num_tokens": 821489947.0,
|
|
"step": 2577
|
|
},
|
|
{
|
|
"epoch": 2.622583926754832,
|
|
"grad_norm": 0.7327065467834473,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5521,
|
|
"mean_token_accuracy": 0.8312802314758301,
|
|
"num_tokens": 821809310.0,
|
|
"step": 2578
|
|
},
|
|
{
|
|
"epoch": 2.6236012207527977,
|
|
"grad_norm": 0.7578933238983154,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5509,
|
|
"mean_token_accuracy": 0.8326842188835144,
|
|
"num_tokens": 822129562.0,
|
|
"step": 2579
|
|
},
|
|
{
|
|
"epoch": 2.624618514750763,
|
|
"grad_norm": 0.7132900953292847,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5223,
|
|
"mean_token_accuracy": 0.8397087454795837,
|
|
"num_tokens": 822441494.0,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 2.6256358087487284,
|
|
"grad_norm": 0.783174455165863,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5458,
|
|
"mean_token_accuracy": 0.8319453597068787,
|
|
"num_tokens": 822761536.0,
|
|
"step": 2581
|
|
},
|
|
{
|
|
"epoch": 2.626653102746694,
|
|
"grad_norm": 0.7523170113563538,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5632,
|
|
"mean_token_accuracy": 0.8283185362815857,
|
|
"num_tokens": 823092906.0,
|
|
"step": 2582
|
|
},
|
|
{
|
|
"epoch": 2.627670396744659,
|
|
"grad_norm": 0.796053946018219,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.555,
|
|
"mean_token_accuracy": 0.8302963972091675,
|
|
"num_tokens": 823403147.0,
|
|
"step": 2583
|
|
},
|
|
{
|
|
"epoch": 2.6286876907426246,
|
|
"grad_norm": 0.7748081684112549,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5532,
|
|
"mean_token_accuracy": 0.8307095766067505,
|
|
"num_tokens": 823694330.0,
|
|
"step": 2584
|
|
},
|
|
{
|
|
"epoch": 2.62970498474059,
|
|
"grad_norm": 0.736704409122467,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5386,
|
|
"mean_token_accuracy": 0.8346253037452698,
|
|
"num_tokens": 824009139.0,
|
|
"step": 2585
|
|
},
|
|
{
|
|
"epoch": 2.6307222787385554,
|
|
"grad_norm": 0.7823536992073059,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5254,
|
|
"mean_token_accuracy": 0.8384369015693665,
|
|
"num_tokens": 824329297.0,
|
|
"step": 2586
|
|
},
|
|
{
|
|
"epoch": 2.631739572736521,
|
|
"grad_norm": 0.7874622941017151,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5577,
|
|
"mean_token_accuracy": 0.8292387127876282,
|
|
"num_tokens": 824673484.0,
|
|
"step": 2587
|
|
},
|
|
{
|
|
"epoch": 2.6327568667344865,
|
|
"grad_norm": 0.7718305587768555,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5616,
|
|
"mean_token_accuracy": 0.828931450843811,
|
|
"num_tokens": 825002632.0,
|
|
"step": 2588
|
|
},
|
|
{
|
|
"epoch": 2.6337741607324516,
|
|
"grad_norm": 0.7725831270217896,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5515,
|
|
"mean_token_accuracy": 0.8310902118682861,
|
|
"num_tokens": 825320835.0,
|
|
"step": 2589
|
|
},
|
|
{
|
|
"epoch": 2.634791454730417,
|
|
"grad_norm": 0.736696183681488,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5446,
|
|
"mean_token_accuracy": 0.8332586288452148,
|
|
"num_tokens": 825642437.0,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"epoch": 2.6358087487283823,
|
|
"grad_norm": 0.7683669328689575,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5662,
|
|
"mean_token_accuracy": 0.8285846710205078,
|
|
"num_tokens": 825949889.0,
|
|
"step": 2591
|
|
},
|
|
{
|
|
"epoch": 2.636826042726348,
|
|
"grad_norm": 0.8031424880027771,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5386,
|
|
"mean_token_accuracy": 0.8351372480392456,
|
|
"num_tokens": 826249106.0,
|
|
"step": 2592
|
|
},
|
|
{
|
|
"epoch": 2.6378433367243135,
|
|
"grad_norm": 0.7533608675003052,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5554,
|
|
"mean_token_accuracy": 0.8309059143066406,
|
|
"num_tokens": 826566435.0,
|
|
"step": 2593
|
|
},
|
|
{
|
|
"epoch": 2.6388606307222786,
|
|
"grad_norm": 0.7591979503631592,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5557,
|
|
"mean_token_accuracy": 0.8305896520614624,
|
|
"num_tokens": 826880946.0,
|
|
"step": 2594
|
|
},
|
|
{
|
|
"epoch": 2.639877924720244,
|
|
"grad_norm": 0.7779766321182251,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.546,
|
|
"mean_token_accuracy": 0.8329280614852905,
|
|
"num_tokens": 827204054.0,
|
|
"step": 2595
|
|
},
|
|
{
|
|
"epoch": 2.6408952187182093,
|
|
"grad_norm": 0.7244125008583069,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5241,
|
|
"mean_token_accuracy": 0.838779091835022,
|
|
"num_tokens": 827526209.0,
|
|
"step": 2596
|
|
},
|
|
{
|
|
"epoch": 2.641912512716175,
|
|
"grad_norm": 0.8206683993339539,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5296,
|
|
"mean_token_accuracy": 0.8372079133987427,
|
|
"num_tokens": 827841027.0,
|
|
"step": 2597
|
|
},
|
|
{
|
|
"epoch": 2.6429298067141405,
|
|
"grad_norm": 0.7615863680839539,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5733,
|
|
"mean_token_accuracy": 0.8255772590637207,
|
|
"num_tokens": 828167926.0,
|
|
"step": 2598
|
|
},
|
|
{
|
|
"epoch": 2.643947100712106,
|
|
"grad_norm": 0.7539216876029968,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5451,
|
|
"mean_token_accuracy": 0.8324081897735596,
|
|
"num_tokens": 828492310.0,
|
|
"step": 2599
|
|
},
|
|
{
|
|
"epoch": 2.644964394710071,
|
|
"grad_norm": 0.8165754675865173,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5495,
|
|
"mean_token_accuracy": 0.8313982486724854,
|
|
"num_tokens": 828811219.0,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 2.6459816887080367,
|
|
"grad_norm": 0.7321832180023193,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5183,
|
|
"mean_token_accuracy": 0.8401905298233032,
|
|
"num_tokens": 829121022.0,
|
|
"step": 2601
|
|
},
|
|
{
|
|
"epoch": 2.646998982706002,
|
|
"grad_norm": 0.7699927687644958,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.556,
|
|
"mean_token_accuracy": 0.830227792263031,
|
|
"num_tokens": 829442232.0,
|
|
"step": 2602
|
|
},
|
|
{
|
|
"epoch": 2.6480162767039674,
|
|
"grad_norm": 0.7571011185646057,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5548,
|
|
"mean_token_accuracy": 0.8308849334716797,
|
|
"num_tokens": 829766930.0,
|
|
"step": 2603
|
|
},
|
|
{
|
|
"epoch": 2.649033570701933,
|
|
"grad_norm": 0.7879880666732788,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5552,
|
|
"mean_token_accuracy": 0.8314686417579651,
|
|
"num_tokens": 830084634.0,
|
|
"step": 2604
|
|
},
|
|
{
|
|
"epoch": 2.650050864699898,
|
|
"grad_norm": 0.7368956208229065,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5266,
|
|
"mean_token_accuracy": 0.8385523557662964,
|
|
"num_tokens": 830411922.0,
|
|
"step": 2605
|
|
},
|
|
{
|
|
"epoch": 2.6510681586978637,
|
|
"grad_norm": 0.7690927982330322,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5487,
|
|
"mean_token_accuracy": 0.8323553204536438,
|
|
"num_tokens": 830725899.0,
|
|
"step": 2606
|
|
},
|
|
{
|
|
"epoch": 2.652085452695829,
|
|
"grad_norm": 0.7460976839065552,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5379,
|
|
"mean_token_accuracy": 0.8354268670082092,
|
|
"num_tokens": 831048409.0,
|
|
"step": 2607
|
|
},
|
|
{
|
|
"epoch": 2.6531027466937944,
|
|
"grad_norm": 0.7625808715820312,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5708,
|
|
"mean_token_accuracy": 0.8273561596870422,
|
|
"num_tokens": 831363323.0,
|
|
"step": 2608
|
|
},
|
|
{
|
|
"epoch": 2.65412004069176,
|
|
"grad_norm": 0.7780117392539978,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5633,
|
|
"mean_token_accuracy": 0.82862389087677,
|
|
"num_tokens": 831680359.0,
|
|
"step": 2609
|
|
},
|
|
{
|
|
"epoch": 2.6551373346897256,
|
|
"grad_norm": 0.7825818657875061,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5798,
|
|
"mean_token_accuracy": 0.8251497745513916,
|
|
"num_tokens": 831993691.0,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"epoch": 2.6561546286876907,
|
|
"grad_norm": 0.7896843552589417,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5467,
|
|
"mean_token_accuracy": 0.832328200340271,
|
|
"num_tokens": 832317882.0,
|
|
"step": 2611
|
|
},
|
|
{
|
|
"epoch": 2.6571719226856563,
|
|
"grad_norm": 0.7164852023124695,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5652,
|
|
"mean_token_accuracy": 0.8292514085769653,
|
|
"num_tokens": 832638879.0,
|
|
"step": 2612
|
|
},
|
|
{
|
|
"epoch": 2.6581892166836214,
|
|
"grad_norm": 0.7482007145881653,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5559,
|
|
"mean_token_accuracy": 0.8312879800796509,
|
|
"num_tokens": 832949967.0,
|
|
"step": 2613
|
|
},
|
|
{
|
|
"epoch": 2.659206510681587,
|
|
"grad_norm": 0.7153770923614502,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5603,
|
|
"mean_token_accuracy": 0.829216480255127,
|
|
"num_tokens": 833279512.0,
|
|
"step": 2614
|
|
},
|
|
{
|
|
"epoch": 2.6602238046795526,
|
|
"grad_norm": 0.8966414928436279,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5696,
|
|
"mean_token_accuracy": 0.8260223865509033,
|
|
"num_tokens": 833592791.0,
|
|
"step": 2615
|
|
},
|
|
{
|
|
"epoch": 2.6612410986775177,
|
|
"grad_norm": 0.7625566720962524,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5128,
|
|
"mean_token_accuracy": 0.841602087020874,
|
|
"num_tokens": 833906060.0,
|
|
"step": 2616
|
|
},
|
|
{
|
|
"epoch": 2.6622583926754833,
|
|
"grad_norm": 0.7611061334609985,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5551,
|
|
"mean_token_accuracy": 0.8299546241760254,
|
|
"num_tokens": 834238761.0,
|
|
"step": 2617
|
|
},
|
|
{
|
|
"epoch": 2.6632756866734484,
|
|
"grad_norm": 0.7519465684890747,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5518,
|
|
"mean_token_accuracy": 0.831902027130127,
|
|
"num_tokens": 834556371.0,
|
|
"step": 2618
|
|
},
|
|
{
|
|
"epoch": 2.664292980671414,
|
|
"grad_norm": 0.8466227054595947,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5501,
|
|
"mean_token_accuracy": 0.8316989541053772,
|
|
"num_tokens": 834886463.0,
|
|
"step": 2619
|
|
},
|
|
{
|
|
"epoch": 2.6653102746693795,
|
|
"grad_norm": 0.7515255212783813,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5788,
|
|
"mean_token_accuracy": 0.8234021067619324,
|
|
"num_tokens": 835210506.0,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 2.666327568667345,
|
|
"grad_norm": 0.7361791133880615,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5674,
|
|
"mean_token_accuracy": 0.8277843594551086,
|
|
"num_tokens": 835541684.0,
|
|
"step": 2621
|
|
},
|
|
{
|
|
"epoch": 2.6673448626653102,
|
|
"grad_norm": 0.7728709578514099,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5231,
|
|
"mean_token_accuracy": 0.8393305540084839,
|
|
"num_tokens": 835845878.0,
|
|
"step": 2622
|
|
},
|
|
{
|
|
"epoch": 2.668362156663276,
|
|
"grad_norm": 0.7608600854873657,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5428,
|
|
"mean_token_accuracy": 0.8346510529518127,
|
|
"num_tokens": 836171735.0,
|
|
"step": 2623
|
|
},
|
|
{
|
|
"epoch": 2.669379450661241,
|
|
"grad_norm": 0.7453369498252869,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5432,
|
|
"mean_token_accuracy": 0.8335638046264648,
|
|
"num_tokens": 836490407.0,
|
|
"step": 2624
|
|
},
|
|
{
|
|
"epoch": 2.6703967446592065,
|
|
"grad_norm": 0.765600323677063,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5778,
|
|
"mean_token_accuracy": 0.8250622749328613,
|
|
"num_tokens": 836806889.0,
|
|
"step": 2625
|
|
},
|
|
{
|
|
"epoch": 2.671414038657172,
|
|
"grad_norm": 0.7886757850646973,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5236,
|
|
"mean_token_accuracy": 0.8383769392967224,
|
|
"num_tokens": 837114999.0,
|
|
"step": 2626
|
|
},
|
|
{
|
|
"epoch": 2.672431332655137,
|
|
"grad_norm": 0.7657735347747803,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5521,
|
|
"mean_token_accuracy": 0.8311899900436401,
|
|
"num_tokens": 837446232.0,
|
|
"step": 2627
|
|
},
|
|
{
|
|
"epoch": 2.673448626653103,
|
|
"grad_norm": 0.7709946632385254,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5549,
|
|
"mean_token_accuracy": 0.8304938077926636,
|
|
"num_tokens": 837755582.0,
|
|
"step": 2628
|
|
},
|
|
{
|
|
"epoch": 2.674465920651068,
|
|
"grad_norm": 0.7618497610092163,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5457,
|
|
"mean_token_accuracy": 0.8329755663871765,
|
|
"num_tokens": 838095626.0,
|
|
"step": 2629
|
|
},
|
|
{
|
|
"epoch": 2.6754832146490335,
|
|
"grad_norm": 0.7509616613388062,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5364,
|
|
"mean_token_accuracy": 0.8354884386062622,
|
|
"num_tokens": 838424701.0,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"epoch": 2.676500508646999,
|
|
"grad_norm": 0.7259635329246521,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5088,
|
|
"mean_token_accuracy": 0.8430804014205933,
|
|
"num_tokens": 838749312.0,
|
|
"step": 2631
|
|
},
|
|
{
|
|
"epoch": 2.6775178026449646,
|
|
"grad_norm": 0.7568365931510925,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5476,
|
|
"mean_token_accuracy": 0.8327680826187134,
|
|
"num_tokens": 839056583.0,
|
|
"step": 2632
|
|
},
|
|
{
|
|
"epoch": 2.6785350966429298,
|
|
"grad_norm": 0.7711842656135559,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5324,
|
|
"mean_token_accuracy": 0.8370242714881897,
|
|
"num_tokens": 839375666.0,
|
|
"step": 2633
|
|
},
|
|
{
|
|
"epoch": 2.6795523906408953,
|
|
"grad_norm": 0.8110151886940002,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5408,
|
|
"mean_token_accuracy": 0.8339352607727051,
|
|
"num_tokens": 839686915.0,
|
|
"step": 2634
|
|
},
|
|
{
|
|
"epoch": 2.6805696846388605,
|
|
"grad_norm": 0.7888451218605042,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5525,
|
|
"mean_token_accuracy": 0.8314832448959351,
|
|
"num_tokens": 840014745.0,
|
|
"step": 2635
|
|
},
|
|
{
|
|
"epoch": 2.681586978636826,
|
|
"grad_norm": 0.7647059559822083,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.571,
|
|
"mean_token_accuracy": 0.8256795406341553,
|
|
"num_tokens": 840328107.0,
|
|
"step": 2636
|
|
},
|
|
{
|
|
"epoch": 2.6826042726347916,
|
|
"grad_norm": 0.7674223184585571,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5298,
|
|
"mean_token_accuracy": 0.8363380432128906,
|
|
"num_tokens": 840645654.0,
|
|
"step": 2637
|
|
},
|
|
{
|
|
"epoch": 2.6836215666327567,
|
|
"grad_norm": 0.8335756659507751,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5635,
|
|
"mean_token_accuracy": 0.8278694152832031,
|
|
"num_tokens": 840940420.0,
|
|
"step": 2638
|
|
},
|
|
{
|
|
"epoch": 2.6846388606307223,
|
|
"grad_norm": 0.7549247145652771,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5453,
|
|
"mean_token_accuracy": 0.8322106599807739,
|
|
"num_tokens": 841254500.0,
|
|
"step": 2639
|
|
},
|
|
{
|
|
"epoch": 2.6856561546286875,
|
|
"grad_norm": 0.8285913467407227,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5505,
|
|
"mean_token_accuracy": 0.8317642211914062,
|
|
"num_tokens": 841560795.0,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 2.686673448626653,
|
|
"grad_norm": 0.7781467437744141,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5618,
|
|
"mean_token_accuracy": 0.8295000791549683,
|
|
"num_tokens": 841878106.0,
|
|
"step": 2641
|
|
},
|
|
{
|
|
"epoch": 2.6876907426246186,
|
|
"grad_norm": 0.8468384742736816,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5537,
|
|
"mean_token_accuracy": 0.8307315111160278,
|
|
"num_tokens": 842189540.0,
|
|
"step": 2642
|
|
},
|
|
{
|
|
"epoch": 2.688708036622584,
|
|
"grad_norm": 0.7900856137275696,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5417,
|
|
"mean_token_accuracy": 0.8336501121520996,
|
|
"num_tokens": 842497226.0,
|
|
"step": 2643
|
|
},
|
|
{
|
|
"epoch": 2.6897253306205493,
|
|
"grad_norm": 0.8252914547920227,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5291,
|
|
"mean_token_accuracy": 0.8377597332000732,
|
|
"num_tokens": 842814612.0,
|
|
"step": 2644
|
|
},
|
|
{
|
|
"epoch": 2.690742624618515,
|
|
"grad_norm": 0.7623134255409241,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.569,
|
|
"mean_token_accuracy": 0.827522873878479,
|
|
"num_tokens": 843117888.0,
|
|
"step": 2645
|
|
},
|
|
{
|
|
"epoch": 2.69175991861648,
|
|
"grad_norm": 0.7408959865570068,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.538,
|
|
"mean_token_accuracy": 0.8351221084594727,
|
|
"num_tokens": 843448573.0,
|
|
"step": 2646
|
|
},
|
|
{
|
|
"epoch": 2.6927772126144456,
|
|
"grad_norm": 0.8234544992446899,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5425,
|
|
"mean_token_accuracy": 0.8329236507415771,
|
|
"num_tokens": 843753683.0,
|
|
"step": 2647
|
|
},
|
|
{
|
|
"epoch": 2.693794506612411,
|
|
"grad_norm": 0.7720510363578796,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5369,
|
|
"mean_token_accuracy": 0.8356927633285522,
|
|
"num_tokens": 844084227.0,
|
|
"step": 2648
|
|
},
|
|
{
|
|
"epoch": 2.6948118006103763,
|
|
"grad_norm": 0.7771438360214233,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5642,
|
|
"mean_token_accuracy": 0.8273162841796875,
|
|
"num_tokens": 844401669.0,
|
|
"step": 2649
|
|
},
|
|
{
|
|
"epoch": 2.695829094608342,
|
|
"grad_norm": 0.8102500438690186,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5292,
|
|
"mean_token_accuracy": 0.8374292850494385,
|
|
"num_tokens": 844700667.0,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"epoch": 2.696846388606307,
|
|
"grad_norm": 0.7536808252334595,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5578,
|
|
"mean_token_accuracy": 0.8293795585632324,
|
|
"num_tokens": 845028014.0,
|
|
"step": 2651
|
|
},
|
|
{
|
|
"epoch": 2.6978636826042726,
|
|
"grad_norm": 0.7645767331123352,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5391,
|
|
"mean_token_accuracy": 0.8358734846115112,
|
|
"num_tokens": 845341860.0,
|
|
"step": 2652
|
|
},
|
|
{
|
|
"epoch": 2.698880976602238,
|
|
"grad_norm": 0.7719890475273132,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5431,
|
|
"mean_token_accuracy": 0.8348777890205383,
|
|
"num_tokens": 845655021.0,
|
|
"step": 2653
|
|
},
|
|
{
|
|
"epoch": 2.6998982706002037,
|
|
"grad_norm": 0.7685028910636902,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.533,
|
|
"mean_token_accuracy": 0.8365878462791443,
|
|
"num_tokens": 845981043.0,
|
|
"step": 2654
|
|
},
|
|
{
|
|
"epoch": 2.700915564598169,
|
|
"grad_norm": 0.8190966248512268,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.558,
|
|
"mean_token_accuracy": 0.8298583030700684,
|
|
"num_tokens": 846289014.0,
|
|
"step": 2655
|
|
},
|
|
{
|
|
"epoch": 2.7019328585961344,
|
|
"grad_norm": 0.7613459825515747,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5541,
|
|
"mean_token_accuracy": 0.8314666152000427,
|
|
"num_tokens": 846620497.0,
|
|
"step": 2656
|
|
},
|
|
{
|
|
"epoch": 2.7029501525940995,
|
|
"grad_norm": 0.7502064108848572,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5247,
|
|
"mean_token_accuracy": 0.8381735682487488,
|
|
"num_tokens": 846939843.0,
|
|
"step": 2657
|
|
},
|
|
{
|
|
"epoch": 2.703967446592065,
|
|
"grad_norm": 0.7535653114318848,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5501,
|
|
"mean_token_accuracy": 0.8325021266937256,
|
|
"num_tokens": 847258615.0,
|
|
"step": 2658
|
|
},
|
|
{
|
|
"epoch": 2.7049847405900307,
|
|
"grad_norm": 0.7349403500556946,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.535,
|
|
"mean_token_accuracy": 0.8369818329811096,
|
|
"num_tokens": 847566136.0,
|
|
"step": 2659
|
|
},
|
|
{
|
|
"epoch": 2.706002034587996,
|
|
"grad_norm": 1.5821315050125122,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5717,
|
|
"mean_token_accuracy": 0.8259924650192261,
|
|
"num_tokens": 847880060.0,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 2.7070193285859614,
|
|
"grad_norm": 0.8164228796958923,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5371,
|
|
"mean_token_accuracy": 0.8358385562896729,
|
|
"num_tokens": 848174723.0,
|
|
"step": 2661
|
|
},
|
|
{
|
|
"epoch": 2.7080366225839265,
|
|
"grad_norm": 0.7600139379501343,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5345,
|
|
"mean_token_accuracy": 0.8354610800743103,
|
|
"num_tokens": 848487841.0,
|
|
"step": 2662
|
|
},
|
|
{
|
|
"epoch": 2.709053916581892,
|
|
"grad_norm": 0.707962155342102,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5401,
|
|
"mean_token_accuracy": 0.8348598480224609,
|
|
"num_tokens": 848802975.0,
|
|
"step": 2663
|
|
},
|
|
{
|
|
"epoch": 2.7100712105798577,
|
|
"grad_norm": 0.7490245699882507,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5503,
|
|
"mean_token_accuracy": 0.8320720195770264,
|
|
"num_tokens": 849119947.0,
|
|
"step": 2664
|
|
},
|
|
{
|
|
"epoch": 2.7110885045778232,
|
|
"grad_norm": 0.7445569634437561,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5256,
|
|
"mean_token_accuracy": 0.8383816480636597,
|
|
"num_tokens": 849449730.0,
|
|
"step": 2665
|
|
},
|
|
{
|
|
"epoch": 2.7121057985757884,
|
|
"grad_norm": 0.7544460892677307,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5503,
|
|
"mean_token_accuracy": 0.831578254699707,
|
|
"num_tokens": 849776273.0,
|
|
"step": 2666
|
|
},
|
|
{
|
|
"epoch": 2.713123092573754,
|
|
"grad_norm": 0.7565827965736389,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5535,
|
|
"mean_token_accuracy": 0.8308531045913696,
|
|
"num_tokens": 850089497.0,
|
|
"step": 2667
|
|
},
|
|
{
|
|
"epoch": 2.714140386571719,
|
|
"grad_norm": 0.754690408706665,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5428,
|
|
"mean_token_accuracy": 0.8350880146026611,
|
|
"num_tokens": 850402164.0,
|
|
"step": 2668
|
|
},
|
|
{
|
|
"epoch": 2.7151576805696847,
|
|
"grad_norm": 0.7614121437072754,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5626,
|
|
"mean_token_accuracy": 0.8297010064125061,
|
|
"num_tokens": 850732466.0,
|
|
"step": 2669
|
|
},
|
|
{
|
|
"epoch": 2.7161749745676502,
|
|
"grad_norm": 0.8393460512161255,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5439,
|
|
"mean_token_accuracy": 0.8328503370285034,
|
|
"num_tokens": 851032068.0,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"epoch": 2.7171922685656154,
|
|
"grad_norm": 0.7551735043525696,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5409,
|
|
"mean_token_accuracy": 0.8348469734191895,
|
|
"num_tokens": 851344640.0,
|
|
"step": 2671
|
|
},
|
|
{
|
|
"epoch": 2.718209562563581,
|
|
"grad_norm": 0.7437580823898315,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5306,
|
|
"mean_token_accuracy": 0.8372975587844849,
|
|
"num_tokens": 851650774.0,
|
|
"step": 2672
|
|
},
|
|
{
|
|
"epoch": 2.719226856561546,
|
|
"grad_norm": 0.7455363273620605,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5762,
|
|
"mean_token_accuracy": 0.8253104090690613,
|
|
"num_tokens": 851963425.0,
|
|
"step": 2673
|
|
},
|
|
{
|
|
"epoch": 2.7202441505595116,
|
|
"grad_norm": 0.7925944328308105,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5425,
|
|
"mean_token_accuracy": 0.8333321809768677,
|
|
"num_tokens": 852286142.0,
|
|
"step": 2674
|
|
},
|
|
{
|
|
"epoch": 2.721261444557477,
|
|
"grad_norm": 0.7680412530899048,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5298,
|
|
"mean_token_accuracy": 0.8373532295227051,
|
|
"num_tokens": 852582032.0,
|
|
"step": 2675
|
|
},
|
|
{
|
|
"epoch": 2.722278738555443,
|
|
"grad_norm": 0.7905005216598511,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5284,
|
|
"mean_token_accuracy": 0.8377512693405151,
|
|
"num_tokens": 852884998.0,
|
|
"step": 2676
|
|
},
|
|
{
|
|
"epoch": 2.723296032553408,
|
|
"grad_norm": 0.7383782863616943,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5472,
|
|
"mean_token_accuracy": 0.8325200080871582,
|
|
"num_tokens": 853215576.0,
|
|
"step": 2677
|
|
},
|
|
{
|
|
"epoch": 2.7243133265513735,
|
|
"grad_norm": 0.7262946367263794,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5634,
|
|
"mean_token_accuracy": 0.8283469080924988,
|
|
"num_tokens": 853539145.0,
|
|
"step": 2678
|
|
},
|
|
{
|
|
"epoch": 2.7253306205493386,
|
|
"grad_norm": 0.769798219203949,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5408,
|
|
"mean_token_accuracy": 0.8353898525238037,
|
|
"num_tokens": 853850299.0,
|
|
"step": 2679
|
|
},
|
|
{
|
|
"epoch": 2.726347914547304,
|
|
"grad_norm": 0.8584019541740417,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5555,
|
|
"mean_token_accuracy": 0.8308888673782349,
|
|
"num_tokens": 854162459.0,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"epoch": 2.7273652085452698,
|
|
"grad_norm": 0.77191162109375,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5226,
|
|
"mean_token_accuracy": 0.839631199836731,
|
|
"num_tokens": 854480464.0,
|
|
"step": 2681
|
|
},
|
|
{
|
|
"epoch": 2.728382502543235,
|
|
"grad_norm": 0.7624874711036682,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5434,
|
|
"mean_token_accuracy": 0.8337525725364685,
|
|
"num_tokens": 854791740.0,
|
|
"step": 2682
|
|
},
|
|
{
|
|
"epoch": 2.7293997965412005,
|
|
"grad_norm": 0.7865549921989441,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5271,
|
|
"mean_token_accuracy": 0.8374840021133423,
|
|
"num_tokens": 855114343.0,
|
|
"step": 2683
|
|
},
|
|
{
|
|
"epoch": 2.7304170905391656,
|
|
"grad_norm": 0.7589814066886902,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5257,
|
|
"mean_token_accuracy": 0.8377084732055664,
|
|
"num_tokens": 855437674.0,
|
|
"step": 2684
|
|
},
|
|
{
|
|
"epoch": 2.731434384537131,
|
|
"grad_norm": 0.744863748550415,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5424,
|
|
"mean_token_accuracy": 0.8346495628356934,
|
|
"num_tokens": 855766653.0,
|
|
"step": 2685
|
|
},
|
|
{
|
|
"epoch": 2.7324516785350967,
|
|
"grad_norm": 0.7601133584976196,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5564,
|
|
"mean_token_accuracy": 0.8303238153457642,
|
|
"num_tokens": 856096377.0,
|
|
"step": 2686
|
|
},
|
|
{
|
|
"epoch": 2.7334689725330623,
|
|
"grad_norm": 0.781333327293396,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5593,
|
|
"mean_token_accuracy": 0.828827440738678,
|
|
"num_tokens": 856408236.0,
|
|
"step": 2687
|
|
},
|
|
{
|
|
"epoch": 2.7344862665310274,
|
|
"grad_norm": 0.7458904981613159,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5594,
|
|
"mean_token_accuracy": 0.8281187415122986,
|
|
"num_tokens": 856726568.0,
|
|
"step": 2688
|
|
},
|
|
{
|
|
"epoch": 2.735503560528993,
|
|
"grad_norm": 0.7970263957977295,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5467,
|
|
"mean_token_accuracy": 0.832145631313324,
|
|
"num_tokens": 857033785.0,
|
|
"step": 2689
|
|
},
|
|
{
|
|
"epoch": 2.736520854526958,
|
|
"grad_norm": 0.7536654472351074,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5428,
|
|
"mean_token_accuracy": 0.8339976072311401,
|
|
"num_tokens": 857354595.0,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"epoch": 2.7375381485249237,
|
|
"grad_norm": 0.7576792240142822,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5414,
|
|
"mean_token_accuracy": 0.8351134061813354,
|
|
"num_tokens": 857678702.0,
|
|
"step": 2691
|
|
},
|
|
{
|
|
"epoch": 2.7385554425228893,
|
|
"grad_norm": 0.7687288522720337,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5426,
|
|
"mean_token_accuracy": 0.8334512710571289,
|
|
"num_tokens": 858003534.0,
|
|
"step": 2692
|
|
},
|
|
{
|
|
"epoch": 2.7395727365208544,
|
|
"grad_norm": 0.7608603239059448,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.563,
|
|
"mean_token_accuracy": 0.8300853967666626,
|
|
"num_tokens": 858340027.0,
|
|
"step": 2693
|
|
},
|
|
{
|
|
"epoch": 2.74059003051882,
|
|
"grad_norm": 0.7603384852409363,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5627,
|
|
"mean_token_accuracy": 0.8286716341972351,
|
|
"num_tokens": 858667008.0,
|
|
"step": 2694
|
|
},
|
|
{
|
|
"epoch": 2.741607324516785,
|
|
"grad_norm": 0.7376677989959717,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5522,
|
|
"mean_token_accuracy": 0.8316107988357544,
|
|
"num_tokens": 858997524.0,
|
|
"step": 2695
|
|
},
|
|
{
|
|
"epoch": 2.7426246185147507,
|
|
"grad_norm": 0.7685206532478333,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5454,
|
|
"mean_token_accuracy": 0.8322591781616211,
|
|
"num_tokens": 859308138.0,
|
|
"step": 2696
|
|
},
|
|
{
|
|
"epoch": 2.7436419125127163,
|
|
"grad_norm": 0.8232240676879883,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5553,
|
|
"mean_token_accuracy": 0.8303198218345642,
|
|
"num_tokens": 859624523.0,
|
|
"step": 2697
|
|
},
|
|
{
|
|
"epoch": 2.744659206510682,
|
|
"grad_norm": 0.7209020256996155,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5487,
|
|
"mean_token_accuracy": 0.8319036960601807,
|
|
"num_tokens": 859964920.0,
|
|
"step": 2698
|
|
},
|
|
{
|
|
"epoch": 2.745676500508647,
|
|
"grad_norm": 0.8025298118591309,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5548,
|
|
"mean_token_accuracy": 0.8306405544281006,
|
|
"num_tokens": 860276802.0,
|
|
"step": 2699
|
|
},
|
|
{
|
|
"epoch": 2.7466937945066126,
|
|
"grad_norm": 0.7556748390197754,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5483,
|
|
"mean_token_accuracy": 0.8321038484573364,
|
|
"num_tokens": 860603738.0,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 2.7477110885045777,
|
|
"grad_norm": 0.7994128465652466,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5404,
|
|
"mean_token_accuracy": 0.8345211148262024,
|
|
"num_tokens": 860922428.0,
|
|
"step": 2701
|
|
},
|
|
{
|
|
"epoch": 2.7487283825025433,
|
|
"grad_norm": 0.7713677287101746,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.513,
|
|
"mean_token_accuracy": 0.8429844379425049,
|
|
"num_tokens": 861269373.0,
|
|
"step": 2702
|
|
},
|
|
{
|
|
"epoch": 2.749745676500509,
|
|
"grad_norm": 0.7557693719863892,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5532,
|
|
"mean_token_accuracy": 0.8305192589759827,
|
|
"num_tokens": 861599524.0,
|
|
"step": 2703
|
|
},
|
|
{
|
|
"epoch": 2.750762970498474,
|
|
"grad_norm": 0.7688437700271606,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5587,
|
|
"mean_token_accuracy": 0.829423189163208,
|
|
"num_tokens": 861913031.0,
|
|
"step": 2704
|
|
},
|
|
{
|
|
"epoch": 2.7517802644964395,
|
|
"grad_norm": 0.761762261390686,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5292,
|
|
"mean_token_accuracy": 0.8373246192932129,
|
|
"num_tokens": 862222025.0,
|
|
"step": 2705
|
|
},
|
|
{
|
|
"epoch": 2.7527975584944047,
|
|
"grad_norm": 0.7933812141418457,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5423,
|
|
"mean_token_accuracy": 0.8344542384147644,
|
|
"num_tokens": 862540675.0,
|
|
"step": 2706
|
|
},
|
|
{
|
|
"epoch": 2.7538148524923702,
|
|
"grad_norm": 0.7460007667541504,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5566,
|
|
"mean_token_accuracy": 0.8310642242431641,
|
|
"num_tokens": 862862721.0,
|
|
"step": 2707
|
|
},
|
|
{
|
|
"epoch": 2.754832146490336,
|
|
"grad_norm": 0.765570878982544,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5465,
|
|
"mean_token_accuracy": 0.8333597779273987,
|
|
"num_tokens": 863189315.0,
|
|
"step": 2708
|
|
},
|
|
{
|
|
"epoch": 2.7558494404883014,
|
|
"grad_norm": 0.7401432394981384,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5522,
|
|
"mean_token_accuracy": 0.8326635360717773,
|
|
"num_tokens": 863516158.0,
|
|
"step": 2709
|
|
},
|
|
{
|
|
"epoch": 2.7568667344862665,
|
|
"grad_norm": 0.7682474851608276,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5789,
|
|
"mean_token_accuracy": 0.8240602016448975,
|
|
"num_tokens": 863840254.0,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"epoch": 2.757884028484232,
|
|
"grad_norm": 0.768813967704773,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5459,
|
|
"mean_token_accuracy": 0.8326060771942139,
|
|
"num_tokens": 864145411.0,
|
|
"step": 2711
|
|
},
|
|
{
|
|
"epoch": 2.758901322482197,
|
|
"grad_norm": 0.7643911242485046,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5075,
|
|
"mean_token_accuracy": 0.8430464267730713,
|
|
"num_tokens": 864463131.0,
|
|
"step": 2712
|
|
},
|
|
{
|
|
"epoch": 2.759918616480163,
|
|
"grad_norm": 0.7496541142463684,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5421,
|
|
"mean_token_accuracy": 0.8330836296081543,
|
|
"num_tokens": 864802104.0,
|
|
"step": 2713
|
|
},
|
|
{
|
|
"epoch": 2.7609359104781284,
|
|
"grad_norm": 0.7185865044593811,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5652,
|
|
"mean_token_accuracy": 0.8266865611076355,
|
|
"num_tokens": 865146102.0,
|
|
"step": 2714
|
|
},
|
|
{
|
|
"epoch": 2.7619532044760935,
|
|
"grad_norm": 0.8160783052444458,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5323,
|
|
"mean_token_accuracy": 0.8358222246170044,
|
|
"num_tokens": 865438180.0,
|
|
"step": 2715
|
|
},
|
|
{
|
|
"epoch": 2.762970498474059,
|
|
"grad_norm": 0.7751122117042542,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5538,
|
|
"mean_token_accuracy": 0.8310149908065796,
|
|
"num_tokens": 865761672.0,
|
|
"step": 2716
|
|
},
|
|
{
|
|
"epoch": 2.763987792472024,
|
|
"grad_norm": 1.0494272708892822,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5574,
|
|
"mean_token_accuracy": 0.8299132585525513,
|
|
"num_tokens": 866080300.0,
|
|
"step": 2717
|
|
},
|
|
{
|
|
"epoch": 2.7650050864699898,
|
|
"grad_norm": 0.744733452796936,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5452,
|
|
"mean_token_accuracy": 0.8335196375846863,
|
|
"num_tokens": 866401658.0,
|
|
"step": 2718
|
|
},
|
|
{
|
|
"epoch": 2.7660223804679553,
|
|
"grad_norm": 0.7312759160995483,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.538,
|
|
"mean_token_accuracy": 0.8347940444946289,
|
|
"num_tokens": 866735792.0,
|
|
"step": 2719
|
|
},
|
|
{
|
|
"epoch": 2.767039674465921,
|
|
"grad_norm": 0.7587853670120239,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5505,
|
|
"mean_token_accuracy": 0.8313775062561035,
|
|
"num_tokens": 867069334.0,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"epoch": 2.768056968463886,
|
|
"grad_norm": 0.7180788516998291,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5543,
|
|
"mean_token_accuracy": 0.8317238092422485,
|
|
"num_tokens": 867389752.0,
|
|
"step": 2721
|
|
},
|
|
{
|
|
"epoch": 2.7690742624618516,
|
|
"grad_norm": 0.7803786396980286,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5692,
|
|
"mean_token_accuracy": 0.8263806104660034,
|
|
"num_tokens": 867692997.0,
|
|
"step": 2722
|
|
},
|
|
{
|
|
"epoch": 2.7700915564598168,
|
|
"grad_norm": 0.7313866019248962,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5427,
|
|
"mean_token_accuracy": 0.8340601921081543,
|
|
"num_tokens": 868020290.0,
|
|
"step": 2723
|
|
},
|
|
{
|
|
"epoch": 2.7711088504577823,
|
|
"grad_norm": 0.7689408659934998,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.555,
|
|
"mean_token_accuracy": 0.8297499418258667,
|
|
"num_tokens": 868330610.0,
|
|
"step": 2724
|
|
},
|
|
{
|
|
"epoch": 2.772126144455748,
|
|
"grad_norm": 0.7322613596916199,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5546,
|
|
"mean_token_accuracy": 0.8314656019210815,
|
|
"num_tokens": 868649173.0,
|
|
"step": 2725
|
|
},
|
|
{
|
|
"epoch": 2.773143438453713,
|
|
"grad_norm": 0.7614291310310364,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5379,
|
|
"mean_token_accuracy": 0.8358194828033447,
|
|
"num_tokens": 868970827.0,
|
|
"step": 2726
|
|
},
|
|
{
|
|
"epoch": 2.7741607324516786,
|
|
"grad_norm": 0.7638256549835205,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.529,
|
|
"mean_token_accuracy": 0.837417721748352,
|
|
"num_tokens": 869274987.0,
|
|
"step": 2727
|
|
},
|
|
{
|
|
"epoch": 2.7751780264496437,
|
|
"grad_norm": 0.7863031029701233,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5476,
|
|
"mean_token_accuracy": 0.8326071500778198,
|
|
"num_tokens": 869586556.0,
|
|
"step": 2728
|
|
},
|
|
{
|
|
"epoch": 2.7761953204476093,
|
|
"grad_norm": 0.7495458126068115,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5521,
|
|
"mean_token_accuracy": 0.8311454653739929,
|
|
"num_tokens": 869923056.0,
|
|
"step": 2729
|
|
},
|
|
{
|
|
"epoch": 2.777212614445575,
|
|
"grad_norm": 0.7531720399856567,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5912,
|
|
"mean_token_accuracy": 0.8206604719161987,
|
|
"num_tokens": 870245934.0,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"epoch": 2.7782299084435405,
|
|
"grad_norm": 0.7870125770568848,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5331,
|
|
"mean_token_accuracy": 0.8362097144126892,
|
|
"num_tokens": 870553892.0,
|
|
"step": 2731
|
|
},
|
|
{
|
|
"epoch": 2.7792472024415056,
|
|
"grad_norm": 0.8140343427658081,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5373,
|
|
"mean_token_accuracy": 0.8341789841651917,
|
|
"num_tokens": 870854822.0,
|
|
"step": 2732
|
|
},
|
|
{
|
|
"epoch": 2.780264496439471,
|
|
"grad_norm": 0.7666418552398682,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5732,
|
|
"mean_token_accuracy": 0.8256700038909912,
|
|
"num_tokens": 871168666.0,
|
|
"step": 2733
|
|
},
|
|
{
|
|
"epoch": 2.7812817904374363,
|
|
"grad_norm": 0.7200734615325928,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5352,
|
|
"mean_token_accuracy": 0.8355676531791687,
|
|
"num_tokens": 871494942.0,
|
|
"step": 2734
|
|
},
|
|
{
|
|
"epoch": 2.782299084435402,
|
|
"grad_norm": 0.7705289721488953,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5679,
|
|
"mean_token_accuracy": 0.8269005417823792,
|
|
"num_tokens": 871816010.0,
|
|
"step": 2735
|
|
},
|
|
{
|
|
"epoch": 2.7833163784333674,
|
|
"grad_norm": 0.7768874764442444,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5378,
|
|
"mean_token_accuracy": 0.8357193470001221,
|
|
"num_tokens": 872141817.0,
|
|
"step": 2736
|
|
},
|
|
{
|
|
"epoch": 2.7843336724313326,
|
|
"grad_norm": 0.7831196188926697,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5596,
|
|
"mean_token_accuracy": 0.8289015293121338,
|
|
"num_tokens": 872455047.0,
|
|
"step": 2737
|
|
},
|
|
{
|
|
"epoch": 2.785350966429298,
|
|
"grad_norm": 0.7852113246917725,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5644,
|
|
"mean_token_accuracy": 0.8279637694358826,
|
|
"num_tokens": 872792380.0,
|
|
"step": 2738
|
|
},
|
|
{
|
|
"epoch": 2.7863682604272633,
|
|
"grad_norm": 0.7540766596794128,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5606,
|
|
"mean_token_accuracy": 0.8289576172828674,
|
|
"num_tokens": 873101516.0,
|
|
"step": 2739
|
|
},
|
|
{
|
|
"epoch": 2.787385554425229,
|
|
"grad_norm": 0.7995472550392151,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5445,
|
|
"mean_token_accuracy": 0.8327668309211731,
|
|
"num_tokens": 873396170.0,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"epoch": 2.7884028484231944,
|
|
"grad_norm": 0.7751038074493408,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5575,
|
|
"mean_token_accuracy": 0.8297097682952881,
|
|
"num_tokens": 873721507.0,
|
|
"step": 2741
|
|
},
|
|
{
|
|
"epoch": 2.78942014242116,
|
|
"grad_norm": 0.7843905091285706,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5198,
|
|
"mean_token_accuracy": 0.839174211025238,
|
|
"num_tokens": 874047437.0,
|
|
"step": 2742
|
|
},
|
|
{
|
|
"epoch": 2.790437436419125,
|
|
"grad_norm": 0.7461211681365967,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5446,
|
|
"mean_token_accuracy": 0.8334130644798279,
|
|
"num_tokens": 874355294.0,
|
|
"step": 2743
|
|
},
|
|
{
|
|
"epoch": 2.7914547304170907,
|
|
"grad_norm": 0.7744642496109009,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5389,
|
|
"mean_token_accuracy": 0.8340868353843689,
|
|
"num_tokens": 874686080.0,
|
|
"step": 2744
|
|
},
|
|
{
|
|
"epoch": 2.792472024415056,
|
|
"grad_norm": 0.7094562649726868,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5407,
|
|
"mean_token_accuracy": 0.8346304297447205,
|
|
"num_tokens": 875008971.0,
|
|
"step": 2745
|
|
},
|
|
{
|
|
"epoch": 2.7934893184130214,
|
|
"grad_norm": 0.7940659523010254,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5336,
|
|
"mean_token_accuracy": 0.8369737267494202,
|
|
"num_tokens": 875321096.0,
|
|
"step": 2746
|
|
},
|
|
{
|
|
"epoch": 2.794506612410987,
|
|
"grad_norm": 0.7668308019638062,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5374,
|
|
"mean_token_accuracy": 0.8349287509918213,
|
|
"num_tokens": 875635412.0,
|
|
"step": 2747
|
|
},
|
|
{
|
|
"epoch": 2.795523906408952,
|
|
"grad_norm": 0.8064855933189392,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5433,
|
|
"mean_token_accuracy": 0.8344882130622864,
|
|
"num_tokens": 875976489.0,
|
|
"step": 2748
|
|
},
|
|
{
|
|
"epoch": 2.7965412004069177,
|
|
"grad_norm": 0.7406098246574402,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.577,
|
|
"mean_token_accuracy": 0.8245760202407837,
|
|
"num_tokens": 876298846.0,
|
|
"step": 2749
|
|
},
|
|
{
|
|
"epoch": 2.797558494404883,
|
|
"grad_norm": 0.7594728469848633,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5528,
|
|
"mean_token_accuracy": 0.8307546377182007,
|
|
"num_tokens": 876621222.0,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"epoch": 2.7985757884028484,
|
|
"grad_norm": 0.7740033864974976,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5476,
|
|
"mean_token_accuracy": 0.8324582576751709,
|
|
"num_tokens": 876924869.0,
|
|
"step": 2751
|
|
},
|
|
{
|
|
"epoch": 2.799593082400814,
|
|
"grad_norm": 0.7317264080047607,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5522,
|
|
"mean_token_accuracy": 0.8307446241378784,
|
|
"num_tokens": 877247182.0,
|
|
"step": 2752
|
|
},
|
|
{
|
|
"epoch": 2.8006103763987795,
|
|
"grad_norm": 0.7167682647705078,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5289,
|
|
"mean_token_accuracy": 0.8371304273605347,
|
|
"num_tokens": 877562351.0,
|
|
"step": 2753
|
|
},
|
|
{
|
|
"epoch": 2.8016276703967447,
|
|
"grad_norm": 0.7626464366912842,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5445,
|
|
"mean_token_accuracy": 0.8335856199264526,
|
|
"num_tokens": 877878025.0,
|
|
"step": 2754
|
|
},
|
|
{
|
|
"epoch": 2.8026449643947102,
|
|
"grad_norm": 0.8109871745109558,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5758,
|
|
"mean_token_accuracy": 0.8261090517044067,
|
|
"num_tokens": 878193570.0,
|
|
"step": 2755
|
|
},
|
|
{
|
|
"epoch": 2.8036622583926754,
|
|
"grad_norm": 0.7584338188171387,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5555,
|
|
"mean_token_accuracy": 0.8293702602386475,
|
|
"num_tokens": 878511125.0,
|
|
"step": 2756
|
|
},
|
|
{
|
|
"epoch": 2.804679552390641,
|
|
"grad_norm": 0.8380006551742554,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5128,
|
|
"mean_token_accuracy": 0.8421043753623962,
|
|
"num_tokens": 878816670.0,
|
|
"step": 2757
|
|
},
|
|
{
|
|
"epoch": 2.8056968463886065,
|
|
"grad_norm": 0.7756363153457642,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5477,
|
|
"mean_token_accuracy": 0.8319076299667358,
|
|
"num_tokens": 879137968.0,
|
|
"step": 2758
|
|
},
|
|
{
|
|
"epoch": 2.8067141403865716,
|
|
"grad_norm": 0.774976372718811,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.549,
|
|
"mean_token_accuracy": 0.8325152397155762,
|
|
"num_tokens": 879459289.0,
|
|
"step": 2759
|
|
},
|
|
{
|
|
"epoch": 2.807731434384537,
|
|
"grad_norm": 0.765339195728302,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5511,
|
|
"mean_token_accuracy": 0.8319852352142334,
|
|
"num_tokens": 879793433.0,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"epoch": 2.8087487283825023,
|
|
"grad_norm": 0.72953200340271,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.576,
|
|
"mean_token_accuracy": 0.8244074583053589,
|
|
"num_tokens": 880110903.0,
|
|
"step": 2761
|
|
},
|
|
{
|
|
"epoch": 2.809766022380468,
|
|
"grad_norm": 0.7537474632263184,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5272,
|
|
"mean_token_accuracy": 0.8377683758735657,
|
|
"num_tokens": 880423872.0,
|
|
"step": 2762
|
|
},
|
|
{
|
|
"epoch": 2.8107833163784335,
|
|
"grad_norm": 0.7519904375076294,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5383,
|
|
"mean_token_accuracy": 0.8344398736953735,
|
|
"num_tokens": 880730161.0,
|
|
"step": 2763
|
|
},
|
|
{
|
|
"epoch": 2.811800610376399,
|
|
"grad_norm": 0.7822726368904114,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.542,
|
|
"mean_token_accuracy": 0.8343329429626465,
|
|
"num_tokens": 881055356.0,
|
|
"step": 2764
|
|
},
|
|
{
|
|
"epoch": 2.812817904374364,
|
|
"grad_norm": 0.7529635429382324,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.548,
|
|
"mean_token_accuracy": 0.8321344256401062,
|
|
"num_tokens": 881381086.0,
|
|
"step": 2765
|
|
},
|
|
{
|
|
"epoch": 2.8138351983723298,
|
|
"grad_norm": 0.7604416012763977,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.533,
|
|
"mean_token_accuracy": 0.8359646201133728,
|
|
"num_tokens": 881697429.0,
|
|
"step": 2766
|
|
},
|
|
{
|
|
"epoch": 2.814852492370295,
|
|
"grad_norm": 0.7417066693305969,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5508,
|
|
"mean_token_accuracy": 0.832127571105957,
|
|
"num_tokens": 882012327.0,
|
|
"step": 2767
|
|
},
|
|
{
|
|
"epoch": 2.8158697863682605,
|
|
"grad_norm": 0.7463370561599731,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5346,
|
|
"mean_token_accuracy": 0.8359636664390564,
|
|
"num_tokens": 882333209.0,
|
|
"step": 2768
|
|
},
|
|
{
|
|
"epoch": 2.816887080366226,
|
|
"grad_norm": 0.768548846244812,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5521,
|
|
"mean_token_accuracy": 0.8318994045257568,
|
|
"num_tokens": 882643243.0,
|
|
"step": 2769
|
|
},
|
|
{
|
|
"epoch": 2.817904374364191,
|
|
"grad_norm": 0.7283583283424377,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5371,
|
|
"mean_token_accuracy": 0.8356508016586304,
|
|
"num_tokens": 882969232.0,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"epoch": 2.8189216683621567,
|
|
"grad_norm": 0.7772321701049805,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.541,
|
|
"mean_token_accuracy": 0.8342421054840088,
|
|
"num_tokens": 883279013.0,
|
|
"step": 2771
|
|
},
|
|
{
|
|
"epoch": 2.819938962360122,
|
|
"grad_norm": 0.7435513138771057,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5582,
|
|
"mean_token_accuracy": 0.829302191734314,
|
|
"num_tokens": 883592025.0,
|
|
"step": 2772
|
|
},
|
|
{
|
|
"epoch": 2.8209562563580874,
|
|
"grad_norm": 0.807037889957428,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5263,
|
|
"mean_token_accuracy": 0.8380916118621826,
|
|
"num_tokens": 883908818.0,
|
|
"step": 2773
|
|
},
|
|
{
|
|
"epoch": 2.821973550356053,
|
|
"grad_norm": 0.7611693739891052,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5762,
|
|
"mean_token_accuracy": 0.8256351947784424,
|
|
"num_tokens": 884225754.0,
|
|
"step": 2774
|
|
},
|
|
{
|
|
"epoch": 2.822990844354018,
|
|
"grad_norm": 0.7505136728286743,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.567,
|
|
"mean_token_accuracy": 0.8276066780090332,
|
|
"num_tokens": 884553353.0,
|
|
"step": 2775
|
|
},
|
|
{
|
|
"epoch": 2.8240081383519837,
|
|
"grad_norm": 0.7669854760169983,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.567,
|
|
"mean_token_accuracy": 0.8279123306274414,
|
|
"num_tokens": 884885565.0,
|
|
"step": 2776
|
|
},
|
|
{
|
|
"epoch": 2.8250254323499493,
|
|
"grad_norm": 0.7808510065078735,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5572,
|
|
"mean_token_accuracy": 0.8301238417625427,
|
|
"num_tokens": 885208221.0,
|
|
"step": 2777
|
|
},
|
|
{
|
|
"epoch": 2.8260427263479144,
|
|
"grad_norm": 0.7980822324752808,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5284,
|
|
"mean_token_accuracy": 0.8371199369430542,
|
|
"num_tokens": 885522863.0,
|
|
"step": 2778
|
|
},
|
|
{
|
|
"epoch": 2.82706002034588,
|
|
"grad_norm": 0.7625985741615295,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5466,
|
|
"mean_token_accuracy": 0.8328350782394409,
|
|
"num_tokens": 885850081.0,
|
|
"step": 2779
|
|
},
|
|
{
|
|
"epoch": 2.8280773143438456,
|
|
"grad_norm": 0.7917464375495911,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5456,
|
|
"mean_token_accuracy": 0.8335134983062744,
|
|
"num_tokens": 886186789.0,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"epoch": 2.8290946083418107,
|
|
"grad_norm": 0.7513185143470764,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5544,
|
|
"mean_token_accuracy": 0.8289699554443359,
|
|
"num_tokens": 886496034.0,
|
|
"step": 2781
|
|
},
|
|
{
|
|
"epoch": 2.8301119023397763,
|
|
"grad_norm": 0.7158359289169312,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5383,
|
|
"mean_token_accuracy": 0.8345068693161011,
|
|
"num_tokens": 886817995.0,
|
|
"step": 2782
|
|
},
|
|
{
|
|
"epoch": 2.8311291963377414,
|
|
"grad_norm": 0.8091121912002563,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5396,
|
|
"mean_token_accuracy": 0.8350147008895874,
|
|
"num_tokens": 887120555.0,
|
|
"step": 2783
|
|
},
|
|
{
|
|
"epoch": 2.832146490335707,
|
|
"grad_norm": 0.7498506307601929,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5332,
|
|
"mean_token_accuracy": 0.8364719152450562,
|
|
"num_tokens": 887434554.0,
|
|
"step": 2784
|
|
},
|
|
{
|
|
"epoch": 2.8331637843336726,
|
|
"grad_norm": 0.7743940353393555,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5277,
|
|
"mean_token_accuracy": 0.8380556702613831,
|
|
"num_tokens": 887749092.0,
|
|
"step": 2785
|
|
},
|
|
{
|
|
"epoch": 2.8341810783316377,
|
|
"grad_norm": 0.7444809079170227,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5707,
|
|
"mean_token_accuracy": 0.8259097337722778,
|
|
"num_tokens": 888072082.0,
|
|
"step": 2786
|
|
},
|
|
{
|
|
"epoch": 2.8351983723296033,
|
|
"grad_norm": 0.7832455635070801,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5545,
|
|
"mean_token_accuracy": 0.8302660584449768,
|
|
"num_tokens": 888384734.0,
|
|
"step": 2787
|
|
},
|
|
{
|
|
"epoch": 2.836215666327569,
|
|
"grad_norm": 0.7678145170211792,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.526,
|
|
"mean_token_accuracy": 0.8378886580467224,
|
|
"num_tokens": 888705436.0,
|
|
"step": 2788
|
|
},
|
|
{
|
|
"epoch": 2.837232960325534,
|
|
"grad_norm": 0.7604836821556091,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5372,
|
|
"mean_token_accuracy": 0.8355188369750977,
|
|
"num_tokens": 889025194.0,
|
|
"step": 2789
|
|
},
|
|
{
|
|
"epoch": 2.8382502543234995,
|
|
"grad_norm": 0.7448104023933411,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5507,
|
|
"mean_token_accuracy": 0.8303406238555908,
|
|
"num_tokens": 889348596.0,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"epoch": 2.839267548321465,
|
|
"grad_norm": 0.7357156276702881,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5442,
|
|
"mean_token_accuracy": 0.8347613215446472,
|
|
"num_tokens": 889676197.0,
|
|
"step": 2791
|
|
},
|
|
{
|
|
"epoch": 2.8402848423194302,
|
|
"grad_norm": 0.8427879810333252,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5516,
|
|
"mean_token_accuracy": 0.8312827348709106,
|
|
"num_tokens": 890005117.0,
|
|
"step": 2792
|
|
},
|
|
{
|
|
"epoch": 2.841302136317396,
|
|
"grad_norm": 0.78268963098526,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5489,
|
|
"mean_token_accuracy": 0.8319834470748901,
|
|
"num_tokens": 890316760.0,
|
|
"step": 2793
|
|
},
|
|
{
|
|
"epoch": 2.842319430315361,
|
|
"grad_norm": 0.7581083178520203,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5437,
|
|
"mean_token_accuracy": 0.832615852355957,
|
|
"num_tokens": 890620871.0,
|
|
"step": 2794
|
|
},
|
|
{
|
|
"epoch": 2.8433367243133265,
|
|
"grad_norm": 0.7503229975700378,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5397,
|
|
"mean_token_accuracy": 0.8342493176460266,
|
|
"num_tokens": 890936449.0,
|
|
"step": 2795
|
|
},
|
|
{
|
|
"epoch": 2.844354018311292,
|
|
"grad_norm": 0.7486345767974854,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5435,
|
|
"mean_token_accuracy": 0.8329163789749146,
|
|
"num_tokens": 891269034.0,
|
|
"step": 2796
|
|
},
|
|
{
|
|
"epoch": 2.845371312309257,
|
|
"grad_norm": 0.7206019163131714,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5395,
|
|
"mean_token_accuracy": 0.834210991859436,
|
|
"num_tokens": 891601141.0,
|
|
"step": 2797
|
|
},
|
|
{
|
|
"epoch": 2.846388606307223,
|
|
"grad_norm": 0.7604000568389893,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.57,
|
|
"mean_token_accuracy": 0.8274968862533569,
|
|
"num_tokens": 891911699.0,
|
|
"step": 2798
|
|
},
|
|
{
|
|
"epoch": 2.8474059003051884,
|
|
"grad_norm": 0.743278443813324,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5497,
|
|
"mean_token_accuracy": 0.8318988680839539,
|
|
"num_tokens": 892236979.0,
|
|
"step": 2799
|
|
},
|
|
{
|
|
"epoch": 2.8484231943031535,
|
|
"grad_norm": 0.787842869758606,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5455,
|
|
"mean_token_accuracy": 0.8333825469017029,
|
|
"num_tokens": 892536633.0,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 2.849440488301119,
|
|
"grad_norm": 0.7452677488327026,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5345,
|
|
"mean_token_accuracy": 0.8356083631515503,
|
|
"num_tokens": 892862170.0,
|
|
"step": 2801
|
|
},
|
|
{
|
|
"epoch": 2.8504577822990846,
|
|
"grad_norm": 0.7698550224304199,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5563,
|
|
"mean_token_accuracy": 0.8296544551849365,
|
|
"num_tokens": 893170520.0,
|
|
"step": 2802
|
|
},
|
|
{
|
|
"epoch": 2.8514750762970498,
|
|
"grad_norm": 0.7370395064353943,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5528,
|
|
"mean_token_accuracy": 0.8322560787200928,
|
|
"num_tokens": 893480701.0,
|
|
"step": 2803
|
|
},
|
|
{
|
|
"epoch": 2.8524923702950153,
|
|
"grad_norm": 0.7674466371536255,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5761,
|
|
"mean_token_accuracy": 0.8261927962303162,
|
|
"num_tokens": 893795202.0,
|
|
"step": 2804
|
|
},
|
|
{
|
|
"epoch": 2.8535096642929805,
|
|
"grad_norm": 0.7611037492752075,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5366,
|
|
"mean_token_accuracy": 0.8353677988052368,
|
|
"num_tokens": 894108299.0,
|
|
"step": 2805
|
|
},
|
|
{
|
|
"epoch": 2.854526958290946,
|
|
"grad_norm": 0.7476277351379395,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5625,
|
|
"mean_token_accuracy": 0.82884681224823,
|
|
"num_tokens": 894419933.0,
|
|
"step": 2806
|
|
},
|
|
{
|
|
"epoch": 2.8555442522889116,
|
|
"grad_norm": 0.737981379032135,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5426,
|
|
"mean_token_accuracy": 0.8351168632507324,
|
|
"num_tokens": 894741124.0,
|
|
"step": 2807
|
|
},
|
|
{
|
|
"epoch": 2.8565615462868768,
|
|
"grad_norm": 0.7774837017059326,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5734,
|
|
"mean_token_accuracy": 0.8259122371673584,
|
|
"num_tokens": 895056647.0,
|
|
"step": 2808
|
|
},
|
|
{
|
|
"epoch": 2.8575788402848423,
|
|
"grad_norm": 0.777219831943512,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5823,
|
|
"mean_token_accuracy": 0.8236808776855469,
|
|
"num_tokens": 895380518.0,
|
|
"step": 2809
|
|
},
|
|
{
|
|
"epoch": 2.8585961342828075,
|
|
"grad_norm": 0.7577489018440247,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5456,
|
|
"mean_token_accuracy": 0.833845317363739,
|
|
"num_tokens": 895702008.0,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"epoch": 2.859613428280773,
|
|
"grad_norm": 0.7717519998550415,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5208,
|
|
"mean_token_accuracy": 0.8394262194633484,
|
|
"num_tokens": 896011546.0,
|
|
"step": 2811
|
|
},
|
|
{
|
|
"epoch": 2.8606307222787386,
|
|
"grad_norm": 0.7280170917510986,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5353,
|
|
"mean_token_accuracy": 0.8363389372825623,
|
|
"num_tokens": 896329246.0,
|
|
"step": 2812
|
|
},
|
|
{
|
|
"epoch": 2.861648016276704,
|
|
"grad_norm": 0.741651177406311,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5274,
|
|
"mean_token_accuracy": 0.8382123112678528,
|
|
"num_tokens": 896652517.0,
|
|
"step": 2813
|
|
},
|
|
{
|
|
"epoch": 2.8626653102746693,
|
|
"grad_norm": 0.7922471761703491,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5365,
|
|
"mean_token_accuracy": 0.8348798155784607,
|
|
"num_tokens": 896970551.0,
|
|
"step": 2814
|
|
},
|
|
{
|
|
"epoch": 2.863682604272635,
|
|
"grad_norm": 0.7952819466590881,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5446,
|
|
"mean_token_accuracy": 0.8329468965530396,
|
|
"num_tokens": 897291892.0,
|
|
"step": 2815
|
|
},
|
|
{
|
|
"epoch": 2.8646998982706,
|
|
"grad_norm": 0.7516127824783325,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5476,
|
|
"mean_token_accuracy": 0.8317283391952515,
|
|
"num_tokens": 897605769.0,
|
|
"step": 2816
|
|
},
|
|
{
|
|
"epoch": 2.8657171922685656,
|
|
"grad_norm": 0.7459313869476318,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5301,
|
|
"mean_token_accuracy": 0.8376507759094238,
|
|
"num_tokens": 897914249.0,
|
|
"step": 2817
|
|
},
|
|
{
|
|
"epoch": 2.866734486266531,
|
|
"grad_norm": 0.7906985878944397,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5407,
|
|
"mean_token_accuracy": 0.8333652019500732,
|
|
"num_tokens": 898228863.0,
|
|
"step": 2818
|
|
},
|
|
{
|
|
"epoch": 2.8677517802644963,
|
|
"grad_norm": 0.7567861676216125,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5352,
|
|
"mean_token_accuracy": 0.8349796533584595,
|
|
"num_tokens": 898530951.0,
|
|
"step": 2819
|
|
},
|
|
{
|
|
"epoch": 2.868769074262462,
|
|
"grad_norm": 0.9229597449302673,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.541,
|
|
"mean_token_accuracy": 0.8336858749389648,
|
|
"num_tokens": 898851712.0,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"epoch": 2.869786368260427,
|
|
"grad_norm": 0.8401355743408203,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5463,
|
|
"mean_token_accuracy": 0.8318575620651245,
|
|
"num_tokens": 899173646.0,
|
|
"step": 2821
|
|
},
|
|
{
|
|
"epoch": 2.8708036622583926,
|
|
"grad_norm": 0.7329902648925781,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5427,
|
|
"mean_token_accuracy": 0.834531843662262,
|
|
"num_tokens": 899486743.0,
|
|
"step": 2822
|
|
},
|
|
{
|
|
"epoch": 2.871820956256358,
|
|
"grad_norm": 0.7627372145652771,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5443,
|
|
"mean_token_accuracy": 0.8335117101669312,
|
|
"num_tokens": 899809627.0,
|
|
"step": 2823
|
|
},
|
|
{
|
|
"epoch": 2.8728382502543237,
|
|
"grad_norm": 0.7298281192779541,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5487,
|
|
"mean_token_accuracy": 0.8327968120574951,
|
|
"num_tokens": 900137262.0,
|
|
"step": 2824
|
|
},
|
|
{
|
|
"epoch": 2.873855544252289,
|
|
"grad_norm": 0.7618030905723572,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5579,
|
|
"mean_token_accuracy": 0.8297356367111206,
|
|
"num_tokens": 900459708.0,
|
|
"step": 2825
|
|
},
|
|
{
|
|
"epoch": 2.8748728382502544,
|
|
"grad_norm": 0.7472760081291199,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5566,
|
|
"mean_token_accuracy": 0.8299884796142578,
|
|
"num_tokens": 900777165.0,
|
|
"step": 2826
|
|
},
|
|
{
|
|
"epoch": 2.8758901322482195,
|
|
"grad_norm": 0.7818968892097473,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5309,
|
|
"mean_token_accuracy": 0.8371264934539795,
|
|
"num_tokens": 901100446.0,
|
|
"step": 2827
|
|
},
|
|
{
|
|
"epoch": 2.876907426246185,
|
|
"grad_norm": 0.7526576519012451,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5294,
|
|
"mean_token_accuracy": 0.8375152945518494,
|
|
"num_tokens": 901413081.0,
|
|
"step": 2828
|
|
},
|
|
{
|
|
"epoch": 2.8779247202441507,
|
|
"grad_norm": 0.8172447085380554,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5356,
|
|
"mean_token_accuracy": 0.8350641131401062,
|
|
"num_tokens": 901726168.0,
|
|
"step": 2829
|
|
},
|
|
{
|
|
"epoch": 2.878942014242116,
|
|
"grad_norm": 0.7760986089706421,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5509,
|
|
"mean_token_accuracy": 0.8311924934387207,
|
|
"num_tokens": 902035368.0,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"epoch": 2.8799593082400814,
|
|
"grad_norm": 0.7464948892593384,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5329,
|
|
"mean_token_accuracy": 0.8369188904762268,
|
|
"num_tokens": 902361599.0,
|
|
"step": 2831
|
|
},
|
|
{
|
|
"epoch": 2.8809766022380465,
|
|
"grad_norm": 0.7712615132331848,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5443,
|
|
"mean_token_accuracy": 0.833279550075531,
|
|
"num_tokens": 902671212.0,
|
|
"step": 2832
|
|
},
|
|
{
|
|
"epoch": 2.881993896236012,
|
|
"grad_norm": 0.8632667064666748,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5691,
|
|
"mean_token_accuracy": 0.8271939158439636,
|
|
"num_tokens": 902987411.0,
|
|
"step": 2833
|
|
},
|
|
{
|
|
"epoch": 2.8830111902339777,
|
|
"grad_norm": 0.7410888075828552,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5557,
|
|
"mean_token_accuracy": 0.8314687013626099,
|
|
"num_tokens": 903308282.0,
|
|
"step": 2834
|
|
},
|
|
{
|
|
"epoch": 2.8840284842319432,
|
|
"grad_norm": 0.7582542896270752,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5473,
|
|
"mean_token_accuracy": 0.8322716355323792,
|
|
"num_tokens": 903628314.0,
|
|
"step": 2835
|
|
},
|
|
{
|
|
"epoch": 2.8850457782299084,
|
|
"grad_norm": 0.7456071972846985,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5441,
|
|
"mean_token_accuracy": 0.8332905769348145,
|
|
"num_tokens": 903947269.0,
|
|
"step": 2836
|
|
},
|
|
{
|
|
"epoch": 2.886063072227874,
|
|
"grad_norm": 0.7716429829597473,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5563,
|
|
"mean_token_accuracy": 0.8303126692771912,
|
|
"num_tokens": 904246591.0,
|
|
"step": 2837
|
|
},
|
|
{
|
|
"epoch": 2.887080366225839,
|
|
"grad_norm": 0.7811114192008972,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5485,
|
|
"mean_token_accuracy": 0.8313155770301819,
|
|
"num_tokens": 904565888.0,
|
|
"step": 2838
|
|
},
|
|
{
|
|
"epoch": 2.8880976602238047,
|
|
"grad_norm": 0.8217208981513977,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5709,
|
|
"mean_token_accuracy": 0.8256539702415466,
|
|
"num_tokens": 904888197.0,
|
|
"step": 2839
|
|
},
|
|
{
|
|
"epoch": 2.8891149542217702,
|
|
"grad_norm": 0.7821131944656372,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5544,
|
|
"mean_token_accuracy": 0.8305214643478394,
|
|
"num_tokens": 905218458.0,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"epoch": 2.8901322482197354,
|
|
"grad_norm": 1.0390526056289673,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5741,
|
|
"mean_token_accuracy": 0.8258970379829407,
|
|
"num_tokens": 905543712.0,
|
|
"step": 2841
|
|
},
|
|
{
|
|
"epoch": 2.891149542217701,
|
|
"grad_norm": 0.8377124667167664,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5447,
|
|
"mean_token_accuracy": 0.8334788084030151,
|
|
"num_tokens": 905848988.0,
|
|
"step": 2842
|
|
},
|
|
{
|
|
"epoch": 2.892166836215666,
|
|
"grad_norm": 0.7347882390022278,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5507,
|
|
"mean_token_accuracy": 0.8319611549377441,
|
|
"num_tokens": 906155224.0,
|
|
"step": 2843
|
|
},
|
|
{
|
|
"epoch": 2.8931841302136316,
|
|
"grad_norm": 0.7705765962600708,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5615,
|
|
"mean_token_accuracy": 0.8289397954940796,
|
|
"num_tokens": 906474923.0,
|
|
"step": 2844
|
|
},
|
|
{
|
|
"epoch": 2.894201424211597,
|
|
"grad_norm": 0.7141281962394714,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5402,
|
|
"mean_token_accuracy": 0.8342429399490356,
|
|
"num_tokens": 906788218.0,
|
|
"step": 2845
|
|
},
|
|
{
|
|
"epoch": 2.895218718209563,
|
|
"grad_norm": 0.7942697405815125,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5551,
|
|
"mean_token_accuracy": 0.8304331302642822,
|
|
"num_tokens": 907091206.0,
|
|
"step": 2846
|
|
},
|
|
{
|
|
"epoch": 2.896236012207528,
|
|
"grad_norm": 0.825537919998169,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5476,
|
|
"mean_token_accuracy": 0.8328003883361816,
|
|
"num_tokens": 907426047.0,
|
|
"step": 2847
|
|
},
|
|
{
|
|
"epoch": 2.8972533062054935,
|
|
"grad_norm": 0.7048806548118591,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5419,
|
|
"mean_token_accuracy": 0.8351121544837952,
|
|
"num_tokens": 907753455.0,
|
|
"step": 2848
|
|
},
|
|
{
|
|
"epoch": 2.8982706002034586,
|
|
"grad_norm": 0.7680245637893677,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5238,
|
|
"mean_token_accuracy": 0.8381252288818359,
|
|
"num_tokens": 908071238.0,
|
|
"step": 2849
|
|
},
|
|
{
|
|
"epoch": 2.899287894201424,
|
|
"grad_norm": 0.774487316608429,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5438,
|
|
"mean_token_accuracy": 0.8338207006454468,
|
|
"num_tokens": 908384365.0,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"epoch": 2.9003051881993898,
|
|
"grad_norm": 0.8231586217880249,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5391,
|
|
"mean_token_accuracy": 0.8338854908943176,
|
|
"num_tokens": 908692929.0,
|
|
"step": 2851
|
|
},
|
|
{
|
|
"epoch": 2.901322482197355,
|
|
"grad_norm": 0.7833102941513062,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5383,
|
|
"mean_token_accuracy": 0.8346694707870483,
|
|
"num_tokens": 909009416.0,
|
|
"step": 2852
|
|
},
|
|
{
|
|
"epoch": 2.9023397761953205,
|
|
"grad_norm": 0.7677398920059204,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5186,
|
|
"mean_token_accuracy": 0.8401553630828857,
|
|
"num_tokens": 909326555.0,
|
|
"step": 2853
|
|
},
|
|
{
|
|
"epoch": 2.9033570701932856,
|
|
"grad_norm": 0.7626712918281555,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5244,
|
|
"mean_token_accuracy": 0.8389687538146973,
|
|
"num_tokens": 909639003.0,
|
|
"step": 2854
|
|
},
|
|
{
|
|
"epoch": 2.904374364191251,
|
|
"grad_norm": 0.8620499968528748,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5594,
|
|
"mean_token_accuracy": 0.8299873471260071,
|
|
"num_tokens": 909943894.0,
|
|
"step": 2855
|
|
},
|
|
{
|
|
"epoch": 2.9053916581892167,
|
|
"grad_norm": 0.7735525369644165,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5531,
|
|
"mean_token_accuracy": 0.8311653733253479,
|
|
"num_tokens": 910282610.0,
|
|
"step": 2856
|
|
},
|
|
{
|
|
"epoch": 2.9064089521871823,
|
|
"grad_norm": 0.7400574684143066,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5482,
|
|
"mean_token_accuracy": 0.8316192030906677,
|
|
"num_tokens": 910603096.0,
|
|
"step": 2857
|
|
},
|
|
{
|
|
"epoch": 2.9074262461851474,
|
|
"grad_norm": 0.7685510516166687,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5411,
|
|
"mean_token_accuracy": 0.8340034484863281,
|
|
"num_tokens": 910907889.0,
|
|
"step": 2858
|
|
},
|
|
{
|
|
"epoch": 2.908443540183113,
|
|
"grad_norm": 0.9150114059448242,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5466,
|
|
"mean_token_accuracy": 0.8319932222366333,
|
|
"num_tokens": 911219506.0,
|
|
"step": 2859
|
|
},
|
|
{
|
|
"epoch": 2.909460834181078,
|
|
"grad_norm": 0.7258608937263489,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.552,
|
|
"mean_token_accuracy": 0.8304907083511353,
|
|
"num_tokens": 911543675.0,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"epoch": 2.9104781281790437,
|
|
"grad_norm": 0.7376611828804016,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5464,
|
|
"mean_token_accuracy": 0.8338150382041931,
|
|
"num_tokens": 911875094.0,
|
|
"step": 2861
|
|
},
|
|
{
|
|
"epoch": 2.9114954221770093,
|
|
"grad_norm": 0.823221743106842,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5582,
|
|
"mean_token_accuracy": 0.8295491933822632,
|
|
"num_tokens": 912185571.0,
|
|
"step": 2862
|
|
},
|
|
{
|
|
"epoch": 2.9125127161749744,
|
|
"grad_norm": 0.7491183280944824,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5429,
|
|
"mean_token_accuracy": 0.8328477740287781,
|
|
"num_tokens": 912513109.0,
|
|
"step": 2863
|
|
},
|
|
{
|
|
"epoch": 2.91353001017294,
|
|
"grad_norm": 0.7279940843582153,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5511,
|
|
"mean_token_accuracy": 0.8309471011161804,
|
|
"num_tokens": 912850119.0,
|
|
"step": 2864
|
|
},
|
|
{
|
|
"epoch": 2.914547304170905,
|
|
"grad_norm": 0.7640823721885681,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5385,
|
|
"mean_token_accuracy": 0.8364745378494263,
|
|
"num_tokens": 913178881.0,
|
|
"step": 2865
|
|
},
|
|
{
|
|
"epoch": 2.9155645981688707,
|
|
"grad_norm": 0.7615811228752136,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5465,
|
|
"mean_token_accuracy": 0.833329975605011,
|
|
"num_tokens": 913518334.0,
|
|
"step": 2866
|
|
},
|
|
{
|
|
"epoch": 2.9165818921668363,
|
|
"grad_norm": 0.7493634223937988,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5414,
|
|
"mean_token_accuracy": 0.8334664702415466,
|
|
"num_tokens": 913821428.0,
|
|
"step": 2867
|
|
},
|
|
{
|
|
"epoch": 2.917599186164802,
|
|
"grad_norm": 0.7722597122192383,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5263,
|
|
"mean_token_accuracy": 0.8375759720802307,
|
|
"num_tokens": 914126038.0,
|
|
"step": 2868
|
|
},
|
|
{
|
|
"epoch": 2.918616480162767,
|
|
"grad_norm": 0.7251677513122559,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5553,
|
|
"mean_token_accuracy": 0.8307819366455078,
|
|
"num_tokens": 914459291.0,
|
|
"step": 2869
|
|
},
|
|
{
|
|
"epoch": 2.9196337741607326,
|
|
"grad_norm": 0.7932417392730713,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5181,
|
|
"mean_token_accuracy": 0.8403124809265137,
|
|
"num_tokens": 914765805.0,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"epoch": 2.9206510681586977,
|
|
"grad_norm": 0.8020852208137512,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5734,
|
|
"mean_token_accuracy": 0.8264729976654053,
|
|
"num_tokens": 915083509.0,
|
|
"step": 2871
|
|
},
|
|
{
|
|
"epoch": 2.9216683621566633,
|
|
"grad_norm": 0.7599645256996155,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5377,
|
|
"mean_token_accuracy": 0.8344203233718872,
|
|
"num_tokens": 915391939.0,
|
|
"step": 2872
|
|
},
|
|
{
|
|
"epoch": 2.922685656154629,
|
|
"grad_norm": 0.760013222694397,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5425,
|
|
"mean_token_accuracy": 0.8350082039833069,
|
|
"num_tokens": 915708594.0,
|
|
"step": 2873
|
|
},
|
|
{
|
|
"epoch": 2.923702950152594,
|
|
"grad_norm": 0.7909659743309021,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5445,
|
|
"mean_token_accuracy": 0.8332921266555786,
|
|
"num_tokens": 915997116.0,
|
|
"step": 2874
|
|
},
|
|
{
|
|
"epoch": 2.9247202441505595,
|
|
"grad_norm": 0.7611994743347168,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5718,
|
|
"mean_token_accuracy": 0.8260282278060913,
|
|
"num_tokens": 916313182.0,
|
|
"step": 2875
|
|
},
|
|
{
|
|
"epoch": 2.9257375381485247,
|
|
"grad_norm": 0.8146640062332153,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.541,
|
|
"mean_token_accuracy": 0.8340287208557129,
|
|
"num_tokens": 916617077.0,
|
|
"step": 2876
|
|
},
|
|
{
|
|
"epoch": 2.9267548321464902,
|
|
"grad_norm": 0.7757108211517334,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5398,
|
|
"mean_token_accuracy": 0.8344134092330933,
|
|
"num_tokens": 916935162.0,
|
|
"step": 2877
|
|
},
|
|
{
|
|
"epoch": 2.927772126144456,
|
|
"grad_norm": 0.7191690802574158,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5747,
|
|
"mean_token_accuracy": 0.8260297775268555,
|
|
"num_tokens": 917266918.0,
|
|
"step": 2878
|
|
},
|
|
{
|
|
"epoch": 2.9287894201424214,
|
|
"grad_norm": 0.774037778377533,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5259,
|
|
"mean_token_accuracy": 0.8385676741600037,
|
|
"num_tokens": 917586569.0,
|
|
"step": 2879
|
|
},
|
|
{
|
|
"epoch": 2.9298067141403865,
|
|
"grad_norm": 0.7555479407310486,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5636,
|
|
"mean_token_accuracy": 0.8282998204231262,
|
|
"num_tokens": 917913379.0,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"epoch": 2.930824008138352,
|
|
"grad_norm": 0.7959446310997009,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5602,
|
|
"mean_token_accuracy": 0.8281709551811218,
|
|
"num_tokens": 918219033.0,
|
|
"step": 2881
|
|
},
|
|
{
|
|
"epoch": 2.931841302136317,
|
|
"grad_norm": 0.7435408234596252,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5403,
|
|
"mean_token_accuracy": 0.8339255452156067,
|
|
"num_tokens": 918538998.0,
|
|
"step": 2882
|
|
},
|
|
{
|
|
"epoch": 2.932858596134283,
|
|
"grad_norm": 0.7767910361289978,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5318,
|
|
"mean_token_accuracy": 0.8368157744407654,
|
|
"num_tokens": 918848120.0,
|
|
"step": 2883
|
|
},
|
|
{
|
|
"epoch": 2.9338758901322484,
|
|
"grad_norm": 0.7454608082771301,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5552,
|
|
"mean_token_accuracy": 0.8298519849777222,
|
|
"num_tokens": 919158162.0,
|
|
"step": 2884
|
|
},
|
|
{
|
|
"epoch": 2.9348931841302135,
|
|
"grad_norm": 0.8529176115989685,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5591,
|
|
"mean_token_accuracy": 0.8282391428947449,
|
|
"num_tokens": 919454435.0,
|
|
"step": 2885
|
|
},
|
|
{
|
|
"epoch": 2.935910478128179,
|
|
"grad_norm": 0.996655285358429,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5465,
|
|
"mean_token_accuracy": 0.8331795930862427,
|
|
"num_tokens": 919767371.0,
|
|
"step": 2886
|
|
},
|
|
{
|
|
"epoch": 2.936927772126144,
|
|
"grad_norm": 0.8495599627494812,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5285,
|
|
"mean_token_accuracy": 0.8371427655220032,
|
|
"num_tokens": 920097231.0,
|
|
"step": 2887
|
|
},
|
|
{
|
|
"epoch": 2.9379450661241098,
|
|
"grad_norm": 0.7650257349014282,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5449,
|
|
"mean_token_accuracy": 0.8332249522209167,
|
|
"num_tokens": 920404124.0,
|
|
"step": 2888
|
|
},
|
|
{
|
|
"epoch": 2.9389623601220753,
|
|
"grad_norm": 0.799810528755188,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5786,
|
|
"mean_token_accuracy": 0.8236322402954102,
|
|
"num_tokens": 920723542.0,
|
|
"step": 2889
|
|
},
|
|
{
|
|
"epoch": 2.939979654120041,
|
|
"grad_norm": 0.8185797929763794,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5538,
|
|
"mean_token_accuracy": 0.8314552903175354,
|
|
"num_tokens": 921032468.0,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"epoch": 2.940996948118006,
|
|
"grad_norm": 0.7548120617866516,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5334,
|
|
"mean_token_accuracy": 0.8355669975280762,
|
|
"num_tokens": 921343839.0,
|
|
"step": 2891
|
|
},
|
|
{
|
|
"epoch": 2.9420142421159716,
|
|
"grad_norm": 0.7259658575057983,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5542,
|
|
"mean_token_accuracy": 0.8298841714859009,
|
|
"num_tokens": 921677191.0,
|
|
"step": 2892
|
|
},
|
|
{
|
|
"epoch": 2.9430315361139368,
|
|
"grad_norm": 0.8542818427085876,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5628,
|
|
"mean_token_accuracy": 0.8286068439483643,
|
|
"num_tokens": 921985325.0,
|
|
"step": 2893
|
|
},
|
|
{
|
|
"epoch": 2.9440488301119023,
|
|
"grad_norm": 0.7178732752799988,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5262,
|
|
"mean_token_accuracy": 0.8378465175628662,
|
|
"num_tokens": 922307545.0,
|
|
"step": 2894
|
|
},
|
|
{
|
|
"epoch": 2.945066124109868,
|
|
"grad_norm": 0.7427535653114319,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5595,
|
|
"mean_token_accuracy": 0.8293996453285217,
|
|
"num_tokens": 922627414.0,
|
|
"step": 2895
|
|
},
|
|
{
|
|
"epoch": 2.946083418107833,
|
|
"grad_norm": 0.7810771465301514,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5643,
|
|
"mean_token_accuracy": 0.8286993503570557,
|
|
"num_tokens": 922940862.0,
|
|
"step": 2896
|
|
},
|
|
{
|
|
"epoch": 2.9471007121057986,
|
|
"grad_norm": 0.7892922759056091,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5536,
|
|
"mean_token_accuracy": 0.8313384056091309,
|
|
"num_tokens": 923268455.0,
|
|
"step": 2897
|
|
},
|
|
{
|
|
"epoch": 2.9481180061037637,
|
|
"grad_norm": 0.759972095489502,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5409,
|
|
"mean_token_accuracy": 0.8341248035430908,
|
|
"num_tokens": 923581013.0,
|
|
"step": 2898
|
|
},
|
|
{
|
|
"epoch": 2.9491353001017293,
|
|
"grad_norm": 0.7308177351951599,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5501,
|
|
"mean_token_accuracy": 0.8310506343841553,
|
|
"num_tokens": 923889387.0,
|
|
"step": 2899
|
|
},
|
|
{
|
|
"epoch": 2.950152594099695,
|
|
"grad_norm": 0.7997915148735046,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5468,
|
|
"mean_token_accuracy": 0.8328058123588562,
|
|
"num_tokens": 924193594.0,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 2.9511698880976605,
|
|
"grad_norm": 0.7386446595191956,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5344,
|
|
"mean_token_accuracy": 0.8359384536743164,
|
|
"num_tokens": 924523122.0,
|
|
"step": 2901
|
|
},
|
|
{
|
|
"epoch": 2.9521871820956256,
|
|
"grad_norm": 0.8074952363967896,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5181,
|
|
"mean_token_accuracy": 0.8404185771942139,
|
|
"num_tokens": 924829073.0,
|
|
"step": 2902
|
|
},
|
|
{
|
|
"epoch": 2.953204476093591,
|
|
"grad_norm": 0.9806423187255859,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5483,
|
|
"mean_token_accuracy": 0.8329747915267944,
|
|
"num_tokens": 925176143.0,
|
|
"step": 2903
|
|
},
|
|
{
|
|
"epoch": 2.9542217700915563,
|
|
"grad_norm": 0.7590146064758301,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.521,
|
|
"mean_token_accuracy": 0.8392025232315063,
|
|
"num_tokens": 925477479.0,
|
|
"step": 2904
|
|
},
|
|
{
|
|
"epoch": 2.955239064089522,
|
|
"grad_norm": 0.8951772451400757,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5298,
|
|
"mean_token_accuracy": 0.8377875089645386,
|
|
"num_tokens": 925794642.0,
|
|
"step": 2905
|
|
},
|
|
{
|
|
"epoch": 2.9562563580874874,
|
|
"grad_norm": 0.786734938621521,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5397,
|
|
"mean_token_accuracy": 0.8343855738639832,
|
|
"num_tokens": 926085112.0,
|
|
"step": 2906
|
|
},
|
|
{
|
|
"epoch": 2.9572736520854526,
|
|
"grad_norm": 0.8489226698875427,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5685,
|
|
"mean_token_accuracy": 0.8272536396980286,
|
|
"num_tokens": 926404523.0,
|
|
"step": 2907
|
|
},
|
|
{
|
|
"epoch": 2.958290946083418,
|
|
"grad_norm": 0.852146327495575,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5202,
|
|
"mean_token_accuracy": 0.8402287364006042,
|
|
"num_tokens": 926703380.0,
|
|
"step": 2908
|
|
},
|
|
{
|
|
"epoch": 2.9593082400813833,
|
|
"grad_norm": 0.8744028806686401,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.549,
|
|
"mean_token_accuracy": 0.8320158123970032,
|
|
"num_tokens": 927009636.0,
|
|
"step": 2909
|
|
},
|
|
{
|
|
"epoch": 2.960325534079349,
|
|
"grad_norm": 0.7487736940383911,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.562,
|
|
"mean_token_accuracy": 0.8285006880760193,
|
|
"num_tokens": 927349363.0,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"epoch": 2.9613428280773144,
|
|
"grad_norm": 0.8733537793159485,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5329,
|
|
"mean_token_accuracy": 0.8354209661483765,
|
|
"num_tokens": 927661962.0,
|
|
"step": 2911
|
|
},
|
|
{
|
|
"epoch": 2.96236012207528,
|
|
"grad_norm": 0.7431246638298035,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5526,
|
|
"mean_token_accuracy": 0.8311399221420288,
|
|
"num_tokens": 927984140.0,
|
|
"step": 2912
|
|
},
|
|
{
|
|
"epoch": 2.963377416073245,
|
|
"grad_norm": 0.7541692852973938,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.537,
|
|
"mean_token_accuracy": 0.8358690738677979,
|
|
"num_tokens": 928321531.0,
|
|
"step": 2913
|
|
},
|
|
{
|
|
"epoch": 2.9643947100712107,
|
|
"grad_norm": 0.7754326462745667,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5232,
|
|
"mean_token_accuracy": 0.8390681743621826,
|
|
"num_tokens": 928624216.0,
|
|
"step": 2914
|
|
},
|
|
{
|
|
"epoch": 2.965412004069176,
|
|
"grad_norm": 0.7342763543128967,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5492,
|
|
"mean_token_accuracy": 0.8322646617889404,
|
|
"num_tokens": 928950416.0,
|
|
"step": 2915
|
|
},
|
|
{
|
|
"epoch": 2.9664292980671414,
|
|
"grad_norm": 0.7890582084655762,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5525,
|
|
"mean_token_accuracy": 0.8303789496421814,
|
|
"num_tokens": 929254662.0,
|
|
"step": 2916
|
|
},
|
|
{
|
|
"epoch": 2.967446592065107,
|
|
"grad_norm": 0.782354474067688,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5259,
|
|
"mean_token_accuracy": 0.8377487659454346,
|
|
"num_tokens": 929559750.0,
|
|
"step": 2917
|
|
},
|
|
{
|
|
"epoch": 2.968463886063072,
|
|
"grad_norm": 0.8146892786026001,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5211,
|
|
"mean_token_accuracy": 0.8391036987304688,
|
|
"num_tokens": 929861530.0,
|
|
"step": 2918
|
|
},
|
|
{
|
|
"epoch": 2.9694811800610377,
|
|
"grad_norm": 0.7631412148475647,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5488,
|
|
"mean_token_accuracy": 0.8333353400230408,
|
|
"num_tokens": 930177144.0,
|
|
"step": 2919
|
|
},
|
|
{
|
|
"epoch": 2.970498474059003,
|
|
"grad_norm": 0.7770146727561951,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5629,
|
|
"mean_token_accuracy": 0.8283876180648804,
|
|
"num_tokens": 930500065.0,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"epoch": 2.9715157680569684,
|
|
"grad_norm": 0.9702063202857971,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5565,
|
|
"mean_token_accuracy": 0.8294034004211426,
|
|
"num_tokens": 930809980.0,
|
|
"step": 2921
|
|
},
|
|
{
|
|
"epoch": 2.972533062054934,
|
|
"grad_norm": 0.7356535196304321,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5335,
|
|
"mean_token_accuracy": 0.8369752168655396,
|
|
"num_tokens": 931124035.0,
|
|
"step": 2922
|
|
},
|
|
{
|
|
"epoch": 2.9735503560528995,
|
|
"grad_norm": 0.7785635590553284,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5372,
|
|
"mean_token_accuracy": 0.837040901184082,
|
|
"num_tokens": 931415682.0,
|
|
"step": 2923
|
|
},
|
|
{
|
|
"epoch": 2.9745676500508647,
|
|
"grad_norm": 0.7457777857780457,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.565,
|
|
"mean_token_accuracy": 0.8285586833953857,
|
|
"num_tokens": 931735254.0,
|
|
"step": 2924
|
|
},
|
|
{
|
|
"epoch": 2.9755849440488302,
|
|
"grad_norm": 0.7707124948501587,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5681,
|
|
"mean_token_accuracy": 0.8271319270133972,
|
|
"num_tokens": 932044046.0,
|
|
"step": 2925
|
|
},
|
|
{
|
|
"epoch": 2.9766022380467954,
|
|
"grad_norm": 0.7712051272392273,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5339,
|
|
"mean_token_accuracy": 0.8353266716003418,
|
|
"num_tokens": 932350730.0,
|
|
"step": 2926
|
|
},
|
|
{
|
|
"epoch": 2.977619532044761,
|
|
"grad_norm": 0.7692690491676331,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5455,
|
|
"mean_token_accuracy": 0.833324134349823,
|
|
"num_tokens": 932673626.0,
|
|
"step": 2927
|
|
},
|
|
{
|
|
"epoch": 2.9786368260427265,
|
|
"grad_norm": 0.7063543200492859,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5318,
|
|
"mean_token_accuracy": 0.8369261026382446,
|
|
"num_tokens": 933021539.0,
|
|
"step": 2928
|
|
},
|
|
{
|
|
"epoch": 2.9796541200406916,
|
|
"grad_norm": 0.740020215511322,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5829,
|
|
"mean_token_accuracy": 0.8245356678962708,
|
|
"num_tokens": 933342649.0,
|
|
"step": 2929
|
|
},
|
|
{
|
|
"epoch": 2.980671414038657,
|
|
"grad_norm": 0.7315794825553894,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5306,
|
|
"mean_token_accuracy": 0.8372519016265869,
|
|
"num_tokens": 933683564.0,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"epoch": 2.9816887080366223,
|
|
"grad_norm": 0.7954710125923157,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5584,
|
|
"mean_token_accuracy": 0.8298385143280029,
|
|
"num_tokens": 933996328.0,
|
|
"step": 2931
|
|
},
|
|
{
|
|
"epoch": 2.982706002034588,
|
|
"grad_norm": 0.7554163932800293,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5305,
|
|
"mean_token_accuracy": 0.8375445604324341,
|
|
"num_tokens": 934323985.0,
|
|
"step": 2932
|
|
},
|
|
{
|
|
"epoch": 2.9837232960325535,
|
|
"grad_norm": 0.7268316745758057,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5388,
|
|
"mean_token_accuracy": 0.8338518142700195,
|
|
"num_tokens": 934651946.0,
|
|
"step": 2933
|
|
},
|
|
{
|
|
"epoch": 2.984740590030519,
|
|
"grad_norm": 0.7595980763435364,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5335,
|
|
"mean_token_accuracy": 0.836643397808075,
|
|
"num_tokens": 934983408.0,
|
|
"step": 2934
|
|
},
|
|
{
|
|
"epoch": 2.985757884028484,
|
|
"grad_norm": 0.7359001636505127,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5362,
|
|
"mean_token_accuracy": 0.8356486558914185,
|
|
"num_tokens": 935295080.0,
|
|
"step": 2935
|
|
},
|
|
{
|
|
"epoch": 2.9867751780264498,
|
|
"grad_norm": 0.7544114589691162,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5444,
|
|
"mean_token_accuracy": 0.8339033126831055,
|
|
"num_tokens": 935616845.0,
|
|
"step": 2936
|
|
},
|
|
{
|
|
"epoch": 2.987792472024415,
|
|
"grad_norm": 0.9193035364151001,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5393,
|
|
"mean_token_accuracy": 0.8357737064361572,
|
|
"num_tokens": 935925552.0,
|
|
"step": 2937
|
|
},
|
|
{
|
|
"epoch": 2.9888097660223805,
|
|
"grad_norm": 0.7781708836555481,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5378,
|
|
"mean_token_accuracy": 0.8341381549835205,
|
|
"num_tokens": 936253904.0,
|
|
"step": 2938
|
|
},
|
|
{
|
|
"epoch": 2.989827060020346,
|
|
"grad_norm": 0.7724592685699463,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5254,
|
|
"mean_token_accuracy": 0.8389288187026978,
|
|
"num_tokens": 936577899.0,
|
|
"step": 2939
|
|
},
|
|
{
|
|
"epoch": 2.990844354018311,
|
|
"grad_norm": 0.8233632445335388,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5471,
|
|
"mean_token_accuracy": 0.832964301109314,
|
|
"num_tokens": 936910362.0,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"epoch": 2.9918616480162767,
|
|
"grad_norm": 0.7473217248916626,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5412,
|
|
"mean_token_accuracy": 0.8334630727767944,
|
|
"num_tokens": 937231947.0,
|
|
"step": 2941
|
|
},
|
|
{
|
|
"epoch": 2.992878942014242,
|
|
"grad_norm": 0.8038507699966431,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5692,
|
|
"mean_token_accuracy": 0.8273025751113892,
|
|
"num_tokens": 937547528.0,
|
|
"step": 2942
|
|
},
|
|
{
|
|
"epoch": 2.9938962360122074,
|
|
"grad_norm": 0.7563943862915039,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5299,
|
|
"mean_token_accuracy": 0.8375269174575806,
|
|
"num_tokens": 937866031.0,
|
|
"step": 2943
|
|
},
|
|
{
|
|
"epoch": 2.994913530010173,
|
|
"grad_norm": 1.0789949893951416,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5323,
|
|
"mean_token_accuracy": 0.8369662165641785,
|
|
"num_tokens": 938184792.0,
|
|
"step": 2944
|
|
},
|
|
{
|
|
"epoch": 2.9959308240081386,
|
|
"grad_norm": 0.7308768630027771,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5461,
|
|
"mean_token_accuracy": 0.8324853181838989,
|
|
"num_tokens": 938504623.0,
|
|
"step": 2945
|
|
},
|
|
{
|
|
"epoch": 2.9969481180061037,
|
|
"grad_norm": 0.8132859468460083,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5439,
|
|
"mean_token_accuracy": 0.8332552313804626,
|
|
"num_tokens": 938827631.0,
|
|
"step": 2946
|
|
},
|
|
{
|
|
"epoch": 2.9979654120040693,
|
|
"grad_norm": 0.7725070118904114,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5346,
|
|
"mean_token_accuracy": 0.8354282379150391,
|
|
"num_tokens": 939140656.0,
|
|
"step": 2947
|
|
},
|
|
{
|
|
"epoch": 2.9989827060020344,
|
|
"grad_norm": 0.7818151712417603,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5342,
|
|
"mean_token_accuracy": 0.8355613350868225,
|
|
"num_tokens": 939437597.0,
|
|
"step": 2948
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"grad_norm": 0.7417168617248535,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.5317,
|
|
"mean_token_accuracy": 0.8380080461502075,
|
|
"num_tokens": 939774271.0,
|
|
"step": 2949
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"step": 2949,
|
|
"total_flos": 6.428081412041081e+18,
|
|
"train_loss": 0.6103494446655175,
|
|
"train_runtime": 7328.7514,
|
|
"train_samples_per_second": 51.484,
|
|
"train_steps_per_second": 0.402
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 2949,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 1475,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 6.428081412041081e+18,
|
|
"train_batch_size": 32,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|