Model: xd2010/OLMoE-1B-7B-0125-sft-math7k-2epochs-frozen-router Source: Original Platform
1340 lines
36 KiB
JSON
1340 lines
36 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 3.0,
|
|
"eval_steps": 500,
|
|
"global_step": 162,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.018518518518518517,
|
|
"grad_norm": 30.383102972799886,
|
|
"learning_rate": 0.0,
|
|
"loss": 1.4845,
|
|
"mean_token_accuracy": 0.7240093946456909,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.037037037037037035,
|
|
"grad_norm": 32.11227476783311,
|
|
"learning_rate": 5.882352941176471e-07,
|
|
"loss": 1.5437,
|
|
"mean_token_accuracy": 0.7141970992088318,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.05555555555555555,
|
|
"grad_norm": 29.906397140941003,
|
|
"learning_rate": 1.1764705882352942e-06,
|
|
"loss": 1.4825,
|
|
"mean_token_accuracy": 0.7237485647201538,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.07407407407407407,
|
|
"grad_norm": 29.985942701879527,
|
|
"learning_rate": 1.7647058823529414e-06,
|
|
"loss": 1.4989,
|
|
"mean_token_accuracy": 0.7203832268714905,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.09259259259259259,
|
|
"grad_norm": 29.73166397266726,
|
|
"learning_rate": 2.3529411764705885e-06,
|
|
"loss": 1.4574,
|
|
"mean_token_accuracy": 0.7253755927085876,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.1111111111111111,
|
|
"grad_norm": 25.92213291325712,
|
|
"learning_rate": 2.9411764705882355e-06,
|
|
"loss": 1.3813,
|
|
"mean_token_accuracy": 0.7307026386260986,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.12962962962962962,
|
|
"grad_norm": 26.069462391715213,
|
|
"learning_rate": 3.529411764705883e-06,
|
|
"loss": 1.2098,
|
|
"mean_token_accuracy": 0.7584702968597412,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.14814814814814814,
|
|
"grad_norm": 25.19883956752606,
|
|
"learning_rate": 4.11764705882353e-06,
|
|
"loss": 1.1305,
|
|
"mean_token_accuracy": 0.7625263929367065,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.16666666666666666,
|
|
"grad_norm": 20.418608439087986,
|
|
"learning_rate": 4.705882352941177e-06,
|
|
"loss": 0.7787,
|
|
"mean_token_accuracy": 0.8090993762016296,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.18518518518518517,
|
|
"grad_norm": 18.730795450552804,
|
|
"learning_rate": 5.294117647058824e-06,
|
|
"loss": 0.7188,
|
|
"mean_token_accuracy": 0.8143796324729919,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.2037037037037037,
|
|
"grad_norm": 14.138272617769461,
|
|
"learning_rate": 5.882352941176471e-06,
|
|
"loss": 0.6116,
|
|
"mean_token_accuracy": 0.8397451639175415,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.2222222222222222,
|
|
"grad_norm": 7.596618029685936,
|
|
"learning_rate": 6.470588235294119e-06,
|
|
"loss": 0.4657,
|
|
"mean_token_accuracy": 0.8724555969238281,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.24074074074074073,
|
|
"grad_norm": 4.119101781430916,
|
|
"learning_rate": 7.058823529411766e-06,
|
|
"loss": 0.4309,
|
|
"mean_token_accuracy": 0.8790558576583862,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.25925925925925924,
|
|
"grad_norm": 2.4176925019707562,
|
|
"learning_rate": 7.647058823529411e-06,
|
|
"loss": 0.4135,
|
|
"mean_token_accuracy": 0.8794527053833008,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.2777777777777778,
|
|
"grad_norm": 2.2647199411793886,
|
|
"learning_rate": 8.23529411764706e-06,
|
|
"loss": 0.3725,
|
|
"mean_token_accuracy": 0.891783595085144,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.2962962962962963,
|
|
"grad_norm": 1.8524926429893849,
|
|
"learning_rate": 8.823529411764707e-06,
|
|
"loss": 0.3794,
|
|
"mean_token_accuracy": 0.8888377547264099,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.3148148148148148,
|
|
"grad_norm": 1.640430683307726,
|
|
"learning_rate": 9.411764705882354e-06,
|
|
"loss": 0.3639,
|
|
"mean_token_accuracy": 0.8921489715576172,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.3333333333333333,
|
|
"grad_norm": 1.570505827505393,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.3442,
|
|
"mean_token_accuracy": 0.8932390213012695,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.35185185185185186,
|
|
"grad_norm": 1.4135328136828533,
|
|
"learning_rate": 9.998943841083179e-06,
|
|
"loss": 0.3512,
|
|
"mean_token_accuracy": 0.8942129015922546,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.37037037037037035,
|
|
"grad_norm": 1.5324258913209416,
|
|
"learning_rate": 9.995775860097897e-06,
|
|
"loss": 0.3526,
|
|
"mean_token_accuracy": 0.8924254179000854,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.3888888888888889,
|
|
"grad_norm": 1.7243977398106463,
|
|
"learning_rate": 9.990497544106981e-06,
|
|
"loss": 0.3547,
|
|
"mean_token_accuracy": 0.8945664763450623,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.4074074074074074,
|
|
"grad_norm": 1.386050241196015,
|
|
"learning_rate": 9.983111370772877e-06,
|
|
"loss": 0.3623,
|
|
"mean_token_accuracy": 0.8888705372810364,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.42592592592592593,
|
|
"grad_norm": 1.467216758089058,
|
|
"learning_rate": 9.97362080719462e-06,
|
|
"loss": 0.3372,
|
|
"mean_token_accuracy": 0.8956804871559143,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.4444444444444444,
|
|
"grad_norm": 1.3785716474601148,
|
|
"learning_rate": 9.962030308280363e-06,
|
|
"loss": 0.3382,
|
|
"mean_token_accuracy": 0.8987942934036255,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.46296296296296297,
|
|
"grad_norm": 1.3748945772831314,
|
|
"learning_rate": 9.948345314656234e-06,
|
|
"loss": 0.3382,
|
|
"mean_token_accuracy": 0.8963785767555237,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.48148148148148145,
|
|
"grad_norm": 1.4196685316745923,
|
|
"learning_rate": 9.932572250112469e-06,
|
|
"loss": 0.3812,
|
|
"mean_token_accuracy": 0.8882339596748352,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.5,
|
|
"grad_norm": 1.4483169776281999,
|
|
"learning_rate": 9.914718518588076e-06,
|
|
"loss": 0.3512,
|
|
"mean_token_accuracy": 0.8926590085029602,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.5185185185185185,
|
|
"grad_norm": 1.5402138969150352,
|
|
"learning_rate": 9.89479250069539e-06,
|
|
"loss": 0.3235,
|
|
"mean_token_accuracy": 0.9005230069160461,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.5370370370370371,
|
|
"grad_norm": 1.4292067204728813,
|
|
"learning_rate": 9.872803549786177e-06,
|
|
"loss": 0.3527,
|
|
"mean_token_accuracy": 0.8928272128105164,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.5555555555555556,
|
|
"grad_norm": 1.4483671102685722,
|
|
"learning_rate": 9.848761987561132e-06,
|
|
"loss": 0.3124,
|
|
"mean_token_accuracy": 0.9046220779418945,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.5740740740740741,
|
|
"grad_norm": 1.4763218698788387,
|
|
"learning_rate": 9.822679099224844e-06,
|
|
"loss": 0.3328,
|
|
"mean_token_accuracy": 0.8980678915977478,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.5925925925925926,
|
|
"grad_norm": 1.5799068974635961,
|
|
"learning_rate": 9.794567128188466e-06,
|
|
"loss": 0.3375,
|
|
"mean_token_accuracy": 0.8986757397651672,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.6111111111111112,
|
|
"grad_norm": 1.577093664869925,
|
|
"learning_rate": 9.764439270322612e-06,
|
|
"loss": 0.3744,
|
|
"mean_token_accuracy": 0.8908596634864807,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.6296296296296297,
|
|
"grad_norm": 1.5779044571124354,
|
|
"learning_rate": 9.732309667763158e-06,
|
|
"loss": 0.3804,
|
|
"mean_token_accuracy": 0.8886460065841675,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.6481481481481481,
|
|
"grad_norm": 1.4791494159202596,
|
|
"learning_rate": 9.69819340227288e-06,
|
|
"loss": 0.3432,
|
|
"mean_token_accuracy": 0.8950363397598267,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.6666666666666666,
|
|
"grad_norm": 1.3517478774476666,
|
|
"learning_rate": 9.662106488162001e-06,
|
|
"loss": 0.352,
|
|
"mean_token_accuracy": 0.8938645124435425,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.6851851851851852,
|
|
"grad_norm": 1.4032216792961782,
|
|
"learning_rate": 9.624065864771017e-06,
|
|
"loss": 0.3632,
|
|
"mean_token_accuracy": 0.8883957266807556,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.7037037037037037,
|
|
"grad_norm": 1.499738193983017,
|
|
"learning_rate": 9.584089388519307e-06,
|
|
"loss": 0.3665,
|
|
"mean_token_accuracy": 0.88990318775177,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.7222222222222222,
|
|
"grad_norm": 1.5239037095270276,
|
|
"learning_rate": 9.542195824523251e-06,
|
|
"loss": 0.3284,
|
|
"mean_token_accuracy": 0.8995599150657654,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.7407407407407407,
|
|
"grad_norm": 1.5481560151543097,
|
|
"learning_rate": 9.498404837787811e-06,
|
|
"loss": 0.3434,
|
|
"mean_token_accuracy": 0.8942570686340332,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.7592592592592593,
|
|
"grad_norm": 1.559221040243686,
|
|
"learning_rate": 9.452736983975708e-06,
|
|
"loss": 0.3428,
|
|
"mean_token_accuracy": 0.8983538746833801,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.7777777777777778,
|
|
"grad_norm": 1.4373798428955245,
|
|
"learning_rate": 9.405213699758507e-06,
|
|
"loss": 0.3615,
|
|
"mean_token_accuracy": 0.8924130797386169,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.7962962962962963,
|
|
"grad_norm": 1.4535700047967577,
|
|
"learning_rate": 9.355857292754152e-06,
|
|
"loss": 0.3339,
|
|
"mean_token_accuracy": 0.8993980884552002,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.8148148148148148,
|
|
"grad_norm": 1.327423910804063,
|
|
"learning_rate": 9.304690931055694e-06,
|
|
"loss": 0.3564,
|
|
"mean_token_accuracy": 0.8938746452331543,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.8333333333333334,
|
|
"grad_norm": 1.3951481767469502,
|
|
"learning_rate": 9.251738632356086e-06,
|
|
"loss": 0.3578,
|
|
"mean_token_accuracy": 0.8920174241065979,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.8518518518518519,
|
|
"grad_norm": 1.315246360794402,
|
|
"learning_rate": 9.197025252674192e-06,
|
|
"loss": 0.3655,
|
|
"mean_token_accuracy": 0.8932892680168152,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.8703703703703703,
|
|
"grad_norm": 1.4834801736119503,
|
|
"learning_rate": 9.140576474687263e-06,
|
|
"loss": 0.343,
|
|
"mean_token_accuracy": 0.8958399891853333,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.8888888888888888,
|
|
"grad_norm": 1.3053313853126278,
|
|
"learning_rate": 9.082418795675397e-06,
|
|
"loss": 0.3382,
|
|
"mean_token_accuracy": 0.896998941898346,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.9074074074074074,
|
|
"grad_norm": 1.5372874711637974,
|
|
"learning_rate": 9.022579515083601e-06,
|
|
"loss": 0.3519,
|
|
"mean_token_accuracy": 0.8948127627372742,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.9259259259259259,
|
|
"grad_norm": 1.491827161316295,
|
|
"learning_rate": 8.961086721707331e-06,
|
|
"loss": 0.3207,
|
|
"mean_token_accuracy": 0.903107762336731,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.9444444444444444,
|
|
"grad_norm": 1.5717125107419234,
|
|
"learning_rate": 8.897969280507494e-06,
|
|
"loss": 0.3464,
|
|
"mean_token_accuracy": 0.8984756469726562,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.9629629629629629,
|
|
"grad_norm": 1.4048945964359827,
|
|
"learning_rate": 8.833256819061126e-06,
|
|
"loss": 0.3496,
|
|
"mean_token_accuracy": 0.8940179347991943,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.9814814814814815,
|
|
"grad_norm": 1.4541816999218877,
|
|
"learning_rate": 8.76697971365409e-06,
|
|
"loss": 0.3157,
|
|
"mean_token_accuracy": 0.9050168991088867,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 1.359087932133861,
|
|
"learning_rate": 8.69916907502232e-06,
|
|
"loss": 0.3193,
|
|
"mean_token_accuracy": 0.903141975402832,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 1.0185185185185186,
|
|
"grad_norm": 1.139831522295069,
|
|
"learning_rate": 8.629856733748325e-06,
|
|
"loss": 0.2614,
|
|
"mean_token_accuracy": 0.9204674959182739,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 1.037037037037037,
|
|
"grad_norm": 1.2109733029051535,
|
|
"learning_rate": 8.559075225319786e-06,
|
|
"loss": 0.2431,
|
|
"mean_token_accuracy": 0.9270948767662048,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 1.0555555555555556,
|
|
"grad_norm": 1.1953987880672259,
|
|
"learning_rate": 8.48685777485727e-06,
|
|
"loss": 0.2605,
|
|
"mean_token_accuracy": 0.9191746115684509,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 1.074074074074074,
|
|
"grad_norm": 1.31823870121472,
|
|
"learning_rate": 8.413238281518225e-06,
|
|
"loss": 0.2569,
|
|
"mean_token_accuracy": 0.9213942289352417,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 1.0925925925925926,
|
|
"grad_norm": 1.3434083702460653,
|
|
"learning_rate": 8.33825130258458e-06,
|
|
"loss": 0.255,
|
|
"mean_token_accuracy": 0.9220726490020752,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 1.1111111111111112,
|
|
"grad_norm": 1.3311689689140414,
|
|
"learning_rate": 8.261932037241418e-06,
|
|
"loss": 0.2398,
|
|
"mean_token_accuracy": 0.9263064861297607,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 1.1296296296296295,
|
|
"grad_norm": 1.371914132190983,
|
|
"learning_rate": 8.184316310054355e-06,
|
|
"loss": 0.2421,
|
|
"mean_token_accuracy": 0.925538182258606,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 1.1481481481481481,
|
|
"grad_norm": 1.2357585408478602,
|
|
"learning_rate": 8.10544055415332e-06,
|
|
"loss": 0.2689,
|
|
"mean_token_accuracy": 0.9196819067001343,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 1.1666666666666667,
|
|
"grad_norm": 1.1634817498053858,
|
|
"learning_rate": 8.025341794130722e-06,
|
|
"loss": 0.2579,
|
|
"mean_token_accuracy": 0.9201351404190063,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 1.1851851851851851,
|
|
"grad_norm": 1.2614222517839995,
|
|
"learning_rate": 7.944057628661948e-06,
|
|
"loss": 0.2516,
|
|
"mean_token_accuracy": 0.923024594783783,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 1.2037037037037037,
|
|
"grad_norm": 1.243413098569248,
|
|
"learning_rate": 7.861626212856404e-06,
|
|
"loss": 0.2558,
|
|
"mean_token_accuracy": 0.9209133982658386,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 1.2222222222222223,
|
|
"grad_norm": 1.213962019110298,
|
|
"learning_rate": 7.778086240347343e-06,
|
|
"loss": 0.2488,
|
|
"mean_token_accuracy": 0.9236682057380676,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 1.2407407407407407,
|
|
"grad_norm": 1.1301355101794357,
|
|
"learning_rate": 7.693476925128937e-06,
|
|
"loss": 0.2676,
|
|
"mean_token_accuracy": 0.9169973731040955,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 1.2592592592592593,
|
|
"grad_norm": 1.037571809068588,
|
|
"learning_rate": 7.607837983149057e-06,
|
|
"loss": 0.2399,
|
|
"mean_token_accuracy": 0.9276074767112732,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 1.2777777777777777,
|
|
"grad_norm": 1.228055556807229,
|
|
"learning_rate": 7.521209613666457e-06,
|
|
"loss": 0.2253,
|
|
"mean_token_accuracy": 0.9291183352470398,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 1.2962962962962963,
|
|
"grad_norm": 1.1604461514204596,
|
|
"learning_rate": 7.433632480381083e-06,
|
|
"loss": 0.2302,
|
|
"mean_token_accuracy": 0.9275596141815186,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 1.3148148148148149,
|
|
"grad_norm": 1.1429904081958335,
|
|
"learning_rate": 7.345147692346373e-06,
|
|
"loss": 0.2468,
|
|
"mean_token_accuracy": 0.9256559014320374,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 1.3333333333333333,
|
|
"grad_norm": 1.2670257480788256,
|
|
"learning_rate": 7.255796784672496e-06,
|
|
"loss": 0.2756,
|
|
"mean_token_accuracy": 0.9166375994682312,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 1.3518518518518519,
|
|
"grad_norm": 1.2006495346193158,
|
|
"learning_rate": 7.165621699029615e-06,
|
|
"loss": 0.2675,
|
|
"mean_token_accuracy": 0.9179262518882751,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 1.3703703703703702,
|
|
"grad_norm": 1.2199671885347414,
|
|
"learning_rate": 7.0746647639602994e-06,
|
|
"loss": 0.246,
|
|
"mean_token_accuracy": 0.9246431589126587,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 1.3888888888888888,
|
|
"grad_norm": 1.2325244346693196,
|
|
"learning_rate": 6.982968675010332e-06,
|
|
"loss": 0.2604,
|
|
"mean_token_accuracy": 0.9215620756149292,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 1.4074074074074074,
|
|
"grad_norm": 1.222471090199412,
|
|
"learning_rate": 6.890576474687264e-06,
|
|
"loss": 0.2555,
|
|
"mean_token_accuracy": 0.9202633500099182,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 1.425925925925926,
|
|
"grad_norm": 1.1650774082286408,
|
|
"learning_rate": 6.797531532256079e-06,
|
|
"loss": 0.2535,
|
|
"mean_token_accuracy": 0.9203940629959106,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 1.4444444444444444,
|
|
"grad_norm": 1.155157813327957,
|
|
"learning_rate": 6.703877523381495e-06,
|
|
"loss": 0.2514,
|
|
"mean_token_accuracy": 0.9239696860313416,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 1.462962962962963,
|
|
"grad_norm": 1.2322059130014582,
|
|
"learning_rate": 6.609658409626431e-06,
|
|
"loss": 0.2522,
|
|
"mean_token_accuracy": 0.9223854541778564,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 1.4814814814814814,
|
|
"grad_norm": 1.1431661429529452,
|
|
"learning_rate": 6.514918417816275e-06,
|
|
"loss": 0.2645,
|
|
"mean_token_accuracy": 0.9201213121414185,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 1.5,
|
|
"grad_norm": 1.2154621857829624,
|
|
"learning_rate": 6.419702019278643e-06,
|
|
"loss": 0.2351,
|
|
"mean_token_accuracy": 0.9279325008392334,
|
|
"step": 81
|
|
},
|
|
{
|
|
"epoch": 1.5185185185185186,
|
|
"grad_norm": 1.2629658642141612,
|
|
"learning_rate": 6.324053908968353e-06,
|
|
"loss": 0.2499,
|
|
"mean_token_accuracy": 0.9237509369850159,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 1.5370370370370372,
|
|
"grad_norm": 1.1434498930723798,
|
|
"learning_rate": 6.228018984487443e-06,
|
|
"loss": 0.2424,
|
|
"mean_token_accuracy": 0.9255508780479431,
|
|
"step": 83
|
|
},
|
|
{
|
|
"epoch": 1.5555555555555556,
|
|
"grad_norm": 1.183948134135304,
|
|
"learning_rate": 6.13164232501005e-06,
|
|
"loss": 0.2662,
|
|
"mean_token_accuracy": 0.9195141196250916,
|
|
"step": 84
|
|
},
|
|
{
|
|
"epoch": 1.574074074074074,
|
|
"grad_norm": 1.0816053650678727,
|
|
"learning_rate": 6.034969170122079e-06,
|
|
"loss": 0.2251,
|
|
"mean_token_accuracy": 0.9291943311691284,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 1.5925925925925926,
|
|
"grad_norm": 1.3090327941892361,
|
|
"learning_rate": 5.938044898585555e-06,
|
|
"loss": 0.2845,
|
|
"mean_token_accuracy": 0.9130949378013611,
|
|
"step": 86
|
|
},
|
|
{
|
|
"epoch": 1.6111111111111112,
|
|
"grad_norm": 1.090271437723717,
|
|
"learning_rate": 5.840915007037648e-06,
|
|
"loss": 0.2471,
|
|
"mean_token_accuracy": 0.9219435453414917,
|
|
"step": 87
|
|
},
|
|
{
|
|
"epoch": 1.6296296296296298,
|
|
"grad_norm": 1.3405339469885855,
|
|
"learning_rate": 5.74362508863438e-06,
|
|
"loss": 0.2726,
|
|
"mean_token_accuracy": 0.9205261468887329,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 1.6481481481481481,
|
|
"grad_norm": 1.1088012675507433,
|
|
"learning_rate": 5.646220811649013e-06,
|
|
"loss": 0.2599,
|
|
"mean_token_accuracy": 0.9209275245666504,
|
|
"step": 89
|
|
},
|
|
{
|
|
"epoch": 1.6666666666666665,
|
|
"grad_norm": 1.1041623807960375,
|
|
"learning_rate": 5.5487478980351805e-06,
|
|
"loss": 0.2766,
|
|
"mean_token_accuracy": 0.9163511395454407,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 1.6851851851851851,
|
|
"grad_norm": 1.0975600683056852,
|
|
"learning_rate": 5.451252101964821e-06,
|
|
"loss": 0.2619,
|
|
"mean_token_accuracy": 0.9195134043693542,
|
|
"step": 91
|
|
},
|
|
{
|
|
"epoch": 1.7037037037037037,
|
|
"grad_norm": 1.0559577091421926,
|
|
"learning_rate": 5.353779188350989e-06,
|
|
"loss": 0.2542,
|
|
"mean_token_accuracy": 0.9217535853385925,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 1.7222222222222223,
|
|
"grad_norm": 1.0902279304137317,
|
|
"learning_rate": 5.256374911365621e-06,
|
|
"loss": 0.2442,
|
|
"mean_token_accuracy": 0.9247879385948181,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 1.7407407407407407,
|
|
"grad_norm": 1.1900024135485159,
|
|
"learning_rate": 5.159084992962354e-06,
|
|
"loss": 0.2413,
|
|
"mean_token_accuracy": 0.9264701008796692,
|
|
"step": 94
|
|
},
|
|
{
|
|
"epoch": 1.7592592592592593,
|
|
"grad_norm": 1.114375986129405,
|
|
"learning_rate": 5.061955101414448e-06,
|
|
"loss": 0.2603,
|
|
"mean_token_accuracy": 0.9205346703529358,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 1.7777777777777777,
|
|
"grad_norm": 1.1499780047037227,
|
|
"learning_rate": 4.9650308298779215e-06,
|
|
"loss": 0.2477,
|
|
"mean_token_accuracy": 0.9239223599433899,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 1.7962962962962963,
|
|
"grad_norm": 1.1492427257637925,
|
|
"learning_rate": 4.8683576749899505e-06,
|
|
"loss": 0.2783,
|
|
"mean_token_accuracy": 0.9156987071037292,
|
|
"step": 97
|
|
},
|
|
{
|
|
"epoch": 1.8148148148148149,
|
|
"grad_norm": 1.2037535748153476,
|
|
"learning_rate": 4.771981015512559e-06,
|
|
"loss": 0.2419,
|
|
"mean_token_accuracy": 0.9248070120811462,
|
|
"step": 98
|
|
},
|
|
{
|
|
"epoch": 1.8333333333333335,
|
|
"grad_norm": 1.1672243929637462,
|
|
"learning_rate": 4.675946091031648e-06,
|
|
"loss": 0.2634,
|
|
"mean_token_accuracy": 0.9204791188240051,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 1.8518518518518519,
|
|
"grad_norm": 1.2030086649458018,
|
|
"learning_rate": 4.5802979807213585e-06,
|
|
"loss": 0.2691,
|
|
"mean_token_accuracy": 0.9181145429611206,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 1.8703703703703702,
|
|
"grad_norm": 1.2410994948220553,
|
|
"learning_rate": 4.4850815821837265e-06,
|
|
"loss": 0.2637,
|
|
"mean_token_accuracy": 0.9215072393417358,
|
|
"step": 101
|
|
},
|
|
{
|
|
"epoch": 1.8888888888888888,
|
|
"grad_norm": 1.1285340553042031,
|
|
"learning_rate": 4.3903415903735725e-06,
|
|
"loss": 0.265,
|
|
"mean_token_accuracy": 0.9205712080001831,
|
|
"step": 102
|
|
},
|
|
{
|
|
"epoch": 1.9074074074074074,
|
|
"grad_norm": 1.15167294242117,
|
|
"learning_rate": 4.296122476618507e-06,
|
|
"loss": 0.2491,
|
|
"mean_token_accuracy": 0.9238101840019226,
|
|
"step": 103
|
|
},
|
|
{
|
|
"epoch": 1.925925925925926,
|
|
"grad_norm": 1.128579734185751,
|
|
"learning_rate": 4.202468467743922e-06,
|
|
"loss": 0.2613,
|
|
"mean_token_accuracy": 0.9208459854125977,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 1.9444444444444444,
|
|
"grad_norm": 1.094268425976433,
|
|
"learning_rate": 4.109423525312738e-06,
|
|
"loss": 0.2479,
|
|
"mean_token_accuracy": 0.9242894649505615,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 1.9629629629629628,
|
|
"grad_norm": 1.1260095311018505,
|
|
"learning_rate": 4.017031324989669e-06,
|
|
"loss": 0.245,
|
|
"mean_token_accuracy": 0.923931360244751,
|
|
"step": 106
|
|
},
|
|
{
|
|
"epoch": 1.9814814814814814,
|
|
"grad_norm": 1.0715893334011972,
|
|
"learning_rate": 3.925335236039702e-06,
|
|
"loss": 0.2628,
|
|
"mean_token_accuracy": 0.920842707157135,
|
|
"step": 107
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 0.9541245934737955,
|
|
"learning_rate": 3.834378300970385e-06,
|
|
"loss": 0.2317,
|
|
"mean_token_accuracy": 0.9292216300964355,
|
|
"step": 108
|
|
},
|
|
{
|
|
"epoch": 2.0185185185185186,
|
|
"grad_norm": 0.9950325332822214,
|
|
"learning_rate": 3.7442032153275053e-06,
|
|
"loss": 0.1862,
|
|
"mean_token_accuracy": 0.944040834903717,
|
|
"step": 109
|
|
},
|
|
{
|
|
"epoch": 2.037037037037037,
|
|
"grad_norm": 0.9031680263761843,
|
|
"learning_rate": 3.654852307653628e-06,
|
|
"loss": 0.1729,
|
|
"mean_token_accuracy": 0.9489833116531372,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 2.0555555555555554,
|
|
"grad_norm": 0.8700426144680551,
|
|
"learning_rate": 3.5663675196189184e-06,
|
|
"loss": 0.1723,
|
|
"mean_token_accuracy": 0.9485137462615967,
|
|
"step": 111
|
|
},
|
|
{
|
|
"epoch": 2.074074074074074,
|
|
"grad_norm": 0.920806616685076,
|
|
"learning_rate": 3.478790386333546e-06,
|
|
"loss": 0.2035,
|
|
"mean_token_accuracy": 0.9403110146522522,
|
|
"step": 112
|
|
},
|
|
{
|
|
"epoch": 2.0925925925925926,
|
|
"grad_norm": 0.9156796841824484,
|
|
"learning_rate": 3.392162016850945e-06,
|
|
"loss": 0.1787,
|
|
"mean_token_accuracy": 0.9458126425743103,
|
|
"step": 113
|
|
},
|
|
{
|
|
"epoch": 2.111111111111111,
|
|
"grad_norm": 0.8777527626143813,
|
|
"learning_rate": 3.3065230748710646e-06,
|
|
"loss": 0.1764,
|
|
"mean_token_accuracy": 0.9460602402687073,
|
|
"step": 114
|
|
},
|
|
{
|
|
"epoch": 2.1296296296296298,
|
|
"grad_norm": 0.8387955266803102,
|
|
"learning_rate": 3.221913759652657e-06,
|
|
"loss": 0.163,
|
|
"mean_token_accuracy": 0.9506521224975586,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 2.148148148148148,
|
|
"grad_norm": 0.9758818845077261,
|
|
"learning_rate": 3.138373787143598e-06,
|
|
"loss": 0.1818,
|
|
"mean_token_accuracy": 0.9452192187309265,
|
|
"step": 116
|
|
},
|
|
{
|
|
"epoch": 2.1666666666666665,
|
|
"grad_norm": 0.9533178926074783,
|
|
"learning_rate": 3.055942371338052e-06,
|
|
"loss": 0.1696,
|
|
"mean_token_accuracy": 0.9481253623962402,
|
|
"step": 117
|
|
},
|
|
{
|
|
"epoch": 2.185185185185185,
|
|
"grad_norm": 1.0699519394655053,
|
|
"learning_rate": 2.9746582058692803e-06,
|
|
"loss": 0.1969,
|
|
"mean_token_accuracy": 0.9410567879676819,
|
|
"step": 118
|
|
},
|
|
{
|
|
"epoch": 2.2037037037037037,
|
|
"grad_norm": 0.9486439388249865,
|
|
"learning_rate": 2.894559445846682e-06,
|
|
"loss": 0.1734,
|
|
"mean_token_accuracy": 0.9469440579414368,
|
|
"step": 119
|
|
},
|
|
{
|
|
"epoch": 2.2222222222222223,
|
|
"grad_norm": 1.013131329250228,
|
|
"learning_rate": 2.8156836899456475e-06,
|
|
"loss": 0.1756,
|
|
"mean_token_accuracy": 0.9473387598991394,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 2.240740740740741,
|
|
"grad_norm": 0.9433247487590843,
|
|
"learning_rate": 2.7380679627585817e-06,
|
|
"loss": 0.1625,
|
|
"mean_token_accuracy": 0.9486984014511108,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 2.259259259259259,
|
|
"grad_norm": 0.9981757376384809,
|
|
"learning_rate": 2.661748697415423e-06,
|
|
"loss": 0.1752,
|
|
"mean_token_accuracy": 0.9464225769042969,
|
|
"step": 122
|
|
},
|
|
{
|
|
"epoch": 2.2777777777777777,
|
|
"grad_norm": 0.9857526409152109,
|
|
"learning_rate": 2.586761718481776e-06,
|
|
"loss": 0.1809,
|
|
"mean_token_accuracy": 0.9457290768623352,
|
|
"step": 123
|
|
},
|
|
{
|
|
"epoch": 2.2962962962962963,
|
|
"grad_norm": 0.9462725952295606,
|
|
"learning_rate": 2.5131422251427313e-06,
|
|
"loss": 0.1687,
|
|
"mean_token_accuracy": 0.9483760595321655,
|
|
"step": 124
|
|
},
|
|
{
|
|
"epoch": 2.314814814814815,
|
|
"grad_norm": 1.010191097162094,
|
|
"learning_rate": 2.440924774680215e-06,
|
|
"loss": 0.1831,
|
|
"mean_token_accuracy": 0.9440175294876099,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 2.3333333333333335,
|
|
"grad_norm": 0.9374805059731371,
|
|
"learning_rate": 2.3701432662516772e-06,
|
|
"loss": 0.1885,
|
|
"mean_token_accuracy": 0.9436575174331665,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 2.351851851851852,
|
|
"grad_norm": 0.9269388287237318,
|
|
"learning_rate": 2.300830924977683e-06,
|
|
"loss": 0.1768,
|
|
"mean_token_accuracy": 0.946081280708313,
|
|
"step": 127
|
|
},
|
|
{
|
|
"epoch": 2.3703703703703702,
|
|
"grad_norm": 0.8355613001726996,
|
|
"learning_rate": 2.2330202863459123e-06,
|
|
"loss": 0.1938,
|
|
"mean_token_accuracy": 0.9410346150398254,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 2.388888888888889,
|
|
"grad_norm": 0.8762995235599421,
|
|
"learning_rate": 2.166743180938875e-06,
|
|
"loss": 0.1839,
|
|
"mean_token_accuracy": 0.9433231949806213,
|
|
"step": 129
|
|
},
|
|
{
|
|
"epoch": 2.4074074074074074,
|
|
"grad_norm": 0.8151541961388346,
|
|
"learning_rate": 2.102030719492508e-06,
|
|
"loss": 0.1746,
|
|
"mean_token_accuracy": 0.9466114044189453,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 2.425925925925926,
|
|
"grad_norm": 0.8453160237631886,
|
|
"learning_rate": 2.03891327829267e-06,
|
|
"loss": 0.174,
|
|
"mean_token_accuracy": 0.9469768404960632,
|
|
"step": 131
|
|
},
|
|
{
|
|
"epoch": 2.4444444444444446,
|
|
"grad_norm": 0.8395339617580972,
|
|
"learning_rate": 1.9774204849164004e-06,
|
|
"loss": 0.1866,
|
|
"mean_token_accuracy": 0.9433194398880005,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 2.462962962962963,
|
|
"grad_norm": 0.8873824785732524,
|
|
"learning_rate": 1.9175812043246034e-06,
|
|
"loss": 0.1939,
|
|
"mean_token_accuracy": 0.9412445425987244,
|
|
"step": 133
|
|
},
|
|
{
|
|
"epoch": 2.4814814814814814,
|
|
"grad_norm": 0.8522508069696332,
|
|
"learning_rate": 1.8594235253127373e-06,
|
|
"loss": 0.183,
|
|
"mean_token_accuracy": 0.9447925686836243,
|
|
"step": 134
|
|
},
|
|
{
|
|
"epoch": 2.5,
|
|
"grad_norm": 0.9289476468580379,
|
|
"learning_rate": 1.8029747473258092e-06,
|
|
"loss": 0.1769,
|
|
"mean_token_accuracy": 0.9454068541526794,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 2.5185185185185186,
|
|
"grad_norm": 0.9311186985274743,
|
|
"learning_rate": 1.7482613676439153e-06,
|
|
"loss": 0.1809,
|
|
"mean_token_accuracy": 0.945341169834137,
|
|
"step": 136
|
|
},
|
|
{
|
|
"epoch": 2.537037037037037,
|
|
"grad_norm": 0.8214964900059085,
|
|
"learning_rate": 1.6953090689443074e-06,
|
|
"loss": 0.1679,
|
|
"mean_token_accuracy": 0.9475165605545044,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 2.5555555555555554,
|
|
"grad_norm": 0.8110726136011421,
|
|
"learning_rate": 1.6441427072458493e-06,
|
|
"loss": 0.1725,
|
|
"mean_token_accuracy": 0.9460154175758362,
|
|
"step": 138
|
|
},
|
|
{
|
|
"epoch": 2.574074074074074,
|
|
"grad_norm": 0.8629417011035696,
|
|
"learning_rate": 1.5947863002414938e-06,
|
|
"loss": 0.1773,
|
|
"mean_token_accuracy": 0.9460445046424866,
|
|
"step": 139
|
|
},
|
|
{
|
|
"epoch": 2.5925925925925926,
|
|
"grad_norm": 0.9131344482433362,
|
|
"learning_rate": 1.5472630160242921e-06,
|
|
"loss": 0.1888,
|
|
"mean_token_accuracy": 0.9422698020935059,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 2.611111111111111,
|
|
"grad_norm": 0.8995979478525096,
|
|
"learning_rate": 1.5015951622121896e-06,
|
|
"loss": 0.1812,
|
|
"mean_token_accuracy": 0.9445046186447144,
|
|
"step": 141
|
|
},
|
|
{
|
|
"epoch": 2.6296296296296298,
|
|
"grad_norm": 0.8973291525652752,
|
|
"learning_rate": 1.457804175476751e-06,
|
|
"loss": 0.1718,
|
|
"mean_token_accuracy": 0.9484192728996277,
|
|
"step": 142
|
|
},
|
|
{
|
|
"epoch": 2.648148148148148,
|
|
"grad_norm": 0.8806301946411274,
|
|
"learning_rate": 1.4159106114806943e-06,
|
|
"loss": 0.1763,
|
|
"mean_token_accuracy": 0.9455322623252869,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 2.6666666666666665,
|
|
"grad_norm": 0.9426279555089078,
|
|
"learning_rate": 1.3759341352289832e-06,
|
|
"loss": 0.1819,
|
|
"mean_token_accuracy": 0.943683922290802,
|
|
"step": 144
|
|
},
|
|
{
|
|
"epoch": 2.685185185185185,
|
|
"grad_norm": 0.8337535019025356,
|
|
"learning_rate": 1.3378935118380004e-06,
|
|
"loss": 0.1739,
|
|
"mean_token_accuracy": 0.9465071558952332,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 2.7037037037037037,
|
|
"grad_norm": 0.8809269775768832,
|
|
"learning_rate": 1.3018065977271215e-06,
|
|
"loss": 0.1831,
|
|
"mean_token_accuracy": 0.9447413086891174,
|
|
"step": 146
|
|
},
|
|
{
|
|
"epoch": 2.7222222222222223,
|
|
"grad_norm": 0.8959429008425933,
|
|
"learning_rate": 1.2676903322368423e-06,
|
|
"loss": 0.1815,
|
|
"mean_token_accuracy": 0.9448564648628235,
|
|
"step": 147
|
|
},
|
|
{
|
|
"epoch": 2.7407407407407405,
|
|
"grad_norm": 0.892750069418456,
|
|
"learning_rate": 1.2355607296773896e-06,
|
|
"loss": 0.1798,
|
|
"mean_token_accuracy": 0.9448550939559937,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 2.7592592592592595,
|
|
"grad_norm": 0.9530453934988615,
|
|
"learning_rate": 1.2054328718115336e-06,
|
|
"loss": 0.1893,
|
|
"mean_token_accuracy": 0.9425032138824463,
|
|
"step": 149
|
|
},
|
|
{
|
|
"epoch": 2.7777777777777777,
|
|
"grad_norm": 0.8446243004896555,
|
|
"learning_rate": 1.1773209007751562e-06,
|
|
"loss": 0.1777,
|
|
"mean_token_accuracy": 0.9456593990325928,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 2.7962962962962963,
|
|
"grad_norm": 0.8770894361882803,
|
|
"learning_rate": 1.1512380124388695e-06,
|
|
"loss": 0.1739,
|
|
"mean_token_accuracy": 0.9469501376152039,
|
|
"step": 151
|
|
},
|
|
{
|
|
"epoch": 2.814814814814815,
|
|
"grad_norm": 0.9163706165945724,
|
|
"learning_rate": 1.127196450213825e-06,
|
|
"loss": 0.1644,
|
|
"mean_token_accuracy": 0.948975682258606,
|
|
"step": 152
|
|
},
|
|
{
|
|
"epoch": 2.8333333333333335,
|
|
"grad_norm": 0.8953025604312923,
|
|
"learning_rate": 1.1052074993046102e-06,
|
|
"loss": 0.1845,
|
|
"mean_token_accuracy": 0.9444332718849182,
|
|
"step": 153
|
|
},
|
|
{
|
|
"epoch": 2.851851851851852,
|
|
"grad_norm": 0.8938609728248295,
|
|
"learning_rate": 1.0852814814119238e-06,
|
|
"loss": 0.1759,
|
|
"mean_token_accuracy": 0.9459174275398254,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 2.8703703703703702,
|
|
"grad_norm": 1.0176759246319027,
|
|
"learning_rate": 1.0674277498875325e-06,
|
|
"loss": 0.1727,
|
|
"mean_token_accuracy": 0.9466116428375244,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 2.888888888888889,
|
|
"grad_norm": 0.9239006043222746,
|
|
"learning_rate": 1.0516546853437686e-06,
|
|
"loss": 0.1803,
|
|
"mean_token_accuracy": 0.943221926689148,
|
|
"step": 156
|
|
},
|
|
{
|
|
"epoch": 2.9074074074074074,
|
|
"grad_norm": 0.7911892818762664,
|
|
"learning_rate": 1.0379696917196378e-06,
|
|
"loss": 0.1643,
|
|
"mean_token_accuracy": 0.9492570161819458,
|
|
"step": 157
|
|
},
|
|
{
|
|
"epoch": 2.925925925925926,
|
|
"grad_norm": 0.9656881157079873,
|
|
"learning_rate": 1.026379192805382e-06,
|
|
"loss": 0.1765,
|
|
"mean_token_accuracy": 0.9460762143135071,
|
|
"step": 158
|
|
},
|
|
{
|
|
"epoch": 2.9444444444444446,
|
|
"grad_norm": 1.0195811706557167,
|
|
"learning_rate": 1.0168886292271246e-06,
|
|
"loss": 0.1765,
|
|
"mean_token_accuracy": 0.9457534551620483,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 2.962962962962963,
|
|
"grad_norm": 0.8488680044411275,
|
|
"learning_rate": 1.0095024558930204e-06,
|
|
"loss": 0.1753,
|
|
"mean_token_accuracy": 0.9460701942443848,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 2.9814814814814814,
|
|
"grad_norm": 0.8614929763896857,
|
|
"learning_rate": 1.004224139902105e-06,
|
|
"loss": 0.1809,
|
|
"mean_token_accuracy": 0.9434822201728821,
|
|
"step": 161
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"grad_norm": 0.7985383850024447,
|
|
"learning_rate": 1.0010561589168217e-06,
|
|
"loss": 0.1582,
|
|
"mean_token_accuracy": 0.9517039656639099,
|
|
"step": 162
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"step": 162,
|
|
"total_flos": 6812915466240.0,
|
|
"train_loss": 0.3203352055983779,
|
|
"train_runtime": 2102.8589,
|
|
"train_samples_per_second": 9.774,
|
|
"train_steps_per_second": 0.077
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 162,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 6812915466240.0,
|
|
"train_batch_size": 16,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|