Files
OLMoE-1B-7B-0125-sft-math7k…/trainer_state.json
ModelHub XC ef597de8e9 初始化项目,由ModelHub XC社区提供模型
Model: xd2010/OLMoE-1B-7B-0125-sft-math7k-2epochs-frozen-router
Source: Original Platform
2026-06-21 01:52:24 +08:00

1340 lines
36 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 162,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.018518518518518517,
"grad_norm": 30.383102972799886,
"learning_rate": 0.0,
"loss": 1.4845,
"mean_token_accuracy": 0.7240093946456909,
"step": 1
},
{
"epoch": 0.037037037037037035,
"grad_norm": 32.11227476783311,
"learning_rate": 5.882352941176471e-07,
"loss": 1.5437,
"mean_token_accuracy": 0.7141970992088318,
"step": 2
},
{
"epoch": 0.05555555555555555,
"grad_norm": 29.906397140941003,
"learning_rate": 1.1764705882352942e-06,
"loss": 1.4825,
"mean_token_accuracy": 0.7237485647201538,
"step": 3
},
{
"epoch": 0.07407407407407407,
"grad_norm": 29.985942701879527,
"learning_rate": 1.7647058823529414e-06,
"loss": 1.4989,
"mean_token_accuracy": 0.7203832268714905,
"step": 4
},
{
"epoch": 0.09259259259259259,
"grad_norm": 29.73166397266726,
"learning_rate": 2.3529411764705885e-06,
"loss": 1.4574,
"mean_token_accuracy": 0.7253755927085876,
"step": 5
},
{
"epoch": 0.1111111111111111,
"grad_norm": 25.92213291325712,
"learning_rate": 2.9411764705882355e-06,
"loss": 1.3813,
"mean_token_accuracy": 0.7307026386260986,
"step": 6
},
{
"epoch": 0.12962962962962962,
"grad_norm": 26.069462391715213,
"learning_rate": 3.529411764705883e-06,
"loss": 1.2098,
"mean_token_accuracy": 0.7584702968597412,
"step": 7
},
{
"epoch": 0.14814814814814814,
"grad_norm": 25.19883956752606,
"learning_rate": 4.11764705882353e-06,
"loss": 1.1305,
"mean_token_accuracy": 0.7625263929367065,
"step": 8
},
{
"epoch": 0.16666666666666666,
"grad_norm": 20.418608439087986,
"learning_rate": 4.705882352941177e-06,
"loss": 0.7787,
"mean_token_accuracy": 0.8090993762016296,
"step": 9
},
{
"epoch": 0.18518518518518517,
"grad_norm": 18.730795450552804,
"learning_rate": 5.294117647058824e-06,
"loss": 0.7188,
"mean_token_accuracy": 0.8143796324729919,
"step": 10
},
{
"epoch": 0.2037037037037037,
"grad_norm": 14.138272617769461,
"learning_rate": 5.882352941176471e-06,
"loss": 0.6116,
"mean_token_accuracy": 0.8397451639175415,
"step": 11
},
{
"epoch": 0.2222222222222222,
"grad_norm": 7.596618029685936,
"learning_rate": 6.470588235294119e-06,
"loss": 0.4657,
"mean_token_accuracy": 0.8724555969238281,
"step": 12
},
{
"epoch": 0.24074074074074073,
"grad_norm": 4.119101781430916,
"learning_rate": 7.058823529411766e-06,
"loss": 0.4309,
"mean_token_accuracy": 0.8790558576583862,
"step": 13
},
{
"epoch": 0.25925925925925924,
"grad_norm": 2.4176925019707562,
"learning_rate": 7.647058823529411e-06,
"loss": 0.4135,
"mean_token_accuracy": 0.8794527053833008,
"step": 14
},
{
"epoch": 0.2777777777777778,
"grad_norm": 2.2647199411793886,
"learning_rate": 8.23529411764706e-06,
"loss": 0.3725,
"mean_token_accuracy": 0.891783595085144,
"step": 15
},
{
"epoch": 0.2962962962962963,
"grad_norm": 1.8524926429893849,
"learning_rate": 8.823529411764707e-06,
"loss": 0.3794,
"mean_token_accuracy": 0.8888377547264099,
"step": 16
},
{
"epoch": 0.3148148148148148,
"grad_norm": 1.640430683307726,
"learning_rate": 9.411764705882354e-06,
"loss": 0.3639,
"mean_token_accuracy": 0.8921489715576172,
"step": 17
},
{
"epoch": 0.3333333333333333,
"grad_norm": 1.570505827505393,
"learning_rate": 1e-05,
"loss": 0.3442,
"mean_token_accuracy": 0.8932390213012695,
"step": 18
},
{
"epoch": 0.35185185185185186,
"grad_norm": 1.4135328136828533,
"learning_rate": 9.998943841083179e-06,
"loss": 0.3512,
"mean_token_accuracy": 0.8942129015922546,
"step": 19
},
{
"epoch": 0.37037037037037035,
"grad_norm": 1.5324258913209416,
"learning_rate": 9.995775860097897e-06,
"loss": 0.3526,
"mean_token_accuracy": 0.8924254179000854,
"step": 20
},
{
"epoch": 0.3888888888888889,
"grad_norm": 1.7243977398106463,
"learning_rate": 9.990497544106981e-06,
"loss": 0.3547,
"mean_token_accuracy": 0.8945664763450623,
"step": 21
},
{
"epoch": 0.4074074074074074,
"grad_norm": 1.386050241196015,
"learning_rate": 9.983111370772877e-06,
"loss": 0.3623,
"mean_token_accuracy": 0.8888705372810364,
"step": 22
},
{
"epoch": 0.42592592592592593,
"grad_norm": 1.467216758089058,
"learning_rate": 9.97362080719462e-06,
"loss": 0.3372,
"mean_token_accuracy": 0.8956804871559143,
"step": 23
},
{
"epoch": 0.4444444444444444,
"grad_norm": 1.3785716474601148,
"learning_rate": 9.962030308280363e-06,
"loss": 0.3382,
"mean_token_accuracy": 0.8987942934036255,
"step": 24
},
{
"epoch": 0.46296296296296297,
"grad_norm": 1.3748945772831314,
"learning_rate": 9.948345314656234e-06,
"loss": 0.3382,
"mean_token_accuracy": 0.8963785767555237,
"step": 25
},
{
"epoch": 0.48148148148148145,
"grad_norm": 1.4196685316745923,
"learning_rate": 9.932572250112469e-06,
"loss": 0.3812,
"mean_token_accuracy": 0.8882339596748352,
"step": 26
},
{
"epoch": 0.5,
"grad_norm": 1.4483169776281999,
"learning_rate": 9.914718518588076e-06,
"loss": 0.3512,
"mean_token_accuracy": 0.8926590085029602,
"step": 27
},
{
"epoch": 0.5185185185185185,
"grad_norm": 1.5402138969150352,
"learning_rate": 9.89479250069539e-06,
"loss": 0.3235,
"mean_token_accuracy": 0.9005230069160461,
"step": 28
},
{
"epoch": 0.5370370370370371,
"grad_norm": 1.4292067204728813,
"learning_rate": 9.872803549786177e-06,
"loss": 0.3527,
"mean_token_accuracy": 0.8928272128105164,
"step": 29
},
{
"epoch": 0.5555555555555556,
"grad_norm": 1.4483671102685722,
"learning_rate": 9.848761987561132e-06,
"loss": 0.3124,
"mean_token_accuracy": 0.9046220779418945,
"step": 30
},
{
"epoch": 0.5740740740740741,
"grad_norm": 1.4763218698788387,
"learning_rate": 9.822679099224844e-06,
"loss": 0.3328,
"mean_token_accuracy": 0.8980678915977478,
"step": 31
},
{
"epoch": 0.5925925925925926,
"grad_norm": 1.5799068974635961,
"learning_rate": 9.794567128188466e-06,
"loss": 0.3375,
"mean_token_accuracy": 0.8986757397651672,
"step": 32
},
{
"epoch": 0.6111111111111112,
"grad_norm": 1.577093664869925,
"learning_rate": 9.764439270322612e-06,
"loss": 0.3744,
"mean_token_accuracy": 0.8908596634864807,
"step": 33
},
{
"epoch": 0.6296296296296297,
"grad_norm": 1.5779044571124354,
"learning_rate": 9.732309667763158e-06,
"loss": 0.3804,
"mean_token_accuracy": 0.8886460065841675,
"step": 34
},
{
"epoch": 0.6481481481481481,
"grad_norm": 1.4791494159202596,
"learning_rate": 9.69819340227288e-06,
"loss": 0.3432,
"mean_token_accuracy": 0.8950363397598267,
"step": 35
},
{
"epoch": 0.6666666666666666,
"grad_norm": 1.3517478774476666,
"learning_rate": 9.662106488162001e-06,
"loss": 0.352,
"mean_token_accuracy": 0.8938645124435425,
"step": 36
},
{
"epoch": 0.6851851851851852,
"grad_norm": 1.4032216792961782,
"learning_rate": 9.624065864771017e-06,
"loss": 0.3632,
"mean_token_accuracy": 0.8883957266807556,
"step": 37
},
{
"epoch": 0.7037037037037037,
"grad_norm": 1.499738193983017,
"learning_rate": 9.584089388519307e-06,
"loss": 0.3665,
"mean_token_accuracy": 0.88990318775177,
"step": 38
},
{
"epoch": 0.7222222222222222,
"grad_norm": 1.5239037095270276,
"learning_rate": 9.542195824523251e-06,
"loss": 0.3284,
"mean_token_accuracy": 0.8995599150657654,
"step": 39
},
{
"epoch": 0.7407407407407407,
"grad_norm": 1.5481560151543097,
"learning_rate": 9.498404837787811e-06,
"loss": 0.3434,
"mean_token_accuracy": 0.8942570686340332,
"step": 40
},
{
"epoch": 0.7592592592592593,
"grad_norm": 1.559221040243686,
"learning_rate": 9.452736983975708e-06,
"loss": 0.3428,
"mean_token_accuracy": 0.8983538746833801,
"step": 41
},
{
"epoch": 0.7777777777777778,
"grad_norm": 1.4373798428955245,
"learning_rate": 9.405213699758507e-06,
"loss": 0.3615,
"mean_token_accuracy": 0.8924130797386169,
"step": 42
},
{
"epoch": 0.7962962962962963,
"grad_norm": 1.4535700047967577,
"learning_rate": 9.355857292754152e-06,
"loss": 0.3339,
"mean_token_accuracy": 0.8993980884552002,
"step": 43
},
{
"epoch": 0.8148148148148148,
"grad_norm": 1.327423910804063,
"learning_rate": 9.304690931055694e-06,
"loss": 0.3564,
"mean_token_accuracy": 0.8938746452331543,
"step": 44
},
{
"epoch": 0.8333333333333334,
"grad_norm": 1.3951481767469502,
"learning_rate": 9.251738632356086e-06,
"loss": 0.3578,
"mean_token_accuracy": 0.8920174241065979,
"step": 45
},
{
"epoch": 0.8518518518518519,
"grad_norm": 1.315246360794402,
"learning_rate": 9.197025252674192e-06,
"loss": 0.3655,
"mean_token_accuracy": 0.8932892680168152,
"step": 46
},
{
"epoch": 0.8703703703703703,
"grad_norm": 1.4834801736119503,
"learning_rate": 9.140576474687263e-06,
"loss": 0.343,
"mean_token_accuracy": 0.8958399891853333,
"step": 47
},
{
"epoch": 0.8888888888888888,
"grad_norm": 1.3053313853126278,
"learning_rate": 9.082418795675397e-06,
"loss": 0.3382,
"mean_token_accuracy": 0.896998941898346,
"step": 48
},
{
"epoch": 0.9074074074074074,
"grad_norm": 1.5372874711637974,
"learning_rate": 9.022579515083601e-06,
"loss": 0.3519,
"mean_token_accuracy": 0.8948127627372742,
"step": 49
},
{
"epoch": 0.9259259259259259,
"grad_norm": 1.491827161316295,
"learning_rate": 8.961086721707331e-06,
"loss": 0.3207,
"mean_token_accuracy": 0.903107762336731,
"step": 50
},
{
"epoch": 0.9444444444444444,
"grad_norm": 1.5717125107419234,
"learning_rate": 8.897969280507494e-06,
"loss": 0.3464,
"mean_token_accuracy": 0.8984756469726562,
"step": 51
},
{
"epoch": 0.9629629629629629,
"grad_norm": 1.4048945964359827,
"learning_rate": 8.833256819061126e-06,
"loss": 0.3496,
"mean_token_accuracy": 0.8940179347991943,
"step": 52
},
{
"epoch": 0.9814814814814815,
"grad_norm": 1.4541816999218877,
"learning_rate": 8.76697971365409e-06,
"loss": 0.3157,
"mean_token_accuracy": 0.9050168991088867,
"step": 53
},
{
"epoch": 1.0,
"grad_norm": 1.359087932133861,
"learning_rate": 8.69916907502232e-06,
"loss": 0.3193,
"mean_token_accuracy": 0.903141975402832,
"step": 54
},
{
"epoch": 1.0185185185185186,
"grad_norm": 1.139831522295069,
"learning_rate": 8.629856733748325e-06,
"loss": 0.2614,
"mean_token_accuracy": 0.9204674959182739,
"step": 55
},
{
"epoch": 1.037037037037037,
"grad_norm": 1.2109733029051535,
"learning_rate": 8.559075225319786e-06,
"loss": 0.2431,
"mean_token_accuracy": 0.9270948767662048,
"step": 56
},
{
"epoch": 1.0555555555555556,
"grad_norm": 1.1953987880672259,
"learning_rate": 8.48685777485727e-06,
"loss": 0.2605,
"mean_token_accuracy": 0.9191746115684509,
"step": 57
},
{
"epoch": 1.074074074074074,
"grad_norm": 1.31823870121472,
"learning_rate": 8.413238281518225e-06,
"loss": 0.2569,
"mean_token_accuracy": 0.9213942289352417,
"step": 58
},
{
"epoch": 1.0925925925925926,
"grad_norm": 1.3434083702460653,
"learning_rate": 8.33825130258458e-06,
"loss": 0.255,
"mean_token_accuracy": 0.9220726490020752,
"step": 59
},
{
"epoch": 1.1111111111111112,
"grad_norm": 1.3311689689140414,
"learning_rate": 8.261932037241418e-06,
"loss": 0.2398,
"mean_token_accuracy": 0.9263064861297607,
"step": 60
},
{
"epoch": 1.1296296296296295,
"grad_norm": 1.371914132190983,
"learning_rate": 8.184316310054355e-06,
"loss": 0.2421,
"mean_token_accuracy": 0.925538182258606,
"step": 61
},
{
"epoch": 1.1481481481481481,
"grad_norm": 1.2357585408478602,
"learning_rate": 8.10544055415332e-06,
"loss": 0.2689,
"mean_token_accuracy": 0.9196819067001343,
"step": 62
},
{
"epoch": 1.1666666666666667,
"grad_norm": 1.1634817498053858,
"learning_rate": 8.025341794130722e-06,
"loss": 0.2579,
"mean_token_accuracy": 0.9201351404190063,
"step": 63
},
{
"epoch": 1.1851851851851851,
"grad_norm": 1.2614222517839995,
"learning_rate": 7.944057628661948e-06,
"loss": 0.2516,
"mean_token_accuracy": 0.923024594783783,
"step": 64
},
{
"epoch": 1.2037037037037037,
"grad_norm": 1.243413098569248,
"learning_rate": 7.861626212856404e-06,
"loss": 0.2558,
"mean_token_accuracy": 0.9209133982658386,
"step": 65
},
{
"epoch": 1.2222222222222223,
"grad_norm": 1.213962019110298,
"learning_rate": 7.778086240347343e-06,
"loss": 0.2488,
"mean_token_accuracy": 0.9236682057380676,
"step": 66
},
{
"epoch": 1.2407407407407407,
"grad_norm": 1.1301355101794357,
"learning_rate": 7.693476925128937e-06,
"loss": 0.2676,
"mean_token_accuracy": 0.9169973731040955,
"step": 67
},
{
"epoch": 1.2592592592592593,
"grad_norm": 1.037571809068588,
"learning_rate": 7.607837983149057e-06,
"loss": 0.2399,
"mean_token_accuracy": 0.9276074767112732,
"step": 68
},
{
"epoch": 1.2777777777777777,
"grad_norm": 1.228055556807229,
"learning_rate": 7.521209613666457e-06,
"loss": 0.2253,
"mean_token_accuracy": 0.9291183352470398,
"step": 69
},
{
"epoch": 1.2962962962962963,
"grad_norm": 1.1604461514204596,
"learning_rate": 7.433632480381083e-06,
"loss": 0.2302,
"mean_token_accuracy": 0.9275596141815186,
"step": 70
},
{
"epoch": 1.3148148148148149,
"grad_norm": 1.1429904081958335,
"learning_rate": 7.345147692346373e-06,
"loss": 0.2468,
"mean_token_accuracy": 0.9256559014320374,
"step": 71
},
{
"epoch": 1.3333333333333333,
"grad_norm": 1.2670257480788256,
"learning_rate": 7.255796784672496e-06,
"loss": 0.2756,
"mean_token_accuracy": 0.9166375994682312,
"step": 72
},
{
"epoch": 1.3518518518518519,
"grad_norm": 1.2006495346193158,
"learning_rate": 7.165621699029615e-06,
"loss": 0.2675,
"mean_token_accuracy": 0.9179262518882751,
"step": 73
},
{
"epoch": 1.3703703703703702,
"grad_norm": 1.2199671885347414,
"learning_rate": 7.0746647639602994e-06,
"loss": 0.246,
"mean_token_accuracy": 0.9246431589126587,
"step": 74
},
{
"epoch": 1.3888888888888888,
"grad_norm": 1.2325244346693196,
"learning_rate": 6.982968675010332e-06,
"loss": 0.2604,
"mean_token_accuracy": 0.9215620756149292,
"step": 75
},
{
"epoch": 1.4074074074074074,
"grad_norm": 1.222471090199412,
"learning_rate": 6.890576474687264e-06,
"loss": 0.2555,
"mean_token_accuracy": 0.9202633500099182,
"step": 76
},
{
"epoch": 1.425925925925926,
"grad_norm": 1.1650774082286408,
"learning_rate": 6.797531532256079e-06,
"loss": 0.2535,
"mean_token_accuracy": 0.9203940629959106,
"step": 77
},
{
"epoch": 1.4444444444444444,
"grad_norm": 1.155157813327957,
"learning_rate": 6.703877523381495e-06,
"loss": 0.2514,
"mean_token_accuracy": 0.9239696860313416,
"step": 78
},
{
"epoch": 1.462962962962963,
"grad_norm": 1.2322059130014582,
"learning_rate": 6.609658409626431e-06,
"loss": 0.2522,
"mean_token_accuracy": 0.9223854541778564,
"step": 79
},
{
"epoch": 1.4814814814814814,
"grad_norm": 1.1431661429529452,
"learning_rate": 6.514918417816275e-06,
"loss": 0.2645,
"mean_token_accuracy": 0.9201213121414185,
"step": 80
},
{
"epoch": 1.5,
"grad_norm": 1.2154621857829624,
"learning_rate": 6.419702019278643e-06,
"loss": 0.2351,
"mean_token_accuracy": 0.9279325008392334,
"step": 81
},
{
"epoch": 1.5185185185185186,
"grad_norm": 1.2629658642141612,
"learning_rate": 6.324053908968353e-06,
"loss": 0.2499,
"mean_token_accuracy": 0.9237509369850159,
"step": 82
},
{
"epoch": 1.5370370370370372,
"grad_norm": 1.1434498930723798,
"learning_rate": 6.228018984487443e-06,
"loss": 0.2424,
"mean_token_accuracy": 0.9255508780479431,
"step": 83
},
{
"epoch": 1.5555555555555556,
"grad_norm": 1.183948134135304,
"learning_rate": 6.13164232501005e-06,
"loss": 0.2662,
"mean_token_accuracy": 0.9195141196250916,
"step": 84
},
{
"epoch": 1.574074074074074,
"grad_norm": 1.0816053650678727,
"learning_rate": 6.034969170122079e-06,
"loss": 0.2251,
"mean_token_accuracy": 0.9291943311691284,
"step": 85
},
{
"epoch": 1.5925925925925926,
"grad_norm": 1.3090327941892361,
"learning_rate": 5.938044898585555e-06,
"loss": 0.2845,
"mean_token_accuracy": 0.9130949378013611,
"step": 86
},
{
"epoch": 1.6111111111111112,
"grad_norm": 1.090271437723717,
"learning_rate": 5.840915007037648e-06,
"loss": 0.2471,
"mean_token_accuracy": 0.9219435453414917,
"step": 87
},
{
"epoch": 1.6296296296296298,
"grad_norm": 1.3405339469885855,
"learning_rate": 5.74362508863438e-06,
"loss": 0.2726,
"mean_token_accuracy": 0.9205261468887329,
"step": 88
},
{
"epoch": 1.6481481481481481,
"grad_norm": 1.1088012675507433,
"learning_rate": 5.646220811649013e-06,
"loss": 0.2599,
"mean_token_accuracy": 0.9209275245666504,
"step": 89
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.1041623807960375,
"learning_rate": 5.5487478980351805e-06,
"loss": 0.2766,
"mean_token_accuracy": 0.9163511395454407,
"step": 90
},
{
"epoch": 1.6851851851851851,
"grad_norm": 1.0975600683056852,
"learning_rate": 5.451252101964821e-06,
"loss": 0.2619,
"mean_token_accuracy": 0.9195134043693542,
"step": 91
},
{
"epoch": 1.7037037037037037,
"grad_norm": 1.0559577091421926,
"learning_rate": 5.353779188350989e-06,
"loss": 0.2542,
"mean_token_accuracy": 0.9217535853385925,
"step": 92
},
{
"epoch": 1.7222222222222223,
"grad_norm": 1.0902279304137317,
"learning_rate": 5.256374911365621e-06,
"loss": 0.2442,
"mean_token_accuracy": 0.9247879385948181,
"step": 93
},
{
"epoch": 1.7407407407407407,
"grad_norm": 1.1900024135485159,
"learning_rate": 5.159084992962354e-06,
"loss": 0.2413,
"mean_token_accuracy": 0.9264701008796692,
"step": 94
},
{
"epoch": 1.7592592592592593,
"grad_norm": 1.114375986129405,
"learning_rate": 5.061955101414448e-06,
"loss": 0.2603,
"mean_token_accuracy": 0.9205346703529358,
"step": 95
},
{
"epoch": 1.7777777777777777,
"grad_norm": 1.1499780047037227,
"learning_rate": 4.9650308298779215e-06,
"loss": 0.2477,
"mean_token_accuracy": 0.9239223599433899,
"step": 96
},
{
"epoch": 1.7962962962962963,
"grad_norm": 1.1492427257637925,
"learning_rate": 4.8683576749899505e-06,
"loss": 0.2783,
"mean_token_accuracy": 0.9156987071037292,
"step": 97
},
{
"epoch": 1.8148148148148149,
"grad_norm": 1.2037535748153476,
"learning_rate": 4.771981015512559e-06,
"loss": 0.2419,
"mean_token_accuracy": 0.9248070120811462,
"step": 98
},
{
"epoch": 1.8333333333333335,
"grad_norm": 1.1672243929637462,
"learning_rate": 4.675946091031648e-06,
"loss": 0.2634,
"mean_token_accuracy": 0.9204791188240051,
"step": 99
},
{
"epoch": 1.8518518518518519,
"grad_norm": 1.2030086649458018,
"learning_rate": 4.5802979807213585e-06,
"loss": 0.2691,
"mean_token_accuracy": 0.9181145429611206,
"step": 100
},
{
"epoch": 1.8703703703703702,
"grad_norm": 1.2410994948220553,
"learning_rate": 4.4850815821837265e-06,
"loss": 0.2637,
"mean_token_accuracy": 0.9215072393417358,
"step": 101
},
{
"epoch": 1.8888888888888888,
"grad_norm": 1.1285340553042031,
"learning_rate": 4.3903415903735725e-06,
"loss": 0.265,
"mean_token_accuracy": 0.9205712080001831,
"step": 102
},
{
"epoch": 1.9074074074074074,
"grad_norm": 1.15167294242117,
"learning_rate": 4.296122476618507e-06,
"loss": 0.2491,
"mean_token_accuracy": 0.9238101840019226,
"step": 103
},
{
"epoch": 1.925925925925926,
"grad_norm": 1.128579734185751,
"learning_rate": 4.202468467743922e-06,
"loss": 0.2613,
"mean_token_accuracy": 0.9208459854125977,
"step": 104
},
{
"epoch": 1.9444444444444444,
"grad_norm": 1.094268425976433,
"learning_rate": 4.109423525312738e-06,
"loss": 0.2479,
"mean_token_accuracy": 0.9242894649505615,
"step": 105
},
{
"epoch": 1.9629629629629628,
"grad_norm": 1.1260095311018505,
"learning_rate": 4.017031324989669e-06,
"loss": 0.245,
"mean_token_accuracy": 0.923931360244751,
"step": 106
},
{
"epoch": 1.9814814814814814,
"grad_norm": 1.0715893334011972,
"learning_rate": 3.925335236039702e-06,
"loss": 0.2628,
"mean_token_accuracy": 0.920842707157135,
"step": 107
},
{
"epoch": 2.0,
"grad_norm": 0.9541245934737955,
"learning_rate": 3.834378300970385e-06,
"loss": 0.2317,
"mean_token_accuracy": 0.9292216300964355,
"step": 108
},
{
"epoch": 2.0185185185185186,
"grad_norm": 0.9950325332822214,
"learning_rate": 3.7442032153275053e-06,
"loss": 0.1862,
"mean_token_accuracy": 0.944040834903717,
"step": 109
},
{
"epoch": 2.037037037037037,
"grad_norm": 0.9031680263761843,
"learning_rate": 3.654852307653628e-06,
"loss": 0.1729,
"mean_token_accuracy": 0.9489833116531372,
"step": 110
},
{
"epoch": 2.0555555555555554,
"grad_norm": 0.8700426144680551,
"learning_rate": 3.5663675196189184e-06,
"loss": 0.1723,
"mean_token_accuracy": 0.9485137462615967,
"step": 111
},
{
"epoch": 2.074074074074074,
"grad_norm": 0.920806616685076,
"learning_rate": 3.478790386333546e-06,
"loss": 0.2035,
"mean_token_accuracy": 0.9403110146522522,
"step": 112
},
{
"epoch": 2.0925925925925926,
"grad_norm": 0.9156796841824484,
"learning_rate": 3.392162016850945e-06,
"loss": 0.1787,
"mean_token_accuracy": 0.9458126425743103,
"step": 113
},
{
"epoch": 2.111111111111111,
"grad_norm": 0.8777527626143813,
"learning_rate": 3.3065230748710646e-06,
"loss": 0.1764,
"mean_token_accuracy": 0.9460602402687073,
"step": 114
},
{
"epoch": 2.1296296296296298,
"grad_norm": 0.8387955266803102,
"learning_rate": 3.221913759652657e-06,
"loss": 0.163,
"mean_token_accuracy": 0.9506521224975586,
"step": 115
},
{
"epoch": 2.148148148148148,
"grad_norm": 0.9758818845077261,
"learning_rate": 3.138373787143598e-06,
"loss": 0.1818,
"mean_token_accuracy": 0.9452192187309265,
"step": 116
},
{
"epoch": 2.1666666666666665,
"grad_norm": 0.9533178926074783,
"learning_rate": 3.055942371338052e-06,
"loss": 0.1696,
"mean_token_accuracy": 0.9481253623962402,
"step": 117
},
{
"epoch": 2.185185185185185,
"grad_norm": 1.0699519394655053,
"learning_rate": 2.9746582058692803e-06,
"loss": 0.1969,
"mean_token_accuracy": 0.9410567879676819,
"step": 118
},
{
"epoch": 2.2037037037037037,
"grad_norm": 0.9486439388249865,
"learning_rate": 2.894559445846682e-06,
"loss": 0.1734,
"mean_token_accuracy": 0.9469440579414368,
"step": 119
},
{
"epoch": 2.2222222222222223,
"grad_norm": 1.013131329250228,
"learning_rate": 2.8156836899456475e-06,
"loss": 0.1756,
"mean_token_accuracy": 0.9473387598991394,
"step": 120
},
{
"epoch": 2.240740740740741,
"grad_norm": 0.9433247487590843,
"learning_rate": 2.7380679627585817e-06,
"loss": 0.1625,
"mean_token_accuracy": 0.9486984014511108,
"step": 121
},
{
"epoch": 2.259259259259259,
"grad_norm": 0.9981757376384809,
"learning_rate": 2.661748697415423e-06,
"loss": 0.1752,
"mean_token_accuracy": 0.9464225769042969,
"step": 122
},
{
"epoch": 2.2777777777777777,
"grad_norm": 0.9857526409152109,
"learning_rate": 2.586761718481776e-06,
"loss": 0.1809,
"mean_token_accuracy": 0.9457290768623352,
"step": 123
},
{
"epoch": 2.2962962962962963,
"grad_norm": 0.9462725952295606,
"learning_rate": 2.5131422251427313e-06,
"loss": 0.1687,
"mean_token_accuracy": 0.9483760595321655,
"step": 124
},
{
"epoch": 2.314814814814815,
"grad_norm": 1.010191097162094,
"learning_rate": 2.440924774680215e-06,
"loss": 0.1831,
"mean_token_accuracy": 0.9440175294876099,
"step": 125
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.9374805059731371,
"learning_rate": 2.3701432662516772e-06,
"loss": 0.1885,
"mean_token_accuracy": 0.9436575174331665,
"step": 126
},
{
"epoch": 2.351851851851852,
"grad_norm": 0.9269388287237318,
"learning_rate": 2.300830924977683e-06,
"loss": 0.1768,
"mean_token_accuracy": 0.946081280708313,
"step": 127
},
{
"epoch": 2.3703703703703702,
"grad_norm": 0.8355613001726996,
"learning_rate": 2.2330202863459123e-06,
"loss": 0.1938,
"mean_token_accuracy": 0.9410346150398254,
"step": 128
},
{
"epoch": 2.388888888888889,
"grad_norm": 0.8762995235599421,
"learning_rate": 2.166743180938875e-06,
"loss": 0.1839,
"mean_token_accuracy": 0.9433231949806213,
"step": 129
},
{
"epoch": 2.4074074074074074,
"grad_norm": 0.8151541961388346,
"learning_rate": 2.102030719492508e-06,
"loss": 0.1746,
"mean_token_accuracy": 0.9466114044189453,
"step": 130
},
{
"epoch": 2.425925925925926,
"grad_norm": 0.8453160237631886,
"learning_rate": 2.03891327829267e-06,
"loss": 0.174,
"mean_token_accuracy": 0.9469768404960632,
"step": 131
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.8395339617580972,
"learning_rate": 1.9774204849164004e-06,
"loss": 0.1866,
"mean_token_accuracy": 0.9433194398880005,
"step": 132
},
{
"epoch": 2.462962962962963,
"grad_norm": 0.8873824785732524,
"learning_rate": 1.9175812043246034e-06,
"loss": 0.1939,
"mean_token_accuracy": 0.9412445425987244,
"step": 133
},
{
"epoch": 2.4814814814814814,
"grad_norm": 0.8522508069696332,
"learning_rate": 1.8594235253127373e-06,
"loss": 0.183,
"mean_token_accuracy": 0.9447925686836243,
"step": 134
},
{
"epoch": 2.5,
"grad_norm": 0.9289476468580379,
"learning_rate": 1.8029747473258092e-06,
"loss": 0.1769,
"mean_token_accuracy": 0.9454068541526794,
"step": 135
},
{
"epoch": 2.5185185185185186,
"grad_norm": 0.9311186985274743,
"learning_rate": 1.7482613676439153e-06,
"loss": 0.1809,
"mean_token_accuracy": 0.945341169834137,
"step": 136
},
{
"epoch": 2.537037037037037,
"grad_norm": 0.8214964900059085,
"learning_rate": 1.6953090689443074e-06,
"loss": 0.1679,
"mean_token_accuracy": 0.9475165605545044,
"step": 137
},
{
"epoch": 2.5555555555555554,
"grad_norm": 0.8110726136011421,
"learning_rate": 1.6441427072458493e-06,
"loss": 0.1725,
"mean_token_accuracy": 0.9460154175758362,
"step": 138
},
{
"epoch": 2.574074074074074,
"grad_norm": 0.8629417011035696,
"learning_rate": 1.5947863002414938e-06,
"loss": 0.1773,
"mean_token_accuracy": 0.9460445046424866,
"step": 139
},
{
"epoch": 2.5925925925925926,
"grad_norm": 0.9131344482433362,
"learning_rate": 1.5472630160242921e-06,
"loss": 0.1888,
"mean_token_accuracy": 0.9422698020935059,
"step": 140
},
{
"epoch": 2.611111111111111,
"grad_norm": 0.8995979478525096,
"learning_rate": 1.5015951622121896e-06,
"loss": 0.1812,
"mean_token_accuracy": 0.9445046186447144,
"step": 141
},
{
"epoch": 2.6296296296296298,
"grad_norm": 0.8973291525652752,
"learning_rate": 1.457804175476751e-06,
"loss": 0.1718,
"mean_token_accuracy": 0.9484192728996277,
"step": 142
},
{
"epoch": 2.648148148148148,
"grad_norm": 0.8806301946411274,
"learning_rate": 1.4159106114806943e-06,
"loss": 0.1763,
"mean_token_accuracy": 0.9455322623252869,
"step": 143
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.9426279555089078,
"learning_rate": 1.3759341352289832e-06,
"loss": 0.1819,
"mean_token_accuracy": 0.943683922290802,
"step": 144
},
{
"epoch": 2.685185185185185,
"grad_norm": 0.8337535019025356,
"learning_rate": 1.3378935118380004e-06,
"loss": 0.1739,
"mean_token_accuracy": 0.9465071558952332,
"step": 145
},
{
"epoch": 2.7037037037037037,
"grad_norm": 0.8809269775768832,
"learning_rate": 1.3018065977271215e-06,
"loss": 0.1831,
"mean_token_accuracy": 0.9447413086891174,
"step": 146
},
{
"epoch": 2.7222222222222223,
"grad_norm": 0.8959429008425933,
"learning_rate": 1.2676903322368423e-06,
"loss": 0.1815,
"mean_token_accuracy": 0.9448564648628235,
"step": 147
},
{
"epoch": 2.7407407407407405,
"grad_norm": 0.892750069418456,
"learning_rate": 1.2355607296773896e-06,
"loss": 0.1798,
"mean_token_accuracy": 0.9448550939559937,
"step": 148
},
{
"epoch": 2.7592592592592595,
"grad_norm": 0.9530453934988615,
"learning_rate": 1.2054328718115336e-06,
"loss": 0.1893,
"mean_token_accuracy": 0.9425032138824463,
"step": 149
},
{
"epoch": 2.7777777777777777,
"grad_norm": 0.8446243004896555,
"learning_rate": 1.1773209007751562e-06,
"loss": 0.1777,
"mean_token_accuracy": 0.9456593990325928,
"step": 150
},
{
"epoch": 2.7962962962962963,
"grad_norm": 0.8770894361882803,
"learning_rate": 1.1512380124388695e-06,
"loss": 0.1739,
"mean_token_accuracy": 0.9469501376152039,
"step": 151
},
{
"epoch": 2.814814814814815,
"grad_norm": 0.9163706165945724,
"learning_rate": 1.127196450213825e-06,
"loss": 0.1644,
"mean_token_accuracy": 0.948975682258606,
"step": 152
},
{
"epoch": 2.8333333333333335,
"grad_norm": 0.8953025604312923,
"learning_rate": 1.1052074993046102e-06,
"loss": 0.1845,
"mean_token_accuracy": 0.9444332718849182,
"step": 153
},
{
"epoch": 2.851851851851852,
"grad_norm": 0.8938609728248295,
"learning_rate": 1.0852814814119238e-06,
"loss": 0.1759,
"mean_token_accuracy": 0.9459174275398254,
"step": 154
},
{
"epoch": 2.8703703703703702,
"grad_norm": 1.0176759246319027,
"learning_rate": 1.0674277498875325e-06,
"loss": 0.1727,
"mean_token_accuracy": 0.9466116428375244,
"step": 155
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.9239006043222746,
"learning_rate": 1.0516546853437686e-06,
"loss": 0.1803,
"mean_token_accuracy": 0.943221926689148,
"step": 156
},
{
"epoch": 2.9074074074074074,
"grad_norm": 0.7911892818762664,
"learning_rate": 1.0379696917196378e-06,
"loss": 0.1643,
"mean_token_accuracy": 0.9492570161819458,
"step": 157
},
{
"epoch": 2.925925925925926,
"grad_norm": 0.9656881157079873,
"learning_rate": 1.026379192805382e-06,
"loss": 0.1765,
"mean_token_accuracy": 0.9460762143135071,
"step": 158
},
{
"epoch": 2.9444444444444446,
"grad_norm": 1.0195811706557167,
"learning_rate": 1.0168886292271246e-06,
"loss": 0.1765,
"mean_token_accuracy": 0.9457534551620483,
"step": 159
},
{
"epoch": 2.962962962962963,
"grad_norm": 0.8488680044411275,
"learning_rate": 1.0095024558930204e-06,
"loss": 0.1753,
"mean_token_accuracy": 0.9460701942443848,
"step": 160
},
{
"epoch": 2.9814814814814814,
"grad_norm": 0.8614929763896857,
"learning_rate": 1.004224139902105e-06,
"loss": 0.1809,
"mean_token_accuracy": 0.9434822201728821,
"step": 161
},
{
"epoch": 3.0,
"grad_norm": 0.7985383850024447,
"learning_rate": 1.0010561589168217e-06,
"loss": 0.1582,
"mean_token_accuracy": 0.9517039656639099,
"step": 162
},
{
"epoch": 3.0,
"step": 162,
"total_flos": 6812915466240.0,
"train_loss": 0.3203352055983779,
"train_runtime": 2102.8589,
"train_samples_per_second": 9.774,
"train_steps_per_second": 0.077
}
],
"logging_steps": 1,
"max_steps": 162,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6812915466240.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}