{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 162, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018518518518518517, "grad_norm": 30.383102972799886, "learning_rate": 0.0, "loss": 1.4845, "mean_token_accuracy": 0.7240093946456909, "step": 1 }, { "epoch": 0.037037037037037035, "grad_norm": 32.11227476783311, "learning_rate": 5.882352941176471e-07, "loss": 1.5437, "mean_token_accuracy": 0.7141970992088318, "step": 2 }, { "epoch": 0.05555555555555555, "grad_norm": 29.906397140941003, "learning_rate": 1.1764705882352942e-06, "loss": 1.4825, "mean_token_accuracy": 0.7237485647201538, "step": 3 }, { "epoch": 0.07407407407407407, "grad_norm": 29.985942701879527, "learning_rate": 1.7647058823529414e-06, "loss": 1.4989, "mean_token_accuracy": 0.7203832268714905, "step": 4 }, { "epoch": 0.09259259259259259, "grad_norm": 29.73166397266726, "learning_rate": 2.3529411764705885e-06, "loss": 1.4574, "mean_token_accuracy": 0.7253755927085876, "step": 5 }, { "epoch": 0.1111111111111111, "grad_norm": 25.92213291325712, "learning_rate": 2.9411764705882355e-06, "loss": 1.3813, "mean_token_accuracy": 0.7307026386260986, "step": 6 }, { "epoch": 0.12962962962962962, "grad_norm": 26.069462391715213, "learning_rate": 3.529411764705883e-06, "loss": 1.2098, "mean_token_accuracy": 0.7584702968597412, "step": 7 }, { "epoch": 0.14814814814814814, "grad_norm": 25.19883956752606, "learning_rate": 4.11764705882353e-06, "loss": 1.1305, "mean_token_accuracy": 0.7625263929367065, "step": 8 }, { "epoch": 0.16666666666666666, "grad_norm": 20.418608439087986, "learning_rate": 4.705882352941177e-06, "loss": 0.7787, "mean_token_accuracy": 0.8090993762016296, "step": 9 }, { "epoch": 0.18518518518518517, "grad_norm": 18.730795450552804, "learning_rate": 5.294117647058824e-06, "loss": 0.7188, "mean_token_accuracy": 0.8143796324729919, "step": 10 }, { "epoch": 0.2037037037037037, "grad_norm": 14.138272617769461, "learning_rate": 5.882352941176471e-06, "loss": 0.6116, "mean_token_accuracy": 0.8397451639175415, "step": 11 }, { "epoch": 0.2222222222222222, "grad_norm": 7.596618029685936, "learning_rate": 6.470588235294119e-06, "loss": 0.4657, "mean_token_accuracy": 0.8724555969238281, "step": 12 }, { "epoch": 0.24074074074074073, "grad_norm": 4.119101781430916, "learning_rate": 7.058823529411766e-06, "loss": 0.4309, "mean_token_accuracy": 0.8790558576583862, "step": 13 }, { "epoch": 0.25925925925925924, "grad_norm": 2.4176925019707562, "learning_rate": 7.647058823529411e-06, "loss": 0.4135, "mean_token_accuracy": 0.8794527053833008, "step": 14 }, { "epoch": 0.2777777777777778, "grad_norm": 2.2647199411793886, "learning_rate": 8.23529411764706e-06, "loss": 0.3725, "mean_token_accuracy": 0.891783595085144, "step": 15 }, { "epoch": 0.2962962962962963, "grad_norm": 1.8524926429893849, "learning_rate": 8.823529411764707e-06, "loss": 0.3794, "mean_token_accuracy": 0.8888377547264099, "step": 16 }, { "epoch": 0.3148148148148148, "grad_norm": 1.640430683307726, "learning_rate": 9.411764705882354e-06, "loss": 0.3639, "mean_token_accuracy": 0.8921489715576172, "step": 17 }, { "epoch": 0.3333333333333333, "grad_norm": 1.570505827505393, "learning_rate": 1e-05, "loss": 0.3442, "mean_token_accuracy": 0.8932390213012695, "step": 18 }, { "epoch": 0.35185185185185186, "grad_norm": 1.4135328136828533, "learning_rate": 9.998943841083179e-06, "loss": 0.3512, "mean_token_accuracy": 0.8942129015922546, "step": 19 }, { "epoch": 0.37037037037037035, "grad_norm": 1.5324258913209416, "learning_rate": 9.995775860097897e-06, "loss": 0.3526, "mean_token_accuracy": 0.8924254179000854, "step": 20 }, { "epoch": 0.3888888888888889, "grad_norm": 1.7243977398106463, "learning_rate": 9.990497544106981e-06, "loss": 0.3547, "mean_token_accuracy": 0.8945664763450623, "step": 21 }, { "epoch": 0.4074074074074074, "grad_norm": 1.386050241196015, "learning_rate": 9.983111370772877e-06, "loss": 0.3623, "mean_token_accuracy": 0.8888705372810364, "step": 22 }, { "epoch": 0.42592592592592593, "grad_norm": 1.467216758089058, "learning_rate": 9.97362080719462e-06, "loss": 0.3372, "mean_token_accuracy": 0.8956804871559143, "step": 23 }, { "epoch": 0.4444444444444444, "grad_norm": 1.3785716474601148, "learning_rate": 9.962030308280363e-06, "loss": 0.3382, "mean_token_accuracy": 0.8987942934036255, "step": 24 }, { "epoch": 0.46296296296296297, "grad_norm": 1.3748945772831314, "learning_rate": 9.948345314656234e-06, "loss": 0.3382, "mean_token_accuracy": 0.8963785767555237, "step": 25 }, { "epoch": 0.48148148148148145, "grad_norm": 1.4196685316745923, "learning_rate": 9.932572250112469e-06, "loss": 0.3812, "mean_token_accuracy": 0.8882339596748352, "step": 26 }, { "epoch": 0.5, "grad_norm": 1.4483169776281999, "learning_rate": 9.914718518588076e-06, "loss": 0.3512, "mean_token_accuracy": 0.8926590085029602, "step": 27 }, { "epoch": 0.5185185185185185, "grad_norm": 1.5402138969150352, "learning_rate": 9.89479250069539e-06, "loss": 0.3235, "mean_token_accuracy": 0.9005230069160461, "step": 28 }, { "epoch": 0.5370370370370371, "grad_norm": 1.4292067204728813, "learning_rate": 9.872803549786177e-06, "loss": 0.3527, "mean_token_accuracy": 0.8928272128105164, "step": 29 }, { "epoch": 0.5555555555555556, "grad_norm": 1.4483671102685722, "learning_rate": 9.848761987561132e-06, "loss": 0.3124, "mean_token_accuracy": 0.9046220779418945, "step": 30 }, { "epoch": 0.5740740740740741, "grad_norm": 1.4763218698788387, "learning_rate": 9.822679099224844e-06, "loss": 0.3328, "mean_token_accuracy": 0.8980678915977478, "step": 31 }, { "epoch": 0.5925925925925926, "grad_norm": 1.5799068974635961, "learning_rate": 9.794567128188466e-06, "loss": 0.3375, "mean_token_accuracy": 0.8986757397651672, "step": 32 }, { "epoch": 0.6111111111111112, "grad_norm": 1.577093664869925, "learning_rate": 9.764439270322612e-06, "loss": 0.3744, "mean_token_accuracy": 0.8908596634864807, "step": 33 }, { "epoch": 0.6296296296296297, "grad_norm": 1.5779044571124354, "learning_rate": 9.732309667763158e-06, "loss": 0.3804, "mean_token_accuracy": 0.8886460065841675, "step": 34 }, { "epoch": 0.6481481481481481, "grad_norm": 1.4791494159202596, "learning_rate": 9.69819340227288e-06, "loss": 0.3432, "mean_token_accuracy": 0.8950363397598267, "step": 35 }, { "epoch": 0.6666666666666666, "grad_norm": 1.3517478774476666, "learning_rate": 9.662106488162001e-06, "loss": 0.352, "mean_token_accuracy": 0.8938645124435425, "step": 36 }, { "epoch": 0.6851851851851852, "grad_norm": 1.4032216792961782, "learning_rate": 9.624065864771017e-06, "loss": 0.3632, "mean_token_accuracy": 0.8883957266807556, "step": 37 }, { "epoch": 0.7037037037037037, "grad_norm": 1.499738193983017, "learning_rate": 9.584089388519307e-06, "loss": 0.3665, "mean_token_accuracy": 0.88990318775177, "step": 38 }, { "epoch": 0.7222222222222222, "grad_norm": 1.5239037095270276, "learning_rate": 9.542195824523251e-06, "loss": 0.3284, "mean_token_accuracy": 0.8995599150657654, "step": 39 }, { "epoch": 0.7407407407407407, "grad_norm": 1.5481560151543097, "learning_rate": 9.498404837787811e-06, "loss": 0.3434, "mean_token_accuracy": 0.8942570686340332, "step": 40 }, { "epoch": 0.7592592592592593, "grad_norm": 1.559221040243686, "learning_rate": 9.452736983975708e-06, "loss": 0.3428, "mean_token_accuracy": 0.8983538746833801, "step": 41 }, { "epoch": 0.7777777777777778, "grad_norm": 1.4373798428955245, "learning_rate": 9.405213699758507e-06, "loss": 0.3615, "mean_token_accuracy": 0.8924130797386169, "step": 42 }, { "epoch": 0.7962962962962963, "grad_norm": 1.4535700047967577, "learning_rate": 9.355857292754152e-06, "loss": 0.3339, "mean_token_accuracy": 0.8993980884552002, "step": 43 }, { "epoch": 0.8148148148148148, "grad_norm": 1.327423910804063, "learning_rate": 9.304690931055694e-06, "loss": 0.3564, "mean_token_accuracy": 0.8938746452331543, "step": 44 }, { "epoch": 0.8333333333333334, "grad_norm": 1.3951481767469502, "learning_rate": 9.251738632356086e-06, "loss": 0.3578, "mean_token_accuracy": 0.8920174241065979, "step": 45 }, { "epoch": 0.8518518518518519, "grad_norm": 1.315246360794402, "learning_rate": 9.197025252674192e-06, "loss": 0.3655, "mean_token_accuracy": 0.8932892680168152, "step": 46 }, { "epoch": 0.8703703703703703, "grad_norm": 1.4834801736119503, "learning_rate": 9.140576474687263e-06, "loss": 0.343, "mean_token_accuracy": 0.8958399891853333, "step": 47 }, { "epoch": 0.8888888888888888, "grad_norm": 1.3053313853126278, "learning_rate": 9.082418795675397e-06, "loss": 0.3382, "mean_token_accuracy": 0.896998941898346, "step": 48 }, { "epoch": 0.9074074074074074, "grad_norm": 1.5372874711637974, "learning_rate": 9.022579515083601e-06, "loss": 0.3519, "mean_token_accuracy": 0.8948127627372742, "step": 49 }, { "epoch": 0.9259259259259259, "grad_norm": 1.491827161316295, "learning_rate": 8.961086721707331e-06, "loss": 0.3207, "mean_token_accuracy": 0.903107762336731, "step": 50 }, { "epoch": 0.9444444444444444, "grad_norm": 1.5717125107419234, "learning_rate": 8.897969280507494e-06, "loss": 0.3464, "mean_token_accuracy": 0.8984756469726562, "step": 51 }, { "epoch": 0.9629629629629629, "grad_norm": 1.4048945964359827, "learning_rate": 8.833256819061126e-06, "loss": 0.3496, "mean_token_accuracy": 0.8940179347991943, "step": 52 }, { "epoch": 0.9814814814814815, "grad_norm": 1.4541816999218877, "learning_rate": 8.76697971365409e-06, "loss": 0.3157, "mean_token_accuracy": 0.9050168991088867, "step": 53 }, { "epoch": 1.0, "grad_norm": 1.359087932133861, "learning_rate": 8.69916907502232e-06, "loss": 0.3193, "mean_token_accuracy": 0.903141975402832, "step": 54 }, { "epoch": 1.0185185185185186, "grad_norm": 1.139831522295069, "learning_rate": 8.629856733748325e-06, "loss": 0.2614, "mean_token_accuracy": 0.9204674959182739, "step": 55 }, { "epoch": 1.037037037037037, "grad_norm": 1.2109733029051535, "learning_rate": 8.559075225319786e-06, "loss": 0.2431, "mean_token_accuracy": 0.9270948767662048, "step": 56 }, { "epoch": 1.0555555555555556, "grad_norm": 1.1953987880672259, "learning_rate": 8.48685777485727e-06, "loss": 0.2605, "mean_token_accuracy": 0.9191746115684509, "step": 57 }, { "epoch": 1.074074074074074, "grad_norm": 1.31823870121472, "learning_rate": 8.413238281518225e-06, "loss": 0.2569, "mean_token_accuracy": 0.9213942289352417, "step": 58 }, { "epoch": 1.0925925925925926, "grad_norm": 1.3434083702460653, "learning_rate": 8.33825130258458e-06, "loss": 0.255, "mean_token_accuracy": 0.9220726490020752, "step": 59 }, { "epoch": 1.1111111111111112, "grad_norm": 1.3311689689140414, "learning_rate": 8.261932037241418e-06, "loss": 0.2398, "mean_token_accuracy": 0.9263064861297607, "step": 60 }, { "epoch": 1.1296296296296295, "grad_norm": 1.371914132190983, "learning_rate": 8.184316310054355e-06, "loss": 0.2421, "mean_token_accuracy": 0.925538182258606, "step": 61 }, { "epoch": 1.1481481481481481, "grad_norm": 1.2357585408478602, "learning_rate": 8.10544055415332e-06, "loss": 0.2689, "mean_token_accuracy": 0.9196819067001343, "step": 62 }, { "epoch": 1.1666666666666667, "grad_norm": 1.1634817498053858, "learning_rate": 8.025341794130722e-06, "loss": 0.2579, "mean_token_accuracy": 0.9201351404190063, "step": 63 }, { "epoch": 1.1851851851851851, "grad_norm": 1.2614222517839995, "learning_rate": 7.944057628661948e-06, "loss": 0.2516, "mean_token_accuracy": 0.923024594783783, "step": 64 }, { "epoch": 1.2037037037037037, "grad_norm": 1.243413098569248, "learning_rate": 7.861626212856404e-06, "loss": 0.2558, "mean_token_accuracy": 0.9209133982658386, "step": 65 }, { "epoch": 1.2222222222222223, "grad_norm": 1.213962019110298, "learning_rate": 7.778086240347343e-06, "loss": 0.2488, "mean_token_accuracy": 0.9236682057380676, "step": 66 }, { "epoch": 1.2407407407407407, "grad_norm": 1.1301355101794357, "learning_rate": 7.693476925128937e-06, "loss": 0.2676, "mean_token_accuracy": 0.9169973731040955, "step": 67 }, { "epoch": 1.2592592592592593, "grad_norm": 1.037571809068588, "learning_rate": 7.607837983149057e-06, "loss": 0.2399, "mean_token_accuracy": 0.9276074767112732, "step": 68 }, { "epoch": 1.2777777777777777, "grad_norm": 1.228055556807229, "learning_rate": 7.521209613666457e-06, "loss": 0.2253, "mean_token_accuracy": 0.9291183352470398, "step": 69 }, { "epoch": 1.2962962962962963, "grad_norm": 1.1604461514204596, "learning_rate": 7.433632480381083e-06, "loss": 0.2302, "mean_token_accuracy": 0.9275596141815186, "step": 70 }, { "epoch": 1.3148148148148149, "grad_norm": 1.1429904081958335, "learning_rate": 7.345147692346373e-06, "loss": 0.2468, "mean_token_accuracy": 0.9256559014320374, "step": 71 }, { "epoch": 1.3333333333333333, "grad_norm": 1.2670257480788256, "learning_rate": 7.255796784672496e-06, "loss": 0.2756, "mean_token_accuracy": 0.9166375994682312, "step": 72 }, { "epoch": 1.3518518518518519, "grad_norm": 1.2006495346193158, "learning_rate": 7.165621699029615e-06, "loss": 0.2675, "mean_token_accuracy": 0.9179262518882751, "step": 73 }, { "epoch": 1.3703703703703702, "grad_norm": 1.2199671885347414, "learning_rate": 7.0746647639602994e-06, "loss": 0.246, "mean_token_accuracy": 0.9246431589126587, "step": 74 }, { "epoch": 1.3888888888888888, "grad_norm": 1.2325244346693196, "learning_rate": 6.982968675010332e-06, "loss": 0.2604, "mean_token_accuracy": 0.9215620756149292, "step": 75 }, { "epoch": 1.4074074074074074, "grad_norm": 1.222471090199412, "learning_rate": 6.890576474687264e-06, "loss": 0.2555, "mean_token_accuracy": 0.9202633500099182, "step": 76 }, { "epoch": 1.425925925925926, "grad_norm": 1.1650774082286408, "learning_rate": 6.797531532256079e-06, "loss": 0.2535, "mean_token_accuracy": 0.9203940629959106, "step": 77 }, { "epoch": 1.4444444444444444, "grad_norm": 1.155157813327957, "learning_rate": 6.703877523381495e-06, "loss": 0.2514, "mean_token_accuracy": 0.9239696860313416, "step": 78 }, { "epoch": 1.462962962962963, "grad_norm": 1.2322059130014582, "learning_rate": 6.609658409626431e-06, "loss": 0.2522, "mean_token_accuracy": 0.9223854541778564, "step": 79 }, { "epoch": 1.4814814814814814, "grad_norm": 1.1431661429529452, "learning_rate": 6.514918417816275e-06, "loss": 0.2645, "mean_token_accuracy": 0.9201213121414185, "step": 80 }, { "epoch": 1.5, "grad_norm": 1.2154621857829624, "learning_rate": 6.419702019278643e-06, "loss": 0.2351, "mean_token_accuracy": 0.9279325008392334, "step": 81 }, { "epoch": 1.5185185185185186, "grad_norm": 1.2629658642141612, "learning_rate": 6.324053908968353e-06, "loss": 0.2499, "mean_token_accuracy": 0.9237509369850159, "step": 82 }, { "epoch": 1.5370370370370372, "grad_norm": 1.1434498930723798, "learning_rate": 6.228018984487443e-06, "loss": 0.2424, "mean_token_accuracy": 0.9255508780479431, "step": 83 }, { "epoch": 1.5555555555555556, "grad_norm": 1.183948134135304, "learning_rate": 6.13164232501005e-06, "loss": 0.2662, "mean_token_accuracy": 0.9195141196250916, "step": 84 }, { "epoch": 1.574074074074074, "grad_norm": 1.0816053650678727, "learning_rate": 6.034969170122079e-06, "loss": 0.2251, "mean_token_accuracy": 0.9291943311691284, "step": 85 }, { "epoch": 1.5925925925925926, "grad_norm": 1.3090327941892361, "learning_rate": 5.938044898585555e-06, "loss": 0.2845, "mean_token_accuracy": 0.9130949378013611, "step": 86 }, { "epoch": 1.6111111111111112, "grad_norm": 1.090271437723717, "learning_rate": 5.840915007037648e-06, "loss": 0.2471, "mean_token_accuracy": 0.9219435453414917, "step": 87 }, { "epoch": 1.6296296296296298, "grad_norm": 1.3405339469885855, "learning_rate": 5.74362508863438e-06, "loss": 0.2726, "mean_token_accuracy": 0.9205261468887329, "step": 88 }, { "epoch": 1.6481481481481481, "grad_norm": 1.1088012675507433, "learning_rate": 5.646220811649013e-06, "loss": 0.2599, "mean_token_accuracy": 0.9209275245666504, "step": 89 }, { "epoch": 1.6666666666666665, "grad_norm": 1.1041623807960375, "learning_rate": 5.5487478980351805e-06, "loss": 0.2766, "mean_token_accuracy": 0.9163511395454407, "step": 90 }, { "epoch": 1.6851851851851851, "grad_norm": 1.0975600683056852, "learning_rate": 5.451252101964821e-06, "loss": 0.2619, "mean_token_accuracy": 0.9195134043693542, "step": 91 }, { "epoch": 1.7037037037037037, "grad_norm": 1.0559577091421926, "learning_rate": 5.353779188350989e-06, "loss": 0.2542, "mean_token_accuracy": 0.9217535853385925, "step": 92 }, { "epoch": 1.7222222222222223, "grad_norm": 1.0902279304137317, "learning_rate": 5.256374911365621e-06, "loss": 0.2442, "mean_token_accuracy": 0.9247879385948181, "step": 93 }, { "epoch": 1.7407407407407407, "grad_norm": 1.1900024135485159, "learning_rate": 5.159084992962354e-06, "loss": 0.2413, "mean_token_accuracy": 0.9264701008796692, "step": 94 }, { "epoch": 1.7592592592592593, "grad_norm": 1.114375986129405, "learning_rate": 5.061955101414448e-06, "loss": 0.2603, "mean_token_accuracy": 0.9205346703529358, "step": 95 }, { "epoch": 1.7777777777777777, "grad_norm": 1.1499780047037227, "learning_rate": 4.9650308298779215e-06, "loss": 0.2477, "mean_token_accuracy": 0.9239223599433899, "step": 96 }, { "epoch": 1.7962962962962963, "grad_norm": 1.1492427257637925, "learning_rate": 4.8683576749899505e-06, "loss": 0.2783, "mean_token_accuracy": 0.9156987071037292, "step": 97 }, { "epoch": 1.8148148148148149, "grad_norm": 1.2037535748153476, "learning_rate": 4.771981015512559e-06, "loss": 0.2419, "mean_token_accuracy": 0.9248070120811462, "step": 98 }, { "epoch": 1.8333333333333335, "grad_norm": 1.1672243929637462, "learning_rate": 4.675946091031648e-06, "loss": 0.2634, "mean_token_accuracy": 0.9204791188240051, "step": 99 }, { "epoch": 1.8518518518518519, "grad_norm": 1.2030086649458018, "learning_rate": 4.5802979807213585e-06, "loss": 0.2691, "mean_token_accuracy": 0.9181145429611206, "step": 100 }, { "epoch": 1.8703703703703702, "grad_norm": 1.2410994948220553, "learning_rate": 4.4850815821837265e-06, "loss": 0.2637, "mean_token_accuracy": 0.9215072393417358, "step": 101 }, { "epoch": 1.8888888888888888, "grad_norm": 1.1285340553042031, "learning_rate": 4.3903415903735725e-06, "loss": 0.265, "mean_token_accuracy": 0.9205712080001831, "step": 102 }, { "epoch": 1.9074074074074074, "grad_norm": 1.15167294242117, "learning_rate": 4.296122476618507e-06, "loss": 0.2491, "mean_token_accuracy": 0.9238101840019226, "step": 103 }, { "epoch": 1.925925925925926, "grad_norm": 1.128579734185751, "learning_rate": 4.202468467743922e-06, "loss": 0.2613, "mean_token_accuracy": 0.9208459854125977, "step": 104 }, { "epoch": 1.9444444444444444, "grad_norm": 1.094268425976433, "learning_rate": 4.109423525312738e-06, "loss": 0.2479, "mean_token_accuracy": 0.9242894649505615, "step": 105 }, { "epoch": 1.9629629629629628, "grad_norm": 1.1260095311018505, "learning_rate": 4.017031324989669e-06, "loss": 0.245, "mean_token_accuracy": 0.923931360244751, "step": 106 }, { "epoch": 1.9814814814814814, "grad_norm": 1.0715893334011972, "learning_rate": 3.925335236039702e-06, "loss": 0.2628, "mean_token_accuracy": 0.920842707157135, "step": 107 }, { "epoch": 2.0, "grad_norm": 0.9541245934737955, "learning_rate": 3.834378300970385e-06, "loss": 0.2317, "mean_token_accuracy": 0.9292216300964355, "step": 108 }, { "epoch": 2.0185185185185186, "grad_norm": 0.9950325332822214, "learning_rate": 3.7442032153275053e-06, "loss": 0.1862, "mean_token_accuracy": 0.944040834903717, "step": 109 }, { "epoch": 2.037037037037037, "grad_norm": 0.9031680263761843, "learning_rate": 3.654852307653628e-06, "loss": 0.1729, "mean_token_accuracy": 0.9489833116531372, "step": 110 }, { "epoch": 2.0555555555555554, "grad_norm": 0.8700426144680551, "learning_rate": 3.5663675196189184e-06, "loss": 0.1723, "mean_token_accuracy": 0.9485137462615967, "step": 111 }, { "epoch": 2.074074074074074, "grad_norm": 0.920806616685076, "learning_rate": 3.478790386333546e-06, "loss": 0.2035, "mean_token_accuracy": 0.9403110146522522, "step": 112 }, { "epoch": 2.0925925925925926, "grad_norm": 0.9156796841824484, "learning_rate": 3.392162016850945e-06, "loss": 0.1787, "mean_token_accuracy": 0.9458126425743103, "step": 113 }, { "epoch": 2.111111111111111, "grad_norm": 0.8777527626143813, "learning_rate": 3.3065230748710646e-06, "loss": 0.1764, "mean_token_accuracy": 0.9460602402687073, "step": 114 }, { "epoch": 2.1296296296296298, "grad_norm": 0.8387955266803102, "learning_rate": 3.221913759652657e-06, "loss": 0.163, "mean_token_accuracy": 0.9506521224975586, "step": 115 }, { "epoch": 2.148148148148148, "grad_norm": 0.9758818845077261, "learning_rate": 3.138373787143598e-06, "loss": 0.1818, "mean_token_accuracy": 0.9452192187309265, "step": 116 }, { "epoch": 2.1666666666666665, "grad_norm": 0.9533178926074783, "learning_rate": 3.055942371338052e-06, "loss": 0.1696, "mean_token_accuracy": 0.9481253623962402, "step": 117 }, { "epoch": 2.185185185185185, "grad_norm": 1.0699519394655053, "learning_rate": 2.9746582058692803e-06, "loss": 0.1969, "mean_token_accuracy": 0.9410567879676819, "step": 118 }, { "epoch": 2.2037037037037037, "grad_norm": 0.9486439388249865, "learning_rate": 2.894559445846682e-06, "loss": 0.1734, "mean_token_accuracy": 0.9469440579414368, "step": 119 }, { "epoch": 2.2222222222222223, "grad_norm": 1.013131329250228, "learning_rate": 2.8156836899456475e-06, "loss": 0.1756, "mean_token_accuracy": 0.9473387598991394, "step": 120 }, { "epoch": 2.240740740740741, "grad_norm": 0.9433247487590843, "learning_rate": 2.7380679627585817e-06, "loss": 0.1625, "mean_token_accuracy": 0.9486984014511108, "step": 121 }, { "epoch": 2.259259259259259, "grad_norm": 0.9981757376384809, "learning_rate": 2.661748697415423e-06, "loss": 0.1752, "mean_token_accuracy": 0.9464225769042969, "step": 122 }, { "epoch": 2.2777777777777777, "grad_norm": 0.9857526409152109, "learning_rate": 2.586761718481776e-06, "loss": 0.1809, "mean_token_accuracy": 0.9457290768623352, "step": 123 }, { "epoch": 2.2962962962962963, "grad_norm": 0.9462725952295606, "learning_rate": 2.5131422251427313e-06, "loss": 0.1687, "mean_token_accuracy": 0.9483760595321655, "step": 124 }, { "epoch": 2.314814814814815, "grad_norm": 1.010191097162094, "learning_rate": 2.440924774680215e-06, "loss": 0.1831, "mean_token_accuracy": 0.9440175294876099, "step": 125 }, { "epoch": 2.3333333333333335, "grad_norm": 0.9374805059731371, "learning_rate": 2.3701432662516772e-06, "loss": 0.1885, "mean_token_accuracy": 0.9436575174331665, "step": 126 }, { "epoch": 2.351851851851852, "grad_norm": 0.9269388287237318, "learning_rate": 2.300830924977683e-06, "loss": 0.1768, "mean_token_accuracy": 0.946081280708313, "step": 127 }, { "epoch": 2.3703703703703702, "grad_norm": 0.8355613001726996, "learning_rate": 2.2330202863459123e-06, "loss": 0.1938, "mean_token_accuracy": 0.9410346150398254, "step": 128 }, { "epoch": 2.388888888888889, "grad_norm": 0.8762995235599421, "learning_rate": 2.166743180938875e-06, "loss": 0.1839, "mean_token_accuracy": 0.9433231949806213, "step": 129 }, { "epoch": 2.4074074074074074, "grad_norm": 0.8151541961388346, "learning_rate": 2.102030719492508e-06, "loss": 0.1746, "mean_token_accuracy": 0.9466114044189453, "step": 130 }, { "epoch": 2.425925925925926, "grad_norm": 0.8453160237631886, "learning_rate": 2.03891327829267e-06, "loss": 0.174, "mean_token_accuracy": 0.9469768404960632, "step": 131 }, { "epoch": 2.4444444444444446, "grad_norm": 0.8395339617580972, "learning_rate": 1.9774204849164004e-06, "loss": 0.1866, "mean_token_accuracy": 0.9433194398880005, "step": 132 }, { "epoch": 2.462962962962963, "grad_norm": 0.8873824785732524, "learning_rate": 1.9175812043246034e-06, "loss": 0.1939, "mean_token_accuracy": 0.9412445425987244, "step": 133 }, { "epoch": 2.4814814814814814, "grad_norm": 0.8522508069696332, "learning_rate": 1.8594235253127373e-06, "loss": 0.183, "mean_token_accuracy": 0.9447925686836243, "step": 134 }, { "epoch": 2.5, "grad_norm": 0.9289476468580379, "learning_rate": 1.8029747473258092e-06, "loss": 0.1769, "mean_token_accuracy": 0.9454068541526794, "step": 135 }, { "epoch": 2.5185185185185186, "grad_norm": 0.9311186985274743, "learning_rate": 1.7482613676439153e-06, "loss": 0.1809, "mean_token_accuracy": 0.945341169834137, "step": 136 }, { "epoch": 2.537037037037037, "grad_norm": 0.8214964900059085, "learning_rate": 1.6953090689443074e-06, "loss": 0.1679, "mean_token_accuracy": 0.9475165605545044, "step": 137 }, { "epoch": 2.5555555555555554, "grad_norm": 0.8110726136011421, "learning_rate": 1.6441427072458493e-06, "loss": 0.1725, "mean_token_accuracy": 0.9460154175758362, "step": 138 }, { "epoch": 2.574074074074074, "grad_norm": 0.8629417011035696, "learning_rate": 1.5947863002414938e-06, "loss": 0.1773, "mean_token_accuracy": 0.9460445046424866, "step": 139 }, { "epoch": 2.5925925925925926, "grad_norm": 0.9131344482433362, "learning_rate": 1.5472630160242921e-06, "loss": 0.1888, "mean_token_accuracy": 0.9422698020935059, "step": 140 }, { "epoch": 2.611111111111111, "grad_norm": 0.8995979478525096, "learning_rate": 1.5015951622121896e-06, "loss": 0.1812, "mean_token_accuracy": 0.9445046186447144, "step": 141 }, { "epoch": 2.6296296296296298, "grad_norm": 0.8973291525652752, "learning_rate": 1.457804175476751e-06, "loss": 0.1718, "mean_token_accuracy": 0.9484192728996277, "step": 142 }, { "epoch": 2.648148148148148, "grad_norm": 0.8806301946411274, "learning_rate": 1.4159106114806943e-06, "loss": 0.1763, "mean_token_accuracy": 0.9455322623252869, "step": 143 }, { "epoch": 2.6666666666666665, "grad_norm": 0.9426279555089078, "learning_rate": 1.3759341352289832e-06, "loss": 0.1819, "mean_token_accuracy": 0.943683922290802, "step": 144 }, { "epoch": 2.685185185185185, "grad_norm": 0.8337535019025356, "learning_rate": 1.3378935118380004e-06, "loss": 0.1739, "mean_token_accuracy": 0.9465071558952332, "step": 145 }, { "epoch": 2.7037037037037037, "grad_norm": 0.8809269775768832, "learning_rate": 1.3018065977271215e-06, "loss": 0.1831, "mean_token_accuracy": 0.9447413086891174, "step": 146 }, { "epoch": 2.7222222222222223, "grad_norm": 0.8959429008425933, "learning_rate": 1.2676903322368423e-06, "loss": 0.1815, "mean_token_accuracy": 0.9448564648628235, "step": 147 }, { "epoch": 2.7407407407407405, "grad_norm": 0.892750069418456, "learning_rate": 1.2355607296773896e-06, "loss": 0.1798, "mean_token_accuracy": 0.9448550939559937, "step": 148 }, { "epoch": 2.7592592592592595, "grad_norm": 0.9530453934988615, "learning_rate": 1.2054328718115336e-06, "loss": 0.1893, "mean_token_accuracy": 0.9425032138824463, "step": 149 }, { "epoch": 2.7777777777777777, "grad_norm": 0.8446243004896555, "learning_rate": 1.1773209007751562e-06, "loss": 0.1777, "mean_token_accuracy": 0.9456593990325928, "step": 150 }, { "epoch": 2.7962962962962963, "grad_norm": 0.8770894361882803, "learning_rate": 1.1512380124388695e-06, "loss": 0.1739, "mean_token_accuracy": 0.9469501376152039, "step": 151 }, { "epoch": 2.814814814814815, "grad_norm": 0.9163706165945724, "learning_rate": 1.127196450213825e-06, "loss": 0.1644, "mean_token_accuracy": 0.948975682258606, "step": 152 }, { "epoch": 2.8333333333333335, "grad_norm": 0.8953025604312923, "learning_rate": 1.1052074993046102e-06, "loss": 0.1845, "mean_token_accuracy": 0.9444332718849182, "step": 153 }, { "epoch": 2.851851851851852, "grad_norm": 0.8938609728248295, "learning_rate": 1.0852814814119238e-06, "loss": 0.1759, "mean_token_accuracy": 0.9459174275398254, "step": 154 }, { "epoch": 2.8703703703703702, "grad_norm": 1.0176759246319027, "learning_rate": 1.0674277498875325e-06, "loss": 0.1727, "mean_token_accuracy": 0.9466116428375244, "step": 155 }, { "epoch": 2.888888888888889, "grad_norm": 0.9239006043222746, "learning_rate": 1.0516546853437686e-06, "loss": 0.1803, "mean_token_accuracy": 0.943221926689148, "step": 156 }, { "epoch": 2.9074074074074074, "grad_norm": 0.7911892818762664, "learning_rate": 1.0379696917196378e-06, "loss": 0.1643, "mean_token_accuracy": 0.9492570161819458, "step": 157 }, { "epoch": 2.925925925925926, "grad_norm": 0.9656881157079873, "learning_rate": 1.026379192805382e-06, "loss": 0.1765, "mean_token_accuracy": 0.9460762143135071, "step": 158 }, { "epoch": 2.9444444444444446, "grad_norm": 1.0195811706557167, "learning_rate": 1.0168886292271246e-06, "loss": 0.1765, "mean_token_accuracy": 0.9457534551620483, "step": 159 }, { "epoch": 2.962962962962963, "grad_norm": 0.8488680044411275, "learning_rate": 1.0095024558930204e-06, "loss": 0.1753, "mean_token_accuracy": 0.9460701942443848, "step": 160 }, { "epoch": 2.9814814814814814, "grad_norm": 0.8614929763896857, "learning_rate": 1.004224139902105e-06, "loss": 0.1809, "mean_token_accuracy": 0.9434822201728821, "step": 161 }, { "epoch": 3.0, "grad_norm": 0.7985383850024447, "learning_rate": 1.0010561589168217e-06, "loss": 0.1582, "mean_token_accuracy": 0.9517039656639099, "step": 162 }, { "epoch": 3.0, "step": 162, "total_flos": 6812915466240.0, "train_loss": 0.3203352055983779, "train_runtime": 2102.8589, "train_samples_per_second": 9.774, "train_steps_per_second": 0.077 } ], "logging_steps": 1, "max_steps": 162, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6812915466240.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }