{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2949, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001017293997965412, "grad_norm": 31.607948303222656, "learning_rate": 0.0, "loss": 1.2814, "mean_token_accuracy": 0.7476553320884705, "num_tokens": 314183.0, "step": 1 }, { "epoch": 0.002034587995930824, "grad_norm": 31.262537002563477, "learning_rate": 3.389830508474576e-09, "loss": 1.3075, "mean_token_accuracy": 0.740410327911377, "num_tokens": 627227.0, "step": 2 }, { "epoch": 0.003051881993896236, "grad_norm": 31.910720825195312, "learning_rate": 6.779661016949152e-09, "loss": 1.3239, "mean_token_accuracy": 0.7391442656517029, "num_tokens": 926658.0, "step": 3 }, { "epoch": 0.004069175991861648, "grad_norm": 31.123292922973633, "learning_rate": 1.0169491525423728e-08, "loss": 1.3076, "mean_token_accuracy": 0.7411203384399414, "num_tokens": 1253116.0, "step": 4 }, { "epoch": 0.00508646998982706, "grad_norm": 30.289949417114258, "learning_rate": 1.3559322033898304e-08, "loss": 1.2796, "mean_token_accuracy": 0.7462595105171204, "num_tokens": 1582136.0, "step": 5 }, { "epoch": 0.006103763987792472, "grad_norm": 31.040441513061523, "learning_rate": 1.6949152542372882e-08, "loss": 1.295, "mean_token_accuracy": 0.7442319989204407, "num_tokens": 1899325.0, "step": 6 }, { "epoch": 0.007121057985757884, "grad_norm": 30.435606002807617, "learning_rate": 2.0338983050847456e-08, "loss": 1.2849, "mean_token_accuracy": 0.7433764338493347, "num_tokens": 2225992.0, "step": 7 }, { "epoch": 0.008138351983723296, "grad_norm": 30.160449981689453, "learning_rate": 2.3728813559322034e-08, "loss": 1.281, "mean_token_accuracy": 0.7444077730178833, "num_tokens": 2564876.0, "step": 8 }, { "epoch": 0.009155645981688708, "grad_norm": 31.000755310058594, "learning_rate": 2.7118644067796608e-08, "loss": 1.2994, "mean_token_accuracy": 0.7422164678573608, "num_tokens": 2885094.0, "step": 9 }, { "epoch": 0.01017293997965412, "grad_norm": 32.12958908081055, "learning_rate": 3.0508474576271186e-08, "loss": 1.2746, "mean_token_accuracy": 0.7504814863204956, "num_tokens": 3200547.0, "step": 10 }, { "epoch": 0.011190233977619531, "grad_norm": 31.04578399658203, "learning_rate": 3.3898305084745764e-08, "loss": 1.3196, "mean_token_accuracy": 0.7372593879699707, "num_tokens": 3522299.0, "step": 11 }, { "epoch": 0.012207527975584944, "grad_norm": 30.585004806518555, "learning_rate": 3.728813559322034e-08, "loss": 1.3012, "mean_token_accuracy": 0.7397406101226807, "num_tokens": 3843052.0, "step": 12 }, { "epoch": 0.013224821973550356, "grad_norm": 30.747228622436523, "learning_rate": 4.067796610169491e-08, "loss": 1.2812, "mean_token_accuracy": 0.745479941368103, "num_tokens": 4163923.0, "step": 13 }, { "epoch": 0.014242115971515769, "grad_norm": 29.38701629638672, "learning_rate": 4.406779661016949e-08, "loss": 1.2879, "mean_token_accuracy": 0.7403217554092407, "num_tokens": 4501785.0, "step": 14 }, { "epoch": 0.015259409969481181, "grad_norm": 30.99683380126953, "learning_rate": 4.745762711864407e-08, "loss": 1.278, "mean_token_accuracy": 0.7468803524971008, "num_tokens": 4826248.0, "step": 15 }, { "epoch": 0.01627670396744659, "grad_norm": 30.700899124145508, "learning_rate": 5.0847457627118645e-08, "loss": 1.2839, "mean_token_accuracy": 0.7428768873214722, "num_tokens": 5159429.0, "step": 16 }, { "epoch": 0.017293997965412006, "grad_norm": 31.73674201965332, "learning_rate": 5.4237288135593217e-08, "loss": 1.3102, "mean_token_accuracy": 0.7424096465110779, "num_tokens": 5470920.0, "step": 17 }, { "epoch": 0.018311291963377416, "grad_norm": 31.500507354736328, "learning_rate": 5.7627118644067794e-08, "loss": 1.3163, "mean_token_accuracy": 0.7393807172775269, "num_tokens": 5783722.0, "step": 18 }, { "epoch": 0.019328585961342827, "grad_norm": 30.2214412689209, "learning_rate": 6.101694915254237e-08, "loss": 1.2991, "mean_token_accuracy": 0.7394208312034607, "num_tokens": 6105483.0, "step": 19 }, { "epoch": 0.02034587995930824, "grad_norm": 30.756328582763672, "learning_rate": 6.440677966101695e-08, "loss": 1.3097, "mean_token_accuracy": 0.7406641840934753, "num_tokens": 6423575.0, "step": 20 }, { "epoch": 0.021363173957273652, "grad_norm": 31.446762084960938, "learning_rate": 6.779661016949153e-08, "loss": 1.3054, "mean_token_accuracy": 0.7429530024528503, "num_tokens": 6740933.0, "step": 21 }, { "epoch": 0.022380467955239063, "grad_norm": 31.185230255126953, "learning_rate": 7.11864406779661e-08, "loss": 1.3002, "mean_token_accuracy": 0.7414672374725342, "num_tokens": 7053795.0, "step": 22 }, { "epoch": 0.023397761953204477, "grad_norm": 30.35567855834961, "learning_rate": 7.457627118644068e-08, "loss": 1.3191, "mean_token_accuracy": 0.7355214357376099, "num_tokens": 7375236.0, "step": 23 }, { "epoch": 0.024415055951169887, "grad_norm": 30.93867301940918, "learning_rate": 7.796610169491526e-08, "loss": 1.3039, "mean_token_accuracy": 0.739065945148468, "num_tokens": 7693804.0, "step": 24 }, { "epoch": 0.0254323499491353, "grad_norm": 29.841861724853516, "learning_rate": 8.135593220338982e-08, "loss": 1.2686, "mean_token_accuracy": 0.7462300062179565, "num_tokens": 8024175.0, "step": 25 }, { "epoch": 0.026449643947100712, "grad_norm": 30.483552932739258, "learning_rate": 8.47457627118644e-08, "loss": 1.2886, "mean_token_accuracy": 0.7437601089477539, "num_tokens": 8351946.0, "step": 26 }, { "epoch": 0.027466937945066123, "grad_norm": 30.007417678833008, "learning_rate": 8.813559322033898e-08, "loss": 1.2807, "mean_token_accuracy": 0.7446364164352417, "num_tokens": 8661017.0, "step": 27 }, { "epoch": 0.028484231943031537, "grad_norm": 30.700592041015625, "learning_rate": 9.152542372881356e-08, "loss": 1.2904, "mean_token_accuracy": 0.7436801195144653, "num_tokens": 8975390.0, "step": 28 }, { "epoch": 0.029501525940996948, "grad_norm": 31.300748825073242, "learning_rate": 9.491525423728814e-08, "loss": 1.3052, "mean_token_accuracy": 0.7431154847145081, "num_tokens": 9280736.0, "step": 29 }, { "epoch": 0.030518819938962362, "grad_norm": 30.187368392944336, "learning_rate": 9.830508474576271e-08, "loss": 1.2955, "mean_token_accuracy": 0.7419048547744751, "num_tokens": 9587259.0, "step": 30 }, { "epoch": 0.03153611393692777, "grad_norm": 30.064258575439453, "learning_rate": 1.0169491525423729e-07, "loss": 1.2773, "mean_token_accuracy": 0.7429222464561462, "num_tokens": 9903156.0, "step": 31 }, { "epoch": 0.03255340793489318, "grad_norm": 30.224626541137695, "learning_rate": 1.0508474576271186e-07, "loss": 1.2707, "mean_token_accuracy": 0.7471416592597961, "num_tokens": 10214147.0, "step": 32 }, { "epoch": 0.0335707019328586, "grad_norm": 30.359447479248047, "learning_rate": 1.0847457627118643e-07, "loss": 1.2772, "mean_token_accuracy": 0.7445785403251648, "num_tokens": 10528001.0, "step": 33 }, { "epoch": 0.03458799593082401, "grad_norm": 28.968889236450195, "learning_rate": 1.1186440677966101e-07, "loss": 1.3036, "mean_token_accuracy": 0.7359491586685181, "num_tokens": 10852213.0, "step": 34 }, { "epoch": 0.03560528992878942, "grad_norm": 28.165321350097656, "learning_rate": 1.1525423728813559e-07, "loss": 1.2836, "mean_token_accuracy": 0.7375534176826477, "num_tokens": 11178241.0, "step": 35 }, { "epoch": 0.03662258392675483, "grad_norm": 28.93723487854004, "learning_rate": 1.1864406779661017e-07, "loss": 1.262, "mean_token_accuracy": 0.745404839515686, "num_tokens": 11489548.0, "step": 36 }, { "epoch": 0.03763987792472025, "grad_norm": 27.576478958129883, "learning_rate": 1.2203389830508474e-07, "loss": 1.2396, "mean_token_accuracy": 0.745377242565155, "num_tokens": 11818390.0, "step": 37 }, { "epoch": 0.038657171922685654, "grad_norm": 27.26495933532715, "learning_rate": 1.254237288135593e-07, "loss": 1.276, "mean_token_accuracy": 0.7369311451911926, "num_tokens": 12131732.0, "step": 38 }, { "epoch": 0.03967446592065107, "grad_norm": 28.351390838623047, "learning_rate": 1.288135593220339e-07, "loss": 1.2852, "mean_token_accuracy": 0.7390662431716919, "num_tokens": 12436906.0, "step": 39 }, { "epoch": 0.04069175991861648, "grad_norm": 27.946863174438477, "learning_rate": 1.3220338983050846e-07, "loss": 1.2786, "mean_token_accuracy": 0.7396658658981323, "num_tokens": 12747689.0, "step": 40 }, { "epoch": 0.04170905391658189, "grad_norm": 27.03894805908203, "learning_rate": 1.3559322033898305e-07, "loss": 1.2435, "mean_token_accuracy": 0.7437921762466431, "num_tokens": 13081387.0, "step": 41 }, { "epoch": 0.042726347914547304, "grad_norm": 26.022869110107422, "learning_rate": 1.3898305084745762e-07, "loss": 1.2492, "mean_token_accuracy": 0.7406527996063232, "num_tokens": 13408622.0, "step": 42 }, { "epoch": 0.04374364191251272, "grad_norm": 28.38178253173828, "learning_rate": 1.423728813559322e-07, "loss": 1.2827, "mean_token_accuracy": 0.7385092973709106, "num_tokens": 13706016.0, "step": 43 }, { "epoch": 0.044760935910478125, "grad_norm": 27.15532112121582, "learning_rate": 1.4576271186440677e-07, "loss": 1.2311, "mean_token_accuracy": 0.7478699684143066, "num_tokens": 14029427.0, "step": 44 }, { "epoch": 0.04577822990844354, "grad_norm": 26.92449378967285, "learning_rate": 1.4915254237288137e-07, "loss": 1.2324, "mean_token_accuracy": 0.7458136677742004, "num_tokens": 14356828.0, "step": 45 }, { "epoch": 0.04679552390640895, "grad_norm": 27.326919555664062, "learning_rate": 1.5254237288135593e-07, "loss": 1.234, "mean_token_accuracy": 0.7476122975349426, "num_tokens": 14675485.0, "step": 46 }, { "epoch": 0.04781281790437437, "grad_norm": 28.272430419921875, "learning_rate": 1.5593220338983052e-07, "loss": 1.269, "mean_token_accuracy": 0.741723358631134, "num_tokens": 14983974.0, "step": 47 }, { "epoch": 0.048830111902339775, "grad_norm": 27.293691635131836, "learning_rate": 1.5932203389830506e-07, "loss": 1.2908, "mean_token_accuracy": 0.7345436811447144, "num_tokens": 15289033.0, "step": 48 }, { "epoch": 0.04984740590030519, "grad_norm": 25.178010940551758, "learning_rate": 1.6271186440677965e-07, "loss": 1.2383, "mean_token_accuracy": 0.7389808893203735, "num_tokens": 15594811.0, "step": 49 }, { "epoch": 0.0508646998982706, "grad_norm": 20.5047664642334, "learning_rate": 1.6610169491525421e-07, "loss": 1.1516, "mean_token_accuracy": 0.7475082874298096, "num_tokens": 15922340.0, "step": 50 }, { "epoch": 0.05188199389623601, "grad_norm": 19.705347061157227, "learning_rate": 1.694915254237288e-07, "loss": 1.1562, "mean_token_accuracy": 0.7475390434265137, "num_tokens": 16238658.0, "step": 51 }, { "epoch": 0.052899287894201424, "grad_norm": 20.425750732421875, "learning_rate": 1.7288135593220337e-07, "loss": 1.1638, "mean_token_accuracy": 0.7460206747055054, "num_tokens": 16546360.0, "step": 52 }, { "epoch": 0.05391658189216684, "grad_norm": 20.014739990234375, "learning_rate": 1.7627118644067796e-07, "loss": 1.1551, "mean_token_accuracy": 0.7466144561767578, "num_tokens": 16860358.0, "step": 53 }, { "epoch": 0.054933875890132246, "grad_norm": 19.32347869873047, "learning_rate": 1.7966101694915252e-07, "loss": 1.1763, "mean_token_accuracy": 0.7402143478393555, "num_tokens": 17181452.0, "step": 54 }, { "epoch": 0.05595116988809766, "grad_norm": 19.63179588317871, "learning_rate": 1.8305084745762712e-07, "loss": 1.1501, "mean_token_accuracy": 0.7471902370452881, "num_tokens": 17493379.0, "step": 55 }, { "epoch": 0.056968463886063074, "grad_norm": 19.4603328704834, "learning_rate": 1.8644067796610168e-07, "loss": 1.1404, "mean_token_accuracy": 0.75003582239151, "num_tokens": 17810924.0, "step": 56 }, { "epoch": 0.05798575788402848, "grad_norm": 18.85099983215332, "learning_rate": 1.8983050847457627e-07, "loss": 1.166, "mean_token_accuracy": 0.7426574230194092, "num_tokens": 18134320.0, "step": 57 }, { "epoch": 0.059003051881993895, "grad_norm": 20.299636840820312, "learning_rate": 1.9322033898305084e-07, "loss": 1.1747, "mean_token_accuracy": 0.7439650297164917, "num_tokens": 18426725.0, "step": 58 }, { "epoch": 0.06002034587995931, "grad_norm": 18.17243003845215, "learning_rate": 1.9661016949152543e-07, "loss": 1.1545, "mean_token_accuracy": 0.7428985834121704, "num_tokens": 18749574.0, "step": 59 }, { "epoch": 0.061037639877924724, "grad_norm": 17.998579025268555, "learning_rate": 2e-07, "loss": 1.1666, "mean_token_accuracy": 0.7411248683929443, "num_tokens": 19068637.0, "step": 60 }, { "epoch": 0.06205493387589013, "grad_norm": 18.67633056640625, "learning_rate": 2.0338983050847458e-07, "loss": 1.1499, "mean_token_accuracy": 0.7433934211730957, "num_tokens": 19367232.0, "step": 61 }, { "epoch": 0.06307222787385554, "grad_norm": 16.794296264648438, "learning_rate": 2.0677966101694912e-07, "loss": 1.1185, "mean_token_accuracy": 0.7493900060653687, "num_tokens": 19693698.0, "step": 62 }, { "epoch": 0.06408952187182096, "grad_norm": 17.70916748046875, "learning_rate": 2.101694915254237e-07, "loss": 1.1458, "mean_token_accuracy": 0.7437374591827393, "num_tokens": 20001680.0, "step": 63 }, { "epoch": 0.06510681586978637, "grad_norm": 16.543203353881836, "learning_rate": 2.1355932203389828e-07, "loss": 1.1166, "mean_token_accuracy": 0.7477895617485046, "num_tokens": 20326096.0, "step": 64 }, { "epoch": 0.06612410986775177, "grad_norm": 15.88288688659668, "learning_rate": 2.1694915254237287e-07, "loss": 1.1553, "mean_token_accuracy": 0.7364981770515442, "num_tokens": 20657596.0, "step": 65 }, { "epoch": 0.0671414038657172, "grad_norm": 15.658295631408691, "learning_rate": 2.2033898305084743e-07, "loss": 1.1007, "mean_token_accuracy": 0.7487651705741882, "num_tokens": 20978319.0, "step": 66 }, { "epoch": 0.0681586978636826, "grad_norm": 15.981629371643066, "learning_rate": 2.2372881355932202e-07, "loss": 1.0944, "mean_token_accuracy": 0.7503189444541931, "num_tokens": 21283777.0, "step": 67 }, { "epoch": 0.06917599186164802, "grad_norm": 12.884788513183594, "learning_rate": 2.271186440677966e-07, "loss": 1.0762, "mean_token_accuracy": 0.7451905012130737, "num_tokens": 21590885.0, "step": 68 }, { "epoch": 0.07019328585961343, "grad_norm": 9.129504203796387, "learning_rate": 2.3050847457627118e-07, "loss": 1.0612, "mean_token_accuracy": 0.7446426153182983, "num_tokens": 21898426.0, "step": 69 }, { "epoch": 0.07121057985757884, "grad_norm": 7.249643802642822, "learning_rate": 2.3389830508474577e-07, "loss": 1.0684, "mean_token_accuracy": 0.7394939661026001, "num_tokens": 22215818.0, "step": 70 }, { "epoch": 0.07222787385554426, "grad_norm": 6.1339192390441895, "learning_rate": 2.3728813559322033e-07, "loss": 1.0216, "mean_token_accuracy": 0.7490743398666382, "num_tokens": 22537203.0, "step": 71 }, { "epoch": 0.07324516785350967, "grad_norm": 6.061101913452148, "learning_rate": 2.406779661016949e-07, "loss": 1.0146, "mean_token_accuracy": 0.7498236298561096, "num_tokens": 22850630.0, "step": 72 }, { "epoch": 0.07426246185147507, "grad_norm": 5.744910717010498, "learning_rate": 2.440677966101695e-07, "loss": 1.0161, "mean_token_accuracy": 0.7497204542160034, "num_tokens": 23159234.0, "step": 73 }, { "epoch": 0.0752797558494405, "grad_norm": 5.69437313079834, "learning_rate": 2.4745762711864405e-07, "loss": 1.0004, "mean_token_accuracy": 0.7524319887161255, "num_tokens": 23473092.0, "step": 74 }, { "epoch": 0.0762970498474059, "grad_norm": 5.5687785148620605, "learning_rate": 2.508474576271186e-07, "loss": 0.9938, "mean_token_accuracy": 0.7536816596984863, "num_tokens": 23777307.0, "step": 75 }, { "epoch": 0.07731434384537131, "grad_norm": 5.6238203048706055, "learning_rate": 2.542372881355932e-07, "loss": 1.0231, "mean_token_accuracy": 0.7456361055374146, "num_tokens": 24087522.0, "step": 76 }, { "epoch": 0.07833163784333673, "grad_norm": 5.4180378913879395, "learning_rate": 2.576271186440678e-07, "loss": 1.0101, "mean_token_accuracy": 0.7502592206001282, "num_tokens": 24394212.0, "step": 77 }, { "epoch": 0.07934893184130214, "grad_norm": 5.385940074920654, "learning_rate": 2.6101694915254236e-07, "loss": 1.0241, "mean_token_accuracy": 0.7468531131744385, "num_tokens": 24703099.0, "step": 78 }, { "epoch": 0.08036622583926754, "grad_norm": 5.0685882568359375, "learning_rate": 2.6440677966101693e-07, "loss": 0.9893, "mean_token_accuracy": 0.7535327672958374, "num_tokens": 25018302.0, "step": 79 }, { "epoch": 0.08138351983723296, "grad_norm": 4.923753261566162, "learning_rate": 2.677966101694915e-07, "loss": 0.9959, "mean_token_accuracy": 0.7521530389785767, "num_tokens": 25345695.0, "step": 80 }, { "epoch": 0.08240081383519837, "grad_norm": 4.761390686035156, "learning_rate": 2.711864406779661e-07, "loss": 0.972, "mean_token_accuracy": 0.7573574185371399, "num_tokens": 25674463.0, "step": 81 }, { "epoch": 0.08341810783316378, "grad_norm": 4.758273601531982, "learning_rate": 2.745762711864407e-07, "loss": 0.9888, "mean_token_accuracy": 0.7523093223571777, "num_tokens": 25991547.0, "step": 82 }, { "epoch": 0.0844354018311292, "grad_norm": 4.4657158851623535, "learning_rate": 2.7796610169491524e-07, "loss": 0.977, "mean_token_accuracy": 0.7544059157371521, "num_tokens": 26310637.0, "step": 83 }, { "epoch": 0.08545269582909461, "grad_norm": 4.273565292358398, "learning_rate": 2.813559322033898e-07, "loss": 0.9622, "mean_token_accuracy": 0.7580838799476624, "num_tokens": 26632464.0, "step": 84 }, { "epoch": 0.08646998982706001, "grad_norm": 4.093416690826416, "learning_rate": 2.847457627118644e-07, "loss": 0.9687, "mean_token_accuracy": 0.7556474208831787, "num_tokens": 26951965.0, "step": 85 }, { "epoch": 0.08748728382502544, "grad_norm": 3.6930601596832275, "learning_rate": 2.88135593220339e-07, "loss": 0.9338, "mean_token_accuracy": 0.7628868818283081, "num_tokens": 27279172.0, "step": 86 }, { "epoch": 0.08850457782299084, "grad_norm": 3.5280673503875732, "learning_rate": 2.9152542372881355e-07, "loss": 0.9686, "mean_token_accuracy": 0.754558801651001, "num_tokens": 27595872.0, "step": 87 }, { "epoch": 0.08952187182095625, "grad_norm": 3.2873504161834717, "learning_rate": 2.949152542372881e-07, "loss": 0.9341, "mean_token_accuracy": 0.7624785304069519, "num_tokens": 27923943.0, "step": 88 }, { "epoch": 0.09053916581892167, "grad_norm": 3.2077414989471436, "learning_rate": 2.9830508474576273e-07, "loss": 0.9196, "mean_token_accuracy": 0.7646963000297546, "num_tokens": 28243518.0, "step": 89 }, { "epoch": 0.09155645981688708, "grad_norm": 3.1801390647888184, "learning_rate": 3.016949152542373e-07, "loss": 0.9401, "mean_token_accuracy": 0.758694589138031, "num_tokens": 28558441.0, "step": 90 }, { "epoch": 0.09257375381485249, "grad_norm": 3.2310826778411865, "learning_rate": 3.0508474576271186e-07, "loss": 0.9157, "mean_token_accuracy": 0.7642312049865723, "num_tokens": 28877502.0, "step": 91 }, { "epoch": 0.0935910478128179, "grad_norm": 3.286752462387085, "learning_rate": 3.084745762711864e-07, "loss": 0.9419, "mean_token_accuracy": 0.7571986317634583, "num_tokens": 29189342.0, "step": 92 }, { "epoch": 0.09460834181078331, "grad_norm": 3.144559383392334, "learning_rate": 3.1186440677966104e-07, "loss": 0.8815, "mean_token_accuracy": 0.7729621529579163, "num_tokens": 29523929.0, "step": 93 }, { "epoch": 0.09562563580874874, "grad_norm": 3.3081698417663574, "learning_rate": 3.152542372881356e-07, "loss": 0.9071, "mean_token_accuracy": 0.7662538290023804, "num_tokens": 29839653.0, "step": 94 }, { "epoch": 0.09664292980671414, "grad_norm": 3.4476613998413086, "learning_rate": 3.186440677966101e-07, "loss": 0.9231, "mean_token_accuracy": 0.7608891129493713, "num_tokens": 30163501.0, "step": 95 }, { "epoch": 0.09766022380467955, "grad_norm": 3.5375242233276367, "learning_rate": 3.220338983050847e-07, "loss": 0.9056, "mean_token_accuracy": 0.764274001121521, "num_tokens": 30492907.0, "step": 96 }, { "epoch": 0.09867751780264497, "grad_norm": 3.2265610694885254, "learning_rate": 3.254237288135593e-07, "loss": 0.8757, "mean_token_accuracy": 0.7704571485519409, "num_tokens": 30830951.0, "step": 97 }, { "epoch": 0.09969481180061038, "grad_norm": 3.309645175933838, "learning_rate": 3.2881355932203386e-07, "loss": 0.8855, "mean_token_accuracy": 0.7690130472183228, "num_tokens": 31153089.0, "step": 98 }, { "epoch": 0.10071210579857579, "grad_norm": 3.1601974964141846, "learning_rate": 3.3220338983050843e-07, "loss": 0.9156, "mean_token_accuracy": 0.7623469829559326, "num_tokens": 31462790.0, "step": 99 }, { "epoch": 0.1017293997965412, "grad_norm": 3.098494052886963, "learning_rate": 3.35593220338983e-07, "loss": 0.9097, "mean_token_accuracy": 0.7615648508071899, "num_tokens": 31757983.0, "step": 100 }, { "epoch": 0.10274669379450661, "grad_norm": 2.887308359146118, "learning_rate": 3.389830508474576e-07, "loss": 0.8915, "mean_token_accuracy": 0.7654359340667725, "num_tokens": 32072015.0, "step": 101 }, { "epoch": 0.10376398779247202, "grad_norm": 2.7770333290100098, "learning_rate": 3.423728813559322e-07, "loss": 0.8763, "mean_token_accuracy": 0.7692670822143555, "num_tokens": 32401421.0, "step": 102 }, { "epoch": 0.10478128179043744, "grad_norm": 2.542947769165039, "learning_rate": 3.4576271186440674e-07, "loss": 0.8768, "mean_token_accuracy": 0.7691189050674438, "num_tokens": 32710194.0, "step": 103 }, { "epoch": 0.10579857578840285, "grad_norm": 2.471694231033325, "learning_rate": 3.4915254237288136e-07, "loss": 0.896, "mean_token_accuracy": 0.7648525238037109, "num_tokens": 33048273.0, "step": 104 }, { "epoch": 0.10681586978636826, "grad_norm": 2.376176357269287, "learning_rate": 3.525423728813559e-07, "loss": 0.8623, "mean_token_accuracy": 0.7729743123054504, "num_tokens": 33375803.0, "step": 105 }, { "epoch": 0.10783316378433368, "grad_norm": 2.270570755004883, "learning_rate": 3.559322033898305e-07, "loss": 0.903, "mean_token_accuracy": 0.7616955637931824, "num_tokens": 33675958.0, "step": 106 }, { "epoch": 0.10885045778229908, "grad_norm": 2.171126127243042, "learning_rate": 3.5932203389830505e-07, "loss": 0.8738, "mean_token_accuracy": 0.7685263156890869, "num_tokens": 34013911.0, "step": 107 }, { "epoch": 0.10986775178026449, "grad_norm": 2.0027692317962646, "learning_rate": 3.6271186440677967e-07, "loss": 0.8549, "mean_token_accuracy": 0.7736266851425171, "num_tokens": 34337717.0, "step": 108 }, { "epoch": 0.11088504577822991, "grad_norm": 1.915738582611084, "learning_rate": 3.6610169491525423e-07, "loss": 0.8559, "mean_token_accuracy": 0.7725293040275574, "num_tokens": 34668424.0, "step": 109 }, { "epoch": 0.11190233977619532, "grad_norm": 1.8464709520339966, "learning_rate": 3.694915254237288e-07, "loss": 0.864, "mean_token_accuracy": 0.7720854878425598, "num_tokens": 34986133.0, "step": 110 }, { "epoch": 0.11291963377416073, "grad_norm": 1.863933801651001, "learning_rate": 3.7288135593220336e-07, "loss": 0.8559, "mean_token_accuracy": 0.7732508778572083, "num_tokens": 35301888.0, "step": 111 }, { "epoch": 0.11393692777212615, "grad_norm": 1.771524429321289, "learning_rate": 3.76271186440678e-07, "loss": 0.8498, "mean_token_accuracy": 0.7744065523147583, "num_tokens": 35621625.0, "step": 112 }, { "epoch": 0.11495422177009156, "grad_norm": 1.7587084770202637, "learning_rate": 3.7966101694915254e-07, "loss": 0.8296, "mean_token_accuracy": 0.7790131568908691, "num_tokens": 35936577.0, "step": 113 }, { "epoch": 0.11597151576805696, "grad_norm": 1.6823285818099976, "learning_rate": 3.830508474576271e-07, "loss": 0.8507, "mean_token_accuracy": 0.7727391123771667, "num_tokens": 36258492.0, "step": 114 }, { "epoch": 0.11698880976602238, "grad_norm": 1.6039552688598633, "learning_rate": 3.8644067796610167e-07, "loss": 0.8337, "mean_token_accuracy": 0.7773861289024353, "num_tokens": 36569958.0, "step": 115 }, { "epoch": 0.11800610376398779, "grad_norm": 1.829788327217102, "learning_rate": 3.898305084745763e-07, "loss": 0.8552, "mean_token_accuracy": 0.7726601362228394, "num_tokens": 36886881.0, "step": 116 }, { "epoch": 0.1190233977619532, "grad_norm": 1.4620957374572754, "learning_rate": 3.9322033898305085e-07, "loss": 0.8252, "mean_token_accuracy": 0.7793046236038208, "num_tokens": 37215889.0, "step": 117 }, { "epoch": 0.12004069175991862, "grad_norm": 1.456753134727478, "learning_rate": 3.966101694915254e-07, "loss": 0.8254, "mean_token_accuracy": 0.7786370515823364, "num_tokens": 37528382.0, "step": 118 }, { "epoch": 0.12105798575788403, "grad_norm": 1.4053311347961426, "learning_rate": 4e-07, "loss": 0.8491, "mean_token_accuracy": 0.7720214128494263, "num_tokens": 37848901.0, "step": 119 }, { "epoch": 0.12207527975584945, "grad_norm": 1.395704984664917, "learning_rate": 4.033898305084746e-07, "loss": 0.8484, "mean_token_accuracy": 0.7713293433189392, "num_tokens": 38166048.0, "step": 120 }, { "epoch": 0.12309257375381485, "grad_norm": 1.3442366123199463, "learning_rate": 4.0677966101694916e-07, "loss": 0.7989, "mean_token_accuracy": 0.7831647992134094, "num_tokens": 38487350.0, "step": 121 }, { "epoch": 0.12410986775178026, "grad_norm": 1.3379216194152832, "learning_rate": 4.101694915254237e-07, "loss": 0.8444, "mean_token_accuracy": 0.7723356485366821, "num_tokens": 38803903.0, "step": 122 }, { "epoch": 0.12512716174974567, "grad_norm": 1.4637871980667114, "learning_rate": 4.1355932203389824e-07, "loss": 0.806, "mean_token_accuracy": 0.7808359861373901, "num_tokens": 39109262.0, "step": 123 }, { "epoch": 0.12614445574771108, "grad_norm": 1.468603491783142, "learning_rate": 4.1694915254237286e-07, "loss": 0.8097, "mean_token_accuracy": 0.77936190366745, "num_tokens": 39438988.0, "step": 124 }, { "epoch": 0.1271617497456765, "grad_norm": 1.3074346780776978, "learning_rate": 4.203389830508474e-07, "loss": 0.8555, "mean_token_accuracy": 0.7697174549102783, "num_tokens": 39780179.0, "step": 125 }, { "epoch": 0.12817904374364192, "grad_norm": 1.2865965366363525, "learning_rate": 4.23728813559322e-07, "loss": 0.832, "mean_token_accuracy": 0.7754062414169312, "num_tokens": 40103824.0, "step": 126 }, { "epoch": 0.12919633774160733, "grad_norm": 1.2583401203155518, "learning_rate": 4.2711864406779655e-07, "loss": 0.8114, "mean_token_accuracy": 0.7794609069824219, "num_tokens": 40433373.0, "step": 127 }, { "epoch": 0.13021363173957273, "grad_norm": 1.1636322736740112, "learning_rate": 4.3050847457627117e-07, "loss": 0.8085, "mean_token_accuracy": 0.7795127630233765, "num_tokens": 40743349.0, "step": 128 }, { "epoch": 0.13123092573753814, "grad_norm": 1.1396219730377197, "learning_rate": 4.3389830508474573e-07, "loss": 0.7826, "mean_token_accuracy": 0.7847133278846741, "num_tokens": 41064135.0, "step": 129 }, { "epoch": 0.13224821973550355, "grad_norm": 1.1963047981262207, "learning_rate": 4.372881355932203e-07, "loss": 0.7915, "mean_token_accuracy": 0.7845783233642578, "num_tokens": 41373300.0, "step": 130 }, { "epoch": 0.13326551373346898, "grad_norm": 1.128556489944458, "learning_rate": 4.4067796610169486e-07, "loss": 0.824, "mean_token_accuracy": 0.7758683562278748, "num_tokens": 41699217.0, "step": 131 }, { "epoch": 0.1342828077314344, "grad_norm": 1.1753361225128174, "learning_rate": 4.440677966101695e-07, "loss": 0.8166, "mean_token_accuracy": 0.7765942811965942, "num_tokens": 42029303.0, "step": 132 }, { "epoch": 0.1353001017293998, "grad_norm": 1.1195635795593262, "learning_rate": 4.4745762711864404e-07, "loss": 0.8082, "mean_token_accuracy": 0.7790402173995972, "num_tokens": 42349174.0, "step": 133 }, { "epoch": 0.1363173957273652, "grad_norm": 1.065693974494934, "learning_rate": 4.508474576271186e-07, "loss": 0.7991, "mean_token_accuracy": 0.7818088531494141, "num_tokens": 42676569.0, "step": 134 }, { "epoch": 0.1373346897253306, "grad_norm": 1.0510700941085815, "learning_rate": 4.542372881355932e-07, "loss": 0.8062, "mean_token_accuracy": 0.7790409922599792, "num_tokens": 42992073.0, "step": 135 }, { "epoch": 0.13835198372329605, "grad_norm": 1.014585018157959, "learning_rate": 4.576271186440678e-07, "loss": 0.7901, "mean_token_accuracy": 0.7838433980941772, "num_tokens": 43319905.0, "step": 136 }, { "epoch": 0.13936927772126145, "grad_norm": 1.0587494373321533, "learning_rate": 4.6101694915254235e-07, "loss": 0.782, "mean_token_accuracy": 0.7848911285400391, "num_tokens": 43642021.0, "step": 137 }, { "epoch": 0.14038657171922686, "grad_norm": 1.0070114135742188, "learning_rate": 4.644067796610169e-07, "loss": 0.7956, "mean_token_accuracy": 0.7806844711303711, "num_tokens": 43948725.0, "step": 138 }, { "epoch": 0.14140386571719227, "grad_norm": 1.0419402122497559, "learning_rate": 4.6779661016949154e-07, "loss": 0.7772, "mean_token_accuracy": 0.7851964831352234, "num_tokens": 44265793.0, "step": 139 }, { "epoch": 0.14242115971515767, "grad_norm": 1.03512442111969, "learning_rate": 4.711864406779661e-07, "loss": 0.7913, "mean_token_accuracy": 0.7820444107055664, "num_tokens": 44585677.0, "step": 140 }, { "epoch": 0.14343845371312308, "grad_norm": 0.9669404029846191, "learning_rate": 4.7457627118644066e-07, "loss": 0.7981, "mean_token_accuracy": 0.7805585265159607, "num_tokens": 44896422.0, "step": 141 }, { "epoch": 0.14445574771108852, "grad_norm": 0.9535713195800781, "learning_rate": 4.779661016949152e-07, "loss": 0.7657, "mean_token_accuracy": 0.7876882553100586, "num_tokens": 45222963.0, "step": 142 }, { "epoch": 0.14547304170905392, "grad_norm": 0.9769468903541565, "learning_rate": 4.813559322033898e-07, "loss": 0.7582, "mean_token_accuracy": 0.7912623882293701, "num_tokens": 45525565.0, "step": 143 }, { "epoch": 0.14649033570701933, "grad_norm": 0.9299623370170593, "learning_rate": 4.847457627118644e-07, "loss": 0.7813, "mean_token_accuracy": 0.7845970392227173, "num_tokens": 45852191.0, "step": 144 }, { "epoch": 0.14750762970498474, "grad_norm": 1.0642598867416382, "learning_rate": 4.88135593220339e-07, "loss": 0.8012, "mean_token_accuracy": 0.781639575958252, "num_tokens": 46163464.0, "step": 145 }, { "epoch": 0.14852492370295015, "grad_norm": 0.9342326521873474, "learning_rate": 4.915254237288136e-07, "loss": 0.7732, "mean_token_accuracy": 0.7873122096061707, "num_tokens": 46475098.0, "step": 146 }, { "epoch": 0.14954221770091555, "grad_norm": 0.9148728251457214, "learning_rate": 4.949152542372881e-07, "loss": 0.7663, "mean_token_accuracy": 0.7879598140716553, "num_tokens": 46799760.0, "step": 147 }, { "epoch": 0.150559511698881, "grad_norm": 0.8839486837387085, "learning_rate": 4.983050847457627e-07, "loss": 0.7478, "mean_token_accuracy": 0.7923979759216309, "num_tokens": 47128771.0, "step": 148 }, { "epoch": 0.1515768056968464, "grad_norm": 0.9131452441215515, "learning_rate": 5.016949152542372e-07, "loss": 0.7751, "mean_token_accuracy": 0.7854825258255005, "num_tokens": 47455167.0, "step": 149 }, { "epoch": 0.1525940996948118, "grad_norm": 0.8893353343009949, "learning_rate": 5.050847457627119e-07, "loss": 0.7649, "mean_token_accuracy": 0.7875303030014038, "num_tokens": 47768491.0, "step": 150 }, { "epoch": 0.1536113936927772, "grad_norm": 0.9076464772224426, "learning_rate": 5.084745762711864e-07, "loss": 0.7347, "mean_token_accuracy": 0.7955377101898193, "num_tokens": 48076479.0, "step": 151 }, { "epoch": 0.15462868769074262, "grad_norm": 0.9250063896179199, "learning_rate": 5.11864406779661e-07, "loss": 0.7281, "mean_token_accuracy": 0.7981938123703003, "num_tokens": 48390048.0, "step": 152 }, { "epoch": 0.15564598168870802, "grad_norm": 0.86025470495224, "learning_rate": 5.152542372881356e-07, "loss": 0.7421, "mean_token_accuracy": 0.793510913848877, "num_tokens": 48712282.0, "step": 153 }, { "epoch": 0.15666327568667346, "grad_norm": 0.8844801783561707, "learning_rate": 5.186440677966102e-07, "loss": 0.7886, "mean_token_accuracy": 0.7832847237586975, "num_tokens": 49032570.0, "step": 154 }, { "epoch": 0.15768056968463887, "grad_norm": 0.8823682069778442, "learning_rate": 5.220338983050847e-07, "loss": 0.7683, "mean_token_accuracy": 0.7857640981674194, "num_tokens": 49357658.0, "step": 155 }, { "epoch": 0.15869786368260427, "grad_norm": 0.8387380838394165, "learning_rate": 5.254237288135593e-07, "loss": 0.7383, "mean_token_accuracy": 0.7938828468322754, "num_tokens": 49678022.0, "step": 156 }, { "epoch": 0.15971515768056968, "grad_norm": 0.8787022829055786, "learning_rate": 5.288135593220339e-07, "loss": 0.7387, "mean_token_accuracy": 0.7946102023124695, "num_tokens": 50003658.0, "step": 157 }, { "epoch": 0.1607324516785351, "grad_norm": 0.8462948203086853, "learning_rate": 5.322033898305085e-07, "loss": 0.7505, "mean_token_accuracy": 0.7912555932998657, "num_tokens": 50337188.0, "step": 158 }, { "epoch": 0.1617497456765005, "grad_norm": 1.0470606088638306, "learning_rate": 5.35593220338983e-07, "loss": 0.7759, "mean_token_accuracy": 0.7850117087364197, "num_tokens": 50649690.0, "step": 159 }, { "epoch": 0.16276703967446593, "grad_norm": 0.8577026128768921, "learning_rate": 5.389830508474577e-07, "loss": 0.7579, "mean_token_accuracy": 0.7884396910667419, "num_tokens": 50973651.0, "step": 160 }, { "epoch": 0.16378433367243134, "grad_norm": 0.8702759742736816, "learning_rate": 5.423728813559322e-07, "loss": 0.7647, "mean_token_accuracy": 0.7875997424125671, "num_tokens": 51302446.0, "step": 161 }, { "epoch": 0.16480162767039674, "grad_norm": 0.8638347387313843, "learning_rate": 5.457627118644067e-07, "loss": 0.7555, "mean_token_accuracy": 0.7890993356704712, "num_tokens": 51613127.0, "step": 162 }, { "epoch": 0.16581892166836215, "grad_norm": 0.8361454606056213, "learning_rate": 5.491525423728813e-07, "loss": 0.7335, "mean_token_accuracy": 0.7952470779418945, "num_tokens": 51935798.0, "step": 163 }, { "epoch": 0.16683621566632756, "grad_norm": 0.8453940749168396, "learning_rate": 5.525423728813559e-07, "loss": 0.7525, "mean_token_accuracy": 0.7894124984741211, "num_tokens": 52241386.0, "step": 164 }, { "epoch": 0.167853509664293, "grad_norm": 0.8433927297592163, "learning_rate": 5.559322033898305e-07, "loss": 0.7566, "mean_token_accuracy": 0.7888467907905579, "num_tokens": 52550468.0, "step": 165 }, { "epoch": 0.1688708036622584, "grad_norm": 0.890924870967865, "learning_rate": 5.59322033898305e-07, "loss": 0.7577, "mean_token_accuracy": 0.7890304327011108, "num_tokens": 52870603.0, "step": 166 }, { "epoch": 0.1698880976602238, "grad_norm": 0.8495107293128967, "learning_rate": 5.627118644067796e-07, "loss": 0.7612, "mean_token_accuracy": 0.7883809804916382, "num_tokens": 53186596.0, "step": 167 }, { "epoch": 0.17090539165818922, "grad_norm": 0.8580964207649231, "learning_rate": 5.661016949152541e-07, "loss": 0.7522, "mean_token_accuracy": 0.7893248796463013, "num_tokens": 53511107.0, "step": 168 }, { "epoch": 0.17192268565615462, "grad_norm": 0.8602516055107117, "learning_rate": 5.694915254237288e-07, "loss": 0.7306, "mean_token_accuracy": 0.7956889867782593, "num_tokens": 53829950.0, "step": 169 }, { "epoch": 0.17293997965412003, "grad_norm": 0.8338865637779236, "learning_rate": 5.728813559322034e-07, "loss": 0.7611, "mean_token_accuracy": 0.7888447046279907, "num_tokens": 54156414.0, "step": 170 }, { "epoch": 0.17395727365208546, "grad_norm": 0.8236330151557922, "learning_rate": 5.76271186440678e-07, "loss": 0.7451, "mean_token_accuracy": 0.7912829518318176, "num_tokens": 54487854.0, "step": 171 }, { "epoch": 0.17497456765005087, "grad_norm": 0.825598418712616, "learning_rate": 5.796610169491525e-07, "loss": 0.7408, "mean_token_accuracy": 0.7934503555297852, "num_tokens": 54799217.0, "step": 172 }, { "epoch": 0.17599186164801628, "grad_norm": 0.8386024832725525, "learning_rate": 5.830508474576271e-07, "loss": 0.7371, "mean_token_accuracy": 0.7941863536834717, "num_tokens": 55126417.0, "step": 173 }, { "epoch": 0.1770091556459817, "grad_norm": 0.8538760542869568, "learning_rate": 5.864406779661016e-07, "loss": 0.752, "mean_token_accuracy": 0.7901371717453003, "num_tokens": 55457557.0, "step": 174 }, { "epoch": 0.1780264496439471, "grad_norm": 0.8268012404441833, "learning_rate": 5.898305084745762e-07, "loss": 0.7468, "mean_token_accuracy": 0.7909038066864014, "num_tokens": 55780975.0, "step": 175 }, { "epoch": 0.1790437436419125, "grad_norm": 0.8469352126121521, "learning_rate": 5.932203389830508e-07, "loss": 0.7643, "mean_token_accuracy": 0.7857322096824646, "num_tokens": 56097949.0, "step": 176 }, { "epoch": 0.18006103763987794, "grad_norm": 0.8765305280685425, "learning_rate": 5.966101694915255e-07, "loss": 0.7732, "mean_token_accuracy": 0.7842049598693848, "num_tokens": 56427795.0, "step": 177 }, { "epoch": 0.18107833163784334, "grad_norm": 0.9026130437850952, "learning_rate": 6e-07, "loss": 0.7455, "mean_token_accuracy": 0.790698766708374, "num_tokens": 56725887.0, "step": 178 }, { "epoch": 0.18209562563580875, "grad_norm": 0.8640514016151428, "learning_rate": 6.033898305084746e-07, "loss": 0.7259, "mean_token_accuracy": 0.795634388923645, "num_tokens": 57066741.0, "step": 179 }, { "epoch": 0.18311291963377416, "grad_norm": 0.8257157802581787, "learning_rate": 6.067796610169491e-07, "loss": 0.7334, "mean_token_accuracy": 0.7942250967025757, "num_tokens": 57382943.0, "step": 180 }, { "epoch": 0.18413021363173956, "grad_norm": 0.8310932517051697, "learning_rate": 6.101694915254237e-07, "loss": 0.7276, "mean_token_accuracy": 0.7951348423957825, "num_tokens": 57697387.0, "step": 181 }, { "epoch": 0.18514750762970497, "grad_norm": 0.7885932326316833, "learning_rate": 6.135593220338982e-07, "loss": 0.7314, "mean_token_accuracy": 0.7951779961585999, "num_tokens": 58024333.0, "step": 182 }, { "epoch": 0.1861648016276704, "grad_norm": 0.860317051410675, "learning_rate": 6.169491525423728e-07, "loss": 0.7316, "mean_token_accuracy": 0.7940535545349121, "num_tokens": 58338761.0, "step": 183 }, { "epoch": 0.1871820956256358, "grad_norm": 0.8580062985420227, "learning_rate": 6.203389830508475e-07, "loss": 0.7538, "mean_token_accuracy": 0.7890710234642029, "num_tokens": 58643138.0, "step": 184 }, { "epoch": 0.18819938962360122, "grad_norm": 0.8123632073402405, "learning_rate": 6.237288135593221e-07, "loss": 0.7133, "mean_token_accuracy": 0.7988241314888, "num_tokens": 58978624.0, "step": 185 }, { "epoch": 0.18921668362156663, "grad_norm": 0.8669635653495789, "learning_rate": 6.271186440677966e-07, "loss": 0.6936, "mean_token_accuracy": 0.8056967854499817, "num_tokens": 59307831.0, "step": 186 }, { "epoch": 0.19023397761953204, "grad_norm": 0.8834171295166016, "learning_rate": 6.305084745762712e-07, "loss": 0.7359, "mean_token_accuracy": 0.7936495542526245, "num_tokens": 59612951.0, "step": 187 }, { "epoch": 0.19125127161749747, "grad_norm": 0.80832839012146, "learning_rate": 6.338983050847457e-07, "loss": 0.6905, "mean_token_accuracy": 0.8033033609390259, "num_tokens": 59940444.0, "step": 188 }, { "epoch": 0.19226856561546288, "grad_norm": 0.8026413321495056, "learning_rate": 6.372881355932202e-07, "loss": 0.726, "mean_token_accuracy": 0.7942773103713989, "num_tokens": 60270518.0, "step": 189 }, { "epoch": 0.19328585961342828, "grad_norm": 0.9134705662727356, "learning_rate": 6.406779661016949e-07, "loss": 0.7269, "mean_token_accuracy": 0.7943723201751709, "num_tokens": 60587162.0, "step": 190 }, { "epoch": 0.1943031536113937, "grad_norm": 0.8130223751068115, "learning_rate": 6.440677966101694e-07, "loss": 0.7114, "mean_token_accuracy": 0.7991458177566528, "num_tokens": 60917294.0, "step": 191 }, { "epoch": 0.1953204476093591, "grad_norm": 0.8199300169944763, "learning_rate": 6.474576271186441e-07, "loss": 0.7219, "mean_token_accuracy": 0.7966375350952148, "num_tokens": 61248579.0, "step": 192 }, { "epoch": 0.1963377416073245, "grad_norm": 0.8786404132843018, "learning_rate": 6.508474576271186e-07, "loss": 0.7247, "mean_token_accuracy": 0.7953713536262512, "num_tokens": 61568050.0, "step": 193 }, { "epoch": 0.19735503560528994, "grad_norm": 0.7589143514633179, "learning_rate": 6.542372881355932e-07, "loss": 0.7081, "mean_token_accuracy": 0.8009048104286194, "num_tokens": 61895500.0, "step": 194 }, { "epoch": 0.19837232960325535, "grad_norm": 0.8833111524581909, "learning_rate": 6.576271186440677e-07, "loss": 0.7257, "mean_token_accuracy": 0.7971071004867554, "num_tokens": 62247115.0, "step": 195 }, { "epoch": 0.19938962360122076, "grad_norm": 0.902251660823822, "learning_rate": 6.610169491525423e-07, "loss": 0.7258, "mean_token_accuracy": 0.7947437763214111, "num_tokens": 62559441.0, "step": 196 }, { "epoch": 0.20040691759918616, "grad_norm": 0.8570722341537476, "learning_rate": 6.644067796610169e-07, "loss": 0.7453, "mean_token_accuracy": 0.7911810278892517, "num_tokens": 62863338.0, "step": 197 }, { "epoch": 0.20142421159715157, "grad_norm": 0.7847694158554077, "learning_rate": 6.677966101694915e-07, "loss": 0.6925, "mean_token_accuracy": 0.802798330783844, "num_tokens": 63189778.0, "step": 198 }, { "epoch": 0.20244150559511698, "grad_norm": 0.8860125541687012, "learning_rate": 6.71186440677966e-07, "loss": 0.7432, "mean_token_accuracy": 0.7898187637329102, "num_tokens": 63513614.0, "step": 199 }, { "epoch": 0.2034587995930824, "grad_norm": 0.8800017833709717, "learning_rate": 6.745762711864407e-07, "loss": 0.7245, "mean_token_accuracy": 0.7962162494659424, "num_tokens": 63810862.0, "step": 200 }, { "epoch": 0.20447609359104782, "grad_norm": 0.7929122447967529, "learning_rate": 6.779661016949152e-07, "loss": 0.7337, "mean_token_accuracy": 0.7929904460906982, "num_tokens": 64131809.0, "step": 201 }, { "epoch": 0.20549338758901323, "grad_norm": 0.8461807370185852, "learning_rate": 6.813559322033898e-07, "loss": 0.726, "mean_token_accuracy": 0.7958388328552246, "num_tokens": 64453734.0, "step": 202 }, { "epoch": 0.20651068158697863, "grad_norm": 0.7745478749275208, "learning_rate": 6.847457627118643e-07, "loss": 0.7154, "mean_token_accuracy": 0.7977970838546753, "num_tokens": 64783877.0, "step": 203 }, { "epoch": 0.20752797558494404, "grad_norm": 0.8068340420722961, "learning_rate": 6.88135593220339e-07, "loss": 0.7194, "mean_token_accuracy": 0.7968555688858032, "num_tokens": 65105547.0, "step": 204 }, { "epoch": 0.20854526958290945, "grad_norm": 0.8017715811729431, "learning_rate": 6.915254237288135e-07, "loss": 0.7199, "mean_token_accuracy": 0.7974048256874084, "num_tokens": 65425596.0, "step": 205 }, { "epoch": 0.20956256358087488, "grad_norm": 0.8070000410079956, "learning_rate": 6.949152542372881e-07, "loss": 0.7379, "mean_token_accuracy": 0.7917187213897705, "num_tokens": 65753543.0, "step": 206 }, { "epoch": 0.2105798575788403, "grad_norm": 0.782502293586731, "learning_rate": 6.983050847457627e-07, "loss": 0.6966, "mean_token_accuracy": 0.8027874231338501, "num_tokens": 66079261.0, "step": 207 }, { "epoch": 0.2115971515768057, "grad_norm": 0.8917709589004517, "learning_rate": 7.016949152542373e-07, "loss": 0.7335, "mean_token_accuracy": 0.7919062376022339, "num_tokens": 66410765.0, "step": 208 }, { "epoch": 0.2126144455747711, "grad_norm": 0.7903974056243896, "learning_rate": 7.050847457627118e-07, "loss": 0.7222, "mean_token_accuracy": 0.7964398860931396, "num_tokens": 66737804.0, "step": 209 }, { "epoch": 0.2136317395727365, "grad_norm": 0.7872498631477356, "learning_rate": 7.084745762711865e-07, "loss": 0.7237, "mean_token_accuracy": 0.7953373193740845, "num_tokens": 67056288.0, "step": 210 }, { "epoch": 0.21464903357070192, "grad_norm": 0.8102449774742126, "learning_rate": 7.11864406779661e-07, "loss": 0.7061, "mean_token_accuracy": 0.7994955778121948, "num_tokens": 67369756.0, "step": 211 }, { "epoch": 0.21566632756866735, "grad_norm": 0.8323161005973816, "learning_rate": 7.152542372881356e-07, "loss": 0.7026, "mean_token_accuracy": 0.7993268966674805, "num_tokens": 67688042.0, "step": 212 }, { "epoch": 0.21668362156663276, "grad_norm": 0.7897840738296509, "learning_rate": 7.186440677966101e-07, "loss": 0.7025, "mean_token_accuracy": 0.8000015020370483, "num_tokens": 68008533.0, "step": 213 }, { "epoch": 0.21770091556459817, "grad_norm": 0.8085548877716064, "learning_rate": 7.220338983050847e-07, "loss": 0.6911, "mean_token_accuracy": 0.8024193644523621, "num_tokens": 68332401.0, "step": 214 }, { "epoch": 0.21871820956256358, "grad_norm": 0.8432585000991821, "learning_rate": 7.254237288135593e-07, "loss": 0.7061, "mean_token_accuracy": 0.7990224361419678, "num_tokens": 68643444.0, "step": 215 }, { "epoch": 0.21973550356052898, "grad_norm": 0.8513753414154053, "learning_rate": 7.288135593220338e-07, "loss": 0.7054, "mean_token_accuracy": 0.7992247343063354, "num_tokens": 68976455.0, "step": 216 }, { "epoch": 0.22075279755849442, "grad_norm": 0.7816675305366516, "learning_rate": 7.322033898305085e-07, "loss": 0.7403, "mean_token_accuracy": 0.790706992149353, "num_tokens": 69307832.0, "step": 217 }, { "epoch": 0.22177009155645983, "grad_norm": 0.786855161190033, "learning_rate": 7.35593220338983e-07, "loss": 0.6763, "mean_token_accuracy": 0.8071946501731873, "num_tokens": 69619859.0, "step": 218 }, { "epoch": 0.22278738555442523, "grad_norm": 0.8236937522888184, "learning_rate": 7.389830508474576e-07, "loss": 0.6995, "mean_token_accuracy": 0.8011282682418823, "num_tokens": 69924843.0, "step": 219 }, { "epoch": 0.22380467955239064, "grad_norm": 0.8247690200805664, "learning_rate": 7.423728813559321e-07, "loss": 0.7059, "mean_token_accuracy": 0.7996514439582825, "num_tokens": 70244433.0, "step": 220 }, { "epoch": 0.22482197355035605, "grad_norm": 0.7740869522094727, "learning_rate": 7.457627118644067e-07, "loss": 0.7155, "mean_token_accuracy": 0.7963728904724121, "num_tokens": 70568120.0, "step": 221 }, { "epoch": 0.22583926754832145, "grad_norm": 0.8260399699211121, "learning_rate": 7.491525423728812e-07, "loss": 0.7275, "mean_token_accuracy": 0.7930858135223389, "num_tokens": 70881374.0, "step": 222 }, { "epoch": 0.2268565615462869, "grad_norm": 0.7941381335258484, "learning_rate": 7.52542372881356e-07, "loss": 0.7088, "mean_token_accuracy": 0.7990828156471252, "num_tokens": 71192310.0, "step": 223 }, { "epoch": 0.2278738555442523, "grad_norm": 0.8065604567527771, "learning_rate": 7.559322033898305e-07, "loss": 0.7243, "mean_token_accuracy": 0.7951697111129761, "num_tokens": 71509580.0, "step": 224 }, { "epoch": 0.2288911495422177, "grad_norm": 0.808077335357666, "learning_rate": 7.593220338983051e-07, "loss": 0.6918, "mean_token_accuracy": 0.8035826683044434, "num_tokens": 71825201.0, "step": 225 }, { "epoch": 0.2299084435401831, "grad_norm": 0.7790554761886597, "learning_rate": 7.627118644067796e-07, "loss": 0.6938, "mean_token_accuracy": 0.8024296760559082, "num_tokens": 72158603.0, "step": 226 }, { "epoch": 0.23092573753814852, "grad_norm": 0.7746298313140869, "learning_rate": 7.661016949152542e-07, "loss": 0.6972, "mean_token_accuracy": 0.8012113571166992, "num_tokens": 72476307.0, "step": 227 }, { "epoch": 0.23194303153611392, "grad_norm": 0.7807798385620117, "learning_rate": 7.694915254237287e-07, "loss": 0.6955, "mean_token_accuracy": 0.8009060025215149, "num_tokens": 72805014.0, "step": 228 }, { "epoch": 0.23296032553407936, "grad_norm": 0.7983285784721375, "learning_rate": 7.728813559322033e-07, "loss": 0.6659, "mean_token_accuracy": 0.8106253743171692, "num_tokens": 73123811.0, "step": 229 }, { "epoch": 0.23397761953204477, "grad_norm": 0.8004521131515503, "learning_rate": 7.762711864406779e-07, "loss": 0.6967, "mean_token_accuracy": 0.8012536764144897, "num_tokens": 73432474.0, "step": 230 }, { "epoch": 0.23499491353001017, "grad_norm": 0.7807230353355408, "learning_rate": 7.796610169491526e-07, "loss": 0.7013, "mean_token_accuracy": 0.7990843057632446, "num_tokens": 73759617.0, "step": 231 }, { "epoch": 0.23601220752797558, "grad_norm": 0.7825319170951843, "learning_rate": 7.830508474576271e-07, "loss": 0.712, "mean_token_accuracy": 0.7976258993148804, "num_tokens": 74082923.0, "step": 232 }, { "epoch": 0.237029501525941, "grad_norm": 0.7608177661895752, "learning_rate": 7.864406779661017e-07, "loss": 0.6872, "mean_token_accuracy": 0.8039759397506714, "num_tokens": 74410401.0, "step": 233 }, { "epoch": 0.2380467955239064, "grad_norm": 0.7753943204879761, "learning_rate": 7.898305084745762e-07, "loss": 0.6987, "mean_token_accuracy": 0.8019410371780396, "num_tokens": 74727709.0, "step": 234 }, { "epoch": 0.23906408952187183, "grad_norm": 0.7589889764785767, "learning_rate": 7.932203389830508e-07, "loss": 0.6697, "mean_token_accuracy": 0.8082659244537354, "num_tokens": 75056158.0, "step": 235 }, { "epoch": 0.24008138351983724, "grad_norm": 0.7917013168334961, "learning_rate": 7.966101694915253e-07, "loss": 0.6882, "mean_token_accuracy": 0.803979218006134, "num_tokens": 75380129.0, "step": 236 }, { "epoch": 0.24109867751780265, "grad_norm": 0.8265038728713989, "learning_rate": 8e-07, "loss": 0.6652, "mean_token_accuracy": 0.8091133832931519, "num_tokens": 75692158.0, "step": 237 }, { "epoch": 0.24211597151576805, "grad_norm": 0.8178495168685913, "learning_rate": 8.033898305084746e-07, "loss": 0.6695, "mean_token_accuracy": 0.8072071075439453, "num_tokens": 76015367.0, "step": 238 }, { "epoch": 0.24313326551373346, "grad_norm": 0.8022916913032532, "learning_rate": 8.067796610169492e-07, "loss": 0.6941, "mean_token_accuracy": 0.801773190498352, "num_tokens": 76309618.0, "step": 239 }, { "epoch": 0.2441505595116989, "grad_norm": 0.7696356177330017, "learning_rate": 8.101694915254237e-07, "loss": 0.6796, "mean_token_accuracy": 0.8047494292259216, "num_tokens": 76638347.0, "step": 240 }, { "epoch": 0.2451678535096643, "grad_norm": 0.7858201861381531, "learning_rate": 8.135593220338983e-07, "loss": 0.6956, "mean_token_accuracy": 0.8011404275894165, "num_tokens": 76960377.0, "step": 241 }, { "epoch": 0.2461851475076297, "grad_norm": 0.8141928315162659, "learning_rate": 8.169491525423728e-07, "loss": 0.7296, "mean_token_accuracy": 0.792807936668396, "num_tokens": 77284277.0, "step": 242 }, { "epoch": 0.24720244150559512, "grad_norm": 0.7566704750061035, "learning_rate": 8.203389830508474e-07, "loss": 0.6863, "mean_token_accuracy": 0.8035607933998108, "num_tokens": 77624014.0, "step": 243 }, { "epoch": 0.24821973550356052, "grad_norm": 0.8007069230079651, "learning_rate": 8.23728813559322e-07, "loss": 0.7073, "mean_token_accuracy": 0.7980139255523682, "num_tokens": 77955105.0, "step": 244 }, { "epoch": 0.24923702950152593, "grad_norm": 0.7965030670166016, "learning_rate": 8.271186440677965e-07, "loss": 0.6755, "mean_token_accuracy": 0.8065483570098877, "num_tokens": 78259934.0, "step": 245 }, { "epoch": 0.25025432349949134, "grad_norm": 0.7759296894073486, "learning_rate": 8.305084745762712e-07, "loss": 0.6885, "mean_token_accuracy": 0.8040277361869812, "num_tokens": 78575928.0, "step": 246 }, { "epoch": 0.25127161749745675, "grad_norm": 0.8442168235778809, "learning_rate": 8.338983050847457e-07, "loss": 0.6945, "mean_token_accuracy": 0.8002915382385254, "num_tokens": 78897539.0, "step": 247 }, { "epoch": 0.25228891149542215, "grad_norm": 0.7938070297241211, "learning_rate": 8.372881355932203e-07, "loss": 0.6755, "mean_token_accuracy": 0.806767463684082, "num_tokens": 79206412.0, "step": 248 }, { "epoch": 0.2533062054933876, "grad_norm": 0.7777937650680542, "learning_rate": 8.406779661016948e-07, "loss": 0.6851, "mean_token_accuracy": 0.8038220405578613, "num_tokens": 79550225.0, "step": 249 }, { "epoch": 0.254323499491353, "grad_norm": 0.7907181978225708, "learning_rate": 8.440677966101695e-07, "loss": 0.6769, "mean_token_accuracy": 0.8062283992767334, "num_tokens": 79861100.0, "step": 250 }, { "epoch": 0.25534079348931843, "grad_norm": 0.7875558137893677, "learning_rate": 8.47457627118644e-07, "loss": 0.6873, "mean_token_accuracy": 0.8025171756744385, "num_tokens": 80184953.0, "step": 251 }, { "epoch": 0.25635808748728384, "grad_norm": 0.8327239751815796, "learning_rate": 8.508474576271186e-07, "loss": 0.6759, "mean_token_accuracy": 0.8057800531387329, "num_tokens": 80497644.0, "step": 252 }, { "epoch": 0.25737538148524924, "grad_norm": 0.8272035121917725, "learning_rate": 8.542372881355931e-07, "loss": 0.7038, "mean_token_accuracy": 0.7997850775718689, "num_tokens": 80815501.0, "step": 253 }, { "epoch": 0.25839267548321465, "grad_norm": 0.8474062085151672, "learning_rate": 8.576271186440678e-07, "loss": 0.705, "mean_token_accuracy": 0.797932505607605, "num_tokens": 81137163.0, "step": 254 }, { "epoch": 0.25940996948118006, "grad_norm": 0.8098815679550171, "learning_rate": 8.610169491525423e-07, "loss": 0.6947, "mean_token_accuracy": 0.8005977272987366, "num_tokens": 81448192.0, "step": 255 }, { "epoch": 0.26042726347914547, "grad_norm": 0.7739518284797668, "learning_rate": 8.64406779661017e-07, "loss": 0.697, "mean_token_accuracy": 0.7992587685585022, "num_tokens": 81757788.0, "step": 256 }, { "epoch": 0.2614445574771109, "grad_norm": 0.7883069515228271, "learning_rate": 8.677966101694915e-07, "loss": 0.6769, "mean_token_accuracy": 0.8063681125640869, "num_tokens": 82085431.0, "step": 257 }, { "epoch": 0.2624618514750763, "grad_norm": 0.7693223357200623, "learning_rate": 8.711864406779661e-07, "loss": 0.6924, "mean_token_accuracy": 0.8022792935371399, "num_tokens": 82404647.0, "step": 258 }, { "epoch": 0.2634791454730417, "grad_norm": 0.7538211941719055, "learning_rate": 8.745762711864406e-07, "loss": 0.6846, "mean_token_accuracy": 0.8039592504501343, "num_tokens": 82738083.0, "step": 259 }, { "epoch": 0.2644964394710071, "grad_norm": 0.8052369356155396, "learning_rate": 8.779661016949152e-07, "loss": 0.6828, "mean_token_accuracy": 0.8034741878509521, "num_tokens": 83033584.0, "step": 260 }, { "epoch": 0.26551373346897256, "grad_norm": 0.7891734838485718, "learning_rate": 8.813559322033897e-07, "loss": 0.6932, "mean_token_accuracy": 0.8006192445755005, "num_tokens": 83363928.0, "step": 261 }, { "epoch": 0.26653102746693796, "grad_norm": 0.8296700119972229, "learning_rate": 8.847457627118644e-07, "loss": 0.6739, "mean_token_accuracy": 0.8050704598426819, "num_tokens": 83680255.0, "step": 262 }, { "epoch": 0.26754832146490337, "grad_norm": 0.7551729679107666, "learning_rate": 8.88135593220339e-07, "loss": 0.6909, "mean_token_accuracy": 0.8014122843742371, "num_tokens": 84014162.0, "step": 263 }, { "epoch": 0.2685656154628688, "grad_norm": 0.752968966960907, "learning_rate": 8.915254237288136e-07, "loss": 0.6621, "mean_token_accuracy": 0.8096007704734802, "num_tokens": 84329460.0, "step": 264 }, { "epoch": 0.2695829094608342, "grad_norm": 0.7830281257629395, "learning_rate": 8.949152542372881e-07, "loss": 0.6477, "mean_token_accuracy": 0.813726544380188, "num_tokens": 84643661.0, "step": 265 }, { "epoch": 0.2706002034587996, "grad_norm": 0.785149097442627, "learning_rate": 8.983050847457627e-07, "loss": 0.6637, "mean_token_accuracy": 0.8084698915481567, "num_tokens": 84968874.0, "step": 266 }, { "epoch": 0.271617497456765, "grad_norm": 0.7704117298126221, "learning_rate": 9.016949152542372e-07, "loss": 0.6716, "mean_token_accuracy": 0.8069436550140381, "num_tokens": 85292100.0, "step": 267 }, { "epoch": 0.2726347914547304, "grad_norm": 0.7994391918182373, "learning_rate": 9.050847457627118e-07, "loss": 0.667, "mean_token_accuracy": 0.8093474507331848, "num_tokens": 85605945.0, "step": 268 }, { "epoch": 0.2736520854526958, "grad_norm": 0.7655627727508545, "learning_rate": 9.084745762711864e-07, "loss": 0.6726, "mean_token_accuracy": 0.8057523965835571, "num_tokens": 85924331.0, "step": 269 }, { "epoch": 0.2746693794506612, "grad_norm": 0.8249563574790955, "learning_rate": 9.11864406779661e-07, "loss": 0.7019, "mean_token_accuracy": 0.7991850972175598, "num_tokens": 86231923.0, "step": 270 }, { "epoch": 0.27568667344862663, "grad_norm": 0.7793235182762146, "learning_rate": 9.152542372881356e-07, "loss": 0.6568, "mean_token_accuracy": 0.8094903826713562, "num_tokens": 86546912.0, "step": 271 }, { "epoch": 0.2767039674465921, "grad_norm": 0.7628976702690125, "learning_rate": 9.186440677966101e-07, "loss": 0.716, "mean_token_accuracy": 0.7952671051025391, "num_tokens": 86872441.0, "step": 272 }, { "epoch": 0.2777212614445575, "grad_norm": 0.7923837900161743, "learning_rate": 9.220338983050847e-07, "loss": 0.6864, "mean_token_accuracy": 0.8018236756324768, "num_tokens": 87204457.0, "step": 273 }, { "epoch": 0.2787385554425229, "grad_norm": 0.7962261438369751, "learning_rate": 9.254237288135592e-07, "loss": 0.6931, "mean_token_accuracy": 0.8012575507164001, "num_tokens": 87501470.0, "step": 274 }, { "epoch": 0.2797558494404883, "grad_norm": 0.8026267886161804, "learning_rate": 9.288135593220338e-07, "loss": 0.675, "mean_token_accuracy": 0.8057051301002502, "num_tokens": 87816423.0, "step": 275 }, { "epoch": 0.2807731434384537, "grad_norm": 0.8577560186386108, "learning_rate": 9.322033898305083e-07, "loss": 0.6871, "mean_token_accuracy": 0.8022831678390503, "num_tokens": 88105701.0, "step": 276 }, { "epoch": 0.28179043743641913, "grad_norm": 0.8111786842346191, "learning_rate": 9.355932203389831e-07, "loss": 0.681, "mean_token_accuracy": 0.8033446073532104, "num_tokens": 88429993.0, "step": 277 }, { "epoch": 0.28280773143438453, "grad_norm": 0.7656622529029846, "learning_rate": 9.389830508474576e-07, "loss": 0.685, "mean_token_accuracy": 0.8025110960006714, "num_tokens": 88754109.0, "step": 278 }, { "epoch": 0.28382502543234994, "grad_norm": 0.8410552144050598, "learning_rate": 9.423728813559322e-07, "loss": 0.7007, "mean_token_accuracy": 0.7993711829185486, "num_tokens": 89075556.0, "step": 279 }, { "epoch": 0.28484231943031535, "grad_norm": 0.8009824752807617, "learning_rate": 9.457627118644067e-07, "loss": 0.6852, "mean_token_accuracy": 0.8035849332809448, "num_tokens": 89394941.0, "step": 280 }, { "epoch": 0.28585961342828076, "grad_norm": 0.8217434287071228, "learning_rate": 9.491525423728813e-07, "loss": 0.6889, "mean_token_accuracy": 0.80260169506073, "num_tokens": 89726320.0, "step": 281 }, { "epoch": 0.28687690742624616, "grad_norm": 0.7959878444671631, "learning_rate": 9.525423728813558e-07, "loss": 0.6847, "mean_token_accuracy": 0.8041878938674927, "num_tokens": 90046963.0, "step": 282 }, { "epoch": 0.28789420142421157, "grad_norm": 0.8031529188156128, "learning_rate": 9.559322033898305e-07, "loss": 0.6705, "mean_token_accuracy": 0.8066917657852173, "num_tokens": 90360145.0, "step": 283 }, { "epoch": 0.28891149542217703, "grad_norm": 0.8071709275245667, "learning_rate": 9.59322033898305e-07, "loss": 0.6762, "mean_token_accuracy": 0.8048709034919739, "num_tokens": 90666248.0, "step": 284 }, { "epoch": 0.28992878942014244, "grad_norm": 0.7606756091117859, "learning_rate": 9.627118644067797e-07, "loss": 0.6748, "mean_token_accuracy": 0.8058351278305054, "num_tokens": 90990168.0, "step": 285 }, { "epoch": 0.29094608341810785, "grad_norm": 0.8229522705078125, "learning_rate": 9.661016949152542e-07, "loss": 0.668, "mean_token_accuracy": 0.8086898326873779, "num_tokens": 91287120.0, "step": 286 }, { "epoch": 0.29196337741607326, "grad_norm": 0.7963361144065857, "learning_rate": 9.694915254237287e-07, "loss": 0.6602, "mean_token_accuracy": 0.809562623500824, "num_tokens": 91605888.0, "step": 287 }, { "epoch": 0.29298067141403866, "grad_norm": 0.8544802665710449, "learning_rate": 9.728813559322032e-07, "loss": 0.7004, "mean_token_accuracy": 0.79966139793396, "num_tokens": 91923111.0, "step": 288 }, { "epoch": 0.29399796541200407, "grad_norm": 0.7967947721481323, "learning_rate": 9.76271186440678e-07, "loss": 0.6634, "mean_token_accuracy": 0.8083997964859009, "num_tokens": 92233100.0, "step": 289 }, { "epoch": 0.2950152594099695, "grad_norm": 0.8263453245162964, "learning_rate": 9.796610169491525e-07, "loss": 0.6741, "mean_token_accuracy": 0.8063830733299255, "num_tokens": 92538563.0, "step": 290 }, { "epoch": 0.2960325534079349, "grad_norm": 0.7387613654136658, "learning_rate": 9.830508474576272e-07, "loss": 0.6343, "mean_token_accuracy": 0.8165593147277832, "num_tokens": 92861897.0, "step": 291 }, { "epoch": 0.2970498474059003, "grad_norm": 0.8126459717750549, "learning_rate": 9.864406779661017e-07, "loss": 0.6224, "mean_token_accuracy": 0.8183848261833191, "num_tokens": 93169971.0, "step": 292 }, { "epoch": 0.2980671414038657, "grad_norm": 0.820056676864624, "learning_rate": 9.898305084745762e-07, "loss": 0.678, "mean_token_accuracy": 0.8047983050346375, "num_tokens": 93476129.0, "step": 293 }, { "epoch": 0.2990844354018311, "grad_norm": 0.7961844801902771, "learning_rate": 9.932203389830507e-07, "loss": 0.6694, "mean_token_accuracy": 0.8077576756477356, "num_tokens": 93795891.0, "step": 294 }, { "epoch": 0.30010172939979657, "grad_norm": 0.8246234059333801, "learning_rate": 9.966101694915254e-07, "loss": 0.6731, "mean_token_accuracy": 0.8063099384307861, "num_tokens": 94099353.0, "step": 295 }, { "epoch": 0.301119023397762, "grad_norm": 0.8062840104103088, "learning_rate": 1e-06, "loss": 0.7053, "mean_token_accuracy": 0.7989630699157715, "num_tokens": 94420660.0, "step": 296 }, { "epoch": 0.3021363173957274, "grad_norm": 0.7715139389038086, "learning_rate": 1e-06, "loss": 0.6706, "mean_token_accuracy": 0.8074213266372681, "num_tokens": 94745810.0, "step": 297 }, { "epoch": 0.3031536113936928, "grad_norm": 0.7462009191513062, "learning_rate": 1e-06, "loss": 0.6759, "mean_token_accuracy": 0.8049613237380981, "num_tokens": 95053065.0, "step": 298 }, { "epoch": 0.3041709053916582, "grad_norm": 0.8057239055633545, "learning_rate": 1e-06, "loss": 0.671, "mean_token_accuracy": 0.8050029873847961, "num_tokens": 95365564.0, "step": 299 }, { "epoch": 0.3051881993896236, "grad_norm": 0.7807227969169617, "learning_rate": 1e-06, "loss": 0.6597, "mean_token_accuracy": 0.8084917068481445, "num_tokens": 95677617.0, "step": 300 }, { "epoch": 0.306205493387589, "grad_norm": 0.7490959167480469, "learning_rate": 1e-06, "loss": 0.6935, "mean_token_accuracy": 0.799102783203125, "num_tokens": 96017692.0, "step": 301 }, { "epoch": 0.3072227873855544, "grad_norm": 0.7807316184043884, "learning_rate": 1e-06, "loss": 0.6705, "mean_token_accuracy": 0.8059637546539307, "num_tokens": 96337729.0, "step": 302 }, { "epoch": 0.3082400813835198, "grad_norm": 0.8054029941558838, "learning_rate": 1e-06, "loss": 0.6739, "mean_token_accuracy": 0.804855465888977, "num_tokens": 96644965.0, "step": 303 }, { "epoch": 0.30925737538148523, "grad_norm": 0.7975510954856873, "learning_rate": 1e-06, "loss": 0.6517, "mean_token_accuracy": 0.8112246990203857, "num_tokens": 96966380.0, "step": 304 }, { "epoch": 0.31027466937945064, "grad_norm": 0.7594712376594543, "learning_rate": 1e-06, "loss": 0.6646, "mean_token_accuracy": 0.8069475889205933, "num_tokens": 97279451.0, "step": 305 }, { "epoch": 0.31129196337741605, "grad_norm": 0.7637832760810852, "learning_rate": 1e-06, "loss": 0.678, "mean_token_accuracy": 0.8046350479125977, "num_tokens": 97595648.0, "step": 306 }, { "epoch": 0.3123092573753815, "grad_norm": 0.8266690373420715, "learning_rate": 1e-06, "loss": 0.6783, "mean_token_accuracy": 0.8042589426040649, "num_tokens": 97916479.0, "step": 307 }, { "epoch": 0.3133265513733469, "grad_norm": 0.7826452255249023, "learning_rate": 1e-06, "loss": 0.6799, "mean_token_accuracy": 0.8030845522880554, "num_tokens": 98234766.0, "step": 308 }, { "epoch": 0.3143438453713123, "grad_norm": 1.1081233024597168, "learning_rate": 1e-06, "loss": 0.6539, "mean_token_accuracy": 0.8105137348175049, "num_tokens": 98567715.0, "step": 309 }, { "epoch": 0.31536113936927773, "grad_norm": 0.7553532123565674, "learning_rate": 1e-06, "loss": 0.6828, "mean_token_accuracy": 0.8033838272094727, "num_tokens": 98906165.0, "step": 310 }, { "epoch": 0.31637843336724314, "grad_norm": 0.7629131078720093, "learning_rate": 1e-06, "loss": 0.6519, "mean_token_accuracy": 0.8110224008560181, "num_tokens": 99211579.0, "step": 311 }, { "epoch": 0.31739572736520855, "grad_norm": 0.7820941805839539, "learning_rate": 1e-06, "loss": 0.6836, "mean_token_accuracy": 0.8022605180740356, "num_tokens": 99530349.0, "step": 312 }, { "epoch": 0.31841302136317395, "grad_norm": 0.7474110126495361, "learning_rate": 1e-06, "loss": 0.6843, "mean_token_accuracy": 0.8024811744689941, "num_tokens": 99867681.0, "step": 313 }, { "epoch": 0.31943031536113936, "grad_norm": 0.7628995180130005, "learning_rate": 1e-06, "loss": 0.6644, "mean_token_accuracy": 0.8076979517936707, "num_tokens": 100198041.0, "step": 314 }, { "epoch": 0.32044760935910477, "grad_norm": 0.822460949420929, "learning_rate": 1e-06, "loss": 0.6373, "mean_token_accuracy": 0.8149458169937134, "num_tokens": 100509537.0, "step": 315 }, { "epoch": 0.3214649033570702, "grad_norm": 0.7507079243659973, "learning_rate": 1e-06, "loss": 0.666, "mean_token_accuracy": 0.8075241446495056, "num_tokens": 100831452.0, "step": 316 }, { "epoch": 0.3224821973550356, "grad_norm": 0.8105189800262451, "learning_rate": 1e-06, "loss": 0.6727, "mean_token_accuracy": 0.8060649633407593, "num_tokens": 101131963.0, "step": 317 }, { "epoch": 0.323499491353001, "grad_norm": 0.7931933999061584, "learning_rate": 1e-06, "loss": 0.6643, "mean_token_accuracy": 0.8087434768676758, "num_tokens": 101448301.0, "step": 318 }, { "epoch": 0.32451678535096645, "grad_norm": 0.8080085515975952, "learning_rate": 1e-06, "loss": 0.6669, "mean_token_accuracy": 0.8062810301780701, "num_tokens": 101750506.0, "step": 319 }, { "epoch": 0.32553407934893186, "grad_norm": 0.7895338535308838, "learning_rate": 1e-06, "loss": 0.6769, "mean_token_accuracy": 0.8053954243659973, "num_tokens": 102079659.0, "step": 320 }, { "epoch": 0.32655137334689727, "grad_norm": 0.7637503743171692, "learning_rate": 1e-06, "loss": 0.6416, "mean_token_accuracy": 0.8128135800361633, "num_tokens": 102405734.0, "step": 321 }, { "epoch": 0.3275686673448627, "grad_norm": 0.7679542899131775, "learning_rate": 1e-06, "loss": 0.6908, "mean_token_accuracy": 0.8014698624610901, "num_tokens": 102739870.0, "step": 322 }, { "epoch": 0.3285859613428281, "grad_norm": 0.7717718482017517, "learning_rate": 1e-06, "loss": 0.6621, "mean_token_accuracy": 0.8089470267295837, "num_tokens": 103055954.0, "step": 323 }, { "epoch": 0.3296032553407935, "grad_norm": 0.885274350643158, "learning_rate": 1e-06, "loss": 0.667, "mean_token_accuracy": 0.8072282075881958, "num_tokens": 103373341.0, "step": 324 }, { "epoch": 0.3306205493387589, "grad_norm": 0.7615958452224731, "learning_rate": 1e-06, "loss": 0.6488, "mean_token_accuracy": 0.8105665445327759, "num_tokens": 103709672.0, "step": 325 }, { "epoch": 0.3316378433367243, "grad_norm": 0.7842453718185425, "learning_rate": 1e-06, "loss": 0.6477, "mean_token_accuracy": 0.8108562231063843, "num_tokens": 104027661.0, "step": 326 }, { "epoch": 0.3326551373346897, "grad_norm": 0.7681851983070374, "learning_rate": 1e-06, "loss": 0.6755, "mean_token_accuracy": 0.8049620389938354, "num_tokens": 104353014.0, "step": 327 }, { "epoch": 0.3336724313326551, "grad_norm": 0.7865566611289978, "learning_rate": 1e-06, "loss": 0.6815, "mean_token_accuracy": 0.8016282320022583, "num_tokens": 104669203.0, "step": 328 }, { "epoch": 0.3346897253306205, "grad_norm": 0.7729827761650085, "learning_rate": 1e-06, "loss": 0.6893, "mean_token_accuracy": 0.8000586032867432, "num_tokens": 104980367.0, "step": 329 }, { "epoch": 0.335707019328586, "grad_norm": 0.8236528038978577, "learning_rate": 1e-06, "loss": 0.6737, "mean_token_accuracy": 0.8063338994979858, "num_tokens": 105298008.0, "step": 330 }, { "epoch": 0.3367243133265514, "grad_norm": 0.8104047775268555, "learning_rate": 1e-06, "loss": 0.6687, "mean_token_accuracy": 0.8056067228317261, "num_tokens": 105619978.0, "step": 331 }, { "epoch": 0.3377416073245168, "grad_norm": 0.7776243090629578, "learning_rate": 1e-06, "loss": 0.6702, "mean_token_accuracy": 0.8067533373832703, "num_tokens": 105942503.0, "step": 332 }, { "epoch": 0.3387589013224822, "grad_norm": 0.792806088924408, "learning_rate": 1e-06, "loss": 0.6394, "mean_token_accuracy": 0.8132205009460449, "num_tokens": 106246812.0, "step": 333 }, { "epoch": 0.3397761953204476, "grad_norm": 0.9993346333503723, "learning_rate": 1e-06, "loss": 0.6655, "mean_token_accuracy": 0.8066599369049072, "num_tokens": 106556412.0, "step": 334 }, { "epoch": 0.340793489318413, "grad_norm": 0.7874058485031128, "learning_rate": 1e-06, "loss": 0.6694, "mean_token_accuracy": 0.8050298690795898, "num_tokens": 106869710.0, "step": 335 }, { "epoch": 0.34181078331637843, "grad_norm": 0.8449519276618958, "learning_rate": 1e-06, "loss": 0.6597, "mean_token_accuracy": 0.8095645308494568, "num_tokens": 107197492.0, "step": 336 }, { "epoch": 0.34282807731434384, "grad_norm": 0.7852010726928711, "learning_rate": 1e-06, "loss": 0.6529, "mean_token_accuracy": 0.8105023503303528, "num_tokens": 107520176.0, "step": 337 }, { "epoch": 0.34384537131230924, "grad_norm": 0.7831073999404907, "learning_rate": 1e-06, "loss": 0.6523, "mean_token_accuracy": 0.8103317022323608, "num_tokens": 107829848.0, "step": 338 }, { "epoch": 0.34486266531027465, "grad_norm": 0.7952174544334412, "learning_rate": 1e-06, "loss": 0.6847, "mean_token_accuracy": 0.8014416694641113, "num_tokens": 108169162.0, "step": 339 }, { "epoch": 0.34587995930824006, "grad_norm": 0.7554988861083984, "learning_rate": 1e-06, "loss": 0.6382, "mean_token_accuracy": 0.8142600059509277, "num_tokens": 108481647.0, "step": 340 }, { "epoch": 0.34689725330620547, "grad_norm": 0.7773537635803223, "learning_rate": 1e-06, "loss": 0.648, "mean_token_accuracy": 0.8115161657333374, "num_tokens": 108795594.0, "step": 341 }, { "epoch": 0.34791454730417093, "grad_norm": 0.7583483457565308, "learning_rate": 1e-06, "loss": 0.6605, "mean_token_accuracy": 0.8079638481140137, "num_tokens": 109100848.0, "step": 342 }, { "epoch": 0.34893184130213634, "grad_norm": 0.8855310678482056, "learning_rate": 1e-06, "loss": 0.6703, "mean_token_accuracy": 0.8064523935317993, "num_tokens": 109425497.0, "step": 343 }, { "epoch": 0.34994913530010174, "grad_norm": 0.708702027797699, "learning_rate": 1e-06, "loss": 0.6432, "mean_token_accuracy": 0.8125522136688232, "num_tokens": 109752861.0, "step": 344 }, { "epoch": 0.35096642929806715, "grad_norm": 0.7805309891700745, "learning_rate": 1e-06, "loss": 0.6361, "mean_token_accuracy": 0.8148272037506104, "num_tokens": 110075837.0, "step": 345 }, { "epoch": 0.35198372329603256, "grad_norm": 1.29429030418396, "learning_rate": 1e-06, "loss": 0.6647, "mean_token_accuracy": 0.8079172968864441, "num_tokens": 110388178.0, "step": 346 }, { "epoch": 0.35300101729399797, "grad_norm": 0.7764292359352112, "learning_rate": 1e-06, "loss": 0.6668, "mean_token_accuracy": 0.8062443733215332, "num_tokens": 110712545.0, "step": 347 }, { "epoch": 0.3540183112919634, "grad_norm": 0.7958059906959534, "learning_rate": 1e-06, "loss": 0.6552, "mean_token_accuracy": 0.8096246719360352, "num_tokens": 111014148.0, "step": 348 }, { "epoch": 0.3550356052899288, "grad_norm": 0.8071582317352295, "learning_rate": 1e-06, "loss": 0.6623, "mean_token_accuracy": 0.8077216148376465, "num_tokens": 111318802.0, "step": 349 }, { "epoch": 0.3560528992878942, "grad_norm": 0.7708571553230286, "learning_rate": 1e-06, "loss": 0.6537, "mean_token_accuracy": 0.809117317199707, "num_tokens": 111631108.0, "step": 350 }, { "epoch": 0.3570701932858596, "grad_norm": 0.8013575077056885, "learning_rate": 1e-06, "loss": 0.6683, "mean_token_accuracy": 0.8072644472122192, "num_tokens": 111956492.0, "step": 351 }, { "epoch": 0.358087487283825, "grad_norm": 0.7933052182197571, "learning_rate": 1e-06, "loss": 0.6564, "mean_token_accuracy": 0.8090948462486267, "num_tokens": 112271716.0, "step": 352 }, { "epoch": 0.35910478128179046, "grad_norm": 0.7996081709861755, "learning_rate": 1e-06, "loss": 0.6228, "mean_token_accuracy": 0.8179689645767212, "num_tokens": 112595775.0, "step": 353 }, { "epoch": 0.36012207527975587, "grad_norm": 0.7942414879798889, "learning_rate": 1e-06, "loss": 0.6468, "mean_token_accuracy": 0.8119409680366516, "num_tokens": 112906425.0, "step": 354 }, { "epoch": 0.3611393692777213, "grad_norm": 0.7331574559211731, "learning_rate": 1e-06, "loss": 0.657, "mean_token_accuracy": 0.8085232377052307, "num_tokens": 113237249.0, "step": 355 }, { "epoch": 0.3621566632756867, "grad_norm": 0.7615078091621399, "learning_rate": 1e-06, "loss": 0.6561, "mean_token_accuracy": 0.8098192811012268, "num_tokens": 113555518.0, "step": 356 }, { "epoch": 0.3631739572736521, "grad_norm": 0.7951853275299072, "learning_rate": 1e-06, "loss": 0.6867, "mean_token_accuracy": 0.8017234206199646, "num_tokens": 113869795.0, "step": 357 }, { "epoch": 0.3641912512716175, "grad_norm": 0.7761409282684326, "learning_rate": 1e-06, "loss": 0.6536, "mean_token_accuracy": 0.809200644493103, "num_tokens": 114180922.0, "step": 358 }, { "epoch": 0.3652085452695829, "grad_norm": 0.7533003091812134, "learning_rate": 1e-06, "loss": 0.6368, "mean_token_accuracy": 0.8134925365447998, "num_tokens": 114503125.0, "step": 359 }, { "epoch": 0.3662258392675483, "grad_norm": 0.8157519102096558, "learning_rate": 1e-06, "loss": 0.679, "mean_token_accuracy": 0.8032369613647461, "num_tokens": 114829230.0, "step": 360 }, { "epoch": 0.3672431332655137, "grad_norm": 0.7701749801635742, "learning_rate": 1e-06, "loss": 0.6547, "mean_token_accuracy": 0.8107021450996399, "num_tokens": 115154558.0, "step": 361 }, { "epoch": 0.36826042726347913, "grad_norm": 0.741514265537262, "learning_rate": 1e-06, "loss": 0.6463, "mean_token_accuracy": 0.8118999004364014, "num_tokens": 115471150.0, "step": 362 }, { "epoch": 0.36927772126144454, "grad_norm": 0.7648496627807617, "learning_rate": 1e-06, "loss": 0.6666, "mean_token_accuracy": 0.8067079782485962, "num_tokens": 115802068.0, "step": 363 }, { "epoch": 0.37029501525940994, "grad_norm": 0.7627708911895752, "learning_rate": 1e-06, "loss": 0.6652, "mean_token_accuracy": 0.8071906566619873, "num_tokens": 116121996.0, "step": 364 }, { "epoch": 0.3713123092573754, "grad_norm": 0.7299651503562927, "learning_rate": 1e-06, "loss": 0.6472, "mean_token_accuracy": 0.8119363784790039, "num_tokens": 116456648.0, "step": 365 }, { "epoch": 0.3723296032553408, "grad_norm": 0.7837634086608887, "learning_rate": 1e-06, "loss": 0.6781, "mean_token_accuracy": 0.8045236468315125, "num_tokens": 116787232.0, "step": 366 }, { "epoch": 0.3733468972533062, "grad_norm": 0.865563690662384, "learning_rate": 1e-06, "loss": 0.6947, "mean_token_accuracy": 0.7989071607589722, "num_tokens": 117122455.0, "step": 367 }, { "epoch": 0.3743641912512716, "grad_norm": 0.8054040670394897, "learning_rate": 1e-06, "loss": 0.6542, "mean_token_accuracy": 0.8087784051895142, "num_tokens": 117430802.0, "step": 368 }, { "epoch": 0.37538148524923703, "grad_norm": 0.9044189453125, "learning_rate": 1e-06, "loss": 0.6733, "mean_token_accuracy": 0.8037705421447754, "num_tokens": 117747701.0, "step": 369 }, { "epoch": 0.37639877924720244, "grad_norm": 0.7625953555107117, "learning_rate": 1e-06, "loss": 0.6367, "mean_token_accuracy": 0.814502477645874, "num_tokens": 118069631.0, "step": 370 }, { "epoch": 0.37741607324516785, "grad_norm": 0.7826732397079468, "learning_rate": 1e-06, "loss": 0.6396, "mean_token_accuracy": 0.8139146566390991, "num_tokens": 118364107.0, "step": 371 }, { "epoch": 0.37843336724313326, "grad_norm": 0.7956497669219971, "learning_rate": 1e-06, "loss": 0.665, "mean_token_accuracy": 0.8056970834732056, "num_tokens": 118689942.0, "step": 372 }, { "epoch": 0.37945066124109866, "grad_norm": 0.8007025122642517, "learning_rate": 1e-06, "loss": 0.6529, "mean_token_accuracy": 0.809262752532959, "num_tokens": 119027905.0, "step": 373 }, { "epoch": 0.38046795523906407, "grad_norm": 0.8179851174354553, "learning_rate": 1e-06, "loss": 0.6508, "mean_token_accuracy": 0.8107311129570007, "num_tokens": 119333549.0, "step": 374 }, { "epoch": 0.3814852492370295, "grad_norm": 0.775359570980072, "learning_rate": 1e-06, "loss": 0.6412, "mean_token_accuracy": 0.8123606443405151, "num_tokens": 119635566.0, "step": 375 }, { "epoch": 0.38250254323499494, "grad_norm": 0.7540283799171448, "learning_rate": 1e-06, "loss": 0.6688, "mean_token_accuracy": 0.8049556612968445, "num_tokens": 119965071.0, "step": 376 }, { "epoch": 0.38351983723296035, "grad_norm": 0.8038058280944824, "learning_rate": 1e-06, "loss": 0.6568, "mean_token_accuracy": 0.8087278604507446, "num_tokens": 120275408.0, "step": 377 }, { "epoch": 0.38453713123092575, "grad_norm": 0.794776439666748, "learning_rate": 1e-06, "loss": 0.6659, "mean_token_accuracy": 0.8071854114532471, "num_tokens": 120600556.0, "step": 378 }, { "epoch": 0.38555442522889116, "grad_norm": 0.7935164570808411, "learning_rate": 1e-06, "loss": 0.6441, "mean_token_accuracy": 0.8129609823226929, "num_tokens": 120922100.0, "step": 379 }, { "epoch": 0.38657171922685657, "grad_norm": 0.8230006694793701, "learning_rate": 1e-06, "loss": 0.6497, "mean_token_accuracy": 0.8110771179199219, "num_tokens": 121235661.0, "step": 380 }, { "epoch": 0.387589013224822, "grad_norm": 0.8138030767440796, "learning_rate": 1e-06, "loss": 0.6664, "mean_token_accuracy": 0.8064550161361694, "num_tokens": 121539889.0, "step": 381 }, { "epoch": 0.3886063072227874, "grad_norm": 0.8121137022972107, "learning_rate": 1e-06, "loss": 0.6274, "mean_token_accuracy": 0.8163203001022339, "num_tokens": 121844594.0, "step": 382 }, { "epoch": 0.3896236012207528, "grad_norm": 0.8339466452598572, "learning_rate": 1e-06, "loss": 0.6656, "mean_token_accuracy": 0.80696702003479, "num_tokens": 122181626.0, "step": 383 }, { "epoch": 0.3906408952187182, "grad_norm": 0.8089752197265625, "learning_rate": 1e-06, "loss": 0.6304, "mean_token_accuracy": 0.8159544467926025, "num_tokens": 122486556.0, "step": 384 }, { "epoch": 0.3916581892166836, "grad_norm": 0.7690979242324829, "learning_rate": 1e-06, "loss": 0.6565, "mean_token_accuracy": 0.8081955909729004, "num_tokens": 122806517.0, "step": 385 }, { "epoch": 0.392675483214649, "grad_norm": 0.757345974445343, "learning_rate": 1e-06, "loss": 0.6541, "mean_token_accuracy": 0.8088266849517822, "num_tokens": 123130884.0, "step": 386 }, { "epoch": 0.3936927772126144, "grad_norm": 0.8054885864257812, "learning_rate": 1e-06, "loss": 0.6224, "mean_token_accuracy": 0.8190693855285645, "num_tokens": 123442859.0, "step": 387 }, { "epoch": 0.3947100712105799, "grad_norm": 0.8839951157569885, "learning_rate": 1e-06, "loss": 0.6385, "mean_token_accuracy": 0.8120584487915039, "num_tokens": 123764752.0, "step": 388 }, { "epoch": 0.3957273652085453, "grad_norm": 0.7563456296920776, "learning_rate": 1e-06, "loss": 0.6492, "mean_token_accuracy": 0.8099383115768433, "num_tokens": 124083040.0, "step": 389 }, { "epoch": 0.3967446592065107, "grad_norm": 0.7801865339279175, "learning_rate": 1e-06, "loss": 0.6461, "mean_token_accuracy": 0.8110958337783813, "num_tokens": 124398859.0, "step": 390 }, { "epoch": 0.3977619532044761, "grad_norm": 0.8431596159934998, "learning_rate": 1e-06, "loss": 0.6735, "mean_token_accuracy": 0.8044856786727905, "num_tokens": 124705300.0, "step": 391 }, { "epoch": 0.3987792472024415, "grad_norm": 0.7838254570960999, "learning_rate": 1e-06, "loss": 0.6659, "mean_token_accuracy": 0.8059735298156738, "num_tokens": 125017794.0, "step": 392 }, { "epoch": 0.3997965412004069, "grad_norm": 0.8154076933860779, "learning_rate": 1e-06, "loss": 0.6659, "mean_token_accuracy": 0.8061184883117676, "num_tokens": 125331610.0, "step": 393 }, { "epoch": 0.4008138351983723, "grad_norm": 0.850584864616394, "learning_rate": 1e-06, "loss": 0.6451, "mean_token_accuracy": 0.8114930987358093, "num_tokens": 125650464.0, "step": 394 }, { "epoch": 0.40183112919633773, "grad_norm": 0.7639232873916626, "learning_rate": 1e-06, "loss": 0.6382, "mean_token_accuracy": 0.813078761100769, "num_tokens": 125964354.0, "step": 395 }, { "epoch": 0.40284842319430314, "grad_norm": 0.8114677667617798, "learning_rate": 1e-06, "loss": 0.6402, "mean_token_accuracy": 0.8129217624664307, "num_tokens": 126266332.0, "step": 396 }, { "epoch": 0.40386571719226855, "grad_norm": 0.7987565398216248, "learning_rate": 1e-06, "loss": 0.6422, "mean_token_accuracy": 0.8125091195106506, "num_tokens": 126582617.0, "step": 397 }, { "epoch": 0.40488301119023395, "grad_norm": 0.8058961033821106, "learning_rate": 1e-06, "loss": 0.646, "mean_token_accuracy": 0.8124598264694214, "num_tokens": 126894951.0, "step": 398 }, { "epoch": 0.4059003051881994, "grad_norm": 0.8132612109184265, "learning_rate": 1e-06, "loss": 0.6779, "mean_token_accuracy": 0.803368866443634, "num_tokens": 127193010.0, "step": 399 }, { "epoch": 0.4069175991861648, "grad_norm": 0.7651572227478027, "learning_rate": 1e-06, "loss": 0.645, "mean_token_accuracy": 0.8124961256980896, "num_tokens": 127530354.0, "step": 400 }, { "epoch": 0.40793489318413023, "grad_norm": 0.7767626047134399, "learning_rate": 1e-06, "loss": 0.6385, "mean_token_accuracy": 0.8131939768791199, "num_tokens": 127833308.0, "step": 401 }, { "epoch": 0.40895218718209564, "grad_norm": 0.7715514898300171, "learning_rate": 1e-06, "loss": 0.66, "mean_token_accuracy": 0.8088132739067078, "num_tokens": 128161302.0, "step": 402 }, { "epoch": 0.40996948118006105, "grad_norm": 0.8009824156761169, "learning_rate": 1e-06, "loss": 0.6558, "mean_token_accuracy": 0.8080823421478271, "num_tokens": 128485010.0, "step": 403 }, { "epoch": 0.41098677517802645, "grad_norm": 0.8095982074737549, "learning_rate": 1e-06, "loss": 0.6564, "mean_token_accuracy": 0.8079812526702881, "num_tokens": 128794807.0, "step": 404 }, { "epoch": 0.41200406917599186, "grad_norm": 0.7956030964851379, "learning_rate": 1e-06, "loss": 0.6398, "mean_token_accuracy": 0.8127642869949341, "num_tokens": 129117445.0, "step": 405 }, { "epoch": 0.41302136317395727, "grad_norm": 0.7819294333457947, "learning_rate": 1e-06, "loss": 0.6583, "mean_token_accuracy": 0.8086738586425781, "num_tokens": 129465561.0, "step": 406 }, { "epoch": 0.4140386571719227, "grad_norm": 0.727975070476532, "learning_rate": 1e-06, "loss": 0.6235, "mean_token_accuracy": 0.8174310326576233, "num_tokens": 129789431.0, "step": 407 }, { "epoch": 0.4150559511698881, "grad_norm": 0.8050909042358398, "learning_rate": 1e-06, "loss": 0.6413, "mean_token_accuracy": 0.8126672506332397, "num_tokens": 130099262.0, "step": 408 }, { "epoch": 0.4160732451678535, "grad_norm": 0.7858853936195374, "learning_rate": 1e-06, "loss": 0.6572, "mean_token_accuracy": 0.8087785840034485, "num_tokens": 130412634.0, "step": 409 }, { "epoch": 0.4170905391658189, "grad_norm": 0.7969059348106384, "learning_rate": 1e-06, "loss": 0.6543, "mean_token_accuracy": 0.8089858889579773, "num_tokens": 130738545.0, "step": 410 }, { "epoch": 0.41810783316378436, "grad_norm": 0.7841689586639404, "learning_rate": 1e-06, "loss": 0.6537, "mean_token_accuracy": 0.8089855909347534, "num_tokens": 131069889.0, "step": 411 }, { "epoch": 0.41912512716174977, "grad_norm": 0.8327849507331848, "learning_rate": 1e-06, "loss": 0.6557, "mean_token_accuracy": 0.8091450929641724, "num_tokens": 131374265.0, "step": 412 }, { "epoch": 0.4201424211597152, "grad_norm": 0.77179354429245, "learning_rate": 1e-06, "loss": 0.6566, "mean_token_accuracy": 0.8080447316169739, "num_tokens": 131702768.0, "step": 413 }, { "epoch": 0.4211597151576806, "grad_norm": 0.8102238774299622, "learning_rate": 1e-06, "loss": 0.6466, "mean_token_accuracy": 0.8112056851387024, "num_tokens": 132005394.0, "step": 414 }, { "epoch": 0.422177009155646, "grad_norm": 0.9549462795257568, "learning_rate": 1e-06, "loss": 0.6259, "mean_token_accuracy": 0.815801739692688, "num_tokens": 132318929.0, "step": 415 }, { "epoch": 0.4231943031536114, "grad_norm": 0.7793801426887512, "learning_rate": 1e-06, "loss": 0.6206, "mean_token_accuracy": 0.8180446624755859, "num_tokens": 132639135.0, "step": 416 }, { "epoch": 0.4242115971515768, "grad_norm": 0.7840235233306885, "learning_rate": 1e-06, "loss": 0.6648, "mean_token_accuracy": 0.8058639764785767, "num_tokens": 132955884.0, "step": 417 }, { "epoch": 0.4252288911495422, "grad_norm": 0.7213472127914429, "learning_rate": 1e-06, "loss": 0.6164, "mean_token_accuracy": 0.8190960884094238, "num_tokens": 133281242.0, "step": 418 }, { "epoch": 0.4262461851475076, "grad_norm": 0.7838244438171387, "learning_rate": 1e-06, "loss": 0.648, "mean_token_accuracy": 0.8113455772399902, "num_tokens": 133620766.0, "step": 419 }, { "epoch": 0.427263479145473, "grad_norm": 0.7640904784202576, "learning_rate": 1e-06, "loss": 0.6398, "mean_token_accuracy": 0.8127099275588989, "num_tokens": 133962681.0, "step": 420 }, { "epoch": 0.42828077314343843, "grad_norm": 0.7636725306510925, "learning_rate": 1e-06, "loss": 0.6727, "mean_token_accuracy": 0.8043913841247559, "num_tokens": 134299862.0, "step": 421 }, { "epoch": 0.42929806714140384, "grad_norm": 0.7917993664741516, "learning_rate": 1e-06, "loss": 0.6346, "mean_token_accuracy": 0.8139828443527222, "num_tokens": 134603411.0, "step": 422 }, { "epoch": 0.4303153611393693, "grad_norm": 0.836502194404602, "learning_rate": 1e-06, "loss": 0.6397, "mean_token_accuracy": 0.8130548596382141, "num_tokens": 134910477.0, "step": 423 }, { "epoch": 0.4313326551373347, "grad_norm": 0.8161730766296387, "learning_rate": 1e-06, "loss": 0.6692, "mean_token_accuracy": 0.8051292896270752, "num_tokens": 135215377.0, "step": 424 }, { "epoch": 0.4323499491353001, "grad_norm": 0.7963213324546814, "learning_rate": 1e-06, "loss": 0.6504, "mean_token_accuracy": 0.8107162714004517, "num_tokens": 135530087.0, "step": 425 }, { "epoch": 0.4333672431332655, "grad_norm": 0.7756025195121765, "learning_rate": 1e-06, "loss": 0.6241, "mean_token_accuracy": 0.8174505233764648, "num_tokens": 135852811.0, "step": 426 }, { "epoch": 0.43438453713123093, "grad_norm": 0.7596142888069153, "learning_rate": 1e-06, "loss": 0.6345, "mean_token_accuracy": 0.8148894906044006, "num_tokens": 136177024.0, "step": 427 }, { "epoch": 0.43540183112919634, "grad_norm": 0.8031265139579773, "learning_rate": 1e-06, "loss": 0.6412, "mean_token_accuracy": 0.8117724657058716, "num_tokens": 136482739.0, "step": 428 }, { "epoch": 0.43641912512716174, "grad_norm": 0.7338760495185852, "learning_rate": 1e-06, "loss": 0.6227, "mean_token_accuracy": 0.8177262544631958, "num_tokens": 136822688.0, "step": 429 }, { "epoch": 0.43743641912512715, "grad_norm": 0.7699108719825745, "learning_rate": 1e-06, "loss": 0.6176, "mean_token_accuracy": 0.8188862800598145, "num_tokens": 137137295.0, "step": 430 }, { "epoch": 0.43845371312309256, "grad_norm": 0.8056550025939941, "learning_rate": 1e-06, "loss": 0.637, "mean_token_accuracy": 0.8134770393371582, "num_tokens": 137449013.0, "step": 431 }, { "epoch": 0.43947100712105797, "grad_norm": 0.7790605425834656, "learning_rate": 1e-06, "loss": 0.636, "mean_token_accuracy": 0.8130491971969604, "num_tokens": 137767680.0, "step": 432 }, { "epoch": 0.4404883011190234, "grad_norm": 0.7934316992759705, "learning_rate": 1e-06, "loss": 0.6459, "mean_token_accuracy": 0.8120208978652954, "num_tokens": 138066689.0, "step": 433 }, { "epoch": 0.44150559511698884, "grad_norm": 0.7886009812355042, "learning_rate": 1e-06, "loss": 0.6802, "mean_token_accuracy": 0.8030209541320801, "num_tokens": 138390463.0, "step": 434 }, { "epoch": 0.44252288911495424, "grad_norm": 0.7820769548416138, "learning_rate": 1e-06, "loss": 0.651, "mean_token_accuracy": 0.8096523880958557, "num_tokens": 138712726.0, "step": 435 }, { "epoch": 0.44354018311291965, "grad_norm": 0.7809168100357056, "learning_rate": 1e-06, "loss": 0.6393, "mean_token_accuracy": 0.8129587173461914, "num_tokens": 139056494.0, "step": 436 }, { "epoch": 0.44455747711088506, "grad_norm": 0.7615790367126465, "learning_rate": 1e-06, "loss": 0.6534, "mean_token_accuracy": 0.8090481758117676, "num_tokens": 139362971.0, "step": 437 }, { "epoch": 0.44557477110885046, "grad_norm": 0.8051952719688416, "learning_rate": 1e-06, "loss": 0.6359, "mean_token_accuracy": 0.8134530186653137, "num_tokens": 139661330.0, "step": 438 }, { "epoch": 0.44659206510681587, "grad_norm": 0.7830991744995117, "learning_rate": 1e-06, "loss": 0.6544, "mean_token_accuracy": 0.8091185092926025, "num_tokens": 139996848.0, "step": 439 }, { "epoch": 0.4476093591047813, "grad_norm": 0.7350242733955383, "learning_rate": 1e-06, "loss": 0.642, "mean_token_accuracy": 0.8117263317108154, "num_tokens": 140315404.0, "step": 440 }, { "epoch": 0.4486266531027467, "grad_norm": 0.763858437538147, "learning_rate": 1e-06, "loss": 0.6558, "mean_token_accuracy": 0.8085876703262329, "num_tokens": 140632686.0, "step": 441 }, { "epoch": 0.4496439471007121, "grad_norm": 0.722997784614563, "learning_rate": 1e-06, "loss": 0.6282, "mean_token_accuracy": 0.8143622875213623, "num_tokens": 140945277.0, "step": 442 }, { "epoch": 0.4506612410986775, "grad_norm": 0.7560616135597229, "learning_rate": 1e-06, "loss": 0.6451, "mean_token_accuracy": 0.8108248114585876, "num_tokens": 141261262.0, "step": 443 }, { "epoch": 0.4516785350966429, "grad_norm": 0.7908616065979004, "learning_rate": 1e-06, "loss": 0.6362, "mean_token_accuracy": 0.813894510269165, "num_tokens": 141584264.0, "step": 444 }, { "epoch": 0.4526958290946083, "grad_norm": 0.7412334680557251, "learning_rate": 1e-06, "loss": 0.6186, "mean_token_accuracy": 0.8175861835479736, "num_tokens": 141902690.0, "step": 445 }, { "epoch": 0.4537131230925738, "grad_norm": 0.8058040142059326, "learning_rate": 1e-06, "loss": 0.6469, "mean_token_accuracy": 0.809880256652832, "num_tokens": 142212848.0, "step": 446 }, { "epoch": 0.4547304170905392, "grad_norm": 0.7759329676628113, "learning_rate": 1e-06, "loss": 0.6561, "mean_token_accuracy": 0.8088507056236267, "num_tokens": 142533419.0, "step": 447 }, { "epoch": 0.4557477110885046, "grad_norm": 0.772881269454956, "learning_rate": 1e-06, "loss": 0.6447, "mean_token_accuracy": 0.8117966055870056, "num_tokens": 142834121.0, "step": 448 }, { "epoch": 0.45676500508647, "grad_norm": 0.814788281917572, "learning_rate": 1e-06, "loss": 0.6384, "mean_token_accuracy": 0.8127095103263855, "num_tokens": 143165685.0, "step": 449 }, { "epoch": 0.4577822990844354, "grad_norm": 0.8127166628837585, "learning_rate": 1e-06, "loss": 0.6213, "mean_token_accuracy": 0.8171497583389282, "num_tokens": 143485496.0, "step": 450 }, { "epoch": 0.4587995930824008, "grad_norm": 0.7722148299217224, "learning_rate": 1e-06, "loss": 0.6591, "mean_token_accuracy": 0.8063204288482666, "num_tokens": 143805905.0, "step": 451 }, { "epoch": 0.4598168870803662, "grad_norm": 0.8112136721611023, "learning_rate": 1e-06, "loss": 0.6339, "mean_token_accuracy": 0.813546895980835, "num_tokens": 144107657.0, "step": 452 }, { "epoch": 0.46083418107833163, "grad_norm": 0.795857310295105, "learning_rate": 1e-06, "loss": 0.6535, "mean_token_accuracy": 0.8090566396713257, "num_tokens": 144426788.0, "step": 453 }, { "epoch": 0.46185147507629704, "grad_norm": 0.8356524109840393, "learning_rate": 1e-06, "loss": 0.6234, "mean_token_accuracy": 0.816721498966217, "num_tokens": 144742302.0, "step": 454 }, { "epoch": 0.46286876907426244, "grad_norm": 0.8701412677764893, "learning_rate": 1e-06, "loss": 0.6432, "mean_token_accuracy": 0.8105349540710449, "num_tokens": 145041480.0, "step": 455 }, { "epoch": 0.46388606307222785, "grad_norm": 0.7511188387870789, "learning_rate": 1e-06, "loss": 0.6307, "mean_token_accuracy": 0.8154951333999634, "num_tokens": 145365776.0, "step": 456 }, { "epoch": 0.4649033570701933, "grad_norm": 0.8432009816169739, "learning_rate": 1e-06, "loss": 0.6296, "mean_token_accuracy": 0.8161913156509399, "num_tokens": 145686829.0, "step": 457 }, { "epoch": 0.4659206510681587, "grad_norm": 0.8876609802246094, "learning_rate": 1e-06, "loss": 0.6484, "mean_token_accuracy": 0.8092880249023438, "num_tokens": 146031134.0, "step": 458 }, { "epoch": 0.4669379450661241, "grad_norm": 0.8064795732498169, "learning_rate": 1e-06, "loss": 0.6724, "mean_token_accuracy": 0.805739164352417, "num_tokens": 146342004.0, "step": 459 }, { "epoch": 0.46795523906408953, "grad_norm": 0.7816082835197449, "learning_rate": 1e-06, "loss": 0.6462, "mean_token_accuracy": 0.8109795451164246, "num_tokens": 146660600.0, "step": 460 }, { "epoch": 0.46897253306205494, "grad_norm": 0.7669892311096191, "learning_rate": 1e-06, "loss": 0.6927, "mean_token_accuracy": 0.8000683784484863, "num_tokens": 146980645.0, "step": 461 }, { "epoch": 0.46998982706002035, "grad_norm": 0.8293817639350891, "learning_rate": 1e-06, "loss": 0.6595, "mean_token_accuracy": 0.8065913915634155, "num_tokens": 147298752.0, "step": 462 }, { "epoch": 0.47100712105798576, "grad_norm": 0.8262353539466858, "learning_rate": 1e-06, "loss": 0.6165, "mean_token_accuracy": 0.818239688873291, "num_tokens": 147606035.0, "step": 463 }, { "epoch": 0.47202441505595116, "grad_norm": 0.7733154296875, "learning_rate": 1e-06, "loss": 0.6074, "mean_token_accuracy": 0.8200194835662842, "num_tokens": 147900796.0, "step": 464 }, { "epoch": 0.47304170905391657, "grad_norm": 0.812021791934967, "learning_rate": 1e-06, "loss": 0.6167, "mean_token_accuracy": 0.8169869184494019, "num_tokens": 148213700.0, "step": 465 }, { "epoch": 0.474059003051882, "grad_norm": 0.7849969863891602, "learning_rate": 1e-06, "loss": 0.6406, "mean_token_accuracy": 0.8115862607955933, "num_tokens": 148519502.0, "step": 466 }, { "epoch": 0.4750762970498474, "grad_norm": 0.8655486106872559, "learning_rate": 1e-06, "loss": 0.6592, "mean_token_accuracy": 0.8073008060455322, "num_tokens": 148837050.0, "step": 467 }, { "epoch": 0.4760935910478128, "grad_norm": 0.7930212616920471, "learning_rate": 1e-06, "loss": 0.6423, "mean_token_accuracy": 0.8117002248764038, "num_tokens": 149158085.0, "step": 468 }, { "epoch": 0.47711088504577825, "grad_norm": 0.8016993999481201, "learning_rate": 1e-06, "loss": 0.6499, "mean_token_accuracy": 0.8118265271186829, "num_tokens": 149470715.0, "step": 469 }, { "epoch": 0.47812817904374366, "grad_norm": 0.7912672162055969, "learning_rate": 1e-06, "loss": 0.6527, "mean_token_accuracy": 0.8092522025108337, "num_tokens": 149779243.0, "step": 470 }, { "epoch": 0.47914547304170907, "grad_norm": 0.7673670649528503, "learning_rate": 1e-06, "loss": 0.6641, "mean_token_accuracy": 0.8060247898101807, "num_tokens": 150086890.0, "step": 471 }, { "epoch": 0.4801627670396745, "grad_norm": 0.7772818207740784, "learning_rate": 1e-06, "loss": 0.6151, "mean_token_accuracy": 0.8201066851615906, "num_tokens": 150414091.0, "step": 472 }, { "epoch": 0.4811800610376399, "grad_norm": 0.7861338257789612, "learning_rate": 1e-06, "loss": 0.6536, "mean_token_accuracy": 0.8087126016616821, "num_tokens": 150749759.0, "step": 473 }, { "epoch": 0.4821973550356053, "grad_norm": 0.7545295357704163, "learning_rate": 1e-06, "loss": 0.6377, "mean_token_accuracy": 0.813199520111084, "num_tokens": 151078517.0, "step": 474 }, { "epoch": 0.4832146490335707, "grad_norm": 0.7383038401603699, "learning_rate": 1e-06, "loss": 0.6368, "mean_token_accuracy": 0.8135240077972412, "num_tokens": 151404760.0, "step": 475 }, { "epoch": 0.4842319430315361, "grad_norm": 0.8081879019737244, "learning_rate": 1e-06, "loss": 0.6106, "mean_token_accuracy": 0.8206827044487, "num_tokens": 151714374.0, "step": 476 }, { "epoch": 0.4852492370295015, "grad_norm": 0.7860760688781738, "learning_rate": 1e-06, "loss": 0.624, "mean_token_accuracy": 0.817007303237915, "num_tokens": 152034628.0, "step": 477 }, { "epoch": 0.4862665310274669, "grad_norm": 0.8067706227302551, "learning_rate": 1e-06, "loss": 0.6373, "mean_token_accuracy": 0.8134366273880005, "num_tokens": 152336970.0, "step": 478 }, { "epoch": 0.4872838250254323, "grad_norm": 0.7861080765724182, "learning_rate": 1e-06, "loss": 0.6464, "mean_token_accuracy": 0.8105477094650269, "num_tokens": 152684663.0, "step": 479 }, { "epoch": 0.4883011190233978, "grad_norm": 0.7935335636138916, "learning_rate": 1e-06, "loss": 0.6551, "mean_token_accuracy": 0.808892011642456, "num_tokens": 153010247.0, "step": 480 }, { "epoch": 0.4893184130213632, "grad_norm": 0.81971275806427, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8200415372848511, "num_tokens": 153337306.0, "step": 481 }, { "epoch": 0.4903357070193286, "grad_norm": 0.7935061454772949, "learning_rate": 1e-06, "loss": 0.635, "mean_token_accuracy": 0.8127099871635437, "num_tokens": 153652759.0, "step": 482 }, { "epoch": 0.491353001017294, "grad_norm": 0.8456212878227234, "learning_rate": 1e-06, "loss": 0.6201, "mean_token_accuracy": 0.8179374933242798, "num_tokens": 153982904.0, "step": 483 }, { "epoch": 0.4923702950152594, "grad_norm": 0.7553489804267883, "learning_rate": 1e-06, "loss": 0.658, "mean_token_accuracy": 0.8084520101547241, "num_tokens": 154318454.0, "step": 484 }, { "epoch": 0.4933875890132248, "grad_norm": 0.7678808569908142, "learning_rate": 1e-06, "loss": 0.6568, "mean_token_accuracy": 0.8083710670471191, "num_tokens": 154646655.0, "step": 485 }, { "epoch": 0.49440488301119023, "grad_norm": 0.7790781259536743, "learning_rate": 1e-06, "loss": 0.637, "mean_token_accuracy": 0.813983678817749, "num_tokens": 154961491.0, "step": 486 }, { "epoch": 0.49542217700915564, "grad_norm": 0.8041170835494995, "learning_rate": 1e-06, "loss": 0.6285, "mean_token_accuracy": 0.814818263053894, "num_tokens": 155273562.0, "step": 487 }, { "epoch": 0.49643947100712105, "grad_norm": 0.8097386956214905, "learning_rate": 1e-06, "loss": 0.6087, "mean_token_accuracy": 0.8204355239868164, "num_tokens": 155577059.0, "step": 488 }, { "epoch": 0.49745676500508645, "grad_norm": 0.7783612608909607, "learning_rate": 1e-06, "loss": 0.6495, "mean_token_accuracy": 0.809700608253479, "num_tokens": 155896969.0, "step": 489 }, { "epoch": 0.49847405900305186, "grad_norm": 0.8291912078857422, "learning_rate": 1e-06, "loss": 0.6444, "mean_token_accuracy": 0.8116985559463501, "num_tokens": 156221661.0, "step": 490 }, { "epoch": 0.49949135300101727, "grad_norm": 0.8157835602760315, "learning_rate": 1e-06, "loss": 0.6503, "mean_token_accuracy": 0.8086612820625305, "num_tokens": 156538559.0, "step": 491 }, { "epoch": 0.5005086469989827, "grad_norm": 0.7264803051948547, "learning_rate": 1e-06, "loss": 0.6531, "mean_token_accuracy": 0.8084179162979126, "num_tokens": 156879118.0, "step": 492 }, { "epoch": 0.5015259409969481, "grad_norm": 0.7339447736740112, "learning_rate": 1e-06, "loss": 0.6112, "mean_token_accuracy": 0.8194094896316528, "num_tokens": 157198178.0, "step": 493 }, { "epoch": 0.5025432349949135, "grad_norm": 0.7389786839485168, "learning_rate": 1e-06, "loss": 0.6306, "mean_token_accuracy": 0.8140756487846375, "num_tokens": 157519093.0, "step": 494 }, { "epoch": 0.503560528992879, "grad_norm": 0.869776725769043, "learning_rate": 1e-06, "loss": 0.6312, "mean_token_accuracy": 0.8145843744277954, "num_tokens": 157826508.0, "step": 495 }, { "epoch": 0.5045778229908443, "grad_norm": 0.7960457801818848, "learning_rate": 1e-06, "loss": 0.6465, "mean_token_accuracy": 0.8094485998153687, "num_tokens": 158153856.0, "step": 496 }, { "epoch": 0.5055951169888098, "grad_norm": 0.7548427581787109, "learning_rate": 1e-06, "loss": 0.6424, "mean_token_accuracy": 0.8124884366989136, "num_tokens": 158468860.0, "step": 497 }, { "epoch": 0.5066124109867752, "grad_norm": 0.7868984341621399, "learning_rate": 1e-06, "loss": 0.6378, "mean_token_accuracy": 0.8119939565658569, "num_tokens": 158789067.0, "step": 498 }, { "epoch": 0.5076297049847406, "grad_norm": 0.7604773640632629, "learning_rate": 1e-06, "loss": 0.6417, "mean_token_accuracy": 0.8112584948539734, "num_tokens": 159115006.0, "step": 499 }, { "epoch": 0.508646998982706, "grad_norm": 0.7894480228424072, "learning_rate": 1e-06, "loss": 0.6192, "mean_token_accuracy": 0.8169581890106201, "num_tokens": 159430845.0, "step": 500 }, { "epoch": 0.5096642929806714, "grad_norm": 0.7751943469047546, "learning_rate": 1e-06, "loss": 0.643, "mean_token_accuracy": 0.8119826316833496, "num_tokens": 159766028.0, "step": 501 }, { "epoch": 0.5106815869786369, "grad_norm": 0.7243176102638245, "learning_rate": 1e-06, "loss": 0.6145, "mean_token_accuracy": 0.8198132514953613, "num_tokens": 160093084.0, "step": 502 }, { "epoch": 0.5116988809766022, "grad_norm": 0.7752160429954529, "learning_rate": 1e-06, "loss": 0.6371, "mean_token_accuracy": 0.8124977350234985, "num_tokens": 160414551.0, "step": 503 }, { "epoch": 0.5127161749745677, "grad_norm": 0.7632299661636353, "learning_rate": 1e-06, "loss": 0.635, "mean_token_accuracy": 0.8139313459396362, "num_tokens": 160732492.0, "step": 504 }, { "epoch": 0.513733468972533, "grad_norm": 0.8722953796386719, "learning_rate": 1e-06, "loss": 0.6399, "mean_token_accuracy": 0.8128257989883423, "num_tokens": 161063180.0, "step": 505 }, { "epoch": 0.5147507629704985, "grad_norm": 0.7689654231071472, "learning_rate": 1e-06, "loss": 0.6473, "mean_token_accuracy": 0.8114692568778992, "num_tokens": 161375840.0, "step": 506 }, { "epoch": 0.5157680569684638, "grad_norm": 0.7574338912963867, "learning_rate": 1e-06, "loss": 0.6455, "mean_token_accuracy": 0.8104807734489441, "num_tokens": 161712451.0, "step": 507 }, { "epoch": 0.5167853509664293, "grad_norm": 0.721907377243042, "learning_rate": 1e-06, "loss": 0.6275, "mean_token_accuracy": 0.8147372007369995, "num_tokens": 162053742.0, "step": 508 }, { "epoch": 0.5178026449643948, "grad_norm": 0.792888879776001, "learning_rate": 1e-06, "loss": 0.6462, "mean_token_accuracy": 0.8106032609939575, "num_tokens": 162383502.0, "step": 509 }, { "epoch": 0.5188199389623601, "grad_norm": 0.7896516919136047, "learning_rate": 1e-06, "loss": 0.6273, "mean_token_accuracy": 0.8150818347930908, "num_tokens": 162721800.0, "step": 510 }, { "epoch": 0.5198372329603256, "grad_norm": 0.814561128616333, "learning_rate": 1e-06, "loss": 0.6347, "mean_token_accuracy": 0.8130834102630615, "num_tokens": 163040860.0, "step": 511 }, { "epoch": 0.5208545269582909, "grad_norm": 0.7659908533096313, "learning_rate": 1e-06, "loss": 0.6295, "mean_token_accuracy": 0.8147145509719849, "num_tokens": 163354092.0, "step": 512 }, { "epoch": 0.5218718209562564, "grad_norm": 0.7354118824005127, "learning_rate": 1e-06, "loss": 0.6353, "mean_token_accuracy": 0.8131240606307983, "num_tokens": 163677302.0, "step": 513 }, { "epoch": 0.5228891149542217, "grad_norm": 0.7778656482696533, "learning_rate": 1e-06, "loss": 0.617, "mean_token_accuracy": 0.8165751099586487, "num_tokens": 163993198.0, "step": 514 }, { "epoch": 0.5239064089521872, "grad_norm": 0.773829460144043, "learning_rate": 1e-06, "loss": 0.6491, "mean_token_accuracy": 0.8093839883804321, "num_tokens": 164325603.0, "step": 515 }, { "epoch": 0.5249237029501526, "grad_norm": 0.7734475135803223, "learning_rate": 1e-06, "loss": 0.6156, "mean_token_accuracy": 0.818503737449646, "num_tokens": 164634955.0, "step": 516 }, { "epoch": 0.525940996948118, "grad_norm": 0.7791919112205505, "learning_rate": 1e-06, "loss": 0.6157, "mean_token_accuracy": 0.8177918195724487, "num_tokens": 164949736.0, "step": 517 }, { "epoch": 0.5269582909460834, "grad_norm": 0.8125184774398804, "learning_rate": 1e-06, "loss": 0.6312, "mean_token_accuracy": 0.8144404888153076, "num_tokens": 165268681.0, "step": 518 }, { "epoch": 0.5279755849440488, "grad_norm": 0.7059145569801331, "learning_rate": 1e-06, "loss": 0.6018, "mean_token_accuracy": 0.8215758800506592, "num_tokens": 165611395.0, "step": 519 }, { "epoch": 0.5289928789420142, "grad_norm": 0.7735430598258972, "learning_rate": 1e-06, "loss": 0.6619, "mean_token_accuracy": 0.8062798380851746, "num_tokens": 165930541.0, "step": 520 }, { "epoch": 0.5300101729399797, "grad_norm": 0.8060302734375, "learning_rate": 1e-06, "loss": 0.6271, "mean_token_accuracy": 0.8155234456062317, "num_tokens": 166253912.0, "step": 521 }, { "epoch": 0.5310274669379451, "grad_norm": 0.7790757417678833, "learning_rate": 1e-06, "loss": 0.6217, "mean_token_accuracy": 0.8167293667793274, "num_tokens": 166570026.0, "step": 522 }, { "epoch": 0.5320447609359105, "grad_norm": 0.7891453504562378, "learning_rate": 1e-06, "loss": 0.6385, "mean_token_accuracy": 0.8134738206863403, "num_tokens": 166891414.0, "step": 523 }, { "epoch": 0.5330620549338759, "grad_norm": 0.7717955708503723, "learning_rate": 1e-06, "loss": 0.6457, "mean_token_accuracy": 0.8105074167251587, "num_tokens": 167199448.0, "step": 524 }, { "epoch": 0.5340793489318413, "grad_norm": 0.7495227456092834, "learning_rate": 1e-06, "loss": 0.6305, "mean_token_accuracy": 0.8140999674797058, "num_tokens": 167508836.0, "step": 525 }, { "epoch": 0.5350966429298067, "grad_norm": 0.7756131887435913, "learning_rate": 1e-06, "loss": 0.6392, "mean_token_accuracy": 0.8117661476135254, "num_tokens": 167821540.0, "step": 526 }, { "epoch": 0.5361139369277721, "grad_norm": 0.7604585886001587, "learning_rate": 1e-06, "loss": 0.6174, "mean_token_accuracy": 0.8179171085357666, "num_tokens": 168163810.0, "step": 527 }, { "epoch": 0.5371312309257376, "grad_norm": 0.7594449520111084, "learning_rate": 1e-06, "loss": 0.6101, "mean_token_accuracy": 0.8211343288421631, "num_tokens": 168465144.0, "step": 528 }, { "epoch": 0.5381485249237029, "grad_norm": 0.7848928570747375, "learning_rate": 1e-06, "loss": 0.6356, "mean_token_accuracy": 0.812796413898468, "num_tokens": 168772296.0, "step": 529 }, { "epoch": 0.5391658189216684, "grad_norm": 0.8153516054153442, "learning_rate": 1e-06, "loss": 0.6366, "mean_token_accuracy": 0.8122652769088745, "num_tokens": 169084941.0, "step": 530 }, { "epoch": 0.5401831129196337, "grad_norm": 0.8312289714813232, "learning_rate": 1e-06, "loss": 0.6125, "mean_token_accuracy": 0.8194169998168945, "num_tokens": 169398931.0, "step": 531 }, { "epoch": 0.5412004069175992, "grad_norm": 0.8274112939834595, "learning_rate": 1e-06, "loss": 0.6204, "mean_token_accuracy": 0.8165071606636047, "num_tokens": 169713890.0, "step": 532 }, { "epoch": 0.5422177009155646, "grad_norm": 0.7571256756782532, "learning_rate": 1e-06, "loss": 0.6399, "mean_token_accuracy": 0.8139414191246033, "num_tokens": 170042021.0, "step": 533 }, { "epoch": 0.54323499491353, "grad_norm": 0.8405437469482422, "learning_rate": 1e-06, "loss": 0.6168, "mean_token_accuracy": 0.8171428442001343, "num_tokens": 170344681.0, "step": 534 }, { "epoch": 0.5442522889114955, "grad_norm": 0.7748994827270508, "learning_rate": 1e-06, "loss": 0.6345, "mean_token_accuracy": 0.814400315284729, "num_tokens": 170662002.0, "step": 535 }, { "epoch": 0.5452695829094608, "grad_norm": 0.7834451198577881, "learning_rate": 1e-06, "loss": 0.64, "mean_token_accuracy": 0.8116738796234131, "num_tokens": 170965381.0, "step": 536 }, { "epoch": 0.5462868769074263, "grad_norm": 0.7792726159095764, "learning_rate": 1e-06, "loss": 0.6654, "mean_token_accuracy": 0.8080847263336182, "num_tokens": 171297534.0, "step": 537 }, { "epoch": 0.5473041709053916, "grad_norm": 0.7707831859588623, "learning_rate": 1e-06, "loss": 0.6332, "mean_token_accuracy": 0.8134000897407532, "num_tokens": 171615049.0, "step": 538 }, { "epoch": 0.5483214649033571, "grad_norm": 0.8062372803688049, "learning_rate": 1e-06, "loss": 0.6126, "mean_token_accuracy": 0.8191686868667603, "num_tokens": 171931276.0, "step": 539 }, { "epoch": 0.5493387589013224, "grad_norm": 0.8075754046440125, "learning_rate": 1e-06, "loss": 0.6238, "mean_token_accuracy": 0.8167059421539307, "num_tokens": 172259334.0, "step": 540 }, { "epoch": 0.5503560528992879, "grad_norm": 0.778333306312561, "learning_rate": 1e-06, "loss": 0.618, "mean_token_accuracy": 0.8180431127548218, "num_tokens": 172580086.0, "step": 541 }, { "epoch": 0.5513733468972533, "grad_norm": 0.7373014688491821, "learning_rate": 1e-06, "loss": 0.6378, "mean_token_accuracy": 0.8125739097595215, "num_tokens": 172895819.0, "step": 542 }, { "epoch": 0.5523906408952187, "grad_norm": 0.7623785734176636, "learning_rate": 1e-06, "loss": 0.6158, "mean_token_accuracy": 0.8183821439743042, "num_tokens": 173207899.0, "step": 543 }, { "epoch": 0.5534079348931842, "grad_norm": 0.7864964604377747, "learning_rate": 1e-06, "loss": 0.6314, "mean_token_accuracy": 0.8134520053863525, "num_tokens": 173547290.0, "step": 544 }, { "epoch": 0.5544252288911495, "grad_norm": 0.8133346438407898, "learning_rate": 1e-06, "loss": 0.6467, "mean_token_accuracy": 0.811774492263794, "num_tokens": 173872148.0, "step": 545 }, { "epoch": 0.555442522889115, "grad_norm": 0.828346848487854, "learning_rate": 1e-06, "loss": 0.6425, "mean_token_accuracy": 0.8110955953598022, "num_tokens": 174174594.0, "step": 546 }, { "epoch": 0.5564598168870803, "grad_norm": 0.7909507751464844, "learning_rate": 1e-06, "loss": 0.6289, "mean_token_accuracy": 0.8137595057487488, "num_tokens": 174483118.0, "step": 547 }, { "epoch": 0.5574771108850458, "grad_norm": 0.7661901712417603, "learning_rate": 1e-06, "loss": 0.6233, "mean_token_accuracy": 0.8151018619537354, "num_tokens": 174798555.0, "step": 548 }, { "epoch": 0.5584944048830112, "grad_norm": 0.7890106439590454, "learning_rate": 1e-06, "loss": 0.6207, "mean_token_accuracy": 0.8171303868293762, "num_tokens": 175130207.0, "step": 549 }, { "epoch": 0.5595116988809766, "grad_norm": 0.7659645676612854, "learning_rate": 1e-06, "loss": 0.6083, "mean_token_accuracy": 0.8203459978103638, "num_tokens": 175445505.0, "step": 550 }, { "epoch": 0.560528992878942, "grad_norm": 0.7757659554481506, "learning_rate": 1e-06, "loss": 0.6364, "mean_token_accuracy": 0.8122877478599548, "num_tokens": 175767713.0, "step": 551 }, { "epoch": 0.5615462868769074, "grad_norm": 0.7445358037948608, "learning_rate": 1e-06, "loss": 0.646, "mean_token_accuracy": 0.8095182180404663, "num_tokens": 176094548.0, "step": 552 }, { "epoch": 0.5625635808748728, "grad_norm": 0.7649447321891785, "learning_rate": 1e-06, "loss": 0.6224, "mean_token_accuracy": 0.8160684704780579, "num_tokens": 176419770.0, "step": 553 }, { "epoch": 0.5635808748728383, "grad_norm": 0.78580242395401, "learning_rate": 1e-06, "loss": 0.6275, "mean_token_accuracy": 0.814705491065979, "num_tokens": 176731375.0, "step": 554 }, { "epoch": 0.5645981688708036, "grad_norm": 0.7920994162559509, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8193442821502686, "num_tokens": 177037040.0, "step": 555 }, { "epoch": 0.5656154628687691, "grad_norm": 0.7603932023048401, "learning_rate": 1e-06, "loss": 0.6411, "mean_token_accuracy": 0.8114141225814819, "num_tokens": 177353869.0, "step": 556 }, { "epoch": 0.5666327568667345, "grad_norm": 0.8625222444534302, "learning_rate": 1e-06, "loss": 0.6059, "mean_token_accuracy": 0.8205225467681885, "num_tokens": 177663239.0, "step": 557 }, { "epoch": 0.5676500508646999, "grad_norm": 0.732420027256012, "learning_rate": 1e-06, "loss": 0.6178, "mean_token_accuracy": 0.8171700239181519, "num_tokens": 177989713.0, "step": 558 }, { "epoch": 0.5686673448626653, "grad_norm": 0.7737362384796143, "learning_rate": 1e-06, "loss": 0.6318, "mean_token_accuracy": 0.8142409324645996, "num_tokens": 178311252.0, "step": 559 }, { "epoch": 0.5696846388606307, "grad_norm": 0.8172044157981873, "learning_rate": 1e-06, "loss": 0.6552, "mean_token_accuracy": 0.807847261428833, "num_tokens": 178635634.0, "step": 560 }, { "epoch": 0.5707019328585962, "grad_norm": 0.844097375869751, "learning_rate": 1e-06, "loss": 0.6421, "mean_token_accuracy": 0.8111605048179626, "num_tokens": 178948373.0, "step": 561 }, { "epoch": 0.5717192268565615, "grad_norm": 0.7378110885620117, "learning_rate": 1e-06, "loss": 0.6098, "mean_token_accuracy": 0.8191424012184143, "num_tokens": 179265598.0, "step": 562 }, { "epoch": 0.572736520854527, "grad_norm": 0.779130756855011, "learning_rate": 1e-06, "loss": 0.6243, "mean_token_accuracy": 0.8169468641281128, "num_tokens": 179577100.0, "step": 563 }, { "epoch": 0.5737538148524923, "grad_norm": 0.9463231563568115, "learning_rate": 1e-06, "loss": 0.6387, "mean_token_accuracy": 0.8117104768753052, "num_tokens": 179892505.0, "step": 564 }, { "epoch": 0.5747711088504578, "grad_norm": 0.7623103857040405, "learning_rate": 1e-06, "loss": 0.635, "mean_token_accuracy": 0.8120288252830505, "num_tokens": 180223161.0, "step": 565 }, { "epoch": 0.5757884028484231, "grad_norm": 0.8463129997253418, "learning_rate": 1e-06, "loss": 0.6403, "mean_token_accuracy": 0.8121374249458313, "num_tokens": 180529292.0, "step": 566 }, { "epoch": 0.5768056968463886, "grad_norm": 0.7645483613014221, "learning_rate": 1e-06, "loss": 0.6161, "mean_token_accuracy": 0.8175876140594482, "num_tokens": 180831990.0, "step": 567 }, { "epoch": 0.5778229908443541, "grad_norm": 0.7431308627128601, "learning_rate": 1e-06, "loss": 0.6373, "mean_token_accuracy": 0.8129169940948486, "num_tokens": 181158179.0, "step": 568 }, { "epoch": 0.5788402848423194, "grad_norm": 0.7826768159866333, "learning_rate": 1e-06, "loss": 0.6067, "mean_token_accuracy": 0.8198922276496887, "num_tokens": 181465876.0, "step": 569 }, { "epoch": 0.5798575788402849, "grad_norm": 0.8153584599494934, "learning_rate": 1e-06, "loss": 0.6368, "mean_token_accuracy": 0.811568558216095, "num_tokens": 181772486.0, "step": 570 }, { "epoch": 0.5808748728382502, "grad_norm": 1.32663094997406, "learning_rate": 1e-06, "loss": 0.6331, "mean_token_accuracy": 0.8144712448120117, "num_tokens": 182092553.0, "step": 571 }, { "epoch": 0.5818921668362157, "grad_norm": 0.7690805792808533, "learning_rate": 1e-06, "loss": 0.6395, "mean_token_accuracy": 0.8114436268806458, "num_tokens": 182399236.0, "step": 572 }, { "epoch": 0.582909460834181, "grad_norm": 0.8244120478630066, "learning_rate": 1e-06, "loss": 0.6425, "mean_token_accuracy": 0.810742974281311, "num_tokens": 182711095.0, "step": 573 }, { "epoch": 0.5839267548321465, "grad_norm": 0.7508371472358704, "learning_rate": 1e-06, "loss": 0.656, "mean_token_accuracy": 0.8080297112464905, "num_tokens": 183045279.0, "step": 574 }, { "epoch": 0.5849440488301119, "grad_norm": 0.7884571552276611, "learning_rate": 1e-06, "loss": 0.6173, "mean_token_accuracy": 0.817246675491333, "num_tokens": 183377751.0, "step": 575 }, { "epoch": 0.5859613428280773, "grad_norm": 0.828568696975708, "learning_rate": 1e-06, "loss": 0.6211, "mean_token_accuracy": 0.8165144920349121, "num_tokens": 183676743.0, "step": 576 }, { "epoch": 0.5869786368260427, "grad_norm": 0.7600045800209045, "learning_rate": 1e-06, "loss": 0.6187, "mean_token_accuracy": 0.817339301109314, "num_tokens": 184006491.0, "step": 577 }, { "epoch": 0.5879959308240081, "grad_norm": 0.7350272536277771, "learning_rate": 1e-06, "loss": 0.6152, "mean_token_accuracy": 0.8175899386405945, "num_tokens": 184334536.0, "step": 578 }, { "epoch": 0.5890132248219736, "grad_norm": 0.7769359350204468, "learning_rate": 1e-06, "loss": 0.6265, "mean_token_accuracy": 0.8150594830513, "num_tokens": 184644555.0, "step": 579 }, { "epoch": 0.590030518819939, "grad_norm": 0.791875422000885, "learning_rate": 1e-06, "loss": 0.629, "mean_token_accuracy": 0.8136811256408691, "num_tokens": 184956764.0, "step": 580 }, { "epoch": 0.5910478128179044, "grad_norm": 0.7764705419540405, "learning_rate": 1e-06, "loss": 0.6242, "mean_token_accuracy": 0.8154654502868652, "num_tokens": 185278874.0, "step": 581 }, { "epoch": 0.5920651068158698, "grad_norm": 0.7930245995521545, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.8177323341369629, "num_tokens": 185603559.0, "step": 582 }, { "epoch": 0.5930824008138352, "grad_norm": 0.7742440700531006, "learning_rate": 1e-06, "loss": 0.6298, "mean_token_accuracy": 0.8144494295120239, "num_tokens": 185904158.0, "step": 583 }, { "epoch": 0.5940996948118006, "grad_norm": 0.8462097644805908, "learning_rate": 1e-06, "loss": 0.6132, "mean_token_accuracy": 0.8187432885169983, "num_tokens": 186208990.0, "step": 584 }, { "epoch": 0.595116988809766, "grad_norm": 0.7964716553688049, "learning_rate": 1e-06, "loss": 0.6499, "mean_token_accuracy": 0.8092055320739746, "num_tokens": 186527391.0, "step": 585 }, { "epoch": 0.5961342828077314, "grad_norm": 0.8105505108833313, "learning_rate": 1e-06, "loss": 0.6131, "mean_token_accuracy": 0.8204156756401062, "num_tokens": 186851711.0, "step": 586 }, { "epoch": 0.5971515768056969, "grad_norm": 0.7496419548988342, "learning_rate": 1e-06, "loss": 0.6317, "mean_token_accuracy": 0.8129917979240417, "num_tokens": 187158410.0, "step": 587 }, { "epoch": 0.5981688708036622, "grad_norm": 0.7854833602905273, "learning_rate": 1e-06, "loss": 0.6025, "mean_token_accuracy": 0.8220297694206238, "num_tokens": 187492387.0, "step": 588 }, { "epoch": 0.5991861648016277, "grad_norm": 0.854580819606781, "learning_rate": 1e-06, "loss": 0.6447, "mean_token_accuracy": 0.8100663423538208, "num_tokens": 187815200.0, "step": 589 }, { "epoch": 0.6002034587995931, "grad_norm": 0.7688719034194946, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8207713961601257, "num_tokens": 188141681.0, "step": 590 }, { "epoch": 0.6012207527975585, "grad_norm": 0.7543858885765076, "learning_rate": 1e-06, "loss": 0.6387, "mean_token_accuracy": 0.8124486207962036, "num_tokens": 188459816.0, "step": 591 }, { "epoch": 0.602238046795524, "grad_norm": 0.7601234912872314, "learning_rate": 1e-06, "loss": 0.645, "mean_token_accuracy": 0.8103123307228088, "num_tokens": 188793110.0, "step": 592 }, { "epoch": 0.6032553407934893, "grad_norm": 0.7299609780311584, "learning_rate": 1e-06, "loss": 0.6081, "mean_token_accuracy": 0.8188794851303101, "num_tokens": 189123138.0, "step": 593 }, { "epoch": 0.6042726347914548, "grad_norm": 0.7960324883460999, "learning_rate": 1e-06, "loss": 0.6144, "mean_token_accuracy": 0.8179804086685181, "num_tokens": 189448908.0, "step": 594 }, { "epoch": 0.6052899287894201, "grad_norm": 0.799144983291626, "learning_rate": 1e-06, "loss": 0.6413, "mean_token_accuracy": 0.8109234571456909, "num_tokens": 189760563.0, "step": 595 }, { "epoch": 0.6063072227873856, "grad_norm": 0.7431643605232239, "learning_rate": 1e-06, "loss": 0.6339, "mean_token_accuracy": 0.8132368326187134, "num_tokens": 190067307.0, "step": 596 }, { "epoch": 0.6073245167853509, "grad_norm": 0.7618584632873535, "learning_rate": 1e-06, "loss": 0.6206, "mean_token_accuracy": 0.8160717487335205, "num_tokens": 190372698.0, "step": 597 }, { "epoch": 0.6083418107833164, "grad_norm": 0.7846772074699402, "learning_rate": 1e-06, "loss": 0.6288, "mean_token_accuracy": 0.8136916756629944, "num_tokens": 190679521.0, "step": 598 }, { "epoch": 0.6093591047812817, "grad_norm": 0.7604075074195862, "learning_rate": 1e-06, "loss": 0.6257, "mean_token_accuracy": 0.8160173892974854, "num_tokens": 190990350.0, "step": 599 }, { "epoch": 0.6103763987792472, "grad_norm": 0.8070312142372131, "learning_rate": 1e-06, "loss": 0.6373, "mean_token_accuracy": 0.812640905380249, "num_tokens": 191298356.0, "step": 600 }, { "epoch": 0.6113936927772126, "grad_norm": 0.7849087119102478, "learning_rate": 1e-06, "loss": 0.5958, "mean_token_accuracy": 0.8239523768424988, "num_tokens": 191607061.0, "step": 601 }, { "epoch": 0.612410986775178, "grad_norm": 0.7357282042503357, "learning_rate": 1e-06, "loss": 0.6175, "mean_token_accuracy": 0.8173874616622925, "num_tokens": 191920982.0, "step": 602 }, { "epoch": 0.6134282807731435, "grad_norm": 0.7360347509384155, "learning_rate": 1e-06, "loss": 0.6203, "mean_token_accuracy": 0.816581130027771, "num_tokens": 192244544.0, "step": 603 }, { "epoch": 0.6144455747711088, "grad_norm": 0.7564297318458557, "learning_rate": 1e-06, "loss": 0.6332, "mean_token_accuracy": 0.8155977129936218, "num_tokens": 192568952.0, "step": 604 }, { "epoch": 0.6154628687690743, "grad_norm": 0.7610177993774414, "learning_rate": 1e-06, "loss": 0.6161, "mean_token_accuracy": 0.818304717540741, "num_tokens": 192871340.0, "step": 605 }, { "epoch": 0.6164801627670397, "grad_norm": 0.7376631498336792, "learning_rate": 1e-06, "loss": 0.6146, "mean_token_accuracy": 0.8194389343261719, "num_tokens": 193194025.0, "step": 606 }, { "epoch": 0.6174974567650051, "grad_norm": 0.8029021620750427, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8219469785690308, "num_tokens": 193505599.0, "step": 607 }, { "epoch": 0.6185147507629705, "grad_norm": 0.746333122253418, "learning_rate": 1e-06, "loss": 0.6551, "mean_token_accuracy": 0.8087726831436157, "num_tokens": 193846107.0, "step": 608 }, { "epoch": 0.6195320447609359, "grad_norm": 0.7344440221786499, "learning_rate": 1e-06, "loss": 0.6024, "mean_token_accuracy": 0.8207103610038757, "num_tokens": 194162440.0, "step": 609 }, { "epoch": 0.6205493387589013, "grad_norm": 0.7551791071891785, "learning_rate": 1e-06, "loss": 0.6188, "mean_token_accuracy": 0.8163305521011353, "num_tokens": 194471827.0, "step": 610 }, { "epoch": 0.6215666327568667, "grad_norm": 0.7398301959037781, "learning_rate": 1e-06, "loss": 0.6224, "mean_token_accuracy": 0.8153471946716309, "num_tokens": 194798176.0, "step": 611 }, { "epoch": 0.6225839267548321, "grad_norm": 0.794724702835083, "learning_rate": 1e-06, "loss": 0.6403, "mean_token_accuracy": 0.810512900352478, "num_tokens": 195115010.0, "step": 612 }, { "epoch": 0.6236012207527976, "grad_norm": 0.7791882753372192, "learning_rate": 1e-06, "loss": 0.6012, "mean_token_accuracy": 0.822370171546936, "num_tokens": 195442166.0, "step": 613 }, { "epoch": 0.624618514750763, "grad_norm": 0.7617864608764648, "learning_rate": 1e-06, "loss": 0.5974, "mean_token_accuracy": 0.8227953314781189, "num_tokens": 195755945.0, "step": 614 }, { "epoch": 0.6256358087487284, "grad_norm": 0.757507860660553, "learning_rate": 1e-06, "loss": 0.6, "mean_token_accuracy": 0.8223221302032471, "num_tokens": 196076642.0, "step": 615 }, { "epoch": 0.6266531027466938, "grad_norm": 0.7687931656837463, "learning_rate": 1e-06, "loss": 0.632, "mean_token_accuracy": 0.8135921955108643, "num_tokens": 196384640.0, "step": 616 }, { "epoch": 0.6276703967446592, "grad_norm": 0.7535430192947388, "learning_rate": 1e-06, "loss": 0.6511, "mean_token_accuracy": 0.8093615770339966, "num_tokens": 196709416.0, "step": 617 }, { "epoch": 0.6286876907426246, "grad_norm": 0.7329970598220825, "learning_rate": 1e-06, "loss": 0.6154, "mean_token_accuracy": 0.8180556297302246, "num_tokens": 197038688.0, "step": 618 }, { "epoch": 0.62970498474059, "grad_norm": 0.7846882343292236, "learning_rate": 1e-06, "loss": 0.65, "mean_token_accuracy": 0.8091417551040649, "num_tokens": 197361082.0, "step": 619 }, { "epoch": 0.6307222787385555, "grad_norm": 0.7763864398002625, "learning_rate": 1e-06, "loss": 0.608, "mean_token_accuracy": 0.8196543455123901, "num_tokens": 197654672.0, "step": 620 }, { "epoch": 0.6317395727365208, "grad_norm": 0.7358604669570923, "learning_rate": 1e-06, "loss": 0.6206, "mean_token_accuracy": 0.8170638084411621, "num_tokens": 197972971.0, "step": 621 }, { "epoch": 0.6327568667344863, "grad_norm": 0.7396625876426697, "learning_rate": 1e-06, "loss": 0.5936, "mean_token_accuracy": 0.8225463628768921, "num_tokens": 198288270.0, "step": 622 }, { "epoch": 0.6337741607324516, "grad_norm": 0.7535018920898438, "learning_rate": 1e-06, "loss": 0.6097, "mean_token_accuracy": 0.818221390247345, "num_tokens": 198600564.0, "step": 623 }, { "epoch": 0.6347914547304171, "grad_norm": 0.771710991859436, "learning_rate": 1e-06, "loss": 0.5982, "mean_token_accuracy": 0.8219149708747864, "num_tokens": 198919816.0, "step": 624 }, { "epoch": 0.6358087487283826, "grad_norm": 0.7761126160621643, "learning_rate": 1e-06, "loss": 0.6438, "mean_token_accuracy": 0.8097991943359375, "num_tokens": 199234621.0, "step": 625 }, { "epoch": 0.6368260427263479, "grad_norm": 0.7397571206092834, "learning_rate": 1e-06, "loss": 0.6168, "mean_token_accuracy": 0.8178320527076721, "num_tokens": 199581880.0, "step": 626 }, { "epoch": 0.6378433367243134, "grad_norm": 0.8031889200210571, "learning_rate": 1e-06, "loss": 0.6385, "mean_token_accuracy": 0.8121150732040405, "num_tokens": 199887035.0, "step": 627 }, { "epoch": 0.6388606307222787, "grad_norm": 0.743266761302948, "learning_rate": 1e-06, "loss": 0.6191, "mean_token_accuracy": 0.8165163397789001, "num_tokens": 200218153.0, "step": 628 }, { "epoch": 0.6398779247202442, "grad_norm": 0.789598286151886, "learning_rate": 1e-06, "loss": 0.6343, "mean_token_accuracy": 0.814096212387085, "num_tokens": 200538925.0, "step": 629 }, { "epoch": 0.6408952187182095, "grad_norm": 0.7879233360290527, "learning_rate": 1e-06, "loss": 0.6081, "mean_token_accuracy": 0.8194499015808105, "num_tokens": 200852605.0, "step": 630 }, { "epoch": 0.641912512716175, "grad_norm": 0.7725964784622192, "learning_rate": 1e-06, "loss": 0.6239, "mean_token_accuracy": 0.8154667615890503, "num_tokens": 201171353.0, "step": 631 }, { "epoch": 0.6429298067141404, "grad_norm": 0.7540669441223145, "learning_rate": 1e-06, "loss": 0.6222, "mean_token_accuracy": 0.8165876865386963, "num_tokens": 201496911.0, "step": 632 }, { "epoch": 0.6439471007121058, "grad_norm": 0.7851372957229614, "learning_rate": 1e-06, "loss": 0.6429, "mean_token_accuracy": 0.8111737370491028, "num_tokens": 201822744.0, "step": 633 }, { "epoch": 0.6449643947100712, "grad_norm": 0.7377258539199829, "learning_rate": 1e-06, "loss": 0.6028, "mean_token_accuracy": 0.8218101859092712, "num_tokens": 202163172.0, "step": 634 }, { "epoch": 0.6459816887080366, "grad_norm": 0.8279411792755127, "learning_rate": 1e-06, "loss": 0.621, "mean_token_accuracy": 0.8161522150039673, "num_tokens": 202477704.0, "step": 635 }, { "epoch": 0.646998982706002, "grad_norm": 0.7455507516860962, "learning_rate": 1e-06, "loss": 0.6541, "mean_token_accuracy": 0.8080745935440063, "num_tokens": 202799169.0, "step": 636 }, { "epoch": 0.6480162767039674, "grad_norm": 0.7769946455955505, "learning_rate": 1e-06, "loss": 0.6183, "mean_token_accuracy": 0.8177330493927002, "num_tokens": 203103019.0, "step": 637 }, { "epoch": 0.6490335707019329, "grad_norm": 0.7767770290374756, "learning_rate": 1e-06, "loss": 0.6348, "mean_token_accuracy": 0.8127952218055725, "num_tokens": 203417686.0, "step": 638 }, { "epoch": 0.6500508646998983, "grad_norm": 0.7546711564064026, "learning_rate": 1e-06, "loss": 0.6318, "mean_token_accuracy": 0.813634991645813, "num_tokens": 203737547.0, "step": 639 }, { "epoch": 0.6510681586978637, "grad_norm": 0.7748486995697021, "learning_rate": 1e-06, "loss": 0.6217, "mean_token_accuracy": 0.8149056434631348, "num_tokens": 204040525.0, "step": 640 }, { "epoch": 0.6520854526958291, "grad_norm": 0.7774174213409424, "learning_rate": 1e-06, "loss": 0.6058, "mean_token_accuracy": 0.8201233744621277, "num_tokens": 204371623.0, "step": 641 }, { "epoch": 0.6531027466937945, "grad_norm": 0.8860936164855957, "learning_rate": 1e-06, "loss": 0.6045, "mean_token_accuracy": 0.8205939531326294, "num_tokens": 204695860.0, "step": 642 }, { "epoch": 0.6541200406917599, "grad_norm": 0.7313113212585449, "learning_rate": 1e-06, "loss": 0.5968, "mean_token_accuracy": 0.8227372169494629, "num_tokens": 205022471.0, "step": 643 }, { "epoch": 0.6551373346897253, "grad_norm": 0.8307287096977234, "learning_rate": 1e-06, "loss": 0.6163, "mean_token_accuracy": 0.8170334100723267, "num_tokens": 205338248.0, "step": 644 }, { "epoch": 0.6561546286876907, "grad_norm": 0.7945787310600281, "learning_rate": 1e-06, "loss": 0.6379, "mean_token_accuracy": 0.810707688331604, "num_tokens": 205654689.0, "step": 645 }, { "epoch": 0.6571719226856562, "grad_norm": 0.7601882219314575, "learning_rate": 1e-06, "loss": 0.6534, "mean_token_accuracy": 0.8086690902709961, "num_tokens": 205984399.0, "step": 646 }, { "epoch": 0.6581892166836215, "grad_norm": 0.7996160387992859, "learning_rate": 1e-06, "loss": 0.6397, "mean_token_accuracy": 0.8113598227500916, "num_tokens": 206308245.0, "step": 647 }, { "epoch": 0.659206510681587, "grad_norm": 0.7633128762245178, "learning_rate": 1e-06, "loss": 0.6068, "mean_token_accuracy": 0.8195191621780396, "num_tokens": 206613022.0, "step": 648 }, { "epoch": 0.6602238046795524, "grad_norm": 0.7742624878883362, "learning_rate": 1e-06, "loss": 0.6328, "mean_token_accuracy": 0.8122345209121704, "num_tokens": 206935916.0, "step": 649 }, { "epoch": 0.6612410986775178, "grad_norm": 0.7525912523269653, "learning_rate": 1e-06, "loss": 0.5972, "mean_token_accuracy": 0.823207437992096, "num_tokens": 207249857.0, "step": 650 }, { "epoch": 0.6622583926754833, "grad_norm": 0.7381570935249329, "learning_rate": 1e-06, "loss": 0.6392, "mean_token_accuracy": 0.8117334842681885, "num_tokens": 207585946.0, "step": 651 }, { "epoch": 0.6632756866734486, "grad_norm": 0.7533280849456787, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8249242901802063, "num_tokens": 207905907.0, "step": 652 }, { "epoch": 0.6642929806714141, "grad_norm": 0.7347438931465149, "learning_rate": 1e-06, "loss": 0.5935, "mean_token_accuracy": 0.8232535123825073, "num_tokens": 208221915.0, "step": 653 }, { "epoch": 0.6653102746693794, "grad_norm": 0.8024598360061646, "learning_rate": 1e-06, "loss": 0.6038, "mean_token_accuracy": 0.8214566707611084, "num_tokens": 208541953.0, "step": 654 }, { "epoch": 0.6663275686673449, "grad_norm": 0.7610267400741577, "learning_rate": 1e-06, "loss": 0.6102, "mean_token_accuracy": 0.819033145904541, "num_tokens": 208855549.0, "step": 655 }, { "epoch": 0.6673448626653102, "grad_norm": 0.7803341746330261, "learning_rate": 1e-06, "loss": 0.6346, "mean_token_accuracy": 0.8129167556762695, "num_tokens": 209167815.0, "step": 656 }, { "epoch": 0.6683621566632757, "grad_norm": 0.9436068534851074, "learning_rate": 1e-06, "loss": 0.6464, "mean_token_accuracy": 0.8097710013389587, "num_tokens": 209506826.0, "step": 657 }, { "epoch": 0.669379450661241, "grad_norm": 0.7793200612068176, "learning_rate": 1e-06, "loss": 0.5991, "mean_token_accuracy": 0.8215120434761047, "num_tokens": 209810131.0, "step": 658 }, { "epoch": 0.6703967446592065, "grad_norm": 0.7526964545249939, "learning_rate": 1e-06, "loss": 0.6149, "mean_token_accuracy": 0.8179228901863098, "num_tokens": 210123131.0, "step": 659 }, { "epoch": 0.671414038657172, "grad_norm": 0.7783557772636414, "learning_rate": 1e-06, "loss": 0.6143, "mean_token_accuracy": 0.8184784650802612, "num_tokens": 210444300.0, "step": 660 }, { "epoch": 0.6724313326551373, "grad_norm": 0.7479385733604431, "learning_rate": 1e-06, "loss": 0.6148, "mean_token_accuracy": 0.816991925239563, "num_tokens": 210771386.0, "step": 661 }, { "epoch": 0.6734486266531028, "grad_norm": 0.7698660492897034, "learning_rate": 1e-06, "loss": 0.6198, "mean_token_accuracy": 0.8170925974845886, "num_tokens": 211073799.0, "step": 662 }, { "epoch": 0.6744659206510681, "grad_norm": 0.7108475565910339, "learning_rate": 1e-06, "loss": 0.6239, "mean_token_accuracy": 0.816068172454834, "num_tokens": 211407370.0, "step": 663 }, { "epoch": 0.6754832146490336, "grad_norm": 0.780982255935669, "learning_rate": 1e-06, "loss": 0.6107, "mean_token_accuracy": 0.8190521001815796, "num_tokens": 211739189.0, "step": 664 }, { "epoch": 0.676500508646999, "grad_norm": 0.7685346007347107, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.8201148509979248, "num_tokens": 212047699.0, "step": 665 }, { "epoch": 0.6775178026449644, "grad_norm": 0.7723684310913086, "learning_rate": 1e-06, "loss": 0.6275, "mean_token_accuracy": 0.8142590522766113, "num_tokens": 212350768.0, "step": 666 }, { "epoch": 0.6785350966429298, "grad_norm": 0.8147526979446411, "learning_rate": 1e-06, "loss": 0.6257, "mean_token_accuracy": 0.8146811723709106, "num_tokens": 212656414.0, "step": 667 }, { "epoch": 0.6795523906408952, "grad_norm": 0.7926346659660339, "learning_rate": 1e-06, "loss": 0.6154, "mean_token_accuracy": 0.8165136575698853, "num_tokens": 212970630.0, "step": 668 }, { "epoch": 0.6805696846388606, "grad_norm": 0.7599271535873413, "learning_rate": 1e-06, "loss": 0.6187, "mean_token_accuracy": 0.8165913820266724, "num_tokens": 213296625.0, "step": 669 }, { "epoch": 0.681586978636826, "grad_norm": 0.7710536122322083, "learning_rate": 1e-06, "loss": 0.5976, "mean_token_accuracy": 0.8237712383270264, "num_tokens": 213608058.0, "step": 670 }, { "epoch": 0.6826042726347915, "grad_norm": 0.754758894443512, "learning_rate": 1e-06, "loss": 0.6074, "mean_token_accuracy": 0.8195925951004028, "num_tokens": 213919287.0, "step": 671 }, { "epoch": 0.6836215666327569, "grad_norm": 0.7790162563323975, "learning_rate": 1e-06, "loss": 0.6104, "mean_token_accuracy": 0.8195450305938721, "num_tokens": 214248836.0, "step": 672 }, { "epoch": 0.6846388606307223, "grad_norm": 0.754189133644104, "learning_rate": 1e-06, "loss": 0.6124, "mean_token_accuracy": 0.8181887865066528, "num_tokens": 214578559.0, "step": 673 }, { "epoch": 0.6856561546286877, "grad_norm": 0.7615801692008972, "learning_rate": 1e-06, "loss": 0.6231, "mean_token_accuracy": 0.8142949342727661, "num_tokens": 214902909.0, "step": 674 }, { "epoch": 0.6866734486266531, "grad_norm": 0.8176745772361755, "learning_rate": 1e-06, "loss": 0.6138, "mean_token_accuracy": 0.817600667476654, "num_tokens": 215220787.0, "step": 675 }, { "epoch": 0.6876907426246185, "grad_norm": 0.7809671759605408, "learning_rate": 1e-06, "loss": 0.633, "mean_token_accuracy": 0.8143289089202881, "num_tokens": 215532596.0, "step": 676 }, { "epoch": 0.688708036622584, "grad_norm": 0.7892170548439026, "learning_rate": 1e-06, "loss": 0.6267, "mean_token_accuracy": 0.8146532773971558, "num_tokens": 215854411.0, "step": 677 }, { "epoch": 0.6897253306205493, "grad_norm": 0.7844973802566528, "learning_rate": 1e-06, "loss": 0.6125, "mean_token_accuracy": 0.8172857761383057, "num_tokens": 216176025.0, "step": 678 }, { "epoch": 0.6907426246185148, "grad_norm": 0.7520744204521179, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.819256603717804, "num_tokens": 216490560.0, "step": 679 }, { "epoch": 0.6917599186164801, "grad_norm": 0.8263868689537048, "learning_rate": 1e-06, "loss": 0.6214, "mean_token_accuracy": 0.8156100511550903, "num_tokens": 216788892.0, "step": 680 }, { "epoch": 0.6927772126144456, "grad_norm": 0.7588439583778381, "learning_rate": 1e-06, "loss": 0.6057, "mean_token_accuracy": 0.8199942111968994, "num_tokens": 217091789.0, "step": 681 }, { "epoch": 0.6937945066124109, "grad_norm": 0.7323412299156189, "learning_rate": 1e-06, "loss": 0.6175, "mean_token_accuracy": 0.8157312273979187, "num_tokens": 217391649.0, "step": 682 }, { "epoch": 0.6948118006103764, "grad_norm": 0.8044806122779846, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8197875022888184, "num_tokens": 217705676.0, "step": 683 }, { "epoch": 0.6958290946083419, "grad_norm": 0.7340426445007324, "learning_rate": 1e-06, "loss": 0.6376, "mean_token_accuracy": 0.8123747706413269, "num_tokens": 218034192.0, "step": 684 }, { "epoch": 0.6968463886063072, "grad_norm": 0.7277271747589111, "learning_rate": 1e-06, "loss": 0.6173, "mean_token_accuracy": 0.8182982206344604, "num_tokens": 218361880.0, "step": 685 }, { "epoch": 0.6978636826042727, "grad_norm": 0.7453147172927856, "learning_rate": 1e-06, "loss": 0.6026, "mean_token_accuracy": 0.8207399845123291, "num_tokens": 218677170.0, "step": 686 }, { "epoch": 0.698880976602238, "grad_norm": 0.7955684661865234, "learning_rate": 1e-06, "loss": 0.6152, "mean_token_accuracy": 0.8173407316207886, "num_tokens": 218990542.0, "step": 687 }, { "epoch": 0.6998982706002035, "grad_norm": 0.7468408346176147, "learning_rate": 1e-06, "loss": 0.6128, "mean_token_accuracy": 0.8189893960952759, "num_tokens": 219320664.0, "step": 688 }, { "epoch": 0.7009155645981688, "grad_norm": 0.7514234781265259, "learning_rate": 1e-06, "loss": 0.6228, "mean_token_accuracy": 0.8156855702400208, "num_tokens": 219643460.0, "step": 689 }, { "epoch": 0.7019328585961343, "grad_norm": 0.7620576024055481, "learning_rate": 1e-06, "loss": 0.6174, "mean_token_accuracy": 0.8172132968902588, "num_tokens": 219953874.0, "step": 690 }, { "epoch": 0.7029501525940997, "grad_norm": 0.754711389541626, "learning_rate": 1e-06, "loss": 0.5951, "mean_token_accuracy": 0.8232397437095642, "num_tokens": 220272230.0, "step": 691 }, { "epoch": 0.7039674465920651, "grad_norm": 0.8804978728294373, "learning_rate": 1e-06, "loss": 0.617, "mean_token_accuracy": 0.8175148963928223, "num_tokens": 220588939.0, "step": 692 }, { "epoch": 0.7049847405900305, "grad_norm": 0.776063859462738, "learning_rate": 1e-06, "loss": 0.604, "mean_token_accuracy": 0.8212346434593201, "num_tokens": 220916843.0, "step": 693 }, { "epoch": 0.7060020345879959, "grad_norm": 0.7388089299201965, "learning_rate": 1e-06, "loss": 0.6415, "mean_token_accuracy": 0.8113117218017578, "num_tokens": 221258649.0, "step": 694 }, { "epoch": 0.7070193285859614, "grad_norm": 0.7917898893356323, "learning_rate": 1e-06, "loss": 0.627, "mean_token_accuracy": 0.8141858577728271, "num_tokens": 221583215.0, "step": 695 }, { "epoch": 0.7080366225839267, "grad_norm": 0.7290518283843994, "learning_rate": 1e-06, "loss": 0.6346, "mean_token_accuracy": 0.8122216463088989, "num_tokens": 221923372.0, "step": 696 }, { "epoch": 0.7090539165818922, "grad_norm": 0.7753685116767883, "learning_rate": 1e-06, "loss": 0.6005, "mean_token_accuracy": 0.8205819129943848, "num_tokens": 222233287.0, "step": 697 }, { "epoch": 0.7100712105798576, "grad_norm": 0.7622568607330322, "learning_rate": 1e-06, "loss": 0.628, "mean_token_accuracy": 0.8145500421524048, "num_tokens": 222549595.0, "step": 698 }, { "epoch": 0.711088504577823, "grad_norm": 0.7664331793785095, "learning_rate": 1e-06, "loss": 0.6203, "mean_token_accuracy": 0.8161635398864746, "num_tokens": 222873104.0, "step": 699 }, { "epoch": 0.7121057985757884, "grad_norm": 0.7780277729034424, "learning_rate": 1e-06, "loss": 0.6132, "mean_token_accuracy": 0.8182451725006104, "num_tokens": 223175189.0, "step": 700 }, { "epoch": 0.7131230925737538, "grad_norm": 0.7432042956352234, "learning_rate": 1e-06, "loss": 0.6133, "mean_token_accuracy": 0.8169593811035156, "num_tokens": 223499553.0, "step": 701 }, { "epoch": 0.7141403865717192, "grad_norm": 0.7784262299537659, "learning_rate": 1e-06, "loss": 0.6085, "mean_token_accuracy": 0.8182696104049683, "num_tokens": 223808765.0, "step": 702 }, { "epoch": 0.7151576805696847, "grad_norm": 0.7821851372718811, "learning_rate": 1e-06, "loss": 0.5882, "mean_token_accuracy": 0.8244631290435791, "num_tokens": 224133481.0, "step": 703 }, { "epoch": 0.71617497456765, "grad_norm": 0.7383291125297546, "learning_rate": 1e-06, "loss": 0.6189, "mean_token_accuracy": 0.8167239427566528, "num_tokens": 224447298.0, "step": 704 }, { "epoch": 0.7171922685656155, "grad_norm": 0.7307918071746826, "learning_rate": 1e-06, "loss": 0.6119, "mean_token_accuracy": 0.8202638626098633, "num_tokens": 224757199.0, "step": 705 }, { "epoch": 0.7182095625635809, "grad_norm": 0.779835045337677, "learning_rate": 1e-06, "loss": 0.6112, "mean_token_accuracy": 0.8192306756973267, "num_tokens": 225078638.0, "step": 706 }, { "epoch": 0.7192268565615463, "grad_norm": 0.7415493130683899, "learning_rate": 1e-06, "loss": 0.6086, "mean_token_accuracy": 0.8193663954734802, "num_tokens": 225402143.0, "step": 707 }, { "epoch": 0.7202441505595117, "grad_norm": 0.7614285349845886, "learning_rate": 1e-06, "loss": 0.6148, "mean_token_accuracy": 0.8191728591918945, "num_tokens": 225720513.0, "step": 708 }, { "epoch": 0.7212614445574771, "grad_norm": 0.7944474220275879, "learning_rate": 1e-06, "loss": 0.6213, "mean_token_accuracy": 0.8146188259124756, "num_tokens": 226039357.0, "step": 709 }, { "epoch": 0.7222787385554426, "grad_norm": 0.8491767644882202, "learning_rate": 1e-06, "loss": 0.6069, "mean_token_accuracy": 0.8203014135360718, "num_tokens": 226335920.0, "step": 710 }, { "epoch": 0.7232960325534079, "grad_norm": 0.8166708946228027, "learning_rate": 1e-06, "loss": 0.6039, "mean_token_accuracy": 0.8206759691238403, "num_tokens": 226657995.0, "step": 711 }, { "epoch": 0.7243133265513734, "grad_norm": 0.7617036700248718, "learning_rate": 1e-06, "loss": 0.6233, "mean_token_accuracy": 0.8158437013626099, "num_tokens": 226982248.0, "step": 712 }, { "epoch": 0.7253306205493387, "grad_norm": 0.8162404894828796, "learning_rate": 1e-06, "loss": 0.6104, "mean_token_accuracy": 0.8187158703804016, "num_tokens": 227274613.0, "step": 713 }, { "epoch": 0.7263479145473042, "grad_norm": 0.9296181797981262, "learning_rate": 1e-06, "loss": 0.6193, "mean_token_accuracy": 0.816686749458313, "num_tokens": 227605134.0, "step": 714 }, { "epoch": 0.7273652085452695, "grad_norm": 0.8144844770431519, "learning_rate": 1e-06, "loss": 0.6344, "mean_token_accuracy": 0.8128677606582642, "num_tokens": 227922627.0, "step": 715 }, { "epoch": 0.728382502543235, "grad_norm": 0.7691407799720764, "learning_rate": 1e-06, "loss": 0.6137, "mean_token_accuracy": 0.8177823424339294, "num_tokens": 228242141.0, "step": 716 }, { "epoch": 0.7293997965412004, "grad_norm": 0.839131236076355, "learning_rate": 1e-06, "loss": 0.6028, "mean_token_accuracy": 0.8203939199447632, "num_tokens": 228545473.0, "step": 717 }, { "epoch": 0.7304170905391658, "grad_norm": 0.7813650965690613, "learning_rate": 1e-06, "loss": 0.6324, "mean_token_accuracy": 0.813715934753418, "num_tokens": 228884216.0, "step": 718 }, { "epoch": 0.7314343845371313, "grad_norm": 0.7858477234840393, "learning_rate": 1e-06, "loss": 0.6444, "mean_token_accuracy": 0.8113440275192261, "num_tokens": 229181920.0, "step": 719 }, { "epoch": 0.7324516785350966, "grad_norm": 0.7855584025382996, "learning_rate": 1e-06, "loss": 0.6175, "mean_token_accuracy": 0.8171951770782471, "num_tokens": 229505884.0, "step": 720 }, { "epoch": 0.7334689725330621, "grad_norm": 0.7636620998382568, "learning_rate": 1e-06, "loss": 0.608, "mean_token_accuracy": 0.8192106485366821, "num_tokens": 229844345.0, "step": 721 }, { "epoch": 0.7344862665310274, "grad_norm": 0.7640539407730103, "learning_rate": 1e-06, "loss": 0.6283, "mean_token_accuracy": 0.8148088455200195, "num_tokens": 230160760.0, "step": 722 }, { "epoch": 0.7355035605289929, "grad_norm": 0.7512305378913879, "learning_rate": 1e-06, "loss": 0.6107, "mean_token_accuracy": 0.8187347650527954, "num_tokens": 230480542.0, "step": 723 }, { "epoch": 0.7365208545269583, "grad_norm": 0.769347608089447, "learning_rate": 1e-06, "loss": 0.6095, "mean_token_accuracy": 0.8190985918045044, "num_tokens": 230793950.0, "step": 724 }, { "epoch": 0.7375381485249237, "grad_norm": 0.7874099016189575, "learning_rate": 1e-06, "loss": 0.6312, "mean_token_accuracy": 0.8138229250907898, "num_tokens": 231116995.0, "step": 725 }, { "epoch": 0.7385554425228891, "grad_norm": 0.7738003730773926, "learning_rate": 1e-06, "loss": 0.5944, "mean_token_accuracy": 0.8237497806549072, "num_tokens": 231446504.0, "step": 726 }, { "epoch": 0.7395727365208545, "grad_norm": 0.7526485919952393, "learning_rate": 1e-06, "loss": 0.6249, "mean_token_accuracy": 0.815697968006134, "num_tokens": 231772874.0, "step": 727 }, { "epoch": 0.7405900305188199, "grad_norm": 0.76657634973526, "learning_rate": 1e-06, "loss": 0.6153, "mean_token_accuracy": 0.8176850080490112, "num_tokens": 232087611.0, "step": 728 }, { "epoch": 0.7416073245167853, "grad_norm": 0.794198215007782, "learning_rate": 1e-06, "loss": 0.6284, "mean_token_accuracy": 0.8136752843856812, "num_tokens": 232405181.0, "step": 729 }, { "epoch": 0.7426246185147508, "grad_norm": 0.7804276347160339, "learning_rate": 1e-06, "loss": 0.6273, "mean_token_accuracy": 0.8142085075378418, "num_tokens": 232730814.0, "step": 730 }, { "epoch": 0.7436419125127162, "grad_norm": 0.7470871210098267, "learning_rate": 1e-06, "loss": 0.6036, "mean_token_accuracy": 0.8203924894332886, "num_tokens": 233062413.0, "step": 731 }, { "epoch": 0.7446592065106816, "grad_norm": 0.7899121642112732, "learning_rate": 1e-06, "loss": 0.6003, "mean_token_accuracy": 0.8206931352615356, "num_tokens": 233382314.0, "step": 732 }, { "epoch": 0.745676500508647, "grad_norm": 0.7328281402587891, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8203123807907104, "num_tokens": 233714244.0, "step": 733 }, { "epoch": 0.7466937945066124, "grad_norm": 0.7478933930397034, "learning_rate": 1e-06, "loss": 0.5988, "mean_token_accuracy": 0.822023868560791, "num_tokens": 234028924.0, "step": 734 }, { "epoch": 0.7477110885045778, "grad_norm": 0.7476601600646973, "learning_rate": 1e-06, "loss": 0.5942, "mean_token_accuracy": 0.8236653804779053, "num_tokens": 234352052.0, "step": 735 }, { "epoch": 0.7487283825025433, "grad_norm": 0.7471521496772766, "learning_rate": 1e-06, "loss": 0.6122, "mean_token_accuracy": 0.8166206479072571, "num_tokens": 234685164.0, "step": 736 }, { "epoch": 0.7497456765005086, "grad_norm": 0.7760440111160278, "learning_rate": 1e-06, "loss": 0.5881, "mean_token_accuracy": 0.82477867603302, "num_tokens": 234999779.0, "step": 737 }, { "epoch": 0.7507629704984741, "grad_norm": 0.7630813717842102, "learning_rate": 1e-06, "loss": 0.6705, "mean_token_accuracy": 0.803142786026001, "num_tokens": 235319603.0, "step": 738 }, { "epoch": 0.7517802644964394, "grad_norm": 0.7749915719032288, "learning_rate": 1e-06, "loss": 0.6098, "mean_token_accuracy": 0.8185386657714844, "num_tokens": 235635662.0, "step": 739 }, { "epoch": 0.7527975584944049, "grad_norm": 0.8760622143745422, "learning_rate": 1e-06, "loss": 0.6097, "mean_token_accuracy": 0.8190468549728394, "num_tokens": 235935563.0, "step": 740 }, { "epoch": 0.7538148524923703, "grad_norm": 0.7899062633514404, "learning_rate": 1e-06, "loss": 0.6127, "mean_token_accuracy": 0.8181654810905457, "num_tokens": 236247865.0, "step": 741 }, { "epoch": 0.7548321464903357, "grad_norm": 0.7470571994781494, "learning_rate": 1e-06, "loss": 0.6005, "mean_token_accuracy": 0.822014570236206, "num_tokens": 236548474.0, "step": 742 }, { "epoch": 0.7558494404883012, "grad_norm": 0.8113415837287903, "learning_rate": 1e-06, "loss": 0.6059, "mean_token_accuracy": 0.8192257881164551, "num_tokens": 236859308.0, "step": 743 }, { "epoch": 0.7568667344862665, "grad_norm": 0.7829393744468689, "learning_rate": 1e-06, "loss": 0.6062, "mean_token_accuracy": 0.8189871907234192, "num_tokens": 237170473.0, "step": 744 }, { "epoch": 0.757884028484232, "grad_norm": 0.8226320147514343, "learning_rate": 1e-06, "loss": 0.5902, "mean_token_accuracy": 0.8234495520591736, "num_tokens": 237483382.0, "step": 745 }, { "epoch": 0.7589013224821973, "grad_norm": 0.7683728933334351, "learning_rate": 1e-06, "loss": 0.605, "mean_token_accuracy": 0.8195022940635681, "num_tokens": 237809071.0, "step": 746 }, { "epoch": 0.7599186164801628, "grad_norm": 0.7355079054832458, "learning_rate": 1e-06, "loss": 0.6232, "mean_token_accuracy": 0.8160339593887329, "num_tokens": 238146507.0, "step": 747 }, { "epoch": 0.7609359104781281, "grad_norm": 0.7261055111885071, "learning_rate": 1e-06, "loss": 0.5939, "mean_token_accuracy": 0.8235647678375244, "num_tokens": 238483591.0, "step": 748 }, { "epoch": 0.7619532044760936, "grad_norm": 0.8172787427902222, "learning_rate": 1e-06, "loss": 0.5934, "mean_token_accuracy": 0.8236143589019775, "num_tokens": 238793523.0, "step": 749 }, { "epoch": 0.762970498474059, "grad_norm": 0.7747340202331543, "learning_rate": 1e-06, "loss": 0.5968, "mean_token_accuracy": 0.8225404024124146, "num_tokens": 239112968.0, "step": 750 }, { "epoch": 0.7639877924720244, "grad_norm": 0.7910871505737305, "learning_rate": 1e-06, "loss": 0.6176, "mean_token_accuracy": 0.8168953657150269, "num_tokens": 239429305.0, "step": 751 }, { "epoch": 0.7650050864699899, "grad_norm": 0.7941386699676514, "learning_rate": 1e-06, "loss": 0.6074, "mean_token_accuracy": 0.8191172480583191, "num_tokens": 239745054.0, "step": 752 }, { "epoch": 0.7660223804679552, "grad_norm": 0.7600555419921875, "learning_rate": 1e-06, "loss": 0.6005, "mean_token_accuracy": 0.8215411901473999, "num_tokens": 240051112.0, "step": 753 }, { "epoch": 0.7670396744659207, "grad_norm": 0.7864240407943726, "learning_rate": 1e-06, "loss": 0.5883, "mean_token_accuracy": 0.8241608142852783, "num_tokens": 240373170.0, "step": 754 }, { "epoch": 0.768056968463886, "grad_norm": 0.7831670641899109, "learning_rate": 1e-06, "loss": 0.6217, "mean_token_accuracy": 0.8154398798942566, "num_tokens": 240668912.0, "step": 755 }, { "epoch": 0.7690742624618515, "grad_norm": 0.7375035881996155, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.822466254234314, "num_tokens": 240996115.0, "step": 756 }, { "epoch": 0.7700915564598169, "grad_norm": 0.7203482985496521, "learning_rate": 1e-06, "loss": 0.5924, "mean_token_accuracy": 0.8243117332458496, "num_tokens": 241313175.0, "step": 757 }, { "epoch": 0.7711088504577823, "grad_norm": 0.7840688228607178, "learning_rate": 1e-06, "loss": 0.6212, "mean_token_accuracy": 0.8159429430961609, "num_tokens": 241631176.0, "step": 758 }, { "epoch": 0.7721261444557477, "grad_norm": 0.7749730944633484, "learning_rate": 1e-06, "loss": 0.6167, "mean_token_accuracy": 0.8183140158653259, "num_tokens": 241954029.0, "step": 759 }, { "epoch": 0.7731434384537131, "grad_norm": 0.7695093154907227, "learning_rate": 1e-06, "loss": 0.6401, "mean_token_accuracy": 0.8105282783508301, "num_tokens": 242276624.0, "step": 760 }, { "epoch": 0.7741607324516785, "grad_norm": 0.8038228750228882, "learning_rate": 1e-06, "loss": 0.6018, "mean_token_accuracy": 0.8202804327011108, "num_tokens": 242576931.0, "step": 761 }, { "epoch": 0.775178026449644, "grad_norm": 0.7450258135795593, "learning_rate": 1e-06, "loss": 0.6079, "mean_token_accuracy": 0.8187384605407715, "num_tokens": 242882978.0, "step": 762 }, { "epoch": 0.7761953204476093, "grad_norm": 0.7412680387496948, "learning_rate": 1e-06, "loss": 0.6064, "mean_token_accuracy": 0.8196582198143005, "num_tokens": 243206785.0, "step": 763 }, { "epoch": 0.7772126144455748, "grad_norm": 0.7654978036880493, "learning_rate": 1e-06, "loss": 0.5988, "mean_token_accuracy": 0.8214221000671387, "num_tokens": 243512602.0, "step": 764 }, { "epoch": 0.7782299084435402, "grad_norm": 0.7713125348091125, "learning_rate": 1e-06, "loss": 0.603, "mean_token_accuracy": 0.8209126591682434, "num_tokens": 243845125.0, "step": 765 }, { "epoch": 0.7792472024415056, "grad_norm": 0.7866131663322449, "learning_rate": 1e-06, "loss": 0.5972, "mean_token_accuracy": 0.8228663206100464, "num_tokens": 244139431.0, "step": 766 }, { "epoch": 0.780264496439471, "grad_norm": 0.7617781162261963, "learning_rate": 1e-06, "loss": 0.616, "mean_token_accuracy": 0.8174664378166199, "num_tokens": 244456341.0, "step": 767 }, { "epoch": 0.7812817904374364, "grad_norm": 0.7536523938179016, "learning_rate": 1e-06, "loss": 0.6066, "mean_token_accuracy": 0.819066047668457, "num_tokens": 244763392.0, "step": 768 }, { "epoch": 0.7822990844354019, "grad_norm": 0.739930272102356, "learning_rate": 1e-06, "loss": 0.6127, "mean_token_accuracy": 0.8170568943023682, "num_tokens": 245082597.0, "step": 769 }, { "epoch": 0.7833163784333672, "grad_norm": 0.7585535049438477, "learning_rate": 1e-06, "loss": 0.6179, "mean_token_accuracy": 0.815958559513092, "num_tokens": 245379953.0, "step": 770 }, { "epoch": 0.7843336724313327, "grad_norm": 0.7480993866920471, "learning_rate": 1e-06, "loss": 0.5959, "mean_token_accuracy": 0.8213130235671997, "num_tokens": 245696677.0, "step": 771 }, { "epoch": 0.785350966429298, "grad_norm": 0.7671734690666199, "learning_rate": 1e-06, "loss": 0.6148, "mean_token_accuracy": 0.8170229196548462, "num_tokens": 246030786.0, "step": 772 }, { "epoch": 0.7863682604272635, "grad_norm": 0.7891989350318909, "learning_rate": 1e-06, "loss": 0.59, "mean_token_accuracy": 0.8221583366394043, "num_tokens": 246332311.0, "step": 773 }, { "epoch": 0.7873855544252288, "grad_norm": 0.7767341136932373, "learning_rate": 1e-06, "loss": 0.6052, "mean_token_accuracy": 0.8198896646499634, "num_tokens": 246646398.0, "step": 774 }, { "epoch": 0.7884028484231943, "grad_norm": 0.7895208597183228, "learning_rate": 1e-06, "loss": 0.5876, "mean_token_accuracy": 0.8229328989982605, "num_tokens": 246957754.0, "step": 775 }, { "epoch": 0.7894201424211598, "grad_norm": 0.7392787337303162, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.831958532333374, "num_tokens": 247284678.0, "step": 776 }, { "epoch": 0.7904374364191251, "grad_norm": 0.748408317565918, "learning_rate": 1e-06, "loss": 0.6179, "mean_token_accuracy": 0.8162950277328491, "num_tokens": 247601438.0, "step": 777 }, { "epoch": 0.7914547304170906, "grad_norm": 0.7726233601570129, "learning_rate": 1e-06, "loss": 0.6052, "mean_token_accuracy": 0.8199659585952759, "num_tokens": 247935788.0, "step": 778 }, { "epoch": 0.7924720244150559, "grad_norm": 0.7829846143722534, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.8276870250701904, "num_tokens": 248252748.0, "step": 779 }, { "epoch": 0.7934893184130214, "grad_norm": 0.7898768782615662, "learning_rate": 1e-06, "loss": 0.5974, "mean_token_accuracy": 0.8218032121658325, "num_tokens": 248562629.0, "step": 780 }, { "epoch": 0.7945066124109867, "grad_norm": 0.779306948184967, "learning_rate": 1e-06, "loss": 0.6201, "mean_token_accuracy": 0.8156956434249878, "num_tokens": 248884711.0, "step": 781 }, { "epoch": 0.7955239064089522, "grad_norm": 0.7667821645736694, "learning_rate": 1e-06, "loss": 0.6118, "mean_token_accuracy": 0.8179029226303101, "num_tokens": 249198668.0, "step": 782 }, { "epoch": 0.7965412004069176, "grad_norm": 0.810929000377655, "learning_rate": 1e-06, "loss": 0.6143, "mean_token_accuracy": 0.8167812824249268, "num_tokens": 249506726.0, "step": 783 }, { "epoch": 0.797558494404883, "grad_norm": 0.7606924176216125, "learning_rate": 1e-06, "loss": 0.6124, "mean_token_accuracy": 0.8178728818893433, "num_tokens": 249815595.0, "step": 784 }, { "epoch": 0.7985757884028484, "grad_norm": 0.7567647099494934, "learning_rate": 1e-06, "loss": 0.6092, "mean_token_accuracy": 0.8186475038528442, "num_tokens": 250145992.0, "step": 785 }, { "epoch": 0.7995930824008138, "grad_norm": 0.7598842978477478, "learning_rate": 1e-06, "loss": 0.6132, "mean_token_accuracy": 0.8182504177093506, "num_tokens": 250443792.0, "step": 786 }, { "epoch": 0.8006103763987793, "grad_norm": 0.802496075630188, "learning_rate": 1e-06, "loss": 0.6203, "mean_token_accuracy": 0.8158894777297974, "num_tokens": 250747877.0, "step": 787 }, { "epoch": 0.8016276703967447, "grad_norm": 0.8037290573120117, "learning_rate": 1e-06, "loss": 0.6054, "mean_token_accuracy": 0.8193314075469971, "num_tokens": 251052502.0, "step": 788 }, { "epoch": 0.8026449643947101, "grad_norm": 0.8257606625556946, "learning_rate": 1e-06, "loss": 0.593, "mean_token_accuracy": 0.8240830302238464, "num_tokens": 251352775.0, "step": 789 }, { "epoch": 0.8036622583926755, "grad_norm": 0.7704517841339111, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.8215023279190063, "num_tokens": 251677652.0, "step": 790 }, { "epoch": 0.8046795523906409, "grad_norm": 0.7924766540527344, "learning_rate": 1e-06, "loss": 0.6454, "mean_token_accuracy": 0.8102990388870239, "num_tokens": 251995232.0, "step": 791 }, { "epoch": 0.8056968463886063, "grad_norm": 0.7513923645019531, "learning_rate": 1e-06, "loss": 0.6143, "mean_token_accuracy": 0.8175941705703735, "num_tokens": 252318139.0, "step": 792 }, { "epoch": 0.8067141403865717, "grad_norm": 0.7312906980514526, "learning_rate": 1e-06, "loss": 0.6006, "mean_token_accuracy": 0.8224383592605591, "num_tokens": 252655015.0, "step": 793 }, { "epoch": 0.8077314343845371, "grad_norm": 0.7360526323318481, "learning_rate": 1e-06, "loss": 0.6034, "mean_token_accuracy": 0.8211574554443359, "num_tokens": 252975872.0, "step": 794 }, { "epoch": 0.8087487283825026, "grad_norm": 0.8279215693473816, "learning_rate": 1e-06, "loss": 0.6156, "mean_token_accuracy": 0.8176878690719604, "num_tokens": 253298562.0, "step": 795 }, { "epoch": 0.8097660223804679, "grad_norm": 0.7401262521743774, "learning_rate": 1e-06, "loss": 0.5835, "mean_token_accuracy": 0.8262432813644409, "num_tokens": 253623711.0, "step": 796 }, { "epoch": 0.8107833163784334, "grad_norm": 0.7732365131378174, "learning_rate": 1e-06, "loss": 0.6123, "mean_token_accuracy": 0.8178532123565674, "num_tokens": 253951728.0, "step": 797 }, { "epoch": 0.8118006103763988, "grad_norm": 0.8180770874023438, "learning_rate": 1e-06, "loss": 0.6155, "mean_token_accuracy": 0.816921591758728, "num_tokens": 254256769.0, "step": 798 }, { "epoch": 0.8128179043743642, "grad_norm": 0.7186532020568848, "learning_rate": 1e-06, "loss": 0.6257, "mean_token_accuracy": 0.8152790069580078, "num_tokens": 254592698.0, "step": 799 }, { "epoch": 0.8138351983723296, "grad_norm": 0.7515096664428711, "learning_rate": 1e-06, "loss": 0.6123, "mean_token_accuracy": 0.8181686997413635, "num_tokens": 254907582.0, "step": 800 }, { "epoch": 0.814852492370295, "grad_norm": 0.7662571668624878, "learning_rate": 1e-06, "loss": 0.5882, "mean_token_accuracy": 0.8246775269508362, "num_tokens": 255220612.0, "step": 801 }, { "epoch": 0.8158697863682605, "grad_norm": 0.7624213099479675, "learning_rate": 1e-06, "loss": 0.6251, "mean_token_accuracy": 0.8160996437072754, "num_tokens": 255530491.0, "step": 802 }, { "epoch": 0.8168870803662258, "grad_norm": 0.723014771938324, "learning_rate": 1e-06, "loss": 0.6105, "mean_token_accuracy": 0.8190559148788452, "num_tokens": 255861693.0, "step": 803 }, { "epoch": 0.8179043743641913, "grad_norm": 0.7166092395782471, "learning_rate": 1e-06, "loss": 0.5867, "mean_token_accuracy": 0.8237287998199463, "num_tokens": 256184434.0, "step": 804 }, { "epoch": 0.8189216683621566, "grad_norm": 0.7516783475875854, "learning_rate": 1e-06, "loss": 0.5941, "mean_token_accuracy": 0.8218938708305359, "num_tokens": 256517073.0, "step": 805 }, { "epoch": 0.8199389623601221, "grad_norm": 0.7856842279434204, "learning_rate": 1e-06, "loss": 0.6056, "mean_token_accuracy": 0.81956946849823, "num_tokens": 256848111.0, "step": 806 }, { "epoch": 0.8209562563580874, "grad_norm": 0.9904868602752686, "learning_rate": 1e-06, "loss": 0.61, "mean_token_accuracy": 0.8192522525787354, "num_tokens": 257182035.0, "step": 807 }, { "epoch": 0.8219735503560529, "grad_norm": 0.7695131301879883, "learning_rate": 1e-06, "loss": 0.628, "mean_token_accuracy": 0.8135746717453003, "num_tokens": 257508833.0, "step": 808 }, { "epoch": 0.8229908443540183, "grad_norm": 0.7403817176818848, "learning_rate": 1e-06, "loss": 0.6003, "mean_token_accuracy": 0.8210437893867493, "num_tokens": 257832236.0, "step": 809 }, { "epoch": 0.8240081383519837, "grad_norm": 0.7729365825653076, "learning_rate": 1e-06, "loss": 0.5929, "mean_token_accuracy": 0.82197505235672, "num_tokens": 258156268.0, "step": 810 }, { "epoch": 0.8250254323499492, "grad_norm": 0.7479487657546997, "learning_rate": 1e-06, "loss": 0.6124, "mean_token_accuracy": 0.8176651000976562, "num_tokens": 258495850.0, "step": 811 }, { "epoch": 0.8260427263479145, "grad_norm": 0.7545899152755737, "learning_rate": 1e-06, "loss": 0.6104, "mean_token_accuracy": 0.8191924095153809, "num_tokens": 258823978.0, "step": 812 }, { "epoch": 0.82706002034588, "grad_norm": 0.7741188406944275, "learning_rate": 1e-06, "loss": 0.6089, "mean_token_accuracy": 0.8178560733795166, "num_tokens": 259143758.0, "step": 813 }, { "epoch": 0.8280773143438453, "grad_norm": 0.8673993349075317, "learning_rate": 1e-06, "loss": 0.6152, "mean_token_accuracy": 0.8175423741340637, "num_tokens": 259459034.0, "step": 814 }, { "epoch": 0.8290946083418108, "grad_norm": 0.8135408759117126, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.8214139938354492, "num_tokens": 259769628.0, "step": 815 }, { "epoch": 0.8301119023397762, "grad_norm": 0.7409703731536865, "learning_rate": 1e-06, "loss": 0.624, "mean_token_accuracy": 0.815783679485321, "num_tokens": 260091045.0, "step": 816 }, { "epoch": 0.8311291963377416, "grad_norm": 0.7929558753967285, "learning_rate": 1e-06, "loss": 0.6012, "mean_token_accuracy": 0.8218845129013062, "num_tokens": 260414955.0, "step": 817 }, { "epoch": 0.832146490335707, "grad_norm": 0.7872368693351746, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8281674385070801, "num_tokens": 260738547.0, "step": 818 }, { "epoch": 0.8331637843336724, "grad_norm": 0.802636444568634, "learning_rate": 1e-06, "loss": 0.6101, "mean_token_accuracy": 0.8184858560562134, "num_tokens": 261051308.0, "step": 819 }, { "epoch": 0.8341810783316378, "grad_norm": 0.7677578926086426, "learning_rate": 1e-06, "loss": 0.5977, "mean_token_accuracy": 0.8210058212280273, "num_tokens": 261358402.0, "step": 820 }, { "epoch": 0.8351983723296033, "grad_norm": 0.7571184039115906, "learning_rate": 1e-06, "loss": 0.6149, "mean_token_accuracy": 0.8174717426300049, "num_tokens": 261687233.0, "step": 821 }, { "epoch": 0.8362156663275687, "grad_norm": 0.7534165978431702, "learning_rate": 1e-06, "loss": 0.6163, "mean_token_accuracy": 0.8172184228897095, "num_tokens": 262026963.0, "step": 822 }, { "epoch": 0.8372329603255341, "grad_norm": 1.1013829708099365, "learning_rate": 1e-06, "loss": 0.5999, "mean_token_accuracy": 0.8210376501083374, "num_tokens": 262365347.0, "step": 823 }, { "epoch": 0.8382502543234995, "grad_norm": 0.7831832766532898, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.8204481601715088, "num_tokens": 262673650.0, "step": 824 }, { "epoch": 0.8392675483214649, "grad_norm": 0.8020746111869812, "learning_rate": 1e-06, "loss": 0.6233, "mean_token_accuracy": 0.8149141073226929, "num_tokens": 262970304.0, "step": 825 }, { "epoch": 0.8402848423194303, "grad_norm": 0.7812209129333496, "learning_rate": 1e-06, "loss": 0.6197, "mean_token_accuracy": 0.8161625266075134, "num_tokens": 263293537.0, "step": 826 }, { "epoch": 0.8413021363173957, "grad_norm": 0.7618210315704346, "learning_rate": 1e-06, "loss": 0.5971, "mean_token_accuracy": 0.8219529986381531, "num_tokens": 263615516.0, "step": 827 }, { "epoch": 0.8423194303153612, "grad_norm": 0.8371942639350891, "learning_rate": 1e-06, "loss": 0.5918, "mean_token_accuracy": 0.8239431381225586, "num_tokens": 263924035.0, "step": 828 }, { "epoch": 0.8433367243133265, "grad_norm": 0.7650486826896667, "learning_rate": 1e-06, "loss": 0.6399, "mean_token_accuracy": 0.8103851675987244, "num_tokens": 264253124.0, "step": 829 }, { "epoch": 0.844354018311292, "grad_norm": 0.7482921481132507, "learning_rate": 1e-06, "loss": 0.6014, "mean_token_accuracy": 0.8206346035003662, "num_tokens": 264578027.0, "step": 830 }, { "epoch": 0.8453713123092573, "grad_norm": 0.7574668526649475, "learning_rate": 1e-06, "loss": 0.6143, "mean_token_accuracy": 0.8168084621429443, "num_tokens": 264888054.0, "step": 831 }, { "epoch": 0.8463886063072228, "grad_norm": 2.012491226196289, "learning_rate": 1e-06, "loss": 0.6089, "mean_token_accuracy": 0.8190878033638, "num_tokens": 265191225.0, "step": 832 }, { "epoch": 0.8474059003051883, "grad_norm": 0.7282354235649109, "learning_rate": 1e-06, "loss": 0.611, "mean_token_accuracy": 0.8182060718536377, "num_tokens": 265529823.0, "step": 833 }, { "epoch": 0.8484231943031536, "grad_norm": 0.7751017212867737, "learning_rate": 1e-06, "loss": 0.5908, "mean_token_accuracy": 0.8238675594329834, "num_tokens": 265845965.0, "step": 834 }, { "epoch": 0.8494404883011191, "grad_norm": 0.7922971248626709, "learning_rate": 1e-06, "loss": 0.622, "mean_token_accuracy": 0.8150073289871216, "num_tokens": 266161615.0, "step": 835 }, { "epoch": 0.8504577822990844, "grad_norm": 0.761900007724762, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.8176765441894531, "num_tokens": 266462436.0, "step": 836 }, { "epoch": 0.8514750762970499, "grad_norm": 0.7576068043708801, "learning_rate": 1e-06, "loss": 0.6041, "mean_token_accuracy": 0.8197664618492126, "num_tokens": 266773293.0, "step": 837 }, { "epoch": 0.8524923702950152, "grad_norm": 0.7256014943122864, "learning_rate": 1e-06, "loss": 0.6158, "mean_token_accuracy": 0.8169959783554077, "num_tokens": 267100828.0, "step": 838 }, { "epoch": 0.8535096642929807, "grad_norm": 0.8366914987564087, "learning_rate": 1e-06, "loss": 0.5976, "mean_token_accuracy": 0.821556568145752, "num_tokens": 267429972.0, "step": 839 }, { "epoch": 0.854526958290946, "grad_norm": 0.7880463600158691, "learning_rate": 1e-06, "loss": 0.5851, "mean_token_accuracy": 0.8247042894363403, "num_tokens": 267748631.0, "step": 840 }, { "epoch": 0.8555442522889115, "grad_norm": 0.7307940721511841, "learning_rate": 1e-06, "loss": 0.5981, "mean_token_accuracy": 0.8206034898757935, "num_tokens": 268058483.0, "step": 841 }, { "epoch": 0.8565615462868769, "grad_norm": 0.764660120010376, "learning_rate": 1e-06, "loss": 0.6033, "mean_token_accuracy": 0.8215179443359375, "num_tokens": 268382188.0, "step": 842 }, { "epoch": 0.8575788402848423, "grad_norm": 0.779708981513977, "learning_rate": 1e-06, "loss": 0.5991, "mean_token_accuracy": 0.8204516172409058, "num_tokens": 268701052.0, "step": 843 }, { "epoch": 0.8585961342828077, "grad_norm": 0.7856629490852356, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8267958164215088, "num_tokens": 268996708.0, "step": 844 }, { "epoch": 0.8596134282807731, "grad_norm": 0.7329586744308472, "learning_rate": 1e-06, "loss": 0.599, "mean_token_accuracy": 0.820576548576355, "num_tokens": 269311951.0, "step": 845 }, { "epoch": 0.8606307222787386, "grad_norm": 0.8273540139198303, "learning_rate": 1e-06, "loss": 0.5917, "mean_token_accuracy": 0.8222566246986389, "num_tokens": 269598930.0, "step": 846 }, { "epoch": 0.861648016276704, "grad_norm": 0.7287344932556152, "learning_rate": 1e-06, "loss": 0.6259, "mean_token_accuracy": 0.814606785774231, "num_tokens": 269921692.0, "step": 847 }, { "epoch": 0.8626653102746694, "grad_norm": 0.755750834941864, "learning_rate": 1e-06, "loss": 0.6048, "mean_token_accuracy": 0.8209663033485413, "num_tokens": 270232620.0, "step": 848 }, { "epoch": 0.8636826042726348, "grad_norm": 0.7512905597686768, "learning_rate": 1e-06, "loss": 0.599, "mean_token_accuracy": 0.8223528265953064, "num_tokens": 270560619.0, "step": 849 }, { "epoch": 0.8646998982706002, "grad_norm": 0.8476002216339111, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8203320503234863, "num_tokens": 270874728.0, "step": 850 }, { "epoch": 0.8657171922685656, "grad_norm": 0.7668102383613586, "learning_rate": 1e-06, "loss": 0.5996, "mean_token_accuracy": 0.8214831352233887, "num_tokens": 271202782.0, "step": 851 }, { "epoch": 0.866734486266531, "grad_norm": 0.7954599857330322, "learning_rate": 1e-06, "loss": 0.6064, "mean_token_accuracy": 0.820402979850769, "num_tokens": 271529123.0, "step": 852 }, { "epoch": 0.8677517802644964, "grad_norm": 0.7808477282524109, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8243553638458252, "num_tokens": 271828281.0, "step": 853 }, { "epoch": 0.8687690742624619, "grad_norm": 0.7486982941627502, "learning_rate": 1e-06, "loss": 0.6037, "mean_token_accuracy": 0.8210355043411255, "num_tokens": 272152291.0, "step": 854 }, { "epoch": 0.8697863682604272, "grad_norm": 0.7671699523925781, "learning_rate": 1e-06, "loss": 0.6264, "mean_token_accuracy": 0.8134573698043823, "num_tokens": 272469049.0, "step": 855 }, { "epoch": 0.8708036622583927, "grad_norm": 0.8027375340461731, "learning_rate": 1e-06, "loss": 0.6341, "mean_token_accuracy": 0.8117994070053101, "num_tokens": 272768408.0, "step": 856 }, { "epoch": 0.8718209562563581, "grad_norm": 0.7631490230560303, "learning_rate": 1e-06, "loss": 0.5992, "mean_token_accuracy": 0.8212313652038574, "num_tokens": 273089732.0, "step": 857 }, { "epoch": 0.8728382502543235, "grad_norm": 0.7767783403396606, "learning_rate": 1e-06, "loss": 0.592, "mean_token_accuracy": 0.8243077397346497, "num_tokens": 273407433.0, "step": 858 }, { "epoch": 0.873855544252289, "grad_norm": 0.7670124769210815, "learning_rate": 1e-06, "loss": 0.6168, "mean_token_accuracy": 0.81707364320755, "num_tokens": 273726324.0, "step": 859 }, { "epoch": 0.8748728382502543, "grad_norm": 0.8127790093421936, "learning_rate": 1e-06, "loss": 0.5947, "mean_token_accuracy": 0.8218735456466675, "num_tokens": 274048187.0, "step": 860 }, { "epoch": 0.8758901322482198, "grad_norm": 0.7514941096305847, "learning_rate": 1e-06, "loss": 0.5947, "mean_token_accuracy": 0.8224079012870789, "num_tokens": 274377833.0, "step": 861 }, { "epoch": 0.8769074262461851, "grad_norm": 0.7778471112251282, "learning_rate": 1e-06, "loss": 0.6303, "mean_token_accuracy": 0.8132891654968262, "num_tokens": 274705006.0, "step": 862 }, { "epoch": 0.8779247202441506, "grad_norm": 0.7269313931465149, "learning_rate": 1e-06, "loss": 0.5905, "mean_token_accuracy": 0.8242048025131226, "num_tokens": 275014059.0, "step": 863 }, { "epoch": 0.8789420142421159, "grad_norm": 0.7438430190086365, "learning_rate": 1e-06, "loss": 0.6238, "mean_token_accuracy": 0.8166732788085938, "num_tokens": 275338869.0, "step": 864 }, { "epoch": 0.8799593082400814, "grad_norm": 0.8035140037536621, "learning_rate": 1e-06, "loss": 0.614, "mean_token_accuracy": 0.8169056177139282, "num_tokens": 275653401.0, "step": 865 }, { "epoch": 0.8809766022380467, "grad_norm": 0.7698527574539185, "learning_rate": 1e-06, "loss": 0.5935, "mean_token_accuracy": 0.8229260444641113, "num_tokens": 275962373.0, "step": 866 }, { "epoch": 0.8819938962360122, "grad_norm": 0.7348355054855347, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8198667764663696, "num_tokens": 276287064.0, "step": 867 }, { "epoch": 0.8830111902339777, "grad_norm": 0.7421631813049316, "learning_rate": 1e-06, "loss": 0.6031, "mean_token_accuracy": 0.8199079036712646, "num_tokens": 276610916.0, "step": 868 }, { "epoch": 0.884028484231943, "grad_norm": 0.7622741460800171, "learning_rate": 1e-06, "loss": 0.6201, "mean_token_accuracy": 0.8169533610343933, "num_tokens": 276928368.0, "step": 869 }, { "epoch": 0.8850457782299085, "grad_norm": 0.7316154837608337, "learning_rate": 1e-06, "loss": 0.5846, "mean_token_accuracy": 0.824696958065033, "num_tokens": 277246859.0, "step": 870 }, { "epoch": 0.8860630722278738, "grad_norm": 0.7528466582298279, "learning_rate": 1e-06, "loss": 0.6049, "mean_token_accuracy": 0.8203611373901367, "num_tokens": 277573295.0, "step": 871 }, { "epoch": 0.8870803662258393, "grad_norm": 0.7816317677497864, "learning_rate": 1e-06, "loss": 0.5988, "mean_token_accuracy": 0.8214170336723328, "num_tokens": 277877252.0, "step": 872 }, { "epoch": 0.8880976602238047, "grad_norm": 0.7750648856163025, "learning_rate": 1e-06, "loss": 0.5982, "mean_token_accuracy": 0.8203676342964172, "num_tokens": 278194339.0, "step": 873 }, { "epoch": 0.8891149542217701, "grad_norm": 0.7397667765617371, "learning_rate": 1e-06, "loss": 0.6133, "mean_token_accuracy": 0.8168438076972961, "num_tokens": 278516575.0, "step": 874 }, { "epoch": 0.8901322482197355, "grad_norm": 0.7264861464500427, "learning_rate": 1e-06, "loss": 0.6212, "mean_token_accuracy": 0.8153183460235596, "num_tokens": 278844290.0, "step": 875 }, { "epoch": 0.8911495422177009, "grad_norm": 0.7935214042663574, "learning_rate": 1e-06, "loss": 0.6162, "mean_token_accuracy": 0.8169482350349426, "num_tokens": 279161918.0, "step": 876 }, { "epoch": 0.8921668362156663, "grad_norm": 0.7562498450279236, "learning_rate": 1e-06, "loss": 0.6243, "mean_token_accuracy": 0.815357506275177, "num_tokens": 279481295.0, "step": 877 }, { "epoch": 0.8931841302136317, "grad_norm": 0.7284414768218994, "learning_rate": 1e-06, "loss": 0.5926, "mean_token_accuracy": 0.8232520818710327, "num_tokens": 279800329.0, "step": 878 }, { "epoch": 0.8942014242115972, "grad_norm": 0.7370333671569824, "learning_rate": 1e-06, "loss": 0.5821, "mean_token_accuracy": 0.8256187438964844, "num_tokens": 280111037.0, "step": 879 }, { "epoch": 0.8952187182095626, "grad_norm": 0.7799801826477051, "learning_rate": 1e-06, "loss": 0.5985, "mean_token_accuracy": 0.8217278718948364, "num_tokens": 280405852.0, "step": 880 }, { "epoch": 0.896236012207528, "grad_norm": 0.7867871522903442, "learning_rate": 1e-06, "loss": 0.6103, "mean_token_accuracy": 0.8187962770462036, "num_tokens": 280727104.0, "step": 881 }, { "epoch": 0.8972533062054934, "grad_norm": 0.8229040503501892, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.8244612812995911, "num_tokens": 281043942.0, "step": 882 }, { "epoch": 0.8982706002034588, "grad_norm": 0.773310124874115, "learning_rate": 1e-06, "loss": 0.6063, "mean_token_accuracy": 0.8205931186676025, "num_tokens": 281374849.0, "step": 883 }, { "epoch": 0.8992878942014242, "grad_norm": 0.7774438858032227, "learning_rate": 1e-06, "loss": 0.5967, "mean_token_accuracy": 0.8223268985748291, "num_tokens": 281681982.0, "step": 884 }, { "epoch": 0.9003051881993896, "grad_norm": 0.7470009922981262, "learning_rate": 1e-06, "loss": 0.6149, "mean_token_accuracy": 0.816724956035614, "num_tokens": 282019096.0, "step": 885 }, { "epoch": 0.901322482197355, "grad_norm": 0.7227064371109009, "learning_rate": 1e-06, "loss": 0.5956, "mean_token_accuracy": 0.8233357667922974, "num_tokens": 282360553.0, "step": 886 }, { "epoch": 0.9023397761953205, "grad_norm": 0.7855974435806274, "learning_rate": 1e-06, "loss": 0.6083, "mean_token_accuracy": 0.8194457292556763, "num_tokens": 282674995.0, "step": 887 }, { "epoch": 0.9033570701932858, "grad_norm": 0.7482305765151978, "learning_rate": 1e-06, "loss": 0.5932, "mean_token_accuracy": 0.8232486248016357, "num_tokens": 282997592.0, "step": 888 }, { "epoch": 0.9043743641912513, "grad_norm": 0.7636599540710449, "learning_rate": 1e-06, "loss": 0.5822, "mean_token_accuracy": 0.8257430195808411, "num_tokens": 283307527.0, "step": 889 }, { "epoch": 0.9053916581892166, "grad_norm": 0.7545581459999084, "learning_rate": 1e-06, "loss": 0.6215, "mean_token_accuracy": 0.8150089979171753, "num_tokens": 283620377.0, "step": 890 }, { "epoch": 0.9064089521871821, "grad_norm": 0.8514378070831299, "learning_rate": 1e-06, "loss": 0.5901, "mean_token_accuracy": 0.8237584233283997, "num_tokens": 283931044.0, "step": 891 }, { "epoch": 0.9074262461851476, "grad_norm": 0.7739594578742981, "learning_rate": 1e-06, "loss": 0.6076, "mean_token_accuracy": 0.8195616006851196, "num_tokens": 284248921.0, "step": 892 }, { "epoch": 0.9084435401831129, "grad_norm": 0.71832275390625, "learning_rate": 1e-06, "loss": 0.5799, "mean_token_accuracy": 0.8258185386657715, "num_tokens": 284570062.0, "step": 893 }, { "epoch": 0.9094608341810784, "grad_norm": 0.7497410178184509, "learning_rate": 1e-06, "loss": 0.6099, "mean_token_accuracy": 0.8188490867614746, "num_tokens": 284892788.0, "step": 894 }, { "epoch": 0.9104781281790437, "grad_norm": 0.7797565460205078, "learning_rate": 1e-06, "loss": 0.6173, "mean_token_accuracy": 0.8166163563728333, "num_tokens": 285203700.0, "step": 895 }, { "epoch": 0.9114954221770092, "grad_norm": 0.7578243017196655, "learning_rate": 1e-06, "loss": 0.6033, "mean_token_accuracy": 0.8208448886871338, "num_tokens": 285550546.0, "step": 896 }, { "epoch": 0.9125127161749745, "grad_norm": 0.8120006918907166, "learning_rate": 1e-06, "loss": 0.6093, "mean_token_accuracy": 0.8185294270515442, "num_tokens": 285845604.0, "step": 897 }, { "epoch": 0.91353001017294, "grad_norm": 0.829473614692688, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8186506628990173, "num_tokens": 286144853.0, "step": 898 }, { "epoch": 0.9145473041709054, "grad_norm": 0.7547066807746887, "learning_rate": 1e-06, "loss": 0.6046, "mean_token_accuracy": 0.8188067674636841, "num_tokens": 286458777.0, "step": 899 }, { "epoch": 0.9155645981688708, "grad_norm": 0.8693905472755432, "learning_rate": 1e-06, "loss": 0.6052, "mean_token_accuracy": 0.8198082447052002, "num_tokens": 286774374.0, "step": 900 }, { "epoch": 0.9165818921668362, "grad_norm": 0.774172306060791, "learning_rate": 1e-06, "loss": 0.6096, "mean_token_accuracy": 0.8185197710990906, "num_tokens": 287085779.0, "step": 901 }, { "epoch": 0.9175991861648016, "grad_norm": 0.7755725979804993, "learning_rate": 1e-06, "loss": 0.6104, "mean_token_accuracy": 0.8177812099456787, "num_tokens": 287405700.0, "step": 902 }, { "epoch": 0.9186164801627671, "grad_norm": 0.7723372578620911, "learning_rate": 1e-06, "loss": 0.5859, "mean_token_accuracy": 0.8251550197601318, "num_tokens": 287716716.0, "step": 903 }, { "epoch": 0.9196337741607324, "grad_norm": 0.7204782962799072, "learning_rate": 1e-06, "loss": 0.6104, "mean_token_accuracy": 0.8186980485916138, "num_tokens": 288040043.0, "step": 904 }, { "epoch": 0.9206510681586979, "grad_norm": 0.7748673558235168, "learning_rate": 1e-06, "loss": 0.6074, "mean_token_accuracy": 0.8196672201156616, "num_tokens": 288352383.0, "step": 905 }, { "epoch": 0.9216683621566633, "grad_norm": 0.7603813409805298, "learning_rate": 1e-06, "loss": 0.5872, "mean_token_accuracy": 0.8238770365715027, "num_tokens": 288681122.0, "step": 906 }, { "epoch": 0.9226856561546287, "grad_norm": 0.736601710319519, "learning_rate": 1e-06, "loss": 0.6236, "mean_token_accuracy": 0.8154827356338501, "num_tokens": 289007588.0, "step": 907 }, { "epoch": 0.9237029501525941, "grad_norm": 0.7555220723152161, "learning_rate": 1e-06, "loss": 0.5979, "mean_token_accuracy": 0.8206131458282471, "num_tokens": 289338174.0, "step": 908 }, { "epoch": 0.9247202441505595, "grad_norm": 0.7680867314338684, "learning_rate": 1e-06, "loss": 0.5923, "mean_token_accuracy": 0.8236669301986694, "num_tokens": 289657315.0, "step": 909 }, { "epoch": 0.9257375381485249, "grad_norm": 0.7650969624519348, "learning_rate": 1e-06, "loss": 0.598, "mean_token_accuracy": 0.8214796781539917, "num_tokens": 289982984.0, "step": 910 }, { "epoch": 0.9267548321464903, "grad_norm": 0.7762976288795471, "learning_rate": 1e-06, "loss": 0.5928, "mean_token_accuracy": 0.8228493332862854, "num_tokens": 290298649.0, "step": 911 }, { "epoch": 0.9277721261444557, "grad_norm": 0.7978520393371582, "learning_rate": 1e-06, "loss": 0.5979, "mean_token_accuracy": 0.8216226100921631, "num_tokens": 290617255.0, "step": 912 }, { "epoch": 0.9287894201424212, "grad_norm": 0.7277722954750061, "learning_rate": 1e-06, "loss": 0.5907, "mean_token_accuracy": 0.8223746418952942, "num_tokens": 290937706.0, "step": 913 }, { "epoch": 0.9298067141403866, "grad_norm": 0.7542223930358887, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.8198230862617493, "num_tokens": 291248241.0, "step": 914 }, { "epoch": 0.930824008138352, "grad_norm": 0.7750515341758728, "learning_rate": 1e-06, "loss": 0.6099, "mean_token_accuracy": 0.8176894783973694, "num_tokens": 291586275.0, "step": 915 }, { "epoch": 0.9318413021363174, "grad_norm": 0.7996515035629272, "learning_rate": 1e-06, "loss": 0.5835, "mean_token_accuracy": 0.8253262042999268, "num_tokens": 291904561.0, "step": 916 }, { "epoch": 0.9328585961342828, "grad_norm": 0.7990328669548035, "learning_rate": 1e-06, "loss": 0.603, "mean_token_accuracy": 0.8202857971191406, "num_tokens": 292214056.0, "step": 917 }, { "epoch": 0.9338758901322483, "grad_norm": 0.7909140586853027, "learning_rate": 1e-06, "loss": 0.6053, "mean_token_accuracy": 0.8177269101142883, "num_tokens": 292505385.0, "step": 918 }, { "epoch": 0.9348931841302136, "grad_norm": 0.7725432515144348, "learning_rate": 1e-06, "loss": 0.6114, "mean_token_accuracy": 0.8192976713180542, "num_tokens": 292829795.0, "step": 919 }, { "epoch": 0.9359104781281791, "grad_norm": 0.8053933382034302, "learning_rate": 1e-06, "loss": 0.5869, "mean_token_accuracy": 0.8256614804267883, "num_tokens": 293144553.0, "step": 920 }, { "epoch": 0.9369277721261444, "grad_norm": 0.7472183108329773, "learning_rate": 1e-06, "loss": 0.6104, "mean_token_accuracy": 0.8184031248092651, "num_tokens": 293463953.0, "step": 921 }, { "epoch": 0.9379450661241099, "grad_norm": 0.7906295657157898, "learning_rate": 1e-06, "loss": 0.572, "mean_token_accuracy": 0.8287568688392639, "num_tokens": 293773864.0, "step": 922 }, { "epoch": 0.9389623601220752, "grad_norm": 0.8005516529083252, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8264212012290955, "num_tokens": 294103688.0, "step": 923 }, { "epoch": 0.9399796541200407, "grad_norm": 0.7889928221702576, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.8293386101722717, "num_tokens": 294425536.0, "step": 924 }, { "epoch": 0.940996948118006, "grad_norm": 0.7291927933692932, "learning_rate": 1e-06, "loss": 0.6044, "mean_token_accuracy": 0.8204392194747925, "num_tokens": 294743889.0, "step": 925 }, { "epoch": 0.9420142421159715, "grad_norm": 0.7222439646720886, "learning_rate": 1e-06, "loss": 0.6224, "mean_token_accuracy": 0.8163737058639526, "num_tokens": 295078964.0, "step": 926 }, { "epoch": 0.943031536113937, "grad_norm": 0.7308232188224792, "learning_rate": 1e-06, "loss": 0.5772, "mean_token_accuracy": 0.8282566070556641, "num_tokens": 295382594.0, "step": 927 }, { "epoch": 0.9440488301119023, "grad_norm": 0.7753663063049316, "learning_rate": 1e-06, "loss": 0.6013, "mean_token_accuracy": 0.8208799362182617, "num_tokens": 295694723.0, "step": 928 }, { "epoch": 0.9450661241098678, "grad_norm": 0.8650704622268677, "learning_rate": 1e-06, "loss": 0.5855, "mean_token_accuracy": 0.8251305222511292, "num_tokens": 295998034.0, "step": 929 }, { "epoch": 0.9460834181078331, "grad_norm": 0.7788504958152771, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.8198367357254028, "num_tokens": 296290081.0, "step": 930 }, { "epoch": 0.9471007121057986, "grad_norm": 0.76593017578125, "learning_rate": 1e-06, "loss": 0.5911, "mean_token_accuracy": 0.8246276378631592, "num_tokens": 296598043.0, "step": 931 }, { "epoch": 0.948118006103764, "grad_norm": 0.7856141328811646, "learning_rate": 1e-06, "loss": 0.5922, "mean_token_accuracy": 0.8231593370437622, "num_tokens": 296907598.0, "step": 932 }, { "epoch": 0.9491353001017294, "grad_norm": 1.8626831769943237, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8226741552352905, "num_tokens": 297229538.0, "step": 933 }, { "epoch": 0.9501525940996948, "grad_norm": 0.753101646900177, "learning_rate": 1e-06, "loss": 0.6198, "mean_token_accuracy": 0.8162423372268677, "num_tokens": 297545849.0, "step": 934 }, { "epoch": 0.9511698880976602, "grad_norm": 0.7566388249397278, "learning_rate": 1e-06, "loss": 0.6118, "mean_token_accuracy": 0.8176044225692749, "num_tokens": 297857385.0, "step": 935 }, { "epoch": 0.9521871820956256, "grad_norm": 0.7380351424217224, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8211214542388916, "num_tokens": 298178437.0, "step": 936 }, { "epoch": 0.953204476093591, "grad_norm": 0.7686142325401306, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8248642086982727, "num_tokens": 298502544.0, "step": 937 }, { "epoch": 0.9542217700915565, "grad_norm": 0.7463445067405701, "learning_rate": 1e-06, "loss": 0.6089, "mean_token_accuracy": 0.8178445100784302, "num_tokens": 298813928.0, "step": 938 }, { "epoch": 0.9552390640895219, "grad_norm": 0.7960903644561768, "learning_rate": 1e-06, "loss": 0.5806, "mean_token_accuracy": 0.8256028294563293, "num_tokens": 299145054.0, "step": 939 }, { "epoch": 0.9562563580874873, "grad_norm": 0.708060622215271, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8219953775405884, "num_tokens": 299484449.0, "step": 940 }, { "epoch": 0.9572736520854527, "grad_norm": 0.7917901277542114, "learning_rate": 1e-06, "loss": 0.6106, "mean_token_accuracy": 0.8174960613250732, "num_tokens": 299821866.0, "step": 941 }, { "epoch": 0.9582909460834181, "grad_norm": 0.7996972799301147, "learning_rate": 1e-06, "loss": 0.6084, "mean_token_accuracy": 0.8205087184906006, "num_tokens": 300122149.0, "step": 942 }, { "epoch": 0.9593082400813835, "grad_norm": 0.7606682777404785, "learning_rate": 1e-06, "loss": 0.607, "mean_token_accuracy": 0.8185034990310669, "num_tokens": 300434761.0, "step": 943 }, { "epoch": 0.960325534079349, "grad_norm": 0.7440558075904846, "learning_rate": 1e-06, "loss": 0.6108, "mean_token_accuracy": 0.8182252645492554, "num_tokens": 300761476.0, "step": 944 }, { "epoch": 0.9613428280773143, "grad_norm": 0.760749101638794, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8271734714508057, "num_tokens": 301078714.0, "step": 945 }, { "epoch": 0.9623601220752798, "grad_norm": 0.8019530773162842, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.8223601579666138, "num_tokens": 301390887.0, "step": 946 }, { "epoch": 0.9633774160732451, "grad_norm": 0.7818415760993958, "learning_rate": 1e-06, "loss": 0.6072, "mean_token_accuracy": 0.8200531005859375, "num_tokens": 301709240.0, "step": 947 }, { "epoch": 0.9643947100712106, "grad_norm": 0.7479201555252075, "learning_rate": 1e-06, "loss": 0.61, "mean_token_accuracy": 0.8179908990859985, "num_tokens": 302020968.0, "step": 948 }, { "epoch": 0.965412004069176, "grad_norm": 0.7905112504959106, "learning_rate": 1e-06, "loss": 0.6384, "mean_token_accuracy": 0.8104801774024963, "num_tokens": 302349196.0, "step": 949 }, { "epoch": 0.9664292980671414, "grad_norm": 0.743297815322876, "learning_rate": 1e-06, "loss": 0.6185, "mean_token_accuracy": 0.8159749507904053, "num_tokens": 302661848.0, "step": 950 }, { "epoch": 0.9674465920651069, "grad_norm": 0.7662633657455444, "learning_rate": 1e-06, "loss": 0.6067, "mean_token_accuracy": 0.8191525936126709, "num_tokens": 302985335.0, "step": 951 }, { "epoch": 0.9684638860630722, "grad_norm": 0.739526093006134, "learning_rate": 1e-06, "loss": 0.5799, "mean_token_accuracy": 0.8264555931091309, "num_tokens": 303312885.0, "step": 952 }, { "epoch": 0.9694811800610377, "grad_norm": 0.7629678845405579, "learning_rate": 1e-06, "loss": 0.6088, "mean_token_accuracy": 0.8180946111679077, "num_tokens": 303632397.0, "step": 953 }, { "epoch": 0.970498474059003, "grad_norm": 0.7920119166374207, "learning_rate": 1e-06, "loss": 0.6134, "mean_token_accuracy": 0.8164322376251221, "num_tokens": 303954402.0, "step": 954 }, { "epoch": 0.9715157680569685, "grad_norm": 0.7756142616271973, "learning_rate": 1e-06, "loss": 0.6094, "mean_token_accuracy": 0.819657564163208, "num_tokens": 304285608.0, "step": 955 }, { "epoch": 0.9725330620549338, "grad_norm": 0.7486905455589294, "learning_rate": 1e-06, "loss": 0.5684, "mean_token_accuracy": 0.8285978436470032, "num_tokens": 304604968.0, "step": 956 }, { "epoch": 0.9735503560528993, "grad_norm": 0.7525914907455444, "learning_rate": 1e-06, "loss": 0.5843, "mean_token_accuracy": 0.8244077563285828, "num_tokens": 304919739.0, "step": 957 }, { "epoch": 0.9745676500508647, "grad_norm": 0.7886344194412231, "learning_rate": 1e-06, "loss": 0.6057, "mean_token_accuracy": 0.8192185759544373, "num_tokens": 305245025.0, "step": 958 }, { "epoch": 0.9755849440488301, "grad_norm": 0.7626937627792358, "learning_rate": 1e-06, "loss": 0.5983, "mean_token_accuracy": 0.8203018307685852, "num_tokens": 305554045.0, "step": 959 }, { "epoch": 0.9766022380467956, "grad_norm": 0.7469524145126343, "learning_rate": 1e-06, "loss": 0.5814, "mean_token_accuracy": 0.8262737989425659, "num_tokens": 305880937.0, "step": 960 }, { "epoch": 0.9776195320447609, "grad_norm": 0.7448502779006958, "learning_rate": 1e-06, "loss": 0.5921, "mean_token_accuracy": 0.8239433765411377, "num_tokens": 306205585.0, "step": 961 }, { "epoch": 0.9786368260427264, "grad_norm": 0.8299108147621155, "learning_rate": 1e-06, "loss": 0.5972, "mean_token_accuracy": 0.8226567506790161, "num_tokens": 306521564.0, "step": 962 }, { "epoch": 0.9796541200406917, "grad_norm": 0.7663608193397522, "learning_rate": 1e-06, "loss": 0.6119, "mean_token_accuracy": 0.81708824634552, "num_tokens": 306832682.0, "step": 963 }, { "epoch": 0.9806714140386572, "grad_norm": 0.7992986440658569, "learning_rate": 1e-06, "loss": 0.577, "mean_token_accuracy": 0.8270239233970642, "num_tokens": 307155638.0, "step": 964 }, { "epoch": 0.9816887080366226, "grad_norm": 0.7808005213737488, "learning_rate": 1e-06, "loss": 0.5832, "mean_token_accuracy": 0.8254443407058716, "num_tokens": 307461975.0, "step": 965 }, { "epoch": 0.982706002034588, "grad_norm": 0.7484624981880188, "learning_rate": 1e-06, "loss": 0.6133, "mean_token_accuracy": 0.816098690032959, "num_tokens": 307787106.0, "step": 966 }, { "epoch": 0.9837232960325534, "grad_norm": 0.7650085091590881, "learning_rate": 1e-06, "loss": 0.6086, "mean_token_accuracy": 0.8195856809616089, "num_tokens": 308114670.0, "step": 967 }, { "epoch": 0.9847405900305188, "grad_norm": 0.7795204520225525, "learning_rate": 1e-06, "loss": 0.6082, "mean_token_accuracy": 0.8185417056083679, "num_tokens": 308438717.0, "step": 968 }, { "epoch": 0.9857578840284842, "grad_norm": 0.7380291819572449, "learning_rate": 1e-06, "loss": 0.5781, "mean_token_accuracy": 0.82597416639328, "num_tokens": 308770389.0, "step": 969 }, { "epoch": 0.9867751780264497, "grad_norm": 0.7749535441398621, "learning_rate": 1e-06, "loss": 0.5905, "mean_token_accuracy": 0.8239677548408508, "num_tokens": 309098467.0, "step": 970 }, { "epoch": 0.987792472024415, "grad_norm": 0.8078527450561523, "learning_rate": 1e-06, "loss": 0.6127, "mean_token_accuracy": 0.8165298700332642, "num_tokens": 309430767.0, "step": 971 }, { "epoch": 0.9888097660223805, "grad_norm": 0.8138400316238403, "learning_rate": 1e-06, "loss": 0.6065, "mean_token_accuracy": 0.8182769417762756, "num_tokens": 309743337.0, "step": 972 }, { "epoch": 0.9898270600203459, "grad_norm": 0.7963126301765442, "learning_rate": 1e-06, "loss": 0.6171, "mean_token_accuracy": 0.816325306892395, "num_tokens": 310051593.0, "step": 973 }, { "epoch": 0.9908443540183113, "grad_norm": 0.7809932231903076, "learning_rate": 1e-06, "loss": 0.5941, "mean_token_accuracy": 0.8219630718231201, "num_tokens": 310376369.0, "step": 974 }, { "epoch": 0.9918616480162767, "grad_norm": 0.7698544263839722, "learning_rate": 1e-06, "loss": 0.6147, "mean_token_accuracy": 0.8174964189529419, "num_tokens": 310705795.0, "step": 975 }, { "epoch": 0.9928789420142421, "grad_norm": 0.8088345527648926, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8205104470252991, "num_tokens": 311023220.0, "step": 976 }, { "epoch": 0.9938962360122076, "grad_norm": 0.8139130473136902, "learning_rate": 1e-06, "loss": 0.616, "mean_token_accuracy": 0.8165026903152466, "num_tokens": 311361294.0, "step": 977 }, { "epoch": 0.9949135300101729, "grad_norm": 0.7873548269271851, "learning_rate": 1e-06, "loss": 0.6413, "mean_token_accuracy": 0.8094485402107239, "num_tokens": 311682933.0, "step": 978 }, { "epoch": 0.9959308240081384, "grad_norm": 0.7542492747306824, "learning_rate": 1e-06, "loss": 0.6026, "mean_token_accuracy": 0.820980429649353, "num_tokens": 311997757.0, "step": 979 }, { "epoch": 0.9969481180061037, "grad_norm": 0.8661382794380188, "learning_rate": 1e-06, "loss": 0.5982, "mean_token_accuracy": 0.8212977051734924, "num_tokens": 312300232.0, "step": 980 }, { "epoch": 0.9979654120040692, "grad_norm": 0.805185079574585, "learning_rate": 1e-06, "loss": 0.5992, "mean_token_accuracy": 0.8209521770477295, "num_tokens": 312617272.0, "step": 981 }, { "epoch": 0.9989827060020345, "grad_norm": 0.7660844922065735, "learning_rate": 1e-06, "loss": 0.6015, "mean_token_accuracy": 0.821892261505127, "num_tokens": 312940535.0, "step": 982 }, { "epoch": 1.0, "grad_norm": 0.7682638168334961, "learning_rate": 1e-06, "loss": 0.6044, "mean_token_accuracy": 0.8197771906852722, "num_tokens": 313255150.0, "step": 983 }, { "epoch": 1.0010172939979654, "grad_norm": 0.7689452171325684, "learning_rate": 1e-06, "loss": 0.582, "mean_token_accuracy": 0.8259244561195374, "num_tokens": 313576480.0, "step": 984 }, { "epoch": 1.002034587995931, "grad_norm": 0.820095419883728, "learning_rate": 1e-06, "loss": 0.6209, "mean_token_accuracy": 0.814440906047821, "num_tokens": 313894362.0, "step": 985 }, { "epoch": 1.0030518819938963, "grad_norm": 0.7715836763381958, "learning_rate": 1e-06, "loss": 0.601, "mean_token_accuracy": 0.8188121914863586, "num_tokens": 314199033.0, "step": 986 }, { "epoch": 1.0040691759918616, "grad_norm": 0.733830988407135, "learning_rate": 1e-06, "loss": 0.5837, "mean_token_accuracy": 0.8249683976173401, "num_tokens": 314529806.0, "step": 987 }, { "epoch": 1.005086469989827, "grad_norm": 0.8194170594215393, "learning_rate": 1e-06, "loss": 0.5945, "mean_token_accuracy": 0.822218120098114, "num_tokens": 314836980.0, "step": 988 }, { "epoch": 1.0061037639877926, "grad_norm": 0.7957749366760254, "learning_rate": 1e-06, "loss": 0.6269, "mean_token_accuracy": 0.8144529461860657, "num_tokens": 315155851.0, "step": 989 }, { "epoch": 1.007121057985758, "grad_norm": 0.7519081234931946, "learning_rate": 1e-06, "loss": 0.5834, "mean_token_accuracy": 0.8247135877609253, "num_tokens": 315477553.0, "step": 990 }, { "epoch": 1.0081383519837233, "grad_norm": 0.8048916459083557, "learning_rate": 1e-06, "loss": 0.5977, "mean_token_accuracy": 0.820995032787323, "num_tokens": 315789205.0, "step": 991 }, { "epoch": 1.0091556459816886, "grad_norm": 0.756081223487854, "learning_rate": 1e-06, "loss": 0.6002, "mean_token_accuracy": 0.820772111415863, "num_tokens": 316108041.0, "step": 992 }, { "epoch": 1.0101729399796542, "grad_norm": 0.7621251344680786, "learning_rate": 1e-06, "loss": 0.5943, "mean_token_accuracy": 0.8217575550079346, "num_tokens": 316433915.0, "step": 993 }, { "epoch": 1.0111902339776195, "grad_norm": 0.7687362432479858, "learning_rate": 1e-06, "loss": 0.5982, "mean_token_accuracy": 0.8215383291244507, "num_tokens": 316764032.0, "step": 994 }, { "epoch": 1.0122075279755849, "grad_norm": 0.7695761919021606, "learning_rate": 1e-06, "loss": 0.6062, "mean_token_accuracy": 0.8193082213401794, "num_tokens": 317068693.0, "step": 995 }, { "epoch": 1.0132248219735505, "grad_norm": 0.7918261885643005, "learning_rate": 1e-06, "loss": 0.592, "mean_token_accuracy": 0.8233439922332764, "num_tokens": 317399513.0, "step": 996 }, { "epoch": 1.0142421159715158, "grad_norm": 0.755750834941864, "learning_rate": 1e-06, "loss": 0.5862, "mean_token_accuracy": 0.8246799111366272, "num_tokens": 317722640.0, "step": 997 }, { "epoch": 1.0152594099694812, "grad_norm": 0.7813019752502441, "learning_rate": 1e-06, "loss": 0.5833, "mean_token_accuracy": 0.8261044025421143, "num_tokens": 318033418.0, "step": 998 }, { "epoch": 1.0162767039674465, "grad_norm": 0.7828826308250427, "learning_rate": 1e-06, "loss": 0.6031, "mean_token_accuracy": 0.8192239999771118, "num_tokens": 318339258.0, "step": 999 }, { "epoch": 1.017293997965412, "grad_norm": 0.7472410798072815, "learning_rate": 1e-06, "loss": 0.5792, "mean_token_accuracy": 0.8274403810501099, "num_tokens": 318648663.0, "step": 1000 }, { "epoch": 1.0183112919633774, "grad_norm": 0.795993983745575, "learning_rate": 1e-06, "loss": 0.598, "mean_token_accuracy": 0.820968747138977, "num_tokens": 318960576.0, "step": 1001 }, { "epoch": 1.0193285859613428, "grad_norm": 0.7590869665145874, "learning_rate": 1e-06, "loss": 0.5855, "mean_token_accuracy": 0.8260129690170288, "num_tokens": 319276087.0, "step": 1002 }, { "epoch": 1.0203458799593081, "grad_norm": 0.7744798064231873, "learning_rate": 1e-06, "loss": 0.5849, "mean_token_accuracy": 0.8254764676094055, "num_tokens": 319576787.0, "step": 1003 }, { "epoch": 1.0213631739572737, "grad_norm": 0.7490171194076538, "learning_rate": 1e-06, "loss": 0.5948, "mean_token_accuracy": 0.8221466541290283, "num_tokens": 319902904.0, "step": 1004 }, { "epoch": 1.022380467955239, "grad_norm": 0.738842248916626, "learning_rate": 1e-06, "loss": 0.5995, "mean_token_accuracy": 0.8217558264732361, "num_tokens": 320212284.0, "step": 1005 }, { "epoch": 1.0233977619532044, "grad_norm": 0.7941291332244873, "learning_rate": 1e-06, "loss": 0.604, "mean_token_accuracy": 0.8189691305160522, "num_tokens": 320529125.0, "step": 1006 }, { "epoch": 1.02441505595117, "grad_norm": 0.7658363580703735, "learning_rate": 1e-06, "loss": 0.5914, "mean_token_accuracy": 0.8234286308288574, "num_tokens": 320834351.0, "step": 1007 }, { "epoch": 1.0254323499491353, "grad_norm": 0.7774712443351746, "learning_rate": 1e-06, "loss": 0.5955, "mean_token_accuracy": 0.8215651512145996, "num_tokens": 321155407.0, "step": 1008 }, { "epoch": 1.0264496439471007, "grad_norm": 0.8093451857566833, "learning_rate": 1e-06, "loss": 0.6158, "mean_token_accuracy": 0.816012978553772, "num_tokens": 321466786.0, "step": 1009 }, { "epoch": 1.027466937945066, "grad_norm": 0.7412152290344238, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8279814124107361, "num_tokens": 321794738.0, "step": 1010 }, { "epoch": 1.0284842319430316, "grad_norm": 0.7972337603569031, "learning_rate": 1e-06, "loss": 0.6011, "mean_token_accuracy": 0.8213940262794495, "num_tokens": 322105270.0, "step": 1011 }, { "epoch": 1.029501525940997, "grad_norm": 0.7478787899017334, "learning_rate": 1e-06, "loss": 0.5958, "mean_token_accuracy": 0.8207067251205444, "num_tokens": 322430631.0, "step": 1012 }, { "epoch": 1.0305188199389623, "grad_norm": 0.7256690859794617, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.8281009197235107, "num_tokens": 322762321.0, "step": 1013 }, { "epoch": 1.0315361139369277, "grad_norm": 0.8089573979377747, "learning_rate": 1e-06, "loss": 0.601, "mean_token_accuracy": 0.8199090957641602, "num_tokens": 323079150.0, "step": 1014 }, { "epoch": 1.0325534079348933, "grad_norm": 0.7496512532234192, "learning_rate": 1e-06, "loss": 0.601, "mean_token_accuracy": 0.8193944692611694, "num_tokens": 323409267.0, "step": 1015 }, { "epoch": 1.0335707019328586, "grad_norm": 0.7756277322769165, "learning_rate": 1e-06, "loss": 0.6172, "mean_token_accuracy": 0.8161155581474304, "num_tokens": 323717438.0, "step": 1016 }, { "epoch": 1.034587995930824, "grad_norm": 0.7738403081893921, "learning_rate": 1e-06, "loss": 0.6066, "mean_token_accuracy": 0.8196358680725098, "num_tokens": 324033814.0, "step": 1017 }, { "epoch": 1.0356052899287893, "grad_norm": 0.807456910610199, "learning_rate": 1e-06, "loss": 0.6016, "mean_token_accuracy": 0.8201157450675964, "num_tokens": 324339938.0, "step": 1018 }, { "epoch": 1.0366225839267549, "grad_norm": 0.8129794001579285, "learning_rate": 1e-06, "loss": 0.5997, "mean_token_accuracy": 0.8207684755325317, "num_tokens": 324649242.0, "step": 1019 }, { "epoch": 1.0376398779247202, "grad_norm": 0.747581958770752, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8210676908493042, "num_tokens": 324973812.0, "step": 1020 }, { "epoch": 1.0386571719226856, "grad_norm": 0.7553842067718506, "learning_rate": 1e-06, "loss": 0.6006, "mean_token_accuracy": 0.8205328583717346, "num_tokens": 325293984.0, "step": 1021 }, { "epoch": 1.0396744659206512, "grad_norm": 0.7708786725997925, "learning_rate": 1e-06, "loss": 0.6125, "mean_token_accuracy": 0.8168396949768066, "num_tokens": 325602970.0, "step": 1022 }, { "epoch": 1.0406917599186165, "grad_norm": 0.7987100481987, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8218681812286377, "num_tokens": 325898114.0, "step": 1023 }, { "epoch": 1.0417090539165819, "grad_norm": 0.8853242993354797, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8202185034751892, "num_tokens": 326210039.0, "step": 1024 }, { "epoch": 1.0427263479145472, "grad_norm": 0.7377429008483887, "learning_rate": 1e-06, "loss": 0.6201, "mean_token_accuracy": 0.8149514198303223, "num_tokens": 326539486.0, "step": 1025 }, { "epoch": 1.0437436419125128, "grad_norm": 0.7787431478500366, "learning_rate": 1e-06, "loss": 0.594, "mean_token_accuracy": 0.82254958152771, "num_tokens": 326855696.0, "step": 1026 }, { "epoch": 1.0447609359104781, "grad_norm": 0.7603839635848999, "learning_rate": 1e-06, "loss": 0.6047, "mean_token_accuracy": 0.8195805549621582, "num_tokens": 327187580.0, "step": 1027 }, { "epoch": 1.0457782299084435, "grad_norm": 0.753252387046814, "learning_rate": 1e-06, "loss": 0.6052, "mean_token_accuracy": 0.8188362121582031, "num_tokens": 327507342.0, "step": 1028 }, { "epoch": 1.0467955239064088, "grad_norm": 0.7527574300765991, "learning_rate": 1e-06, "loss": 0.6086, "mean_token_accuracy": 0.8182722926139832, "num_tokens": 327836633.0, "step": 1029 }, { "epoch": 1.0478128179043744, "grad_norm": 0.784575879573822, "learning_rate": 1e-06, "loss": 0.5906, "mean_token_accuracy": 0.822380006313324, "num_tokens": 328133022.0, "step": 1030 }, { "epoch": 1.0488301119023398, "grad_norm": 0.8077467083930969, "learning_rate": 1e-06, "loss": 0.6029, "mean_token_accuracy": 0.8195399045944214, "num_tokens": 328442910.0, "step": 1031 }, { "epoch": 1.0498474059003051, "grad_norm": 0.7402403950691223, "learning_rate": 1e-06, "loss": 0.5968, "mean_token_accuracy": 0.8225548267364502, "num_tokens": 328766960.0, "step": 1032 }, { "epoch": 1.0508646998982707, "grad_norm": 0.7321717739105225, "learning_rate": 1e-06, "loss": 0.6137, "mean_token_accuracy": 0.8174277544021606, "num_tokens": 329079757.0, "step": 1033 }, { "epoch": 1.051881993896236, "grad_norm": 0.8269543647766113, "learning_rate": 1e-06, "loss": 0.617, "mean_token_accuracy": 0.8168300986289978, "num_tokens": 329396255.0, "step": 1034 }, { "epoch": 1.0528992878942014, "grad_norm": 0.8111584186553955, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8299026489257812, "num_tokens": 329691299.0, "step": 1035 }, { "epoch": 1.0539165818921667, "grad_norm": 0.7589960694313049, "learning_rate": 1e-06, "loss": 0.5891, "mean_token_accuracy": 0.8235443830490112, "num_tokens": 330014371.0, "step": 1036 }, { "epoch": 1.0549338758901323, "grad_norm": 0.7154140472412109, "learning_rate": 1e-06, "loss": 0.5974, "mean_token_accuracy": 0.822191596031189, "num_tokens": 330363604.0, "step": 1037 }, { "epoch": 1.0559511698880977, "grad_norm": 0.7874635457992554, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8294987082481384, "num_tokens": 330698211.0, "step": 1038 }, { "epoch": 1.056968463886063, "grad_norm": 0.741408109664917, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8210251927375793, "num_tokens": 331039546.0, "step": 1039 }, { "epoch": 1.0579857578840284, "grad_norm": 0.7418220043182373, "learning_rate": 1e-06, "loss": 0.5898, "mean_token_accuracy": 0.8246545791625977, "num_tokens": 331349908.0, "step": 1040 }, { "epoch": 1.059003051881994, "grad_norm": 0.7552366852760315, "learning_rate": 1e-06, "loss": 0.5981, "mean_token_accuracy": 0.8202588558197021, "num_tokens": 331685842.0, "step": 1041 }, { "epoch": 1.0600203458799593, "grad_norm": 0.7713178396224976, "learning_rate": 1e-06, "loss": 0.584, "mean_token_accuracy": 0.825634777545929, "num_tokens": 332002207.0, "step": 1042 }, { "epoch": 1.0610376398779247, "grad_norm": 0.7259511351585388, "learning_rate": 1e-06, "loss": 0.5722, "mean_token_accuracy": 0.8279554843902588, "num_tokens": 332313613.0, "step": 1043 }, { "epoch": 1.0620549338758902, "grad_norm": 0.7211476564407349, "learning_rate": 1e-06, "loss": 0.5923, "mean_token_accuracy": 0.8233799934387207, "num_tokens": 332638608.0, "step": 1044 }, { "epoch": 1.0630722278738556, "grad_norm": 0.794397234916687, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8254649639129639, "num_tokens": 332951297.0, "step": 1045 }, { "epoch": 1.064089521871821, "grad_norm": 0.7747459411621094, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.8188462257385254, "num_tokens": 333249289.0, "step": 1046 }, { "epoch": 1.0651068158697863, "grad_norm": 0.7619302868843079, "learning_rate": 1e-06, "loss": 0.5583, "mean_token_accuracy": 0.8311722278594971, "num_tokens": 333565896.0, "step": 1047 }, { "epoch": 1.0661241098677519, "grad_norm": 0.7596496939659119, "learning_rate": 1e-06, "loss": 0.5967, "mean_token_accuracy": 0.8213114738464355, "num_tokens": 333906023.0, "step": 1048 }, { "epoch": 1.0671414038657172, "grad_norm": 0.7507315278053284, "learning_rate": 1e-06, "loss": 0.613, "mean_token_accuracy": 0.8168172240257263, "num_tokens": 334238798.0, "step": 1049 }, { "epoch": 1.0681586978636826, "grad_norm": 0.7711279988288879, "learning_rate": 1e-06, "loss": 0.6209, "mean_token_accuracy": 0.8149176836013794, "num_tokens": 334559804.0, "step": 1050 }, { "epoch": 1.069175991861648, "grad_norm": 0.8109130263328552, "learning_rate": 1e-06, "loss": 0.5891, "mean_token_accuracy": 0.8220605254173279, "num_tokens": 334867946.0, "step": 1051 }, { "epoch": 1.0701932858596135, "grad_norm": 0.7828776240348816, "learning_rate": 1e-06, "loss": 0.5962, "mean_token_accuracy": 0.8221853971481323, "num_tokens": 335181587.0, "step": 1052 }, { "epoch": 1.0712105798575788, "grad_norm": 0.7226819396018982, "learning_rate": 1e-06, "loss": 0.5974, "mean_token_accuracy": 0.8222191333770752, "num_tokens": 335509021.0, "step": 1053 }, { "epoch": 1.0722278738555442, "grad_norm": 0.7938230037689209, "learning_rate": 1e-06, "loss": 0.5934, "mean_token_accuracy": 0.822351336479187, "num_tokens": 335809158.0, "step": 1054 }, { "epoch": 1.0732451678535098, "grad_norm": 0.7728512287139893, "learning_rate": 1e-06, "loss": 0.5964, "mean_token_accuracy": 0.8209540843963623, "num_tokens": 336119431.0, "step": 1055 }, { "epoch": 1.0742624618514751, "grad_norm": 0.7619753479957581, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8202822804450989, "num_tokens": 336442944.0, "step": 1056 }, { "epoch": 1.0752797558494405, "grad_norm": 0.7428168654441833, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.8283674120903015, "num_tokens": 336743733.0, "step": 1057 }, { "epoch": 1.0762970498474058, "grad_norm": 0.7476255893707275, "learning_rate": 1e-06, "loss": 0.6108, "mean_token_accuracy": 0.8177371621131897, "num_tokens": 337080544.0, "step": 1058 }, { "epoch": 1.0773143438453714, "grad_norm": 0.7603205442428589, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.8195057511329651, "num_tokens": 337392278.0, "step": 1059 }, { "epoch": 1.0783316378433367, "grad_norm": 0.7620111703872681, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.8295987844467163, "num_tokens": 337706603.0, "step": 1060 }, { "epoch": 1.079348931841302, "grad_norm": 0.7455660104751587, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8240776062011719, "num_tokens": 338026298.0, "step": 1061 }, { "epoch": 1.0803662258392674, "grad_norm": 0.809394121170044, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.8286706209182739, "num_tokens": 338365205.0, "step": 1062 }, { "epoch": 1.081383519837233, "grad_norm": 0.8363968729972839, "learning_rate": 1e-06, "loss": 0.6057, "mean_token_accuracy": 0.8185662627220154, "num_tokens": 338668185.0, "step": 1063 }, { "epoch": 1.0824008138351984, "grad_norm": 0.7580261826515198, "learning_rate": 1e-06, "loss": 0.593, "mean_token_accuracy": 0.8213092088699341, "num_tokens": 338990507.0, "step": 1064 }, { "epoch": 1.0834181078331637, "grad_norm": 0.7513942122459412, "learning_rate": 1e-06, "loss": 0.5915, "mean_token_accuracy": 0.8225909471511841, "num_tokens": 339326912.0, "step": 1065 }, { "epoch": 1.0844354018311293, "grad_norm": 0.7773120999336243, "learning_rate": 1e-06, "loss": 0.6044, "mean_token_accuracy": 0.8196120262145996, "num_tokens": 339637655.0, "step": 1066 }, { "epoch": 1.0854526958290946, "grad_norm": 0.77564537525177, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.8285472989082336, "num_tokens": 339944200.0, "step": 1067 }, { "epoch": 1.08646998982706, "grad_norm": 0.763636589050293, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8201035261154175, "num_tokens": 340252788.0, "step": 1068 }, { "epoch": 1.0874872838250254, "grad_norm": 0.827096700668335, "learning_rate": 1e-06, "loss": 0.595, "mean_token_accuracy": 0.8229066133499146, "num_tokens": 340571484.0, "step": 1069 }, { "epoch": 1.088504577822991, "grad_norm": 0.7606673240661621, "learning_rate": 1e-06, "loss": 0.592, "mean_token_accuracy": 0.8228622674942017, "num_tokens": 340886244.0, "step": 1070 }, { "epoch": 1.0895218718209563, "grad_norm": 0.7811551094055176, "learning_rate": 1e-06, "loss": 0.5916, "mean_token_accuracy": 0.8230052590370178, "num_tokens": 341199202.0, "step": 1071 }, { "epoch": 1.0905391658189216, "grad_norm": 0.7782750129699707, "learning_rate": 1e-06, "loss": 0.596, "mean_token_accuracy": 0.8207575082778931, "num_tokens": 341532738.0, "step": 1072 }, { "epoch": 1.091556459816887, "grad_norm": 0.797938346862793, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.8228356838226318, "num_tokens": 341847416.0, "step": 1073 }, { "epoch": 1.0925737538148526, "grad_norm": 0.7769486308097839, "learning_rate": 1e-06, "loss": 0.5859, "mean_token_accuracy": 0.8257655501365662, "num_tokens": 342175374.0, "step": 1074 }, { "epoch": 1.093591047812818, "grad_norm": 0.7653997540473938, "learning_rate": 1e-06, "loss": 0.6121, "mean_token_accuracy": 0.8170241713523865, "num_tokens": 342506392.0, "step": 1075 }, { "epoch": 1.0946083418107833, "grad_norm": 0.8226693868637085, "learning_rate": 1e-06, "loss": 0.6029, "mean_token_accuracy": 0.8193036317825317, "num_tokens": 342830993.0, "step": 1076 }, { "epoch": 1.0956256358087488, "grad_norm": 0.7961437106132507, "learning_rate": 1e-06, "loss": 0.5906, "mean_token_accuracy": 0.8232593536376953, "num_tokens": 343141065.0, "step": 1077 }, { "epoch": 1.0966429298067142, "grad_norm": 0.7740011215209961, "learning_rate": 1e-06, "loss": 0.623, "mean_token_accuracy": 0.8138188719749451, "num_tokens": 343447392.0, "step": 1078 }, { "epoch": 1.0976602238046795, "grad_norm": 0.7739623188972473, "learning_rate": 1e-06, "loss": 0.5849, "mean_token_accuracy": 0.8247867822647095, "num_tokens": 343757619.0, "step": 1079 }, { "epoch": 1.0986775178026449, "grad_norm": 0.7831132411956787, "learning_rate": 1e-06, "loss": 0.5934, "mean_token_accuracy": 0.8220590949058533, "num_tokens": 344073814.0, "step": 1080 }, { "epoch": 1.0996948118006105, "grad_norm": 0.854607880115509, "learning_rate": 1e-06, "loss": 0.5891, "mean_token_accuracy": 0.8229290246963501, "num_tokens": 344391441.0, "step": 1081 }, { "epoch": 1.1007121057985758, "grad_norm": 0.749823808670044, "learning_rate": 1e-06, "loss": 0.5955, "mean_token_accuracy": 0.8226028680801392, "num_tokens": 344712984.0, "step": 1082 }, { "epoch": 1.1017293997965412, "grad_norm": 0.7485523819923401, "learning_rate": 1e-06, "loss": 0.6069, "mean_token_accuracy": 0.8197723627090454, "num_tokens": 345036641.0, "step": 1083 }, { "epoch": 1.1027466937945065, "grad_norm": 0.7741460800170898, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.8218555450439453, "num_tokens": 345369577.0, "step": 1084 }, { "epoch": 1.103763987792472, "grad_norm": 0.7494634389877319, "learning_rate": 1e-06, "loss": 0.5755, "mean_token_accuracy": 0.8278588056564331, "num_tokens": 345683367.0, "step": 1085 }, { "epoch": 1.1047812817904374, "grad_norm": 0.7270079851150513, "learning_rate": 1e-06, "loss": 0.6191, "mean_token_accuracy": 0.816685676574707, "num_tokens": 346003193.0, "step": 1086 }, { "epoch": 1.1057985757884028, "grad_norm": 0.7581070065498352, "learning_rate": 1e-06, "loss": 0.5947, "mean_token_accuracy": 0.8216639757156372, "num_tokens": 346330561.0, "step": 1087 }, { "epoch": 1.1068158697863684, "grad_norm": 0.7528476715087891, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8204837441444397, "num_tokens": 346660957.0, "step": 1088 }, { "epoch": 1.1078331637843337, "grad_norm": 0.759814977645874, "learning_rate": 1e-06, "loss": 0.5687, "mean_token_accuracy": 0.8286617398262024, "num_tokens": 346990623.0, "step": 1089 }, { "epoch": 1.108850457782299, "grad_norm": 0.7554075121879578, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.8215787410736084, "num_tokens": 347324012.0, "step": 1090 }, { "epoch": 1.1098677517802644, "grad_norm": 0.7750098705291748, "learning_rate": 1e-06, "loss": 0.6022, "mean_token_accuracy": 0.8182422518730164, "num_tokens": 347632344.0, "step": 1091 }, { "epoch": 1.11088504577823, "grad_norm": 0.8283107280731201, "learning_rate": 1e-06, "loss": 0.608, "mean_token_accuracy": 0.816761314868927, "num_tokens": 347925739.0, "step": 1092 }, { "epoch": 1.1119023397761953, "grad_norm": 0.7343229055404663, "learning_rate": 1e-06, "loss": 0.5687, "mean_token_accuracy": 0.8283286094665527, "num_tokens": 348240499.0, "step": 1093 }, { "epoch": 1.1129196337741607, "grad_norm": 0.7427061796188354, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8270567655563354, "num_tokens": 348555987.0, "step": 1094 }, { "epoch": 1.113936927772126, "grad_norm": 0.7688320279121399, "learning_rate": 1e-06, "loss": 0.5914, "mean_token_accuracy": 0.8231028318405151, "num_tokens": 348866347.0, "step": 1095 }, { "epoch": 1.1149542217700916, "grad_norm": 0.7548975944519043, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8277769088745117, "num_tokens": 349194754.0, "step": 1096 }, { "epoch": 1.115971515768057, "grad_norm": 0.7943713665008545, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.8240779638290405, "num_tokens": 349501086.0, "step": 1097 }, { "epoch": 1.1169888097660223, "grad_norm": 0.7876549363136292, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.826018214225769, "num_tokens": 349818869.0, "step": 1098 }, { "epoch": 1.118006103763988, "grad_norm": 0.7462044954299927, "learning_rate": 1e-06, "loss": 0.5821, "mean_token_accuracy": 0.8253153562545776, "num_tokens": 350142280.0, "step": 1099 }, { "epoch": 1.1190233977619533, "grad_norm": 0.7265391945838928, "learning_rate": 1e-06, "loss": 0.5832, "mean_token_accuracy": 0.8255075216293335, "num_tokens": 350469513.0, "step": 1100 }, { "epoch": 1.1200406917599186, "grad_norm": 0.7540880441665649, "learning_rate": 1e-06, "loss": 0.579, "mean_token_accuracy": 0.8258617520332336, "num_tokens": 350784696.0, "step": 1101 }, { "epoch": 1.121057985757884, "grad_norm": 0.722629964351654, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.832967221736908, "num_tokens": 351109045.0, "step": 1102 }, { "epoch": 1.1220752797558495, "grad_norm": 0.7430449724197388, "learning_rate": 1e-06, "loss": 0.5703, "mean_token_accuracy": 0.8284896612167358, "num_tokens": 351433256.0, "step": 1103 }, { "epoch": 1.1230925737538149, "grad_norm": 0.7905130982398987, "learning_rate": 1e-06, "loss": 0.6125, "mean_token_accuracy": 0.8168379664421082, "num_tokens": 351734672.0, "step": 1104 }, { "epoch": 1.1241098677517802, "grad_norm": 0.8053008317947388, "learning_rate": 1e-06, "loss": 0.5882, "mean_token_accuracy": 0.823253870010376, "num_tokens": 352042249.0, "step": 1105 }, { "epoch": 1.1251271617497456, "grad_norm": 0.7345619797706604, "learning_rate": 1e-06, "loss": 0.5926, "mean_token_accuracy": 0.8202496767044067, "num_tokens": 352358733.0, "step": 1106 }, { "epoch": 1.1261444557477112, "grad_norm": 0.7487010955810547, "learning_rate": 1e-06, "loss": 0.596, "mean_token_accuracy": 0.8215106725692749, "num_tokens": 352667221.0, "step": 1107 }, { "epoch": 1.1271617497456765, "grad_norm": 0.7505720853805542, "learning_rate": 1e-06, "loss": 0.5795, "mean_token_accuracy": 0.8260778784751892, "num_tokens": 352982322.0, "step": 1108 }, { "epoch": 1.1281790437436419, "grad_norm": 0.7573304176330566, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8235150575637817, "num_tokens": 353294238.0, "step": 1109 }, { "epoch": 1.1291963377416074, "grad_norm": 0.7672238349914551, "learning_rate": 1e-06, "loss": 0.6029, "mean_token_accuracy": 0.820686936378479, "num_tokens": 353617488.0, "step": 1110 }, { "epoch": 1.1302136317395728, "grad_norm": 0.7361064553260803, "learning_rate": 1e-06, "loss": 0.5867, "mean_token_accuracy": 0.823577880859375, "num_tokens": 353942973.0, "step": 1111 }, { "epoch": 1.1312309257375381, "grad_norm": 0.757715106010437, "learning_rate": 1e-06, "loss": 0.616, "mean_token_accuracy": 0.8163123726844788, "num_tokens": 354241839.0, "step": 1112 }, { "epoch": 1.1322482197355035, "grad_norm": 0.7401450872421265, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8200206160545349, "num_tokens": 354575014.0, "step": 1113 }, { "epoch": 1.133265513733469, "grad_norm": 0.8054563999176025, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8230452537536621, "num_tokens": 354915712.0, "step": 1114 }, { "epoch": 1.1342828077314344, "grad_norm": 0.7494530081748962, "learning_rate": 1e-06, "loss": 0.6118, "mean_token_accuracy": 0.8168686032295227, "num_tokens": 355237653.0, "step": 1115 }, { "epoch": 1.1353001017293998, "grad_norm": 0.7508543133735657, "learning_rate": 1e-06, "loss": 0.5861, "mean_token_accuracy": 0.8246638774871826, "num_tokens": 355582832.0, "step": 1116 }, { "epoch": 1.1363173957273651, "grad_norm": 0.788264811038971, "learning_rate": 1e-06, "loss": 0.599, "mean_token_accuracy": 0.8205045461654663, "num_tokens": 355882190.0, "step": 1117 }, { "epoch": 1.1373346897253307, "grad_norm": 0.7862522006034851, "learning_rate": 1e-06, "loss": 0.5765, "mean_token_accuracy": 0.8265331983566284, "num_tokens": 356192221.0, "step": 1118 }, { "epoch": 1.138351983723296, "grad_norm": 0.7364363074302673, "learning_rate": 1e-06, "loss": 0.6191, "mean_token_accuracy": 0.8141418099403381, "num_tokens": 356515011.0, "step": 1119 }, { "epoch": 1.1393692777212614, "grad_norm": 0.7794057726860046, "learning_rate": 1e-06, "loss": 0.5846, "mean_token_accuracy": 0.8244831562042236, "num_tokens": 356840072.0, "step": 1120 }, { "epoch": 1.140386571719227, "grad_norm": 0.7552722692489624, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8201184272766113, "num_tokens": 357164387.0, "step": 1121 }, { "epoch": 1.1414038657171923, "grad_norm": 0.7549647092819214, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8276758193969727, "num_tokens": 357480123.0, "step": 1122 }, { "epoch": 1.1424211597151577, "grad_norm": 0.8171870112419128, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.8288565874099731, "num_tokens": 357799511.0, "step": 1123 }, { "epoch": 1.143438453713123, "grad_norm": 0.7676132321357727, "learning_rate": 1e-06, "loss": 0.6055, "mean_token_accuracy": 0.8197697997093201, "num_tokens": 358095805.0, "step": 1124 }, { "epoch": 1.1444557477110886, "grad_norm": 0.7719829082489014, "learning_rate": 1e-06, "loss": 0.5827, "mean_token_accuracy": 0.8251679539680481, "num_tokens": 358405765.0, "step": 1125 }, { "epoch": 1.145473041709054, "grad_norm": 0.7176077365875244, "learning_rate": 1e-06, "loss": 0.6303, "mean_token_accuracy": 0.8127435445785522, "num_tokens": 358741998.0, "step": 1126 }, { "epoch": 1.1464903357070193, "grad_norm": 0.778710126876831, "learning_rate": 1e-06, "loss": 0.5636, "mean_token_accuracy": 0.8297712802886963, "num_tokens": 359053024.0, "step": 1127 }, { "epoch": 1.1475076297049847, "grad_norm": 0.7981055378913879, "learning_rate": 1e-06, "loss": 0.5856, "mean_token_accuracy": 0.8239826560020447, "num_tokens": 359375453.0, "step": 1128 }, { "epoch": 1.1485249237029502, "grad_norm": 0.7974837422370911, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.8285097479820251, "num_tokens": 359692458.0, "step": 1129 }, { "epoch": 1.1495422177009156, "grad_norm": 0.7408716082572937, "learning_rate": 1e-06, "loss": 0.5945, "mean_token_accuracy": 0.8209438323974609, "num_tokens": 360008843.0, "step": 1130 }, { "epoch": 1.150559511698881, "grad_norm": 0.7501970529556274, "learning_rate": 1e-06, "loss": 0.5763, "mean_token_accuracy": 0.8257219791412354, "num_tokens": 360333592.0, "step": 1131 }, { "epoch": 1.1515768056968465, "grad_norm": 0.7652875781059265, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.8355057835578918, "num_tokens": 360646033.0, "step": 1132 }, { "epoch": 1.1525940996948119, "grad_norm": 0.7454234957695007, "learning_rate": 1e-06, "loss": 0.6028, "mean_token_accuracy": 0.8191195726394653, "num_tokens": 360980106.0, "step": 1133 }, { "epoch": 1.1536113936927772, "grad_norm": 0.7779508829116821, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8256955742835999, "num_tokens": 361296869.0, "step": 1134 }, { "epoch": 1.1546286876907426, "grad_norm": 0.7592409253120422, "learning_rate": 1e-06, "loss": 0.5898, "mean_token_accuracy": 0.8217758536338806, "num_tokens": 361597141.0, "step": 1135 }, { "epoch": 1.155645981688708, "grad_norm": 0.7225300073623657, "learning_rate": 1e-06, "loss": 0.5861, "mean_token_accuracy": 0.8229520320892334, "num_tokens": 361931768.0, "step": 1136 }, { "epoch": 1.1566632756866735, "grad_norm": 0.7548590302467346, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8324354887008667, "num_tokens": 362246811.0, "step": 1137 }, { "epoch": 1.1576805696846388, "grad_norm": 0.7888747453689575, "learning_rate": 1e-06, "loss": 0.5868, "mean_token_accuracy": 0.8239426612854004, "num_tokens": 362555139.0, "step": 1138 }, { "epoch": 1.1586978636826042, "grad_norm": 0.7801425457000732, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.827373743057251, "num_tokens": 362881363.0, "step": 1139 }, { "epoch": 1.1597151576805698, "grad_norm": 0.7782330513000488, "learning_rate": 1e-06, "loss": 0.5955, "mean_token_accuracy": 0.8224520683288574, "num_tokens": 363207013.0, "step": 1140 }, { "epoch": 1.1607324516785351, "grad_norm": 0.7746996283531189, "learning_rate": 1e-06, "loss": 0.5777, "mean_token_accuracy": 0.8252719640731812, "num_tokens": 363501174.0, "step": 1141 }, { "epoch": 1.1617497456765005, "grad_norm": 0.7277313470840454, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8253401517868042, "num_tokens": 363831350.0, "step": 1142 }, { "epoch": 1.162767039674466, "grad_norm": 0.7452883124351501, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8252953290939331, "num_tokens": 364161537.0, "step": 1143 }, { "epoch": 1.1637843336724314, "grad_norm": 0.7810433506965637, "learning_rate": 1e-06, "loss": 0.5834, "mean_token_accuracy": 0.8254508972167969, "num_tokens": 364468701.0, "step": 1144 }, { "epoch": 1.1648016276703967, "grad_norm": 0.7638463377952576, "learning_rate": 1e-06, "loss": 0.5852, "mean_token_accuracy": 0.8235447406768799, "num_tokens": 364793011.0, "step": 1145 }, { "epoch": 1.165818921668362, "grad_norm": 0.8116283416748047, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.8259240984916687, "num_tokens": 365125068.0, "step": 1146 }, { "epoch": 1.1668362156663274, "grad_norm": 0.7863451242446899, "learning_rate": 1e-06, "loss": 0.6043, "mean_token_accuracy": 0.8196288347244263, "num_tokens": 365447819.0, "step": 1147 }, { "epoch": 1.167853509664293, "grad_norm": 0.7274298071861267, "learning_rate": 1e-06, "loss": 0.5945, "mean_token_accuracy": 0.8230388760566711, "num_tokens": 365766838.0, "step": 1148 }, { "epoch": 1.1688708036622584, "grad_norm": 0.7949771285057068, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8283715844154358, "num_tokens": 366101878.0, "step": 1149 }, { "epoch": 1.1698880976602237, "grad_norm": 0.7356235384941101, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.8254322409629822, "num_tokens": 366439188.0, "step": 1150 }, { "epoch": 1.1709053916581893, "grad_norm": 0.7703425884246826, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8203462362289429, "num_tokens": 366769572.0, "step": 1151 }, { "epoch": 1.1719226856561547, "grad_norm": 0.7374350428581238, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8288112878799438, "num_tokens": 367097208.0, "step": 1152 }, { "epoch": 1.17293997965412, "grad_norm": 0.7855170369148254, "learning_rate": 1e-06, "loss": 0.616, "mean_token_accuracy": 0.8170003890991211, "num_tokens": 367424929.0, "step": 1153 }, { "epoch": 1.1739572736520856, "grad_norm": 0.7788858413696289, "learning_rate": 1e-06, "loss": 0.5843, "mean_token_accuracy": 0.8242186903953552, "num_tokens": 367753127.0, "step": 1154 }, { "epoch": 1.174974567650051, "grad_norm": 0.7844034433364868, "learning_rate": 1e-06, "loss": 0.5919, "mean_token_accuracy": 0.8226374387741089, "num_tokens": 368073652.0, "step": 1155 }, { "epoch": 1.1759918616480163, "grad_norm": 0.7838830947875977, "learning_rate": 1e-06, "loss": 0.5805, "mean_token_accuracy": 0.8258245587348938, "num_tokens": 368389034.0, "step": 1156 }, { "epoch": 1.1770091556459816, "grad_norm": 0.7512738704681396, "learning_rate": 1e-06, "loss": 0.6029, "mean_token_accuracy": 0.820223867893219, "num_tokens": 368721238.0, "step": 1157 }, { "epoch": 1.178026449643947, "grad_norm": 0.7941290140151978, "learning_rate": 1e-06, "loss": 0.5941, "mean_token_accuracy": 0.8232244253158569, "num_tokens": 369039106.0, "step": 1158 }, { "epoch": 1.1790437436419126, "grad_norm": 0.7485936284065247, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.8281660079956055, "num_tokens": 369375622.0, "step": 1159 }, { "epoch": 1.180061037639878, "grad_norm": 0.7521687150001526, "learning_rate": 1e-06, "loss": 0.605, "mean_token_accuracy": 0.8194353580474854, "num_tokens": 369688789.0, "step": 1160 }, { "epoch": 1.1810783316378433, "grad_norm": 0.775185227394104, "learning_rate": 1e-06, "loss": 0.6054, "mean_token_accuracy": 0.8191851377487183, "num_tokens": 369998967.0, "step": 1161 }, { "epoch": 1.1820956256358088, "grad_norm": 0.7778295278549194, "learning_rate": 1e-06, "loss": 0.5954, "mean_token_accuracy": 0.8214291334152222, "num_tokens": 370333473.0, "step": 1162 }, { "epoch": 1.1831129196337742, "grad_norm": 0.7511019706726074, "learning_rate": 1e-06, "loss": 0.5936, "mean_token_accuracy": 0.82322096824646, "num_tokens": 370660466.0, "step": 1163 }, { "epoch": 1.1841302136317395, "grad_norm": 0.780828058719635, "learning_rate": 1e-06, "loss": 0.614, "mean_token_accuracy": 0.8165101408958435, "num_tokens": 370998386.0, "step": 1164 }, { "epoch": 1.1851475076297049, "grad_norm": 0.7184352278709412, "learning_rate": 1e-06, "loss": 0.5826, "mean_token_accuracy": 0.8255602717399597, "num_tokens": 371332751.0, "step": 1165 }, { "epoch": 1.1861648016276705, "grad_norm": 0.7472013831138611, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8212395906448364, "num_tokens": 371662573.0, "step": 1166 }, { "epoch": 1.1871820956256358, "grad_norm": 0.7376736998558044, "learning_rate": 1e-06, "loss": 0.5599, "mean_token_accuracy": 0.8313839435577393, "num_tokens": 371986831.0, "step": 1167 }, { "epoch": 1.1881993896236012, "grad_norm": 0.8291728496551514, "learning_rate": 1e-06, "loss": 0.625, "mean_token_accuracy": 0.8143845200538635, "num_tokens": 372315615.0, "step": 1168 }, { "epoch": 1.1892166836215665, "grad_norm": 0.7506025433540344, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8245192766189575, "num_tokens": 372638897.0, "step": 1169 }, { "epoch": 1.190233977619532, "grad_norm": 0.7672327160835266, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.8251274824142456, "num_tokens": 372956645.0, "step": 1170 }, { "epoch": 1.1912512716174974, "grad_norm": 1.3121378421783447, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8294004201889038, "num_tokens": 373265980.0, "step": 1171 }, { "epoch": 1.1922685656154628, "grad_norm": 0.7915225028991699, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.821526825428009, "num_tokens": 373575839.0, "step": 1172 }, { "epoch": 1.1932858596134284, "grad_norm": 0.7794643044471741, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8254324793815613, "num_tokens": 373896838.0, "step": 1173 }, { "epoch": 1.1943031536113937, "grad_norm": 0.7948567271232605, "learning_rate": 1e-06, "loss": 0.5828, "mean_token_accuracy": 0.8252392411231995, "num_tokens": 374210420.0, "step": 1174 }, { "epoch": 1.195320447609359, "grad_norm": 0.7596521973609924, "learning_rate": 1e-06, "loss": 0.5487, "mean_token_accuracy": 0.8344128727912903, "num_tokens": 374539265.0, "step": 1175 }, { "epoch": 1.1963377416073244, "grad_norm": 0.8085728287696838, "learning_rate": 1e-06, "loss": 0.5841, "mean_token_accuracy": 0.8254034519195557, "num_tokens": 374846358.0, "step": 1176 }, { "epoch": 1.19735503560529, "grad_norm": 0.8072423338890076, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8295195698738098, "num_tokens": 375157542.0, "step": 1177 }, { "epoch": 1.1983723296032553, "grad_norm": 0.7838948965072632, "learning_rate": 1e-06, "loss": 0.5819, "mean_token_accuracy": 0.825675368309021, "num_tokens": 375472279.0, "step": 1178 }, { "epoch": 1.1993896236012207, "grad_norm": 0.8102052807807922, "learning_rate": 1e-06, "loss": 0.5754, "mean_token_accuracy": 0.8277257680892944, "num_tokens": 375785251.0, "step": 1179 }, { "epoch": 1.200406917599186, "grad_norm": 0.7381642460823059, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.828418493270874, "num_tokens": 376099013.0, "step": 1180 }, { "epoch": 1.2014242115971516, "grad_norm": 0.7651596665382385, "learning_rate": 1e-06, "loss": 0.589, "mean_token_accuracy": 0.8231362700462341, "num_tokens": 376414415.0, "step": 1181 }, { "epoch": 1.202441505595117, "grad_norm": 0.8105350136756897, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8211319446563721, "num_tokens": 376731071.0, "step": 1182 }, { "epoch": 1.2034587995930823, "grad_norm": 0.8020378947257996, "learning_rate": 1e-06, "loss": 0.5978, "mean_token_accuracy": 0.8202518820762634, "num_tokens": 377061505.0, "step": 1183 }, { "epoch": 1.204476093591048, "grad_norm": 0.7923497557640076, "learning_rate": 1e-06, "loss": 0.5976, "mean_token_accuracy": 0.8226475119590759, "num_tokens": 377377739.0, "step": 1184 }, { "epoch": 1.2054933875890133, "grad_norm": 0.7810748219490051, "learning_rate": 1e-06, "loss": 0.602, "mean_token_accuracy": 0.8202524185180664, "num_tokens": 377694342.0, "step": 1185 }, { "epoch": 1.2065106815869786, "grad_norm": 0.7620840668678284, "learning_rate": 1e-06, "loss": 0.5758, "mean_token_accuracy": 0.8272120952606201, "num_tokens": 378013523.0, "step": 1186 }, { "epoch": 1.207527975584944, "grad_norm": 0.8037322759628296, "learning_rate": 1e-06, "loss": 0.5737, "mean_token_accuracy": 0.8277493715286255, "num_tokens": 378315219.0, "step": 1187 }, { "epoch": 1.2085452695829095, "grad_norm": 0.7999018430709839, "learning_rate": 1e-06, "loss": 0.5714, "mean_token_accuracy": 0.8289440870285034, "num_tokens": 378622003.0, "step": 1188 }, { "epoch": 1.2095625635808749, "grad_norm": 0.7701825499534607, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8280704617500305, "num_tokens": 378932580.0, "step": 1189 }, { "epoch": 1.2105798575788402, "grad_norm": 0.7750142216682434, "learning_rate": 1e-06, "loss": 0.6029, "mean_token_accuracy": 0.8203139305114746, "num_tokens": 379261815.0, "step": 1190 }, { "epoch": 1.2115971515768056, "grad_norm": 0.7240380644798279, "learning_rate": 1e-06, "loss": 0.592, "mean_token_accuracy": 0.8214856386184692, "num_tokens": 379589270.0, "step": 1191 }, { "epoch": 1.2126144455747712, "grad_norm": 0.7719489336013794, "learning_rate": 1e-06, "loss": 0.5922, "mean_token_accuracy": 0.8215484023094177, "num_tokens": 379905958.0, "step": 1192 }, { "epoch": 1.2136317395727365, "grad_norm": 0.7766221761703491, "learning_rate": 1e-06, "loss": 0.5864, "mean_token_accuracy": 0.8235345482826233, "num_tokens": 380215696.0, "step": 1193 }, { "epoch": 1.2146490335707019, "grad_norm": 0.7447682023048401, "learning_rate": 1e-06, "loss": 0.5661, "mean_token_accuracy": 0.8295247554779053, "num_tokens": 380552451.0, "step": 1194 }, { "epoch": 1.2156663275686674, "grad_norm": 0.7715691924095154, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.826104462146759, "num_tokens": 380862532.0, "step": 1195 }, { "epoch": 1.2166836215666328, "grad_norm": 0.7756348252296448, "learning_rate": 1e-06, "loss": 0.5795, "mean_token_accuracy": 0.8258826732635498, "num_tokens": 381181297.0, "step": 1196 }, { "epoch": 1.2177009155645981, "grad_norm": 0.7937551140785217, "learning_rate": 1e-06, "loss": 0.6209, "mean_token_accuracy": 0.8146166205406189, "num_tokens": 381492918.0, "step": 1197 }, { "epoch": 1.2187182095625635, "grad_norm": 0.7768622040748596, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.8206217288970947, "num_tokens": 381810610.0, "step": 1198 }, { "epoch": 1.219735503560529, "grad_norm": 0.7419608235359192, "learning_rate": 1e-06, "loss": 0.5715, "mean_token_accuracy": 0.8271506428718567, "num_tokens": 382127627.0, "step": 1199 }, { "epoch": 1.2207527975584944, "grad_norm": 0.7592210173606873, "learning_rate": 1e-06, "loss": 0.5842, "mean_token_accuracy": 0.824863076210022, "num_tokens": 382453657.0, "step": 1200 }, { "epoch": 1.2217700915564598, "grad_norm": 0.8112965226173401, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.8280565738677979, "num_tokens": 382781691.0, "step": 1201 }, { "epoch": 1.2227873855544251, "grad_norm": 0.7668476700782776, "learning_rate": 1e-06, "loss": 0.5805, "mean_token_accuracy": 0.8258017897605896, "num_tokens": 383107927.0, "step": 1202 }, { "epoch": 1.2238046795523907, "grad_norm": 1.771498680114746, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.8235318660736084, "num_tokens": 383429967.0, "step": 1203 }, { "epoch": 1.224821973550356, "grad_norm": 0.7665342688560486, "learning_rate": 1e-06, "loss": 0.5858, "mean_token_accuracy": 0.8229868412017822, "num_tokens": 383744521.0, "step": 1204 }, { "epoch": 1.2258392675483214, "grad_norm": 0.7280278205871582, "learning_rate": 1e-06, "loss": 0.5783, "mean_token_accuracy": 0.8258658647537231, "num_tokens": 384070051.0, "step": 1205 }, { "epoch": 1.226856561546287, "grad_norm": 0.7722219824790955, "learning_rate": 1e-06, "loss": 0.5996, "mean_token_accuracy": 0.8209736347198486, "num_tokens": 384381225.0, "step": 1206 }, { "epoch": 1.2278738555442523, "grad_norm": 0.7422972917556763, "learning_rate": 1e-06, "loss": 0.5985, "mean_token_accuracy": 0.8206053376197815, "num_tokens": 384710145.0, "step": 1207 }, { "epoch": 1.2288911495422177, "grad_norm": 0.7677599191665649, "learning_rate": 1e-06, "loss": 0.6128, "mean_token_accuracy": 0.817360520362854, "num_tokens": 385030375.0, "step": 1208 }, { "epoch": 1.229908443540183, "grad_norm": 0.7416399717330933, "learning_rate": 1e-06, "loss": 0.6077, "mean_token_accuracy": 0.8186507225036621, "num_tokens": 385361429.0, "step": 1209 }, { "epoch": 1.2309257375381486, "grad_norm": 0.7370989918708801, "learning_rate": 1e-06, "loss": 0.5935, "mean_token_accuracy": 0.8210272789001465, "num_tokens": 385690394.0, "step": 1210 }, { "epoch": 1.231943031536114, "grad_norm": 0.7769769430160522, "learning_rate": 1e-06, "loss": 0.5777, "mean_token_accuracy": 0.8265715837478638, "num_tokens": 386012537.0, "step": 1211 }, { "epoch": 1.2329603255340793, "grad_norm": 0.7492892146110535, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8234988451004028, "num_tokens": 386338087.0, "step": 1212 }, { "epoch": 1.2339776195320447, "grad_norm": 0.7639126181602478, "learning_rate": 1e-06, "loss": 0.5804, "mean_token_accuracy": 0.8244913816452026, "num_tokens": 386663700.0, "step": 1213 }, { "epoch": 1.2349949135300102, "grad_norm": 0.743516743183136, "learning_rate": 1e-06, "loss": 0.5934, "mean_token_accuracy": 0.8219711184501648, "num_tokens": 386979867.0, "step": 1214 }, { "epoch": 1.2360122075279756, "grad_norm": 0.7583151459693909, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.8271145820617676, "num_tokens": 387308229.0, "step": 1215 }, { "epoch": 1.237029501525941, "grad_norm": 0.7758545279502869, "learning_rate": 1e-06, "loss": 0.6084, "mean_token_accuracy": 0.8180948495864868, "num_tokens": 387633541.0, "step": 1216 }, { "epoch": 1.2380467955239065, "grad_norm": 0.7462221384048462, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8287074565887451, "num_tokens": 387945149.0, "step": 1217 }, { "epoch": 1.2390640895218719, "grad_norm": 0.7475136518478394, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.8280231952667236, "num_tokens": 388262139.0, "step": 1218 }, { "epoch": 1.2400813835198372, "grad_norm": 0.8114516139030457, "learning_rate": 1e-06, "loss": 0.5855, "mean_token_accuracy": 0.8244044780731201, "num_tokens": 388567660.0, "step": 1219 }, { "epoch": 1.2410986775178026, "grad_norm": 0.8436469435691833, "learning_rate": 1e-06, "loss": 0.5751, "mean_token_accuracy": 0.8259292840957642, "num_tokens": 388862732.0, "step": 1220 }, { "epoch": 1.2421159715157681, "grad_norm": 0.7752360105514526, "learning_rate": 1e-06, "loss": 0.5798, "mean_token_accuracy": 0.8258293867111206, "num_tokens": 389171597.0, "step": 1221 }, { "epoch": 1.2431332655137335, "grad_norm": 0.7781957983970642, "learning_rate": 1e-06, "loss": 0.5993, "mean_token_accuracy": 0.8197643756866455, "num_tokens": 389471070.0, "step": 1222 }, { "epoch": 1.2441505595116988, "grad_norm": 0.8264309763908386, "learning_rate": 1e-06, "loss": 0.5934, "mean_token_accuracy": 0.8217254877090454, "num_tokens": 389778197.0, "step": 1223 }, { "epoch": 1.2451678535096642, "grad_norm": 0.8131759762763977, "learning_rate": 1e-06, "loss": 0.5843, "mean_token_accuracy": 0.8253227472305298, "num_tokens": 390102274.0, "step": 1224 }, { "epoch": 1.2461851475076298, "grad_norm": 0.7956682443618774, "learning_rate": 1e-06, "loss": 0.6047, "mean_token_accuracy": 0.8182318806648254, "num_tokens": 390422450.0, "step": 1225 }, { "epoch": 1.2472024415055951, "grad_norm": 0.737737238407135, "learning_rate": 1e-06, "loss": 0.5726, "mean_token_accuracy": 0.8269497752189636, "num_tokens": 390752194.0, "step": 1226 }, { "epoch": 1.2482197355035605, "grad_norm": 0.7784290909767151, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8291661739349365, "num_tokens": 391076755.0, "step": 1227 }, { "epoch": 1.249237029501526, "grad_norm": 0.8793082237243652, "learning_rate": 1e-06, "loss": 0.5796, "mean_token_accuracy": 0.8253451585769653, "num_tokens": 391386429.0, "step": 1228 }, { "epoch": 1.2502543234994914, "grad_norm": 0.7870639562606812, "learning_rate": 1e-06, "loss": 0.5929, "mean_token_accuracy": 0.8213008642196655, "num_tokens": 391709893.0, "step": 1229 }, { "epoch": 1.2512716174974567, "grad_norm": 0.7400197982788086, "learning_rate": 1e-06, "loss": 0.5913, "mean_token_accuracy": 0.8226692080497742, "num_tokens": 392024517.0, "step": 1230 }, { "epoch": 1.252288911495422, "grad_norm": 0.7613899111747742, "learning_rate": 1e-06, "loss": 0.5752, "mean_token_accuracy": 0.825670063495636, "num_tokens": 392336854.0, "step": 1231 }, { "epoch": 1.2533062054933877, "grad_norm": 0.8779903650283813, "learning_rate": 1e-06, "loss": 0.5956, "mean_token_accuracy": 0.8210177421569824, "num_tokens": 392656687.0, "step": 1232 }, { "epoch": 1.254323499491353, "grad_norm": 0.9043331742286682, "learning_rate": 1e-06, "loss": 0.6362, "mean_token_accuracy": 0.8101489543914795, "num_tokens": 392972527.0, "step": 1233 }, { "epoch": 1.2553407934893184, "grad_norm": 0.8443557024002075, "learning_rate": 1e-06, "loss": 0.5845, "mean_token_accuracy": 0.8232437372207642, "num_tokens": 393296349.0, "step": 1234 }, { "epoch": 1.2563580874872837, "grad_norm": 0.7689527869224548, "learning_rate": 1e-06, "loss": 0.5515, "mean_token_accuracy": 0.8333369493484497, "num_tokens": 393622385.0, "step": 1235 }, { "epoch": 1.2573753814852493, "grad_norm": 0.7692641615867615, "learning_rate": 1e-06, "loss": 0.577, "mean_token_accuracy": 0.8272202610969543, "num_tokens": 393940347.0, "step": 1236 }, { "epoch": 1.2583926754832147, "grad_norm": 0.8014664649963379, "learning_rate": 1e-06, "loss": 0.596, "mean_token_accuracy": 0.8211422562599182, "num_tokens": 394274909.0, "step": 1237 }, { "epoch": 1.25940996948118, "grad_norm": 0.9445344805717468, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8287132382392883, "num_tokens": 394572404.0, "step": 1238 }, { "epoch": 1.2604272634791456, "grad_norm": 0.887416660785675, "learning_rate": 1e-06, "loss": 0.6244, "mean_token_accuracy": 0.8148610591888428, "num_tokens": 394904157.0, "step": 1239 }, { "epoch": 1.261444557477111, "grad_norm": 0.7784750461578369, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.8244732022285461, "num_tokens": 395205744.0, "step": 1240 }, { "epoch": 1.2624618514750763, "grad_norm": 0.7089627385139465, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.8267613649368286, "num_tokens": 395535591.0, "step": 1241 }, { "epoch": 1.2634791454730416, "grad_norm": 0.7316235303878784, "learning_rate": 1e-06, "loss": 0.5897, "mean_token_accuracy": 0.8229551911354065, "num_tokens": 395887109.0, "step": 1242 }, { "epoch": 1.264496439471007, "grad_norm": 0.7902207970619202, "learning_rate": 1e-06, "loss": 0.6094, "mean_token_accuracy": 0.8176922798156738, "num_tokens": 396215248.0, "step": 1243 }, { "epoch": 1.2655137334689726, "grad_norm": 0.8321451544761658, "learning_rate": 1e-06, "loss": 0.6082, "mean_token_accuracy": 0.8179523944854736, "num_tokens": 396538633.0, "step": 1244 }, { "epoch": 1.266531027466938, "grad_norm": 0.7029138207435608, "learning_rate": 1e-06, "loss": 0.575, "mean_token_accuracy": 0.8264862298965454, "num_tokens": 396867308.0, "step": 1245 }, { "epoch": 1.2675483214649033, "grad_norm": 0.7796322107315063, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8210832476615906, "num_tokens": 397176821.0, "step": 1246 }, { "epoch": 1.2685656154628688, "grad_norm": 0.7757667899131775, "learning_rate": 1e-06, "loss": 0.5633, "mean_token_accuracy": 0.8296244740486145, "num_tokens": 397507718.0, "step": 1247 }, { "epoch": 1.2695829094608342, "grad_norm": 0.7379530072212219, "learning_rate": 1e-06, "loss": 0.5693, "mean_token_accuracy": 0.8277691602706909, "num_tokens": 397832455.0, "step": 1248 }, { "epoch": 1.2706002034587995, "grad_norm": 0.7905023694038391, "learning_rate": 1e-06, "loss": 0.5891, "mean_token_accuracy": 0.8222687244415283, "num_tokens": 398143376.0, "step": 1249 }, { "epoch": 1.2716174974567651, "grad_norm": 0.7380933165550232, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8268264532089233, "num_tokens": 398461934.0, "step": 1250 }, { "epoch": 1.2726347914547305, "grad_norm": 0.7449954152107239, "learning_rate": 1e-06, "loss": 0.5873, "mean_token_accuracy": 0.8227959275245667, "num_tokens": 398775458.0, "step": 1251 }, { "epoch": 1.2736520854526958, "grad_norm": 0.7715222239494324, "learning_rate": 1e-06, "loss": 0.6095, "mean_token_accuracy": 0.8172944188117981, "num_tokens": 399094210.0, "step": 1252 }, { "epoch": 1.2746693794506612, "grad_norm": 0.7808869481086731, "learning_rate": 1e-06, "loss": 0.5787, "mean_token_accuracy": 0.8260170221328735, "num_tokens": 399408637.0, "step": 1253 }, { "epoch": 1.2756866734486265, "grad_norm": 0.7651352286338806, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.824914813041687, "num_tokens": 399726159.0, "step": 1254 }, { "epoch": 1.276703967446592, "grad_norm": 0.7173874974250793, "learning_rate": 1e-06, "loss": 0.5716, "mean_token_accuracy": 0.8283831477165222, "num_tokens": 400058221.0, "step": 1255 }, { "epoch": 1.2777212614445574, "grad_norm": 0.7614690065383911, "learning_rate": 1e-06, "loss": 0.581, "mean_token_accuracy": 0.8249248266220093, "num_tokens": 400381507.0, "step": 1256 }, { "epoch": 1.2787385554425228, "grad_norm": 0.7964950203895569, "learning_rate": 1e-06, "loss": 0.5803, "mean_token_accuracy": 0.8261619806289673, "num_tokens": 400704230.0, "step": 1257 }, { "epoch": 1.2797558494404884, "grad_norm": 0.797745406627655, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.8254767656326294, "num_tokens": 401018958.0, "step": 1258 }, { "epoch": 1.2807731434384537, "grad_norm": 0.7655918598175049, "learning_rate": 1e-06, "loss": 0.5891, "mean_token_accuracy": 0.8233509063720703, "num_tokens": 401325913.0, "step": 1259 }, { "epoch": 1.281790437436419, "grad_norm": 0.776522696018219, "learning_rate": 1e-06, "loss": 0.6031, "mean_token_accuracy": 0.8195526599884033, "num_tokens": 401650213.0, "step": 1260 }, { "epoch": 1.2828077314343846, "grad_norm": 0.7863470315933228, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8247014880180359, "num_tokens": 401952668.0, "step": 1261 }, { "epoch": 1.28382502543235, "grad_norm": 0.7836190462112427, "learning_rate": 1e-06, "loss": 0.5688, "mean_token_accuracy": 0.8274684548377991, "num_tokens": 402266408.0, "step": 1262 }, { "epoch": 1.2848423194303153, "grad_norm": 0.7265501618385315, "learning_rate": 1e-06, "loss": 0.612, "mean_token_accuracy": 0.8172202110290527, "num_tokens": 402595965.0, "step": 1263 }, { "epoch": 1.2858596134282807, "grad_norm": 0.7719841003417969, "learning_rate": 1e-06, "loss": 0.6108, "mean_token_accuracy": 0.815332293510437, "num_tokens": 402919556.0, "step": 1264 }, { "epoch": 1.286876907426246, "grad_norm": 0.7617841362953186, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8226585388183594, "num_tokens": 403234129.0, "step": 1265 }, { "epoch": 1.2878942014242116, "grad_norm": 3.889275312423706, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8203097581863403, "num_tokens": 403545378.0, "step": 1266 }, { "epoch": 1.288911495422177, "grad_norm": 0.8040711879730225, "learning_rate": 1e-06, "loss": 0.6019, "mean_token_accuracy": 0.8193329572677612, "num_tokens": 403879895.0, "step": 1267 }, { "epoch": 1.2899287894201423, "grad_norm": 0.8139612078666687, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8277958631515503, "num_tokens": 404194940.0, "step": 1268 }, { "epoch": 1.290946083418108, "grad_norm": 0.7611631155014038, "learning_rate": 1e-06, "loss": 0.6054, "mean_token_accuracy": 0.8179622888565063, "num_tokens": 404516058.0, "step": 1269 }, { "epoch": 1.2919633774160733, "grad_norm": 0.8590632677078247, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.8203931450843811, "num_tokens": 404823463.0, "step": 1270 }, { "epoch": 1.2929806714140386, "grad_norm": 0.7872003316879272, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8210145831108093, "num_tokens": 405133209.0, "step": 1271 }, { "epoch": 1.2939979654120042, "grad_norm": 0.762592077255249, "learning_rate": 1e-06, "loss": 0.5907, "mean_token_accuracy": 0.8206639885902405, "num_tokens": 405447583.0, "step": 1272 }, { "epoch": 1.2950152594099695, "grad_norm": 0.7649151682853699, "learning_rate": 1e-06, "loss": 0.5651, "mean_token_accuracy": 0.8295191526412964, "num_tokens": 405772749.0, "step": 1273 }, { "epoch": 1.2960325534079349, "grad_norm": 0.7442457675933838, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8269622325897217, "num_tokens": 406102126.0, "step": 1274 }, { "epoch": 1.2970498474059002, "grad_norm": 0.8645238280296326, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8254821300506592, "num_tokens": 406423086.0, "step": 1275 }, { "epoch": 1.2980671414038656, "grad_norm": 0.7339827418327332, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8312309384346008, "num_tokens": 406744290.0, "step": 1276 }, { "epoch": 1.2990844354018312, "grad_norm": 0.8143907189369202, "learning_rate": 1e-06, "loss": 0.5899, "mean_token_accuracy": 0.8225513696670532, "num_tokens": 407056075.0, "step": 1277 }, { "epoch": 1.3001017293997965, "grad_norm": 0.7675595879554749, "learning_rate": 1e-06, "loss": 0.5971, "mean_token_accuracy": 0.8220341801643372, "num_tokens": 407364405.0, "step": 1278 }, { "epoch": 1.3011190233977619, "grad_norm": 0.7461718916893005, "learning_rate": 1e-06, "loss": 0.5769, "mean_token_accuracy": 0.8260219097137451, "num_tokens": 407689878.0, "step": 1279 }, { "epoch": 1.3021363173957274, "grad_norm": 0.7368419170379639, "learning_rate": 1e-06, "loss": 0.5809, "mean_token_accuracy": 0.8259373307228088, "num_tokens": 408019653.0, "step": 1280 }, { "epoch": 1.3031536113936928, "grad_norm": 0.7481878399848938, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8262139558792114, "num_tokens": 408343205.0, "step": 1281 }, { "epoch": 1.3041709053916581, "grad_norm": 0.7598819136619568, "learning_rate": 1e-06, "loss": 0.5879, "mean_token_accuracy": 0.8240863084793091, "num_tokens": 408666686.0, "step": 1282 }, { "epoch": 1.3051881993896237, "grad_norm": 0.753930389881134, "learning_rate": 1e-06, "loss": 0.5968, "mean_token_accuracy": 0.8200889825820923, "num_tokens": 408989130.0, "step": 1283 }, { "epoch": 1.306205493387589, "grad_norm": 0.7269991040229797, "learning_rate": 1e-06, "loss": 0.5751, "mean_token_accuracy": 0.8264918923377991, "num_tokens": 409319697.0, "step": 1284 }, { "epoch": 1.3072227873855544, "grad_norm": 0.7664880156517029, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.8235412836074829, "num_tokens": 409638569.0, "step": 1285 }, { "epoch": 1.3082400813835198, "grad_norm": 0.7126306295394897, "learning_rate": 1e-06, "loss": 0.5911, "mean_token_accuracy": 0.821363091468811, "num_tokens": 409963322.0, "step": 1286 }, { "epoch": 1.3092573753814851, "grad_norm": 0.7578443288803101, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8326585292816162, "num_tokens": 410289774.0, "step": 1287 }, { "epoch": 1.3102746693794507, "grad_norm": 0.7516607046127319, "learning_rate": 1e-06, "loss": 0.5506, "mean_token_accuracy": 0.8333381414413452, "num_tokens": 410603142.0, "step": 1288 }, { "epoch": 1.311291963377416, "grad_norm": 0.7146863341331482, "learning_rate": 1e-06, "loss": 0.596, "mean_token_accuracy": 0.8209625482559204, "num_tokens": 410934281.0, "step": 1289 }, { "epoch": 1.3123092573753814, "grad_norm": 0.7467058300971985, "learning_rate": 1e-06, "loss": 0.5953, "mean_token_accuracy": 0.8207160234451294, "num_tokens": 411262888.0, "step": 1290 }, { "epoch": 1.313326551373347, "grad_norm": 0.8217064142227173, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.8268510103225708, "num_tokens": 411567594.0, "step": 1291 }, { "epoch": 1.3143438453713123, "grad_norm": 0.7733748555183411, "learning_rate": 1e-06, "loss": 0.6042, "mean_token_accuracy": 0.8187816143035889, "num_tokens": 411893042.0, "step": 1292 }, { "epoch": 1.3153611393692777, "grad_norm": 0.7679750919342041, "learning_rate": 1e-06, "loss": 0.5756, "mean_token_accuracy": 0.8263243436813354, "num_tokens": 412215931.0, "step": 1293 }, { "epoch": 1.3163784333672433, "grad_norm": 0.7660022377967834, "learning_rate": 1e-06, "loss": 0.5826, "mean_token_accuracy": 0.8236187100410461, "num_tokens": 412552715.0, "step": 1294 }, { "epoch": 1.3173957273652086, "grad_norm": 0.7751847505569458, "learning_rate": 1e-06, "loss": 0.5957, "mean_token_accuracy": 0.8215477466583252, "num_tokens": 412869340.0, "step": 1295 }, { "epoch": 1.318413021363174, "grad_norm": 0.7322462797164917, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8294398784637451, "num_tokens": 413187434.0, "step": 1296 }, { "epoch": 1.3194303153611393, "grad_norm": 0.7586652040481567, "learning_rate": 1e-06, "loss": 0.605, "mean_token_accuracy": 0.8181220293045044, "num_tokens": 413523255.0, "step": 1297 }, { "epoch": 1.3204476093591047, "grad_norm": 0.7863152027130127, "learning_rate": 1e-06, "loss": 0.5837, "mean_token_accuracy": 0.824659526348114, "num_tokens": 413836107.0, "step": 1298 }, { "epoch": 1.3214649033570702, "grad_norm": 0.8461436629295349, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8312619924545288, "num_tokens": 414156938.0, "step": 1299 }, { "epoch": 1.3224821973550356, "grad_norm": 0.7358280420303345, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.8195974826812744, "num_tokens": 414482061.0, "step": 1300 }, { "epoch": 1.323499491353001, "grad_norm": 0.742594838142395, "learning_rate": 1e-06, "loss": 0.5687, "mean_token_accuracy": 0.8274781703948975, "num_tokens": 414806350.0, "step": 1301 }, { "epoch": 1.3245167853509665, "grad_norm": 0.7661194801330566, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8172125816345215, "num_tokens": 415151776.0, "step": 1302 }, { "epoch": 1.3255340793489319, "grad_norm": 0.7671038508415222, "learning_rate": 1e-06, "loss": 0.5909, "mean_token_accuracy": 0.8222029805183411, "num_tokens": 415475717.0, "step": 1303 }, { "epoch": 1.3265513733468972, "grad_norm": 0.7653346657752991, "learning_rate": 1e-06, "loss": 0.5719, "mean_token_accuracy": 0.8281009197235107, "num_tokens": 415782339.0, "step": 1304 }, { "epoch": 1.3275686673448628, "grad_norm": 0.7612704634666443, "learning_rate": 1e-06, "loss": 0.5821, "mean_token_accuracy": 0.8244327306747437, "num_tokens": 416091384.0, "step": 1305 }, { "epoch": 1.3285859613428281, "grad_norm": 0.7379361391067505, "learning_rate": 1e-06, "loss": 0.6002, "mean_token_accuracy": 0.8189595937728882, "num_tokens": 416408034.0, "step": 1306 }, { "epoch": 1.3296032553407935, "grad_norm": 0.7884662747383118, "learning_rate": 1e-06, "loss": 0.592, "mean_token_accuracy": 0.8236218690872192, "num_tokens": 416715317.0, "step": 1307 }, { "epoch": 1.3306205493387588, "grad_norm": 0.8206257820129395, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.8237992525100708, "num_tokens": 417017357.0, "step": 1308 }, { "epoch": 1.3316378433367242, "grad_norm": 0.7628151178359985, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.8287310600280762, "num_tokens": 417324031.0, "step": 1309 }, { "epoch": 1.3326551373346898, "grad_norm": 0.7664852142333984, "learning_rate": 1e-06, "loss": 0.5819, "mean_token_accuracy": 0.8257479667663574, "num_tokens": 417635848.0, "step": 1310 }, { "epoch": 1.3336724313326551, "grad_norm": 0.717381477355957, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.8340250253677368, "num_tokens": 417948063.0, "step": 1311 }, { "epoch": 1.3346897253306205, "grad_norm": 0.7889302372932434, "learning_rate": 1e-06, "loss": 0.614, "mean_token_accuracy": 0.8164811134338379, "num_tokens": 418263839.0, "step": 1312 }, { "epoch": 1.335707019328586, "grad_norm": 0.7360092997550964, "learning_rate": 1e-06, "loss": 0.5587, "mean_token_accuracy": 0.831147313117981, "num_tokens": 418582350.0, "step": 1313 }, { "epoch": 1.3367243133265514, "grad_norm": 0.8149121999740601, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.8293501138687134, "num_tokens": 418893429.0, "step": 1314 }, { "epoch": 1.3377416073245167, "grad_norm": 0.7356194257736206, "learning_rate": 1e-06, "loss": 0.5821, "mean_token_accuracy": 0.8250120878219604, "num_tokens": 419228134.0, "step": 1315 }, { "epoch": 1.3387589013224823, "grad_norm": 0.7707953453063965, "learning_rate": 1e-06, "loss": 0.6203, "mean_token_accuracy": 0.8141721487045288, "num_tokens": 419546274.0, "step": 1316 }, { "epoch": 1.3397761953204477, "grad_norm": 0.8126129508018494, "learning_rate": 1e-06, "loss": 0.5977, "mean_token_accuracy": 0.8189851641654968, "num_tokens": 419861881.0, "step": 1317 }, { "epoch": 1.340793489318413, "grad_norm": 0.7575045228004456, "learning_rate": 1e-06, "loss": 0.5706, "mean_token_accuracy": 0.8282990455627441, "num_tokens": 420174883.0, "step": 1318 }, { "epoch": 1.3418107833163784, "grad_norm": 0.768957257270813, "learning_rate": 1e-06, "loss": 0.5964, "mean_token_accuracy": 0.8211398124694824, "num_tokens": 420490175.0, "step": 1319 }, { "epoch": 1.3428280773143437, "grad_norm": 0.7741740942001343, "learning_rate": 1e-06, "loss": 0.5533, "mean_token_accuracy": 0.8324936032295227, "num_tokens": 420818659.0, "step": 1320 }, { "epoch": 1.3438453713123093, "grad_norm": 0.8340362310409546, "learning_rate": 1e-06, "loss": 0.5953, "mean_token_accuracy": 0.8220871686935425, "num_tokens": 421147515.0, "step": 1321 }, { "epoch": 1.3448626653102747, "grad_norm": 0.7541214823722839, "learning_rate": 1e-06, "loss": 0.5842, "mean_token_accuracy": 0.8246658444404602, "num_tokens": 421474359.0, "step": 1322 }, { "epoch": 1.34587995930824, "grad_norm": 0.7496918439865112, "learning_rate": 1e-06, "loss": 0.582, "mean_token_accuracy": 0.8251877427101135, "num_tokens": 421795778.0, "step": 1323 }, { "epoch": 1.3468972533062056, "grad_norm": 0.7333158254623413, "learning_rate": 1e-06, "loss": 0.5899, "mean_token_accuracy": 0.8225975036621094, "num_tokens": 422127600.0, "step": 1324 }, { "epoch": 1.347914547304171, "grad_norm": 0.7470682263374329, "learning_rate": 1e-06, "loss": 0.5993, "mean_token_accuracy": 0.8197658658027649, "num_tokens": 422441520.0, "step": 1325 }, { "epoch": 1.3489318413021363, "grad_norm": 0.8310449123382568, "learning_rate": 1e-06, "loss": 0.6106, "mean_token_accuracy": 0.8172779083251953, "num_tokens": 422758073.0, "step": 1326 }, { "epoch": 1.3499491353001019, "grad_norm": 0.8071318864822388, "learning_rate": 1e-06, "loss": 0.5813, "mean_token_accuracy": 0.8248072862625122, "num_tokens": 423072900.0, "step": 1327 }, { "epoch": 1.3509664292980672, "grad_norm": 0.7378259301185608, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.8296499252319336, "num_tokens": 423399159.0, "step": 1328 }, { "epoch": 1.3519837232960326, "grad_norm": 0.7724500298500061, "learning_rate": 1e-06, "loss": 0.5937, "mean_token_accuracy": 0.8213194608688354, "num_tokens": 423723160.0, "step": 1329 }, { "epoch": 1.353001017293998, "grad_norm": 0.7197055816650391, "learning_rate": 1e-06, "loss": 0.534, "mean_token_accuracy": 0.837218165397644, "num_tokens": 424042895.0, "step": 1330 }, { "epoch": 1.3540183112919633, "grad_norm": 0.7320886850357056, "learning_rate": 1e-06, "loss": 0.5708, "mean_token_accuracy": 0.8275708556175232, "num_tokens": 424359175.0, "step": 1331 }, { "epoch": 1.3550356052899288, "grad_norm": 0.7555741667747498, "learning_rate": 1e-06, "loss": 0.5847, "mean_token_accuracy": 0.8243857026100159, "num_tokens": 424688740.0, "step": 1332 }, { "epoch": 1.3560528992878942, "grad_norm": 0.7789037823677063, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8249820470809937, "num_tokens": 425017780.0, "step": 1333 }, { "epoch": 1.3570701932858595, "grad_norm": 0.7404174208641052, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8227372169494629, "num_tokens": 425345218.0, "step": 1334 }, { "epoch": 1.3580874872838251, "grad_norm": 0.7573344707489014, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8235394358634949, "num_tokens": 425660450.0, "step": 1335 }, { "epoch": 1.3591047812817905, "grad_norm": 0.8044156432151794, "learning_rate": 1e-06, "loss": 0.587, "mean_token_accuracy": 0.8230143785476685, "num_tokens": 425961495.0, "step": 1336 }, { "epoch": 1.3601220752797558, "grad_norm": 0.7424970865249634, "learning_rate": 1e-06, "loss": 0.5713, "mean_token_accuracy": 0.826804518699646, "num_tokens": 426290019.0, "step": 1337 }, { "epoch": 1.3611393692777214, "grad_norm": 0.7788193225860596, "learning_rate": 1e-06, "loss": 0.5573, "mean_token_accuracy": 0.8316465020179749, "num_tokens": 426600712.0, "step": 1338 }, { "epoch": 1.3621566632756867, "grad_norm": 0.7808539867401123, "learning_rate": 1e-06, "loss": 0.5879, "mean_token_accuracy": 0.8237320780754089, "num_tokens": 426930227.0, "step": 1339 }, { "epoch": 1.363173957273652, "grad_norm": 0.7088400721549988, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8300147652626038, "num_tokens": 427263929.0, "step": 1340 }, { "epoch": 1.3641912512716174, "grad_norm": 0.8230143189430237, "learning_rate": 1e-06, "loss": 0.5907, "mean_token_accuracy": 0.8224456310272217, "num_tokens": 427581870.0, "step": 1341 }, { "epoch": 1.3652085452695828, "grad_norm": 0.7356793880462646, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.828331470489502, "num_tokens": 427890146.0, "step": 1342 }, { "epoch": 1.3662258392675484, "grad_norm": 0.7339310050010681, "learning_rate": 1e-06, "loss": 0.5848, "mean_token_accuracy": 0.8255443572998047, "num_tokens": 428210521.0, "step": 1343 }, { "epoch": 1.3672431332655137, "grad_norm": 0.7392643094062805, "learning_rate": 1e-06, "loss": 0.5806, "mean_token_accuracy": 0.8248897790908813, "num_tokens": 428539565.0, "step": 1344 }, { "epoch": 1.368260427263479, "grad_norm": 0.7387965321540833, "learning_rate": 1e-06, "loss": 0.5974, "mean_token_accuracy": 0.8200353980064392, "num_tokens": 428868507.0, "step": 1345 }, { "epoch": 1.3692777212614446, "grad_norm": 0.7944315075874329, "learning_rate": 1e-06, "loss": 0.588, "mean_token_accuracy": 0.8235206007957458, "num_tokens": 429192002.0, "step": 1346 }, { "epoch": 1.37029501525941, "grad_norm": 0.7915907502174377, "learning_rate": 1e-06, "loss": 0.5819, "mean_token_accuracy": 0.82527095079422, "num_tokens": 429499970.0, "step": 1347 }, { "epoch": 1.3713123092573754, "grad_norm": 0.7329350709915161, "learning_rate": 1e-06, "loss": 0.5781, "mean_token_accuracy": 0.8260283470153809, "num_tokens": 429822566.0, "step": 1348 }, { "epoch": 1.372329603255341, "grad_norm": 0.7127922773361206, "learning_rate": 1e-06, "loss": 0.5584, "mean_token_accuracy": 0.8313673734664917, "num_tokens": 430138798.0, "step": 1349 }, { "epoch": 1.3733468972533063, "grad_norm": 0.7566921710968018, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.8332539796829224, "num_tokens": 430457184.0, "step": 1350 }, { "epoch": 1.3743641912512716, "grad_norm": 0.7381209135055542, "learning_rate": 1e-06, "loss": 0.5956, "mean_token_accuracy": 0.821380078792572, "num_tokens": 430782101.0, "step": 1351 }, { "epoch": 1.375381485249237, "grad_norm": 0.7444591522216797, "learning_rate": 1e-06, "loss": 0.5606, "mean_token_accuracy": 0.8303647637367249, "num_tokens": 431093719.0, "step": 1352 }, { "epoch": 1.3763987792472023, "grad_norm": 0.7629130482673645, "learning_rate": 1e-06, "loss": 0.5879, "mean_token_accuracy": 0.8233219981193542, "num_tokens": 431420888.0, "step": 1353 }, { "epoch": 1.377416073245168, "grad_norm": 0.7519823312759399, "learning_rate": 1e-06, "loss": 0.5875, "mean_token_accuracy": 0.8230313062667847, "num_tokens": 431747572.0, "step": 1354 }, { "epoch": 1.3784333672431333, "grad_norm": 0.7806484699249268, "learning_rate": 1e-06, "loss": 0.5992, "mean_token_accuracy": 0.8208594918251038, "num_tokens": 432053072.0, "step": 1355 }, { "epoch": 1.3794506612410986, "grad_norm": 0.806029200553894, "learning_rate": 1e-06, "loss": 0.5516, "mean_token_accuracy": 0.8318723440170288, "num_tokens": 432356553.0, "step": 1356 }, { "epoch": 1.3804679552390642, "grad_norm": 0.7654630541801453, "learning_rate": 1e-06, "loss": 0.5951, "mean_token_accuracy": 0.8207912445068359, "num_tokens": 432670085.0, "step": 1357 }, { "epoch": 1.3814852492370295, "grad_norm": 0.7566870450973511, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8170797824859619, "num_tokens": 433004996.0, "step": 1358 }, { "epoch": 1.3825025432349949, "grad_norm": 0.7837706208229065, "learning_rate": 1e-06, "loss": 0.5617, "mean_token_accuracy": 0.830281674861908, "num_tokens": 433309340.0, "step": 1359 }, { "epoch": 1.3835198372329605, "grad_norm": 0.7306072115898132, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.8319963216781616, "num_tokens": 433655158.0, "step": 1360 }, { "epoch": 1.3845371312309258, "grad_norm": 0.7636180520057678, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.8329065442085266, "num_tokens": 433968624.0, "step": 1361 }, { "epoch": 1.3855544252288912, "grad_norm": 0.7410780787467957, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.8241591453552246, "num_tokens": 434295363.0, "step": 1362 }, { "epoch": 1.3865717192268565, "grad_norm": 0.7491535544395447, "learning_rate": 1e-06, "loss": 0.5877, "mean_token_accuracy": 0.8248358964920044, "num_tokens": 434613231.0, "step": 1363 }, { "epoch": 1.3875890132248219, "grad_norm": 0.7772935628890991, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8203805685043335, "num_tokens": 434917053.0, "step": 1364 }, { "epoch": 1.3886063072227874, "grad_norm": 0.7721388936042786, "learning_rate": 1e-06, "loss": 0.6069, "mean_token_accuracy": 0.818004310131073, "num_tokens": 435226320.0, "step": 1365 }, { "epoch": 1.3896236012207528, "grad_norm": 0.9722421169281006, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.8311303853988647, "num_tokens": 435536990.0, "step": 1366 }, { "epoch": 1.3906408952187181, "grad_norm": 0.7618489265441895, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8225003480911255, "num_tokens": 435847048.0, "step": 1367 }, { "epoch": 1.3916581892166837, "grad_norm": 0.7776749134063721, "learning_rate": 1e-06, "loss": 0.6094, "mean_token_accuracy": 0.8165792226791382, "num_tokens": 436161792.0, "step": 1368 }, { "epoch": 1.392675483214649, "grad_norm": 0.7927550673484802, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8167214393615723, "num_tokens": 436497752.0, "step": 1369 }, { "epoch": 1.3936927772126144, "grad_norm": 0.7556279897689819, "learning_rate": 1e-06, "loss": 0.5662, "mean_token_accuracy": 0.8288087844848633, "num_tokens": 436827912.0, "step": 1370 }, { "epoch": 1.39471007121058, "grad_norm": 0.7612072229385376, "learning_rate": 1e-06, "loss": 0.5793, "mean_token_accuracy": 0.8257796168327332, "num_tokens": 437161113.0, "step": 1371 }, { "epoch": 1.3957273652085453, "grad_norm": 0.8022080063819885, "learning_rate": 1e-06, "loss": 0.6024, "mean_token_accuracy": 0.8193477392196655, "num_tokens": 437453477.0, "step": 1372 }, { "epoch": 1.3967446592065107, "grad_norm": 0.8560948967933655, "learning_rate": 1e-06, "loss": 0.5981, "mean_token_accuracy": 0.8201726675033569, "num_tokens": 437749330.0, "step": 1373 }, { "epoch": 1.397761953204476, "grad_norm": 0.7637492418289185, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8309266567230225, "num_tokens": 438064905.0, "step": 1374 }, { "epoch": 1.3987792472024414, "grad_norm": 0.76103675365448, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8285423517227173, "num_tokens": 438374191.0, "step": 1375 }, { "epoch": 1.399796541200407, "grad_norm": 0.7823554873466492, "learning_rate": 1e-06, "loss": 0.5636, "mean_token_accuracy": 0.8292416930198669, "num_tokens": 438672284.0, "step": 1376 }, { "epoch": 1.4008138351983723, "grad_norm": 0.7935237884521484, "learning_rate": 1e-06, "loss": 0.5882, "mean_token_accuracy": 0.8235949277877808, "num_tokens": 438980241.0, "step": 1377 }, { "epoch": 1.4018311291963377, "grad_norm": 0.7849200367927551, "learning_rate": 1e-06, "loss": 0.5714, "mean_token_accuracy": 0.8282375931739807, "num_tokens": 439300154.0, "step": 1378 }, { "epoch": 1.4028484231943033, "grad_norm": 0.7585959434509277, "learning_rate": 1e-06, "loss": 0.5462, "mean_token_accuracy": 0.8329802751541138, "num_tokens": 439612805.0, "step": 1379 }, { "epoch": 1.4038657171922686, "grad_norm": 0.7586443424224854, "learning_rate": 1e-06, "loss": 0.5755, "mean_token_accuracy": 0.8274024724960327, "num_tokens": 439927068.0, "step": 1380 }, { "epoch": 1.404883011190234, "grad_norm": 0.7680992484092712, "learning_rate": 1e-06, "loss": 0.5891, "mean_token_accuracy": 0.8220770359039307, "num_tokens": 440231502.0, "step": 1381 }, { "epoch": 1.4059003051881995, "grad_norm": 0.7801609635353088, "learning_rate": 1e-06, "loss": 0.5859, "mean_token_accuracy": 0.8232927322387695, "num_tokens": 440550613.0, "step": 1382 }, { "epoch": 1.4069175991861649, "grad_norm": 0.7973816990852356, "learning_rate": 1e-06, "loss": 0.5755, "mean_token_accuracy": 0.8271034955978394, "num_tokens": 440878494.0, "step": 1383 }, { "epoch": 1.4079348931841302, "grad_norm": 0.808786153793335, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.8254208564758301, "num_tokens": 441175256.0, "step": 1384 }, { "epoch": 1.4089521871820956, "grad_norm": 0.7621607184410095, "learning_rate": 1e-06, "loss": 0.5789, "mean_token_accuracy": 0.8252738118171692, "num_tokens": 441490539.0, "step": 1385 }, { "epoch": 1.409969481180061, "grad_norm": 0.7777722477912903, "learning_rate": 1e-06, "loss": 0.576, "mean_token_accuracy": 0.8258455991744995, "num_tokens": 441811314.0, "step": 1386 }, { "epoch": 1.4109867751780265, "grad_norm": 0.7797698974609375, "learning_rate": 1e-06, "loss": 0.5694, "mean_token_accuracy": 0.8287750482559204, "num_tokens": 442127917.0, "step": 1387 }, { "epoch": 1.4120040691759919, "grad_norm": 0.777352511882782, "learning_rate": 1e-06, "loss": 0.5979, "mean_token_accuracy": 0.8221017122268677, "num_tokens": 442435182.0, "step": 1388 }, { "epoch": 1.4130213631739572, "grad_norm": 0.74081951379776, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.830318033695221, "num_tokens": 442769476.0, "step": 1389 }, { "epoch": 1.4140386571719228, "grad_norm": 0.7134003639221191, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.8246288299560547, "num_tokens": 443108364.0, "step": 1390 }, { "epoch": 1.4150559511698881, "grad_norm": 0.7414463758468628, "learning_rate": 1e-06, "loss": 0.5665, "mean_token_accuracy": 0.8285084962844849, "num_tokens": 443434228.0, "step": 1391 }, { "epoch": 1.4160732451678535, "grad_norm": 0.7754774689674377, "learning_rate": 1e-06, "loss": 0.5916, "mean_token_accuracy": 0.8222627639770508, "num_tokens": 443753663.0, "step": 1392 }, { "epoch": 1.4170905391658188, "grad_norm": 0.7414329051971436, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.831973135471344, "num_tokens": 444064404.0, "step": 1393 }, { "epoch": 1.4181078331637844, "grad_norm": 0.7357895970344543, "learning_rate": 1e-06, "loss": 0.5848, "mean_token_accuracy": 0.8242871761322021, "num_tokens": 444385872.0, "step": 1394 }, { "epoch": 1.4191251271617498, "grad_norm": 0.7833715081214905, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8255316019058228, "num_tokens": 444686912.0, "step": 1395 }, { "epoch": 1.4201424211597151, "grad_norm": 0.7233708500862122, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.8281997442245483, "num_tokens": 445012426.0, "step": 1396 }, { "epoch": 1.4211597151576805, "grad_norm": 0.6995869874954224, "learning_rate": 1e-06, "loss": 0.5877, "mean_token_accuracy": 0.8228816986083984, "num_tokens": 445343179.0, "step": 1397 }, { "epoch": 1.422177009155646, "grad_norm": 0.7607414722442627, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.8319629430770874, "num_tokens": 445669852.0, "step": 1398 }, { "epoch": 1.4231943031536114, "grad_norm": 0.7545032501220703, "learning_rate": 1e-06, "loss": 0.5778, "mean_token_accuracy": 0.8259891271591187, "num_tokens": 445986396.0, "step": 1399 }, { "epoch": 1.4242115971515767, "grad_norm": 0.7849803566932678, "learning_rate": 1e-06, "loss": 0.5805, "mean_token_accuracy": 0.8252159357070923, "num_tokens": 446284635.0, "step": 1400 }, { "epoch": 1.4252288911495423, "grad_norm": 0.7250545620918274, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8250802755355835, "num_tokens": 446599329.0, "step": 1401 }, { "epoch": 1.4262461851475077, "grad_norm": 0.7383725047111511, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8266249895095825, "num_tokens": 446921393.0, "step": 1402 }, { "epoch": 1.427263479145473, "grad_norm": 0.7174665927886963, "learning_rate": 1e-06, "loss": 0.571, "mean_token_accuracy": 0.8278155326843262, "num_tokens": 447257986.0, "step": 1403 }, { "epoch": 1.4282807731434384, "grad_norm": 0.7408458590507507, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8329218626022339, "num_tokens": 447568033.0, "step": 1404 }, { "epoch": 1.4292980671414037, "grad_norm": 0.771271288394928, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.828801155090332, "num_tokens": 447885294.0, "step": 1405 }, { "epoch": 1.4303153611393693, "grad_norm": 0.7759033441543579, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.8226232528686523, "num_tokens": 448201132.0, "step": 1406 }, { "epoch": 1.4313326551373347, "grad_norm": 0.8168438076972961, "learning_rate": 1e-06, "loss": 0.5863, "mean_token_accuracy": 0.8233036398887634, "num_tokens": 448516015.0, "step": 1407 }, { "epoch": 1.4323499491353, "grad_norm": 0.786461591720581, "learning_rate": 1e-06, "loss": 0.5849, "mean_token_accuracy": 0.823851466178894, "num_tokens": 448828846.0, "step": 1408 }, { "epoch": 1.4333672431332656, "grad_norm": 0.7642593383789062, "learning_rate": 1e-06, "loss": 0.6012, "mean_token_accuracy": 0.820085883140564, "num_tokens": 449153221.0, "step": 1409 }, { "epoch": 1.434384537131231, "grad_norm": 0.7777250409126282, "learning_rate": 1e-06, "loss": 0.5988, "mean_token_accuracy": 0.8208210468292236, "num_tokens": 449462904.0, "step": 1410 }, { "epoch": 1.4354018311291963, "grad_norm": 0.7578172087669373, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.8272188901901245, "num_tokens": 449779209.0, "step": 1411 }, { "epoch": 1.4364191251271619, "grad_norm": 0.8341385126113892, "learning_rate": 1e-06, "loss": 0.6022, "mean_token_accuracy": 0.8189631700515747, "num_tokens": 450077259.0, "step": 1412 }, { "epoch": 1.4374364191251272, "grad_norm": 0.763503909111023, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8248583078384399, "num_tokens": 450408833.0, "step": 1413 }, { "epoch": 1.4384537131230926, "grad_norm": 0.7525264620780945, "learning_rate": 1e-06, "loss": 0.5721, "mean_token_accuracy": 0.8266898989677429, "num_tokens": 450736983.0, "step": 1414 }, { "epoch": 1.439471007121058, "grad_norm": 0.7664144039154053, "learning_rate": 1e-06, "loss": 0.5592, "mean_token_accuracy": 0.8303419351577759, "num_tokens": 451046227.0, "step": 1415 }, { "epoch": 1.4404883011190233, "grad_norm": 0.7939390540122986, "learning_rate": 1e-06, "loss": 0.5837, "mean_token_accuracy": 0.825028657913208, "num_tokens": 451349176.0, "step": 1416 }, { "epoch": 1.4415055951169888, "grad_norm": 0.7194551229476929, "learning_rate": 1e-06, "loss": 0.5816, "mean_token_accuracy": 0.824691891670227, "num_tokens": 451667384.0, "step": 1417 }, { "epoch": 1.4425228891149542, "grad_norm": 0.7452390193939209, "learning_rate": 1e-06, "loss": 0.588, "mean_token_accuracy": 0.8228015303611755, "num_tokens": 451978435.0, "step": 1418 }, { "epoch": 1.4435401831129195, "grad_norm": 0.8130427002906799, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8237577080726624, "num_tokens": 452287174.0, "step": 1419 }, { "epoch": 1.4445574771108851, "grad_norm": 0.7440806031227112, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.821694016456604, "num_tokens": 452595925.0, "step": 1420 }, { "epoch": 1.4455747711088505, "grad_norm": 0.8715015649795532, "learning_rate": 1e-06, "loss": 0.5605, "mean_token_accuracy": 0.8313428163528442, "num_tokens": 452902299.0, "step": 1421 }, { "epoch": 1.4465920651068158, "grad_norm": 0.7727867364883423, "learning_rate": 1e-06, "loss": 0.5882, "mean_token_accuracy": 0.8233987092971802, "num_tokens": 453219512.0, "step": 1422 }, { "epoch": 1.4476093591047814, "grad_norm": 0.7598720192909241, "learning_rate": 1e-06, "loss": 0.5744, "mean_token_accuracy": 0.8273491859436035, "num_tokens": 453528683.0, "step": 1423 }, { "epoch": 1.4486266531027467, "grad_norm": 0.7164946794509888, "learning_rate": 1e-06, "loss": 0.5766, "mean_token_accuracy": 0.826592206954956, "num_tokens": 453856172.0, "step": 1424 }, { "epoch": 1.449643947100712, "grad_norm": 0.7681942582130432, "learning_rate": 1e-06, "loss": 0.5483, "mean_token_accuracy": 0.8334194421768188, "num_tokens": 454181489.0, "step": 1425 }, { "epoch": 1.4506612410986774, "grad_norm": 0.7888099551200867, "learning_rate": 1e-06, "loss": 0.5877, "mean_token_accuracy": 0.8240190744400024, "num_tokens": 454494258.0, "step": 1426 }, { "epoch": 1.4516785350966428, "grad_norm": 0.7697499990463257, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8328573107719421, "num_tokens": 454794488.0, "step": 1427 }, { "epoch": 1.4526958290946084, "grad_norm": 0.7287480235099792, "learning_rate": 1e-06, "loss": 0.601, "mean_token_accuracy": 0.8192815184593201, "num_tokens": 455107150.0, "step": 1428 }, { "epoch": 1.4537131230925737, "grad_norm": 0.7731139063835144, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.8271017074584961, "num_tokens": 455429921.0, "step": 1429 }, { "epoch": 1.454730417090539, "grad_norm": 0.7933812141418457, "learning_rate": 1e-06, "loss": 0.5848, "mean_token_accuracy": 0.8243111371994019, "num_tokens": 455754141.0, "step": 1430 }, { "epoch": 1.4557477110885046, "grad_norm": 0.7570680975914001, "learning_rate": 1e-06, "loss": 0.5713, "mean_token_accuracy": 0.828052818775177, "num_tokens": 456076223.0, "step": 1431 }, { "epoch": 1.45676500508647, "grad_norm": 0.7283380627632141, "learning_rate": 1e-06, "loss": 0.5755, "mean_token_accuracy": 0.8255959749221802, "num_tokens": 456396940.0, "step": 1432 }, { "epoch": 1.4577822990844354, "grad_norm": 0.7613392472267151, "learning_rate": 1e-06, "loss": 0.5603, "mean_token_accuracy": 0.8309558629989624, "num_tokens": 456705322.0, "step": 1433 }, { "epoch": 1.458799593082401, "grad_norm": 0.8308210968971252, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8235096335411072, "num_tokens": 457034824.0, "step": 1434 }, { "epoch": 1.4598168870803663, "grad_norm": 0.7514590620994568, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.8205565214157104, "num_tokens": 457358314.0, "step": 1435 }, { "epoch": 1.4608341810783316, "grad_norm": 0.7398003339767456, "learning_rate": 1e-06, "loss": 0.5647, "mean_token_accuracy": 0.829264760017395, "num_tokens": 457677702.0, "step": 1436 }, { "epoch": 1.461851475076297, "grad_norm": 0.7642903923988342, "learning_rate": 1e-06, "loss": 0.5879, "mean_token_accuracy": 0.8233004808425903, "num_tokens": 458004068.0, "step": 1437 }, { "epoch": 1.4628687690742623, "grad_norm": 0.7861106991767883, "learning_rate": 1e-06, "loss": 0.5935, "mean_token_accuracy": 0.8219960927963257, "num_tokens": 458327986.0, "step": 1438 }, { "epoch": 1.463886063072228, "grad_norm": 0.7153347134590149, "learning_rate": 1e-06, "loss": 0.5716, "mean_token_accuracy": 0.827376127243042, "num_tokens": 458641744.0, "step": 1439 }, { "epoch": 1.4649033570701933, "grad_norm": 0.7323182821273804, "learning_rate": 1e-06, "loss": 0.5684, "mean_token_accuracy": 0.8284355401992798, "num_tokens": 458963974.0, "step": 1440 }, { "epoch": 1.4659206510681586, "grad_norm": 0.8875779509544373, "learning_rate": 1e-06, "loss": 0.5795, "mean_token_accuracy": 0.8269001245498657, "num_tokens": 459273431.0, "step": 1441 }, { "epoch": 1.4669379450661242, "grad_norm": 0.7622124552726746, "learning_rate": 1e-06, "loss": 0.5758, "mean_token_accuracy": 0.8258703351020813, "num_tokens": 459590197.0, "step": 1442 }, { "epoch": 1.4679552390640895, "grad_norm": 0.7376556396484375, "learning_rate": 1e-06, "loss": 0.5796, "mean_token_accuracy": 0.825239896774292, "num_tokens": 459898577.0, "step": 1443 }, { "epoch": 1.4689725330620549, "grad_norm": 0.7311775088310242, "learning_rate": 1e-06, "loss": 0.5639, "mean_token_accuracy": 0.8291976451873779, "num_tokens": 460209370.0, "step": 1444 }, { "epoch": 1.4699898270600205, "grad_norm": 0.7577133178710938, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8328637480735779, "num_tokens": 460538263.0, "step": 1445 }, { "epoch": 1.4710071210579858, "grad_norm": 0.7706095576286316, "learning_rate": 1e-06, "loss": 0.5716, "mean_token_accuracy": 0.8277468681335449, "num_tokens": 460866578.0, "step": 1446 }, { "epoch": 1.4720244150559512, "grad_norm": 0.8589508533477783, "learning_rate": 1e-06, "loss": 0.5949, "mean_token_accuracy": 0.8217111825942993, "num_tokens": 461171630.0, "step": 1447 }, { "epoch": 1.4730417090539165, "grad_norm": 0.7346833348274231, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.8265662789344788, "num_tokens": 461506509.0, "step": 1448 }, { "epoch": 1.4740590030518819, "grad_norm": 0.75523841381073, "learning_rate": 1e-06, "loss": 0.5329, "mean_token_accuracy": 0.8384934663772583, "num_tokens": 461835821.0, "step": 1449 }, { "epoch": 1.4750762970498474, "grad_norm": 0.7664790749549866, "learning_rate": 1e-06, "loss": 0.5588, "mean_token_accuracy": 0.8306655883789062, "num_tokens": 462141478.0, "step": 1450 }, { "epoch": 1.4760935910478128, "grad_norm": 0.8109428882598877, "learning_rate": 1e-06, "loss": 0.5556, "mean_token_accuracy": 0.831562876701355, "num_tokens": 462469829.0, "step": 1451 }, { "epoch": 1.4771108850457781, "grad_norm": 0.726762592792511, "learning_rate": 1e-06, "loss": 0.5665, "mean_token_accuracy": 0.8273746967315674, "num_tokens": 462805295.0, "step": 1452 }, { "epoch": 1.4781281790437437, "grad_norm": 0.7275935411453247, "learning_rate": 1e-06, "loss": 0.5592, "mean_token_accuracy": 0.8319987058639526, "num_tokens": 463137500.0, "step": 1453 }, { "epoch": 1.479145473041709, "grad_norm": 0.7638391256332397, "learning_rate": 1e-06, "loss": 0.593, "mean_token_accuracy": 0.823004424571991, "num_tokens": 463448939.0, "step": 1454 }, { "epoch": 1.4801627670396744, "grad_norm": 0.7693440318107605, "learning_rate": 1e-06, "loss": 0.5908, "mean_token_accuracy": 0.8211554288864136, "num_tokens": 463759585.0, "step": 1455 }, { "epoch": 1.48118006103764, "grad_norm": 0.7615060210227966, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8256450891494751, "num_tokens": 464082388.0, "step": 1456 }, { "epoch": 1.4821973550356053, "grad_norm": 0.713785707950592, "learning_rate": 1e-06, "loss": 0.5819, "mean_token_accuracy": 0.8260461688041687, "num_tokens": 464417202.0, "step": 1457 }, { "epoch": 1.4832146490335707, "grad_norm": 0.7960901856422424, "learning_rate": 1e-06, "loss": 0.5682, "mean_token_accuracy": 0.8279980421066284, "num_tokens": 464725276.0, "step": 1458 }, { "epoch": 1.484231943031536, "grad_norm": 0.795560359954834, "learning_rate": 1e-06, "loss": 0.556, "mean_token_accuracy": 0.8323945999145508, "num_tokens": 465058613.0, "step": 1459 }, { "epoch": 1.4852492370295014, "grad_norm": 0.7435919642448425, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.8298157453536987, "num_tokens": 465376097.0, "step": 1460 }, { "epoch": 1.486266531027467, "grad_norm": 0.7551680207252502, "learning_rate": 1e-06, "loss": 0.606, "mean_token_accuracy": 0.817765474319458, "num_tokens": 465702748.0, "step": 1461 }, { "epoch": 1.4872838250254323, "grad_norm": 0.7732077836990356, "learning_rate": 1e-06, "loss": 0.5458, "mean_token_accuracy": 0.8342182636260986, "num_tokens": 466007913.0, "step": 1462 }, { "epoch": 1.4883011190233977, "grad_norm": 0.7947700619697571, "learning_rate": 1e-06, "loss": 0.586, "mean_token_accuracy": 0.8237362504005432, "num_tokens": 466327362.0, "step": 1463 }, { "epoch": 1.4893184130213633, "grad_norm": 0.7750511765480042, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8315537571907043, "num_tokens": 466638743.0, "step": 1464 }, { "epoch": 1.4903357070193286, "grad_norm": 0.7739797234535217, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.8269380927085876, "num_tokens": 466958501.0, "step": 1465 }, { "epoch": 1.491353001017294, "grad_norm": 0.7865405082702637, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.8227080702781677, "num_tokens": 467271547.0, "step": 1466 }, { "epoch": 1.4923702950152595, "grad_norm": 0.7469372749328613, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.8284210562705994, "num_tokens": 467597696.0, "step": 1467 }, { "epoch": 1.4933875890132249, "grad_norm": 0.7437664866447449, "learning_rate": 1e-06, "loss": 0.5782, "mean_token_accuracy": 0.8262543082237244, "num_tokens": 467925187.0, "step": 1468 }, { "epoch": 1.4944048830111902, "grad_norm": 0.8482660055160522, "learning_rate": 1e-06, "loss": 0.598, "mean_token_accuracy": 0.8197759985923767, "num_tokens": 468226973.0, "step": 1469 }, { "epoch": 1.4954221770091556, "grad_norm": 0.7456377744674683, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8341470956802368, "num_tokens": 468534657.0, "step": 1470 }, { "epoch": 1.496439471007121, "grad_norm": 0.7405015230178833, "learning_rate": 1e-06, "loss": 0.5834, "mean_token_accuracy": 0.8242038488388062, "num_tokens": 468863037.0, "step": 1471 }, { "epoch": 1.4974567650050865, "grad_norm": 0.7640259265899658, "learning_rate": 1e-06, "loss": 0.5837, "mean_token_accuracy": 0.8241820335388184, "num_tokens": 469175522.0, "step": 1472 }, { "epoch": 1.4984740590030519, "grad_norm": 0.7306478023529053, "learning_rate": 1e-06, "loss": 0.5568, "mean_token_accuracy": 0.832508385181427, "num_tokens": 469489877.0, "step": 1473 }, { "epoch": 1.4994913530010172, "grad_norm": 0.7525316476821899, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8232234716415405, "num_tokens": 469823003.0, "step": 1474 }, { "epoch": 1.5005086469989828, "grad_norm": 0.728570282459259, "learning_rate": 1e-06, "loss": 0.5793, "mean_token_accuracy": 0.8248062133789062, "num_tokens": 470149249.0, "step": 1475 }, { "epoch": 1.5015259409969481, "grad_norm": 0.7569950222969055, "learning_rate": 1e-06, "loss": 0.6093, "mean_token_accuracy": 0.8198111653327942, "num_tokens": 470472689.0, "step": 1476 }, { "epoch": 1.5025432349949135, "grad_norm": 0.7983245253562927, "learning_rate": 1e-06, "loss": 0.5828, "mean_token_accuracy": 0.8236798048019409, "num_tokens": 470771648.0, "step": 1477 }, { "epoch": 1.503560528992879, "grad_norm": 0.7313922047615051, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.826393723487854, "num_tokens": 471089333.0, "step": 1478 }, { "epoch": 1.5045778229908442, "grad_norm": 0.739508867263794, "learning_rate": 1e-06, "loss": 0.5782, "mean_token_accuracy": 0.8257728815078735, "num_tokens": 471422317.0, "step": 1479 }, { "epoch": 1.5055951169888098, "grad_norm": 0.8040635585784912, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.8284851908683777, "num_tokens": 471730047.0, "step": 1480 }, { "epoch": 1.5066124109867753, "grad_norm": 0.7771586775779724, "learning_rate": 1e-06, "loss": 0.5927, "mean_token_accuracy": 0.8217354416847229, "num_tokens": 472056255.0, "step": 1481 }, { "epoch": 1.5076297049847405, "grad_norm": 0.7523510456085205, "learning_rate": 1e-06, "loss": 0.5751, "mean_token_accuracy": 0.8259488344192505, "num_tokens": 472385581.0, "step": 1482 }, { "epoch": 1.508646998982706, "grad_norm": 0.8081842660903931, "learning_rate": 1e-06, "loss": 0.5586, "mean_token_accuracy": 0.8305275440216064, "num_tokens": 472697254.0, "step": 1483 }, { "epoch": 1.5096642929806714, "grad_norm": 0.7643046975135803, "learning_rate": 1e-06, "loss": 0.5967, "mean_token_accuracy": 0.8213087320327759, "num_tokens": 473029719.0, "step": 1484 }, { "epoch": 1.5106815869786367, "grad_norm": 0.74493807554245, "learning_rate": 1e-06, "loss": 0.5824, "mean_token_accuracy": 0.8234145641326904, "num_tokens": 473348305.0, "step": 1485 }, { "epoch": 1.5116988809766023, "grad_norm": 0.771102786064148, "learning_rate": 1e-06, "loss": 0.5798, "mean_token_accuracy": 0.825602650642395, "num_tokens": 473666571.0, "step": 1486 }, { "epoch": 1.5127161749745677, "grad_norm": 0.7674226760864258, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8334149718284607, "num_tokens": 473976615.0, "step": 1487 }, { "epoch": 1.513733468972533, "grad_norm": 0.7604283690452576, "learning_rate": 1e-06, "loss": 0.5644, "mean_token_accuracy": 0.8303079009056091, "num_tokens": 474309906.0, "step": 1488 }, { "epoch": 1.5147507629704986, "grad_norm": 0.7903773784637451, "learning_rate": 1e-06, "loss": 0.5905, "mean_token_accuracy": 0.8222702741622925, "num_tokens": 474620054.0, "step": 1489 }, { "epoch": 1.5157680569684637, "grad_norm": 0.791972815990448, "learning_rate": 1e-06, "loss": 0.5782, "mean_token_accuracy": 0.8252575397491455, "num_tokens": 474940167.0, "step": 1490 }, { "epoch": 1.5167853509664293, "grad_norm": 0.7404340505599976, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.8309566974639893, "num_tokens": 475258659.0, "step": 1491 }, { "epoch": 1.5178026449643949, "grad_norm": 0.7210290431976318, "learning_rate": 1e-06, "loss": 0.5687, "mean_token_accuracy": 0.8279041051864624, "num_tokens": 475577873.0, "step": 1492 }, { "epoch": 1.51881993896236, "grad_norm": 0.737732470035553, "learning_rate": 1e-06, "loss": 0.5885, "mean_token_accuracy": 0.8217068314552307, "num_tokens": 475909298.0, "step": 1493 }, { "epoch": 1.5198372329603256, "grad_norm": 0.8280147910118103, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8225430250167847, "num_tokens": 476240775.0, "step": 1494 }, { "epoch": 1.520854526958291, "grad_norm": 0.7968106269836426, "learning_rate": 1e-06, "loss": 0.5777, "mean_token_accuracy": 0.8245846629142761, "num_tokens": 476556580.0, "step": 1495 }, { "epoch": 1.5218718209562563, "grad_norm": 0.7490198016166687, "learning_rate": 1e-06, "loss": 0.5663, "mean_token_accuracy": 0.8279181718826294, "num_tokens": 476855526.0, "step": 1496 }, { "epoch": 1.5228891149542219, "grad_norm": 0.7276293635368347, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8238677978515625, "num_tokens": 477177840.0, "step": 1497 }, { "epoch": 1.5239064089521872, "grad_norm": 0.7779879570007324, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.830930769443512, "num_tokens": 477502383.0, "step": 1498 }, { "epoch": 1.5249237029501526, "grad_norm": 0.8194068670272827, "learning_rate": 1e-06, "loss": 0.5671, "mean_token_accuracy": 0.8289549946784973, "num_tokens": 477826492.0, "step": 1499 }, { "epoch": 1.5259409969481181, "grad_norm": 0.7614669799804688, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8331164717674255, "num_tokens": 478124029.0, "step": 1500 }, { "epoch": 1.5269582909460833, "grad_norm": 0.7624503970146179, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.8300232291221619, "num_tokens": 478451961.0, "step": 1501 }, { "epoch": 1.5279755849440488, "grad_norm": 0.7618391513824463, "learning_rate": 1e-06, "loss": 0.5923, "mean_token_accuracy": 0.8214845657348633, "num_tokens": 478777570.0, "step": 1502 }, { "epoch": 1.5289928789420142, "grad_norm": 0.8302142024040222, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8292617201805115, "num_tokens": 479091694.0, "step": 1503 }, { "epoch": 1.5300101729399795, "grad_norm": 0.8318211436271667, "learning_rate": 1e-06, "loss": 0.5786, "mean_token_accuracy": 0.8252511024475098, "num_tokens": 479395630.0, "step": 1504 }, { "epoch": 1.5310274669379451, "grad_norm": 0.81647789478302, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.8355250358581543, "num_tokens": 479705902.0, "step": 1505 }, { "epoch": 1.5320447609359105, "grad_norm": 0.7545574307441711, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.826928436756134, "num_tokens": 480016304.0, "step": 1506 }, { "epoch": 1.5330620549338758, "grad_norm": 0.7784783840179443, "learning_rate": 1e-06, "loss": 0.5615, "mean_token_accuracy": 0.8297576904296875, "num_tokens": 480346413.0, "step": 1507 }, { "epoch": 1.5340793489318414, "grad_norm": 0.8040642142295837, "learning_rate": 1e-06, "loss": 0.5697, "mean_token_accuracy": 0.828713059425354, "num_tokens": 480658234.0, "step": 1508 }, { "epoch": 1.5350966429298067, "grad_norm": 0.7709273099899292, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8270728588104248, "num_tokens": 480980014.0, "step": 1509 }, { "epoch": 1.536113936927772, "grad_norm": 0.8043686747550964, "learning_rate": 1e-06, "loss": 0.6068, "mean_token_accuracy": 0.818744957447052, "num_tokens": 481310267.0, "step": 1510 }, { "epoch": 1.5371312309257377, "grad_norm": 0.7916233539581299, "learning_rate": 1e-06, "loss": 0.6269, "mean_token_accuracy": 0.8126989603042603, "num_tokens": 481623100.0, "step": 1511 }, { "epoch": 1.5381485249237028, "grad_norm": 0.8328151702880859, "learning_rate": 1e-06, "loss": 0.5682, "mean_token_accuracy": 0.8281493186950684, "num_tokens": 481941546.0, "step": 1512 }, { "epoch": 1.5391658189216684, "grad_norm": 0.8383995294570923, "learning_rate": 1e-06, "loss": 0.6069, "mean_token_accuracy": 0.8183453679084778, "num_tokens": 482261954.0, "step": 1513 }, { "epoch": 1.5401831129196337, "grad_norm": 0.7694042921066284, "learning_rate": 1e-06, "loss": 0.574, "mean_token_accuracy": 0.8272018432617188, "num_tokens": 482583861.0, "step": 1514 }, { "epoch": 1.541200406917599, "grad_norm": 0.7397472858428955, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.8283990621566772, "num_tokens": 482917251.0, "step": 1515 }, { "epoch": 1.5422177009155646, "grad_norm": 0.7571448683738708, "learning_rate": 1e-06, "loss": 0.5698, "mean_token_accuracy": 0.8278690576553345, "num_tokens": 483235136.0, "step": 1516 }, { "epoch": 1.54323499491353, "grad_norm": 0.764622151851654, "learning_rate": 1e-06, "loss": 0.5663, "mean_token_accuracy": 0.8284850120544434, "num_tokens": 483551953.0, "step": 1517 }, { "epoch": 1.5442522889114954, "grad_norm": 0.7923704385757446, "learning_rate": 1e-06, "loss": 0.5705, "mean_token_accuracy": 0.8279635906219482, "num_tokens": 483861059.0, "step": 1518 }, { "epoch": 1.545269582909461, "grad_norm": 0.7625948190689087, "learning_rate": 1e-06, "loss": 0.5633, "mean_token_accuracy": 0.8296431303024292, "num_tokens": 484173960.0, "step": 1519 }, { "epoch": 1.5462868769074263, "grad_norm": 0.7725732922554016, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8205369710922241, "num_tokens": 484486741.0, "step": 1520 }, { "epoch": 1.5473041709053916, "grad_norm": 0.7608553171157837, "learning_rate": 1e-06, "loss": 0.5832, "mean_token_accuracy": 0.8246746063232422, "num_tokens": 484829318.0, "step": 1521 }, { "epoch": 1.5483214649033572, "grad_norm": 0.7753155827522278, "learning_rate": 1e-06, "loss": 0.5807, "mean_token_accuracy": 0.8245500326156616, "num_tokens": 485146681.0, "step": 1522 }, { "epoch": 1.5493387589013223, "grad_norm": 0.7314413785934448, "learning_rate": 1e-06, "loss": 0.5763, "mean_token_accuracy": 0.8249948024749756, "num_tokens": 485461388.0, "step": 1523 }, { "epoch": 1.550356052899288, "grad_norm": 0.7906734943389893, "learning_rate": 1e-06, "loss": 0.567, "mean_token_accuracy": 0.8281234502792358, "num_tokens": 485774911.0, "step": 1524 }, { "epoch": 1.5513733468972533, "grad_norm": 0.7759382724761963, "learning_rate": 1e-06, "loss": 0.5864, "mean_token_accuracy": 0.8234958052635193, "num_tokens": 486099339.0, "step": 1525 }, { "epoch": 1.5523906408952186, "grad_norm": 0.7591975927352905, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.826912522315979, "num_tokens": 486412016.0, "step": 1526 }, { "epoch": 1.5534079348931842, "grad_norm": 0.7712595462799072, "learning_rate": 1e-06, "loss": 0.5534, "mean_token_accuracy": 0.8311680555343628, "num_tokens": 486710623.0, "step": 1527 }, { "epoch": 1.5544252288911495, "grad_norm": 0.777800977230072, "learning_rate": 1e-06, "loss": 0.5913, "mean_token_accuracy": 0.8223469257354736, "num_tokens": 487025165.0, "step": 1528 }, { "epoch": 1.5554425228891149, "grad_norm": 0.7843142747879028, "learning_rate": 1e-06, "loss": 0.59, "mean_token_accuracy": 0.8229750990867615, "num_tokens": 487349819.0, "step": 1529 }, { "epoch": 1.5564598168870805, "grad_norm": 0.7695160508155823, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8223654627799988, "num_tokens": 487662393.0, "step": 1530 }, { "epoch": 1.5574771108850458, "grad_norm": 0.7649616599082947, "learning_rate": 1e-06, "loss": 0.5945, "mean_token_accuracy": 0.8219040632247925, "num_tokens": 487985726.0, "step": 1531 }, { "epoch": 1.5584944048830112, "grad_norm": 0.7629556059837341, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.8320173025131226, "num_tokens": 488319919.0, "step": 1532 }, { "epoch": 1.5595116988809767, "grad_norm": 0.7568076848983765, "learning_rate": 1e-06, "loss": 0.569, "mean_token_accuracy": 0.8286150097846985, "num_tokens": 488630649.0, "step": 1533 }, { "epoch": 1.5605289928789419, "grad_norm": 0.7621586322784424, "learning_rate": 1e-06, "loss": 0.5747, "mean_token_accuracy": 0.8262230157852173, "num_tokens": 488932053.0, "step": 1534 }, { "epoch": 1.5615462868769074, "grad_norm": 0.788788378238678, "learning_rate": 1e-06, "loss": 0.5751, "mean_token_accuracy": 0.8261498808860779, "num_tokens": 489250757.0, "step": 1535 }, { "epoch": 1.5625635808748728, "grad_norm": 0.7831841111183167, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8225564360618591, "num_tokens": 489567664.0, "step": 1536 }, { "epoch": 1.5635808748728381, "grad_norm": 0.7453765273094177, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8284498453140259, "num_tokens": 489874451.0, "step": 1537 }, { "epoch": 1.5645981688708037, "grad_norm": 0.7343994379043579, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8306755423545837, "num_tokens": 490204909.0, "step": 1538 }, { "epoch": 1.565615462868769, "grad_norm": 0.7722320556640625, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8224450349807739, "num_tokens": 490514531.0, "step": 1539 }, { "epoch": 1.5666327568667344, "grad_norm": 0.743450403213501, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8267664313316345, "num_tokens": 490845055.0, "step": 1540 }, { "epoch": 1.5676500508647, "grad_norm": 0.8455149531364441, "learning_rate": 1e-06, "loss": 0.6086, "mean_token_accuracy": 0.817818820476532, "num_tokens": 491152661.0, "step": 1541 }, { "epoch": 1.5686673448626653, "grad_norm": 0.7722828388214111, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.8273724317550659, "num_tokens": 491455713.0, "step": 1542 }, { "epoch": 1.5696846388606307, "grad_norm": 0.7712371945381165, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.831794798374176, "num_tokens": 491772168.0, "step": 1543 }, { "epoch": 1.5707019328585963, "grad_norm": 0.7466083765029907, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.82890784740448, "num_tokens": 492088615.0, "step": 1544 }, { "epoch": 1.5717192268565614, "grad_norm": 0.821121871471405, "learning_rate": 1e-06, "loss": 0.5468, "mean_token_accuracy": 0.8341401815414429, "num_tokens": 492397349.0, "step": 1545 }, { "epoch": 1.572736520854527, "grad_norm": 0.7918075323104858, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.8286417126655579, "num_tokens": 492713338.0, "step": 1546 }, { "epoch": 1.5737538148524923, "grad_norm": 0.7808675169944763, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8277544975280762, "num_tokens": 493019564.0, "step": 1547 }, { "epoch": 1.5747711088504577, "grad_norm": 0.7513705492019653, "learning_rate": 1e-06, "loss": 0.5739, "mean_token_accuracy": 0.8273752331733704, "num_tokens": 493335981.0, "step": 1548 }, { "epoch": 1.5757884028484233, "grad_norm": 0.7467897534370422, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.8333742618560791, "num_tokens": 493652983.0, "step": 1549 }, { "epoch": 1.5768056968463886, "grad_norm": 0.7320238351821899, "learning_rate": 1e-06, "loss": 0.5605, "mean_token_accuracy": 0.8308363556861877, "num_tokens": 493983683.0, "step": 1550 }, { "epoch": 1.577822990844354, "grad_norm": 0.7745316028594971, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.8205184936523438, "num_tokens": 494296649.0, "step": 1551 }, { "epoch": 1.5788402848423195, "grad_norm": 0.7687009572982788, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8262377977371216, "num_tokens": 494612800.0, "step": 1552 }, { "epoch": 1.5798575788402849, "grad_norm": 0.7544592618942261, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8294517993927002, "num_tokens": 494920941.0, "step": 1553 }, { "epoch": 1.5808748728382502, "grad_norm": 0.73783278465271, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.8330302238464355, "num_tokens": 495234703.0, "step": 1554 }, { "epoch": 1.5818921668362158, "grad_norm": 0.7729275822639465, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8295111060142517, "num_tokens": 495546408.0, "step": 1555 }, { "epoch": 1.582909460834181, "grad_norm": 0.8193879723548889, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.8216261863708496, "num_tokens": 495873689.0, "step": 1556 }, { "epoch": 1.5839267548321465, "grad_norm": 0.7715921998023987, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8296725749969482, "num_tokens": 496200993.0, "step": 1557 }, { "epoch": 1.5849440488301119, "grad_norm": 0.7675861120223999, "learning_rate": 1e-06, "loss": 0.5726, "mean_token_accuracy": 0.8267984986305237, "num_tokens": 496528159.0, "step": 1558 }, { "epoch": 1.5859613428280772, "grad_norm": 0.7609422206878662, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.8270455002784729, "num_tokens": 496842977.0, "step": 1559 }, { "epoch": 1.5869786368260428, "grad_norm": 0.798121452331543, "learning_rate": 1e-06, "loss": 0.5806, "mean_token_accuracy": 0.8252966403961182, "num_tokens": 497141795.0, "step": 1560 }, { "epoch": 1.5879959308240081, "grad_norm": 0.7561322450637817, "learning_rate": 1e-06, "loss": 0.574, "mean_token_accuracy": 0.8259497880935669, "num_tokens": 497453652.0, "step": 1561 }, { "epoch": 1.5890132248219735, "grad_norm": 0.7411714196205139, "learning_rate": 1e-06, "loss": 0.5763, "mean_token_accuracy": 0.8266175985336304, "num_tokens": 497784575.0, "step": 1562 }, { "epoch": 1.590030518819939, "grad_norm": 0.7651845812797546, "learning_rate": 1e-06, "loss": 0.5647, "mean_token_accuracy": 0.8290013074874878, "num_tokens": 498096276.0, "step": 1563 }, { "epoch": 1.5910478128179044, "grad_norm": 0.7341787219047546, "learning_rate": 1e-06, "loss": 0.5714, "mean_token_accuracy": 0.8271242380142212, "num_tokens": 498422502.0, "step": 1564 }, { "epoch": 1.5920651068158698, "grad_norm": 0.7575334906578064, "learning_rate": 1e-06, "loss": 0.5588, "mean_token_accuracy": 0.8300933837890625, "num_tokens": 498743572.0, "step": 1565 }, { "epoch": 1.5930824008138353, "grad_norm": 0.7907820343971252, "learning_rate": 1e-06, "loss": 0.5921, "mean_token_accuracy": 0.8232530355453491, "num_tokens": 499051577.0, "step": 1566 }, { "epoch": 1.5940996948118005, "grad_norm": 0.7722137570381165, "learning_rate": 1e-06, "loss": 0.5795, "mean_token_accuracy": 0.824563205242157, "num_tokens": 499360910.0, "step": 1567 }, { "epoch": 1.595116988809766, "grad_norm": 0.7802767753601074, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8294447660446167, "num_tokens": 499675969.0, "step": 1568 }, { "epoch": 1.5961342828077314, "grad_norm": 0.7496720552444458, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8307507634162903, "num_tokens": 499988817.0, "step": 1569 }, { "epoch": 1.5971515768056967, "grad_norm": 0.7167335152626038, "learning_rate": 1e-06, "loss": 0.5875, "mean_token_accuracy": 0.8224383592605591, "num_tokens": 500312578.0, "step": 1570 }, { "epoch": 1.5981688708036623, "grad_norm": 0.7863753437995911, "learning_rate": 1e-06, "loss": 0.5633, "mean_token_accuracy": 0.8292855620384216, "num_tokens": 500631973.0, "step": 1571 }, { "epoch": 1.5991861648016277, "grad_norm": 0.7296916246414185, "learning_rate": 1e-06, "loss": 0.5703, "mean_token_accuracy": 0.8279212117195129, "num_tokens": 500955095.0, "step": 1572 }, { "epoch": 1.600203458799593, "grad_norm": 0.7697763442993164, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8253737688064575, "num_tokens": 501271088.0, "step": 1573 }, { "epoch": 1.6012207527975586, "grad_norm": 0.8204194903373718, "learning_rate": 1e-06, "loss": 0.564, "mean_token_accuracy": 0.8297313451766968, "num_tokens": 501573475.0, "step": 1574 }, { "epoch": 1.602238046795524, "grad_norm": 0.8012133836746216, "learning_rate": 1e-06, "loss": 0.5654, "mean_token_accuracy": 0.828351616859436, "num_tokens": 501887647.0, "step": 1575 }, { "epoch": 1.6032553407934893, "grad_norm": 0.7700391411781311, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8309698104858398, "num_tokens": 502192904.0, "step": 1576 }, { "epoch": 1.6042726347914549, "grad_norm": 0.7932988405227661, "learning_rate": 1e-06, "loss": 0.5667, "mean_token_accuracy": 0.8294327855110168, "num_tokens": 502505274.0, "step": 1577 }, { "epoch": 1.60528992878942, "grad_norm": 0.7901273369789124, "learning_rate": 1e-06, "loss": 0.5926, "mean_token_accuracy": 0.8220197558403015, "num_tokens": 502817024.0, "step": 1578 }, { "epoch": 1.6063072227873856, "grad_norm": 0.8021532297134399, "learning_rate": 1e-06, "loss": 0.602, "mean_token_accuracy": 0.820052981376648, "num_tokens": 503156030.0, "step": 1579 }, { "epoch": 1.607324516785351, "grad_norm": 0.7981246709823608, "learning_rate": 1e-06, "loss": 0.5564, "mean_token_accuracy": 0.830917239189148, "num_tokens": 503459669.0, "step": 1580 }, { "epoch": 1.6083418107833163, "grad_norm": 0.7735322117805481, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.8284094929695129, "num_tokens": 503766380.0, "step": 1581 }, { "epoch": 1.6093591047812819, "grad_norm": 0.8570979833602905, "learning_rate": 1e-06, "loss": 0.5986, "mean_token_accuracy": 0.8192204236984253, "num_tokens": 504077350.0, "step": 1582 }, { "epoch": 1.6103763987792472, "grad_norm": 0.8010280132293701, "learning_rate": 1e-06, "loss": 0.5567, "mean_token_accuracy": 0.8301876783370972, "num_tokens": 504396848.0, "step": 1583 }, { "epoch": 1.6113936927772126, "grad_norm": 0.8592894077301025, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8320919275283813, "num_tokens": 504709088.0, "step": 1584 }, { "epoch": 1.6124109867751781, "grad_norm": 0.7861720323562622, "learning_rate": 1e-06, "loss": 0.5799, "mean_token_accuracy": 0.8243271112442017, "num_tokens": 505040673.0, "step": 1585 }, { "epoch": 1.6134282807731435, "grad_norm": 0.7919045090675354, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.8284953236579895, "num_tokens": 505369618.0, "step": 1586 }, { "epoch": 1.6144455747711088, "grad_norm": 0.9178990125656128, "learning_rate": 1e-06, "loss": 0.5537, "mean_token_accuracy": 0.8316802382469177, "num_tokens": 505672795.0, "step": 1587 }, { "epoch": 1.6154628687690744, "grad_norm": 0.7910969853401184, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.829032301902771, "num_tokens": 506006904.0, "step": 1588 }, { "epoch": 1.6164801627670395, "grad_norm": 0.7462960481643677, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.8354339599609375, "num_tokens": 506328403.0, "step": 1589 }, { "epoch": 1.6174974567650051, "grad_norm": 0.8121559619903564, "learning_rate": 1e-06, "loss": 0.5617, "mean_token_accuracy": 0.8301266431808472, "num_tokens": 506646322.0, "step": 1590 }, { "epoch": 1.6185147507629705, "grad_norm": 0.8175883293151855, "learning_rate": 1e-06, "loss": 0.542, "mean_token_accuracy": 0.8349389433860779, "num_tokens": 506963590.0, "step": 1591 }, { "epoch": 1.6195320447609358, "grad_norm": 0.7866964936256409, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8321659564971924, "num_tokens": 507261633.0, "step": 1592 }, { "epoch": 1.6205493387589014, "grad_norm": 0.772517204284668, "learning_rate": 1e-06, "loss": 0.5664, "mean_token_accuracy": 0.8289494514465332, "num_tokens": 507586130.0, "step": 1593 }, { "epoch": 1.6215666327568667, "grad_norm": 0.7570743560791016, "learning_rate": 1e-06, "loss": 0.5737, "mean_token_accuracy": 0.8264689445495605, "num_tokens": 507895824.0, "step": 1594 }, { "epoch": 1.622583926754832, "grad_norm": 0.7695471048355103, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8220570087432861, "num_tokens": 508213579.0, "step": 1595 }, { "epoch": 1.6236012207527977, "grad_norm": 0.7480154633522034, "learning_rate": 1e-06, "loss": 0.5662, "mean_token_accuracy": 0.8287339210510254, "num_tokens": 508527614.0, "step": 1596 }, { "epoch": 1.624618514750763, "grad_norm": 0.7738633751869202, "learning_rate": 1e-06, "loss": 0.5739, "mean_token_accuracy": 0.8266138434410095, "num_tokens": 508844166.0, "step": 1597 }, { "epoch": 1.6256358087487284, "grad_norm": 0.7983934283256531, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8295655250549316, "num_tokens": 509153388.0, "step": 1598 }, { "epoch": 1.626653102746694, "grad_norm": 0.7699052095413208, "learning_rate": 1e-06, "loss": 0.5726, "mean_token_accuracy": 0.8266280889511108, "num_tokens": 509478651.0, "step": 1599 }, { "epoch": 1.627670396744659, "grad_norm": 0.8170141577720642, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.8266683220863342, "num_tokens": 509789533.0, "step": 1600 }, { "epoch": 1.6286876907426246, "grad_norm": 0.7319942712783813, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8287757635116577, "num_tokens": 510127406.0, "step": 1601 }, { "epoch": 1.62970498474059, "grad_norm": 0.7690290808677673, "learning_rate": 1e-06, "loss": 0.5465, "mean_token_accuracy": 0.833433985710144, "num_tokens": 510431387.0, "step": 1602 }, { "epoch": 1.6307222787385554, "grad_norm": 0.8090844750404358, "learning_rate": 1e-06, "loss": 0.5853, "mean_token_accuracy": 0.8240605592727661, "num_tokens": 510762803.0, "step": 1603 }, { "epoch": 1.631739572736521, "grad_norm": 0.7850157022476196, "learning_rate": 1e-06, "loss": 0.5857, "mean_token_accuracy": 0.822102427482605, "num_tokens": 511087806.0, "step": 1604 }, { "epoch": 1.6327568667344863, "grad_norm": 0.8082931041717529, "learning_rate": 1e-06, "loss": 0.5882, "mean_token_accuracy": 0.8232232332229614, "num_tokens": 511413057.0, "step": 1605 }, { "epoch": 1.6337741607324516, "grad_norm": 0.7694851756095886, "learning_rate": 1e-06, "loss": 0.5359, "mean_token_accuracy": 0.8365007042884827, "num_tokens": 511722415.0, "step": 1606 }, { "epoch": 1.6347914547304172, "grad_norm": 0.7512543201446533, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8339874744415283, "num_tokens": 512040496.0, "step": 1607 }, { "epoch": 1.6358087487283826, "grad_norm": 0.8409536480903625, "learning_rate": 1e-06, "loss": 0.581, "mean_token_accuracy": 0.823777437210083, "num_tokens": 512348538.0, "step": 1608 }, { "epoch": 1.636826042726348, "grad_norm": 0.8341800570487976, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.825890302658081, "num_tokens": 512672571.0, "step": 1609 }, { "epoch": 1.6378433367243135, "grad_norm": 0.7653716206550598, "learning_rate": 1e-06, "loss": 0.5727, "mean_token_accuracy": 0.827292799949646, "num_tokens": 512983469.0, "step": 1610 }, { "epoch": 1.6388606307222786, "grad_norm": 0.7501400113105774, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.8306208848953247, "num_tokens": 513304550.0, "step": 1611 }, { "epoch": 1.6398779247202442, "grad_norm": 0.7226465344429016, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8239978551864624, "num_tokens": 513639647.0, "step": 1612 }, { "epoch": 1.6408952187182095, "grad_norm": 0.8217362761497498, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8272889256477356, "num_tokens": 513955189.0, "step": 1613 }, { "epoch": 1.6419125127161749, "grad_norm": 0.8021873235702515, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.8280093669891357, "num_tokens": 514262184.0, "step": 1614 }, { "epoch": 1.6429298067141405, "grad_norm": 0.7563522458076477, "learning_rate": 1e-06, "loss": 0.5583, "mean_token_accuracy": 0.83150315284729, "num_tokens": 514577426.0, "step": 1615 }, { "epoch": 1.6439471007121058, "grad_norm": 0.7791472673416138, "learning_rate": 1e-06, "loss": 0.5676, "mean_token_accuracy": 0.8287179470062256, "num_tokens": 514882566.0, "step": 1616 }, { "epoch": 1.6449643947100712, "grad_norm": 0.742529571056366, "learning_rate": 1e-06, "loss": 0.5687, "mean_token_accuracy": 0.8295882940292358, "num_tokens": 515212612.0, "step": 1617 }, { "epoch": 1.6459816887080367, "grad_norm": 0.8474096655845642, "learning_rate": 1e-06, "loss": 0.5765, "mean_token_accuracy": 0.8254484534263611, "num_tokens": 515525982.0, "step": 1618 }, { "epoch": 1.6469989827060019, "grad_norm": 0.7607150673866272, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8320398330688477, "num_tokens": 515853344.0, "step": 1619 }, { "epoch": 1.6480162767039674, "grad_norm": 0.7506897449493408, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.828422486782074, "num_tokens": 516159473.0, "step": 1620 }, { "epoch": 1.649033570701933, "grad_norm": 0.7465788125991821, "learning_rate": 1e-06, "loss": 0.5822, "mean_token_accuracy": 0.8236411213874817, "num_tokens": 516492142.0, "step": 1621 }, { "epoch": 1.6500508646998981, "grad_norm": 0.8063442707061768, "learning_rate": 1e-06, "loss": 0.5684, "mean_token_accuracy": 0.8278873562812805, "num_tokens": 516798136.0, "step": 1622 }, { "epoch": 1.6510681586978637, "grad_norm": 0.7778916954994202, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8266802430152893, "num_tokens": 517108993.0, "step": 1623 }, { "epoch": 1.652085452695829, "grad_norm": 1.6380517482757568, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.8306084871292114, "num_tokens": 517439575.0, "step": 1624 }, { "epoch": 1.6531027466937944, "grad_norm": 0.770706295967102, "learning_rate": 1e-06, "loss": 0.5776, "mean_token_accuracy": 0.8241666555404663, "num_tokens": 517744319.0, "step": 1625 }, { "epoch": 1.65412004069176, "grad_norm": 0.7348789572715759, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8243869543075562, "num_tokens": 518071406.0, "step": 1626 }, { "epoch": 1.6551373346897253, "grad_norm": 0.7872225046157837, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.8239084482192993, "num_tokens": 518389903.0, "step": 1627 }, { "epoch": 1.6561546286876907, "grad_norm": 0.7517799735069275, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.827627420425415, "num_tokens": 518689792.0, "step": 1628 }, { "epoch": 1.6571719226856563, "grad_norm": 0.7697553634643555, "learning_rate": 1e-06, "loss": 0.5583, "mean_token_accuracy": 0.8302412033081055, "num_tokens": 518982145.0, "step": 1629 }, { "epoch": 1.6581892166836214, "grad_norm": 0.7755339741706848, "learning_rate": 1e-06, "loss": 0.5926, "mean_token_accuracy": 0.8219212293624878, "num_tokens": 519307621.0, "step": 1630 }, { "epoch": 1.659206510681587, "grad_norm": 0.8022763133049011, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.8267483711242676, "num_tokens": 519626545.0, "step": 1631 }, { "epoch": 1.6602238046795526, "grad_norm": 0.8232947587966919, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8260178565979004, "num_tokens": 519945118.0, "step": 1632 }, { "epoch": 1.6612410986775177, "grad_norm": 0.8284922242164612, "learning_rate": 1e-06, "loss": 0.6118, "mean_token_accuracy": 0.8165832757949829, "num_tokens": 520263275.0, "step": 1633 }, { "epoch": 1.6622583926754833, "grad_norm": 0.7653334140777588, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8228817582130432, "num_tokens": 520587306.0, "step": 1634 }, { "epoch": 1.6632756866734486, "grad_norm": 0.8112199306488037, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8297948837280273, "num_tokens": 520922639.0, "step": 1635 }, { "epoch": 1.664292980671414, "grad_norm": 0.8225566744804382, "learning_rate": 1e-06, "loss": 0.5926, "mean_token_accuracy": 0.8226288557052612, "num_tokens": 521246669.0, "step": 1636 }, { "epoch": 1.6653102746693795, "grad_norm": 0.7728680372238159, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8317882418632507, "num_tokens": 521552038.0, "step": 1637 }, { "epoch": 1.6663275686673449, "grad_norm": 0.7860717177391052, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8248538970947266, "num_tokens": 521871204.0, "step": 1638 }, { "epoch": 1.6673448626653102, "grad_norm": 0.7809126973152161, "learning_rate": 1e-06, "loss": 0.5793, "mean_token_accuracy": 0.8258426189422607, "num_tokens": 522180569.0, "step": 1639 }, { "epoch": 1.6683621566632758, "grad_norm": 0.8085176348686218, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8277914524078369, "num_tokens": 522491069.0, "step": 1640 }, { "epoch": 1.669379450661241, "grad_norm": 0.777393102645874, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.8285313844680786, "num_tokens": 522813823.0, "step": 1641 }, { "epoch": 1.6703967446592065, "grad_norm": 0.7664257884025574, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8185603618621826, "num_tokens": 523142670.0, "step": 1642 }, { "epoch": 1.671414038657172, "grad_norm": 0.7619096040725708, "learning_rate": 1e-06, "loss": 0.5888, "mean_token_accuracy": 0.8226163983345032, "num_tokens": 523461987.0, "step": 1643 }, { "epoch": 1.6724313326551372, "grad_norm": 0.7851698398590088, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.8290607929229736, "num_tokens": 523794689.0, "step": 1644 }, { "epoch": 1.6734486266531028, "grad_norm": 0.77797532081604, "learning_rate": 1e-06, "loss": 0.575, "mean_token_accuracy": 0.8254382014274597, "num_tokens": 524128171.0, "step": 1645 }, { "epoch": 1.6744659206510681, "grad_norm": 0.765530526638031, "learning_rate": 1e-06, "loss": 0.5906, "mean_token_accuracy": 0.8212093114852905, "num_tokens": 524455562.0, "step": 1646 }, { "epoch": 1.6754832146490335, "grad_norm": 0.7552871108055115, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.8209508657455444, "num_tokens": 524788549.0, "step": 1647 }, { "epoch": 1.676500508646999, "grad_norm": 0.7702638506889343, "learning_rate": 1e-06, "loss": 0.5877, "mean_token_accuracy": 0.822458803653717, "num_tokens": 525108018.0, "step": 1648 }, { "epoch": 1.6775178026449644, "grad_norm": 0.7635334730148315, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.8315030336380005, "num_tokens": 525429783.0, "step": 1649 }, { "epoch": 1.6785350966429298, "grad_norm": 0.7596954107284546, "learning_rate": 1e-06, "loss": 0.5957, "mean_token_accuracy": 0.821110725402832, "num_tokens": 525756468.0, "step": 1650 }, { "epoch": 1.6795523906408953, "grad_norm": 0.7331662774085999, "learning_rate": 1e-06, "loss": 0.554, "mean_token_accuracy": 0.8301153182983398, "num_tokens": 526072332.0, "step": 1651 }, { "epoch": 1.6805696846388605, "grad_norm": 0.7317625880241394, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.8263423442840576, "num_tokens": 526390503.0, "step": 1652 }, { "epoch": 1.681586978636826, "grad_norm": 0.8702585697174072, "learning_rate": 1e-06, "loss": 0.5949, "mean_token_accuracy": 0.821205735206604, "num_tokens": 526702092.0, "step": 1653 }, { "epoch": 1.6826042726347916, "grad_norm": 0.7284979820251465, "learning_rate": 1e-06, "loss": 0.5651, "mean_token_accuracy": 0.8286755681037903, "num_tokens": 527035121.0, "step": 1654 }, { "epoch": 1.6836215666327567, "grad_norm": 0.7543615698814392, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8284416198730469, "num_tokens": 527350842.0, "step": 1655 }, { "epoch": 1.6846388606307223, "grad_norm": 0.7683477997779846, "learning_rate": 1e-06, "loss": 0.5824, "mean_token_accuracy": 0.8239773511886597, "num_tokens": 527685407.0, "step": 1656 }, { "epoch": 1.6856561546286877, "grad_norm": 0.732888400554657, "learning_rate": 1e-06, "loss": 0.5556, "mean_token_accuracy": 0.8312336206436157, "num_tokens": 528011456.0, "step": 1657 }, { "epoch": 1.686673448626653, "grad_norm": 0.7869269847869873, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.8261852860450745, "num_tokens": 528319045.0, "step": 1658 }, { "epoch": 1.6876907426246186, "grad_norm": 0.7795509696006775, "learning_rate": 1e-06, "loss": 0.5798, "mean_token_accuracy": 0.8243482708930969, "num_tokens": 528635401.0, "step": 1659 }, { "epoch": 1.688708036622584, "grad_norm": 0.76092928647995, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8321963548660278, "num_tokens": 528936587.0, "step": 1660 }, { "epoch": 1.6897253306205493, "grad_norm": 0.7945722341537476, "learning_rate": 1e-06, "loss": 0.5869, "mean_token_accuracy": 0.8236092329025269, "num_tokens": 529263800.0, "step": 1661 }, { "epoch": 1.6907426246185149, "grad_norm": 0.7325130701065063, "learning_rate": 1e-06, "loss": 0.5375, "mean_token_accuracy": 0.8348582983016968, "num_tokens": 529579783.0, "step": 1662 }, { "epoch": 1.69175991861648, "grad_norm": 0.7459843158721924, "learning_rate": 1e-06, "loss": 0.5683, "mean_token_accuracy": 0.82767653465271, "num_tokens": 529922657.0, "step": 1663 }, { "epoch": 1.6927772126144456, "grad_norm": 0.7394583821296692, "learning_rate": 1e-06, "loss": 0.5387, "mean_token_accuracy": 0.8359782695770264, "num_tokens": 530237269.0, "step": 1664 }, { "epoch": 1.693794506612411, "grad_norm": 0.7484853267669678, "learning_rate": 1e-06, "loss": 0.5883, "mean_token_accuracy": 0.8226250410079956, "num_tokens": 530563601.0, "step": 1665 }, { "epoch": 1.6948118006103763, "grad_norm": 0.7826150059700012, "learning_rate": 1e-06, "loss": 0.5874, "mean_token_accuracy": 0.8226125240325928, "num_tokens": 530875466.0, "step": 1666 }, { "epoch": 1.6958290946083419, "grad_norm": 0.7448320984840393, "learning_rate": 1e-06, "loss": 0.5661, "mean_token_accuracy": 0.8283286094665527, "num_tokens": 531199906.0, "step": 1667 }, { "epoch": 1.6968463886063072, "grad_norm": 0.7585152387619019, "learning_rate": 1e-06, "loss": 0.5721, "mean_token_accuracy": 0.8263843655586243, "num_tokens": 531509317.0, "step": 1668 }, { "epoch": 1.6978636826042726, "grad_norm": 0.758061408996582, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8299506306648254, "num_tokens": 531821184.0, "step": 1669 }, { "epoch": 1.6988809766022381, "grad_norm": 0.7765728235244751, "learning_rate": 1e-06, "loss": 0.5588, "mean_token_accuracy": 0.8297423720359802, "num_tokens": 532137310.0, "step": 1670 }, { "epoch": 1.6998982706002035, "grad_norm": 0.7671675682067871, "learning_rate": 1e-06, "loss": 0.574, "mean_token_accuracy": 0.8257801532745361, "num_tokens": 532440662.0, "step": 1671 }, { "epoch": 1.7009155645981688, "grad_norm": 0.7835381627082825, "learning_rate": 1e-06, "loss": 0.564, "mean_token_accuracy": 0.8288462162017822, "num_tokens": 532750623.0, "step": 1672 }, { "epoch": 1.7019328585961344, "grad_norm": 0.7619048357009888, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.8313275575637817, "num_tokens": 533079069.0, "step": 1673 }, { "epoch": 1.7029501525940995, "grad_norm": 0.7300487756729126, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8356854915618896, "num_tokens": 533404987.0, "step": 1674 }, { "epoch": 1.7039674465920651, "grad_norm": 0.7767015695571899, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8279440402984619, "num_tokens": 533715865.0, "step": 1675 }, { "epoch": 1.7049847405900305, "grad_norm": 0.858773946762085, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.824346661567688, "num_tokens": 534023360.0, "step": 1676 }, { "epoch": 1.7060020345879958, "grad_norm": 0.7402467131614685, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8312146663665771, "num_tokens": 534338342.0, "step": 1677 }, { "epoch": 1.7070193285859614, "grad_norm": 0.7947079539299011, "learning_rate": 1e-06, "loss": 0.5725, "mean_token_accuracy": 0.8272440433502197, "num_tokens": 534643540.0, "step": 1678 }, { "epoch": 1.7080366225839267, "grad_norm": 0.7758183479309082, "learning_rate": 1e-06, "loss": 0.5651, "mean_token_accuracy": 0.8284360766410828, "num_tokens": 534946089.0, "step": 1679 }, { "epoch": 1.709053916581892, "grad_norm": 0.7562273144721985, "learning_rate": 1e-06, "loss": 0.5861, "mean_token_accuracy": 0.8222533464431763, "num_tokens": 535248651.0, "step": 1680 }, { "epoch": 1.7100712105798577, "grad_norm": 0.8158243894577026, "learning_rate": 1e-06, "loss": 0.5913, "mean_token_accuracy": 0.8221856951713562, "num_tokens": 535572330.0, "step": 1681 }, { "epoch": 1.711088504577823, "grad_norm": 0.7543163895606995, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.827721357345581, "num_tokens": 535900015.0, "step": 1682 }, { "epoch": 1.7121057985757884, "grad_norm": 0.7882978916168213, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8247331976890564, "num_tokens": 536213601.0, "step": 1683 }, { "epoch": 1.713123092573754, "grad_norm": 0.756375253200531, "learning_rate": 1e-06, "loss": 0.5871, "mean_token_accuracy": 0.822330892086029, "num_tokens": 536546179.0, "step": 1684 }, { "epoch": 1.714140386571719, "grad_norm": 0.8251622915267944, "learning_rate": 1e-06, "loss": 0.5919, "mean_token_accuracy": 0.8222557902336121, "num_tokens": 536873341.0, "step": 1685 }, { "epoch": 1.7151576805696847, "grad_norm": 0.7793753743171692, "learning_rate": 1e-06, "loss": 0.5752, "mean_token_accuracy": 0.8255091309547424, "num_tokens": 537182989.0, "step": 1686 }, { "epoch": 1.71617497456765, "grad_norm": 0.7966047525405884, "learning_rate": 1e-06, "loss": 0.574, "mean_token_accuracy": 0.825797975063324, "num_tokens": 537519346.0, "step": 1687 }, { "epoch": 1.7171922685656154, "grad_norm": 0.7807252407073975, "learning_rate": 1e-06, "loss": 0.554, "mean_token_accuracy": 0.8319574594497681, "num_tokens": 537849156.0, "step": 1688 }, { "epoch": 1.718209562563581, "grad_norm": 0.821812093257904, "learning_rate": 1e-06, "loss": 0.5929, "mean_token_accuracy": 0.8211857676506042, "num_tokens": 538167004.0, "step": 1689 }, { "epoch": 1.7192268565615463, "grad_norm": 0.83882737159729, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8303667902946472, "num_tokens": 538455354.0, "step": 1690 }, { "epoch": 1.7202441505595116, "grad_norm": 0.7454236149787903, "learning_rate": 1e-06, "loss": 0.5624, "mean_token_accuracy": 0.8291758298873901, "num_tokens": 538776413.0, "step": 1691 }, { "epoch": 1.7212614445574772, "grad_norm": 0.7784879207611084, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8272480368614197, "num_tokens": 539089207.0, "step": 1692 }, { "epoch": 1.7222787385554426, "grad_norm": 0.8132408261299133, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8302366733551025, "num_tokens": 539409402.0, "step": 1693 }, { "epoch": 1.723296032553408, "grad_norm": 0.7897824048995972, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8230624198913574, "num_tokens": 539724109.0, "step": 1694 }, { "epoch": 1.7243133265513735, "grad_norm": 0.7500278949737549, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.828041672706604, "num_tokens": 540055954.0, "step": 1695 }, { "epoch": 1.7253306205493386, "grad_norm": 0.7602378726005554, "learning_rate": 1e-06, "loss": 0.5968, "mean_token_accuracy": 0.8209776878356934, "num_tokens": 540380282.0, "step": 1696 }, { "epoch": 1.7263479145473042, "grad_norm": 0.7624279260635376, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8228939175605774, "num_tokens": 540703596.0, "step": 1697 }, { "epoch": 1.7273652085452695, "grad_norm": 0.7722674012184143, "learning_rate": 1e-06, "loss": 0.5853, "mean_token_accuracy": 0.8232327103614807, "num_tokens": 541024110.0, "step": 1698 }, { "epoch": 1.7283825025432349, "grad_norm": 0.8067665696144104, "learning_rate": 1e-06, "loss": 0.5693, "mean_token_accuracy": 0.828144907951355, "num_tokens": 541343999.0, "step": 1699 }, { "epoch": 1.7293997965412005, "grad_norm": 0.8109105229377747, "learning_rate": 1e-06, "loss": 0.5775, "mean_token_accuracy": 0.8250290751457214, "num_tokens": 541666906.0, "step": 1700 }, { "epoch": 1.7304170905391658, "grad_norm": 0.753727376461029, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.8250855803489685, "num_tokens": 541998847.0, "step": 1701 }, { "epoch": 1.7314343845371312, "grad_norm": 0.7683834433555603, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.8281385898590088, "num_tokens": 542330954.0, "step": 1702 }, { "epoch": 1.7324516785350967, "grad_norm": 0.7327966094017029, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8294649124145508, "num_tokens": 542649706.0, "step": 1703 }, { "epoch": 1.733468972533062, "grad_norm": 0.754997193813324, "learning_rate": 1e-06, "loss": 0.6051, "mean_token_accuracy": 0.8174710869789124, "num_tokens": 542971174.0, "step": 1704 }, { "epoch": 1.7344862665310274, "grad_norm": 0.7789589762687683, "learning_rate": 1e-06, "loss": 0.5708, "mean_token_accuracy": 0.826871395111084, "num_tokens": 543286863.0, "step": 1705 }, { "epoch": 1.735503560528993, "grad_norm": 0.7593998312950134, "learning_rate": 1e-06, "loss": 0.6006, "mean_token_accuracy": 0.8190546035766602, "num_tokens": 543603599.0, "step": 1706 }, { "epoch": 1.7365208545269581, "grad_norm": 0.7631123065948486, "learning_rate": 1e-06, "loss": 0.5806, "mean_token_accuracy": 0.8233776092529297, "num_tokens": 543929074.0, "step": 1707 }, { "epoch": 1.7375381485249237, "grad_norm": 0.7596994042396545, "learning_rate": 1e-06, "loss": 0.5957, "mean_token_accuracy": 0.8212376832962036, "num_tokens": 544254565.0, "step": 1708 }, { "epoch": 1.738555442522889, "grad_norm": 0.7288892865180969, "learning_rate": 1e-06, "loss": 0.557, "mean_token_accuracy": 0.8310983777046204, "num_tokens": 544594001.0, "step": 1709 }, { "epoch": 1.7395727365208544, "grad_norm": 0.7478421330451965, "learning_rate": 1e-06, "loss": 0.589, "mean_token_accuracy": 0.8226866126060486, "num_tokens": 544905906.0, "step": 1710 }, { "epoch": 1.74059003051882, "grad_norm": 0.7765371203422546, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.8321041464805603, "num_tokens": 545218275.0, "step": 1711 }, { "epoch": 1.7416073245167853, "grad_norm": 0.7736310958862305, "learning_rate": 1e-06, "loss": 0.5802, "mean_token_accuracy": 0.8248491883277893, "num_tokens": 545540433.0, "step": 1712 }, { "epoch": 1.7426246185147507, "grad_norm": 0.7796671390533447, "learning_rate": 1e-06, "loss": 0.5671, "mean_token_accuracy": 0.8282781839370728, "num_tokens": 545853945.0, "step": 1713 }, { "epoch": 1.7436419125127163, "grad_norm": 0.7976074814796448, "learning_rate": 1e-06, "loss": 0.5508, "mean_token_accuracy": 0.8329964280128479, "num_tokens": 546163829.0, "step": 1714 }, { "epoch": 1.7446592065106816, "grad_norm": 0.7982625365257263, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8288213014602661, "num_tokens": 546476551.0, "step": 1715 }, { "epoch": 1.745676500508647, "grad_norm": 0.733306348323822, "learning_rate": 1e-06, "loss": 0.5584, "mean_token_accuracy": 0.8309009075164795, "num_tokens": 546808037.0, "step": 1716 }, { "epoch": 1.7466937945066126, "grad_norm": 0.7403319478034973, "learning_rate": 1e-06, "loss": 0.5506, "mean_token_accuracy": 0.8330470323562622, "num_tokens": 547138121.0, "step": 1717 }, { "epoch": 1.7477110885045777, "grad_norm": 0.7419624924659729, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.8281609416007996, "num_tokens": 547455414.0, "step": 1718 }, { "epoch": 1.7487283825025433, "grad_norm": 0.7999053001403809, "learning_rate": 1e-06, "loss": 0.5567, "mean_token_accuracy": 0.8309951424598694, "num_tokens": 547758420.0, "step": 1719 }, { "epoch": 1.7497456765005086, "grad_norm": 0.7836387157440186, "learning_rate": 1e-06, "loss": 0.5605, "mean_token_accuracy": 0.8303165435791016, "num_tokens": 548059321.0, "step": 1720 }, { "epoch": 1.750762970498474, "grad_norm": 0.7795581221580505, "learning_rate": 1e-06, "loss": 0.5964, "mean_token_accuracy": 0.8205006122589111, "num_tokens": 548363768.0, "step": 1721 }, { "epoch": 1.7517802644964395, "grad_norm": 0.7558625340461731, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.8264075517654419, "num_tokens": 548670132.0, "step": 1722 }, { "epoch": 1.7527975584944049, "grad_norm": 0.7679370641708374, "learning_rate": 1e-06, "loss": 0.5653, "mean_token_accuracy": 0.8297629952430725, "num_tokens": 548986310.0, "step": 1723 }, { "epoch": 1.7538148524923702, "grad_norm": 0.7437918186187744, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8282225131988525, "num_tokens": 549294340.0, "step": 1724 }, { "epoch": 1.7548321464903358, "grad_norm": 0.7457557320594788, "learning_rate": 1e-06, "loss": 0.5409, "mean_token_accuracy": 0.8366795778274536, "num_tokens": 549610886.0, "step": 1725 }, { "epoch": 1.7558494404883012, "grad_norm": 0.7415375113487244, "learning_rate": 1e-06, "loss": 0.5706, "mean_token_accuracy": 0.8265110850334167, "num_tokens": 549926561.0, "step": 1726 }, { "epoch": 1.7568667344862665, "grad_norm": 0.7230045199394226, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.8289594054222107, "num_tokens": 550244848.0, "step": 1727 }, { "epoch": 1.757884028484232, "grad_norm": 0.7837753891944885, "learning_rate": 1e-06, "loss": 0.5463, "mean_token_accuracy": 0.8344599604606628, "num_tokens": 550565225.0, "step": 1728 }, { "epoch": 1.7589013224821972, "grad_norm": 0.7353631854057312, "learning_rate": 1e-06, "loss": 0.5645, "mean_token_accuracy": 0.828995943069458, "num_tokens": 550869158.0, "step": 1729 }, { "epoch": 1.7599186164801628, "grad_norm": 0.7451027631759644, "learning_rate": 1e-06, "loss": 0.5774, "mean_token_accuracy": 0.8255603909492493, "num_tokens": 551189311.0, "step": 1730 }, { "epoch": 1.7609359104781281, "grad_norm": 0.7759789228439331, "learning_rate": 1e-06, "loss": 0.582, "mean_token_accuracy": 0.8234883546829224, "num_tokens": 551500042.0, "step": 1731 }, { "epoch": 1.7619532044760935, "grad_norm": 0.7609255313873291, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8300365209579468, "num_tokens": 551811847.0, "step": 1732 }, { "epoch": 1.762970498474059, "grad_norm": 0.733325719833374, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.8318478465080261, "num_tokens": 552131236.0, "step": 1733 }, { "epoch": 1.7639877924720244, "grad_norm": 0.8272982239723206, "learning_rate": 1e-06, "loss": 0.5665, "mean_token_accuracy": 0.8292044401168823, "num_tokens": 552456755.0, "step": 1734 }, { "epoch": 1.7650050864699898, "grad_norm": 0.7635635137557983, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8342831134796143, "num_tokens": 552777207.0, "step": 1735 }, { "epoch": 1.7660223804679553, "grad_norm": 0.7395186424255371, "learning_rate": 1e-06, "loss": 0.5713, "mean_token_accuracy": 0.8267441987991333, "num_tokens": 553087376.0, "step": 1736 }, { "epoch": 1.7670396744659207, "grad_norm": 0.7357089519500732, "learning_rate": 1e-06, "loss": 0.5727, "mean_token_accuracy": 0.8273993730545044, "num_tokens": 553406997.0, "step": 1737 }, { "epoch": 1.768056968463886, "grad_norm": 0.7930406332015991, "learning_rate": 1e-06, "loss": 0.6024, "mean_token_accuracy": 0.8186691403388977, "num_tokens": 553726380.0, "step": 1738 }, { "epoch": 1.7690742624618516, "grad_norm": 0.7484753727912903, "learning_rate": 1e-06, "loss": 0.5834, "mean_token_accuracy": 0.8241056799888611, "num_tokens": 554060617.0, "step": 1739 }, { "epoch": 1.7700915564598168, "grad_norm": 0.7562209367752075, "learning_rate": 1e-06, "loss": 0.5706, "mean_token_accuracy": 0.8281306028366089, "num_tokens": 554385731.0, "step": 1740 }, { "epoch": 1.7711088504577823, "grad_norm": 0.7572125792503357, "learning_rate": 1e-06, "loss": 0.565, "mean_token_accuracy": 0.8297789096832275, "num_tokens": 554714277.0, "step": 1741 }, { "epoch": 1.7721261444557477, "grad_norm": 0.8044959902763367, "learning_rate": 1e-06, "loss": 0.5804, "mean_token_accuracy": 0.824925422668457, "num_tokens": 555033920.0, "step": 1742 }, { "epoch": 1.773143438453713, "grad_norm": 0.7124162912368774, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8302481770515442, "num_tokens": 555371365.0, "step": 1743 }, { "epoch": 1.7741607324516786, "grad_norm": 0.723034679889679, "learning_rate": 1e-06, "loss": 0.5573, "mean_token_accuracy": 0.830976128578186, "num_tokens": 555695553.0, "step": 1744 }, { "epoch": 1.775178026449644, "grad_norm": 0.7885095477104187, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.8223779797554016, "num_tokens": 556006070.0, "step": 1745 }, { "epoch": 1.7761953204476093, "grad_norm": 0.7051622271537781, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.835534930229187, "num_tokens": 556324330.0, "step": 1746 }, { "epoch": 1.7772126144455749, "grad_norm": 0.777579128742218, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8295488357543945, "num_tokens": 556639463.0, "step": 1747 }, { "epoch": 1.7782299084435402, "grad_norm": 0.7351852059364319, "learning_rate": 1e-06, "loss": 0.5782, "mean_token_accuracy": 0.825910210609436, "num_tokens": 556964733.0, "step": 1748 }, { "epoch": 1.7792472024415056, "grad_norm": 0.7478289008140564, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.8244177103042603, "num_tokens": 557293193.0, "step": 1749 }, { "epoch": 1.7802644964394712, "grad_norm": 0.7599475383758545, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.8303159475326538, "num_tokens": 557609634.0, "step": 1750 }, { "epoch": 1.7812817904374363, "grad_norm": 0.7593867182731628, "learning_rate": 1e-06, "loss": 0.5516, "mean_token_accuracy": 0.8315073251724243, "num_tokens": 557933267.0, "step": 1751 }, { "epoch": 1.7822990844354019, "grad_norm": 1.2834893465042114, "learning_rate": 1e-06, "loss": 0.6067, "mean_token_accuracy": 0.8185144066810608, "num_tokens": 558238439.0, "step": 1752 }, { "epoch": 1.7833163784333672, "grad_norm": 0.7640964984893799, "learning_rate": 1e-06, "loss": 0.5796, "mean_token_accuracy": 0.8251737356185913, "num_tokens": 558564521.0, "step": 1753 }, { "epoch": 1.7843336724313326, "grad_norm": 0.781453013420105, "learning_rate": 1e-06, "loss": 0.5994, "mean_token_accuracy": 0.8211246132850647, "num_tokens": 558870711.0, "step": 1754 }, { "epoch": 1.7853509664292981, "grad_norm": 0.7851511836051941, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8274396657943726, "num_tokens": 559184486.0, "step": 1755 }, { "epoch": 1.7863682604272635, "grad_norm": 0.7635732293128967, "learning_rate": 1e-06, "loss": 0.5656, "mean_token_accuracy": 0.8281428813934326, "num_tokens": 559487831.0, "step": 1756 }, { "epoch": 1.7873855544252288, "grad_norm": 0.7452448010444641, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.8321632146835327, "num_tokens": 559805689.0, "step": 1757 }, { "epoch": 1.7884028484231944, "grad_norm": 0.7915704250335693, "learning_rate": 1e-06, "loss": 0.5424, "mean_token_accuracy": 0.8353804349899292, "num_tokens": 560117177.0, "step": 1758 }, { "epoch": 1.7894201424211598, "grad_norm": 0.7450749278068542, "learning_rate": 1e-06, "loss": 0.574, "mean_token_accuracy": 0.8259989023208618, "num_tokens": 560434600.0, "step": 1759 }, { "epoch": 1.7904374364191251, "grad_norm": 0.7560845613479614, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.8280336856842041, "num_tokens": 560752929.0, "step": 1760 }, { "epoch": 1.7914547304170907, "grad_norm": 0.7767955660820007, "learning_rate": 1e-06, "loss": 0.5741, "mean_token_accuracy": 0.8263304233551025, "num_tokens": 561057758.0, "step": 1761 }, { "epoch": 1.7924720244150558, "grad_norm": 0.7627075910568237, "learning_rate": 1e-06, "loss": 0.5576, "mean_token_accuracy": 0.8305634260177612, "num_tokens": 561364341.0, "step": 1762 }, { "epoch": 1.7934893184130214, "grad_norm": 0.734595537185669, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8293797969818115, "num_tokens": 561682192.0, "step": 1763 }, { "epoch": 1.7945066124109867, "grad_norm": 0.7812824845314026, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8202382922172546, "num_tokens": 561984366.0, "step": 1764 }, { "epoch": 1.795523906408952, "grad_norm": 0.7395841479301453, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8286502361297607, "num_tokens": 562319171.0, "step": 1765 }, { "epoch": 1.7965412004069177, "grad_norm": 0.7442046999931335, "learning_rate": 1e-06, "loss": 0.5919, "mean_token_accuracy": 0.822655200958252, "num_tokens": 562649718.0, "step": 1766 }, { "epoch": 1.797558494404883, "grad_norm": 0.7415618896484375, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.8303205966949463, "num_tokens": 562962769.0, "step": 1767 }, { "epoch": 1.7985757884028484, "grad_norm": 0.8440918922424316, "learning_rate": 1e-06, "loss": 0.5739, "mean_token_accuracy": 0.8259419202804565, "num_tokens": 563281657.0, "step": 1768 }, { "epoch": 1.799593082400814, "grad_norm": 0.7686070203781128, "learning_rate": 1e-06, "loss": 0.5739, "mean_token_accuracy": 0.8270139098167419, "num_tokens": 563599442.0, "step": 1769 }, { "epoch": 1.8006103763987793, "grad_norm": 0.7371310591697693, "learning_rate": 1e-06, "loss": 0.5726, "mean_token_accuracy": 0.8267232179641724, "num_tokens": 563938038.0, "step": 1770 }, { "epoch": 1.8016276703967447, "grad_norm": 0.7386311292648315, "learning_rate": 1e-06, "loss": 0.5484, "mean_token_accuracy": 0.8327062129974365, "num_tokens": 564249675.0, "step": 1771 }, { "epoch": 1.8026449643947102, "grad_norm": 0.7869567275047302, "learning_rate": 1e-06, "loss": 0.5705, "mean_token_accuracy": 0.8284811973571777, "num_tokens": 564549442.0, "step": 1772 }, { "epoch": 1.8036622583926754, "grad_norm": 0.8040668964385986, "learning_rate": 1e-06, "loss": 0.557, "mean_token_accuracy": 0.8295624256134033, "num_tokens": 564879892.0, "step": 1773 }, { "epoch": 1.804679552390641, "grad_norm": 0.8081727027893066, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.8280688524246216, "num_tokens": 565210219.0, "step": 1774 }, { "epoch": 1.8056968463886063, "grad_norm": 0.761573314666748, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8316044807434082, "num_tokens": 565526228.0, "step": 1775 }, { "epoch": 1.8067141403865716, "grad_norm": 0.7849445343017578, "learning_rate": 1e-06, "loss": 0.591, "mean_token_accuracy": 0.8230304718017578, "num_tokens": 565852900.0, "step": 1776 }, { "epoch": 1.8077314343845372, "grad_norm": 0.7502607703208923, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.831209123134613, "num_tokens": 566180405.0, "step": 1777 }, { "epoch": 1.8087487283825026, "grad_norm": 0.797438383102417, "learning_rate": 1e-06, "loss": 0.5824, "mean_token_accuracy": 0.8235186338424683, "num_tokens": 566504302.0, "step": 1778 }, { "epoch": 1.809766022380468, "grad_norm": 0.7596687078475952, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8234046697616577, "num_tokens": 566830753.0, "step": 1779 }, { "epoch": 1.8107833163784335, "grad_norm": 0.8487532734870911, "learning_rate": 1e-06, "loss": 0.5573, "mean_token_accuracy": 0.8312756419181824, "num_tokens": 567125149.0, "step": 1780 }, { "epoch": 1.8118006103763988, "grad_norm": 0.72871994972229, "learning_rate": 1e-06, "loss": 0.5792, "mean_token_accuracy": 0.8263726234436035, "num_tokens": 567439219.0, "step": 1781 }, { "epoch": 1.8128179043743642, "grad_norm": 0.7636623978614807, "learning_rate": 1e-06, "loss": 0.5693, "mean_token_accuracy": 0.8279685974121094, "num_tokens": 567757245.0, "step": 1782 }, { "epoch": 1.8138351983723298, "grad_norm": 0.7923188805580139, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8303213119506836, "num_tokens": 568063203.0, "step": 1783 }, { "epoch": 1.814852492370295, "grad_norm": 0.784536600112915, "learning_rate": 1e-06, "loss": 0.5724, "mean_token_accuracy": 0.8268868923187256, "num_tokens": 568375485.0, "step": 1784 }, { "epoch": 1.8158697863682605, "grad_norm": 0.7456648945808411, "learning_rate": 1e-06, "loss": 0.5706, "mean_token_accuracy": 0.8270926475524902, "num_tokens": 568704776.0, "step": 1785 }, { "epoch": 1.8168870803662258, "grad_norm": 0.7257611155509949, "learning_rate": 1e-06, "loss": 0.5567, "mean_token_accuracy": 0.8301023244857788, "num_tokens": 569029468.0, "step": 1786 }, { "epoch": 1.8179043743641912, "grad_norm": 0.7755439877510071, "learning_rate": 1e-06, "loss": 0.5728, "mean_token_accuracy": 0.8267319202423096, "num_tokens": 569332534.0, "step": 1787 }, { "epoch": 1.8189216683621567, "grad_norm": 0.7320640683174133, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8308744430541992, "num_tokens": 569647573.0, "step": 1788 }, { "epoch": 1.819938962360122, "grad_norm": 0.7781921625137329, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8281724452972412, "num_tokens": 569960390.0, "step": 1789 }, { "epoch": 1.8209562563580874, "grad_norm": 0.7738130688667297, "learning_rate": 1e-06, "loss": 0.5804, "mean_token_accuracy": 0.8257182240486145, "num_tokens": 570276992.0, "step": 1790 }, { "epoch": 1.821973550356053, "grad_norm": 0.7794972062110901, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8308176398277283, "num_tokens": 570601038.0, "step": 1791 }, { "epoch": 1.8229908443540181, "grad_norm": 0.7800111770629883, "learning_rate": 1e-06, "loss": 0.5713, "mean_token_accuracy": 0.8281744718551636, "num_tokens": 570927501.0, "step": 1792 }, { "epoch": 1.8240081383519837, "grad_norm": 0.7633543610572815, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.8264333009719849, "num_tokens": 571225760.0, "step": 1793 }, { "epoch": 1.8250254323499493, "grad_norm": 0.7642391920089722, "learning_rate": 1e-06, "loss": 0.5715, "mean_token_accuracy": 0.8267236948013306, "num_tokens": 571536763.0, "step": 1794 }, { "epoch": 1.8260427263479144, "grad_norm": 0.7453933358192444, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.8335306644439697, "num_tokens": 571865425.0, "step": 1795 }, { "epoch": 1.82706002034588, "grad_norm": 1.2367335557937622, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8284416198730469, "num_tokens": 572184872.0, "step": 1796 }, { "epoch": 1.8280773143438453, "grad_norm": 0.7603802680969238, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.8286498785018921, "num_tokens": 572506248.0, "step": 1797 }, { "epoch": 1.8290946083418107, "grad_norm": 0.7347100377082825, "learning_rate": 1e-06, "loss": 0.5901, "mean_token_accuracy": 0.8210036754608154, "num_tokens": 572820670.0, "step": 1798 }, { "epoch": 1.8301119023397763, "grad_norm": 1.4764974117279053, "learning_rate": 1e-06, "loss": 0.5429, "mean_token_accuracy": 0.8353832960128784, "num_tokens": 573129521.0, "step": 1799 }, { "epoch": 1.8311291963377416, "grad_norm": 0.7458595037460327, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.825110137462616, "num_tokens": 573458787.0, "step": 1800 }, { "epoch": 1.832146490335707, "grad_norm": 0.7468324899673462, "learning_rate": 1e-06, "loss": 0.5871, "mean_token_accuracy": 0.8235799074172974, "num_tokens": 573794196.0, "step": 1801 }, { "epoch": 1.8331637843336726, "grad_norm": 0.7786433696746826, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8281745910644531, "num_tokens": 574118594.0, "step": 1802 }, { "epoch": 1.8341810783316377, "grad_norm": 0.7661991119384766, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8331655263900757, "num_tokens": 574429910.0, "step": 1803 }, { "epoch": 1.8351983723296033, "grad_norm": 1.088448405265808, "learning_rate": 1e-06, "loss": 0.5532, "mean_token_accuracy": 0.8323421478271484, "num_tokens": 574750525.0, "step": 1804 }, { "epoch": 1.8362156663275688, "grad_norm": 0.7912493348121643, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.8276345729827881, "num_tokens": 575086614.0, "step": 1805 }, { "epoch": 1.837232960325534, "grad_norm": 0.7702261209487915, "learning_rate": 1e-06, "loss": 0.5511, "mean_token_accuracy": 0.8321763277053833, "num_tokens": 575407175.0, "step": 1806 }, { "epoch": 1.8382502543234995, "grad_norm": 0.8245096206665039, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.8336420655250549, "num_tokens": 575734331.0, "step": 1807 }, { "epoch": 1.8392675483214649, "grad_norm": 0.7476911544799805, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8227272629737854, "num_tokens": 576048212.0, "step": 1808 }, { "epoch": 1.8402848423194302, "grad_norm": 0.7333124279975891, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8251782059669495, "num_tokens": 576379541.0, "step": 1809 }, { "epoch": 1.8413021363173958, "grad_norm": 0.7403326630592346, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8277183771133423, "num_tokens": 576693938.0, "step": 1810 }, { "epoch": 1.8423194303153612, "grad_norm": 0.8351613879203796, "learning_rate": 1e-06, "loss": 0.564, "mean_token_accuracy": 0.8285555243492126, "num_tokens": 576996432.0, "step": 1811 }, { "epoch": 1.8433367243133265, "grad_norm": 0.7698614597320557, "learning_rate": 1e-06, "loss": 0.5847, "mean_token_accuracy": 0.8226888179779053, "num_tokens": 577320221.0, "step": 1812 }, { "epoch": 1.844354018311292, "grad_norm": 0.7601704597473145, "learning_rate": 1e-06, "loss": 0.5726, "mean_token_accuracy": 0.8269021511077881, "num_tokens": 577643317.0, "step": 1813 }, { "epoch": 1.8453713123092572, "grad_norm": 0.7298431396484375, "learning_rate": 1e-06, "loss": 0.5588, "mean_token_accuracy": 0.8303334712982178, "num_tokens": 577962353.0, "step": 1814 }, { "epoch": 1.8463886063072228, "grad_norm": 0.7406944036483765, "learning_rate": 1e-06, "loss": 0.5833, "mean_token_accuracy": 0.8237319588661194, "num_tokens": 578280116.0, "step": 1815 }, { "epoch": 1.8474059003051884, "grad_norm": 0.8527024984359741, "learning_rate": 1e-06, "loss": 0.5646, "mean_token_accuracy": 0.8289049863815308, "num_tokens": 578587724.0, "step": 1816 }, { "epoch": 1.8484231943031535, "grad_norm": 0.7803925275802612, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.82612144947052, "num_tokens": 578907271.0, "step": 1817 }, { "epoch": 1.849440488301119, "grad_norm": 0.7724500298500061, "learning_rate": 1e-06, "loss": 0.5751, "mean_token_accuracy": 0.8265190720558167, "num_tokens": 579229124.0, "step": 1818 }, { "epoch": 1.8504577822990844, "grad_norm": 0.747200608253479, "learning_rate": 1e-06, "loss": 0.5405, "mean_token_accuracy": 0.8350002765655518, "num_tokens": 579541087.0, "step": 1819 }, { "epoch": 1.8514750762970498, "grad_norm": 0.775551438331604, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8248890042304993, "num_tokens": 579853781.0, "step": 1820 }, { "epoch": 1.8524923702950153, "grad_norm": 0.7513517141342163, "learning_rate": 1e-06, "loss": 0.5592, "mean_token_accuracy": 0.8300427794456482, "num_tokens": 580175507.0, "step": 1821 }, { "epoch": 1.8535096642929807, "grad_norm": 0.8104873895645142, "learning_rate": 1e-06, "loss": 0.5747, "mean_token_accuracy": 0.8259702324867249, "num_tokens": 580499293.0, "step": 1822 }, { "epoch": 1.854526958290946, "grad_norm": 0.785900890827179, "learning_rate": 1e-06, "loss": 0.5775, "mean_token_accuracy": 0.8257489800453186, "num_tokens": 580805832.0, "step": 1823 }, { "epoch": 1.8555442522889116, "grad_norm": 0.7971948981285095, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8351184725761414, "num_tokens": 581107989.0, "step": 1824 }, { "epoch": 1.8565615462868768, "grad_norm": 0.7580695748329163, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8308185935020447, "num_tokens": 581423977.0, "step": 1825 }, { "epoch": 1.8575788402848423, "grad_norm": 0.7906219363212585, "learning_rate": 1e-06, "loss": 0.59, "mean_token_accuracy": 0.8221039772033691, "num_tokens": 581742263.0, "step": 1826 }, { "epoch": 1.8585961342828077, "grad_norm": 0.7569301128387451, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.8262839317321777, "num_tokens": 582060741.0, "step": 1827 }, { "epoch": 1.859613428280773, "grad_norm": 0.7414655685424805, "learning_rate": 1e-06, "loss": 0.5637, "mean_token_accuracy": 0.8287638425827026, "num_tokens": 582383233.0, "step": 1828 }, { "epoch": 1.8606307222787386, "grad_norm": 0.7988961338996887, "learning_rate": 1e-06, "loss": 0.5931, "mean_token_accuracy": 0.8204246759414673, "num_tokens": 582700928.0, "step": 1829 }, { "epoch": 1.861648016276704, "grad_norm": 0.754478931427002, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.8302208781242371, "num_tokens": 583019968.0, "step": 1830 }, { "epoch": 1.8626653102746693, "grad_norm": 0.811531126499176, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8267852067947388, "num_tokens": 583318374.0, "step": 1831 }, { "epoch": 1.8636826042726349, "grad_norm": 0.7797427177429199, "learning_rate": 1e-06, "loss": 0.5535, "mean_token_accuracy": 0.8314744234085083, "num_tokens": 583646646.0, "step": 1832 }, { "epoch": 1.8646998982706002, "grad_norm": 0.757041871547699, "learning_rate": 1e-06, "loss": 0.5613, "mean_token_accuracy": 0.8293916583061218, "num_tokens": 583961908.0, "step": 1833 }, { "epoch": 1.8657171922685656, "grad_norm": 0.7979153394699097, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8318359851837158, "num_tokens": 584274112.0, "step": 1834 }, { "epoch": 1.8667344862665312, "grad_norm": 0.7214437127113342, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8350898027420044, "num_tokens": 584604753.0, "step": 1835 }, { "epoch": 1.8677517802644963, "grad_norm": 0.7353912591934204, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.8271667957305908, "num_tokens": 584922779.0, "step": 1836 }, { "epoch": 1.8687690742624619, "grad_norm": 0.7849698662757874, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8227310180664062, "num_tokens": 585258146.0, "step": 1837 }, { "epoch": 1.8697863682604272, "grad_norm": 0.7752400040626526, "learning_rate": 1e-06, "loss": 0.5484, "mean_token_accuracy": 0.8339313864707947, "num_tokens": 585575451.0, "step": 1838 }, { "epoch": 1.8708036622583926, "grad_norm": 0.7641316652297974, "learning_rate": 1e-06, "loss": 0.5751, "mean_token_accuracy": 0.825273871421814, "num_tokens": 585907402.0, "step": 1839 }, { "epoch": 1.8718209562563581, "grad_norm": 0.7349004745483398, "learning_rate": 1e-06, "loss": 0.5783, "mean_token_accuracy": 0.8251415491104126, "num_tokens": 586248102.0, "step": 1840 }, { "epoch": 1.8728382502543235, "grad_norm": 0.7678957581520081, "learning_rate": 1e-06, "loss": 0.5885, "mean_token_accuracy": 0.8224442005157471, "num_tokens": 586569377.0, "step": 1841 }, { "epoch": 1.8738555442522888, "grad_norm": 0.7082367539405823, "learning_rate": 1e-06, "loss": 0.5586, "mean_token_accuracy": 0.8304846286773682, "num_tokens": 586891543.0, "step": 1842 }, { "epoch": 1.8748728382502544, "grad_norm": 0.7639223337173462, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.826633870601654, "num_tokens": 587214972.0, "step": 1843 }, { "epoch": 1.8758901322482198, "grad_norm": 0.7483731508255005, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8220580816268921, "num_tokens": 587541589.0, "step": 1844 }, { "epoch": 1.8769074262461851, "grad_norm": 0.78640216588974, "learning_rate": 1e-06, "loss": 0.5908, "mean_token_accuracy": 0.8221210241317749, "num_tokens": 587860976.0, "step": 1845 }, { "epoch": 1.8779247202441507, "grad_norm": 0.7742619514465332, "learning_rate": 1e-06, "loss": 0.572, "mean_token_accuracy": 0.8282708525657654, "num_tokens": 588168649.0, "step": 1846 }, { "epoch": 1.8789420142421158, "grad_norm": 0.7555391192436218, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8348156213760376, "num_tokens": 588491275.0, "step": 1847 }, { "epoch": 1.8799593082400814, "grad_norm": 0.7500118613243103, "learning_rate": 1e-06, "loss": 0.5613, "mean_token_accuracy": 0.8297252058982849, "num_tokens": 588817168.0, "step": 1848 }, { "epoch": 1.8809766022380467, "grad_norm": 0.7919448018074036, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.8308275938034058, "num_tokens": 589135465.0, "step": 1849 }, { "epoch": 1.881993896236012, "grad_norm": 0.7840650677680969, "learning_rate": 1e-06, "loss": 0.5927, "mean_token_accuracy": 0.8216804265975952, "num_tokens": 589459066.0, "step": 1850 }, { "epoch": 1.8830111902339777, "grad_norm": 0.7872382998466492, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8276762962341309, "num_tokens": 589787765.0, "step": 1851 }, { "epoch": 1.884028484231943, "grad_norm": 0.8268197178840637, "learning_rate": 1e-06, "loss": 0.5687, "mean_token_accuracy": 0.8270741701126099, "num_tokens": 590119500.0, "step": 1852 }, { "epoch": 1.8850457782299084, "grad_norm": 0.7454878091812134, "learning_rate": 1e-06, "loss": 0.5877, "mean_token_accuracy": 0.8230266571044922, "num_tokens": 590427085.0, "step": 1853 }, { "epoch": 1.886063072227874, "grad_norm": 0.7611258029937744, "learning_rate": 1e-06, "loss": 0.5649, "mean_token_accuracy": 0.8284235000610352, "num_tokens": 590750486.0, "step": 1854 }, { "epoch": 1.8870803662258393, "grad_norm": 0.8493229746818542, "learning_rate": 1e-06, "loss": 0.5651, "mean_token_accuracy": 0.8279353380203247, "num_tokens": 591063252.0, "step": 1855 }, { "epoch": 1.8880976602238047, "grad_norm": 0.7734782099723816, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8284351825714111, "num_tokens": 591367281.0, "step": 1856 }, { "epoch": 1.8891149542217702, "grad_norm": 0.7691129446029663, "learning_rate": 1e-06, "loss": 0.5937, "mean_token_accuracy": 0.8218074440956116, "num_tokens": 591689473.0, "step": 1857 }, { "epoch": 1.8901322482197354, "grad_norm": 0.7397372126579285, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8286991119384766, "num_tokens": 592022697.0, "step": 1858 }, { "epoch": 1.891149542217701, "grad_norm": 0.7259275317192078, "learning_rate": 1e-06, "loss": 0.5742, "mean_token_accuracy": 0.8260259628295898, "num_tokens": 592341817.0, "step": 1859 }, { "epoch": 1.8921668362156663, "grad_norm": 0.8009742498397827, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8263646364212036, "num_tokens": 592667852.0, "step": 1860 }, { "epoch": 1.8931841302136316, "grad_norm": 0.7932537198066711, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8248840570449829, "num_tokens": 592990364.0, "step": 1861 }, { "epoch": 1.8942014242115972, "grad_norm": 0.7673304080963135, "learning_rate": 1e-06, "loss": 0.576, "mean_token_accuracy": 0.8271982669830322, "num_tokens": 593313033.0, "step": 1862 }, { "epoch": 1.8952187182095626, "grad_norm": 0.7386042475700378, "learning_rate": 1e-06, "loss": 0.5653, "mean_token_accuracy": 0.8284578323364258, "num_tokens": 593629565.0, "step": 1863 }, { "epoch": 1.896236012207528, "grad_norm": 0.7839403748512268, "learning_rate": 1e-06, "loss": 0.5799, "mean_token_accuracy": 0.8257097005844116, "num_tokens": 593932811.0, "step": 1864 }, { "epoch": 1.8972533062054935, "grad_norm": 0.8242468237876892, "learning_rate": 1e-06, "loss": 0.5535, "mean_token_accuracy": 0.8315895795822144, "num_tokens": 594251800.0, "step": 1865 }, { "epoch": 1.8982706002034588, "grad_norm": 0.7358497381210327, "learning_rate": 1e-06, "loss": 0.5798, "mean_token_accuracy": 0.8254359364509583, "num_tokens": 594566525.0, "step": 1866 }, { "epoch": 1.8992878942014242, "grad_norm": 0.7373138070106506, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8289642333984375, "num_tokens": 594903927.0, "step": 1867 }, { "epoch": 1.9003051881993898, "grad_norm": 0.8773072957992554, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.8294283151626587, "num_tokens": 595203411.0, "step": 1868 }, { "epoch": 1.901322482197355, "grad_norm": 0.8440640568733215, "learning_rate": 1e-06, "loss": 0.5942, "mean_token_accuracy": 0.8205792903900146, "num_tokens": 595523298.0, "step": 1869 }, { "epoch": 1.9023397761953205, "grad_norm": 0.843758761882782, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.8344087600708008, "num_tokens": 595851295.0, "step": 1870 }, { "epoch": 1.9033570701932858, "grad_norm": 0.8056725859642029, "learning_rate": 1e-06, "loss": 0.5745, "mean_token_accuracy": 0.8255429267883301, "num_tokens": 596165520.0, "step": 1871 }, { "epoch": 1.9043743641912512, "grad_norm": 0.7543884515762329, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8281387686729431, "num_tokens": 596492724.0, "step": 1872 }, { "epoch": 1.9053916581892167, "grad_norm": 0.8288529515266418, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8341200351715088, "num_tokens": 596815498.0, "step": 1873 }, { "epoch": 1.906408952187182, "grad_norm": 0.7765027284622192, "learning_rate": 1e-06, "loss": 0.591, "mean_token_accuracy": 0.82168048620224, "num_tokens": 597137505.0, "step": 1874 }, { "epoch": 1.9074262461851474, "grad_norm": 0.7864693999290466, "learning_rate": 1e-06, "loss": 0.5741, "mean_token_accuracy": 0.8255944848060608, "num_tokens": 597439964.0, "step": 1875 }, { "epoch": 1.908443540183113, "grad_norm": 0.7858473062515259, "learning_rate": 1e-06, "loss": 0.5782, "mean_token_accuracy": 0.8250025510787964, "num_tokens": 597762524.0, "step": 1876 }, { "epoch": 1.9094608341810784, "grad_norm": 0.8264142274856567, "learning_rate": 1e-06, "loss": 0.5861, "mean_token_accuracy": 0.8224828839302063, "num_tokens": 598088815.0, "step": 1877 }, { "epoch": 1.9104781281790437, "grad_norm": 0.8210098743438721, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8275391459465027, "num_tokens": 598408435.0, "step": 1878 }, { "epoch": 1.9114954221770093, "grad_norm": 0.7526994943618774, "learning_rate": 1e-06, "loss": 0.5619, "mean_token_accuracy": 0.829414963722229, "num_tokens": 598703467.0, "step": 1879 }, { "epoch": 1.9125127161749744, "grad_norm": 0.7536560893058777, "learning_rate": 1e-06, "loss": 0.5724, "mean_token_accuracy": 0.8269705772399902, "num_tokens": 599047533.0, "step": 1880 }, { "epoch": 1.91353001017294, "grad_norm": 0.8106531500816345, "learning_rate": 1e-06, "loss": 0.5646, "mean_token_accuracy": 0.8292891383171082, "num_tokens": 599369815.0, "step": 1881 }, { "epoch": 1.9145473041709054, "grad_norm": 0.7695590853691101, "learning_rate": 1e-06, "loss": 0.5395, "mean_token_accuracy": 0.8354257345199585, "num_tokens": 599687509.0, "step": 1882 }, { "epoch": 1.9155645981688707, "grad_norm": 0.7978789210319519, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8345689177513123, "num_tokens": 599989016.0, "step": 1883 }, { "epoch": 1.9165818921668363, "grad_norm": 0.7534072995185852, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8252373933792114, "num_tokens": 600296480.0, "step": 1884 }, { "epoch": 1.9175991861648016, "grad_norm": 0.7588997483253479, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8319488167762756, "num_tokens": 600617011.0, "step": 1885 }, { "epoch": 1.918616480162767, "grad_norm": 0.8035694360733032, "learning_rate": 1e-06, "loss": 0.5921, "mean_token_accuracy": 0.820303201675415, "num_tokens": 600942695.0, "step": 1886 }, { "epoch": 1.9196337741607326, "grad_norm": 0.8165997862815857, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8230639696121216, "num_tokens": 601254189.0, "step": 1887 }, { "epoch": 1.920651068158698, "grad_norm": 0.777153730392456, "learning_rate": 1e-06, "loss": 0.5693, "mean_token_accuracy": 0.8262922167778015, "num_tokens": 601571439.0, "step": 1888 }, { "epoch": 1.9216683621566633, "grad_norm": 0.7819951176643372, "learning_rate": 1e-06, "loss": 0.5671, "mean_token_accuracy": 0.8283118009567261, "num_tokens": 601896520.0, "step": 1889 }, { "epoch": 1.9226856561546288, "grad_norm": 0.8021113276481628, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8276652693748474, "num_tokens": 602203498.0, "step": 1890 }, { "epoch": 1.923702950152594, "grad_norm": 0.8063667416572571, "learning_rate": 1e-06, "loss": 0.565, "mean_token_accuracy": 0.827467143535614, "num_tokens": 602510131.0, "step": 1891 }, { "epoch": 1.9247202441505595, "grad_norm": 0.7440853714942932, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.8289334177970886, "num_tokens": 602835451.0, "step": 1892 }, { "epoch": 1.9257375381485249, "grad_norm": 0.7506888508796692, "learning_rate": 1e-06, "loss": 0.579, "mean_token_accuracy": 0.8250542283058167, "num_tokens": 603141191.0, "step": 1893 }, { "epoch": 1.9267548321464902, "grad_norm": 0.7600045800209045, "learning_rate": 1e-06, "loss": 0.5537, "mean_token_accuracy": 0.831433892250061, "num_tokens": 603470478.0, "step": 1894 }, { "epoch": 1.9277721261444558, "grad_norm": 0.8485811948776245, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.83091139793396, "num_tokens": 603791455.0, "step": 1895 }, { "epoch": 1.9287894201424212, "grad_norm": 0.789547324180603, "learning_rate": 1e-06, "loss": 0.5737, "mean_token_accuracy": 0.8254702091217041, "num_tokens": 604109105.0, "step": 1896 }, { "epoch": 1.9298067141403865, "grad_norm": 0.7782526612281799, "learning_rate": 1e-06, "loss": 0.5795, "mean_token_accuracy": 0.8261871337890625, "num_tokens": 604425281.0, "step": 1897 }, { "epoch": 1.930824008138352, "grad_norm": 0.7582260370254517, "learning_rate": 1e-06, "loss": 0.5955, "mean_token_accuracy": 0.821397066116333, "num_tokens": 604760916.0, "step": 1898 }, { "epoch": 1.9318413021363174, "grad_norm": 0.8307642936706543, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.8259291052818298, "num_tokens": 605069512.0, "step": 1899 }, { "epoch": 1.9328585961342828, "grad_norm": 0.7825501561164856, "learning_rate": 1e-06, "loss": 0.5742, "mean_token_accuracy": 0.8273696899414062, "num_tokens": 605388317.0, "step": 1900 }, { "epoch": 1.9338758901322484, "grad_norm": 0.7845165133476257, "learning_rate": 1e-06, "loss": 0.5714, "mean_token_accuracy": 0.8265910744667053, "num_tokens": 605720362.0, "step": 1901 }, { "epoch": 1.9348931841302135, "grad_norm": 0.7711207866668701, "learning_rate": 1e-06, "loss": 0.5867, "mean_token_accuracy": 0.823683500289917, "num_tokens": 606043288.0, "step": 1902 }, { "epoch": 1.935910478128179, "grad_norm": 0.7821605205535889, "learning_rate": 1e-06, "loss": 0.5765, "mean_token_accuracy": 0.8253616094589233, "num_tokens": 606348632.0, "step": 1903 }, { "epoch": 1.9369277721261444, "grad_norm": 0.7327739596366882, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8361741304397583, "num_tokens": 606676157.0, "step": 1904 }, { "epoch": 1.9379450661241098, "grad_norm": 0.787828803062439, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8274844884872437, "num_tokens": 607011481.0, "step": 1905 }, { "epoch": 1.9389623601220753, "grad_norm": 0.7983585596084595, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8234202861785889, "num_tokens": 607334595.0, "step": 1906 }, { "epoch": 1.9399796541200407, "grad_norm": 0.7567639946937561, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8337772488594055, "num_tokens": 607647463.0, "step": 1907 }, { "epoch": 1.940996948118006, "grad_norm": 0.7706727385520935, "learning_rate": 1e-06, "loss": 0.5633, "mean_token_accuracy": 0.8285652995109558, "num_tokens": 607969176.0, "step": 1908 }, { "epoch": 1.9420142421159716, "grad_norm": 0.85313880443573, "learning_rate": 1e-06, "loss": 0.5861, "mean_token_accuracy": 0.8226011991500854, "num_tokens": 608294081.0, "step": 1909 }, { "epoch": 1.943031536113937, "grad_norm": 0.8062782883644104, "learning_rate": 1e-06, "loss": 0.5814, "mean_token_accuracy": 0.823600172996521, "num_tokens": 608626842.0, "step": 1910 }, { "epoch": 1.9440488301119023, "grad_norm": 0.7171719074249268, "learning_rate": 1e-06, "loss": 0.5604, "mean_token_accuracy": 0.8306635618209839, "num_tokens": 608969123.0, "step": 1911 }, { "epoch": 1.945066124109868, "grad_norm": 0.8143373727798462, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8298995494842529, "num_tokens": 609267308.0, "step": 1912 }, { "epoch": 1.946083418107833, "grad_norm": 1.0744192600250244, "learning_rate": 1e-06, "loss": 0.5756, "mean_token_accuracy": 0.8253097534179688, "num_tokens": 609605979.0, "step": 1913 }, { "epoch": 1.9471007121057986, "grad_norm": 0.7887747287750244, "learning_rate": 1e-06, "loss": 0.565, "mean_token_accuracy": 0.8280919790267944, "num_tokens": 609905194.0, "step": 1914 }, { "epoch": 1.948118006103764, "grad_norm": 0.7372229695320129, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8253764510154724, "num_tokens": 610232914.0, "step": 1915 }, { "epoch": 1.9491353001017293, "grad_norm": 0.8306304812431335, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8301711678504944, "num_tokens": 610524356.0, "step": 1916 }, { "epoch": 1.9501525940996949, "grad_norm": 0.7157770395278931, "learning_rate": 1e-06, "loss": 0.559, "mean_token_accuracy": 0.8307013511657715, "num_tokens": 610848911.0, "step": 1917 }, { "epoch": 1.9511698880976602, "grad_norm": 0.7406263947486877, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.832368791103363, "num_tokens": 611151896.0, "step": 1918 }, { "epoch": 1.9521871820956256, "grad_norm": 0.7178313732147217, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8341268301010132, "num_tokens": 611477707.0, "step": 1919 }, { "epoch": 1.9532044760935912, "grad_norm": 0.8264352679252625, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.8236597776412964, "num_tokens": 611786310.0, "step": 1920 }, { "epoch": 1.9542217700915565, "grad_norm": 0.790361225605011, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.8291520476341248, "num_tokens": 612106242.0, "step": 1921 }, { "epoch": 1.9552390640895219, "grad_norm": 0.7451375722885132, "learning_rate": 1e-06, "loss": 0.5506, "mean_token_accuracy": 0.8325362205505371, "num_tokens": 612408694.0, "step": 1922 }, { "epoch": 1.9562563580874874, "grad_norm": 0.7814817428588867, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.8296904563903809, "num_tokens": 612723003.0, "step": 1923 }, { "epoch": 1.9572736520854526, "grad_norm": 0.7812781929969788, "learning_rate": 1e-06, "loss": 0.5636, "mean_token_accuracy": 0.8298949003219604, "num_tokens": 613044117.0, "step": 1924 }, { "epoch": 1.9582909460834181, "grad_norm": 0.8395456671714783, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.8346281051635742, "num_tokens": 613322625.0, "step": 1925 }, { "epoch": 1.9593082400813835, "grad_norm": 0.7608759999275208, "learning_rate": 1e-06, "loss": 0.5626, "mean_token_accuracy": 0.828799307346344, "num_tokens": 613652844.0, "step": 1926 }, { "epoch": 1.9603255340793488, "grad_norm": 0.7218765616416931, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8309551477432251, "num_tokens": 613969421.0, "step": 1927 }, { "epoch": 1.9613428280773144, "grad_norm": 0.7457222938537598, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.8331649899482727, "num_tokens": 614295637.0, "step": 1928 }, { "epoch": 1.9623601220752798, "grad_norm": 0.7531415224075317, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.828516960144043, "num_tokens": 614612191.0, "step": 1929 }, { "epoch": 1.9633774160732451, "grad_norm": 0.8161703944206238, "learning_rate": 1e-06, "loss": 0.5715, "mean_token_accuracy": 0.8255881667137146, "num_tokens": 614941005.0, "step": 1930 }, { "epoch": 1.9643947100712107, "grad_norm": 0.7939525842666626, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8334426879882812, "num_tokens": 615260901.0, "step": 1931 }, { "epoch": 1.965412004069176, "grad_norm": 0.8395957946777344, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8290640711784363, "num_tokens": 615576002.0, "step": 1932 }, { "epoch": 1.9664292980671414, "grad_norm": 0.7337270975112915, "learning_rate": 1e-06, "loss": 0.557, "mean_token_accuracy": 0.8317199349403381, "num_tokens": 615906641.0, "step": 1933 }, { "epoch": 1.967446592065107, "grad_norm": 0.7723780870437622, "learning_rate": 1e-06, "loss": 0.6038, "mean_token_accuracy": 0.8182454705238342, "num_tokens": 616248968.0, "step": 1934 }, { "epoch": 1.968463886063072, "grad_norm": 0.7680447697639465, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8295483589172363, "num_tokens": 616578475.0, "step": 1935 }, { "epoch": 1.9694811800610377, "grad_norm": 0.7666056156158447, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.8256507515907288, "num_tokens": 616899976.0, "step": 1936 }, { "epoch": 1.970498474059003, "grad_norm": 0.7603473663330078, "learning_rate": 1e-06, "loss": 0.5544, "mean_token_accuracy": 0.8319006562232971, "num_tokens": 617214839.0, "step": 1937 }, { "epoch": 1.9715157680569684, "grad_norm": 0.7682324051856995, "learning_rate": 1e-06, "loss": 0.5856, "mean_token_accuracy": 0.823277473449707, "num_tokens": 617512120.0, "step": 1938 }, { "epoch": 1.972533062054934, "grad_norm": 0.7841208577156067, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.829049825668335, "num_tokens": 617823554.0, "step": 1939 }, { "epoch": 1.9735503560528993, "grad_norm": 0.7929815053939819, "learning_rate": 1e-06, "loss": 0.5629, "mean_token_accuracy": 0.8287888765335083, "num_tokens": 618137243.0, "step": 1940 }, { "epoch": 1.9745676500508647, "grad_norm": 0.8292960524559021, "learning_rate": 1e-06, "loss": 0.5805, "mean_token_accuracy": 0.825197160243988, "num_tokens": 618473609.0, "step": 1941 }, { "epoch": 1.9755849440488302, "grad_norm": 0.7060987949371338, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.8338747024536133, "num_tokens": 618796427.0, "step": 1942 }, { "epoch": 1.9766022380467956, "grad_norm": 0.7885839343070984, "learning_rate": 1e-06, "loss": 0.56, "mean_token_accuracy": 0.8296661972999573, "num_tokens": 619108709.0, "step": 1943 }, { "epoch": 1.977619532044761, "grad_norm": 0.7611109614372253, "learning_rate": 1e-06, "loss": 0.5684, "mean_token_accuracy": 0.8274158239364624, "num_tokens": 619435220.0, "step": 1944 }, { "epoch": 1.9786368260427265, "grad_norm": 0.7665042877197266, "learning_rate": 1e-06, "loss": 0.6039, "mean_token_accuracy": 0.8177695274353027, "num_tokens": 619759819.0, "step": 1945 }, { "epoch": 1.9796541200406916, "grad_norm": 0.781718373298645, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8298677802085876, "num_tokens": 620070498.0, "step": 1946 }, { "epoch": 1.9806714140386572, "grad_norm": 0.7552787661552429, "learning_rate": 1e-06, "loss": 0.5578, "mean_token_accuracy": 0.8303831815719604, "num_tokens": 620393836.0, "step": 1947 }, { "epoch": 1.9816887080366226, "grad_norm": 0.7667186260223389, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8307204842567444, "num_tokens": 620719684.0, "step": 1948 }, { "epoch": 1.982706002034588, "grad_norm": 0.7332197427749634, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.825315535068512, "num_tokens": 621050520.0, "step": 1949 }, { "epoch": 1.9837232960325535, "grad_norm": 0.8660725355148315, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8284949064254761, "num_tokens": 621363543.0, "step": 1950 }, { "epoch": 1.9847405900305188, "grad_norm": 0.7703171968460083, "learning_rate": 1e-06, "loss": 0.555, "mean_token_accuracy": 0.8312957286834717, "num_tokens": 621674098.0, "step": 1951 }, { "epoch": 1.9857578840284842, "grad_norm": 0.7844569683074951, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.830365777015686, "num_tokens": 621989155.0, "step": 1952 }, { "epoch": 1.9867751780264498, "grad_norm": 0.7418847680091858, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.8281689882278442, "num_tokens": 622317261.0, "step": 1953 }, { "epoch": 1.987792472024415, "grad_norm": 1.7465903759002686, "learning_rate": 1e-06, "loss": 0.5766, "mean_token_accuracy": 0.825128972530365, "num_tokens": 622647732.0, "step": 1954 }, { "epoch": 1.9888097660223805, "grad_norm": 0.7512000203132629, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8317239284515381, "num_tokens": 622984287.0, "step": 1955 }, { "epoch": 1.989827060020346, "grad_norm": 0.7754637598991394, "learning_rate": 1e-06, "loss": 0.5357, "mean_token_accuracy": 0.8367646932601929, "num_tokens": 623279377.0, "step": 1956 }, { "epoch": 1.9908443540183112, "grad_norm": 0.7074607014656067, "learning_rate": 1e-06, "loss": 0.5645, "mean_token_accuracy": 0.8279786109924316, "num_tokens": 623617093.0, "step": 1957 }, { "epoch": 1.9918616480162767, "grad_norm": 0.7336138486862183, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8236699104309082, "num_tokens": 623935367.0, "step": 1958 }, { "epoch": 1.992878942014242, "grad_norm": 0.7565160989761353, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.8255666494369507, "num_tokens": 624268903.0, "step": 1959 }, { "epoch": 1.9938962360122074, "grad_norm": 0.7770515084266663, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8275524377822876, "num_tokens": 624585621.0, "step": 1960 }, { "epoch": 1.994913530010173, "grad_norm": 0.7109097242355347, "learning_rate": 1e-06, "loss": 0.5888, "mean_token_accuracy": 0.822210431098938, "num_tokens": 624919983.0, "step": 1961 }, { "epoch": 1.9959308240081384, "grad_norm": 0.8382838368415833, "learning_rate": 1e-06, "loss": 0.6082, "mean_token_accuracy": 0.8173540830612183, "num_tokens": 625244861.0, "step": 1962 }, { "epoch": 1.9969481180061037, "grad_norm": 0.7442500591278076, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8312058448791504, "num_tokens": 625553633.0, "step": 1963 }, { "epoch": 1.9979654120040693, "grad_norm": 0.7596532106399536, "learning_rate": 1e-06, "loss": 0.5796, "mean_token_accuracy": 0.8254216909408569, "num_tokens": 625879310.0, "step": 1964 }, { "epoch": 1.9989827060020344, "grad_norm": 0.7299008369445801, "learning_rate": 1e-06, "loss": 0.5523, "mean_token_accuracy": 0.8325172662734985, "num_tokens": 626194056.0, "step": 1965 }, { "epoch": 2.0, "grad_norm": 0.769568681716919, "learning_rate": 1e-06, "loss": 0.5655, "mean_token_accuracy": 0.8282358050346375, "num_tokens": 626514393.0, "step": 1966 }, { "epoch": 2.0010172939979656, "grad_norm": 0.793281078338623, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8329326510429382, "num_tokens": 626843728.0, "step": 1967 }, { "epoch": 2.0020345879959307, "grad_norm": 0.7408942580223083, "learning_rate": 1e-06, "loss": 0.5692, "mean_token_accuracy": 0.8274793028831482, "num_tokens": 627179506.0, "step": 1968 }, { "epoch": 2.0030518819938963, "grad_norm": 0.7407317757606506, "learning_rate": 1e-06, "loss": 0.5528, "mean_token_accuracy": 0.8320581316947937, "num_tokens": 627483549.0, "step": 1969 }, { "epoch": 2.004069175991862, "grad_norm": 0.8348525762557983, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.828995406627655, "num_tokens": 627813505.0, "step": 1970 }, { "epoch": 2.005086469989827, "grad_norm": 0.7179054021835327, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8263566493988037, "num_tokens": 628136119.0, "step": 1971 }, { "epoch": 2.0061037639877926, "grad_norm": 0.8164916038513184, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8339582085609436, "num_tokens": 628450141.0, "step": 1972 }, { "epoch": 2.0071210579857577, "grad_norm": 0.7368289828300476, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8332412242889404, "num_tokens": 628770893.0, "step": 1973 }, { "epoch": 2.0081383519837233, "grad_norm": 0.7819622159004211, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8278636932373047, "num_tokens": 629091061.0, "step": 1974 }, { "epoch": 2.009155645981689, "grad_norm": 0.7813390493392944, "learning_rate": 1e-06, "loss": 0.5337, "mean_token_accuracy": 0.8363057971000671, "num_tokens": 629412815.0, "step": 1975 }, { "epoch": 2.010172939979654, "grad_norm": 0.8105148077011108, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8274290561676025, "num_tokens": 629722640.0, "step": 1976 }, { "epoch": 2.0111902339776195, "grad_norm": 0.7819290161132812, "learning_rate": 1e-06, "loss": 0.5698, "mean_token_accuracy": 0.8274918794631958, "num_tokens": 630044682.0, "step": 1977 }, { "epoch": 2.012207527975585, "grad_norm": 0.8088401556015015, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8194631338119507, "num_tokens": 630333856.0, "step": 1978 }, { "epoch": 2.0132248219735502, "grad_norm": 0.7477482557296753, "learning_rate": 1e-06, "loss": 0.5852, "mean_token_accuracy": 0.8224201202392578, "num_tokens": 630642252.0, "step": 1979 }, { "epoch": 2.014242115971516, "grad_norm": 0.7271042466163635, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8269437551498413, "num_tokens": 630967182.0, "step": 1980 }, { "epoch": 2.0152594099694814, "grad_norm": 0.7491155862808228, "learning_rate": 1e-06, "loss": 0.5189, "mean_token_accuracy": 0.841214656829834, "num_tokens": 631277861.0, "step": 1981 }, { "epoch": 2.0162767039674465, "grad_norm": 0.7819457054138184, "learning_rate": 1e-06, "loss": 0.5229, "mean_token_accuracy": 0.8392908573150635, "num_tokens": 631581074.0, "step": 1982 }, { "epoch": 2.017293997965412, "grad_norm": 0.7867006659507751, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.8317956328392029, "num_tokens": 631908085.0, "step": 1983 }, { "epoch": 2.018311291963377, "grad_norm": 0.7748855948448181, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8271198272705078, "num_tokens": 632228951.0, "step": 1984 }, { "epoch": 2.019328585961343, "grad_norm": 0.7708137631416321, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8322231769561768, "num_tokens": 632557630.0, "step": 1985 }, { "epoch": 2.0203458799593084, "grad_norm": 0.734464168548584, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.8276263475418091, "num_tokens": 632888435.0, "step": 1986 }, { "epoch": 2.0213631739572735, "grad_norm": 0.7887408137321472, "learning_rate": 1e-06, "loss": 0.5376, "mean_token_accuracy": 0.8354251384735107, "num_tokens": 633213083.0, "step": 1987 }, { "epoch": 2.022380467955239, "grad_norm": 0.7359147071838379, "learning_rate": 1e-06, "loss": 0.5578, "mean_token_accuracy": 0.8312580585479736, "num_tokens": 633521507.0, "step": 1988 }, { "epoch": 2.0233977619532046, "grad_norm": 0.7471558451652527, "learning_rate": 1e-06, "loss": 0.5619, "mean_token_accuracy": 0.8286077976226807, "num_tokens": 633851293.0, "step": 1989 }, { "epoch": 2.0244150559511698, "grad_norm": 0.7837832570075989, "learning_rate": 1e-06, "loss": 0.5756, "mean_token_accuracy": 0.8252357840538025, "num_tokens": 634182314.0, "step": 1990 }, { "epoch": 2.0254323499491353, "grad_norm": 0.7467847466468811, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.8307093977928162, "num_tokens": 634503716.0, "step": 1991 }, { "epoch": 2.026449643947101, "grad_norm": 0.7618154883384705, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8307061195373535, "num_tokens": 634812750.0, "step": 1992 }, { "epoch": 2.027466937945066, "grad_norm": 0.7568649053573608, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8288283348083496, "num_tokens": 635124979.0, "step": 1993 }, { "epoch": 2.0284842319430316, "grad_norm": 0.8403826951980591, "learning_rate": 1e-06, "loss": 0.556, "mean_token_accuracy": 0.8298501372337341, "num_tokens": 635455215.0, "step": 1994 }, { "epoch": 2.0295015259409968, "grad_norm": 0.7417284250259399, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.8286497592926025, "num_tokens": 635778771.0, "step": 1995 }, { "epoch": 2.0305188199389623, "grad_norm": 0.7632890939712524, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8259048461914062, "num_tokens": 636091977.0, "step": 1996 }, { "epoch": 2.031536113936928, "grad_norm": 0.7756537199020386, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8347346782684326, "num_tokens": 636426068.0, "step": 1997 }, { "epoch": 2.032553407934893, "grad_norm": 0.7656052708625793, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8321259021759033, "num_tokens": 636728987.0, "step": 1998 }, { "epoch": 2.0335707019328586, "grad_norm": 0.7594374418258667, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8242126703262329, "num_tokens": 637048170.0, "step": 1999 }, { "epoch": 2.034587995930824, "grad_norm": 0.7326385378837585, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8288227319717407, "num_tokens": 637376842.0, "step": 2000 }, { "epoch": 2.0356052899287893, "grad_norm": 0.7957135438919067, "learning_rate": 1e-06, "loss": 0.5831, "mean_token_accuracy": 0.8231464624404907, "num_tokens": 637708776.0, "step": 2001 }, { "epoch": 2.036622583926755, "grad_norm": 0.7293063402175903, "learning_rate": 1e-06, "loss": 0.5671, "mean_token_accuracy": 0.8277890086174011, "num_tokens": 638026761.0, "step": 2002 }, { "epoch": 2.0376398779247205, "grad_norm": 0.7880427241325378, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.828234851360321, "num_tokens": 638337687.0, "step": 2003 }, { "epoch": 2.0386571719226856, "grad_norm": 0.7594811320304871, "learning_rate": 1e-06, "loss": 0.531, "mean_token_accuracy": 0.8365904688835144, "num_tokens": 638663203.0, "step": 2004 }, { "epoch": 2.039674465920651, "grad_norm": 0.7555544376373291, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8272154331207275, "num_tokens": 638985653.0, "step": 2005 }, { "epoch": 2.0406917599186163, "grad_norm": 0.857343852519989, "learning_rate": 1e-06, "loss": 0.5771, "mean_token_accuracy": 0.8249437212944031, "num_tokens": 639296652.0, "step": 2006 }, { "epoch": 2.041709053916582, "grad_norm": 0.7711302042007446, "learning_rate": 1e-06, "loss": 0.5737, "mean_token_accuracy": 0.825864315032959, "num_tokens": 639629812.0, "step": 2007 }, { "epoch": 2.0427263479145474, "grad_norm": 1.0842350721359253, "learning_rate": 1e-06, "loss": 0.5384, "mean_token_accuracy": 0.8358830809593201, "num_tokens": 639953979.0, "step": 2008 }, { "epoch": 2.0437436419125126, "grad_norm": 0.7909249067306519, "learning_rate": 1e-06, "loss": 0.5484, "mean_token_accuracy": 0.8325679898262024, "num_tokens": 640271057.0, "step": 2009 }, { "epoch": 2.044760935910478, "grad_norm": 0.730385422706604, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8343998789787292, "num_tokens": 640574916.0, "step": 2010 }, { "epoch": 2.0457782299084437, "grad_norm": 0.7266718745231628, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8327420949935913, "num_tokens": 640899252.0, "step": 2011 }, { "epoch": 2.046795523906409, "grad_norm": 0.7553056478500366, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.8326089978218079, "num_tokens": 641225765.0, "step": 2012 }, { "epoch": 2.0478128179043744, "grad_norm": 0.7447772026062012, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8331438899040222, "num_tokens": 641553952.0, "step": 2013 }, { "epoch": 2.04883011190234, "grad_norm": 0.7557468414306641, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.825654149055481, "num_tokens": 641871688.0, "step": 2014 }, { "epoch": 2.049847405900305, "grad_norm": 0.7843445539474487, "learning_rate": 1e-06, "loss": 0.5864, "mean_token_accuracy": 0.8237768411636353, "num_tokens": 642182839.0, "step": 2015 }, { "epoch": 2.0508646998982707, "grad_norm": 0.8298075199127197, "learning_rate": 1e-06, "loss": 0.5714, "mean_token_accuracy": 0.8268923759460449, "num_tokens": 642490309.0, "step": 2016 }, { "epoch": 2.051881993896236, "grad_norm": 0.7977005243301392, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.8326629996299744, "num_tokens": 642789777.0, "step": 2017 }, { "epoch": 2.0528992878942014, "grad_norm": 0.7488961815834045, "learning_rate": 1e-06, "loss": 0.5653, "mean_token_accuracy": 0.8282648324966431, "num_tokens": 643121797.0, "step": 2018 }, { "epoch": 2.053916581892167, "grad_norm": 0.7535355687141418, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.8259459137916565, "num_tokens": 643442233.0, "step": 2019 }, { "epoch": 2.054933875890132, "grad_norm": 0.752249538898468, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8324294686317444, "num_tokens": 643770297.0, "step": 2020 }, { "epoch": 2.0559511698880977, "grad_norm": 0.7770043015480042, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.828926146030426, "num_tokens": 644077898.0, "step": 2021 }, { "epoch": 2.0569684638860632, "grad_norm": 0.7959228754043579, "learning_rate": 1e-06, "loss": 0.5657, "mean_token_accuracy": 0.8270010948181152, "num_tokens": 644396823.0, "step": 2022 }, { "epoch": 2.0579857578840284, "grad_norm": 0.7729719281196594, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.8291889429092407, "num_tokens": 644715765.0, "step": 2023 }, { "epoch": 2.059003051881994, "grad_norm": 0.7595959305763245, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8271936178207397, "num_tokens": 645039148.0, "step": 2024 }, { "epoch": 2.0600203458799595, "grad_norm": 0.8012211322784424, "learning_rate": 1e-06, "loss": 0.5639, "mean_token_accuracy": 0.828609049320221, "num_tokens": 645359940.0, "step": 2025 }, { "epoch": 2.0610376398779247, "grad_norm": 0.7749910354614258, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8325744867324829, "num_tokens": 645695713.0, "step": 2026 }, { "epoch": 2.0620549338758902, "grad_norm": 0.8015928864479065, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8329358100891113, "num_tokens": 646007060.0, "step": 2027 }, { "epoch": 2.0630722278738554, "grad_norm": 0.7851582169532776, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8285136222839355, "num_tokens": 646310789.0, "step": 2028 }, { "epoch": 2.064089521871821, "grad_norm": 0.7077644467353821, "learning_rate": 1e-06, "loss": 0.5313, "mean_token_accuracy": 0.836718738079071, "num_tokens": 646651450.0, "step": 2029 }, { "epoch": 2.0651068158697865, "grad_norm": 0.8080658316612244, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.827492356300354, "num_tokens": 646974846.0, "step": 2030 }, { "epoch": 2.0661241098677516, "grad_norm": 0.8135608434677124, "learning_rate": 1e-06, "loss": 0.5439, "mean_token_accuracy": 0.8345105648040771, "num_tokens": 647294337.0, "step": 2031 }, { "epoch": 2.067141403865717, "grad_norm": 0.7688538432121277, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8303366899490356, "num_tokens": 647614724.0, "step": 2032 }, { "epoch": 2.068158697863683, "grad_norm": 0.7426178455352783, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.833173930644989, "num_tokens": 647933285.0, "step": 2033 }, { "epoch": 2.069175991861648, "grad_norm": 0.7545600533485413, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8328017592430115, "num_tokens": 648251173.0, "step": 2034 }, { "epoch": 2.0701932858596135, "grad_norm": 0.8664435148239136, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8208303451538086, "num_tokens": 648551725.0, "step": 2035 }, { "epoch": 2.0712105798575786, "grad_norm": 0.7247047424316406, "learning_rate": 1e-06, "loss": 0.5523, "mean_token_accuracy": 0.8322950601577759, "num_tokens": 648863508.0, "step": 2036 }, { "epoch": 2.072227873855544, "grad_norm": 0.7342517971992493, "learning_rate": 1e-06, "loss": 0.5376, "mean_token_accuracy": 0.8353379964828491, "num_tokens": 649176493.0, "step": 2037 }, { "epoch": 2.0732451678535098, "grad_norm": 0.7122109532356262, "learning_rate": 1e-06, "loss": 0.5281, "mean_token_accuracy": 0.8390113711357117, "num_tokens": 649497977.0, "step": 2038 }, { "epoch": 2.074262461851475, "grad_norm": 0.7776530981063843, "learning_rate": 1e-06, "loss": 0.5653, "mean_token_accuracy": 0.8287674784660339, "num_tokens": 649826804.0, "step": 2039 }, { "epoch": 2.0752797558494405, "grad_norm": 0.9624866843223572, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.8347163200378418, "num_tokens": 650125023.0, "step": 2040 }, { "epoch": 2.076297049847406, "grad_norm": 1.2730292081832886, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.8335597515106201, "num_tokens": 650457502.0, "step": 2041 }, { "epoch": 2.077314343845371, "grad_norm": 0.7527278065681458, "learning_rate": 1e-06, "loss": 0.5576, "mean_token_accuracy": 0.8305301666259766, "num_tokens": 650774229.0, "step": 2042 }, { "epoch": 2.0783316378433367, "grad_norm": 0.7566222548484802, "learning_rate": 1e-06, "loss": 0.5683, "mean_token_accuracy": 0.8274224996566772, "num_tokens": 651094235.0, "step": 2043 }, { "epoch": 2.0793489318413023, "grad_norm": 0.8623789548873901, "learning_rate": 1e-06, "loss": 0.572, "mean_token_accuracy": 0.8269400596618652, "num_tokens": 651403530.0, "step": 2044 }, { "epoch": 2.0803662258392674, "grad_norm": 0.8023196458816528, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8259271383285522, "num_tokens": 651737149.0, "step": 2045 }, { "epoch": 2.081383519837233, "grad_norm": 0.7700502276420593, "learning_rate": 1e-06, "loss": 0.5688, "mean_token_accuracy": 0.8281545639038086, "num_tokens": 652057532.0, "step": 2046 }, { "epoch": 2.082400813835198, "grad_norm": 0.7362005114555359, "learning_rate": 1e-06, "loss": 0.5619, "mean_token_accuracy": 0.8289458751678467, "num_tokens": 652364933.0, "step": 2047 }, { "epoch": 2.0834181078331637, "grad_norm": 0.7432454228401184, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.8287770748138428, "num_tokens": 652693646.0, "step": 2048 }, { "epoch": 2.0844354018311293, "grad_norm": 0.7227328419685364, "learning_rate": 1e-06, "loss": 0.5353, "mean_token_accuracy": 0.8361355662345886, "num_tokens": 653007977.0, "step": 2049 }, { "epoch": 2.0854526958290944, "grad_norm": 0.7496834993362427, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8348923921585083, "num_tokens": 653315197.0, "step": 2050 }, { "epoch": 2.08646998982706, "grad_norm": 0.7300513386726379, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8325375914573669, "num_tokens": 653634130.0, "step": 2051 }, { "epoch": 2.0874872838250256, "grad_norm": 0.7461215853691101, "learning_rate": 1e-06, "loss": 0.5721, "mean_token_accuracy": 0.8253625631332397, "num_tokens": 653956123.0, "step": 2052 }, { "epoch": 2.0885045778229907, "grad_norm": 0.7695837020874023, "learning_rate": 1e-06, "loss": 0.5776, "mean_token_accuracy": 0.8242179155349731, "num_tokens": 654284883.0, "step": 2053 }, { "epoch": 2.0895218718209563, "grad_norm": 0.7454880475997925, "learning_rate": 1e-06, "loss": 0.5942, "mean_token_accuracy": 0.8213703632354736, "num_tokens": 654620877.0, "step": 2054 }, { "epoch": 2.090539165818922, "grad_norm": 0.7532869577407837, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.8228363394737244, "num_tokens": 654955083.0, "step": 2055 }, { "epoch": 2.091556459816887, "grad_norm": 0.7616458535194397, "learning_rate": 1e-06, "loss": 0.5358, "mean_token_accuracy": 0.8365463614463806, "num_tokens": 655262947.0, "step": 2056 }, { "epoch": 2.0925737538148526, "grad_norm": 0.7663776874542236, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.8342633843421936, "num_tokens": 655582982.0, "step": 2057 }, { "epoch": 2.0935910478128177, "grad_norm": 0.748302161693573, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8303278684616089, "num_tokens": 655892296.0, "step": 2058 }, { "epoch": 2.0946083418107833, "grad_norm": 0.7514637112617493, "learning_rate": 1e-06, "loss": 0.5444, "mean_token_accuracy": 0.8340238332748413, "num_tokens": 656199760.0, "step": 2059 }, { "epoch": 2.095625635808749, "grad_norm": 0.7614337205886841, "learning_rate": 1e-06, "loss": 0.5746, "mean_token_accuracy": 0.8253083825111389, "num_tokens": 656514419.0, "step": 2060 }, { "epoch": 2.096642929806714, "grad_norm": 0.7688518762588501, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8416045308113098, "num_tokens": 656815650.0, "step": 2061 }, { "epoch": 2.0976602238046795, "grad_norm": 0.742057204246521, "learning_rate": 1e-06, "loss": 0.5512, "mean_token_accuracy": 0.8324074745178223, "num_tokens": 657128763.0, "step": 2062 }, { "epoch": 2.098677517802645, "grad_norm": 0.7600764036178589, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8270496129989624, "num_tokens": 657442965.0, "step": 2063 }, { "epoch": 2.0996948118006102, "grad_norm": 0.7068182229995728, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8338534235954285, "num_tokens": 657787301.0, "step": 2064 }, { "epoch": 2.100712105798576, "grad_norm": 0.7744857668876648, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.831404447555542, "num_tokens": 658086444.0, "step": 2065 }, { "epoch": 2.1017293997965414, "grad_norm": 0.7384071350097656, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8269665837287903, "num_tokens": 658417852.0, "step": 2066 }, { "epoch": 2.1027466937945065, "grad_norm": 0.7803059816360474, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.829984188079834, "num_tokens": 658727698.0, "step": 2067 }, { "epoch": 2.103763987792472, "grad_norm": 0.7474600672721863, "learning_rate": 1e-06, "loss": 0.5506, "mean_token_accuracy": 0.8319146037101746, "num_tokens": 659036589.0, "step": 2068 }, { "epoch": 2.104781281790437, "grad_norm": 0.765299916267395, "learning_rate": 1e-06, "loss": 0.5721, "mean_token_accuracy": 0.8267961740493774, "num_tokens": 659356387.0, "step": 2069 }, { "epoch": 2.105798575788403, "grad_norm": 0.7543993592262268, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8287253379821777, "num_tokens": 659665573.0, "step": 2070 }, { "epoch": 2.1068158697863684, "grad_norm": 0.7225178480148315, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8303307294845581, "num_tokens": 659986298.0, "step": 2071 }, { "epoch": 2.1078331637843335, "grad_norm": 0.7693639993667603, "learning_rate": 1e-06, "loss": 0.5399, "mean_token_accuracy": 0.8354504108428955, "num_tokens": 660295004.0, "step": 2072 }, { "epoch": 2.108850457782299, "grad_norm": 0.7996508479118347, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8350417613983154, "num_tokens": 660607411.0, "step": 2073 }, { "epoch": 2.1098677517802646, "grad_norm": 0.7588698267936707, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8349910378456116, "num_tokens": 660921469.0, "step": 2074 }, { "epoch": 2.1108850457782298, "grad_norm": 0.7550652623176575, "learning_rate": 1e-06, "loss": 0.5959, "mean_token_accuracy": 0.8192981481552124, "num_tokens": 661237468.0, "step": 2075 }, { "epoch": 2.1119023397761953, "grad_norm": 0.7587368488311768, "learning_rate": 1e-06, "loss": 0.5487, "mean_token_accuracy": 0.8338797688484192, "num_tokens": 661553561.0, "step": 2076 }, { "epoch": 2.112919633774161, "grad_norm": 0.7872341275215149, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8278385996818542, "num_tokens": 661865294.0, "step": 2077 }, { "epoch": 2.113936927772126, "grad_norm": 0.7376534342765808, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.8293165564537048, "num_tokens": 662192533.0, "step": 2078 }, { "epoch": 2.1149542217700916, "grad_norm": 0.7513229846954346, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8273903131484985, "num_tokens": 662502572.0, "step": 2079 }, { "epoch": 2.1159715157680568, "grad_norm": 0.7460830211639404, "learning_rate": 1e-06, "loss": 0.5772, "mean_token_accuracy": 0.8257139325141907, "num_tokens": 662821554.0, "step": 2080 }, { "epoch": 2.1169888097660223, "grad_norm": 0.766185998916626, "learning_rate": 1e-06, "loss": 0.5617, "mean_token_accuracy": 0.828972578048706, "num_tokens": 663123259.0, "step": 2081 }, { "epoch": 2.118006103763988, "grad_norm": 0.7785876989364624, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8344416618347168, "num_tokens": 663445600.0, "step": 2082 }, { "epoch": 2.119023397761953, "grad_norm": 0.7833909392356873, "learning_rate": 1e-06, "loss": 0.5665, "mean_token_accuracy": 0.8279260396957397, "num_tokens": 663747652.0, "step": 2083 }, { "epoch": 2.1200406917599186, "grad_norm": 0.7774428725242615, "learning_rate": 1e-06, "loss": 0.6028, "mean_token_accuracy": 0.8186205625534058, "num_tokens": 664069904.0, "step": 2084 }, { "epoch": 2.121057985757884, "grad_norm": 0.7461346387863159, "learning_rate": 1e-06, "loss": 0.5647, "mean_token_accuracy": 0.8285263776779175, "num_tokens": 664386511.0, "step": 2085 }, { "epoch": 2.1220752797558493, "grad_norm": 0.748580276966095, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8327457904815674, "num_tokens": 664712795.0, "step": 2086 }, { "epoch": 2.123092573753815, "grad_norm": 0.8052957057952881, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8318384885787964, "num_tokens": 665036489.0, "step": 2087 }, { "epoch": 2.1241098677517805, "grad_norm": 0.7805037498474121, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.835271954536438, "num_tokens": 665345764.0, "step": 2088 }, { "epoch": 2.1251271617497456, "grad_norm": 0.7457467913627625, "learning_rate": 1e-06, "loss": 0.5573, "mean_token_accuracy": 0.8313297033309937, "num_tokens": 665650495.0, "step": 2089 }, { "epoch": 2.126144455747711, "grad_norm": 0.7864099740982056, "learning_rate": 1e-06, "loss": 0.5698, "mean_token_accuracy": 0.8272011280059814, "num_tokens": 665976759.0, "step": 2090 }, { "epoch": 2.1271617497456763, "grad_norm": 0.7976198196411133, "learning_rate": 1e-06, "loss": 0.5643, "mean_token_accuracy": 0.8287394046783447, "num_tokens": 666274229.0, "step": 2091 }, { "epoch": 2.128179043743642, "grad_norm": 0.718885064125061, "learning_rate": 1e-06, "loss": 0.5729, "mean_token_accuracy": 0.8265447616577148, "num_tokens": 666617768.0, "step": 2092 }, { "epoch": 2.1291963377416074, "grad_norm": 0.758816123008728, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8275684118270874, "num_tokens": 666935081.0, "step": 2093 }, { "epoch": 2.1302136317395726, "grad_norm": 0.7571752667427063, "learning_rate": 1e-06, "loss": 0.5585, "mean_token_accuracy": 0.831500232219696, "num_tokens": 667248835.0, "step": 2094 }, { "epoch": 2.131230925737538, "grad_norm": 0.7851269245147705, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.8253339529037476, "num_tokens": 667556405.0, "step": 2095 }, { "epoch": 2.1322482197355037, "grad_norm": 0.7476552724838257, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8259379267692566, "num_tokens": 667894932.0, "step": 2096 }, { "epoch": 2.133265513733469, "grad_norm": 0.7466548085212708, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8320664167404175, "num_tokens": 668202295.0, "step": 2097 }, { "epoch": 2.1342828077314344, "grad_norm": 0.7157275676727295, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8280497789382935, "num_tokens": 668537919.0, "step": 2098 }, { "epoch": 2.1353001017294, "grad_norm": 0.7510685920715332, "learning_rate": 1e-06, "loss": 0.5524, "mean_token_accuracy": 0.8300042748451233, "num_tokens": 668854088.0, "step": 2099 }, { "epoch": 2.136317395727365, "grad_norm": 0.7428464889526367, "learning_rate": 1e-06, "loss": 0.5401, "mean_token_accuracy": 0.8350558280944824, "num_tokens": 669192028.0, "step": 2100 }, { "epoch": 2.1373346897253307, "grad_norm": 0.794187605381012, "learning_rate": 1e-06, "loss": 0.548, "mean_token_accuracy": 0.833488941192627, "num_tokens": 669516310.0, "step": 2101 }, { "epoch": 2.138351983723296, "grad_norm": 0.6934726238250732, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8282812833786011, "num_tokens": 669843789.0, "step": 2102 }, { "epoch": 2.1393692777212614, "grad_norm": 0.7874788045883179, "learning_rate": 1e-06, "loss": 0.5568, "mean_token_accuracy": 0.8306584358215332, "num_tokens": 670161044.0, "step": 2103 }, { "epoch": 2.140386571719227, "grad_norm": 0.7843090295791626, "learning_rate": 1e-06, "loss": 0.5603, "mean_token_accuracy": 0.8288133144378662, "num_tokens": 670470095.0, "step": 2104 }, { "epoch": 2.141403865717192, "grad_norm": 0.7777359485626221, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8294340968132019, "num_tokens": 670784785.0, "step": 2105 }, { "epoch": 2.1424211597151577, "grad_norm": 0.7022597193717957, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.833615779876709, "num_tokens": 671102909.0, "step": 2106 }, { "epoch": 2.1434384537131232, "grad_norm": 0.7523261904716492, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8297773599624634, "num_tokens": 671421218.0, "step": 2107 }, { "epoch": 2.1444557477110884, "grad_norm": 0.7527234554290771, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.8291486501693726, "num_tokens": 671755061.0, "step": 2108 }, { "epoch": 2.145473041709054, "grad_norm": 0.7956981658935547, "learning_rate": 1e-06, "loss": 0.5286, "mean_token_accuracy": 0.8387909531593323, "num_tokens": 672085682.0, "step": 2109 }, { "epoch": 2.1464903357070195, "grad_norm": 0.7895192503929138, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8217060565948486, "num_tokens": 672406808.0, "step": 2110 }, { "epoch": 2.1475076297049847, "grad_norm": 0.7586142420768738, "learning_rate": 1e-06, "loss": 0.586, "mean_token_accuracy": 0.8228213787078857, "num_tokens": 672742756.0, "step": 2111 }, { "epoch": 2.1485249237029502, "grad_norm": 0.6966933608055115, "learning_rate": 1e-06, "loss": 0.5344, "mean_token_accuracy": 0.8360923528671265, "num_tokens": 673076346.0, "step": 2112 }, { "epoch": 2.1495422177009154, "grad_norm": 0.7788679003715515, "learning_rate": 1e-06, "loss": 0.577, "mean_token_accuracy": 0.8248082399368286, "num_tokens": 673390482.0, "step": 2113 }, { "epoch": 2.150559511698881, "grad_norm": 0.7420569658279419, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.8269627094268799, "num_tokens": 673724830.0, "step": 2114 }, { "epoch": 2.1515768056968465, "grad_norm": 0.9678299427032471, "learning_rate": 1e-06, "loss": 0.5727, "mean_token_accuracy": 0.8264451026916504, "num_tokens": 674041915.0, "step": 2115 }, { "epoch": 2.1525940996948116, "grad_norm": 0.7400864958763123, "learning_rate": 1e-06, "loss": 0.5764, "mean_token_accuracy": 0.8259016275405884, "num_tokens": 674376067.0, "step": 2116 }, { "epoch": 2.153611393692777, "grad_norm": 0.7815296649932861, "learning_rate": 1e-06, "loss": 0.5552, "mean_token_accuracy": 0.8299500346183777, "num_tokens": 674683835.0, "step": 2117 }, { "epoch": 2.154628687690743, "grad_norm": 0.7235947251319885, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.8269928693771362, "num_tokens": 675021716.0, "step": 2118 }, { "epoch": 2.155645981688708, "grad_norm": 0.7602812647819519, "learning_rate": 1e-06, "loss": 0.559, "mean_token_accuracy": 0.8294414281845093, "num_tokens": 675339134.0, "step": 2119 }, { "epoch": 2.1566632756866735, "grad_norm": 0.7821603417396545, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8252530097961426, "num_tokens": 675650088.0, "step": 2120 }, { "epoch": 2.157680569684639, "grad_norm": 0.7559882402420044, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.8339157700538635, "num_tokens": 675976634.0, "step": 2121 }, { "epoch": 2.158697863682604, "grad_norm": 0.7382848858833313, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.823092520236969, "num_tokens": 676301382.0, "step": 2122 }, { "epoch": 2.1597151576805698, "grad_norm": 0.7315267324447632, "learning_rate": 1e-06, "loss": 0.5612, "mean_token_accuracy": 0.8290728330612183, "num_tokens": 676630331.0, "step": 2123 }, { "epoch": 2.160732451678535, "grad_norm": 0.7366230487823486, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8245151042938232, "num_tokens": 676971441.0, "step": 2124 }, { "epoch": 2.1617497456765005, "grad_norm": 0.7743238806724548, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.8322397470474243, "num_tokens": 677292539.0, "step": 2125 }, { "epoch": 2.162767039674466, "grad_norm": 0.8578828573226929, "learning_rate": 1e-06, "loss": 0.5271, "mean_token_accuracy": 0.8391723036766052, "num_tokens": 677609509.0, "step": 2126 }, { "epoch": 2.163784333672431, "grad_norm": 0.7956092953681946, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8298279643058777, "num_tokens": 677937681.0, "step": 2127 }, { "epoch": 2.1648016276703967, "grad_norm": 0.7627188563346863, "learning_rate": 1e-06, "loss": 0.5287, "mean_token_accuracy": 0.8378783464431763, "num_tokens": 678236943.0, "step": 2128 }, { "epoch": 2.1658189216683623, "grad_norm": 0.7750257253646851, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8241315484046936, "num_tokens": 678533114.0, "step": 2129 }, { "epoch": 2.1668362156663274, "grad_norm": 0.8628514409065247, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.8356536626815796, "num_tokens": 678854315.0, "step": 2130 }, { "epoch": 2.167853509664293, "grad_norm": 0.7474110126495361, "learning_rate": 1e-06, "loss": 0.5445, "mean_token_accuracy": 0.8338069915771484, "num_tokens": 679170763.0, "step": 2131 }, { "epoch": 2.1688708036622586, "grad_norm": 0.7617558836936951, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8265351057052612, "num_tokens": 679480751.0, "step": 2132 }, { "epoch": 2.1698880976602237, "grad_norm": 0.7319729924201965, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8302922248840332, "num_tokens": 679799542.0, "step": 2133 }, { "epoch": 2.1709053916581893, "grad_norm": 0.7312279939651489, "learning_rate": 1e-06, "loss": 0.5447, "mean_token_accuracy": 0.8329917788505554, "num_tokens": 680126987.0, "step": 2134 }, { "epoch": 2.1719226856561544, "grad_norm": 0.7618582844734192, "learning_rate": 1e-06, "loss": 0.5603, "mean_token_accuracy": 0.8301103711128235, "num_tokens": 680445654.0, "step": 2135 }, { "epoch": 2.17293997965412, "grad_norm": 0.7430460453033447, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8269782662391663, "num_tokens": 680766149.0, "step": 2136 }, { "epoch": 2.1739572736520856, "grad_norm": 0.7346223592758179, "learning_rate": 1e-06, "loss": 0.5406, "mean_token_accuracy": 0.8346844911575317, "num_tokens": 681081193.0, "step": 2137 }, { "epoch": 2.1749745676500507, "grad_norm": 0.987307071685791, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8271561861038208, "num_tokens": 681389768.0, "step": 2138 }, { "epoch": 2.1759918616480163, "grad_norm": 0.7216676473617554, "learning_rate": 1e-06, "loss": 0.548, "mean_token_accuracy": 0.8329777717590332, "num_tokens": 681716150.0, "step": 2139 }, { "epoch": 2.177009155645982, "grad_norm": 0.7814753651618958, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.8237301111221313, "num_tokens": 682034982.0, "step": 2140 }, { "epoch": 2.178026449643947, "grad_norm": 0.7712879776954651, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.8320897817611694, "num_tokens": 682360761.0, "step": 2141 }, { "epoch": 2.1790437436419126, "grad_norm": 0.7365331053733826, "learning_rate": 1e-06, "loss": 0.5333, "mean_token_accuracy": 0.8374788761138916, "num_tokens": 682683740.0, "step": 2142 }, { "epoch": 2.180061037639878, "grad_norm": 0.7548322677612305, "learning_rate": 1e-06, "loss": 0.5458, "mean_token_accuracy": 0.8324960470199585, "num_tokens": 682998758.0, "step": 2143 }, { "epoch": 2.1810783316378433, "grad_norm": 0.7492295503616333, "learning_rate": 1e-06, "loss": 0.5524, "mean_token_accuracy": 0.8332866430282593, "num_tokens": 683315540.0, "step": 2144 }, { "epoch": 2.182095625635809, "grad_norm": 0.7494568824768066, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.8304259777069092, "num_tokens": 683639710.0, "step": 2145 }, { "epoch": 2.183112919633774, "grad_norm": 0.7964203953742981, "learning_rate": 1e-06, "loss": 0.5846, "mean_token_accuracy": 0.8240746259689331, "num_tokens": 683962253.0, "step": 2146 }, { "epoch": 2.1841302136317395, "grad_norm": 0.7324085831642151, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.832410991191864, "num_tokens": 684276805.0, "step": 2147 }, { "epoch": 2.185147507629705, "grad_norm": 0.7728395462036133, "learning_rate": 1e-06, "loss": 0.536, "mean_token_accuracy": 0.8357415795326233, "num_tokens": 684600612.0, "step": 2148 }, { "epoch": 2.1861648016276702, "grad_norm": 0.8084637522697449, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.8287630081176758, "num_tokens": 684915465.0, "step": 2149 }, { "epoch": 2.187182095625636, "grad_norm": 0.7585083246231079, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.8277579545974731, "num_tokens": 685243003.0, "step": 2150 }, { "epoch": 2.1881993896236014, "grad_norm": 0.7352972030639648, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.830999493598938, "num_tokens": 685586437.0, "step": 2151 }, { "epoch": 2.1892166836215665, "grad_norm": 0.7726067900657654, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8336653709411621, "num_tokens": 685883071.0, "step": 2152 }, { "epoch": 2.190233977619532, "grad_norm": 0.7544667720794678, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.833609402179718, "num_tokens": 686203095.0, "step": 2153 }, { "epoch": 2.1912512716174977, "grad_norm": 0.7761970162391663, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8342102766036987, "num_tokens": 686515166.0, "step": 2154 }, { "epoch": 2.192268565615463, "grad_norm": 0.7551308870315552, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8281980752944946, "num_tokens": 686847911.0, "step": 2155 }, { "epoch": 2.1932858596134284, "grad_norm": 0.7399230003356934, "learning_rate": 1e-06, "loss": 0.559, "mean_token_accuracy": 0.8297072649002075, "num_tokens": 687161028.0, "step": 2156 }, { "epoch": 2.1943031536113935, "grad_norm": 0.7779721617698669, "learning_rate": 1e-06, "loss": 0.5564, "mean_token_accuracy": 0.8314580917358398, "num_tokens": 687462776.0, "step": 2157 }, { "epoch": 2.195320447609359, "grad_norm": 0.7697399854660034, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8339000344276428, "num_tokens": 687785512.0, "step": 2158 }, { "epoch": 2.1963377416073246, "grad_norm": 0.7364392876625061, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8337985873222351, "num_tokens": 688112149.0, "step": 2159 }, { "epoch": 2.1973550356052898, "grad_norm": 0.8275453448295593, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8366191387176514, "num_tokens": 688432958.0, "step": 2160 }, { "epoch": 2.1983723296032553, "grad_norm": 0.8337928652763367, "learning_rate": 1e-06, "loss": 0.5687, "mean_token_accuracy": 0.8279739618301392, "num_tokens": 688745272.0, "step": 2161 }, { "epoch": 2.199389623601221, "grad_norm": 0.7610588073730469, "learning_rate": 1e-06, "loss": 0.5637, "mean_token_accuracy": 0.8283995985984802, "num_tokens": 689058958.0, "step": 2162 }, { "epoch": 2.200406917599186, "grad_norm": 0.7536100149154663, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.8297035694122314, "num_tokens": 689387117.0, "step": 2163 }, { "epoch": 2.2014242115971516, "grad_norm": 0.819729745388031, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.8299574255943298, "num_tokens": 689703523.0, "step": 2164 }, { "epoch": 2.202441505595117, "grad_norm": 0.8181864619255066, "learning_rate": 1e-06, "loss": 0.5764, "mean_token_accuracy": 0.8252584338188171, "num_tokens": 690011320.0, "step": 2165 }, { "epoch": 2.2034587995930823, "grad_norm": 0.7794902920722961, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8352136611938477, "num_tokens": 690312474.0, "step": 2166 }, { "epoch": 2.204476093591048, "grad_norm": 0.7757052183151245, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8337260484695435, "num_tokens": 690629913.0, "step": 2167 }, { "epoch": 2.205493387589013, "grad_norm": 0.7456837892532349, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.8383486270904541, "num_tokens": 690946910.0, "step": 2168 }, { "epoch": 2.2065106815869786, "grad_norm": 0.765623152256012, "learning_rate": 1e-06, "loss": 0.5747, "mean_token_accuracy": 0.8256553411483765, "num_tokens": 691287105.0, "step": 2169 }, { "epoch": 2.207527975584944, "grad_norm": 0.8190492987632751, "learning_rate": 1e-06, "loss": 0.5537, "mean_token_accuracy": 0.830524742603302, "num_tokens": 691590813.0, "step": 2170 }, { "epoch": 2.2085452695829093, "grad_norm": 0.7249578833580017, "learning_rate": 1e-06, "loss": 0.5574, "mean_token_accuracy": 0.8298805952072144, "num_tokens": 691912782.0, "step": 2171 }, { "epoch": 2.209562563580875, "grad_norm": 0.7654576301574707, "learning_rate": 1e-06, "loss": 0.5612, "mean_token_accuracy": 0.8291027545928955, "num_tokens": 692242294.0, "step": 2172 }, { "epoch": 2.2105798575788405, "grad_norm": 0.7766432762145996, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.8241997957229614, "num_tokens": 692562541.0, "step": 2173 }, { "epoch": 2.2115971515768056, "grad_norm": 0.7897323369979858, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.8327312469482422, "num_tokens": 692863692.0, "step": 2174 }, { "epoch": 2.212614445574771, "grad_norm": 0.776065468788147, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8322169780731201, "num_tokens": 693172309.0, "step": 2175 }, { "epoch": 2.2136317395727367, "grad_norm": 0.8281997442245483, "learning_rate": 1e-06, "loss": 0.5655, "mean_token_accuracy": 0.8276952505111694, "num_tokens": 693494304.0, "step": 2176 }, { "epoch": 2.214649033570702, "grad_norm": 0.7335885763168335, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8332297205924988, "num_tokens": 693827204.0, "step": 2177 }, { "epoch": 2.2156663275686674, "grad_norm": 0.7174627780914307, "learning_rate": 1e-06, "loss": 0.5768, "mean_token_accuracy": 0.826240599155426, "num_tokens": 694172486.0, "step": 2178 }, { "epoch": 2.2166836215666326, "grad_norm": 0.7426119446754456, "learning_rate": 1e-06, "loss": 0.5932, "mean_token_accuracy": 0.8206398487091064, "num_tokens": 694504560.0, "step": 2179 }, { "epoch": 2.217700915564598, "grad_norm": 0.7555232048034668, "learning_rate": 1e-06, "loss": 0.5824, "mean_token_accuracy": 0.8234156966209412, "num_tokens": 694817430.0, "step": 2180 }, { "epoch": 2.2187182095625637, "grad_norm": 0.7455035448074341, "learning_rate": 1e-06, "loss": 0.5468, "mean_token_accuracy": 0.8338122963905334, "num_tokens": 695139610.0, "step": 2181 }, { "epoch": 2.219735503560529, "grad_norm": 0.7357603311538696, "learning_rate": 1e-06, "loss": 0.5474, "mean_token_accuracy": 0.8327358961105347, "num_tokens": 695455043.0, "step": 2182 }, { "epoch": 2.2207527975584944, "grad_norm": 0.7902182936668396, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8277439475059509, "num_tokens": 695788464.0, "step": 2183 }, { "epoch": 2.22177009155646, "grad_norm": 0.7493472099304199, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8297045230865479, "num_tokens": 696110054.0, "step": 2184 }, { "epoch": 2.222787385554425, "grad_norm": 0.7690534591674805, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.83198481798172, "num_tokens": 696419384.0, "step": 2185 }, { "epoch": 2.2238046795523907, "grad_norm": 0.7142757177352905, "learning_rate": 1e-06, "loss": 0.5398, "mean_token_accuracy": 0.834639847278595, "num_tokens": 696754841.0, "step": 2186 }, { "epoch": 2.2248219735503563, "grad_norm": 0.7481052279472351, "learning_rate": 1e-06, "loss": 0.571, "mean_token_accuracy": 0.8269023895263672, "num_tokens": 697066242.0, "step": 2187 }, { "epoch": 2.2258392675483214, "grad_norm": 0.754620373249054, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8295397758483887, "num_tokens": 697384006.0, "step": 2188 }, { "epoch": 2.226856561546287, "grad_norm": 0.8104036450386047, "learning_rate": 1e-06, "loss": 0.588, "mean_token_accuracy": 0.8218930959701538, "num_tokens": 697695648.0, "step": 2189 }, { "epoch": 2.227873855544252, "grad_norm": 0.7504926919937134, "learning_rate": 1e-06, "loss": 0.5531, "mean_token_accuracy": 0.8311076164245605, "num_tokens": 698019018.0, "step": 2190 }, { "epoch": 2.2288911495422177, "grad_norm": 0.7394097447395325, "learning_rate": 1e-06, "loss": 0.5661, "mean_token_accuracy": 0.8285304307937622, "num_tokens": 698339787.0, "step": 2191 }, { "epoch": 2.2299084435401832, "grad_norm": 0.7734787464141846, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8304460048675537, "num_tokens": 698644406.0, "step": 2192 }, { "epoch": 2.2309257375381484, "grad_norm": 0.7935941815376282, "learning_rate": 1e-06, "loss": 0.5445, "mean_token_accuracy": 0.8333359956741333, "num_tokens": 698949448.0, "step": 2193 }, { "epoch": 2.231943031536114, "grad_norm": 0.7313924431800842, "learning_rate": 1e-06, "loss": 0.5664, "mean_token_accuracy": 0.8283383250236511, "num_tokens": 699276343.0, "step": 2194 }, { "epoch": 2.2329603255340795, "grad_norm": 0.7338786721229553, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8266017436981201, "num_tokens": 699600680.0, "step": 2195 }, { "epoch": 2.2339776195320447, "grad_norm": 0.7657689452171326, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.8337849378585815, "num_tokens": 699920412.0, "step": 2196 }, { "epoch": 2.2349949135300102, "grad_norm": 0.7452083826065063, "learning_rate": 1e-06, "loss": 0.5584, "mean_token_accuracy": 0.8301029205322266, "num_tokens": 700246268.0, "step": 2197 }, { "epoch": 2.236012207527976, "grad_norm": 0.7452763319015503, "learning_rate": 1e-06, "loss": 0.5515, "mean_token_accuracy": 0.8316724896430969, "num_tokens": 700561439.0, "step": 2198 }, { "epoch": 2.237029501525941, "grad_norm": 0.7289541959762573, "learning_rate": 1e-06, "loss": 0.5429, "mean_token_accuracy": 0.8343749046325684, "num_tokens": 700904533.0, "step": 2199 }, { "epoch": 2.2380467955239065, "grad_norm": 0.7585221529006958, "learning_rate": 1e-06, "loss": 0.5777, "mean_token_accuracy": 0.8261143565177917, "num_tokens": 701220420.0, "step": 2200 }, { "epoch": 2.2390640895218716, "grad_norm": 0.7407345175743103, "learning_rate": 1e-06, "loss": 0.5516, "mean_token_accuracy": 0.8327363729476929, "num_tokens": 701544168.0, "step": 2201 }, { "epoch": 2.240081383519837, "grad_norm": 0.8011090159416199, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.8249886631965637, "num_tokens": 701866354.0, "step": 2202 }, { "epoch": 2.241098677517803, "grad_norm": 0.8079937696456909, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.8320120573043823, "num_tokens": 702180586.0, "step": 2203 }, { "epoch": 2.242115971515768, "grad_norm": 0.7229354977607727, "learning_rate": 1e-06, "loss": 0.5588, "mean_token_accuracy": 0.8307468295097351, "num_tokens": 702507782.0, "step": 2204 }, { "epoch": 2.2431332655137335, "grad_norm": 0.6952229142189026, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8240323066711426, "num_tokens": 702834861.0, "step": 2205 }, { "epoch": 2.244150559511699, "grad_norm": 0.7500696182250977, "learning_rate": 1e-06, "loss": 0.5647, "mean_token_accuracy": 0.8281593322753906, "num_tokens": 703173572.0, "step": 2206 }, { "epoch": 2.245167853509664, "grad_norm": 0.7257187962532043, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.8321106433868408, "num_tokens": 703498507.0, "step": 2207 }, { "epoch": 2.2461851475076298, "grad_norm": 0.7724878191947937, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8294163346290588, "num_tokens": 703823540.0, "step": 2208 }, { "epoch": 2.2472024415055953, "grad_norm": 0.816226065158844, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.8298070430755615, "num_tokens": 704156689.0, "step": 2209 }, { "epoch": 2.2482197355035605, "grad_norm": 0.7845411896705627, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8310840725898743, "num_tokens": 704467781.0, "step": 2210 }, { "epoch": 2.249237029501526, "grad_norm": 0.7413820624351501, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.8295588493347168, "num_tokens": 704805338.0, "step": 2211 }, { "epoch": 2.250254323499491, "grad_norm": 0.7710822224617004, "learning_rate": 1e-06, "loss": 0.5594, "mean_token_accuracy": 0.8290252089500427, "num_tokens": 705135627.0, "step": 2212 }, { "epoch": 2.2512716174974567, "grad_norm": 0.7354344129562378, "learning_rate": 1e-06, "loss": 0.5683, "mean_token_accuracy": 0.8267947435379028, "num_tokens": 705443703.0, "step": 2213 }, { "epoch": 2.2522889114954223, "grad_norm": 0.7690240740776062, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8332376480102539, "num_tokens": 705767052.0, "step": 2214 }, { "epoch": 2.2533062054933874, "grad_norm": 0.7489812970161438, "learning_rate": 1e-06, "loss": 0.5578, "mean_token_accuracy": 0.8299515247344971, "num_tokens": 706101095.0, "step": 2215 }, { "epoch": 2.254323499491353, "grad_norm": 0.745998740196228, "learning_rate": 1e-06, "loss": 0.5548, "mean_token_accuracy": 0.8307999968528748, "num_tokens": 706423567.0, "step": 2216 }, { "epoch": 2.2553407934893186, "grad_norm": 0.7590211629867554, "learning_rate": 1e-06, "loss": 0.555, "mean_token_accuracy": 0.8313041925430298, "num_tokens": 706747002.0, "step": 2217 }, { "epoch": 2.2563580874872837, "grad_norm": 0.7307959794998169, "learning_rate": 1e-06, "loss": 0.5516, "mean_token_accuracy": 0.8323990106582642, "num_tokens": 707083769.0, "step": 2218 }, { "epoch": 2.2573753814852493, "grad_norm": 0.7042139172554016, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.8334348797798157, "num_tokens": 707407475.0, "step": 2219 }, { "epoch": 2.258392675483215, "grad_norm": 0.7372254729270935, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8272490501403809, "num_tokens": 707731659.0, "step": 2220 }, { "epoch": 2.25940996948118, "grad_norm": 0.7876691222190857, "learning_rate": 1e-06, "loss": 0.5387, "mean_token_accuracy": 0.835241436958313, "num_tokens": 708049516.0, "step": 2221 }, { "epoch": 2.2604272634791456, "grad_norm": 0.7591989040374756, "learning_rate": 1e-06, "loss": 0.5491, "mean_token_accuracy": 0.8333436846733093, "num_tokens": 708365608.0, "step": 2222 }, { "epoch": 2.2614445574771107, "grad_norm": 0.7744635939598083, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8359612822532654, "num_tokens": 708668919.0, "step": 2223 }, { "epoch": 2.2624618514750763, "grad_norm": 0.788546621799469, "learning_rate": 1e-06, "loss": 0.5134, "mean_token_accuracy": 0.8418200016021729, "num_tokens": 708978992.0, "step": 2224 }, { "epoch": 2.263479145473042, "grad_norm": 0.7489559650421143, "learning_rate": 1e-06, "loss": 0.5389, "mean_token_accuracy": 0.835007905960083, "num_tokens": 709289990.0, "step": 2225 }, { "epoch": 2.264496439471007, "grad_norm": 0.7431305050849915, "learning_rate": 1e-06, "loss": 0.5828, "mean_token_accuracy": 0.8231770992279053, "num_tokens": 709620493.0, "step": 2226 }, { "epoch": 2.2655137334689726, "grad_norm": 0.9410425424575806, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8315439224243164, "num_tokens": 709945354.0, "step": 2227 }, { "epoch": 2.266531027466938, "grad_norm": 0.7582634091377258, "learning_rate": 1e-06, "loss": 0.5329, "mean_token_accuracy": 0.8364298343658447, "num_tokens": 710261263.0, "step": 2228 }, { "epoch": 2.2675483214649033, "grad_norm": 0.7781898975372314, "learning_rate": 1e-06, "loss": 0.5248, "mean_token_accuracy": 0.8395653367042542, "num_tokens": 710579962.0, "step": 2229 }, { "epoch": 2.268565615462869, "grad_norm": 0.7267823815345764, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.8301853537559509, "num_tokens": 710918771.0, "step": 2230 }, { "epoch": 2.2695829094608344, "grad_norm": 0.7535921931266785, "learning_rate": 1e-06, "loss": 0.5492, "mean_token_accuracy": 0.8331706523895264, "num_tokens": 711248181.0, "step": 2231 }, { "epoch": 2.2706002034587995, "grad_norm": 0.7851473093032837, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8320990800857544, "num_tokens": 711562515.0, "step": 2232 }, { "epoch": 2.271617497456765, "grad_norm": 0.7720229625701904, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.8314772248268127, "num_tokens": 711864605.0, "step": 2233 }, { "epoch": 2.2726347914547302, "grad_norm": 0.757318913936615, "learning_rate": 1e-06, "loss": 0.5887, "mean_token_accuracy": 0.8224295377731323, "num_tokens": 712202336.0, "step": 2234 }, { "epoch": 2.273652085452696, "grad_norm": 0.8308467864990234, "learning_rate": 1e-06, "loss": 0.5768, "mean_token_accuracy": 0.8255714178085327, "num_tokens": 712510845.0, "step": 2235 }, { "epoch": 2.2746693794506614, "grad_norm": 0.7796366810798645, "learning_rate": 1e-06, "loss": 0.5361, "mean_token_accuracy": 0.8357149362564087, "num_tokens": 712815226.0, "step": 2236 }, { "epoch": 2.2756866734486265, "grad_norm": 0.796631395816803, "learning_rate": 1e-06, "loss": 0.553, "mean_token_accuracy": 0.8310263156890869, "num_tokens": 713136477.0, "step": 2237 }, { "epoch": 2.276703967446592, "grad_norm": 0.816248893737793, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8342663645744324, "num_tokens": 713456971.0, "step": 2238 }, { "epoch": 2.2777212614445577, "grad_norm": 0.7735084891319275, "learning_rate": 1e-06, "loss": 0.5706, "mean_token_accuracy": 0.8262051939964294, "num_tokens": 713779256.0, "step": 2239 }, { "epoch": 2.278738555442523, "grad_norm": 0.7764680981636047, "learning_rate": 1e-06, "loss": 0.5647, "mean_token_accuracy": 0.8285849094390869, "num_tokens": 714098535.0, "step": 2240 }, { "epoch": 2.2797558494404884, "grad_norm": 0.8629921078681946, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8270991444587708, "num_tokens": 714415903.0, "step": 2241 }, { "epoch": 2.280773143438454, "grad_norm": 0.7582558393478394, "learning_rate": 1e-06, "loss": 0.5654, "mean_token_accuracy": 0.8276588916778564, "num_tokens": 714721963.0, "step": 2242 }, { "epoch": 2.281790437436419, "grad_norm": 0.7453955411911011, "learning_rate": 1e-06, "loss": 0.5606, "mean_token_accuracy": 0.829574704170227, "num_tokens": 715036265.0, "step": 2243 }, { "epoch": 2.2828077314343846, "grad_norm": 0.7843350768089294, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8280022144317627, "num_tokens": 715351692.0, "step": 2244 }, { "epoch": 2.2838250254323498, "grad_norm": 0.8674906492233276, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.8287839889526367, "num_tokens": 715667028.0, "step": 2245 }, { "epoch": 2.2848423194303153, "grad_norm": 0.8587877154350281, "learning_rate": 1e-06, "loss": 0.5519, "mean_token_accuracy": 0.8311952948570251, "num_tokens": 715978519.0, "step": 2246 }, { "epoch": 2.285859613428281, "grad_norm": 0.7901305556297302, "learning_rate": 1e-06, "loss": 0.575, "mean_token_accuracy": 0.8253798484802246, "num_tokens": 716294789.0, "step": 2247 }, { "epoch": 2.286876907426246, "grad_norm": 0.7929912209510803, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.834429144859314, "num_tokens": 716601971.0, "step": 2248 }, { "epoch": 2.2878942014242116, "grad_norm": 0.7639389038085938, "learning_rate": 1e-06, "loss": 0.5503, "mean_token_accuracy": 0.8323620557785034, "num_tokens": 716916644.0, "step": 2249 }, { "epoch": 2.288911495422177, "grad_norm": 0.8073030114173889, "learning_rate": 1e-06, "loss": 0.5402, "mean_token_accuracy": 0.8344812989234924, "num_tokens": 717218108.0, "step": 2250 }, { "epoch": 2.2899287894201423, "grad_norm": 0.7753877639770508, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.8317732810974121, "num_tokens": 717517090.0, "step": 2251 }, { "epoch": 2.290946083418108, "grad_norm": 0.7521124482154846, "learning_rate": 1e-06, "loss": 0.5636, "mean_token_accuracy": 0.8278107643127441, "num_tokens": 717846642.0, "step": 2252 }, { "epoch": 2.2919633774160735, "grad_norm": 0.8184477090835571, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.8333908319473267, "num_tokens": 718173696.0, "step": 2253 }, { "epoch": 2.2929806714140386, "grad_norm": 0.7861326336860657, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.8234462141990662, "num_tokens": 718494164.0, "step": 2254 }, { "epoch": 2.293997965412004, "grad_norm": 0.7486854195594788, "learning_rate": 1e-06, "loss": 0.5568, "mean_token_accuracy": 0.8295444846153259, "num_tokens": 718815468.0, "step": 2255 }, { "epoch": 2.2950152594099693, "grad_norm": 0.7647976875305176, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.8253200650215149, "num_tokens": 719136494.0, "step": 2256 }, { "epoch": 2.296032553407935, "grad_norm": 0.7757217288017273, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8283142447471619, "num_tokens": 719474769.0, "step": 2257 }, { "epoch": 2.2970498474059005, "grad_norm": 0.7896122932434082, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.8248286843299866, "num_tokens": 719775158.0, "step": 2258 }, { "epoch": 2.2980671414038656, "grad_norm": 0.7767871022224426, "learning_rate": 1e-06, "loss": 0.5604, "mean_token_accuracy": 0.8307970762252808, "num_tokens": 720075628.0, "step": 2259 }, { "epoch": 2.299084435401831, "grad_norm": 0.7573894262313843, "learning_rate": 1e-06, "loss": 0.5805, "mean_token_accuracy": 0.8246124982833862, "num_tokens": 720390860.0, "step": 2260 }, { "epoch": 2.3001017293997967, "grad_norm": 0.8404419422149658, "learning_rate": 1e-06, "loss": 0.5603, "mean_token_accuracy": 0.8311671018600464, "num_tokens": 720717089.0, "step": 2261 }, { "epoch": 2.301119023397762, "grad_norm": 0.7757552266120911, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.833970308303833, "num_tokens": 721053638.0, "step": 2262 }, { "epoch": 2.3021363173957274, "grad_norm": 0.7333695888519287, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8311830759048462, "num_tokens": 721379440.0, "step": 2263 }, { "epoch": 2.303153611393693, "grad_norm": 0.7551787495613098, "learning_rate": 1e-06, "loss": 0.5587, "mean_token_accuracy": 0.8288209438323975, "num_tokens": 721691506.0, "step": 2264 }, { "epoch": 2.304170905391658, "grad_norm": 0.797990083694458, "learning_rate": 1e-06, "loss": 0.5562, "mean_token_accuracy": 0.8305119872093201, "num_tokens": 721991138.0, "step": 2265 }, { "epoch": 2.3051881993896237, "grad_norm": 0.7845953702926636, "learning_rate": 1e-06, "loss": 0.5655, "mean_token_accuracy": 0.8282233476638794, "num_tokens": 722325029.0, "step": 2266 }, { "epoch": 2.306205493387589, "grad_norm": 0.7707501649856567, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.8319345116615295, "num_tokens": 722639326.0, "step": 2267 }, { "epoch": 2.3072227873855544, "grad_norm": 0.7780898213386536, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.8259189128875732, "num_tokens": 722945492.0, "step": 2268 }, { "epoch": 2.30824008138352, "grad_norm": 0.8189555406570435, "learning_rate": 1e-06, "loss": 0.5916, "mean_token_accuracy": 0.8206167817115784, "num_tokens": 723262253.0, "step": 2269 }, { "epoch": 2.309257375381485, "grad_norm": 0.7804981470108032, "learning_rate": 1e-06, "loss": 0.5813, "mean_token_accuracy": 0.8242859244346619, "num_tokens": 723575991.0, "step": 2270 }, { "epoch": 2.3102746693794507, "grad_norm": 0.736781656742096, "learning_rate": 1e-06, "loss": 0.5803, "mean_token_accuracy": 0.8251981735229492, "num_tokens": 723892353.0, "step": 2271 }, { "epoch": 2.311291963377416, "grad_norm": 0.7337729334831238, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8330903053283691, "num_tokens": 724220225.0, "step": 2272 }, { "epoch": 2.3123092573753814, "grad_norm": 0.7931007742881775, "learning_rate": 1e-06, "loss": 0.567, "mean_token_accuracy": 0.8291524648666382, "num_tokens": 724542617.0, "step": 2273 }, { "epoch": 2.313326551373347, "grad_norm": 0.9265283346176147, "learning_rate": 1e-06, "loss": 0.5478, "mean_token_accuracy": 0.8326225876808167, "num_tokens": 724863471.0, "step": 2274 }, { "epoch": 2.3143438453713125, "grad_norm": 0.7294421195983887, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8275586366653442, "num_tokens": 725205147.0, "step": 2275 }, { "epoch": 2.3153611393692777, "grad_norm": 0.7400902509689331, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.8316060304641724, "num_tokens": 725544049.0, "step": 2276 }, { "epoch": 2.3163784333672433, "grad_norm": 0.7395307421684265, "learning_rate": 1e-06, "loss": 0.5586, "mean_token_accuracy": 0.8303009271621704, "num_tokens": 725863765.0, "step": 2277 }, { "epoch": 2.3173957273652084, "grad_norm": 0.7711753845214844, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8282003998756409, "num_tokens": 726172877.0, "step": 2278 }, { "epoch": 2.318413021363174, "grad_norm": 0.7608178853988647, "learning_rate": 1e-06, "loss": 0.5586, "mean_token_accuracy": 0.830217182636261, "num_tokens": 726475413.0, "step": 2279 }, { "epoch": 2.3194303153611395, "grad_norm": 0.7279531359672546, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8248142600059509, "num_tokens": 726809779.0, "step": 2280 }, { "epoch": 2.3204476093591047, "grad_norm": 0.7574766874313354, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.8328359127044678, "num_tokens": 727125760.0, "step": 2281 }, { "epoch": 2.3214649033570702, "grad_norm": 0.7688292860984802, "learning_rate": 1e-06, "loss": 0.5599, "mean_token_accuracy": 0.829384446144104, "num_tokens": 727438075.0, "step": 2282 }, { "epoch": 2.3224821973550354, "grad_norm": 0.7651029229164124, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8305647969245911, "num_tokens": 727764115.0, "step": 2283 }, { "epoch": 2.323499491353001, "grad_norm": 0.7398386597633362, "learning_rate": 1e-06, "loss": 0.557, "mean_token_accuracy": 0.8299177289009094, "num_tokens": 728092211.0, "step": 2284 }, { "epoch": 2.3245167853509665, "grad_norm": 0.7717942595481873, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.8286029100418091, "num_tokens": 728425475.0, "step": 2285 }, { "epoch": 2.325534079348932, "grad_norm": 0.8052406907081604, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.826826274394989, "num_tokens": 728738679.0, "step": 2286 }, { "epoch": 2.326551373346897, "grad_norm": 0.8091646432876587, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8217065930366516, "num_tokens": 729077322.0, "step": 2287 }, { "epoch": 2.327568667344863, "grad_norm": 0.7941763401031494, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.8312726616859436, "num_tokens": 729376355.0, "step": 2288 }, { "epoch": 2.328585961342828, "grad_norm": 0.7231628894805908, "learning_rate": 1e-06, "loss": 0.5562, "mean_token_accuracy": 0.830712080001831, "num_tokens": 729697233.0, "step": 2289 }, { "epoch": 2.3296032553407935, "grad_norm": 0.7471105456352234, "learning_rate": 1e-06, "loss": 0.5708, "mean_token_accuracy": 0.8266077041625977, "num_tokens": 730009711.0, "step": 2290 }, { "epoch": 2.330620549338759, "grad_norm": 0.7873372435569763, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8235731720924377, "num_tokens": 730315273.0, "step": 2291 }, { "epoch": 2.331637843336724, "grad_norm": 0.7682083249092102, "learning_rate": 1e-06, "loss": 0.5409, "mean_token_accuracy": 0.8349224328994751, "num_tokens": 730622310.0, "step": 2292 }, { "epoch": 2.3326551373346898, "grad_norm": 0.7594954967498779, "learning_rate": 1e-06, "loss": 0.5683, "mean_token_accuracy": 0.8270331621170044, "num_tokens": 730938482.0, "step": 2293 }, { "epoch": 2.333672431332655, "grad_norm": 0.7214847803115845, "learning_rate": 1e-06, "loss": 0.5796, "mean_token_accuracy": 0.8242443799972534, "num_tokens": 731256654.0, "step": 2294 }, { "epoch": 2.3346897253306205, "grad_norm": 0.7712514400482178, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8341904282569885, "num_tokens": 731560273.0, "step": 2295 }, { "epoch": 2.335707019328586, "grad_norm": 0.7314186096191406, "learning_rate": 1e-06, "loss": 0.5248, "mean_token_accuracy": 0.8387254476547241, "num_tokens": 731882252.0, "step": 2296 }, { "epoch": 2.3367243133265516, "grad_norm": 0.7417098879814148, "learning_rate": 1e-06, "loss": 0.5524, "mean_token_accuracy": 0.8319731950759888, "num_tokens": 732205142.0, "step": 2297 }, { "epoch": 2.3377416073245167, "grad_norm": 0.7539238333702087, "learning_rate": 1e-06, "loss": 0.5726, "mean_token_accuracy": 0.8251879215240479, "num_tokens": 732523658.0, "step": 2298 }, { "epoch": 2.3387589013224823, "grad_norm": 0.7640252113342285, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8272615671157837, "num_tokens": 732842797.0, "step": 2299 }, { "epoch": 2.3397761953204474, "grad_norm": 0.7650970816612244, "learning_rate": 1e-06, "loss": 0.5835, "mean_token_accuracy": 0.8229362964630127, "num_tokens": 733168024.0, "step": 2300 }, { "epoch": 2.340793489318413, "grad_norm": 0.733159601688385, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8358616828918457, "num_tokens": 733495169.0, "step": 2301 }, { "epoch": 2.3418107833163786, "grad_norm": 0.7942311763763428, "learning_rate": 1e-06, "loss": 0.5528, "mean_token_accuracy": 0.8320826888084412, "num_tokens": 733783519.0, "step": 2302 }, { "epoch": 2.3428280773143437, "grad_norm": 0.771440863609314, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8328754901885986, "num_tokens": 734114541.0, "step": 2303 }, { "epoch": 2.3438453713123093, "grad_norm": 0.7965013980865479, "learning_rate": 1e-06, "loss": 0.5776, "mean_token_accuracy": 0.824408233165741, "num_tokens": 734444293.0, "step": 2304 }, { "epoch": 2.3448626653102744, "grad_norm": 0.7346433401107788, "learning_rate": 1e-06, "loss": 0.5424, "mean_token_accuracy": 0.8340154886245728, "num_tokens": 734747917.0, "step": 2305 }, { "epoch": 2.34587995930824, "grad_norm": 0.7440131902694702, "learning_rate": 1e-06, "loss": 0.5341, "mean_token_accuracy": 0.8372299075126648, "num_tokens": 735069410.0, "step": 2306 }, { "epoch": 2.3468972533062056, "grad_norm": 0.7549577355384827, "learning_rate": 1e-06, "loss": 0.5331, "mean_token_accuracy": 0.836683452129364, "num_tokens": 735380532.0, "step": 2307 }, { "epoch": 2.347914547304171, "grad_norm": 0.7508228421211243, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.828955888748169, "num_tokens": 735689763.0, "step": 2308 }, { "epoch": 2.3489318413021363, "grad_norm": 0.7547523975372314, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.8409914374351501, "num_tokens": 736000433.0, "step": 2309 }, { "epoch": 2.349949135300102, "grad_norm": 0.7216244339942932, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8328282833099365, "num_tokens": 736329409.0, "step": 2310 }, { "epoch": 2.350966429298067, "grad_norm": 0.7756535410881042, "learning_rate": 1e-06, "loss": 0.5586, "mean_token_accuracy": 0.8296158313751221, "num_tokens": 736642712.0, "step": 2311 }, { "epoch": 2.3519837232960326, "grad_norm": 0.7554481625556946, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8292926549911499, "num_tokens": 736956462.0, "step": 2312 }, { "epoch": 2.353001017293998, "grad_norm": 0.6937792897224426, "learning_rate": 1e-06, "loss": 0.5458, "mean_token_accuracy": 0.8327944874763489, "num_tokens": 737279301.0, "step": 2313 }, { "epoch": 2.3540183112919633, "grad_norm": 0.7226637005805969, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8313299417495728, "num_tokens": 737617291.0, "step": 2314 }, { "epoch": 2.355035605289929, "grad_norm": 0.7673088312149048, "learning_rate": 1e-06, "loss": 0.5528, "mean_token_accuracy": 0.8323171138763428, "num_tokens": 737938881.0, "step": 2315 }, { "epoch": 2.356052899287894, "grad_norm": 0.7496564984321594, "learning_rate": 1e-06, "loss": 0.5543, "mean_token_accuracy": 0.8294878005981445, "num_tokens": 738254476.0, "step": 2316 }, { "epoch": 2.3570701932858595, "grad_norm": 0.7185983657836914, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.83353590965271, "num_tokens": 738587235.0, "step": 2317 }, { "epoch": 2.358087487283825, "grad_norm": 0.7741244435310364, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8346359729766846, "num_tokens": 738903821.0, "step": 2318 }, { "epoch": 2.3591047812817907, "grad_norm": 0.7434374094009399, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.8271213173866272, "num_tokens": 739208932.0, "step": 2319 }, { "epoch": 2.360122075279756, "grad_norm": 0.768464207649231, "learning_rate": 1e-06, "loss": 0.5539, "mean_token_accuracy": 0.8318808078765869, "num_tokens": 739529429.0, "step": 2320 }, { "epoch": 2.3611393692777214, "grad_norm": 0.8427668809890747, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8341370224952698, "num_tokens": 739826514.0, "step": 2321 }, { "epoch": 2.3621566632756865, "grad_norm": 0.7625221610069275, "learning_rate": 1e-06, "loss": 0.5649, "mean_token_accuracy": 0.8287997245788574, "num_tokens": 740139289.0, "step": 2322 }, { "epoch": 2.363173957273652, "grad_norm": 0.7757455706596375, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8334740400314331, "num_tokens": 740449970.0, "step": 2323 }, { "epoch": 2.3641912512716177, "grad_norm": 0.7737842798233032, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.8300380110740662, "num_tokens": 740754163.0, "step": 2324 }, { "epoch": 2.365208545269583, "grad_norm": 0.7863881587982178, "learning_rate": 1e-06, "loss": 0.5445, "mean_token_accuracy": 0.8350942730903625, "num_tokens": 741074703.0, "step": 2325 }, { "epoch": 2.3662258392675484, "grad_norm": 0.7454284429550171, "learning_rate": 1e-06, "loss": 0.5426, "mean_token_accuracy": 0.8342845439910889, "num_tokens": 741403525.0, "step": 2326 }, { "epoch": 2.3672431332655135, "grad_norm": 0.7511443495750427, "learning_rate": 1e-06, "loss": 0.5245, "mean_token_accuracy": 0.8395842909812927, "num_tokens": 741728453.0, "step": 2327 }, { "epoch": 2.368260427263479, "grad_norm": 0.7700026035308838, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8323467373847961, "num_tokens": 742045389.0, "step": 2328 }, { "epoch": 2.3692777212614446, "grad_norm": 0.7246688604354858, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8339344263076782, "num_tokens": 742355845.0, "step": 2329 }, { "epoch": 2.3702950152594098, "grad_norm": 0.739668071269989, "learning_rate": 1e-06, "loss": 0.569, "mean_token_accuracy": 0.8269882202148438, "num_tokens": 742671682.0, "step": 2330 }, { "epoch": 2.3713123092573754, "grad_norm": 0.7679663300514221, "learning_rate": 1e-06, "loss": 0.5705, "mean_token_accuracy": 0.8265938758850098, "num_tokens": 742999389.0, "step": 2331 }, { "epoch": 2.372329603255341, "grad_norm": 0.7775615453720093, "learning_rate": 1e-06, "loss": 0.5657, "mean_token_accuracy": 0.8285351395606995, "num_tokens": 743328436.0, "step": 2332 }, { "epoch": 2.373346897253306, "grad_norm": 0.7559525966644287, "learning_rate": 1e-06, "loss": 0.5585, "mean_token_accuracy": 0.8302074670791626, "num_tokens": 743642776.0, "step": 2333 }, { "epoch": 2.3743641912512716, "grad_norm": 0.7330087423324585, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8330814838409424, "num_tokens": 743962306.0, "step": 2334 }, { "epoch": 2.375381485249237, "grad_norm": 0.779496431350708, "learning_rate": 1e-06, "loss": 0.5759, "mean_token_accuracy": 0.8268112540245056, "num_tokens": 744277189.0, "step": 2335 }, { "epoch": 2.3763987792472023, "grad_norm": 0.7355311512947083, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8319157361984253, "num_tokens": 744602450.0, "step": 2336 }, { "epoch": 2.377416073245168, "grad_norm": 0.7220591306686401, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.83145672082901, "num_tokens": 744941184.0, "step": 2337 }, { "epoch": 2.378433367243133, "grad_norm": 0.7432832717895508, "learning_rate": 1e-06, "loss": 0.5545, "mean_token_accuracy": 0.8296507596969604, "num_tokens": 745273816.0, "step": 2338 }, { "epoch": 2.3794506612410986, "grad_norm": 0.8230165243148804, "learning_rate": 1e-06, "loss": 0.5739, "mean_token_accuracy": 0.8256447315216064, "num_tokens": 745600889.0, "step": 2339 }, { "epoch": 2.380467955239064, "grad_norm": 0.760030210018158, "learning_rate": 1e-06, "loss": 0.5286, "mean_token_accuracy": 0.8372802734375, "num_tokens": 745931100.0, "step": 2340 }, { "epoch": 2.3814852492370293, "grad_norm": 0.7500247359275818, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.830858588218689, "num_tokens": 746255957.0, "step": 2341 }, { "epoch": 2.382502543234995, "grad_norm": 0.7601770758628845, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8307530879974365, "num_tokens": 746589548.0, "step": 2342 }, { "epoch": 2.3835198372329605, "grad_norm": 0.7895753979682922, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8327901363372803, "num_tokens": 746894866.0, "step": 2343 }, { "epoch": 2.3845371312309256, "grad_norm": 0.763584554195404, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.8359324336051941, "num_tokens": 747223779.0, "step": 2344 }, { "epoch": 2.385554425228891, "grad_norm": 0.7605777978897095, "learning_rate": 1e-06, "loss": 0.5353, "mean_token_accuracy": 0.8366625308990479, "num_tokens": 747533998.0, "step": 2345 }, { "epoch": 2.3865717192268567, "grad_norm": 0.8445054888725281, "learning_rate": 1e-06, "loss": 0.5972, "mean_token_accuracy": 0.819532573223114, "num_tokens": 747850725.0, "step": 2346 }, { "epoch": 2.387589013224822, "grad_norm": 0.8237773776054382, "learning_rate": 1e-06, "loss": 0.5464, "mean_token_accuracy": 0.8330358862876892, "num_tokens": 748172245.0, "step": 2347 }, { "epoch": 2.3886063072227874, "grad_norm": 0.9143764972686768, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8333683013916016, "num_tokens": 748476090.0, "step": 2348 }, { "epoch": 2.3896236012207526, "grad_norm": 0.7322075963020325, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.827389121055603, "num_tokens": 748802917.0, "step": 2349 }, { "epoch": 2.390640895218718, "grad_norm": 0.7762848138809204, "learning_rate": 1e-06, "loss": 0.5828, "mean_token_accuracy": 0.824276328086853, "num_tokens": 749117584.0, "step": 2350 }, { "epoch": 2.3916581892166837, "grad_norm": 0.7897521257400513, "learning_rate": 1e-06, "loss": 0.5688, "mean_token_accuracy": 0.8273171186447144, "num_tokens": 749440693.0, "step": 2351 }, { "epoch": 2.392675483214649, "grad_norm": 0.8838954567909241, "learning_rate": 1e-06, "loss": 0.5376, "mean_token_accuracy": 0.8362421989440918, "num_tokens": 749746118.0, "step": 2352 }, { "epoch": 2.3936927772126144, "grad_norm": 0.7455595135688782, "learning_rate": 1e-06, "loss": 0.5442, "mean_token_accuracy": 0.8335433602333069, "num_tokens": 750064386.0, "step": 2353 }, { "epoch": 2.39471007121058, "grad_norm": 0.7266263365745544, "learning_rate": 1e-06, "loss": 0.555, "mean_token_accuracy": 0.8309916257858276, "num_tokens": 750392408.0, "step": 2354 }, { "epoch": 2.395727365208545, "grad_norm": 0.7415468692779541, "learning_rate": 1e-06, "loss": 0.5512, "mean_token_accuracy": 0.8322256803512573, "num_tokens": 750725594.0, "step": 2355 }, { "epoch": 2.3967446592065107, "grad_norm": 0.7751864194869995, "learning_rate": 1e-06, "loss": 0.5499, "mean_token_accuracy": 0.8317053914070129, "num_tokens": 751019674.0, "step": 2356 }, { "epoch": 2.3977619532044763, "grad_norm": 0.7352653741836548, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.8348796963691711, "num_tokens": 751341611.0, "step": 2357 }, { "epoch": 2.3987792472024414, "grad_norm": 0.7456854581832886, "learning_rate": 1e-06, "loss": 0.5476, "mean_token_accuracy": 0.8336353898048401, "num_tokens": 751658357.0, "step": 2358 }, { "epoch": 2.399796541200407, "grad_norm": 0.7456473708152771, "learning_rate": 1e-06, "loss": 0.5523, "mean_token_accuracy": 0.8315349817276001, "num_tokens": 751985925.0, "step": 2359 }, { "epoch": 2.400813835198372, "grad_norm": 0.7698581218719482, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.827774167060852, "num_tokens": 752297184.0, "step": 2360 }, { "epoch": 2.4018311291963377, "grad_norm": 0.8498561978340149, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8302951455116272, "num_tokens": 752623708.0, "step": 2361 }, { "epoch": 2.4028484231943033, "grad_norm": 0.9180802702903748, "learning_rate": 1e-06, "loss": 0.5542, "mean_token_accuracy": 0.8314825296401978, "num_tokens": 752932908.0, "step": 2362 }, { "epoch": 2.4038657171922684, "grad_norm": 0.7461639642715454, "learning_rate": 1e-06, "loss": 0.5574, "mean_token_accuracy": 0.8299914598464966, "num_tokens": 753256285.0, "step": 2363 }, { "epoch": 2.404883011190234, "grad_norm": 0.7700833678245544, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.8337959051132202, "num_tokens": 753554563.0, "step": 2364 }, { "epoch": 2.4059003051881995, "grad_norm": 0.7168130874633789, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.833526611328125, "num_tokens": 753883153.0, "step": 2365 }, { "epoch": 2.4069175991861647, "grad_norm": 0.8115290999412537, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8273876905441284, "num_tokens": 754197393.0, "step": 2366 }, { "epoch": 2.4079348931841302, "grad_norm": 0.7849256992340088, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8283255696296692, "num_tokens": 754512562.0, "step": 2367 }, { "epoch": 2.408952187182096, "grad_norm": 0.7660202383995056, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8274675607681274, "num_tokens": 754823935.0, "step": 2368 }, { "epoch": 2.409969481180061, "grad_norm": 0.7553797960281372, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8311089277267456, "num_tokens": 755155364.0, "step": 2369 }, { "epoch": 2.4109867751780265, "grad_norm": 0.7647053599357605, "learning_rate": 1e-06, "loss": 0.5537, "mean_token_accuracy": 0.8304147124290466, "num_tokens": 755467594.0, "step": 2370 }, { "epoch": 2.4120040691759916, "grad_norm": 0.8000221848487854, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.837266206741333, "num_tokens": 755776980.0, "step": 2371 }, { "epoch": 2.413021363173957, "grad_norm": 0.794994056224823, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8333130478858948, "num_tokens": 756086651.0, "step": 2372 }, { "epoch": 2.414038657171923, "grad_norm": 0.8113973140716553, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8280282020568848, "num_tokens": 756399704.0, "step": 2373 }, { "epoch": 2.415055951169888, "grad_norm": 0.7954896092414856, "learning_rate": 1e-06, "loss": 0.5503, "mean_token_accuracy": 0.8311757445335388, "num_tokens": 756711634.0, "step": 2374 }, { "epoch": 2.4160732451678535, "grad_norm": 0.7626403570175171, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8354580998420715, "num_tokens": 757032898.0, "step": 2375 }, { "epoch": 2.417090539165819, "grad_norm": 0.7603203654289246, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8340161442756653, "num_tokens": 757357972.0, "step": 2376 }, { "epoch": 2.418107833163784, "grad_norm": 0.7872731685638428, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.831898033618927, "num_tokens": 757677296.0, "step": 2377 }, { "epoch": 2.4191251271617498, "grad_norm": 0.7660987973213196, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8300293684005737, "num_tokens": 757995955.0, "step": 2378 }, { "epoch": 2.4201424211597153, "grad_norm": 0.7972212433815002, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8283858299255371, "num_tokens": 758312347.0, "step": 2379 }, { "epoch": 2.4211597151576805, "grad_norm": 0.8009522557258606, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.831788957118988, "num_tokens": 758639550.0, "step": 2380 }, { "epoch": 2.422177009155646, "grad_norm": 0.7857116460800171, "learning_rate": 1e-06, "loss": 0.5528, "mean_token_accuracy": 0.8312411308288574, "num_tokens": 758958913.0, "step": 2381 }, { "epoch": 2.423194303153611, "grad_norm": 0.7693212628364563, "learning_rate": 1e-06, "loss": 0.5665, "mean_token_accuracy": 0.829216718673706, "num_tokens": 759270507.0, "step": 2382 }, { "epoch": 2.4242115971515767, "grad_norm": 0.7448201775550842, "learning_rate": 1e-06, "loss": 0.5197, "mean_token_accuracy": 0.8389084339141846, "num_tokens": 759579946.0, "step": 2383 }, { "epoch": 2.4252288911495423, "grad_norm": 0.7547968029975891, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.8323776125907898, "num_tokens": 759919471.0, "step": 2384 }, { "epoch": 2.4262461851475075, "grad_norm": 0.7595186233520508, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8392271995544434, "num_tokens": 760224828.0, "step": 2385 }, { "epoch": 2.427263479145473, "grad_norm": 0.7325853705406189, "learning_rate": 1e-06, "loss": 0.5393, "mean_token_accuracy": 0.8344419598579407, "num_tokens": 760536153.0, "step": 2386 }, { "epoch": 2.4282807731434386, "grad_norm": 0.7919133901596069, "learning_rate": 1e-06, "loss": 0.5612, "mean_token_accuracy": 0.8291533589363098, "num_tokens": 760838181.0, "step": 2387 }, { "epoch": 2.4292980671414037, "grad_norm": 0.7144816517829895, "learning_rate": 1e-06, "loss": 0.56, "mean_token_accuracy": 0.8297811150550842, "num_tokens": 761170210.0, "step": 2388 }, { "epoch": 2.4303153611393693, "grad_norm": 0.7816941738128662, "learning_rate": 1e-06, "loss": 0.5593, "mean_token_accuracy": 0.8292558789253235, "num_tokens": 761491740.0, "step": 2389 }, { "epoch": 2.431332655137335, "grad_norm": 0.78831547498703, "learning_rate": 1e-06, "loss": 0.5537, "mean_token_accuracy": 0.8312612175941467, "num_tokens": 761813341.0, "step": 2390 }, { "epoch": 2.4323499491353, "grad_norm": 0.7110333442687988, "learning_rate": 1e-06, "loss": 0.5415, "mean_token_accuracy": 0.8343943357467651, "num_tokens": 762139354.0, "step": 2391 }, { "epoch": 2.4333672431332656, "grad_norm": 0.7214698195457458, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8357784748077393, "num_tokens": 762460461.0, "step": 2392 }, { "epoch": 2.4343845371312307, "grad_norm": 0.817523181438446, "learning_rate": 1e-06, "loss": 0.5516, "mean_token_accuracy": 0.8315002918243408, "num_tokens": 762803568.0, "step": 2393 }, { "epoch": 2.4354018311291963, "grad_norm": 0.8252072334289551, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8329967260360718, "num_tokens": 763121607.0, "step": 2394 }, { "epoch": 2.436419125127162, "grad_norm": 0.8103281855583191, "learning_rate": 1e-06, "loss": 0.5705, "mean_token_accuracy": 0.8263031244277954, "num_tokens": 763440200.0, "step": 2395 }, { "epoch": 2.437436419125127, "grad_norm": 0.7681299448013306, "learning_rate": 1e-06, "loss": 0.5532, "mean_token_accuracy": 0.8315169215202332, "num_tokens": 763765662.0, "step": 2396 }, { "epoch": 2.4384537131230926, "grad_norm": 0.8423113226890564, "learning_rate": 1e-06, "loss": 0.5395, "mean_token_accuracy": 0.835465669631958, "num_tokens": 764082350.0, "step": 2397 }, { "epoch": 2.439471007121058, "grad_norm": 0.7529908418655396, "learning_rate": 1e-06, "loss": 0.5587, "mean_token_accuracy": 0.8306792974472046, "num_tokens": 764395519.0, "step": 2398 }, { "epoch": 2.4404883011190233, "grad_norm": 0.7986095547676086, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.8285139799118042, "num_tokens": 764720615.0, "step": 2399 }, { "epoch": 2.441505595116989, "grad_norm": 0.7806645035743713, "learning_rate": 1e-06, "loss": 0.5776, "mean_token_accuracy": 0.826043426990509, "num_tokens": 765064375.0, "step": 2400 }, { "epoch": 2.4425228891149544, "grad_norm": 0.7634555101394653, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8234164714813232, "num_tokens": 765405280.0, "step": 2401 }, { "epoch": 2.4435401831129195, "grad_norm": 0.8364507555961609, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8306921720504761, "num_tokens": 765704128.0, "step": 2402 }, { "epoch": 2.444557477110885, "grad_norm": 0.803433358669281, "learning_rate": 1e-06, "loss": 0.5576, "mean_token_accuracy": 0.8307260274887085, "num_tokens": 766021963.0, "step": 2403 }, { "epoch": 2.4455747711088502, "grad_norm": 0.7807213068008423, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.8288591504096985, "num_tokens": 766348235.0, "step": 2404 }, { "epoch": 2.446592065106816, "grad_norm": 0.8111218214035034, "learning_rate": 1e-06, "loss": 0.5334, "mean_token_accuracy": 0.8356228470802307, "num_tokens": 766664722.0, "step": 2405 }, { "epoch": 2.4476093591047814, "grad_norm": 0.7768924236297607, "learning_rate": 1e-06, "loss": 0.5336, "mean_token_accuracy": 0.8374151587486267, "num_tokens": 766973521.0, "step": 2406 }, { "epoch": 2.4486266531027465, "grad_norm": 0.7840931415557861, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8322388529777527, "num_tokens": 767301821.0, "step": 2407 }, { "epoch": 2.449643947100712, "grad_norm": 0.7447266578674316, "learning_rate": 1e-06, "loss": 0.5468, "mean_token_accuracy": 0.833337664604187, "num_tokens": 767630278.0, "step": 2408 }, { "epoch": 2.4506612410986777, "grad_norm": 1.020521879196167, "learning_rate": 1e-06, "loss": 0.5433, "mean_token_accuracy": 0.8331669569015503, "num_tokens": 767961963.0, "step": 2409 }, { "epoch": 2.451678535096643, "grad_norm": 0.7774084210395813, "learning_rate": 1e-06, "loss": 0.52, "mean_token_accuracy": 0.840404748916626, "num_tokens": 768274598.0, "step": 2410 }, { "epoch": 2.4526958290946084, "grad_norm": 0.770845353603363, "learning_rate": 1e-06, "loss": 0.5768, "mean_token_accuracy": 0.8253146409988403, "num_tokens": 768596525.0, "step": 2411 }, { "epoch": 2.453713123092574, "grad_norm": 0.752336859703064, "learning_rate": 1e-06, "loss": 0.5302, "mean_token_accuracy": 0.8379479646682739, "num_tokens": 768931098.0, "step": 2412 }, { "epoch": 2.454730417090539, "grad_norm": 0.7862922549247742, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8290746212005615, "num_tokens": 769236726.0, "step": 2413 }, { "epoch": 2.4557477110885046, "grad_norm": 0.7337533235549927, "learning_rate": 1e-06, "loss": 0.5532, "mean_token_accuracy": 0.8312757015228271, "num_tokens": 769546860.0, "step": 2414 }, { "epoch": 2.4567650050864698, "grad_norm": 0.7539020776748657, "learning_rate": 1e-06, "loss": 0.5284, "mean_token_accuracy": 0.8379347324371338, "num_tokens": 769852870.0, "step": 2415 }, { "epoch": 2.4577822990844354, "grad_norm": 0.7792903184890747, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.827686607837677, "num_tokens": 770157889.0, "step": 2416 }, { "epoch": 2.458799593082401, "grad_norm": 0.7698293924331665, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.8331127166748047, "num_tokens": 770495718.0, "step": 2417 }, { "epoch": 2.459816887080366, "grad_norm": 0.7606080174446106, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8306302428245544, "num_tokens": 770807568.0, "step": 2418 }, { "epoch": 2.4608341810783316, "grad_norm": 0.771902322769165, "learning_rate": 1e-06, "loss": 0.5324, "mean_token_accuracy": 0.8367187976837158, "num_tokens": 771115947.0, "step": 2419 }, { "epoch": 2.461851475076297, "grad_norm": 0.7755984663963318, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8261435031890869, "num_tokens": 771443575.0, "step": 2420 }, { "epoch": 2.4628687690742623, "grad_norm": 0.7729278206825256, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8226972818374634, "num_tokens": 771758233.0, "step": 2421 }, { "epoch": 2.463886063072228, "grad_norm": 0.7166008353233337, "learning_rate": 1e-06, "loss": 0.5544, "mean_token_accuracy": 0.8314967155456543, "num_tokens": 772093055.0, "step": 2422 }, { "epoch": 2.4649033570701935, "grad_norm": 0.7218311429023743, "learning_rate": 1e-06, "loss": 0.5321, "mean_token_accuracy": 0.8374101519584656, "num_tokens": 772411962.0, "step": 2423 }, { "epoch": 2.4659206510681586, "grad_norm": 0.7513918280601501, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.8277909755706787, "num_tokens": 772734084.0, "step": 2424 }, { "epoch": 2.466937945066124, "grad_norm": 0.7552136778831482, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8380136489868164, "num_tokens": 773069483.0, "step": 2425 }, { "epoch": 2.4679552390640893, "grad_norm": 0.8175935745239258, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8256757855415344, "num_tokens": 773388030.0, "step": 2426 }, { "epoch": 2.468972533062055, "grad_norm": 0.8159247636795044, "learning_rate": 1e-06, "loss": 0.5539, "mean_token_accuracy": 0.8304073214530945, "num_tokens": 773708733.0, "step": 2427 }, { "epoch": 2.4699898270600205, "grad_norm": 0.7622677087783813, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.835042417049408, "num_tokens": 774027412.0, "step": 2428 }, { "epoch": 2.4710071210579856, "grad_norm": 0.7640722393989563, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.8352609872817993, "num_tokens": 774354417.0, "step": 2429 }, { "epoch": 2.472024415055951, "grad_norm": 0.8168275952339172, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.8326980471611023, "num_tokens": 774658382.0, "step": 2430 }, { "epoch": 2.4730417090539167, "grad_norm": 0.7612686157226562, "learning_rate": 1e-06, "loss": 0.558, "mean_token_accuracy": 0.8303864598274231, "num_tokens": 774978306.0, "step": 2431 }, { "epoch": 2.474059003051882, "grad_norm": 0.781137228012085, "learning_rate": 1e-06, "loss": 0.5499, "mean_token_accuracy": 0.8313940763473511, "num_tokens": 775301131.0, "step": 2432 }, { "epoch": 2.4750762970498474, "grad_norm": 0.8054970502853394, "learning_rate": 1e-06, "loss": 0.5573, "mean_token_accuracy": 0.8306020498275757, "num_tokens": 775609674.0, "step": 2433 }, { "epoch": 2.476093591047813, "grad_norm": 0.7589148283004761, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.8268976211547852, "num_tokens": 775937278.0, "step": 2434 }, { "epoch": 2.477110885045778, "grad_norm": 0.8487206101417542, "learning_rate": 1e-06, "loss": 0.5339, "mean_token_accuracy": 0.8378028869628906, "num_tokens": 776256891.0, "step": 2435 }, { "epoch": 2.4781281790437437, "grad_norm": 0.7576294541358948, "learning_rate": 1e-06, "loss": 0.5388, "mean_token_accuracy": 0.8346007466316223, "num_tokens": 776580024.0, "step": 2436 }, { "epoch": 2.479145473041709, "grad_norm": 0.7526679039001465, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8290330171585083, "num_tokens": 776881190.0, "step": 2437 }, { "epoch": 2.4801627670396744, "grad_norm": 0.7862757444381714, "learning_rate": 1e-06, "loss": 0.5578, "mean_token_accuracy": 0.8304464221000671, "num_tokens": 777210571.0, "step": 2438 }, { "epoch": 2.48118006103764, "grad_norm": 0.790669858455658, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8283755779266357, "num_tokens": 777541394.0, "step": 2439 }, { "epoch": 2.482197355035605, "grad_norm": 0.7850418090820312, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.8332133889198303, "num_tokens": 777839792.0, "step": 2440 }, { "epoch": 2.4832146490335707, "grad_norm": 0.7663286328315735, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.8340730667114258, "num_tokens": 778138868.0, "step": 2441 }, { "epoch": 2.4842319430315363, "grad_norm": 0.7318602800369263, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8366137146949768, "num_tokens": 778464246.0, "step": 2442 }, { "epoch": 2.4852492370295014, "grad_norm": 0.8146898150444031, "learning_rate": 1e-06, "loss": 0.5956, "mean_token_accuracy": 0.8201758861541748, "num_tokens": 778782743.0, "step": 2443 }, { "epoch": 2.486266531027467, "grad_norm": 0.7857292890548706, "learning_rate": 1e-06, "loss": 0.5343, "mean_token_accuracy": 0.8359119296073914, "num_tokens": 779106415.0, "step": 2444 }, { "epoch": 2.4872838250254325, "grad_norm": 0.7584625482559204, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.8299977779388428, "num_tokens": 779428841.0, "step": 2445 }, { "epoch": 2.4883011190233977, "grad_norm": 0.7550146579742432, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.833655834197998, "num_tokens": 779752096.0, "step": 2446 }, { "epoch": 2.4893184130213633, "grad_norm": 0.802813708782196, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.8306933641433716, "num_tokens": 780066182.0, "step": 2447 }, { "epoch": 2.4903357070193284, "grad_norm": 0.7713908553123474, "learning_rate": 1e-06, "loss": 0.5494, "mean_token_accuracy": 0.8302340507507324, "num_tokens": 780379379.0, "step": 2448 }, { "epoch": 2.491353001017294, "grad_norm": 0.7197617292404175, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8294973373413086, "num_tokens": 780709618.0, "step": 2449 }, { "epoch": 2.4923702950152595, "grad_norm": 0.7505249977111816, "learning_rate": 1e-06, "loss": 0.5928, "mean_token_accuracy": 0.8199161291122437, "num_tokens": 781038418.0, "step": 2450 }, { "epoch": 2.4933875890132247, "grad_norm": 0.8106674551963806, "learning_rate": 1e-06, "loss": 0.5531, "mean_token_accuracy": 0.830228328704834, "num_tokens": 781342015.0, "step": 2451 }, { "epoch": 2.4944048830111902, "grad_norm": 0.747441828250885, "learning_rate": 1e-06, "loss": 0.5444, "mean_token_accuracy": 0.8331936597824097, "num_tokens": 781654668.0, "step": 2452 }, { "epoch": 2.495422177009156, "grad_norm": 0.762697696685791, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.8264614939689636, "num_tokens": 781972092.0, "step": 2453 }, { "epoch": 2.496439471007121, "grad_norm": 0.764396071434021, "learning_rate": 1e-06, "loss": 0.598, "mean_token_accuracy": 0.8198715448379517, "num_tokens": 782293763.0, "step": 2454 }, { "epoch": 2.4974567650050865, "grad_norm": 0.7979555130004883, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8260531425476074, "num_tokens": 782606788.0, "step": 2455 }, { "epoch": 2.498474059003052, "grad_norm": 0.7844521999359131, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8317605257034302, "num_tokens": 782909993.0, "step": 2456 }, { "epoch": 2.499491353001017, "grad_norm": 0.8027681708335876, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8310152888298035, "num_tokens": 783215610.0, "step": 2457 }, { "epoch": 2.500508646998983, "grad_norm": 0.7083740234375, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8324770927429199, "num_tokens": 783555679.0, "step": 2458 }, { "epoch": 2.501525940996948, "grad_norm": 0.8292408585548401, "learning_rate": 1e-06, "loss": 0.5604, "mean_token_accuracy": 0.8293014764785767, "num_tokens": 783880886.0, "step": 2459 }, { "epoch": 2.5025432349949135, "grad_norm": 0.7347398400306702, "learning_rate": 1e-06, "loss": 0.536, "mean_token_accuracy": 0.8344863653182983, "num_tokens": 784198844.0, "step": 2460 }, { "epoch": 2.503560528992879, "grad_norm": 1.7198132276535034, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8289639949798584, "num_tokens": 784524245.0, "step": 2461 }, { "epoch": 2.504577822990844, "grad_norm": 0.7936400771141052, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8311157822608948, "num_tokens": 784850952.0, "step": 2462 }, { "epoch": 2.5055951169888098, "grad_norm": 0.7660007476806641, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8293622732162476, "num_tokens": 785151045.0, "step": 2463 }, { "epoch": 2.5066124109867753, "grad_norm": 0.7893761396408081, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8309811353683472, "num_tokens": 785470010.0, "step": 2464 }, { "epoch": 2.5076297049847405, "grad_norm": 0.7528966069221497, "learning_rate": 1e-06, "loss": 0.5774, "mean_token_accuracy": 0.8246625661849976, "num_tokens": 785801195.0, "step": 2465 }, { "epoch": 2.508646998982706, "grad_norm": 0.7794021368026733, "learning_rate": 1e-06, "loss": 0.5706, "mean_token_accuracy": 0.8252547383308411, "num_tokens": 786115203.0, "step": 2466 }, { "epoch": 2.5096642929806716, "grad_norm": 0.8330888152122498, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8368071913719177, "num_tokens": 786424319.0, "step": 2467 }, { "epoch": 2.5106815869786367, "grad_norm": 0.751885175704956, "learning_rate": 1e-06, "loss": 0.5682, "mean_token_accuracy": 0.8278042078018188, "num_tokens": 786732176.0, "step": 2468 }, { "epoch": 2.5116988809766023, "grad_norm": 0.7629954814910889, "learning_rate": 1e-06, "loss": 0.548, "mean_token_accuracy": 0.8319458961486816, "num_tokens": 787060121.0, "step": 2469 }, { "epoch": 2.5127161749745675, "grad_norm": 0.7450445294380188, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.8261040449142456, "num_tokens": 787389440.0, "step": 2470 }, { "epoch": 2.513733468972533, "grad_norm": 0.7563230395317078, "learning_rate": 1e-06, "loss": 0.5624, "mean_token_accuracy": 0.8288929462432861, "num_tokens": 787711904.0, "step": 2471 }, { "epoch": 2.5147507629704986, "grad_norm": 0.7394628524780273, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8336602449417114, "num_tokens": 788040834.0, "step": 2472 }, { "epoch": 2.5157680569684637, "grad_norm": 0.7385982871055603, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8274520635604858, "num_tokens": 788365031.0, "step": 2473 }, { "epoch": 2.5167853509664293, "grad_norm": 0.7424465417861938, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.837134599685669, "num_tokens": 788684184.0, "step": 2474 }, { "epoch": 2.517802644964395, "grad_norm": 0.7463836669921875, "learning_rate": 1e-06, "loss": 0.5476, "mean_token_accuracy": 0.8347026705741882, "num_tokens": 789007564.0, "step": 2475 }, { "epoch": 2.51881993896236, "grad_norm": 0.7344636917114258, "learning_rate": 1e-06, "loss": 0.5705, "mean_token_accuracy": 0.8264937996864319, "num_tokens": 789338711.0, "step": 2476 }, { "epoch": 2.5198372329603256, "grad_norm": 0.7780519723892212, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8339964151382446, "num_tokens": 789651554.0, "step": 2477 }, { "epoch": 2.520854526958291, "grad_norm": 0.763392448425293, "learning_rate": 1e-06, "loss": 0.5694, "mean_token_accuracy": 0.8263593912124634, "num_tokens": 789980144.0, "step": 2478 }, { "epoch": 2.5218718209562563, "grad_norm": 0.7752723693847656, "learning_rate": 1e-06, "loss": 0.5359, "mean_token_accuracy": 0.8367195725440979, "num_tokens": 790289686.0, "step": 2479 }, { "epoch": 2.522889114954222, "grad_norm": 0.780409574508667, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.8243266344070435, "num_tokens": 790617809.0, "step": 2480 }, { "epoch": 2.523906408952187, "grad_norm": 0.793233335018158, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.8192570209503174, "num_tokens": 790931867.0, "step": 2481 }, { "epoch": 2.5249237029501526, "grad_norm": 0.7812409996986389, "learning_rate": 1e-06, "loss": 0.5594, "mean_token_accuracy": 0.8291711807250977, "num_tokens": 791257431.0, "step": 2482 }, { "epoch": 2.525940996948118, "grad_norm": 0.7788034081459045, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8330932855606079, "num_tokens": 791571096.0, "step": 2483 }, { "epoch": 2.5269582909460833, "grad_norm": 0.7682954668998718, "learning_rate": 1e-06, "loss": 0.5687, "mean_token_accuracy": 0.8256453275680542, "num_tokens": 791889891.0, "step": 2484 }, { "epoch": 2.527975584944049, "grad_norm": 0.733441174030304, "learning_rate": 1e-06, "loss": 0.5465, "mean_token_accuracy": 0.8320663571357727, "num_tokens": 792214100.0, "step": 2485 }, { "epoch": 2.528992878942014, "grad_norm": 0.7833328247070312, "learning_rate": 1e-06, "loss": 0.5357, "mean_token_accuracy": 0.8362271189689636, "num_tokens": 792530725.0, "step": 2486 }, { "epoch": 2.5300101729399795, "grad_norm": 0.7879757881164551, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8272276520729065, "num_tokens": 792847903.0, "step": 2487 }, { "epoch": 2.531027466937945, "grad_norm": 0.7867984771728516, "learning_rate": 1e-06, "loss": 0.5422, "mean_token_accuracy": 0.833976149559021, "num_tokens": 793151668.0, "step": 2488 }, { "epoch": 2.5320447609359107, "grad_norm": 0.783227801322937, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8294700384140015, "num_tokens": 793460024.0, "step": 2489 }, { "epoch": 2.533062054933876, "grad_norm": 0.7828521728515625, "learning_rate": 1e-06, "loss": 0.5384, "mean_token_accuracy": 0.8348932266235352, "num_tokens": 793777015.0, "step": 2490 }, { "epoch": 2.5340793489318414, "grad_norm": 0.7644302845001221, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.8249564170837402, "num_tokens": 794106369.0, "step": 2491 }, { "epoch": 2.5350966429298065, "grad_norm": 0.7875006794929504, "learning_rate": 1e-06, "loss": 0.5464, "mean_token_accuracy": 0.8332185745239258, "num_tokens": 794419893.0, "step": 2492 }, { "epoch": 2.536113936927772, "grad_norm": 0.8249663710594177, "learning_rate": 1e-06, "loss": 0.534, "mean_token_accuracy": 0.8363446593284607, "num_tokens": 794725898.0, "step": 2493 }, { "epoch": 2.5371312309257377, "grad_norm": 0.7706735730171204, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.8266034126281738, "num_tokens": 795034685.0, "step": 2494 }, { "epoch": 2.538148524923703, "grad_norm": 0.7650234699249268, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.8322057127952576, "num_tokens": 795359866.0, "step": 2495 }, { "epoch": 2.5391658189216684, "grad_norm": 0.7540287971496582, "learning_rate": 1e-06, "loss": 0.5435, "mean_token_accuracy": 0.8335233926773071, "num_tokens": 795665057.0, "step": 2496 }, { "epoch": 2.5401831129196335, "grad_norm": 1.013059377670288, "learning_rate": 1e-06, "loss": 0.5506, "mean_token_accuracy": 0.8318750858306885, "num_tokens": 795976783.0, "step": 2497 }, { "epoch": 2.541200406917599, "grad_norm": 0.8115147352218628, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8316807746887207, "num_tokens": 796287301.0, "step": 2498 }, { "epoch": 2.5422177009155646, "grad_norm": 0.763339102268219, "learning_rate": 1e-06, "loss": 0.5426, "mean_token_accuracy": 0.8338295221328735, "num_tokens": 796600291.0, "step": 2499 }, { "epoch": 2.5432349949135302, "grad_norm": 0.7467638254165649, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8346984386444092, "num_tokens": 796904278.0, "step": 2500 }, { "epoch": 2.5442522889114954, "grad_norm": 0.7442284226417542, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8270628452301025, "num_tokens": 797229543.0, "step": 2501 }, { "epoch": 2.545269582909461, "grad_norm": 0.7795024514198303, "learning_rate": 1e-06, "loss": 0.5653, "mean_token_accuracy": 0.8265025019645691, "num_tokens": 797536387.0, "step": 2502 }, { "epoch": 2.546286876907426, "grad_norm": 0.7624030113220215, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.828598141670227, "num_tokens": 797843165.0, "step": 2503 }, { "epoch": 2.5473041709053916, "grad_norm": 0.7822015881538391, "learning_rate": 1e-06, "loss": 0.5513, "mean_token_accuracy": 0.8312956094741821, "num_tokens": 798149467.0, "step": 2504 }, { "epoch": 2.548321464903357, "grad_norm": 0.7261184453964233, "learning_rate": 1e-06, "loss": 0.5492, "mean_token_accuracy": 0.8313003778457642, "num_tokens": 798476152.0, "step": 2505 }, { "epoch": 2.5493387589013223, "grad_norm": 0.7039114236831665, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8312034606933594, "num_tokens": 798807326.0, "step": 2506 }, { "epoch": 2.550356052899288, "grad_norm": 0.7534456849098206, "learning_rate": 1e-06, "loss": 0.5915, "mean_token_accuracy": 0.8210437893867493, "num_tokens": 799134326.0, "step": 2507 }, { "epoch": 2.551373346897253, "grad_norm": 0.7393434047698975, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8362290263175964, "num_tokens": 799444712.0, "step": 2508 }, { "epoch": 2.5523906408952186, "grad_norm": 0.7687445282936096, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8302136659622192, "num_tokens": 799761918.0, "step": 2509 }, { "epoch": 2.553407934893184, "grad_norm": 0.723220705986023, "learning_rate": 1e-06, "loss": 0.5286, "mean_token_accuracy": 0.8383730053901672, "num_tokens": 800086313.0, "step": 2510 }, { "epoch": 2.5544252288911498, "grad_norm": 0.7691549062728882, "learning_rate": 1e-06, "loss": 0.569, "mean_token_accuracy": 0.826926589012146, "num_tokens": 800392700.0, "step": 2511 }, { "epoch": 2.555442522889115, "grad_norm": 0.7230547666549683, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.8337690234184265, "num_tokens": 800723148.0, "step": 2512 }, { "epoch": 2.5564598168870805, "grad_norm": 0.7520925998687744, "learning_rate": 1e-06, "loss": 0.5513, "mean_token_accuracy": 0.8333572149276733, "num_tokens": 801037154.0, "step": 2513 }, { "epoch": 2.5574771108850456, "grad_norm": 0.7694193124771118, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.8236919045448303, "num_tokens": 801343393.0, "step": 2514 }, { "epoch": 2.558494404883011, "grad_norm": 0.7773657441139221, "learning_rate": 1e-06, "loss": 0.5613, "mean_token_accuracy": 0.8282038569450378, "num_tokens": 801652284.0, "step": 2515 }, { "epoch": 2.5595116988809767, "grad_norm": 0.7484728693962097, "learning_rate": 1e-06, "loss": 0.5544, "mean_token_accuracy": 0.8301058411598206, "num_tokens": 801967943.0, "step": 2516 }, { "epoch": 2.560528992878942, "grad_norm": 0.731221616268158, "learning_rate": 1e-06, "loss": 0.538, "mean_token_accuracy": 0.8355869054794312, "num_tokens": 802277825.0, "step": 2517 }, { "epoch": 2.5615462868769074, "grad_norm": 0.7734962105751038, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.8329318165779114, "num_tokens": 802590185.0, "step": 2518 }, { "epoch": 2.5625635808748726, "grad_norm": 0.7789332270622253, "learning_rate": 1e-06, "loss": 0.5133, "mean_token_accuracy": 0.842313289642334, "num_tokens": 802896699.0, "step": 2519 }, { "epoch": 2.563580874872838, "grad_norm": 0.7538076639175415, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.8314604759216309, "num_tokens": 803217224.0, "step": 2520 }, { "epoch": 2.5645981688708037, "grad_norm": 0.7312325239181519, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8282783627510071, "num_tokens": 803552881.0, "step": 2521 }, { "epoch": 2.5656154628687693, "grad_norm": 0.7468303442001343, "learning_rate": 1e-06, "loss": 0.536, "mean_token_accuracy": 0.8361011743545532, "num_tokens": 803876358.0, "step": 2522 }, { "epoch": 2.5666327568667344, "grad_norm": 0.8178204298019409, "learning_rate": 1e-06, "loss": 0.5468, "mean_token_accuracy": 0.8333477973937988, "num_tokens": 804192458.0, "step": 2523 }, { "epoch": 2.5676500508647, "grad_norm": 0.7961925864219666, "learning_rate": 1e-06, "loss": 0.5465, "mean_token_accuracy": 0.8332005739212036, "num_tokens": 804511634.0, "step": 2524 }, { "epoch": 2.568667344862665, "grad_norm": 0.981375515460968, "learning_rate": 1e-06, "loss": 0.5615, "mean_token_accuracy": 0.8280132412910461, "num_tokens": 804828982.0, "step": 2525 }, { "epoch": 2.5696846388606307, "grad_norm": 0.7216664552688599, "learning_rate": 1e-06, "loss": 0.5728, "mean_token_accuracy": 0.8246604800224304, "num_tokens": 805159595.0, "step": 2526 }, { "epoch": 2.5707019328585963, "grad_norm": 0.7868689894676208, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8302907943725586, "num_tokens": 805467391.0, "step": 2527 }, { "epoch": 2.5717192268565614, "grad_norm": 0.7492839694023132, "learning_rate": 1e-06, "loss": 0.5552, "mean_token_accuracy": 0.8306524157524109, "num_tokens": 805767777.0, "step": 2528 }, { "epoch": 2.572736520854527, "grad_norm": 0.748909056186676, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8326543569564819, "num_tokens": 806083011.0, "step": 2529 }, { "epoch": 2.573753814852492, "grad_norm": 0.7458982467651367, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.833580732345581, "num_tokens": 806389989.0, "step": 2530 }, { "epoch": 2.5747711088504577, "grad_norm": 0.7254387736320496, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8330187797546387, "num_tokens": 806724467.0, "step": 2531 }, { "epoch": 2.5757884028484233, "grad_norm": 0.7179408669471741, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8376388549804688, "num_tokens": 807048509.0, "step": 2532 }, { "epoch": 2.576805696846389, "grad_norm": 0.773971438407898, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.8336687088012695, "num_tokens": 807355916.0, "step": 2533 }, { "epoch": 2.577822990844354, "grad_norm": 0.7327409982681274, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.8345814943313599, "num_tokens": 807664154.0, "step": 2534 }, { "epoch": 2.5788402848423195, "grad_norm": 0.7396280169487, "learning_rate": 1e-06, "loss": 0.5201, "mean_token_accuracy": 0.8391358852386475, "num_tokens": 807982521.0, "step": 2535 }, { "epoch": 2.5798575788402847, "grad_norm": 0.745638906955719, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8290165662765503, "num_tokens": 808310119.0, "step": 2536 }, { "epoch": 2.5808748728382502, "grad_norm": 0.750477135181427, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8326935172080994, "num_tokens": 808637880.0, "step": 2537 }, { "epoch": 2.581892166836216, "grad_norm": 0.724315345287323, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.83362877368927, "num_tokens": 808973018.0, "step": 2538 }, { "epoch": 2.582909460834181, "grad_norm": 0.732577919960022, "learning_rate": 1e-06, "loss": 0.552, "mean_token_accuracy": 0.8321558833122253, "num_tokens": 809288153.0, "step": 2539 }, { "epoch": 2.5839267548321465, "grad_norm": 0.7791313529014587, "learning_rate": 1e-06, "loss": 0.537, "mean_token_accuracy": 0.834289014339447, "num_tokens": 809594813.0, "step": 2540 }, { "epoch": 2.5849440488301116, "grad_norm": 0.7388265132904053, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.8334192037582397, "num_tokens": 809914564.0, "step": 2541 }, { "epoch": 2.585961342828077, "grad_norm": 0.7179327607154846, "learning_rate": 1e-06, "loss": 0.5714, "mean_token_accuracy": 0.8271803855895996, "num_tokens": 810235007.0, "step": 2542 }, { "epoch": 2.586978636826043, "grad_norm": 0.763087272644043, "learning_rate": 1e-06, "loss": 0.552, "mean_token_accuracy": 0.8310995101928711, "num_tokens": 810562179.0, "step": 2543 }, { "epoch": 2.5879959308240084, "grad_norm": 0.7702552080154419, "learning_rate": 1e-06, "loss": 0.5447, "mean_token_accuracy": 0.8343356847763062, "num_tokens": 810890208.0, "step": 2544 }, { "epoch": 2.5890132248219735, "grad_norm": 0.7129185199737549, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.8318526744842529, "num_tokens": 811223836.0, "step": 2545 }, { "epoch": 2.590030518819939, "grad_norm": 0.735282838344574, "learning_rate": 1e-06, "loss": 0.5631, "mean_token_accuracy": 0.8289995789527893, "num_tokens": 811552000.0, "step": 2546 }, { "epoch": 2.591047812817904, "grad_norm": 0.7413491010665894, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8306357860565186, "num_tokens": 811879361.0, "step": 2547 }, { "epoch": 2.5920651068158698, "grad_norm": 0.8138635754585266, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.8339513540267944, "num_tokens": 812201258.0, "step": 2548 }, { "epoch": 2.5930824008138353, "grad_norm": 0.7724733352661133, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8256838321685791, "num_tokens": 812522643.0, "step": 2549 }, { "epoch": 2.5940996948118005, "grad_norm": 0.7342690229415894, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.8354153633117676, "num_tokens": 812857034.0, "step": 2550 }, { "epoch": 2.595116988809766, "grad_norm": 1.2330756187438965, "learning_rate": 1e-06, "loss": 0.5622, "mean_token_accuracy": 0.8277719020843506, "num_tokens": 813171705.0, "step": 2551 }, { "epoch": 2.596134282807731, "grad_norm": 0.8125199675559998, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.8276410698890686, "num_tokens": 813478790.0, "step": 2552 }, { "epoch": 2.5971515768056967, "grad_norm": 0.7920307517051697, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.830203115940094, "num_tokens": 813795523.0, "step": 2553 }, { "epoch": 2.5981688708036623, "grad_norm": 0.7683935165405273, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.8320221304893494, "num_tokens": 814113911.0, "step": 2554 }, { "epoch": 2.599186164801628, "grad_norm": 0.7362149357795715, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8310428857803345, "num_tokens": 814435363.0, "step": 2555 }, { "epoch": 2.600203458799593, "grad_norm": 0.7711970210075378, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8286822438240051, "num_tokens": 814760052.0, "step": 2556 }, { "epoch": 2.6012207527975586, "grad_norm": 0.7560073137283325, "learning_rate": 1e-06, "loss": 0.5448, "mean_token_accuracy": 0.8328245878219604, "num_tokens": 815081519.0, "step": 2557 }, { "epoch": 2.6022380467955237, "grad_norm": 0.7367380857467651, "learning_rate": 1e-06, "loss": 0.5649, "mean_token_accuracy": 0.8287874460220337, "num_tokens": 815404054.0, "step": 2558 }, { "epoch": 2.6032553407934893, "grad_norm": 0.7321335077285767, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8326501846313477, "num_tokens": 815726850.0, "step": 2559 }, { "epoch": 2.604272634791455, "grad_norm": 0.797602653503418, "learning_rate": 1e-06, "loss": 0.5809, "mean_token_accuracy": 0.8246169090270996, "num_tokens": 816039389.0, "step": 2560 }, { "epoch": 2.60528992878942, "grad_norm": 0.7175667881965637, "learning_rate": 1e-06, "loss": 0.5537, "mean_token_accuracy": 0.8312521576881409, "num_tokens": 816369450.0, "step": 2561 }, { "epoch": 2.6063072227873856, "grad_norm": 0.7842046022415161, "learning_rate": 1e-06, "loss": 0.5483, "mean_token_accuracy": 0.8326440453529358, "num_tokens": 816691307.0, "step": 2562 }, { "epoch": 2.6073245167853507, "grad_norm": 0.7255687117576599, "learning_rate": 1e-06, "loss": 0.5419, "mean_token_accuracy": 0.8336960673332214, "num_tokens": 817014438.0, "step": 2563 }, { "epoch": 2.6083418107833163, "grad_norm": 0.7649070620536804, "learning_rate": 1e-06, "loss": 0.5485, "mean_token_accuracy": 0.8324762582778931, "num_tokens": 817342242.0, "step": 2564 }, { "epoch": 2.609359104781282, "grad_norm": 0.8680239915847778, "learning_rate": 1e-06, "loss": 0.5605, "mean_token_accuracy": 0.8290839195251465, "num_tokens": 817643303.0, "step": 2565 }, { "epoch": 2.6103763987792474, "grad_norm": 0.74476158618927, "learning_rate": 1e-06, "loss": 0.512, "mean_token_accuracy": 0.8423797488212585, "num_tokens": 817952612.0, "step": 2566 }, { "epoch": 2.6113936927772126, "grad_norm": 0.7176665663719177, "learning_rate": 1e-06, "loss": 0.5739, "mean_token_accuracy": 0.8251190781593323, "num_tokens": 818287143.0, "step": 2567 }, { "epoch": 2.612410986775178, "grad_norm": 0.726041316986084, "learning_rate": 1e-06, "loss": 0.5398, "mean_token_accuracy": 0.833194375038147, "num_tokens": 818608748.0, "step": 2568 }, { "epoch": 2.6134282807731433, "grad_norm": 0.7279813885688782, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8337301015853882, "num_tokens": 818924573.0, "step": 2569 }, { "epoch": 2.614445574771109, "grad_norm": 0.8289151191711426, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8384712934494019, "num_tokens": 819253050.0, "step": 2570 }, { "epoch": 2.6154628687690744, "grad_norm": 0.7946772575378418, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.8344959020614624, "num_tokens": 819585374.0, "step": 2571 }, { "epoch": 2.6164801627670395, "grad_norm": 0.7746971845626831, "learning_rate": 1e-06, "loss": 0.548, "mean_token_accuracy": 0.8323616981506348, "num_tokens": 819902977.0, "step": 2572 }, { "epoch": 2.617497456765005, "grad_norm": 0.735686719417572, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8307164907455444, "num_tokens": 820219660.0, "step": 2573 }, { "epoch": 2.6185147507629702, "grad_norm": 0.9371272325515747, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.8262836933135986, "num_tokens": 820542132.0, "step": 2574 }, { "epoch": 2.619532044760936, "grad_norm": 0.745701253414154, "learning_rate": 1e-06, "loss": 0.5258, "mean_token_accuracy": 0.8384919166564941, "num_tokens": 820860446.0, "step": 2575 }, { "epoch": 2.6205493387589014, "grad_norm": 0.734398603439331, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8236312866210938, "num_tokens": 821181422.0, "step": 2576 }, { "epoch": 2.621566632756867, "grad_norm": 0.7533614039421082, "learning_rate": 1e-06, "loss": 0.5474, "mean_token_accuracy": 0.8329817652702332, "num_tokens": 821489947.0, "step": 2577 }, { "epoch": 2.622583926754832, "grad_norm": 0.7327065467834473, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.8312802314758301, "num_tokens": 821809310.0, "step": 2578 }, { "epoch": 2.6236012207527977, "grad_norm": 0.7578933238983154, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8326842188835144, "num_tokens": 822129562.0, "step": 2579 }, { "epoch": 2.624618514750763, "grad_norm": 0.7132900953292847, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8397087454795837, "num_tokens": 822441494.0, "step": 2580 }, { "epoch": 2.6256358087487284, "grad_norm": 0.783174455165863, "learning_rate": 1e-06, "loss": 0.5458, "mean_token_accuracy": 0.8319453597068787, "num_tokens": 822761536.0, "step": 2581 }, { "epoch": 2.626653102746694, "grad_norm": 0.7523170113563538, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8283185362815857, "num_tokens": 823092906.0, "step": 2582 }, { "epoch": 2.627670396744659, "grad_norm": 0.796053946018219, "learning_rate": 1e-06, "loss": 0.555, "mean_token_accuracy": 0.8302963972091675, "num_tokens": 823403147.0, "step": 2583 }, { "epoch": 2.6286876907426246, "grad_norm": 0.7748081684112549, "learning_rate": 1e-06, "loss": 0.5532, "mean_token_accuracy": 0.8307095766067505, "num_tokens": 823694330.0, "step": 2584 }, { "epoch": 2.62970498474059, "grad_norm": 0.736704409122467, "learning_rate": 1e-06, "loss": 0.5386, "mean_token_accuracy": 0.8346253037452698, "num_tokens": 824009139.0, "step": 2585 }, { "epoch": 2.6307222787385554, "grad_norm": 0.7823536992073059, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8384369015693665, "num_tokens": 824329297.0, "step": 2586 }, { "epoch": 2.631739572736521, "grad_norm": 0.7874622941017151, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8292387127876282, "num_tokens": 824673484.0, "step": 2587 }, { "epoch": 2.6327568667344865, "grad_norm": 0.7718305587768555, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.828931450843811, "num_tokens": 825002632.0, "step": 2588 }, { "epoch": 2.6337741607324516, "grad_norm": 0.7725831270217896, "learning_rate": 1e-06, "loss": 0.5515, "mean_token_accuracy": 0.8310902118682861, "num_tokens": 825320835.0, "step": 2589 }, { "epoch": 2.634791454730417, "grad_norm": 0.736696183681488, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8332586288452148, "num_tokens": 825642437.0, "step": 2590 }, { "epoch": 2.6358087487283823, "grad_norm": 0.7683669328689575, "learning_rate": 1e-06, "loss": 0.5662, "mean_token_accuracy": 0.8285846710205078, "num_tokens": 825949889.0, "step": 2591 }, { "epoch": 2.636826042726348, "grad_norm": 0.8031424880027771, "learning_rate": 1e-06, "loss": 0.5386, "mean_token_accuracy": 0.8351372480392456, "num_tokens": 826249106.0, "step": 2592 }, { "epoch": 2.6378433367243135, "grad_norm": 0.7533608675003052, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.8309059143066406, "num_tokens": 826566435.0, "step": 2593 }, { "epoch": 2.6388606307222786, "grad_norm": 0.7591979503631592, "learning_rate": 1e-06, "loss": 0.5557, "mean_token_accuracy": 0.8305896520614624, "num_tokens": 826880946.0, "step": 2594 }, { "epoch": 2.639877924720244, "grad_norm": 0.7779766321182251, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.8329280614852905, "num_tokens": 827204054.0, "step": 2595 }, { "epoch": 2.6408952187182093, "grad_norm": 0.7244125008583069, "learning_rate": 1e-06, "loss": 0.5241, "mean_token_accuracy": 0.838779091835022, "num_tokens": 827526209.0, "step": 2596 }, { "epoch": 2.641912512716175, "grad_norm": 0.8206683993339539, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.8372079133987427, "num_tokens": 827841027.0, "step": 2597 }, { "epoch": 2.6429298067141405, "grad_norm": 0.7615863680839539, "learning_rate": 1e-06, "loss": 0.5733, "mean_token_accuracy": 0.8255772590637207, "num_tokens": 828167926.0, "step": 2598 }, { "epoch": 2.643947100712106, "grad_norm": 0.7539216876029968, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8324081897735596, "num_tokens": 828492310.0, "step": 2599 }, { "epoch": 2.644964394710071, "grad_norm": 0.8165754675865173, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.8313982486724854, "num_tokens": 828811219.0, "step": 2600 }, { "epoch": 2.6459816887080367, "grad_norm": 0.7321832180023193, "learning_rate": 1e-06, "loss": 0.5183, "mean_token_accuracy": 0.8401905298233032, "num_tokens": 829121022.0, "step": 2601 }, { "epoch": 2.646998982706002, "grad_norm": 0.7699927687644958, "learning_rate": 1e-06, "loss": 0.556, "mean_token_accuracy": 0.830227792263031, "num_tokens": 829442232.0, "step": 2602 }, { "epoch": 2.6480162767039674, "grad_norm": 0.7571011185646057, "learning_rate": 1e-06, "loss": 0.5548, "mean_token_accuracy": 0.8308849334716797, "num_tokens": 829766930.0, "step": 2603 }, { "epoch": 2.649033570701933, "grad_norm": 0.7879880666732788, "learning_rate": 1e-06, "loss": 0.5552, "mean_token_accuracy": 0.8314686417579651, "num_tokens": 830084634.0, "step": 2604 }, { "epoch": 2.650050864699898, "grad_norm": 0.7368956208229065, "learning_rate": 1e-06, "loss": 0.5266, "mean_token_accuracy": 0.8385523557662964, "num_tokens": 830411922.0, "step": 2605 }, { "epoch": 2.6510681586978637, "grad_norm": 0.7690927982330322, "learning_rate": 1e-06, "loss": 0.5487, "mean_token_accuracy": 0.8323553204536438, "num_tokens": 830725899.0, "step": 2606 }, { "epoch": 2.652085452695829, "grad_norm": 0.7460976839065552, "learning_rate": 1e-06, "loss": 0.5379, "mean_token_accuracy": 0.8354268670082092, "num_tokens": 831048409.0, "step": 2607 }, { "epoch": 2.6531027466937944, "grad_norm": 0.7625808715820312, "learning_rate": 1e-06, "loss": 0.5708, "mean_token_accuracy": 0.8273561596870422, "num_tokens": 831363323.0, "step": 2608 }, { "epoch": 2.65412004069176, "grad_norm": 0.7780117392539978, "learning_rate": 1e-06, "loss": 0.5633, "mean_token_accuracy": 0.82862389087677, "num_tokens": 831680359.0, "step": 2609 }, { "epoch": 2.6551373346897256, "grad_norm": 0.7825818657875061, "learning_rate": 1e-06, "loss": 0.5798, "mean_token_accuracy": 0.8251497745513916, "num_tokens": 831993691.0, "step": 2610 }, { "epoch": 2.6561546286876907, "grad_norm": 0.7896843552589417, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.832328200340271, "num_tokens": 832317882.0, "step": 2611 }, { "epoch": 2.6571719226856563, "grad_norm": 0.7164852023124695, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8292514085769653, "num_tokens": 832638879.0, "step": 2612 }, { "epoch": 2.6581892166836214, "grad_norm": 0.7482007145881653, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.8312879800796509, "num_tokens": 832949967.0, "step": 2613 }, { "epoch": 2.659206510681587, "grad_norm": 0.7153770923614502, "learning_rate": 1e-06, "loss": 0.5603, "mean_token_accuracy": 0.829216480255127, "num_tokens": 833279512.0, "step": 2614 }, { "epoch": 2.6602238046795526, "grad_norm": 0.8966414928436279, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.8260223865509033, "num_tokens": 833592791.0, "step": 2615 }, { "epoch": 2.6612410986775177, "grad_norm": 0.7625566720962524, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.841602087020874, "num_tokens": 833906060.0, "step": 2616 }, { "epoch": 2.6622583926754833, "grad_norm": 0.7611061334609985, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8299546241760254, "num_tokens": 834238761.0, "step": 2617 }, { "epoch": 2.6632756866734484, "grad_norm": 0.7519465684890747, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.831902027130127, "num_tokens": 834556371.0, "step": 2618 }, { "epoch": 2.664292980671414, "grad_norm": 0.8466227054595947, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8316989541053772, "num_tokens": 834886463.0, "step": 2619 }, { "epoch": 2.6653102746693795, "grad_norm": 0.7515255212783813, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8234021067619324, "num_tokens": 835210506.0, "step": 2620 }, { "epoch": 2.666327568667345, "grad_norm": 0.7361791133880615, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8277843594551086, "num_tokens": 835541684.0, "step": 2621 }, { "epoch": 2.6673448626653102, "grad_norm": 0.7728709578514099, "learning_rate": 1e-06, "loss": 0.5231, "mean_token_accuracy": 0.8393305540084839, "num_tokens": 835845878.0, "step": 2622 }, { "epoch": 2.668362156663276, "grad_norm": 0.7608600854873657, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.8346510529518127, "num_tokens": 836171735.0, "step": 2623 }, { "epoch": 2.669379450661241, "grad_norm": 0.7453369498252869, "learning_rate": 1e-06, "loss": 0.5432, "mean_token_accuracy": 0.8335638046264648, "num_tokens": 836490407.0, "step": 2624 }, { "epoch": 2.6703967446592065, "grad_norm": 0.765600323677063, "learning_rate": 1e-06, "loss": 0.5778, "mean_token_accuracy": 0.8250622749328613, "num_tokens": 836806889.0, "step": 2625 }, { "epoch": 2.671414038657172, "grad_norm": 0.7886757850646973, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8383769392967224, "num_tokens": 837114999.0, "step": 2626 }, { "epoch": 2.672431332655137, "grad_norm": 0.7657735347747803, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.8311899900436401, "num_tokens": 837446232.0, "step": 2627 }, { "epoch": 2.673448626653103, "grad_norm": 0.7709946632385254, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.8304938077926636, "num_tokens": 837755582.0, "step": 2628 }, { "epoch": 2.674465920651068, "grad_norm": 0.7618497610092163, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8329755663871765, "num_tokens": 838095626.0, "step": 2629 }, { "epoch": 2.6754832146490335, "grad_norm": 0.7509616613388062, "learning_rate": 1e-06, "loss": 0.5364, "mean_token_accuracy": 0.8354884386062622, "num_tokens": 838424701.0, "step": 2630 }, { "epoch": 2.676500508646999, "grad_norm": 0.7259635329246521, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8430804014205933, "num_tokens": 838749312.0, "step": 2631 }, { "epoch": 2.6775178026449646, "grad_norm": 0.7568365931510925, "learning_rate": 1e-06, "loss": 0.5476, "mean_token_accuracy": 0.8327680826187134, "num_tokens": 839056583.0, "step": 2632 }, { "epoch": 2.6785350966429298, "grad_norm": 0.7711842656135559, "learning_rate": 1e-06, "loss": 0.5324, "mean_token_accuracy": 0.8370242714881897, "num_tokens": 839375666.0, "step": 2633 }, { "epoch": 2.6795523906408953, "grad_norm": 0.8110151886940002, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.8339352607727051, "num_tokens": 839686915.0, "step": 2634 }, { "epoch": 2.6805696846388605, "grad_norm": 0.7888451218605042, "learning_rate": 1e-06, "loss": 0.5525, "mean_token_accuracy": 0.8314832448959351, "num_tokens": 840014745.0, "step": 2635 }, { "epoch": 2.681586978636826, "grad_norm": 0.7647059559822083, "learning_rate": 1e-06, "loss": 0.571, "mean_token_accuracy": 0.8256795406341553, "num_tokens": 840328107.0, "step": 2636 }, { "epoch": 2.6826042726347916, "grad_norm": 0.7674223184585571, "learning_rate": 1e-06, "loss": 0.5298, "mean_token_accuracy": 0.8363380432128906, "num_tokens": 840645654.0, "step": 2637 }, { "epoch": 2.6836215666327567, "grad_norm": 0.8335756659507751, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.8278694152832031, "num_tokens": 840940420.0, "step": 2638 }, { "epoch": 2.6846388606307223, "grad_norm": 0.7549247145652771, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8322106599807739, "num_tokens": 841254500.0, "step": 2639 }, { "epoch": 2.6856561546286875, "grad_norm": 0.8285913467407227, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.8317642211914062, "num_tokens": 841560795.0, "step": 2640 }, { "epoch": 2.686673448626653, "grad_norm": 0.7781467437744141, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.8295000791549683, "num_tokens": 841878106.0, "step": 2641 }, { "epoch": 2.6876907426246186, "grad_norm": 0.8468384742736816, "learning_rate": 1e-06, "loss": 0.5537, "mean_token_accuracy": 0.8307315111160278, "num_tokens": 842189540.0, "step": 2642 }, { "epoch": 2.688708036622584, "grad_norm": 0.7900856137275696, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8336501121520996, "num_tokens": 842497226.0, "step": 2643 }, { "epoch": 2.6897253306205493, "grad_norm": 0.8252914547920227, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.8377597332000732, "num_tokens": 842814612.0, "step": 2644 }, { "epoch": 2.690742624618515, "grad_norm": 0.7623134255409241, "learning_rate": 1e-06, "loss": 0.569, "mean_token_accuracy": 0.827522873878479, "num_tokens": 843117888.0, "step": 2645 }, { "epoch": 2.69175991861648, "grad_norm": 0.7408959865570068, "learning_rate": 1e-06, "loss": 0.538, "mean_token_accuracy": 0.8351221084594727, "num_tokens": 843448573.0, "step": 2646 }, { "epoch": 2.6927772126144456, "grad_norm": 0.8234544992446899, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8329236507415771, "num_tokens": 843753683.0, "step": 2647 }, { "epoch": 2.693794506612411, "grad_norm": 0.7720510363578796, "learning_rate": 1e-06, "loss": 0.5369, "mean_token_accuracy": 0.8356927633285522, "num_tokens": 844084227.0, "step": 2648 }, { "epoch": 2.6948118006103763, "grad_norm": 0.7771438360214233, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.8273162841796875, "num_tokens": 844401669.0, "step": 2649 }, { "epoch": 2.695829094608342, "grad_norm": 0.8102500438690186, "learning_rate": 1e-06, "loss": 0.5292, "mean_token_accuracy": 0.8374292850494385, "num_tokens": 844700667.0, "step": 2650 }, { "epoch": 2.696846388606307, "grad_norm": 0.7536808252334595, "learning_rate": 1e-06, "loss": 0.5578, "mean_token_accuracy": 0.8293795585632324, "num_tokens": 845028014.0, "step": 2651 }, { "epoch": 2.6978636826042726, "grad_norm": 0.7645767331123352, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8358734846115112, "num_tokens": 845341860.0, "step": 2652 }, { "epoch": 2.698880976602238, "grad_norm": 0.7719890475273132, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.8348777890205383, "num_tokens": 845655021.0, "step": 2653 }, { "epoch": 2.6998982706002037, "grad_norm": 0.7685028910636902, "learning_rate": 1e-06, "loss": 0.533, "mean_token_accuracy": 0.8365878462791443, "num_tokens": 845981043.0, "step": 2654 }, { "epoch": 2.700915564598169, "grad_norm": 0.8190966248512268, "learning_rate": 1e-06, "loss": 0.558, "mean_token_accuracy": 0.8298583030700684, "num_tokens": 846289014.0, "step": 2655 }, { "epoch": 2.7019328585961344, "grad_norm": 0.7613459825515747, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8314666152000427, "num_tokens": 846620497.0, "step": 2656 }, { "epoch": 2.7029501525940995, "grad_norm": 0.7502064108848572, "learning_rate": 1e-06, "loss": 0.5247, "mean_token_accuracy": 0.8381735682487488, "num_tokens": 846939843.0, "step": 2657 }, { "epoch": 2.703967446592065, "grad_norm": 0.7535653114318848, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8325021266937256, "num_tokens": 847258615.0, "step": 2658 }, { "epoch": 2.7049847405900307, "grad_norm": 0.7349403500556946, "learning_rate": 1e-06, "loss": 0.535, "mean_token_accuracy": 0.8369818329811096, "num_tokens": 847566136.0, "step": 2659 }, { "epoch": 2.706002034587996, "grad_norm": 1.5821315050125122, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8259924650192261, "num_tokens": 847880060.0, "step": 2660 }, { "epoch": 2.7070193285859614, "grad_norm": 0.8164228796958923, "learning_rate": 1e-06, "loss": 0.5371, "mean_token_accuracy": 0.8358385562896729, "num_tokens": 848174723.0, "step": 2661 }, { "epoch": 2.7080366225839265, "grad_norm": 0.7600139379501343, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8354610800743103, "num_tokens": 848487841.0, "step": 2662 }, { "epoch": 2.709053916581892, "grad_norm": 0.707962155342102, "learning_rate": 1e-06, "loss": 0.5401, "mean_token_accuracy": 0.8348598480224609, "num_tokens": 848802975.0, "step": 2663 }, { "epoch": 2.7100712105798577, "grad_norm": 0.7490245699882507, "learning_rate": 1e-06, "loss": 0.5503, "mean_token_accuracy": 0.8320720195770264, "num_tokens": 849119947.0, "step": 2664 }, { "epoch": 2.7110885045778232, "grad_norm": 0.7445569634437561, "learning_rate": 1e-06, "loss": 0.5256, "mean_token_accuracy": 0.8383816480636597, "num_tokens": 849449730.0, "step": 2665 }, { "epoch": 2.7121057985757884, "grad_norm": 0.7544460892677307, "learning_rate": 1e-06, "loss": 0.5503, "mean_token_accuracy": 0.831578254699707, "num_tokens": 849776273.0, "step": 2666 }, { "epoch": 2.713123092573754, "grad_norm": 0.7565827965736389, "learning_rate": 1e-06, "loss": 0.5535, "mean_token_accuracy": 0.8308531045913696, "num_tokens": 850089497.0, "step": 2667 }, { "epoch": 2.714140386571719, "grad_norm": 0.754690408706665, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.8350880146026611, "num_tokens": 850402164.0, "step": 2668 }, { "epoch": 2.7151576805696847, "grad_norm": 0.7614121437072754, "learning_rate": 1e-06, "loss": 0.5626, "mean_token_accuracy": 0.8297010064125061, "num_tokens": 850732466.0, "step": 2669 }, { "epoch": 2.7161749745676502, "grad_norm": 0.8393460512161255, "learning_rate": 1e-06, "loss": 0.5439, "mean_token_accuracy": 0.8328503370285034, "num_tokens": 851032068.0, "step": 2670 }, { "epoch": 2.7171922685656154, "grad_norm": 0.7551735043525696, "learning_rate": 1e-06, "loss": 0.5409, "mean_token_accuracy": 0.8348469734191895, "num_tokens": 851344640.0, "step": 2671 }, { "epoch": 2.718209562563581, "grad_norm": 0.7437580823898315, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8372975587844849, "num_tokens": 851650774.0, "step": 2672 }, { "epoch": 2.719226856561546, "grad_norm": 0.7455363273620605, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8253104090690613, "num_tokens": 851963425.0, "step": 2673 }, { "epoch": 2.7202441505595116, "grad_norm": 0.7925944328308105, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8333321809768677, "num_tokens": 852286142.0, "step": 2674 }, { "epoch": 2.721261444557477, "grad_norm": 0.7680412530899048, "learning_rate": 1e-06, "loss": 0.5298, "mean_token_accuracy": 0.8373532295227051, "num_tokens": 852582032.0, "step": 2675 }, { "epoch": 2.722278738555443, "grad_norm": 0.7905005216598511, "learning_rate": 1e-06, "loss": 0.5284, "mean_token_accuracy": 0.8377512693405151, "num_tokens": 852884998.0, "step": 2676 }, { "epoch": 2.723296032553408, "grad_norm": 0.7383782863616943, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8325200080871582, "num_tokens": 853215576.0, "step": 2677 }, { "epoch": 2.7243133265513735, "grad_norm": 0.7262946367263794, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8283469080924988, "num_tokens": 853539145.0, "step": 2678 }, { "epoch": 2.7253306205493386, "grad_norm": 0.769798219203949, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.8353898525238037, "num_tokens": 853850299.0, "step": 2679 }, { "epoch": 2.726347914547304, "grad_norm": 0.8584019541740417, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.8308888673782349, "num_tokens": 854162459.0, "step": 2680 }, { "epoch": 2.7273652085452698, "grad_norm": 0.77191162109375, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.839631199836731, "num_tokens": 854480464.0, "step": 2681 }, { "epoch": 2.728382502543235, "grad_norm": 0.7624874711036682, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.8337525725364685, "num_tokens": 854791740.0, "step": 2682 }, { "epoch": 2.7293997965412005, "grad_norm": 0.7865549921989441, "learning_rate": 1e-06, "loss": 0.5271, "mean_token_accuracy": 0.8374840021133423, "num_tokens": 855114343.0, "step": 2683 }, { "epoch": 2.7304170905391656, "grad_norm": 0.7589814066886902, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.8377084732055664, "num_tokens": 855437674.0, "step": 2684 }, { "epoch": 2.731434384537131, "grad_norm": 0.744863748550415, "learning_rate": 1e-06, "loss": 0.5424, "mean_token_accuracy": 0.8346495628356934, "num_tokens": 855766653.0, "step": 2685 }, { "epoch": 2.7324516785350967, "grad_norm": 0.7601133584976196, "learning_rate": 1e-06, "loss": 0.5564, "mean_token_accuracy": 0.8303238153457642, "num_tokens": 856096377.0, "step": 2686 }, { "epoch": 2.7334689725330623, "grad_norm": 0.781333327293396, "learning_rate": 1e-06, "loss": 0.5593, "mean_token_accuracy": 0.828827440738678, "num_tokens": 856408236.0, "step": 2687 }, { "epoch": 2.7344862665310274, "grad_norm": 0.7458904981613159, "learning_rate": 1e-06, "loss": 0.5594, "mean_token_accuracy": 0.8281187415122986, "num_tokens": 856726568.0, "step": 2688 }, { "epoch": 2.735503560528993, "grad_norm": 0.7970263957977295, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.832145631313324, "num_tokens": 857033785.0, "step": 2689 }, { "epoch": 2.736520854526958, "grad_norm": 0.7536654472351074, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.8339976072311401, "num_tokens": 857354595.0, "step": 2690 }, { "epoch": 2.7375381485249237, "grad_norm": 0.7576792240142822, "learning_rate": 1e-06, "loss": 0.5414, "mean_token_accuracy": 0.8351134061813354, "num_tokens": 857678702.0, "step": 2691 }, { "epoch": 2.7385554425228893, "grad_norm": 0.7687288522720337, "learning_rate": 1e-06, "loss": 0.5426, "mean_token_accuracy": 0.8334512710571289, "num_tokens": 858003534.0, "step": 2692 }, { "epoch": 2.7395727365208544, "grad_norm": 0.7608603239059448, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8300853967666626, "num_tokens": 858340027.0, "step": 2693 }, { "epoch": 2.74059003051882, "grad_norm": 0.7603384852409363, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8286716341972351, "num_tokens": 858667008.0, "step": 2694 }, { "epoch": 2.741607324516785, "grad_norm": 0.7376677989959717, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.8316107988357544, "num_tokens": 858997524.0, "step": 2695 }, { "epoch": 2.7426246185147507, "grad_norm": 0.7685206532478333, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.8322591781616211, "num_tokens": 859308138.0, "step": 2696 }, { "epoch": 2.7436419125127163, "grad_norm": 0.8232240676879883, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8303198218345642, "num_tokens": 859624523.0, "step": 2697 }, { "epoch": 2.744659206510682, "grad_norm": 0.7209020256996155, "learning_rate": 1e-06, "loss": 0.5487, "mean_token_accuracy": 0.8319036960601807, "num_tokens": 859964920.0, "step": 2698 }, { "epoch": 2.745676500508647, "grad_norm": 0.8025298118591309, "learning_rate": 1e-06, "loss": 0.5548, "mean_token_accuracy": 0.8306405544281006, "num_tokens": 860276802.0, "step": 2699 }, { "epoch": 2.7466937945066126, "grad_norm": 0.7556748390197754, "learning_rate": 1e-06, "loss": 0.5483, "mean_token_accuracy": 0.8321038484573364, "num_tokens": 860603738.0, "step": 2700 }, { "epoch": 2.7477110885045777, "grad_norm": 0.7994128465652466, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8345211148262024, "num_tokens": 860922428.0, "step": 2701 }, { "epoch": 2.7487283825025433, "grad_norm": 0.7713677287101746, "learning_rate": 1e-06, "loss": 0.513, "mean_token_accuracy": 0.8429844379425049, "num_tokens": 861269373.0, "step": 2702 }, { "epoch": 2.749745676500509, "grad_norm": 0.7557693719863892, "learning_rate": 1e-06, "loss": 0.5532, "mean_token_accuracy": 0.8305192589759827, "num_tokens": 861599524.0, "step": 2703 }, { "epoch": 2.750762970498474, "grad_norm": 0.7688437700271606, "learning_rate": 1e-06, "loss": 0.5587, "mean_token_accuracy": 0.829423189163208, "num_tokens": 861913031.0, "step": 2704 }, { "epoch": 2.7517802644964395, "grad_norm": 0.761762261390686, "learning_rate": 1e-06, "loss": 0.5292, "mean_token_accuracy": 0.8373246192932129, "num_tokens": 862222025.0, "step": 2705 }, { "epoch": 2.7527975584944047, "grad_norm": 0.7933812141418457, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8344542384147644, "num_tokens": 862540675.0, "step": 2706 }, { "epoch": 2.7538148524923702, "grad_norm": 0.7460007667541504, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.8310642242431641, "num_tokens": 862862721.0, "step": 2707 }, { "epoch": 2.754832146490336, "grad_norm": 0.765570878982544, "learning_rate": 1e-06, "loss": 0.5465, "mean_token_accuracy": 0.8333597779273987, "num_tokens": 863189315.0, "step": 2708 }, { "epoch": 2.7558494404883014, "grad_norm": 0.7401432394981384, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.8326635360717773, "num_tokens": 863516158.0, "step": 2709 }, { "epoch": 2.7568667344862665, "grad_norm": 0.7682474851608276, "learning_rate": 1e-06, "loss": 0.5789, "mean_token_accuracy": 0.8240602016448975, "num_tokens": 863840254.0, "step": 2710 }, { "epoch": 2.757884028484232, "grad_norm": 0.768813967704773, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.8326060771942139, "num_tokens": 864145411.0, "step": 2711 }, { "epoch": 2.758901322482197, "grad_norm": 0.7643911242485046, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8430464267730713, "num_tokens": 864463131.0, "step": 2712 }, { "epoch": 2.759918616480163, "grad_norm": 0.7496541142463684, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.8330836296081543, "num_tokens": 864802104.0, "step": 2713 }, { "epoch": 2.7609359104781284, "grad_norm": 0.7185865044593811, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8266865611076355, "num_tokens": 865146102.0, "step": 2714 }, { "epoch": 2.7619532044760935, "grad_norm": 0.8160783052444458, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8358222246170044, "num_tokens": 865438180.0, "step": 2715 }, { "epoch": 2.762970498474059, "grad_norm": 0.7751122117042542, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.8310149908065796, "num_tokens": 865761672.0, "step": 2716 }, { "epoch": 2.763987792472024, "grad_norm": 1.0494272708892822, "learning_rate": 1e-06, "loss": 0.5574, "mean_token_accuracy": 0.8299132585525513, "num_tokens": 866080300.0, "step": 2717 }, { "epoch": 2.7650050864699898, "grad_norm": 0.744733452796936, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.8335196375846863, "num_tokens": 866401658.0, "step": 2718 }, { "epoch": 2.7660223804679553, "grad_norm": 0.7312759160995483, "learning_rate": 1e-06, "loss": 0.538, "mean_token_accuracy": 0.8347940444946289, "num_tokens": 866735792.0, "step": 2719 }, { "epoch": 2.767039674465921, "grad_norm": 0.7587853670120239, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.8313775062561035, "num_tokens": 867069334.0, "step": 2720 }, { "epoch": 2.768056968463886, "grad_norm": 0.7180788516998291, "learning_rate": 1e-06, "loss": 0.5543, "mean_token_accuracy": 0.8317238092422485, "num_tokens": 867389752.0, "step": 2721 }, { "epoch": 2.7690742624618516, "grad_norm": 0.7803786396980286, "learning_rate": 1e-06, "loss": 0.5692, "mean_token_accuracy": 0.8263806104660034, "num_tokens": 867692997.0, "step": 2722 }, { "epoch": 2.7700915564598168, "grad_norm": 0.7313866019248962, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8340601921081543, "num_tokens": 868020290.0, "step": 2723 }, { "epoch": 2.7711088504577823, "grad_norm": 0.7689408659934998, "learning_rate": 1e-06, "loss": 0.555, "mean_token_accuracy": 0.8297499418258667, "num_tokens": 868330610.0, "step": 2724 }, { "epoch": 2.772126144455748, "grad_norm": 0.7322613596916199, "learning_rate": 1e-06, "loss": 0.5546, "mean_token_accuracy": 0.8314656019210815, "num_tokens": 868649173.0, "step": 2725 }, { "epoch": 2.773143438453713, "grad_norm": 0.7614291310310364, "learning_rate": 1e-06, "loss": 0.5379, "mean_token_accuracy": 0.8358194828033447, "num_tokens": 868970827.0, "step": 2726 }, { "epoch": 2.7741607324516786, "grad_norm": 0.7638256549835205, "learning_rate": 1e-06, "loss": 0.529, "mean_token_accuracy": 0.837417721748352, "num_tokens": 869274987.0, "step": 2727 }, { "epoch": 2.7751780264496437, "grad_norm": 0.7863031029701233, "learning_rate": 1e-06, "loss": 0.5476, "mean_token_accuracy": 0.8326071500778198, "num_tokens": 869586556.0, "step": 2728 }, { "epoch": 2.7761953204476093, "grad_norm": 0.7495458126068115, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.8311454653739929, "num_tokens": 869923056.0, "step": 2729 }, { "epoch": 2.777212614445575, "grad_norm": 0.7531720399856567, "learning_rate": 1e-06, "loss": 0.5912, "mean_token_accuracy": 0.8206604719161987, "num_tokens": 870245934.0, "step": 2730 }, { "epoch": 2.7782299084435405, "grad_norm": 0.7870125770568848, "learning_rate": 1e-06, "loss": 0.5331, "mean_token_accuracy": 0.8362097144126892, "num_tokens": 870553892.0, "step": 2731 }, { "epoch": 2.7792472024415056, "grad_norm": 0.8140343427658081, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8341789841651917, "num_tokens": 870854822.0, "step": 2732 }, { "epoch": 2.780264496439471, "grad_norm": 0.7666418552398682, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8256700038909912, "num_tokens": 871168666.0, "step": 2733 }, { "epoch": 2.7812817904374363, "grad_norm": 0.7200734615325928, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8355676531791687, "num_tokens": 871494942.0, "step": 2734 }, { "epoch": 2.782299084435402, "grad_norm": 0.7705289721488953, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8269005417823792, "num_tokens": 871816010.0, "step": 2735 }, { "epoch": 2.7833163784333674, "grad_norm": 0.7768874764442444, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.8357193470001221, "num_tokens": 872141817.0, "step": 2736 }, { "epoch": 2.7843336724313326, "grad_norm": 0.7831196188926697, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8289015293121338, "num_tokens": 872455047.0, "step": 2737 }, { "epoch": 2.785350966429298, "grad_norm": 0.7852113246917725, "learning_rate": 1e-06, "loss": 0.5644, "mean_token_accuracy": 0.8279637694358826, "num_tokens": 872792380.0, "step": 2738 }, { "epoch": 2.7863682604272633, "grad_norm": 0.7540766596794128, "learning_rate": 1e-06, "loss": 0.5606, "mean_token_accuracy": 0.8289576172828674, "num_tokens": 873101516.0, "step": 2739 }, { "epoch": 2.787385554425229, "grad_norm": 0.7995472550392151, "learning_rate": 1e-06, "loss": 0.5445, "mean_token_accuracy": 0.8327668309211731, "num_tokens": 873396170.0, "step": 2740 }, { "epoch": 2.7884028484231944, "grad_norm": 0.7751038074493408, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.8297097682952881, "num_tokens": 873721507.0, "step": 2741 }, { "epoch": 2.78942014242116, "grad_norm": 0.7843905091285706, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.839174211025238, "num_tokens": 874047437.0, "step": 2742 }, { "epoch": 2.790437436419125, "grad_norm": 0.7461211681365967, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8334130644798279, "num_tokens": 874355294.0, "step": 2743 }, { "epoch": 2.7914547304170907, "grad_norm": 0.7744642496109009, "learning_rate": 1e-06, "loss": 0.5389, "mean_token_accuracy": 0.8340868353843689, "num_tokens": 874686080.0, "step": 2744 }, { "epoch": 2.792472024415056, "grad_norm": 0.7094562649726868, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8346304297447205, "num_tokens": 875008971.0, "step": 2745 }, { "epoch": 2.7934893184130214, "grad_norm": 0.7940659523010254, "learning_rate": 1e-06, "loss": 0.5336, "mean_token_accuracy": 0.8369737267494202, "num_tokens": 875321096.0, "step": 2746 }, { "epoch": 2.794506612410987, "grad_norm": 0.7668308019638062, "learning_rate": 1e-06, "loss": 0.5374, "mean_token_accuracy": 0.8349287509918213, "num_tokens": 875635412.0, "step": 2747 }, { "epoch": 2.795523906408952, "grad_norm": 0.8064855933189392, "learning_rate": 1e-06, "loss": 0.5433, "mean_token_accuracy": 0.8344882130622864, "num_tokens": 875976489.0, "step": 2748 }, { "epoch": 2.7965412004069177, "grad_norm": 0.7406098246574402, "learning_rate": 1e-06, "loss": 0.577, "mean_token_accuracy": 0.8245760202407837, "num_tokens": 876298846.0, "step": 2749 }, { "epoch": 2.797558494404883, "grad_norm": 0.7594728469848633, "learning_rate": 1e-06, "loss": 0.5528, "mean_token_accuracy": 0.8307546377182007, "num_tokens": 876621222.0, "step": 2750 }, { "epoch": 2.7985757884028484, "grad_norm": 0.7740033864974976, "learning_rate": 1e-06, "loss": 0.5476, "mean_token_accuracy": 0.8324582576751709, "num_tokens": 876924869.0, "step": 2751 }, { "epoch": 2.799593082400814, "grad_norm": 0.7317264080047607, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.8307446241378784, "num_tokens": 877247182.0, "step": 2752 }, { "epoch": 2.8006103763987795, "grad_norm": 0.7167682647705078, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8371304273605347, "num_tokens": 877562351.0, "step": 2753 }, { "epoch": 2.8016276703967447, "grad_norm": 0.7626464366912842, "learning_rate": 1e-06, "loss": 0.5445, "mean_token_accuracy": 0.8335856199264526, "num_tokens": 877878025.0, "step": 2754 }, { "epoch": 2.8026449643947102, "grad_norm": 0.8109871745109558, "learning_rate": 1e-06, "loss": 0.5758, "mean_token_accuracy": 0.8261090517044067, "num_tokens": 878193570.0, "step": 2755 }, { "epoch": 2.8036622583926754, "grad_norm": 0.7584338188171387, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.8293702602386475, "num_tokens": 878511125.0, "step": 2756 }, { "epoch": 2.804679552390641, "grad_norm": 0.8380006551742554, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.8421043753623962, "num_tokens": 878816670.0, "step": 2757 }, { "epoch": 2.8056968463886065, "grad_norm": 0.7756363153457642, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8319076299667358, "num_tokens": 879137968.0, "step": 2758 }, { "epoch": 2.8067141403865716, "grad_norm": 0.774976372718811, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8325152397155762, "num_tokens": 879459289.0, "step": 2759 }, { "epoch": 2.807731434384537, "grad_norm": 0.765339195728302, "learning_rate": 1e-06, "loss": 0.5511, "mean_token_accuracy": 0.8319852352142334, "num_tokens": 879793433.0, "step": 2760 }, { "epoch": 2.8087487283825023, "grad_norm": 0.72953200340271, "learning_rate": 1e-06, "loss": 0.576, "mean_token_accuracy": 0.8244074583053589, "num_tokens": 880110903.0, "step": 2761 }, { "epoch": 2.809766022380468, "grad_norm": 0.7537474632263184, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8377683758735657, "num_tokens": 880423872.0, "step": 2762 }, { "epoch": 2.8107833163784335, "grad_norm": 0.7519904375076294, "learning_rate": 1e-06, "loss": 0.5383, "mean_token_accuracy": 0.8344398736953735, "num_tokens": 880730161.0, "step": 2763 }, { "epoch": 2.811800610376399, "grad_norm": 0.7822726368904114, "learning_rate": 1e-06, "loss": 0.542, "mean_token_accuracy": 0.8343329429626465, "num_tokens": 881055356.0, "step": 2764 }, { "epoch": 2.812817904374364, "grad_norm": 0.7529635429382324, "learning_rate": 1e-06, "loss": 0.548, "mean_token_accuracy": 0.8321344256401062, "num_tokens": 881381086.0, "step": 2765 }, { "epoch": 2.8138351983723298, "grad_norm": 0.7604416012763977, "learning_rate": 1e-06, "loss": 0.533, "mean_token_accuracy": 0.8359646201133728, "num_tokens": 881697429.0, "step": 2766 }, { "epoch": 2.814852492370295, "grad_norm": 0.7417066693305969, "learning_rate": 1e-06, "loss": 0.5508, "mean_token_accuracy": 0.832127571105957, "num_tokens": 882012327.0, "step": 2767 }, { "epoch": 2.8158697863682605, "grad_norm": 0.7463370561599731, "learning_rate": 1e-06, "loss": 0.5346, "mean_token_accuracy": 0.8359636664390564, "num_tokens": 882333209.0, "step": 2768 }, { "epoch": 2.816887080366226, "grad_norm": 0.768548846244812, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.8318994045257568, "num_tokens": 882643243.0, "step": 2769 }, { "epoch": 2.817904374364191, "grad_norm": 0.7283583283424377, "learning_rate": 1e-06, "loss": 0.5371, "mean_token_accuracy": 0.8356508016586304, "num_tokens": 882969232.0, "step": 2770 }, { "epoch": 2.8189216683621567, "grad_norm": 0.7772321701049805, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8342421054840088, "num_tokens": 883279013.0, "step": 2771 }, { "epoch": 2.819938962360122, "grad_norm": 0.7435513138771057, "learning_rate": 1e-06, "loss": 0.5582, "mean_token_accuracy": 0.829302191734314, "num_tokens": 883592025.0, "step": 2772 }, { "epoch": 2.8209562563580874, "grad_norm": 0.807037889957428, "learning_rate": 1e-06, "loss": 0.5263, "mean_token_accuracy": 0.8380916118621826, "num_tokens": 883908818.0, "step": 2773 }, { "epoch": 2.821973550356053, "grad_norm": 0.7611693739891052, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8256351947784424, "num_tokens": 884225754.0, "step": 2774 }, { "epoch": 2.822990844354018, "grad_norm": 0.7505136728286743, "learning_rate": 1e-06, "loss": 0.567, "mean_token_accuracy": 0.8276066780090332, "num_tokens": 884553353.0, "step": 2775 }, { "epoch": 2.8240081383519837, "grad_norm": 0.7669854760169983, "learning_rate": 1e-06, "loss": 0.567, "mean_token_accuracy": 0.8279123306274414, "num_tokens": 884885565.0, "step": 2776 }, { "epoch": 2.8250254323499493, "grad_norm": 0.7808510065078735, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8301238417625427, "num_tokens": 885208221.0, "step": 2777 }, { "epoch": 2.8260427263479144, "grad_norm": 0.7980822324752808, "learning_rate": 1e-06, "loss": 0.5284, "mean_token_accuracy": 0.8371199369430542, "num_tokens": 885522863.0, "step": 2778 }, { "epoch": 2.82706002034588, "grad_norm": 0.7625985741615295, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8328350782394409, "num_tokens": 885850081.0, "step": 2779 }, { "epoch": 2.8280773143438456, "grad_norm": 0.7917464375495911, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8335134983062744, "num_tokens": 886186789.0, "step": 2780 }, { "epoch": 2.8290946083418107, "grad_norm": 0.7513185143470764, "learning_rate": 1e-06, "loss": 0.5544, "mean_token_accuracy": 0.8289699554443359, "num_tokens": 886496034.0, "step": 2781 }, { "epoch": 2.8301119023397763, "grad_norm": 0.7158359289169312, "learning_rate": 1e-06, "loss": 0.5383, "mean_token_accuracy": 0.8345068693161011, "num_tokens": 886817995.0, "step": 2782 }, { "epoch": 2.8311291963377414, "grad_norm": 0.8091121912002563, "learning_rate": 1e-06, "loss": 0.5396, "mean_token_accuracy": 0.8350147008895874, "num_tokens": 887120555.0, "step": 2783 }, { "epoch": 2.832146490335707, "grad_norm": 0.7498506307601929, "learning_rate": 1e-06, "loss": 0.5332, "mean_token_accuracy": 0.8364719152450562, "num_tokens": 887434554.0, "step": 2784 }, { "epoch": 2.8331637843336726, "grad_norm": 0.7743940353393555, "learning_rate": 1e-06, "loss": 0.5277, "mean_token_accuracy": 0.8380556702613831, "num_tokens": 887749092.0, "step": 2785 }, { "epoch": 2.8341810783316377, "grad_norm": 0.7444809079170227, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8259097337722778, "num_tokens": 888072082.0, "step": 2786 }, { "epoch": 2.8351983723296033, "grad_norm": 0.7832455635070801, "learning_rate": 1e-06, "loss": 0.5545, "mean_token_accuracy": 0.8302660584449768, "num_tokens": 888384734.0, "step": 2787 }, { "epoch": 2.836215666327569, "grad_norm": 0.7678145170211792, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.8378886580467224, "num_tokens": 888705436.0, "step": 2788 }, { "epoch": 2.837232960325534, "grad_norm": 0.7604836821556091, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8355188369750977, "num_tokens": 889025194.0, "step": 2789 }, { "epoch": 2.8382502543234995, "grad_norm": 0.7448104023933411, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8303406238555908, "num_tokens": 889348596.0, "step": 2790 }, { "epoch": 2.839267548321465, "grad_norm": 0.7357156276702881, "learning_rate": 1e-06, "loss": 0.5442, "mean_token_accuracy": 0.8347613215446472, "num_tokens": 889676197.0, "step": 2791 }, { "epoch": 2.8402848423194302, "grad_norm": 0.8427879810333252, "learning_rate": 1e-06, "loss": 0.5516, "mean_token_accuracy": 0.8312827348709106, "num_tokens": 890005117.0, "step": 2792 }, { "epoch": 2.841302136317396, "grad_norm": 0.78268963098526, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.8319834470748901, "num_tokens": 890316760.0, "step": 2793 }, { "epoch": 2.842319430315361, "grad_norm": 0.7581083178520203, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.832615852355957, "num_tokens": 890620871.0, "step": 2794 }, { "epoch": 2.8433367243133265, "grad_norm": 0.7503229975700378, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.8342493176460266, "num_tokens": 890936449.0, "step": 2795 }, { "epoch": 2.844354018311292, "grad_norm": 0.7486345767974854, "learning_rate": 1e-06, "loss": 0.5435, "mean_token_accuracy": 0.8329163789749146, "num_tokens": 891269034.0, "step": 2796 }, { "epoch": 2.845371312309257, "grad_norm": 0.7206019163131714, "learning_rate": 1e-06, "loss": 0.5395, "mean_token_accuracy": 0.834210991859436, "num_tokens": 891601141.0, "step": 2797 }, { "epoch": 2.846388606307223, "grad_norm": 0.7604000568389893, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8274968862533569, "num_tokens": 891911699.0, "step": 2798 }, { "epoch": 2.8474059003051884, "grad_norm": 0.743278443813324, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8318988680839539, "num_tokens": 892236979.0, "step": 2799 }, { "epoch": 2.8484231943031535, "grad_norm": 0.787842869758606, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.8333825469017029, "num_tokens": 892536633.0, "step": 2800 }, { "epoch": 2.849440488301119, "grad_norm": 0.7452677488327026, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8356083631515503, "num_tokens": 892862170.0, "step": 2801 }, { "epoch": 2.8504577822990846, "grad_norm": 0.7698550224304199, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8296544551849365, "num_tokens": 893170520.0, "step": 2802 }, { "epoch": 2.8514750762970498, "grad_norm": 0.7370395064353943, "learning_rate": 1e-06, "loss": 0.5528, "mean_token_accuracy": 0.8322560787200928, "num_tokens": 893480701.0, "step": 2803 }, { "epoch": 2.8524923702950153, "grad_norm": 0.7674466371536255, "learning_rate": 1e-06, "loss": 0.5761, "mean_token_accuracy": 0.8261927962303162, "num_tokens": 893795202.0, "step": 2804 }, { "epoch": 2.8535096642929805, "grad_norm": 0.7611037492752075, "learning_rate": 1e-06, "loss": 0.5366, "mean_token_accuracy": 0.8353677988052368, "num_tokens": 894108299.0, "step": 2805 }, { "epoch": 2.854526958290946, "grad_norm": 0.7476277351379395, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.82884681224823, "num_tokens": 894419933.0, "step": 2806 }, { "epoch": 2.8555442522889116, "grad_norm": 0.737981379032135, "learning_rate": 1e-06, "loss": 0.5426, "mean_token_accuracy": 0.8351168632507324, "num_tokens": 894741124.0, "step": 2807 }, { "epoch": 2.8565615462868768, "grad_norm": 0.7774837017059326, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8259122371673584, "num_tokens": 895056647.0, "step": 2808 }, { "epoch": 2.8575788402848423, "grad_norm": 0.777219831943512, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8236808776855469, "num_tokens": 895380518.0, "step": 2809 }, { "epoch": 2.8585961342828075, "grad_norm": 0.7577489018440247, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.833845317363739, "num_tokens": 895702008.0, "step": 2810 }, { "epoch": 2.859613428280773, "grad_norm": 0.7717519998550415, "learning_rate": 1e-06, "loss": 0.5208, "mean_token_accuracy": 0.8394262194633484, "num_tokens": 896011546.0, "step": 2811 }, { "epoch": 2.8606307222787386, "grad_norm": 0.7280170917510986, "learning_rate": 1e-06, "loss": 0.5353, "mean_token_accuracy": 0.8363389372825623, "num_tokens": 896329246.0, "step": 2812 }, { "epoch": 2.861648016276704, "grad_norm": 0.741651177406311, "learning_rate": 1e-06, "loss": 0.5274, "mean_token_accuracy": 0.8382123112678528, "num_tokens": 896652517.0, "step": 2813 }, { "epoch": 2.8626653102746693, "grad_norm": 0.7922471761703491, "learning_rate": 1e-06, "loss": 0.5365, "mean_token_accuracy": 0.8348798155784607, "num_tokens": 896970551.0, "step": 2814 }, { "epoch": 2.863682604272635, "grad_norm": 0.7952819466590881, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8329468965530396, "num_tokens": 897291892.0, "step": 2815 }, { "epoch": 2.8646998982706, "grad_norm": 0.7516127824783325, "learning_rate": 1e-06, "loss": 0.5476, "mean_token_accuracy": 0.8317283391952515, "num_tokens": 897605769.0, "step": 2816 }, { "epoch": 2.8657171922685656, "grad_norm": 0.7459313869476318, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8376507759094238, "num_tokens": 897914249.0, "step": 2817 }, { "epoch": 2.866734486266531, "grad_norm": 0.7906985878944397, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8333652019500732, "num_tokens": 898228863.0, "step": 2818 }, { "epoch": 2.8677517802644963, "grad_norm": 0.7567861676216125, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8349796533584595, "num_tokens": 898530951.0, "step": 2819 }, { "epoch": 2.868769074262462, "grad_norm": 0.9229597449302673, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8336858749389648, "num_tokens": 898851712.0, "step": 2820 }, { "epoch": 2.869786368260427, "grad_norm": 0.8401355743408203, "learning_rate": 1e-06, "loss": 0.5463, "mean_token_accuracy": 0.8318575620651245, "num_tokens": 899173646.0, "step": 2821 }, { "epoch": 2.8708036622583926, "grad_norm": 0.7329902648925781, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.834531843662262, "num_tokens": 899486743.0, "step": 2822 }, { "epoch": 2.871820956256358, "grad_norm": 0.7627372145652771, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.8335117101669312, "num_tokens": 899809627.0, "step": 2823 }, { "epoch": 2.8728382502543237, "grad_norm": 0.7298281192779541, "learning_rate": 1e-06, "loss": 0.5487, "mean_token_accuracy": 0.8327968120574951, "num_tokens": 900137262.0, "step": 2824 }, { "epoch": 2.873855544252289, "grad_norm": 0.7618030905723572, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.8297356367111206, "num_tokens": 900459708.0, "step": 2825 }, { "epoch": 2.8748728382502544, "grad_norm": 0.7472760081291199, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.8299884796142578, "num_tokens": 900777165.0, "step": 2826 }, { "epoch": 2.8758901322482195, "grad_norm": 0.7818968892097473, "learning_rate": 1e-06, "loss": 0.5309, "mean_token_accuracy": 0.8371264934539795, "num_tokens": 901100446.0, "step": 2827 }, { "epoch": 2.876907426246185, "grad_norm": 0.7526576519012451, "learning_rate": 1e-06, "loss": 0.5294, "mean_token_accuracy": 0.8375152945518494, "num_tokens": 901413081.0, "step": 2828 }, { "epoch": 2.8779247202441507, "grad_norm": 0.8172447085380554, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8350641131401062, "num_tokens": 901726168.0, "step": 2829 }, { "epoch": 2.878942014242116, "grad_norm": 0.7760986089706421, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8311924934387207, "num_tokens": 902035368.0, "step": 2830 }, { "epoch": 2.8799593082400814, "grad_norm": 0.7464948892593384, "learning_rate": 1e-06, "loss": 0.5329, "mean_token_accuracy": 0.8369188904762268, "num_tokens": 902361599.0, "step": 2831 }, { "epoch": 2.8809766022380465, "grad_norm": 0.7712615132331848, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.833279550075531, "num_tokens": 902671212.0, "step": 2832 }, { "epoch": 2.881993896236012, "grad_norm": 0.8632667064666748, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8271939158439636, "num_tokens": 902987411.0, "step": 2833 }, { "epoch": 2.8830111902339777, "grad_norm": 0.7410888075828552, "learning_rate": 1e-06, "loss": 0.5557, "mean_token_accuracy": 0.8314687013626099, "num_tokens": 903308282.0, "step": 2834 }, { "epoch": 2.8840284842319432, "grad_norm": 0.7582542896270752, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8322716355323792, "num_tokens": 903628314.0, "step": 2835 }, { "epoch": 2.8850457782299084, "grad_norm": 0.7456071972846985, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8332905769348145, "num_tokens": 903947269.0, "step": 2836 }, { "epoch": 2.886063072227874, "grad_norm": 0.7716429829597473, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8303126692771912, "num_tokens": 904246591.0, "step": 2837 }, { "epoch": 2.887080366225839, "grad_norm": 0.7811114192008972, "learning_rate": 1e-06, "loss": 0.5485, "mean_token_accuracy": 0.8313155770301819, "num_tokens": 904565888.0, "step": 2838 }, { "epoch": 2.8880976602238047, "grad_norm": 0.8217208981513977, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.8256539702415466, "num_tokens": 904888197.0, "step": 2839 }, { "epoch": 2.8891149542217702, "grad_norm": 0.7821131944656372, "learning_rate": 1e-06, "loss": 0.5544, "mean_token_accuracy": 0.8305214643478394, "num_tokens": 905218458.0, "step": 2840 }, { "epoch": 2.8901322482197354, "grad_norm": 1.0390526056289673, "learning_rate": 1e-06, "loss": 0.5741, "mean_token_accuracy": 0.8258970379829407, "num_tokens": 905543712.0, "step": 2841 }, { "epoch": 2.891149542217701, "grad_norm": 0.8377124667167664, "learning_rate": 1e-06, "loss": 0.5447, "mean_token_accuracy": 0.8334788084030151, "num_tokens": 905848988.0, "step": 2842 }, { "epoch": 2.892166836215666, "grad_norm": 0.7347882390022278, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8319611549377441, "num_tokens": 906155224.0, "step": 2843 }, { "epoch": 2.8931841302136316, "grad_norm": 0.7705765962600708, "learning_rate": 1e-06, "loss": 0.5615, "mean_token_accuracy": 0.8289397954940796, "num_tokens": 906474923.0, "step": 2844 }, { "epoch": 2.894201424211597, "grad_norm": 0.7141281962394714, "learning_rate": 1e-06, "loss": 0.5402, "mean_token_accuracy": 0.8342429399490356, "num_tokens": 906788218.0, "step": 2845 }, { "epoch": 2.895218718209563, "grad_norm": 0.7942697405815125, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8304331302642822, "num_tokens": 907091206.0, "step": 2846 }, { "epoch": 2.896236012207528, "grad_norm": 0.825537919998169, "learning_rate": 1e-06, "loss": 0.5476, "mean_token_accuracy": 0.8328003883361816, "num_tokens": 907426047.0, "step": 2847 }, { "epoch": 2.8972533062054935, "grad_norm": 0.7048806548118591, "learning_rate": 1e-06, "loss": 0.5419, "mean_token_accuracy": 0.8351121544837952, "num_tokens": 907753455.0, "step": 2848 }, { "epoch": 2.8982706002034586, "grad_norm": 0.7680245637893677, "learning_rate": 1e-06, "loss": 0.5238, "mean_token_accuracy": 0.8381252288818359, "num_tokens": 908071238.0, "step": 2849 }, { "epoch": 2.899287894201424, "grad_norm": 0.774487316608429, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.8338207006454468, "num_tokens": 908384365.0, "step": 2850 }, { "epoch": 2.9003051881993898, "grad_norm": 0.8231586217880249, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8338854908943176, "num_tokens": 908692929.0, "step": 2851 }, { "epoch": 2.901322482197355, "grad_norm": 0.7833102941513062, "learning_rate": 1e-06, "loss": 0.5383, "mean_token_accuracy": 0.8346694707870483, "num_tokens": 909009416.0, "step": 2852 }, { "epoch": 2.9023397761953205, "grad_norm": 0.7677398920059204, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8401553630828857, "num_tokens": 909326555.0, "step": 2853 }, { "epoch": 2.9033570701932856, "grad_norm": 0.7626712918281555, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8389687538146973, "num_tokens": 909639003.0, "step": 2854 }, { "epoch": 2.904374364191251, "grad_norm": 0.8620499968528748, "learning_rate": 1e-06, "loss": 0.5594, "mean_token_accuracy": 0.8299873471260071, "num_tokens": 909943894.0, "step": 2855 }, { "epoch": 2.9053916581892167, "grad_norm": 0.7735525369644165, "learning_rate": 1e-06, "loss": 0.5531, "mean_token_accuracy": 0.8311653733253479, "num_tokens": 910282610.0, "step": 2856 }, { "epoch": 2.9064089521871823, "grad_norm": 0.7400574684143066, "learning_rate": 1e-06, "loss": 0.5482, "mean_token_accuracy": 0.8316192030906677, "num_tokens": 910603096.0, "step": 2857 }, { "epoch": 2.9074262461851474, "grad_norm": 0.7685510516166687, "learning_rate": 1e-06, "loss": 0.5411, "mean_token_accuracy": 0.8340034484863281, "num_tokens": 910907889.0, "step": 2858 }, { "epoch": 2.908443540183113, "grad_norm": 0.9150114059448242, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8319932222366333, "num_tokens": 911219506.0, "step": 2859 }, { "epoch": 2.909460834181078, "grad_norm": 0.7258608937263489, "learning_rate": 1e-06, "loss": 0.552, "mean_token_accuracy": 0.8304907083511353, "num_tokens": 911543675.0, "step": 2860 }, { "epoch": 2.9104781281790437, "grad_norm": 0.7376611828804016, "learning_rate": 1e-06, "loss": 0.5464, "mean_token_accuracy": 0.8338150382041931, "num_tokens": 911875094.0, "step": 2861 }, { "epoch": 2.9114954221770093, "grad_norm": 0.823221743106842, "learning_rate": 1e-06, "loss": 0.5582, "mean_token_accuracy": 0.8295491933822632, "num_tokens": 912185571.0, "step": 2862 }, { "epoch": 2.9125127161749744, "grad_norm": 0.7491183280944824, "learning_rate": 1e-06, "loss": 0.5429, "mean_token_accuracy": 0.8328477740287781, "num_tokens": 912513109.0, "step": 2863 }, { "epoch": 2.91353001017294, "grad_norm": 0.7279940843582153, "learning_rate": 1e-06, "loss": 0.5511, "mean_token_accuracy": 0.8309471011161804, "num_tokens": 912850119.0, "step": 2864 }, { "epoch": 2.914547304170905, "grad_norm": 0.7640823721885681, "learning_rate": 1e-06, "loss": 0.5385, "mean_token_accuracy": 0.8364745378494263, "num_tokens": 913178881.0, "step": 2865 }, { "epoch": 2.9155645981688707, "grad_norm": 0.7615811228752136, "learning_rate": 1e-06, "loss": 0.5465, "mean_token_accuracy": 0.833329975605011, "num_tokens": 913518334.0, "step": 2866 }, { "epoch": 2.9165818921668363, "grad_norm": 0.7493634223937988, "learning_rate": 1e-06, "loss": 0.5414, "mean_token_accuracy": 0.8334664702415466, "num_tokens": 913821428.0, "step": 2867 }, { "epoch": 2.917599186164802, "grad_norm": 0.7722597122192383, "learning_rate": 1e-06, "loss": 0.5263, "mean_token_accuracy": 0.8375759720802307, "num_tokens": 914126038.0, "step": 2868 }, { "epoch": 2.918616480162767, "grad_norm": 0.7251677513122559, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8307819366455078, "num_tokens": 914459291.0, "step": 2869 }, { "epoch": 2.9196337741607326, "grad_norm": 0.7932417392730713, "learning_rate": 1e-06, "loss": 0.5181, "mean_token_accuracy": 0.8403124809265137, "num_tokens": 914765805.0, "step": 2870 }, { "epoch": 2.9206510681586977, "grad_norm": 0.8020852208137512, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8264729976654053, "num_tokens": 915083509.0, "step": 2871 }, { "epoch": 2.9216683621566633, "grad_norm": 0.7599645256996155, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8344203233718872, "num_tokens": 915391939.0, "step": 2872 }, { "epoch": 2.922685656154629, "grad_norm": 0.760013222694397, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8350082039833069, "num_tokens": 915708594.0, "step": 2873 }, { "epoch": 2.923702950152594, "grad_norm": 0.7909659743309021, "learning_rate": 1e-06, "loss": 0.5445, "mean_token_accuracy": 0.8332921266555786, "num_tokens": 915997116.0, "step": 2874 }, { "epoch": 2.9247202441505595, "grad_norm": 0.7611994743347168, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.8260282278060913, "num_tokens": 916313182.0, "step": 2875 }, { "epoch": 2.9257375381485247, "grad_norm": 0.8146640062332153, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8340287208557129, "num_tokens": 916617077.0, "step": 2876 }, { "epoch": 2.9267548321464902, "grad_norm": 0.7757108211517334, "learning_rate": 1e-06, "loss": 0.5398, "mean_token_accuracy": 0.8344134092330933, "num_tokens": 916935162.0, "step": 2877 }, { "epoch": 2.927772126144456, "grad_norm": 0.7191690802574158, "learning_rate": 1e-06, "loss": 0.5747, "mean_token_accuracy": 0.8260297775268555, "num_tokens": 917266918.0, "step": 2878 }, { "epoch": 2.9287894201424214, "grad_norm": 0.774037778377533, "learning_rate": 1e-06, "loss": 0.5259, "mean_token_accuracy": 0.8385676741600037, "num_tokens": 917586569.0, "step": 2879 }, { "epoch": 2.9298067141403865, "grad_norm": 0.7555479407310486, "learning_rate": 1e-06, "loss": 0.5636, "mean_token_accuracy": 0.8282998204231262, "num_tokens": 917913379.0, "step": 2880 }, { "epoch": 2.930824008138352, "grad_norm": 0.7959446310997009, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8281709551811218, "num_tokens": 918219033.0, "step": 2881 }, { "epoch": 2.931841302136317, "grad_norm": 0.7435408234596252, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8339255452156067, "num_tokens": 918538998.0, "step": 2882 }, { "epoch": 2.932858596134283, "grad_norm": 0.7767910361289978, "learning_rate": 1e-06, "loss": 0.5318, "mean_token_accuracy": 0.8368157744407654, "num_tokens": 918848120.0, "step": 2883 }, { "epoch": 2.9338758901322484, "grad_norm": 0.7454608082771301, "learning_rate": 1e-06, "loss": 0.5552, "mean_token_accuracy": 0.8298519849777222, "num_tokens": 919158162.0, "step": 2884 }, { "epoch": 2.9348931841302135, "grad_norm": 0.8529176115989685, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8282391428947449, "num_tokens": 919454435.0, "step": 2885 }, { "epoch": 2.935910478128179, "grad_norm": 0.996655285358429, "learning_rate": 1e-06, "loss": 0.5465, "mean_token_accuracy": 0.8331795930862427, "num_tokens": 919767371.0, "step": 2886 }, { "epoch": 2.936927772126144, "grad_norm": 0.8495599627494812, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.8371427655220032, "num_tokens": 920097231.0, "step": 2887 }, { "epoch": 2.9379450661241098, "grad_norm": 0.7650257349014282, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8332249522209167, "num_tokens": 920404124.0, "step": 2888 }, { "epoch": 2.9389623601220753, "grad_norm": 0.799810528755188, "learning_rate": 1e-06, "loss": 0.5786, "mean_token_accuracy": 0.8236322402954102, "num_tokens": 920723542.0, "step": 2889 }, { "epoch": 2.939979654120041, "grad_norm": 0.8185797929763794, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.8314552903175354, "num_tokens": 921032468.0, "step": 2890 }, { "epoch": 2.940996948118006, "grad_norm": 0.7548120617866516, "learning_rate": 1e-06, "loss": 0.5334, "mean_token_accuracy": 0.8355669975280762, "num_tokens": 921343839.0, "step": 2891 }, { "epoch": 2.9420142421159716, "grad_norm": 0.7259658575057983, "learning_rate": 1e-06, "loss": 0.5542, "mean_token_accuracy": 0.8298841714859009, "num_tokens": 921677191.0, "step": 2892 }, { "epoch": 2.9430315361139368, "grad_norm": 0.8542818427085876, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.8286068439483643, "num_tokens": 921985325.0, "step": 2893 }, { "epoch": 2.9440488301119023, "grad_norm": 0.7178732752799988, "learning_rate": 1e-06, "loss": 0.5262, "mean_token_accuracy": 0.8378465175628662, "num_tokens": 922307545.0, "step": 2894 }, { "epoch": 2.945066124109868, "grad_norm": 0.7427535653114319, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8293996453285217, "num_tokens": 922627414.0, "step": 2895 }, { "epoch": 2.946083418107833, "grad_norm": 0.7810771465301514, "learning_rate": 1e-06, "loss": 0.5643, "mean_token_accuracy": 0.8286993503570557, "num_tokens": 922940862.0, "step": 2896 }, { "epoch": 2.9471007121057986, "grad_norm": 0.7892922759056091, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8313384056091309, "num_tokens": 923268455.0, "step": 2897 }, { "epoch": 2.9481180061037637, "grad_norm": 0.759972095489502, "learning_rate": 1e-06, "loss": 0.5409, "mean_token_accuracy": 0.8341248035430908, "num_tokens": 923581013.0, "step": 2898 }, { "epoch": 2.9491353001017293, "grad_norm": 0.7308177351951599, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8310506343841553, "num_tokens": 923889387.0, "step": 2899 }, { "epoch": 2.950152594099695, "grad_norm": 0.7997915148735046, "learning_rate": 1e-06, "loss": 0.5468, "mean_token_accuracy": 0.8328058123588562, "num_tokens": 924193594.0, "step": 2900 }, { "epoch": 2.9511698880976605, "grad_norm": 0.7386446595191956, "learning_rate": 1e-06, "loss": 0.5344, "mean_token_accuracy": 0.8359384536743164, "num_tokens": 924523122.0, "step": 2901 }, { "epoch": 2.9521871820956256, "grad_norm": 0.8074952363967896, "learning_rate": 1e-06, "loss": 0.5181, "mean_token_accuracy": 0.8404185771942139, "num_tokens": 924829073.0, "step": 2902 }, { "epoch": 2.953204476093591, "grad_norm": 0.9806423187255859, "learning_rate": 1e-06, "loss": 0.5483, "mean_token_accuracy": 0.8329747915267944, "num_tokens": 925176143.0, "step": 2903 }, { "epoch": 2.9542217700915563, "grad_norm": 0.7590146064758301, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.8392025232315063, "num_tokens": 925477479.0, "step": 2904 }, { "epoch": 2.955239064089522, "grad_norm": 0.8951772451400757, "learning_rate": 1e-06, "loss": 0.5298, "mean_token_accuracy": 0.8377875089645386, "num_tokens": 925794642.0, "step": 2905 }, { "epoch": 2.9562563580874874, "grad_norm": 0.786734938621521, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.8343855738639832, "num_tokens": 926085112.0, "step": 2906 }, { "epoch": 2.9572736520854526, "grad_norm": 0.8489226698875427, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8272536396980286, "num_tokens": 926404523.0, "step": 2907 }, { "epoch": 2.958290946083418, "grad_norm": 0.852146327495575, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8402287364006042, "num_tokens": 926703380.0, "step": 2908 }, { "epoch": 2.9593082400813833, "grad_norm": 0.8744028806686401, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8320158123970032, "num_tokens": 927009636.0, "step": 2909 }, { "epoch": 2.960325534079349, "grad_norm": 0.7487736940383911, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.8285006880760193, "num_tokens": 927349363.0, "step": 2910 }, { "epoch": 2.9613428280773144, "grad_norm": 0.8733537793159485, "learning_rate": 1e-06, "loss": 0.5329, "mean_token_accuracy": 0.8354209661483765, "num_tokens": 927661962.0, "step": 2911 }, { "epoch": 2.96236012207528, "grad_norm": 0.7431246638298035, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8311399221420288, "num_tokens": 927984140.0, "step": 2912 }, { "epoch": 2.963377416073245, "grad_norm": 0.7541692852973938, "learning_rate": 1e-06, "loss": 0.537, "mean_token_accuracy": 0.8358690738677979, "num_tokens": 928321531.0, "step": 2913 }, { "epoch": 2.9643947100712107, "grad_norm": 0.7754326462745667, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8390681743621826, "num_tokens": 928624216.0, "step": 2914 }, { "epoch": 2.965412004069176, "grad_norm": 0.7342763543128967, "learning_rate": 1e-06, "loss": 0.5492, "mean_token_accuracy": 0.8322646617889404, "num_tokens": 928950416.0, "step": 2915 }, { "epoch": 2.9664292980671414, "grad_norm": 0.7890582084655762, "learning_rate": 1e-06, "loss": 0.5525, "mean_token_accuracy": 0.8303789496421814, "num_tokens": 929254662.0, "step": 2916 }, { "epoch": 2.967446592065107, "grad_norm": 0.782354474067688, "learning_rate": 1e-06, "loss": 0.5259, "mean_token_accuracy": 0.8377487659454346, "num_tokens": 929559750.0, "step": 2917 }, { "epoch": 2.968463886063072, "grad_norm": 0.8146892786026001, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8391036987304688, "num_tokens": 929861530.0, "step": 2918 }, { "epoch": 2.9694811800610377, "grad_norm": 0.7631412148475647, "learning_rate": 1e-06, "loss": 0.5488, "mean_token_accuracy": 0.8333353400230408, "num_tokens": 930177144.0, "step": 2919 }, { "epoch": 2.970498474059003, "grad_norm": 0.7770146727561951, "learning_rate": 1e-06, "loss": 0.5629, "mean_token_accuracy": 0.8283876180648804, "num_tokens": 930500065.0, "step": 2920 }, { "epoch": 2.9715157680569684, "grad_norm": 0.9702063202857971, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8294034004211426, "num_tokens": 930809980.0, "step": 2921 }, { "epoch": 2.972533062054934, "grad_norm": 0.7356535196304321, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8369752168655396, "num_tokens": 931124035.0, "step": 2922 }, { "epoch": 2.9735503560528995, "grad_norm": 0.7785635590553284, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.837040901184082, "num_tokens": 931415682.0, "step": 2923 }, { "epoch": 2.9745676500508647, "grad_norm": 0.7457777857780457, "learning_rate": 1e-06, "loss": 0.565, "mean_token_accuracy": 0.8285586833953857, "num_tokens": 931735254.0, "step": 2924 }, { "epoch": 2.9755849440488302, "grad_norm": 0.7707124948501587, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8271319270133972, "num_tokens": 932044046.0, "step": 2925 }, { "epoch": 2.9766022380467954, "grad_norm": 0.7712051272392273, "learning_rate": 1e-06, "loss": 0.5339, "mean_token_accuracy": 0.8353266716003418, "num_tokens": 932350730.0, "step": 2926 }, { "epoch": 2.977619532044761, "grad_norm": 0.7692690491676331, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.833324134349823, "num_tokens": 932673626.0, "step": 2927 }, { "epoch": 2.9786368260427265, "grad_norm": 0.7063543200492859, "learning_rate": 1e-06, "loss": 0.5318, "mean_token_accuracy": 0.8369261026382446, "num_tokens": 933021539.0, "step": 2928 }, { "epoch": 2.9796541200406916, "grad_norm": 0.740020215511322, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8245356678962708, "num_tokens": 933342649.0, "step": 2929 }, { "epoch": 2.980671414038657, "grad_norm": 0.7315794825553894, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8372519016265869, "num_tokens": 933683564.0, "step": 2930 }, { "epoch": 2.9816887080366223, "grad_norm": 0.7954710125923157, "learning_rate": 1e-06, "loss": 0.5584, "mean_token_accuracy": 0.8298385143280029, "num_tokens": 933996328.0, "step": 2931 }, { "epoch": 2.982706002034588, "grad_norm": 0.7554163932800293, "learning_rate": 1e-06, "loss": 0.5305, "mean_token_accuracy": 0.8375445604324341, "num_tokens": 934323985.0, "step": 2932 }, { "epoch": 2.9837232960325535, "grad_norm": 0.7268316745758057, "learning_rate": 1e-06, "loss": 0.5388, "mean_token_accuracy": 0.8338518142700195, "num_tokens": 934651946.0, "step": 2933 }, { "epoch": 2.984740590030519, "grad_norm": 0.7595980763435364, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.836643397808075, "num_tokens": 934983408.0, "step": 2934 }, { "epoch": 2.985757884028484, "grad_norm": 0.7359001636505127, "learning_rate": 1e-06, "loss": 0.5362, "mean_token_accuracy": 0.8356486558914185, "num_tokens": 935295080.0, "step": 2935 }, { "epoch": 2.9867751780264498, "grad_norm": 0.7544114589691162, "learning_rate": 1e-06, "loss": 0.5444, "mean_token_accuracy": 0.8339033126831055, "num_tokens": 935616845.0, "step": 2936 }, { "epoch": 2.987792472024415, "grad_norm": 0.9193035364151001, "learning_rate": 1e-06, "loss": 0.5393, "mean_token_accuracy": 0.8357737064361572, "num_tokens": 935925552.0, "step": 2937 }, { "epoch": 2.9888097660223805, "grad_norm": 0.7781708836555481, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.8341381549835205, "num_tokens": 936253904.0, "step": 2938 }, { "epoch": 2.989827060020346, "grad_norm": 0.7724592685699463, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8389288187026978, "num_tokens": 936577899.0, "step": 2939 }, { "epoch": 2.990844354018311, "grad_norm": 0.8233632445335388, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.832964301109314, "num_tokens": 936910362.0, "step": 2940 }, { "epoch": 2.9918616480162767, "grad_norm": 0.7473217248916626, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8334630727767944, "num_tokens": 937231947.0, "step": 2941 }, { "epoch": 2.992878942014242, "grad_norm": 0.8038507699966431, "learning_rate": 1e-06, "loss": 0.5692, "mean_token_accuracy": 0.8273025751113892, "num_tokens": 937547528.0, "step": 2942 }, { "epoch": 2.9938962360122074, "grad_norm": 0.7563943862915039, "learning_rate": 1e-06, "loss": 0.5299, "mean_token_accuracy": 0.8375269174575806, "num_tokens": 937866031.0, "step": 2943 }, { "epoch": 2.994913530010173, "grad_norm": 1.0789949893951416, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8369662165641785, "num_tokens": 938184792.0, "step": 2944 }, { "epoch": 2.9959308240081386, "grad_norm": 0.7308768630027771, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8324853181838989, "num_tokens": 938504623.0, "step": 2945 }, { "epoch": 2.9969481180061037, "grad_norm": 0.8132859468460083, "learning_rate": 1e-06, "loss": 0.5439, "mean_token_accuracy": 0.8332552313804626, "num_tokens": 938827631.0, "step": 2946 }, { "epoch": 2.9979654120040693, "grad_norm": 0.7725070118904114, "learning_rate": 1e-06, "loss": 0.5346, "mean_token_accuracy": 0.8354282379150391, "num_tokens": 939140656.0, "step": 2947 }, { "epoch": 2.9989827060020344, "grad_norm": 0.7818151712417603, "learning_rate": 1e-06, "loss": 0.5342, "mean_token_accuracy": 0.8355613350868225, "num_tokens": 939437597.0, "step": 2948 }, { "epoch": 3.0, "grad_norm": 0.7417168617248535, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8380080461502075, "num_tokens": 939774271.0, "step": 2949 }, { "epoch": 3.0, "step": 2949, "total_flos": 6.428081412041081e+18, "train_loss": 0.6103494446655175, "train_runtime": 7328.7514, "train_samples_per_second": 51.484, "train_steps_per_second": 0.402 } ], "logging_steps": 1, "max_steps": 2949, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1475, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.428081412041081e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }