{ "best_global_step": 9000, "best_metric": 0.44325256, "best_model_checkpoint": "/openpai_config/sft/Long_Cot_data/Stage1-380k-25k-length-Qwen2.5-Coder-7B-Instruct-8p-5e-5/v0-20250829-164426/checkpoint-9000", "epoch": 5.150246834084568, "eval_steps": 1000, "global_step": 12000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00042927666881305, "grad_norm": 1.6802181005477905, "learning_rate": 8.928571428571429e-08, "loss": 0.9940392374992371, "memory(GiB)": 47.57, "step": 1, "token_acc": 0.7531317395493966, "train_speed(iter/s)": 0.017784 }, { "epoch": 0.008585533376261001, "grad_norm": 0.8075768947601318, "learning_rate": 1.7857142857142857e-06, "loss": 0.9834547544780531, "memory(GiB)": 72.72, "step": 20, "token_acc": 0.7346348107371886, "train_speed(iter/s)": 0.071123 }, { "epoch": 0.017171066752522002, "grad_norm": 0.4582468867301941, "learning_rate": 3.5714285714285714e-06, "loss": 0.8647201538085938, "memory(GiB)": 72.72, "step": 40, "token_acc": 0.7403293957393929, "train_speed(iter/s)": 0.078614 }, { "epoch": 0.025756600128783, "grad_norm": 0.22638651728630066, "learning_rate": 5.357142857142857e-06, "loss": 0.756020975112915, "memory(GiB)": 72.72, "step": 60, "token_acc": 0.7670503626615286, "train_speed(iter/s)": 0.081671 }, { "epoch": 0.034342133505044004, "grad_norm": 0.22832736372947693, "learning_rate": 7.142857142857143e-06, "loss": 0.6885409832000733, "memory(GiB)": 72.72, "step": 80, "token_acc": 0.7765815619910137, "train_speed(iter/s)": 0.083578 }, { "epoch": 0.042927666881305004, "grad_norm": 0.1798371970653534, "learning_rate": 8.92857142857143e-06, "loss": 0.6466886043548584, "memory(GiB)": 72.72, "step": 100, "token_acc": 0.7900423674902669, "train_speed(iter/s)": 0.084818 }, { "epoch": 0.051513200257566, "grad_norm": 0.17644034326076508, "learning_rate": 1.0714285714285714e-05, "loss": 0.6123067378997803, "memory(GiB)": 72.72, "step": 120, "token_acc": 0.7967146967110779, "train_speed(iter/s)": 0.086053 }, { "epoch": 0.060098733633827, "grad_norm": 0.20387020707130432, "learning_rate": 1.25e-05, "loss": 0.6003653049468994, "memory(GiB)": 72.72, "step": 140, "token_acc": 0.8265060359403877, "train_speed(iter/s)": 0.086654 }, { "epoch": 0.06868426701008801, "grad_norm": 0.24960927665233612, "learning_rate": 1.4285714285714285e-05, "loss": 0.5757434368133545, "memory(GiB)": 72.72, "step": 160, "token_acc": 0.7987813737966625, "train_speed(iter/s)": 0.087161 }, { "epoch": 0.07726980038634901, "grad_norm": 0.2726881504058838, "learning_rate": 1.6071428571428572e-05, "loss": 0.5653277397155761, "memory(GiB)": 72.72, "step": 180, "token_acc": 0.8027387420432055, "train_speed(iter/s)": 0.087596 }, { "epoch": 0.08585533376261001, "grad_norm": 0.2119862288236618, "learning_rate": 1.785714285714286e-05, "loss": 0.5523943424224853, "memory(GiB)": 72.72, "step": 200, "token_acc": 0.8231761512065608, "train_speed(iter/s)": 0.087961 }, { "epoch": 0.094440867138871, "grad_norm": 0.24396856129169464, "learning_rate": 1.9642857142857145e-05, "loss": 0.5498331546783447, "memory(GiB)": 72.72, "step": 220, "token_acc": 0.7940026244174245, "train_speed(iter/s)": 0.088292 }, { "epoch": 0.103026400515132, "grad_norm": 0.2601749002933502, "learning_rate": 2.1428571428571428e-05, "loss": 0.5398545265197754, "memory(GiB)": 72.72, "step": 240, "token_acc": 0.80989644710031, "train_speed(iter/s)": 0.088502 }, { "epoch": 0.111611933891393, "grad_norm": 0.42718759179115295, "learning_rate": 2.3214285714285715e-05, "loss": 0.5296700477600098, "memory(GiB)": 72.72, "step": 260, "token_acc": 0.8186448573942751, "train_speed(iter/s)": 0.088739 }, { "epoch": 0.120197467267654, "grad_norm": 0.2564183175563812, "learning_rate": 2.5e-05, "loss": 0.5314459323883056, "memory(GiB)": 72.72, "step": 280, "token_acc": 0.8186711788362628, "train_speed(iter/s)": 0.088928 }, { "epoch": 0.128783000643915, "grad_norm": 0.42152953147888184, "learning_rate": 2.6785714285714288e-05, "loss": 0.5304059028625489, "memory(GiB)": 72.72, "step": 300, "token_acc": 0.8313805341388459, "train_speed(iter/s)": 0.089116 }, { "epoch": 0.13736853402017601, "grad_norm": 0.44018375873565674, "learning_rate": 2.857142857142857e-05, "loss": 0.5269341945648194, "memory(GiB)": 72.72, "step": 320, "token_acc": 0.8261322879913329, "train_speed(iter/s)": 0.089293 }, { "epoch": 0.14595406739643701, "grad_norm": 0.332704097032547, "learning_rate": 3.0357142857142857e-05, "loss": 0.5224681854248047, "memory(GiB)": 72.72, "step": 340, "token_acc": 0.8305089071105363, "train_speed(iter/s)": 0.08944 }, { "epoch": 0.15453960077269802, "grad_norm": 0.2763151526451111, "learning_rate": 3.2142857142857144e-05, "loss": 0.5171589374542236, "memory(GiB)": 72.72, "step": 360, "token_acc": 0.8298510336859191, "train_speed(iter/s)": 0.08953 }, { "epoch": 0.16312513414895902, "grad_norm": 0.49729594588279724, "learning_rate": 3.392857142857143e-05, "loss": 0.5136796474456787, "memory(GiB)": 72.72, "step": 380, "token_acc": 0.8310140069023154, "train_speed(iter/s)": 0.089637 }, { "epoch": 0.17171066752522002, "grad_norm": 0.3252655267715454, "learning_rate": 3.571428571428572e-05, "loss": 0.5128469944000245, "memory(GiB)": 72.72, "step": 400, "token_acc": 0.8536899287574551, "train_speed(iter/s)": 0.089729 }, { "epoch": 0.180296200901481, "grad_norm": 0.28958284854888916, "learning_rate": 3.7500000000000003e-05, "loss": 0.5108192920684814, "memory(GiB)": 72.72, "step": 420, "token_acc": 0.8385648117441578, "train_speed(iter/s)": 0.089819 }, { "epoch": 0.188881734277742, "grad_norm": 0.34760820865631104, "learning_rate": 3.928571428571429e-05, "loss": 0.5059587478637695, "memory(GiB)": 72.72, "step": 440, "token_acc": 0.8356636206879049, "train_speed(iter/s)": 0.089855 }, { "epoch": 0.197467267654003, "grad_norm": 0.41139236092567444, "learning_rate": 4.107142857142857e-05, "loss": 0.5062141418457031, "memory(GiB)": 72.72, "step": 460, "token_acc": 0.8206318874596391, "train_speed(iter/s)": 0.089919 }, { "epoch": 0.206052801030264, "grad_norm": 0.3865952789783478, "learning_rate": 4.2857142857142856e-05, "loss": 0.4976132869720459, "memory(GiB)": 72.72, "step": 480, "token_acc": 0.809242185807305, "train_speed(iter/s)": 0.090029 }, { "epoch": 0.214638334406525, "grad_norm": 0.34395724534988403, "learning_rate": 4.464285714285715e-05, "loss": 0.504787015914917, "memory(GiB)": 72.72, "step": 500, "token_acc": 0.8331918132662932, "train_speed(iter/s)": 0.090119 }, { "epoch": 0.223223867782786, "grad_norm": 0.23087145388126373, "learning_rate": 4.642857142857143e-05, "loss": 0.49814434051513673, "memory(GiB)": 72.72, "step": 520, "token_acc": 0.8304561567217256, "train_speed(iter/s)": 0.0902 }, { "epoch": 0.231809401159047, "grad_norm": 0.3384479582309723, "learning_rate": 4.8214285714285716e-05, "loss": 0.49905076026916506, "memory(GiB)": 72.72, "step": 540, "token_acc": 0.8417635120347525, "train_speed(iter/s)": 0.090248 }, { "epoch": 0.240394934535308, "grad_norm": 0.40263310074806213, "learning_rate": 5e-05, "loss": 0.4956265926361084, "memory(GiB)": 72.72, "step": 560, "token_acc": 0.816079870788593, "train_speed(iter/s)": 0.090319 }, { "epoch": 0.248980467911569, "grad_norm": 0.30763953924179077, "learning_rate": 4.999984903632473e-05, "loss": 0.4967645168304443, "memory(GiB)": 72.72, "step": 580, "token_acc": 0.8258691170614004, "train_speed(iter/s)": 0.0903 }, { "epoch": 0.25756600128783, "grad_norm": 0.28709837794303894, "learning_rate": 4.999939614712212e-05, "loss": 0.49540038108825685, "memory(GiB)": 72.72, "step": 600, "token_acc": 0.8345701058201058, "train_speed(iter/s)": 0.090345 }, { "epoch": 0.266151534664091, "grad_norm": 0.27484264969825745, "learning_rate": 4.999864133786175e-05, "loss": 0.4913135051727295, "memory(GiB)": 72.72, "step": 620, "token_acc": 0.8408849265417659, "train_speed(iter/s)": 0.090402 }, { "epoch": 0.27473706804035203, "grad_norm": 0.275291383266449, "learning_rate": 4.999758461765953e-05, "loss": 0.4913851261138916, "memory(GiB)": 72.72, "step": 640, "token_acc": 0.823726404893571, "train_speed(iter/s)": 0.090443 }, { "epoch": 0.28332260141661303, "grad_norm": 0.31161361932754517, "learning_rate": 4.999622599927756e-05, "loss": 0.48822855949401855, "memory(GiB)": 72.72, "step": 660, "token_acc": 0.8308604661462827, "train_speed(iter/s)": 0.090487 }, { "epoch": 0.29190813479287403, "grad_norm": 0.3709673285484314, "learning_rate": 4.999456549912401e-05, "loss": 0.486361026763916, "memory(GiB)": 72.72, "step": 680, "token_acc": 0.8271976771900934, "train_speed(iter/s)": 0.090543 }, { "epoch": 0.30049366816913503, "grad_norm": 0.2165047973394394, "learning_rate": 4.99926031372529e-05, "loss": 0.48601832389831545, "memory(GiB)": 72.72, "step": 700, "token_acc": 0.8387291407835747, "train_speed(iter/s)": 0.090587 }, { "epoch": 0.30907920154539603, "grad_norm": 0.24446570873260498, "learning_rate": 4.999033893736386e-05, "loss": 0.48243279457092286, "memory(GiB)": 72.72, "step": 720, "token_acc": 0.8372941834434668, "train_speed(iter/s)": 0.090636 }, { "epoch": 0.31766473492165703, "grad_norm": 0.24655242264270782, "learning_rate": 4.998777292680182e-05, "loss": 0.48319129943847655, "memory(GiB)": 72.72, "step": 740, "token_acc": 0.8441225801781377, "train_speed(iter/s)": 0.090658 }, { "epoch": 0.32625026829791803, "grad_norm": 0.2514285445213318, "learning_rate": 4.998490513655676e-05, "loss": 0.47730517387390137, "memory(GiB)": 72.72, "step": 760, "token_acc": 0.8397503992980168, "train_speed(iter/s)": 0.090692 }, { "epoch": 0.33483580167417903, "grad_norm": 0.2303766906261444, "learning_rate": 4.998173560126323e-05, "loss": 0.4783301830291748, "memory(GiB)": 72.72, "step": 780, "token_acc": 0.8443087371876962, "train_speed(iter/s)": 0.090725 }, { "epoch": 0.34342133505044004, "grad_norm": 0.2418110966682434, "learning_rate": 4.997826435920003e-05, "loss": 0.47623915672302247, "memory(GiB)": 72.72, "step": 800, "token_acc": 0.8400687170332869, "train_speed(iter/s)": 0.090766 }, { "epoch": 0.35200686842670104, "grad_norm": 0.24591697752475739, "learning_rate": 4.9974491452289664e-05, "loss": 0.47069730758666994, "memory(GiB)": 72.72, "step": 820, "token_acc": 0.833947379545595, "train_speed(iter/s)": 0.090805 }, { "epoch": 0.360592401802962, "grad_norm": 0.17342260479927063, "learning_rate": 4.9970416926097885e-05, "loss": 0.47403693199157715, "memory(GiB)": 72.72, "step": 840, "token_acc": 0.82827573574307, "train_speed(iter/s)": 0.090837 }, { "epoch": 0.369177935179223, "grad_norm": 0.25668865442276, "learning_rate": 4.9966040829833115e-05, "loss": 0.4738626003265381, "memory(GiB)": 72.72, "step": 860, "token_acc": 0.8238550967767793, "train_speed(iter/s)": 0.090859 }, { "epoch": 0.377763468555484, "grad_norm": 0.23179244995117188, "learning_rate": 4.99613632163459e-05, "loss": 0.47292590141296387, "memory(GiB)": 72.72, "step": 880, "token_acc": 0.8182959019634485, "train_speed(iter/s)": 0.090879 }, { "epoch": 0.386349001931745, "grad_norm": 0.220433309674263, "learning_rate": 4.995638414212821e-05, "loss": 0.47188587188720704, "memory(GiB)": 72.72, "step": 900, "token_acc": 0.8470956528576601, "train_speed(iter/s)": 0.090882 }, { "epoch": 0.394934535308006, "grad_norm": 0.18783436715602875, "learning_rate": 4.9951103667312795e-05, "loss": 0.46758122444152833, "memory(GiB)": 72.72, "step": 920, "token_acc": 0.8408150854174393, "train_speed(iter/s)": 0.090902 }, { "epoch": 0.403520068684267, "grad_norm": 0.19517077505588531, "learning_rate": 4.994552185567244e-05, "loss": 0.4659998893737793, "memory(GiB)": 72.72, "step": 940, "token_acc": 0.8483965614563244, "train_speed(iter/s)": 0.090926 }, { "epoch": 0.412105602060528, "grad_norm": 0.21663770079612732, "learning_rate": 4.9939638774619216e-05, "loss": 0.46530804634094236, "memory(GiB)": 72.72, "step": 960, "token_acc": 0.8299559114387157, "train_speed(iter/s)": 0.090953 }, { "epoch": 0.420691135436789, "grad_norm": 0.2215634137392044, "learning_rate": 4.993345449520364e-05, "loss": 0.46740241050720216, "memory(GiB)": 72.72, "step": 980, "token_acc": 0.8444846788598264, "train_speed(iter/s)": 0.090976 }, { "epoch": 0.42927666881305, "grad_norm": 0.26028579473495483, "learning_rate": 4.992696909211384e-05, "loss": 0.4601090431213379, "memory(GiB)": 72.72, "step": 1000, "token_acc": 0.8435964299778611, "train_speed(iter/s)": 0.091007 }, { "epoch": 0.42927666881305, "eval_loss": 0.49964410066604614, "eval_runtime": 68.8659, "eval_samples_per_second": 54.657, "eval_steps_per_second": 0.697, "eval_token_acc": 0.8252659243990504, "step": 1000 }, { "epoch": 0.437862202189311, "grad_norm": 0.1770976185798645, "learning_rate": 4.992018264367464e-05, "loss": 0.4663649082183838, "memory(GiB)": 72.72, "step": 1020, "token_acc": 0.8298894735758832, "train_speed(iter/s)": 0.090155 }, { "epoch": 0.446447735565572, "grad_norm": 0.19963237643241882, "learning_rate": 4.991309523184661e-05, "loss": 0.45961837768554686, "memory(GiB)": 72.72, "step": 1040, "token_acc": 0.8395059398856838, "train_speed(iter/s)": 0.090109 }, { "epoch": 0.455033268941833, "grad_norm": 0.16753822565078735, "learning_rate": 4.9905706942225094e-05, "loss": 0.4637617111206055, "memory(GiB)": 72.72, "step": 1060, "token_acc": 0.8323250084598088, "train_speed(iter/s)": 0.090086 }, { "epoch": 0.463618802318094, "grad_norm": 0.17514312267303467, "learning_rate": 4.989801786403916e-05, "loss": 0.45838212966918945, "memory(GiB)": 72.72, "step": 1080, "token_acc": 0.8644555660931506, "train_speed(iter/s)": 0.090071 }, { "epoch": 0.472204335694355, "grad_norm": 0.18766745924949646, "learning_rate": 4.989002809015052e-05, "loss": 0.46158289909362793, "memory(GiB)": 72.72, "step": 1100, "token_acc": 0.8342647441453608, "train_speed(iter/s)": 0.090073 }, { "epoch": 0.480789869070616, "grad_norm": 0.16132639348506927, "learning_rate": 4.9881737717052436e-05, "loss": 0.4612901210784912, "memory(GiB)": 72.72, "step": 1120, "token_acc": 0.8357742084275915, "train_speed(iter/s)": 0.090059 }, { "epoch": 0.489375402446877, "grad_norm": 0.2307191789150238, "learning_rate": 4.987314684486852e-05, "loss": 0.4583921432495117, "memory(GiB)": 72.72, "step": 1140, "token_acc": 0.8285798810251781, "train_speed(iter/s)": 0.090074 }, { "epoch": 0.497960935823138, "grad_norm": 0.18384596705436707, "learning_rate": 4.9864255577351534e-05, "loss": 0.4601446151733398, "memory(GiB)": 72.72, "step": 1160, "token_acc": 0.8331488125236877, "train_speed(iter/s)": 0.090082 }, { "epoch": 0.506546469199399, "grad_norm": 0.16498738527297974, "learning_rate": 4.985506402188217e-05, "loss": 0.46405863761901855, "memory(GiB)": 72.72, "step": 1180, "token_acc": 0.8514261702005886, "train_speed(iter/s)": 0.090094 }, { "epoch": 0.51513200257566, "grad_norm": 0.20875471830368042, "learning_rate": 4.98455722894677e-05, "loss": 0.4559325695037842, "memory(GiB)": 72.72, "step": 1200, "token_acc": 0.8449181040663494, "train_speed(iter/s)": 0.090127 }, { "epoch": 0.523717535951921, "grad_norm": 0.20588186383247375, "learning_rate": 4.9835780494740655e-05, "loss": 0.4588587760925293, "memory(GiB)": 72.72, "step": 1220, "token_acc": 0.8452262520285315, "train_speed(iter/s)": 0.090144 }, { "epoch": 0.532303069328182, "grad_norm": 0.1740783005952835, "learning_rate": 4.982568875595748e-05, "loss": 0.4509147644042969, "memory(GiB)": 72.72, "step": 1240, "token_acc": 0.8587345890329355, "train_speed(iter/s)": 0.090167 }, { "epoch": 0.5408886027044431, "grad_norm": 0.16246297955513, "learning_rate": 4.981529719499704e-05, "loss": 0.45482635498046875, "memory(GiB)": 72.72, "step": 1260, "token_acc": 0.8503446562311433, "train_speed(iter/s)": 0.09019 }, { "epoch": 0.5494741360807041, "grad_norm": 0.16924946010112762, "learning_rate": 4.98046059373592e-05, "loss": 0.45041213035583494, "memory(GiB)": 72.72, "step": 1280, "token_acc": 0.8577976623734301, "train_speed(iter/s)": 0.090182 }, { "epoch": 0.5580596694569651, "grad_norm": 0.1474551260471344, "learning_rate": 4.979361511216328e-05, "loss": 0.4552830696105957, "memory(GiB)": 72.72, "step": 1300, "token_acc": 0.8588476242043739, "train_speed(iter/s)": 0.090192 }, { "epoch": 0.5666452028332261, "grad_norm": 0.18833380937576294, "learning_rate": 4.978232485214652e-05, "loss": 0.45859723091125487, "memory(GiB)": 72.72, "step": 1320, "token_acc": 0.8404036444064411, "train_speed(iter/s)": 0.090203 }, { "epoch": 0.5752307362094871, "grad_norm": 0.15271180868148804, "learning_rate": 4.977073529366244e-05, "loss": 0.45444612503051757, "memory(GiB)": 72.72, "step": 1340, "token_acc": 0.8709423088586175, "train_speed(iter/s)": 0.090204 }, { "epoch": 0.5838162695857481, "grad_norm": 0.15060073137283325, "learning_rate": 4.975884657667922e-05, "loss": 0.44826583862304686, "memory(GiB)": 72.72, "step": 1360, "token_acc": 0.8445347567633144, "train_speed(iter/s)": 0.090219 }, { "epoch": 0.5924018029620091, "grad_norm": 0.20435456931591034, "learning_rate": 4.974665884477803e-05, "loss": 0.4500474452972412, "memory(GiB)": 72.72, "step": 1380, "token_acc": 0.8436003043631144, "train_speed(iter/s)": 0.090231 }, { "epoch": 0.6009873363382701, "grad_norm": 0.14778906106948853, "learning_rate": 4.9734172245151256e-05, "loss": 0.45103793144226073, "memory(GiB)": 72.72, "step": 1400, "token_acc": 0.83692786963815, "train_speed(iter/s)": 0.090242 }, { "epoch": 0.6095728697145311, "grad_norm": 0.14574205875396729, "learning_rate": 4.972138692860072e-05, "loss": 0.445733642578125, "memory(GiB)": 72.72, "step": 1420, "token_acc": 0.8457673279623152, "train_speed(iter/s)": 0.090256 }, { "epoch": 0.6181584030907921, "grad_norm": 0.16354091465473175, "learning_rate": 4.97083030495359e-05, "loss": 0.44748697280883787, "memory(GiB)": 72.72, "step": 1440, "token_acc": 0.8472576057582539, "train_speed(iter/s)": 0.09027 }, { "epoch": 0.6267439364670531, "grad_norm": 0.14656536281108856, "learning_rate": 4.969492076597203e-05, "loss": 0.44432525634765624, "memory(GiB)": 72.72, "step": 1460, "token_acc": 0.8350947008237803, "train_speed(iter/s)": 0.090292 }, { "epoch": 0.6353294698433141, "grad_norm": 0.16932909190654755, "learning_rate": 4.9681240239528216e-05, "loss": 0.44748797416687014, "memory(GiB)": 72.72, "step": 1480, "token_acc": 0.8489820684323982, "train_speed(iter/s)": 0.090307 }, { "epoch": 0.6439150032195751, "grad_norm": 0.16873878240585327, "learning_rate": 4.9667261635425446e-05, "loss": 0.4508372783660889, "memory(GiB)": 72.72, "step": 1500, "token_acc": 0.8508125264242287, "train_speed(iter/s)": 0.090325 }, { "epoch": 0.6525005365958361, "grad_norm": 0.1554819792509079, "learning_rate": 4.965298512248466e-05, "loss": 0.4475415706634521, "memory(GiB)": 72.72, "step": 1520, "token_acc": 0.8513087716943568, "train_speed(iter/s)": 0.090345 }, { "epoch": 0.6610860699720971, "grad_norm": 0.15099839866161346, "learning_rate": 4.963841087312462e-05, "loss": 0.44126238822937014, "memory(GiB)": 72.72, "step": 1540, "token_acc": 0.8473235774968391, "train_speed(iter/s)": 0.090357 }, { "epoch": 0.6696716033483581, "grad_norm": 0.16528978943824768, "learning_rate": 4.9623539063359925e-05, "loss": 0.44157891273498534, "memory(GiB)": 72.72, "step": 1560, "token_acc": 0.8506024455489073, "train_speed(iter/s)": 0.090379 }, { "epoch": 0.6782571367246191, "grad_norm": 0.1654183566570282, "learning_rate": 4.9608369872798815e-05, "loss": 0.4443850517272949, "memory(GiB)": 72.72, "step": 1580, "token_acc": 0.8580666295200214, "train_speed(iter/s)": 0.090387 }, { "epoch": 0.6868426701008801, "grad_norm": 0.17317461967468262, "learning_rate": 4.9592903484641026e-05, "loss": 0.44514150619506837, "memory(GiB)": 72.72, "step": 1600, "token_acc": 0.8373144994303555, "train_speed(iter/s)": 0.090402 }, { "epoch": 0.6954282034771411, "grad_norm": 0.14516599476337433, "learning_rate": 4.9577140085675586e-05, "loss": 0.4465588092803955, "memory(GiB)": 72.72, "step": 1620, "token_acc": 0.8457774631145212, "train_speed(iter/s)": 0.090411 }, { "epoch": 0.7040137368534021, "grad_norm": 0.19526512920856476, "learning_rate": 4.956107986627855e-05, "loss": 0.44002666473388674, "memory(GiB)": 72.72, "step": 1640, "token_acc": 0.8571882184288229, "train_speed(iter/s)": 0.090425 }, { "epoch": 0.7125992702296631, "grad_norm": 0.15198639035224915, "learning_rate": 4.954472302041069e-05, "loss": 0.4411801815032959, "memory(GiB)": 72.72, "step": 1660, "token_acc": 0.8389074986086463, "train_speed(iter/s)": 0.090436 }, { "epoch": 0.721184803605924, "grad_norm": 0.14642658829689026, "learning_rate": 4.952806974561518e-05, "loss": 0.4408212184906006, "memory(GiB)": 72.72, "step": 1680, "token_acc": 0.8505627783277739, "train_speed(iter/s)": 0.090445 }, { "epoch": 0.729770336982185, "grad_norm": 0.15425585210323334, "learning_rate": 4.951112024301517e-05, "loss": 0.4436194896697998, "memory(GiB)": 72.72, "step": 1700, "token_acc": 0.8396897524541256, "train_speed(iter/s)": 0.090457 }, { "epoch": 0.738355870358446, "grad_norm": 0.1366390883922577, "learning_rate": 4.9493874717311416e-05, "loss": 0.4426912307739258, "memory(GiB)": 72.72, "step": 1720, "token_acc": 0.8376902006111753, "train_speed(iter/s)": 0.090475 }, { "epoch": 0.746941403734707, "grad_norm": 0.18443486094474792, "learning_rate": 4.9476333376779746e-05, "loss": 0.4428090572357178, "memory(GiB)": 72.72, "step": 1740, "token_acc": 0.8405056707361122, "train_speed(iter/s)": 0.090496 }, { "epoch": 0.755526937110968, "grad_norm": 0.16430367529392242, "learning_rate": 4.945849643326857e-05, "loss": 0.4388707637786865, "memory(GiB)": 72.72, "step": 1760, "token_acc": 0.8453185251787173, "train_speed(iter/s)": 0.090513 }, { "epoch": 0.764112470487229, "grad_norm": 0.16152745485305786, "learning_rate": 4.9440364102196345e-05, "loss": 0.43615312576293946, "memory(GiB)": 72.72, "step": 1780, "token_acc": 0.855409006002105, "train_speed(iter/s)": 0.090532 }, { "epoch": 0.77269800386349, "grad_norm": 0.18139781057834625, "learning_rate": 4.942193660254892e-05, "loss": 0.440519380569458, "memory(GiB)": 72.72, "step": 1800, "token_acc": 0.8458556213090118, "train_speed(iter/s)": 0.09055 }, { "epoch": 0.781283537239751, "grad_norm": 0.1560135781764984, "learning_rate": 4.9403214156876966e-05, "loss": 0.4351651191711426, "memory(GiB)": 72.72, "step": 1820, "token_acc": 0.844846138018734, "train_speed(iter/s)": 0.090564 }, { "epoch": 0.789869070616012, "grad_norm": 0.18113134801387787, "learning_rate": 4.9384196991293205e-05, "loss": 0.4427495002746582, "memory(GiB)": 72.72, "step": 1840, "token_acc": 0.8444957533319758, "train_speed(iter/s)": 0.090581 }, { "epoch": 0.798454603992273, "grad_norm": 0.16674058139324188, "learning_rate": 4.9364885335469734e-05, "loss": 0.4387219429016113, "memory(GiB)": 72.72, "step": 1860, "token_acc": 0.862598161076389, "train_speed(iter/s)": 0.090598 }, { "epoch": 0.807040137368534, "grad_norm": 0.1326039731502533, "learning_rate": 4.934527942263523e-05, "loss": 0.4364177703857422, "memory(GiB)": 72.72, "step": 1880, "token_acc": 0.8337704981881752, "train_speed(iter/s)": 0.090612 }, { "epoch": 0.815625670744795, "grad_norm": 0.15598100423812866, "learning_rate": 4.9325379489572165e-05, "loss": 0.4394540309906006, "memory(GiB)": 72.72, "step": 1900, "token_acc": 0.8388467949805115, "train_speed(iter/s)": 0.090628 }, { "epoch": 0.824211204121056, "grad_norm": 0.19666017591953278, "learning_rate": 4.930518577661388e-05, "loss": 0.4369682788848877, "memory(GiB)": 72.72, "step": 1920, "token_acc": 0.8537222609570074, "train_speed(iter/s)": 0.090641 }, { "epoch": 0.832796737497317, "grad_norm": 0.14630870521068573, "learning_rate": 4.928469852764176e-05, "loss": 0.43962607383728025, "memory(GiB)": 72.72, "step": 1940, "token_acc": 0.8393774787079826, "train_speed(iter/s)": 0.090657 }, { "epoch": 0.841382270873578, "grad_norm": 0.1797455996274948, "learning_rate": 4.926391799008223e-05, "loss": 0.4379319190979004, "memory(GiB)": 72.72, "step": 1960, "token_acc": 0.843222227690404, "train_speed(iter/s)": 0.090674 }, { "epoch": 0.849967804249839, "grad_norm": 0.12199361622333527, "learning_rate": 4.92428444149038e-05, "loss": 0.4334880352020264, "memory(GiB)": 72.72, "step": 1980, "token_acc": 0.8431945161599516, "train_speed(iter/s)": 0.090689 }, { "epoch": 0.8585533376261, "grad_norm": 0.14421170949935913, "learning_rate": 4.922147805661402e-05, "loss": 0.43396615982055664, "memory(GiB)": 72.72, "step": 2000, "token_acc": 0.8505196095201227, "train_speed(iter/s)": 0.0907 }, { "epoch": 0.8585533376261, "eval_loss": 0.470032662153244, "eval_runtime": 68.4365, "eval_samples_per_second": 55.0, "eval_steps_per_second": 0.701, "eval_token_acc": 0.8330788522432155, "step": 2000 }, { "epoch": 0.867138871002361, "grad_norm": 0.14598308503627777, "learning_rate": 4.91998191732564e-05, "loss": 0.4354074001312256, "memory(GiB)": 72.72, "step": 2020, "token_acc": 0.8444459301633199, "train_speed(iter/s)": 0.090271 }, { "epoch": 0.875724404378622, "grad_norm": 0.14193296432495117, "learning_rate": 4.917786802640732e-05, "loss": 0.4282365322113037, "memory(GiB)": 72.72, "step": 2040, "token_acc": 0.851482400022546, "train_speed(iter/s)": 0.090242 }, { "epoch": 0.884309937754883, "grad_norm": 0.1344188153743744, "learning_rate": 4.9155624881172834e-05, "loss": 0.4284001350402832, "memory(GiB)": 72.72, "step": 2060, "token_acc": 0.8423048427291708, "train_speed(iter/s)": 0.090228 }, { "epoch": 0.892895471131144, "grad_norm": 0.19214758276939392, "learning_rate": 4.91330900061855e-05, "loss": 0.4374197483062744, "memory(GiB)": 72.72, "step": 2080, "token_acc": 0.838007610676071, "train_speed(iter/s)": 0.09022 }, { "epoch": 0.901481004507405, "grad_norm": 0.14042872190475464, "learning_rate": 4.911026367360114e-05, "loss": 0.4368441104888916, "memory(GiB)": 72.72, "step": 2100, "token_acc": 0.8546391628505924, "train_speed(iter/s)": 0.09023 }, { "epoch": 0.910066537883666, "grad_norm": 0.12134739011526108, "learning_rate": 4.90871461590955e-05, "loss": 0.4329835414886475, "memory(GiB)": 72.72, "step": 2120, "token_acc": 0.8415444091274719, "train_speed(iter/s)": 0.090231 }, { "epoch": 0.918652071259927, "grad_norm": 0.13989004492759705, "learning_rate": 4.906373774186097e-05, "loss": 0.4377878665924072, "memory(GiB)": 72.72, "step": 2140, "token_acc": 0.848478083434529, "train_speed(iter/s)": 0.090235 }, { "epoch": 0.927237604636188, "grad_norm": 0.13958944380283356, "learning_rate": 4.904003870460323e-05, "loss": 0.4368983268737793, "memory(GiB)": 72.72, "step": 2160, "token_acc": 0.8715589150065507, "train_speed(iter/s)": 0.090238 }, { "epoch": 0.935823138012449, "grad_norm": 0.12047629058361053, "learning_rate": 4.901604933353776e-05, "loss": 0.432587194442749, "memory(GiB)": 72.72, "step": 2180, "token_acc": 0.8463879291216281, "train_speed(iter/s)": 0.090247 }, { "epoch": 0.94440867138871, "grad_norm": 0.19937904179096222, "learning_rate": 4.899176991838646e-05, "loss": 0.42923874855041505, "memory(GiB)": 72.72, "step": 2200, "token_acc": 0.8560462814584306, "train_speed(iter/s)": 0.090255 }, { "epoch": 0.952994204764971, "grad_norm": 0.13658791780471802, "learning_rate": 4.896720075237411e-05, "loss": 0.43826861381530763, "memory(GiB)": 72.72, "step": 2220, "token_acc": 0.8582506049536265, "train_speed(iter/s)": 0.090257 }, { "epoch": 0.961579738141232, "grad_norm": 0.1443174183368683, "learning_rate": 4.894234213222484e-05, "loss": 0.4363288879394531, "memory(GiB)": 72.72, "step": 2240, "token_acc": 0.8583811494758153, "train_speed(iter/s)": 0.090269 }, { "epoch": 0.970165271517493, "grad_norm": 0.1416754275560379, "learning_rate": 4.8917194358158534e-05, "loss": 0.43085694313049316, "memory(GiB)": 72.72, "step": 2260, "token_acc": 0.8524799246312664, "train_speed(iter/s)": 0.090285 }, { "epoch": 0.978750804893754, "grad_norm": 0.15419602394104004, "learning_rate": 4.889175773388722e-05, "loss": 0.42989211082458495, "memory(GiB)": 72.72, "step": 2280, "token_acc": 0.8570728938425664, "train_speed(iter/s)": 0.090292 }, { "epoch": 0.987336338270015, "grad_norm": 0.15600045025348663, "learning_rate": 4.886603256661142e-05, "loss": 0.43334760665893557, "memory(GiB)": 72.72, "step": 2300, "token_acc": 0.844059695609059, "train_speed(iter/s)": 0.090301 }, { "epoch": 0.995921871646276, "grad_norm": 0.1368878036737442, "learning_rate": 4.884001916701639e-05, "loss": 0.4333777904510498, "memory(GiB)": 72.72, "step": 2320, "token_acc": 0.841434785356969, "train_speed(iter/s)": 0.090297 }, { "epoch": 1.0042927666881305, "grad_norm": 0.17282716929912567, "learning_rate": 4.881371784926839e-05, "loss": 0.42626185417175294, "memory(GiB)": 72.72, "step": 2340, "token_acc": 0.8547437072110268, "train_speed(iter/s)": 0.090271 }, { "epoch": 1.0128783000643915, "grad_norm": 0.21046976745128632, "learning_rate": 4.878712893101092e-05, "loss": 0.40583181381225586, "memory(GiB)": 72.72, "step": 2360, "token_acc": 0.8494737944090475, "train_speed(iter/s)": 0.09027 }, { "epoch": 1.0214638334406525, "grad_norm": 0.1504330039024353, "learning_rate": 4.8760252733360845e-05, "loss": 0.40615053176879884, "memory(GiB)": 72.72, "step": 2380, "token_acc": 0.861498977359772, "train_speed(iter/s)": 0.090271 }, { "epoch": 1.0300493668169135, "grad_norm": 0.13325518369674683, "learning_rate": 4.8733089580904525e-05, "loss": 0.4108716011047363, "memory(GiB)": 72.72, "step": 2400, "token_acc": 0.8607458709259072, "train_speed(iter/s)": 0.090273 }, { "epoch": 1.0386349001931745, "grad_norm": 0.14907221496105194, "learning_rate": 4.870563980169391e-05, "loss": 0.4110468864440918, "memory(GiB)": 72.72, "step": 2420, "token_acc": 0.8597078066556821, "train_speed(iter/s)": 0.090268 }, { "epoch": 1.0472204335694355, "grad_norm": 0.13383924961090088, "learning_rate": 4.867790372724257e-05, "loss": 0.4098019599914551, "memory(GiB)": 72.72, "step": 2440, "token_acc": 0.8552879722635879, "train_speed(iter/s)": 0.090259 }, { "epoch": 1.0558059669456965, "grad_norm": 0.1269863396883011, "learning_rate": 4.864988169252168e-05, "loss": 0.40692687034606934, "memory(GiB)": 72.72, "step": 2460, "token_acc": 0.8569142548291154, "train_speed(iter/s)": 0.090254 }, { "epoch": 1.0643915003219575, "grad_norm": 0.1471211463212967, "learning_rate": 4.862157403595598e-05, "loss": 0.4115363597869873, "memory(GiB)": 72.72, "step": 2480, "token_acc": 0.8509032023648785, "train_speed(iter/s)": 0.090253 }, { "epoch": 1.0729770336982185, "grad_norm": 0.1170874610543251, "learning_rate": 4.859298109941971e-05, "loss": 0.40721793174743653, "memory(GiB)": 72.72, "step": 2500, "token_acc": 0.8535656636728612, "train_speed(iter/s)": 0.09024 }, { "epoch": 1.0815625670744795, "grad_norm": 0.15042538940906525, "learning_rate": 4.8564103228232445e-05, "loss": 0.4073436737060547, "memory(GiB)": 72.72, "step": 2520, "token_acc": 0.8541391331235382, "train_speed(iter/s)": 0.090233 }, { "epoch": 1.0901481004507405, "grad_norm": 0.13396978378295898, "learning_rate": 4.8534940771154954e-05, "loss": 0.40180039405822754, "memory(GiB)": 72.72, "step": 2540, "token_acc": 0.8529722329553782, "train_speed(iter/s)": 0.09023 }, { "epoch": 1.0987336338270015, "grad_norm": 0.1457211673259735, "learning_rate": 4.850549408038498e-05, "loss": 0.4088040828704834, "memory(GiB)": 72.72, "step": 2560, "token_acc": 0.8557055478261985, "train_speed(iter/s)": 0.090233 }, { "epoch": 1.1073191672032625, "grad_norm": 0.1382468044757843, "learning_rate": 4.8475763511552965e-05, "loss": 0.4087985515594482, "memory(GiB)": 72.72, "step": 2580, "token_acc": 0.8476997133289814, "train_speed(iter/s)": 0.090235 }, { "epoch": 1.1159047005795235, "grad_norm": 0.13849055767059326, "learning_rate": 4.844574942371779e-05, "loss": 0.4051491737365723, "memory(GiB)": 72.72, "step": 2600, "token_acc": 0.8530841075229781, "train_speed(iter/s)": 0.09023 }, { "epoch": 1.1244902339557845, "grad_norm": 0.10844399780035019, "learning_rate": 4.841545217936241e-05, "loss": 0.40656099319458006, "memory(GiB)": 72.72, "step": 2620, "token_acc": 0.8659740741451311, "train_speed(iter/s)": 0.090231 }, { "epoch": 1.1330757673320455, "grad_norm": 0.14399060606956482, "learning_rate": 4.838487214438951e-05, "loss": 0.40219764709472655, "memory(GiB)": 72.72, "step": 2640, "token_acc": 0.8656865378871902, "train_speed(iter/s)": 0.090232 }, { "epoch": 1.1416613007083065, "grad_norm": 0.15220606327056885, "learning_rate": 4.8354009688117026e-05, "loss": 0.409071159362793, "memory(GiB)": 72.72, "step": 2660, "token_acc": 0.8480521276805948, "train_speed(iter/s)": 0.090228 }, { "epoch": 1.1502468340845675, "grad_norm": 0.13173505663871765, "learning_rate": 4.832286518327376e-05, "loss": 0.40669097900390627, "memory(GiB)": 72.72, "step": 2680, "token_acc": 0.8510178845290564, "train_speed(iter/s)": 0.090198 }, { "epoch": 1.1588323674608285, "grad_norm": 0.13876375555992126, "learning_rate": 4.829143900599481e-05, "loss": 0.40750818252563475, "memory(GiB)": 72.72, "step": 2700, "token_acc": 0.8563185312128616, "train_speed(iter/s)": 0.090196 }, { "epoch": 1.1674179008370895, "grad_norm": 0.1286059468984604, "learning_rate": 4.825973153581709e-05, "loss": 0.4104398250579834, "memory(GiB)": 72.72, "step": 2720, "token_acc": 0.8429334625658422, "train_speed(iter/s)": 0.090194 }, { "epoch": 1.1760034342133505, "grad_norm": 0.11903152614831924, "learning_rate": 4.8227743155674684e-05, "loss": 0.405780553817749, "memory(GiB)": 72.72, "step": 2740, "token_acc": 0.8503315207488469, "train_speed(iter/s)": 0.090196 }, { "epoch": 1.1845889675896115, "grad_norm": 0.13296058773994446, "learning_rate": 4.819547425189429e-05, "loss": 0.406817626953125, "memory(GiB)": 72.72, "step": 2760, "token_acc": 0.8561766559029692, "train_speed(iter/s)": 0.090196 }, { "epoch": 1.1931745009658725, "grad_norm": 0.1934213489294052, "learning_rate": 4.816292521419046e-05, "loss": 0.40883073806762693, "memory(GiB)": 72.72, "step": 2780, "token_acc": 0.844781303243432, "train_speed(iter/s)": 0.090191 }, { "epoch": 1.2017600343421335, "grad_norm": 0.14654423296451569, "learning_rate": 4.813009643566101e-05, "loss": 0.40619373321533203, "memory(GiB)": 72.72, "step": 2800, "token_acc": 0.8772289089291062, "train_speed(iter/s)": 0.090194 }, { "epoch": 1.2103455677183945, "grad_norm": 0.15193308889865875, "learning_rate": 4.8096988312782174e-05, "loss": 0.41390376091003417, "memory(GiB)": 72.72, "step": 2820, "token_acc": 0.8615587932421312, "train_speed(iter/s)": 0.089981 }, { "epoch": 1.2189311010946555, "grad_norm": 0.31674590706825256, "learning_rate": 4.8063601245403864e-05, "loss": 0.40833268165588377, "memory(GiB)": 72.72, "step": 2840, "token_acc": 0.8733467856737243, "train_speed(iter/s)": 0.089985 }, { "epoch": 1.2275166344709165, "grad_norm": 0.14927241206169128, "learning_rate": 4.802993563674483e-05, "loss": 0.4076714038848877, "memory(GiB)": 72.72, "step": 2860, "token_acc": 0.854028153160118, "train_speed(iter/s)": 0.089983 }, { "epoch": 1.2361021678471775, "grad_norm": 0.12387314438819885, "learning_rate": 4.7995991893387796e-05, "loss": 0.4103559970855713, "memory(GiB)": 72.72, "step": 2880, "token_acc": 0.8473229063574101, "train_speed(iter/s)": 0.089987 }, { "epoch": 1.2446877012234385, "grad_norm": 0.12055594474077225, "learning_rate": 4.7961770425274545e-05, "loss": 0.4068136215209961, "memory(GiB)": 72.72, "step": 2900, "token_acc": 0.8558365116304547, "train_speed(iter/s)": 0.089986 }, { "epoch": 1.2532732345996995, "grad_norm": 0.15471091866493225, "learning_rate": 4.7927271645700966e-05, "loss": 0.40784463882446287, "memory(GiB)": 72.72, "step": 2920, "token_acc": 0.8654847024471946, "train_speed(iter/s)": 0.089986 }, { "epoch": 1.2618587679759605, "grad_norm": 0.14402052760124207, "learning_rate": 4.789249597131205e-05, "loss": 0.416036319732666, "memory(GiB)": 72.72, "step": 2940, "token_acc": 0.8460941475007567, "train_speed(iter/s)": 0.089991 }, { "epoch": 1.2704443013522215, "grad_norm": 0.12818260490894318, "learning_rate": 4.7857443822096905e-05, "loss": 0.4087369441986084, "memory(GiB)": 72.72, "step": 2960, "token_acc": 0.8485137361496985, "train_speed(iter/s)": 0.089995 }, { "epoch": 1.2790298347284825, "grad_norm": 0.1295406073331833, "learning_rate": 4.7822115621383626e-05, "loss": 0.406325101852417, "memory(GiB)": 72.72, "step": 2980, "token_acc": 0.844768784514136, "train_speed(iter/s)": 0.089994 }, { "epoch": 1.2876153681047435, "grad_norm": 0.12578845024108887, "learning_rate": 4.77865117958342e-05, "loss": 0.4075514316558838, "memory(GiB)": 72.72, "step": 3000, "token_acc": 0.8509595377960147, "train_speed(iter/s)": 0.089997 }, { "epoch": 1.2876153681047435, "eval_loss": 0.458536833524704, "eval_runtime": 73.3656, "eval_samples_per_second": 51.305, "eval_steps_per_second": 0.654, "eval_token_acc": 0.836097728930092, "step": 3000 }, { "epoch": 1.2962009014810045, "grad_norm": 0.14806412160396576, "learning_rate": 4.7750632775439396e-05, "loss": 0.4144165515899658, "memory(GiB)": 72.72, "step": 3020, "token_acc": 0.8443864646089783, "train_speed(iter/s)": 0.089698 }, { "epoch": 1.3047864348572655, "grad_norm": 0.12434946000576019, "learning_rate": 4.771447899351351e-05, "loss": 0.4105505466461182, "memory(GiB)": 72.72, "step": 3040, "token_acc": 0.8428811902693311, "train_speed(iter/s)": 0.089674 }, { "epoch": 1.3133719682335265, "grad_norm": 0.13531598448753357, "learning_rate": 4.767805088668916e-05, "loss": 0.40719943046569823, "memory(GiB)": 72.72, "step": 3060, "token_acc": 0.8596681679947646, "train_speed(iter/s)": 0.089654 }, { "epoch": 1.3219575016097875, "grad_norm": 0.1139962449669838, "learning_rate": 4.764134889491203e-05, "loss": 0.41121878623962405, "memory(GiB)": 72.72, "step": 3080, "token_acc": 0.8527798587518215, "train_speed(iter/s)": 0.089646 }, { "epoch": 1.3305430349860485, "grad_norm": 0.12814205884933472, "learning_rate": 4.760437346143551e-05, "loss": 0.409865140914917, "memory(GiB)": 72.72, "step": 3100, "token_acc": 0.8476134603221164, "train_speed(iter/s)": 0.089635 }, { "epoch": 1.3391285683623095, "grad_norm": 0.12462539970874786, "learning_rate": 4.7567125032815394e-05, "loss": 0.4104144096374512, "memory(GiB)": 72.72, "step": 3120, "token_acc": 0.8563240702901512, "train_speed(iter/s)": 0.089626 }, { "epoch": 1.3477141017385705, "grad_norm": 0.14763693511486053, "learning_rate": 4.752960405890446e-05, "loss": 0.4084192752838135, "memory(GiB)": 72.72, "step": 3140, "token_acc": 0.8617203660494134, "train_speed(iter/s)": 0.08962 }, { "epoch": 1.3562996351148315, "grad_norm": 0.1228506788611412, "learning_rate": 4.749181099284703e-05, "loss": 0.4092958927154541, "memory(GiB)": 72.72, "step": 3160, "token_acc": 0.8593913560568706, "train_speed(iter/s)": 0.089613 }, { "epoch": 1.3648851684910925, "grad_norm": 0.13346746563911438, "learning_rate": 4.745374629107352e-05, "loss": 0.4028874397277832, "memory(GiB)": 72.72, "step": 3180, "token_acc": 0.8684066693278472, "train_speed(iter/s)": 0.08961 }, { "epoch": 1.3734707018673535, "grad_norm": 0.11734752357006073, "learning_rate": 4.7415410413294914e-05, "loss": 0.40769195556640625, "memory(GiB)": 72.72, "step": 3200, "token_acc": 0.8596272472768909, "train_speed(iter/s)": 0.089609 }, { "epoch": 1.3820562352436145, "grad_norm": 0.12647828459739685, "learning_rate": 4.737680382249721e-05, "loss": 0.40363130569458006, "memory(GiB)": 72.72, "step": 3220, "token_acc": 0.8496863902084465, "train_speed(iter/s)": 0.089608 }, { "epoch": 1.3906417686198755, "grad_norm": 0.1108260527253151, "learning_rate": 4.733792698493584e-05, "loss": 0.40738682746887206, "memory(GiB)": 72.72, "step": 3240, "token_acc": 0.8423361384211572, "train_speed(iter/s)": 0.089609 }, { "epoch": 1.3992273019961365, "grad_norm": 0.12982375919818878, "learning_rate": 4.7298780370130014e-05, "loss": 0.4081905364990234, "memory(GiB)": 72.72, "step": 3260, "token_acc": 0.856714728114282, "train_speed(iter/s)": 0.089608 }, { "epoch": 1.4078128353723975, "grad_norm": 0.11955548077821732, "learning_rate": 4.7259364450857096e-05, "loss": 0.405292272567749, "memory(GiB)": 72.72, "step": 3280, "token_acc": 0.8639308100087719, "train_speed(iter/s)": 0.08961 }, { "epoch": 1.4163983687486585, "grad_norm": 0.13745689392089844, "learning_rate": 4.721967970314684e-05, "loss": 0.40678954124450684, "memory(GiB)": 72.72, "step": 3300, "token_acc": 0.8452769593980903, "train_speed(iter/s)": 0.089609 }, { "epoch": 1.4249839021249195, "grad_norm": 0.12158916145563126, "learning_rate": 4.717972660627567e-05, "loss": 0.40230860710144045, "memory(GiB)": 72.72, "step": 3320, "token_acc": 0.8659293308755474, "train_speed(iter/s)": 0.089614 }, { "epoch": 1.4335694355011805, "grad_norm": 0.14111177623271942, "learning_rate": 4.713950564276091e-05, "loss": 0.4016873359680176, "memory(GiB)": 72.72, "step": 3340, "token_acc": 0.8510812474231116, "train_speed(iter/s)": 0.089592 }, { "epoch": 1.4421549688774415, "grad_norm": 0.10712361335754395, "learning_rate": 4.70990172983549e-05, "loss": 0.4058821201324463, "memory(GiB)": 72.72, "step": 3360, "token_acc": 0.8550924401373665, "train_speed(iter/s)": 0.089592 }, { "epoch": 1.4507405022537025, "grad_norm": 0.11166644841432571, "learning_rate": 4.705826206203918e-05, "loss": 0.4066760540008545, "memory(GiB)": 72.72, "step": 3380, "token_acc": 0.8444937034366048, "train_speed(iter/s)": 0.089586 }, { "epoch": 1.4593260356299635, "grad_norm": 0.14026156067848206, "learning_rate": 4.701724042601859e-05, "loss": 0.40719261169433596, "memory(GiB)": 72.72, "step": 3400, "token_acc": 0.8498371056241426, "train_speed(iter/s)": 0.08959 }, { "epoch": 1.4679115690062245, "grad_norm": 0.13125832378864288, "learning_rate": 4.697595288571528e-05, "loss": 0.4064974308013916, "memory(GiB)": 72.72, "step": 3420, "token_acc": 0.8575960472975773, "train_speed(iter/s)": 0.089593 }, { "epoch": 1.4764971023824855, "grad_norm": 0.12359972298145294, "learning_rate": 4.6934399939762746e-05, "loss": 0.4019315242767334, "memory(GiB)": 72.72, "step": 3440, "token_acc": 0.8573588526594907, "train_speed(iter/s)": 0.089592 }, { "epoch": 1.4850826357587465, "grad_norm": 0.15697510540485382, "learning_rate": 4.689258208999983e-05, "loss": 0.4078845500946045, "memory(GiB)": 72.72, "step": 3460, "token_acc": 0.8560958939786878, "train_speed(iter/s)": 0.089591 }, { "epoch": 1.4936681691350076, "grad_norm": 0.11863242089748383, "learning_rate": 4.685049984146463e-05, "loss": 0.4097602844238281, "memory(GiB)": 72.72, "step": 3480, "token_acc": 0.8628702144893777, "train_speed(iter/s)": 0.08959 }, { "epoch": 1.5022537025112686, "grad_norm": 0.11114250868558884, "learning_rate": 4.680815370238843e-05, "loss": 0.40899147987365725, "memory(GiB)": 72.72, "step": 3500, "token_acc": 0.8451921045701701, "train_speed(iter/s)": 0.089584 }, { "epoch": 1.5108392358875296, "grad_norm": 0.1112656220793724, "learning_rate": 4.676554418418953e-05, "loss": 0.40816683769226075, "memory(GiB)": 72.72, "step": 3520, "token_acc": 0.8431806288233773, "train_speed(iter/s)": 0.089584 }, { "epoch": 1.5194247692637906, "grad_norm": 0.11323296278715134, "learning_rate": 4.6722671801467074e-05, "loss": 0.4055006980895996, "memory(GiB)": 72.72, "step": 3540, "token_acc": 0.8815225166268434, "train_speed(iter/s)": 0.089589 }, { "epoch": 1.5280103026400516, "grad_norm": 0.12150542438030243, "learning_rate": 4.6679537071994874e-05, "loss": 0.4004813194274902, "memory(GiB)": 72.72, "step": 3560, "token_acc": 0.8570034017657414, "train_speed(iter/s)": 0.089589 }, { "epoch": 1.5365958360163126, "grad_norm": 0.12244880199432373, "learning_rate": 4.6636140516715104e-05, "loss": 0.4029510021209717, "memory(GiB)": 72.72, "step": 3580, "token_acc": 0.8517572914459227, "train_speed(iter/s)": 0.089593 }, { "epoch": 1.5451813693925736, "grad_norm": 0.1206183210015297, "learning_rate": 4.659248265973205e-05, "loss": 0.40460500717163084, "memory(GiB)": 72.72, "step": 3600, "token_acc": 0.8554049462946347, "train_speed(iter/s)": 0.089596 }, { "epoch": 1.5537669027688346, "grad_norm": 0.1283605545759201, "learning_rate": 4.6548564028305746e-05, "loss": 0.40555410385131835, "memory(GiB)": 72.72, "step": 3620, "token_acc": 0.8552409152003629, "train_speed(iter/s)": 0.0896 }, { "epoch": 1.5623524361450956, "grad_norm": 0.10448771715164185, "learning_rate": 4.650438515284564e-05, "loss": 0.4010280132293701, "memory(GiB)": 72.72, "step": 3640, "token_acc": 0.8516997869926084, "train_speed(iter/s)": 0.089603 }, { "epoch": 1.5709379695213566, "grad_norm": 0.14749032258987427, "learning_rate": 4.645994656690417e-05, "loss": 0.4050903797149658, "memory(GiB)": 72.72, "step": 3660, "token_acc": 0.8502366458426844, "train_speed(iter/s)": 0.0896 }, { "epoch": 1.5795235028976176, "grad_norm": 0.1269512176513672, "learning_rate": 4.6415248807170296e-05, "loss": 0.4045454502105713, "memory(GiB)": 72.72, "step": 3680, "token_acc": 0.8799187339606501, "train_speed(iter/s)": 0.089583 }, { "epoch": 1.5881090362738786, "grad_norm": 0.11708427965641022, "learning_rate": 4.637029241346309e-05, "loss": 0.4028982162475586, "memory(GiB)": 72.72, "step": 3700, "token_acc": 0.8584745030316171, "train_speed(iter/s)": 0.089582 }, { "epoch": 1.5966945696501396, "grad_norm": 0.12971659004688263, "learning_rate": 4.632507792872513e-05, "loss": 0.4027679920196533, "memory(GiB)": 72.72, "step": 3720, "token_acc": 0.8444115651659281, "train_speed(iter/s)": 0.089587 }, { "epoch": 1.6052801030264006, "grad_norm": 0.1406456083059311, "learning_rate": 4.6279605899016007e-05, "loss": 0.4045069694519043, "memory(GiB)": 72.72, "step": 3740, "token_acc": 0.8620591654047942, "train_speed(iter/s)": 0.089588 }, { "epoch": 1.6138656364026616, "grad_norm": 0.12651792168617249, "learning_rate": 4.6233876873505694e-05, "loss": 0.3987946271896362, "memory(GiB)": 72.72, "step": 3760, "token_acc": 0.8604080254900858, "train_speed(iter/s)": 0.089587 }, { "epoch": 1.6224511697789226, "grad_norm": 0.1294124722480774, "learning_rate": 4.618789140446793e-05, "loss": 0.4040426254272461, "memory(GiB)": 72.72, "step": 3780, "token_acc": 0.8575430560407852, "train_speed(iter/s)": 0.089588 }, { "epoch": 1.6310367031551836, "grad_norm": 0.13899479806423187, "learning_rate": 4.614165004727356e-05, "loss": 0.40485129356384275, "memory(GiB)": 72.72, "step": 3800, "token_acc": 0.8618784194621236, "train_speed(iter/s)": 0.089589 }, { "epoch": 1.6396222365314446, "grad_norm": 0.11304246634244919, "learning_rate": 4.609515336038379e-05, "loss": 0.39697728157043455, "memory(GiB)": 72.72, "step": 3820, "token_acc": 0.8657167944284284, "train_speed(iter/s)": 0.089593 }, { "epoch": 1.6482077699077056, "grad_norm": 0.10555765777826309, "learning_rate": 4.604840190534349e-05, "loss": 0.4016863346099854, "memory(GiB)": 72.72, "step": 3840, "token_acc": 0.8618964493040964, "train_speed(iter/s)": 0.089597 }, { "epoch": 1.6567933032839666, "grad_norm": 0.10668028146028519, "learning_rate": 4.600139624677436e-05, "loss": 0.40195555686950685, "memory(GiB)": 72.72, "step": 3860, "token_acc": 0.8585640908572081, "train_speed(iter/s)": 0.089599 }, { "epoch": 1.6653788366602276, "grad_norm": 0.11972223222255707, "learning_rate": 4.5954136952368175e-05, "loss": 0.404964542388916, "memory(GiB)": 72.72, "step": 3880, "token_acc": 0.8751289644195156, "train_speed(iter/s)": 0.089603 }, { "epoch": 1.6739643700364886, "grad_norm": 0.1090841144323349, "learning_rate": 4.590662459287987e-05, "loss": 0.4025224208831787, "memory(GiB)": 72.72, "step": 3900, "token_acc": 0.8712011406091705, "train_speed(iter/s)": 0.089608 }, { "epoch": 1.6825499034127496, "grad_norm": 0.09250445663928986, "learning_rate": 4.585885974212068e-05, "loss": 0.39822845458984374, "memory(GiB)": 72.72, "step": 3920, "token_acc": 0.8478179395649417, "train_speed(iter/s)": 0.089608 }, { "epoch": 1.6911354367890106, "grad_norm": 0.12228672951459885, "learning_rate": 4.58108429769512e-05, "loss": 0.4002052307128906, "memory(GiB)": 72.72, "step": 3940, "token_acc": 0.8550839992606666, "train_speed(iter/s)": 0.089611 }, { "epoch": 1.6997209701652716, "grad_norm": 0.11608360707759857, "learning_rate": 4.576257487727442e-05, "loss": 0.40276689529418946, "memory(GiB)": 72.72, "step": 3960, "token_acc": 0.8589090178774137, "train_speed(iter/s)": 0.089614 }, { "epoch": 1.7083065035415326, "grad_norm": 0.10027152299880981, "learning_rate": 4.571405602602871e-05, "loss": 0.39651687145233155, "memory(GiB)": 72.72, "step": 3980, "token_acc": 0.8630956830570248, "train_speed(iter/s)": 0.089614 }, { "epoch": 1.7168920369177934, "grad_norm": 0.13469679653644562, "learning_rate": 4.5665287009180796e-05, "loss": 0.404406213760376, "memory(GiB)": 72.72, "step": 4000, "token_acc": 0.8562729568578561, "train_speed(iter/s)": 0.089618 }, { "epoch": 1.7168920369177934, "eval_loss": 0.45004475116729736, "eval_runtime": 69.5068, "eval_samples_per_second": 54.153, "eval_steps_per_second": 0.691, "eval_token_acc": 0.838482459020931, "step": 4000 }, { "epoch": 1.7254775702940544, "grad_norm": 0.11884400248527527, "learning_rate": 4.5616268415718686e-05, "loss": 0.4039021968841553, "memory(GiB)": 72.72, "step": 4020, "token_acc": 0.8519779575146431, "train_speed(iter/s)": 0.089391 }, { "epoch": 1.7340631036703154, "grad_norm": 0.11766815185546875, "learning_rate": 4.5567000837644555e-05, "loss": 0.40551328659057617, "memory(GiB)": 72.72, "step": 4040, "token_acc": 0.8603655792648116, "train_speed(iter/s)": 0.089374 }, { "epoch": 1.7426486370465764, "grad_norm": 0.1035754606127739, "learning_rate": 4.551748486996755e-05, "loss": 0.3972191333770752, "memory(GiB)": 72.72, "step": 4060, "token_acc": 0.8441598716065328, "train_speed(iter/s)": 0.08936 }, { "epoch": 1.7512341704228374, "grad_norm": 0.11534030735492706, "learning_rate": 4.5467721110696685e-05, "loss": 0.39623782634735105, "memory(GiB)": 72.72, "step": 4080, "token_acc": 0.8508078067985404, "train_speed(iter/s)": 0.089346 }, { "epoch": 1.7598197037990984, "grad_norm": 0.11770807206630707, "learning_rate": 4.541771016083356e-05, "loss": 0.4031228542327881, "memory(GiB)": 72.72, "step": 4100, "token_acc": 0.8575402257628572, "train_speed(iter/s)": 0.089337 }, { "epoch": 1.7684052371753594, "grad_norm": 0.11031018942594528, "learning_rate": 4.5367452624365107e-05, "loss": 0.39590916633605955, "memory(GiB)": 72.72, "step": 4120, "token_acc": 0.8493938383274198, "train_speed(iter/s)": 0.089333 }, { "epoch": 1.7769907705516204, "grad_norm": 0.12101167440414429, "learning_rate": 4.531694910825632e-05, "loss": 0.4022487163543701, "memory(GiB)": 72.72, "step": 4140, "token_acc": 0.8616033848286162, "train_speed(iter/s)": 0.08933 }, { "epoch": 1.7855763039278814, "grad_norm": 0.12361987680196762, "learning_rate": 4.526620022244293e-05, "loss": 0.3952162265777588, "memory(GiB)": 72.72, "step": 4160, "token_acc": 0.8546911728976807, "train_speed(iter/s)": 0.089329 }, { "epoch": 1.7941618373041424, "grad_norm": 0.11886027455329895, "learning_rate": 4.521520657982399e-05, "loss": 0.3967653751373291, "memory(GiB)": 72.72, "step": 4180, "token_acc": 0.850109229842917, "train_speed(iter/s)": 0.089327 }, { "epoch": 1.8027473706804034, "grad_norm": 0.10228098928928375, "learning_rate": 4.516396879625451e-05, "loss": 0.3982940435409546, "memory(GiB)": 72.72, "step": 4200, "token_acc": 0.8674571957241461, "train_speed(iter/s)": 0.089324 }, { "epoch": 1.8113329040566644, "grad_norm": 0.13192002475261688, "learning_rate": 4.5112487490538033e-05, "loss": 0.4016000747680664, "memory(GiB)": 72.72, "step": 4220, "token_acc": 0.8583699143774935, "train_speed(iter/s)": 0.089324 }, { "epoch": 1.8199184374329254, "grad_norm": 0.13863115012645721, "learning_rate": 4.5060763284419114e-05, "loss": 0.3993339538574219, "memory(GiB)": 72.72, "step": 4240, "token_acc": 0.8529649884386873, "train_speed(iter/s)": 0.089327 }, { "epoch": 1.8285039708091864, "grad_norm": 0.1052585169672966, "learning_rate": 4.500879680257587e-05, "loss": 0.39501266479492186, "memory(GiB)": 72.72, "step": 4260, "token_acc": 0.8490466163025552, "train_speed(iter/s)": 0.089326 }, { "epoch": 1.8370895041854474, "grad_norm": 0.11824264377355576, "learning_rate": 4.495658867261237e-05, "loss": 0.3999388933181763, "memory(GiB)": 72.72, "step": 4280, "token_acc": 0.8604835011176714, "train_speed(iter/s)": 0.08933 }, { "epoch": 1.8456750375617084, "grad_norm": 0.10404901951551437, "learning_rate": 4.490413952505113e-05, "loss": 0.399350905418396, "memory(GiB)": 72.72, "step": 4300, "token_acc": 0.8754184479751959, "train_speed(iter/s)": 0.089333 }, { "epoch": 1.8542605709379694, "grad_norm": 0.11935856193304062, "learning_rate": 4.485144999332541e-05, "loss": 0.3988263845443726, "memory(GiB)": 72.72, "step": 4320, "token_acc": 0.8642416058331645, "train_speed(iter/s)": 0.089334 }, { "epoch": 1.8628461043142304, "grad_norm": 0.12025253474712372, "learning_rate": 4.4798520713771655e-05, "loss": 0.3969618320465088, "memory(GiB)": 72.72, "step": 4340, "token_acc": 0.8592759073410623, "train_speed(iter/s)": 0.089324 }, { "epoch": 1.8714316376904914, "grad_norm": 0.10460798442363739, "learning_rate": 4.474535232562176e-05, "loss": 0.4043170928955078, "memory(GiB)": 72.72, "step": 4360, "token_acc": 0.852819602922532, "train_speed(iter/s)": 0.089327 }, { "epoch": 1.8800171710667524, "grad_norm": 0.10020267218351364, "learning_rate": 4.469194547099532e-05, "loss": 0.3999593734741211, "memory(GiB)": 72.72, "step": 4380, "token_acc": 0.8611075959033526, "train_speed(iter/s)": 0.089328 }, { "epoch": 1.8886027044430134, "grad_norm": 0.12959228456020355, "learning_rate": 4.463830079489196e-05, "loss": 0.39733612537384033, "memory(GiB)": 72.72, "step": 4400, "token_acc": 0.8531978711946401, "train_speed(iter/s)": 0.089335 }, { "epoch": 1.8971882378192744, "grad_norm": 0.11115922778844833, "learning_rate": 4.458441894518348e-05, "loss": 0.4049359321594238, "memory(GiB)": 72.72, "step": 4420, "token_acc": 0.8702030459301568, "train_speed(iter/s)": 0.089338 }, { "epoch": 1.9057737711955354, "grad_norm": 0.10734923928976059, "learning_rate": 4.453030057260604e-05, "loss": 0.40124940872192383, "memory(GiB)": 72.72, "step": 4440, "token_acc": 0.8526137694097369, "train_speed(iter/s)": 0.089343 }, { "epoch": 1.9143593045717964, "grad_norm": 0.10538238286972046, "learning_rate": 4.44759463307523e-05, "loss": 0.3986711263656616, "memory(GiB)": 72.72, "step": 4460, "token_acc": 0.8580899206582427, "train_speed(iter/s)": 0.089347 }, { "epoch": 1.9229448379480574, "grad_norm": 0.11792416125535965, "learning_rate": 4.4421356876063566e-05, "loss": 0.4009650707244873, "memory(GiB)": 72.72, "step": 4480, "token_acc": 0.8415756258347672, "train_speed(iter/s)": 0.089351 }, { "epoch": 1.9315303713243184, "grad_norm": 0.10540692508220673, "learning_rate": 4.4366532867821816e-05, "loss": 0.40032110214233396, "memory(GiB)": 72.72, "step": 4500, "token_acc": 0.8645283673549553, "train_speed(iter/s)": 0.089356 }, { "epoch": 1.9401159047005794, "grad_norm": 0.10806146264076233, "learning_rate": 4.4311474968141745e-05, "loss": 0.4047665596008301, "memory(GiB)": 72.72, "step": 4520, "token_acc": 0.8665738751278136, "train_speed(iter/s)": 0.089358 }, { "epoch": 1.9487014380768404, "grad_norm": 0.0982556939125061, "learning_rate": 4.4256183841962776e-05, "loss": 0.39951965808868406, "memory(GiB)": 72.72, "step": 4540, "token_acc": 0.8557438649716252, "train_speed(iter/s)": 0.08936 }, { "epoch": 1.9572869714531014, "grad_norm": 0.11462666094303131, "learning_rate": 4.420066015704105e-05, "loss": 0.39820613861083987, "memory(GiB)": 72.72, "step": 4560, "token_acc": 0.851616577376715, "train_speed(iter/s)": 0.089365 }, { "epoch": 1.9658725048293624, "grad_norm": 0.12274167686700821, "learning_rate": 4.414490458394134e-05, "loss": 0.39962952136993407, "memory(GiB)": 72.72, "step": 4580, "token_acc": 0.8450544293089454, "train_speed(iter/s)": 0.089369 }, { "epoch": 1.9744580382056234, "grad_norm": 0.11052652448415756, "learning_rate": 4.408891779602892e-05, "loss": 0.40143113136291503, "memory(GiB)": 72.72, "step": 4600, "token_acc": 0.8466183479919549, "train_speed(iter/s)": 0.089369 }, { "epoch": 1.9830435715818844, "grad_norm": 0.11736435443162918, "learning_rate": 4.403270046946151e-05, "loss": 0.39746062755584716, "memory(GiB)": 72.72, "step": 4620, "token_acc": 0.8545920867275066, "train_speed(iter/s)": 0.08937 }, { "epoch": 1.9916291049581454, "grad_norm": 0.09831462055444717, "learning_rate": 4.397625328318104e-05, "loss": 0.40285186767578124, "memory(GiB)": 72.72, "step": 4640, "token_acc": 0.8588040292883812, "train_speed(iter/s)": 0.089375 }, { "epoch": 2.0, "grad_norm": 0.1868225783109665, "learning_rate": 4.3919576918905495e-05, "loss": 0.40441222190856935, "memory(GiB)": 72.72, "step": 4660, "token_acc": 0.8483147592149679, "train_speed(iter/s)": 0.089388 }, { "epoch": 2.008585533376261, "grad_norm": 0.1071023941040039, "learning_rate": 4.3862672061120637e-05, "loss": 0.3615531921386719, "memory(GiB)": 72.72, "step": 4680, "token_acc": 0.8768156740901892, "train_speed(iter/s)": 0.089352 }, { "epoch": 2.017171066752522, "grad_norm": 0.10470844805240631, "learning_rate": 4.3805539397071806e-05, "loss": 0.36854674816131594, "memory(GiB)": 72.72, "step": 4700, "token_acc": 0.8658165567867299, "train_speed(iter/s)": 0.089356 }, { "epoch": 2.025756600128783, "grad_norm": 0.11344057321548462, "learning_rate": 4.374817961675553e-05, "loss": 0.36517815589904784, "memory(GiB)": 72.72, "step": 4720, "token_acc": 0.8573345434699361, "train_speed(iter/s)": 0.08936 }, { "epoch": 2.034342133505044, "grad_norm": 0.12088248133659363, "learning_rate": 4.369059341291131e-05, "loss": 0.3732161045074463, "memory(GiB)": 72.72, "step": 4740, "token_acc": 0.8643586935864834, "train_speed(iter/s)": 0.089232 }, { "epoch": 2.042927666881305, "grad_norm": 0.1079382449388504, "learning_rate": 4.3632781481013105e-05, "loss": 0.3706186294555664, "memory(GiB)": 72.72, "step": 4760, "token_acc": 0.8583230735096428, "train_speed(iter/s)": 0.089236 }, { "epoch": 2.051513200257566, "grad_norm": 0.10861940681934357, "learning_rate": 4.357474451926107e-05, "loss": 0.36578049659729006, "memory(GiB)": 72.72, "step": 4780, "token_acc": 0.8634094633238114, "train_speed(iter/s)": 0.089242 }, { "epoch": 2.060098733633827, "grad_norm": 0.10976995527744293, "learning_rate": 4.351648322857304e-05, "loss": 0.3717454671859741, "memory(GiB)": 72.72, "step": 4800, "token_acc": 0.8754387101732538, "train_speed(iter/s)": 0.089245 }, { "epoch": 2.068684267010088, "grad_norm": 0.11381576955318451, "learning_rate": 4.345799831257612e-05, "loss": 0.3690098524093628, "memory(GiB)": 72.72, "step": 4820, "token_acc": 0.8754805492942517, "train_speed(iter/s)": 0.089251 }, { "epoch": 2.077269800386349, "grad_norm": 0.11746755242347717, "learning_rate": 4.339929047759812e-05, "loss": 0.3719310760498047, "memory(GiB)": 72.72, "step": 4840, "token_acc": 0.8569852569604883, "train_speed(iter/s)": 0.089254 }, { "epoch": 2.08585533376261, "grad_norm": 0.1201663538813591, "learning_rate": 4.334036043265909e-05, "loss": 0.366811728477478, "memory(GiB)": 72.72, "step": 4860, "token_acc": 0.868435326772985, "train_speed(iter/s)": 0.089256 }, { "epoch": 2.094440867138871, "grad_norm": 0.10783121734857559, "learning_rate": 4.3281208889462715e-05, "loss": 0.3673741102218628, "memory(GiB)": 72.72, "step": 4880, "token_acc": 0.8597632948845746, "train_speed(iter/s)": 0.089257 }, { "epoch": 2.103026400515132, "grad_norm": 0.10495728254318237, "learning_rate": 4.3221836562387754e-05, "loss": 0.371392560005188, "memory(GiB)": 72.72, "step": 4900, "token_acc": 0.8613381730879158, "train_speed(iter/s)": 0.08926 }, { "epoch": 2.111611933891393, "grad_norm": 0.12534630298614502, "learning_rate": 4.3162244168479385e-05, "loss": 0.37217743396759034, "memory(GiB)": 72.72, "step": 4920, "token_acc": 0.8550305751583707, "train_speed(iter/s)": 0.089265 }, { "epoch": 2.120197467267654, "grad_norm": 0.11463397741317749, "learning_rate": 4.310243242744055e-05, "loss": 0.37210404872894287, "memory(GiB)": 72.72, "step": 4940, "token_acc": 0.8669802804648793, "train_speed(iter/s)": 0.089267 }, { "epoch": 2.128783000643915, "grad_norm": 0.0987405851483345, "learning_rate": 4.304240206162326e-05, "loss": 0.36531455516815187, "memory(GiB)": 72.72, "step": 4960, "token_acc": 0.8615956192835081, "train_speed(iter/s)": 0.089271 }, { "epoch": 2.137368534020176, "grad_norm": 0.10236576199531555, "learning_rate": 4.2982153796019895e-05, "loss": 0.3683722734451294, "memory(GiB)": 72.72, "step": 4980, "token_acc": 0.8691021414446882, "train_speed(iter/s)": 0.089273 }, { "epoch": 2.145954067396437, "grad_norm": 0.11913823336362839, "learning_rate": 4.292168835825442e-05, "loss": 0.36794998645782473, "memory(GiB)": 72.72, "step": 5000, "token_acc": 0.8603812367895441, "train_speed(iter/s)": 0.089274 }, { "epoch": 2.145954067396437, "eval_loss": 0.44772276282310486, "eval_runtime": 74.9501, "eval_samples_per_second": 50.22, "eval_steps_per_second": 0.64, "eval_token_acc": 0.8396186479726367, "step": 5000 }, { "epoch": 2.154539600772698, "grad_norm": 0.10815497487783432, "learning_rate": 4.286100647857362e-05, "loss": 0.3666555881500244, "memory(GiB)": 72.72, "step": 5020, "token_acc": 0.8487304373111233, "train_speed(iter/s)": 0.089094 }, { "epoch": 2.163125134148959, "grad_norm": 0.10708407312631607, "learning_rate": 4.2800108889838244e-05, "loss": 0.3680349111557007, "memory(GiB)": 72.72, "step": 5040, "token_acc": 0.8607205605794815, "train_speed(iter/s)": 0.089078 }, { "epoch": 2.17171066752522, "grad_norm": 0.10280643403530121, "learning_rate": 4.273899632751422e-05, "loss": 0.3690458297729492, "memory(GiB)": 72.72, "step": 5060, "token_acc": 0.8681282741623693, "train_speed(iter/s)": 0.089068 }, { "epoch": 2.180296200901481, "grad_norm": 0.11067724972963333, "learning_rate": 4.267766952966369e-05, "loss": 0.37246291637420653, "memory(GiB)": 72.72, "step": 5080, "token_acc": 0.8648300486787626, "train_speed(iter/s)": 0.089062 }, { "epoch": 2.188881734277742, "grad_norm": 0.10517250746488571, "learning_rate": 4.261612923693617e-05, "loss": 0.37222487926483155, "memory(GiB)": 72.72, "step": 5100, "token_acc": 0.8561770562371953, "train_speed(iter/s)": 0.089058 }, { "epoch": 2.197467267654003, "grad_norm": 0.11643174290657043, "learning_rate": 4.255437619255955e-05, "loss": 0.37151226997375486, "memory(GiB)": 72.72, "step": 5120, "token_acc": 0.856546833515401, "train_speed(iter/s)": 0.089056 }, { "epoch": 2.206052801030264, "grad_norm": 0.10725266486406326, "learning_rate": 4.2492411142331164e-05, "loss": 0.3672873258590698, "memory(GiB)": 72.72, "step": 5140, "token_acc": 0.8657454419748819, "train_speed(iter/s)": 0.089055 }, { "epoch": 2.214638334406525, "grad_norm": 0.10386510193347931, "learning_rate": 4.243023483460875e-05, "loss": 0.3682314395904541, "memory(GiB)": 72.72, "step": 5160, "token_acc": 0.8692801593001643, "train_speed(iter/s)": 0.089056 }, { "epoch": 2.223223867782786, "grad_norm": 0.11796915531158447, "learning_rate": 4.236784802030141e-05, "loss": 0.3701756000518799, "memory(GiB)": 72.72, "step": 5180, "token_acc": 0.8771635645482831, "train_speed(iter/s)": 0.089057 }, { "epoch": 2.231809401159047, "grad_norm": 0.10015714913606644, "learning_rate": 4.230525145286057e-05, "loss": 0.36999518871307374, "memory(GiB)": 72.72, "step": 5200, "token_acc": 0.8674851697347774, "train_speed(iter/s)": 0.089057 }, { "epoch": 2.240394934535308, "grad_norm": 0.1074676439166069, "learning_rate": 4.224244588827088e-05, "loss": 0.3750225782394409, "memory(GiB)": 72.72, "step": 5220, "token_acc": 0.8527425346133436, "train_speed(iter/s)": 0.089057 }, { "epoch": 2.248980467911569, "grad_norm": 0.10311347991228104, "learning_rate": 4.2179432085041016e-05, "loss": 0.3746063232421875, "memory(GiB)": 72.72, "step": 5240, "token_acc": 0.8669185952544043, "train_speed(iter/s)": 0.089056 }, { "epoch": 2.25756600128783, "grad_norm": 0.11873036623001099, "learning_rate": 4.211621080419463e-05, "loss": 0.37813477516174315, "memory(GiB)": 72.72, "step": 5260, "token_acc": 0.8692531193982356, "train_speed(iter/s)": 0.089056 }, { "epoch": 2.266151534664091, "grad_norm": 0.11505374312400818, "learning_rate": 4.205278280926106e-05, "loss": 0.37494683265686035, "memory(GiB)": 72.72, "step": 5280, "token_acc": 0.8686222108977568, "train_speed(iter/s)": 0.089057 }, { "epoch": 2.274737068040352, "grad_norm": 0.10475321859121323, "learning_rate": 4.198914886626617e-05, "loss": 0.37322399616241453, "memory(GiB)": 72.72, "step": 5300, "token_acc": 0.8642545858709445, "train_speed(iter/s)": 0.089058 }, { "epoch": 2.283322601416613, "grad_norm": 0.10895238816738129, "learning_rate": 4.192530974372307e-05, "loss": 0.37212719917297366, "memory(GiB)": 72.72, "step": 5320, "token_acc": 0.8592036985069942, "train_speed(iter/s)": 0.089059 }, { "epoch": 2.291908134792874, "grad_norm": 0.13440454006195068, "learning_rate": 4.186126621262286e-05, "loss": 0.3748520612716675, "memory(GiB)": 72.72, "step": 5340, "token_acc": 0.8694009430316147, "train_speed(iter/s)": 0.089059 }, { "epoch": 2.300493668169135, "grad_norm": 0.10428149253129959, "learning_rate": 4.1797019046425264e-05, "loss": 0.3729527711868286, "memory(GiB)": 72.72, "step": 5360, "token_acc": 0.8606326299971436, "train_speed(iter/s)": 0.089059 }, { "epoch": 2.309079201545396, "grad_norm": 0.10109774023294449, "learning_rate": 4.173256902104937e-05, "loss": 0.3786268949508667, "memory(GiB)": 72.72, "step": 5380, "token_acc": 0.8546159979614149, "train_speed(iter/s)": 0.089063 }, { "epoch": 2.317664734921657, "grad_norm": 0.1086476594209671, "learning_rate": 4.166791691486417e-05, "loss": 0.37719101905822755, "memory(GiB)": 72.72, "step": 5400, "token_acc": 0.8614693814596865, "train_speed(iter/s)": 0.089065 }, { "epoch": 2.326250268297918, "grad_norm": 0.0986161157488823, "learning_rate": 4.1603063508679254e-05, "loss": 0.3716520071029663, "memory(GiB)": 72.72, "step": 5420, "token_acc": 0.8700038391325128, "train_speed(iter/s)": 0.089068 }, { "epoch": 2.334835801674179, "grad_norm": 0.10710026323795319, "learning_rate": 4.1538009585735296e-05, "loss": 0.37460925579071047, "memory(GiB)": 72.72, "step": 5440, "token_acc": 0.864236101862486, "train_speed(iter/s)": 0.089068 }, { "epoch": 2.34342133505044, "grad_norm": 0.1084044948220253, "learning_rate": 4.1472755931694626e-05, "loss": 0.37008664608001707, "memory(GiB)": 72.72, "step": 5460, "token_acc": 0.884960342611309, "train_speed(iter/s)": 0.08907 }, { "epoch": 2.352006868426701, "grad_norm": 0.1017412543296814, "learning_rate": 4.1407303334631784e-05, "loss": 0.37591137886047366, "memory(GiB)": 72.72, "step": 5480, "token_acc": 0.8690706806478822, "train_speed(iter/s)": 0.089076 }, { "epoch": 2.360592401802962, "grad_norm": 0.09642521291971207, "learning_rate": 4.134165258502392e-05, "loss": 0.3724454641342163, "memory(GiB)": 72.72, "step": 5500, "token_acc": 0.8665798727743621, "train_speed(iter/s)": 0.089078 }, { "epoch": 2.369177935179223, "grad_norm": 0.10195600241422653, "learning_rate": 4.127580447574131e-05, "loss": 0.37389321327209474, "memory(GiB)": 72.72, "step": 5520, "token_acc": 0.8659414758069467, "train_speed(iter/s)": 0.089082 }, { "epoch": 2.377763468555484, "grad_norm": 0.10220296680927277, "learning_rate": 4.120975980203778e-05, "loss": 0.37123832702636717, "memory(GiB)": 72.72, "step": 5540, "token_acc": 0.8599722163416741, "train_speed(iter/s)": 0.089087 }, { "epoch": 2.386349001931745, "grad_norm": 0.09796139597892761, "learning_rate": 4.114351936154105e-05, "loss": 0.37191407680511473, "memory(GiB)": 72.72, "step": 5560, "token_acc": 0.8607424388032349, "train_speed(iter/s)": 0.089089 }, { "epoch": 2.394934535308006, "grad_norm": 0.1000937819480896, "learning_rate": 4.1077083954243134e-05, "loss": 0.3728537082672119, "memory(GiB)": 72.72, "step": 5580, "token_acc": 0.8599175456576579, "train_speed(iter/s)": 0.089092 }, { "epoch": 2.403520068684267, "grad_norm": 0.10470744967460632, "learning_rate": 4.101045438249072e-05, "loss": 0.3749739170074463, "memory(GiB)": 72.72, "step": 5600, "token_acc": 0.8662434580620245, "train_speed(iter/s)": 0.089094 }, { "epoch": 2.412105602060528, "grad_norm": 0.10006117075681686, "learning_rate": 4.0943631450975395e-05, "loss": 0.3695227146148682, "memory(GiB)": 72.72, "step": 5620, "token_acc": 0.8567862235957147, "train_speed(iter/s)": 0.089097 }, { "epoch": 2.420691135436789, "grad_norm": 0.11233365535736084, "learning_rate": 4.0876615966723983e-05, "loss": 0.37129299640655516, "memory(GiB)": 72.72, "step": 5640, "token_acc": 0.8725153838730988, "train_speed(iter/s)": 0.089101 }, { "epoch": 2.42927666881305, "grad_norm": 0.09630627185106277, "learning_rate": 4.080940873908881e-05, "loss": 0.3767483472824097, "memory(GiB)": 72.72, "step": 5660, "token_acc": 0.8623814759151552, "train_speed(iter/s)": 0.089105 }, { "epoch": 2.437862202189311, "grad_norm": 0.11699684709310532, "learning_rate": 4.0742010579737855e-05, "loss": 0.37203705310821533, "memory(GiB)": 72.72, "step": 5680, "token_acc": 0.8617447464487988, "train_speed(iter/s)": 0.089104 }, { "epoch": 2.446447735565572, "grad_norm": 0.10771006345748901, "learning_rate": 4.067442230264503e-05, "loss": 0.3795736312866211, "memory(GiB)": 72.72, "step": 5700, "token_acc": 0.8621945679332835, "train_speed(iter/s)": 0.089107 }, { "epoch": 2.455033268941833, "grad_norm": 0.10978804528713226, "learning_rate": 4.0606644724080334e-05, "loss": 0.37045629024505616, "memory(GiB)": 72.72, "step": 5720, "token_acc": 0.8683952247812166, "train_speed(iter/s)": 0.089111 }, { "epoch": 2.463618802318094, "grad_norm": 0.11052682995796204, "learning_rate": 4.053867866259994e-05, "loss": 0.37306039333343505, "memory(GiB)": 72.72, "step": 5740, "token_acc": 0.8691631145068139, "train_speed(iter/s)": 0.089115 }, { "epoch": 2.472204335694355, "grad_norm": 0.09953057020902634, "learning_rate": 4.0470524939036355e-05, "loss": 0.37361931800842285, "memory(GiB)": 72.72, "step": 5760, "token_acc": 0.8691507758784558, "train_speed(iter/s)": 0.089118 }, { "epoch": 2.480789869070616, "grad_norm": 0.11204252392053604, "learning_rate": 4.0402184376488514e-05, "loss": 0.37611095905303954, "memory(GiB)": 72.72, "step": 5780, "token_acc": 0.8522185815081345, "train_speed(iter/s)": 0.089118 }, { "epoch": 2.489375402446877, "grad_norm": 0.09915061295032501, "learning_rate": 4.033365780031183e-05, "loss": 0.37398972511291506, "memory(GiB)": 72.72, "step": 5800, "token_acc": 0.8830448305013368, "train_speed(iter/s)": 0.089119 }, { "epoch": 2.497960935823138, "grad_norm": 0.10546642541885376, "learning_rate": 4.026494603810819e-05, "loss": 0.3730853796005249, "memory(GiB)": 72.72, "step": 5820, "token_acc": 0.8712887390375224, "train_speed(iter/s)": 0.089122 }, { "epoch": 2.506546469199399, "grad_norm": 0.10121016949415207, "learning_rate": 4.0196049919716004e-05, "loss": 0.3762380361557007, "memory(GiB)": 72.72, "step": 5840, "token_acc": 0.8579485282281408, "train_speed(iter/s)": 0.089126 }, { "epoch": 2.51513200257566, "grad_norm": 0.103721484541893, "learning_rate": 4.012697027720018e-05, "loss": 0.36703407764434814, "memory(GiB)": 72.72, "step": 5860, "token_acc": 0.8760437267344359, "train_speed(iter/s)": 0.089129 }, { "epoch": 2.523717535951921, "grad_norm": 0.10886813700199127, "learning_rate": 4.005770794484206e-05, "loss": 0.3760274648666382, "memory(GiB)": 72.72, "step": 5880, "token_acc": 0.86771377124094, "train_speed(iter/s)": 0.089132 }, { "epoch": 2.532303069328182, "grad_norm": 0.10048224776983261, "learning_rate": 3.998826375912934e-05, "loss": 0.3727203369140625, "memory(GiB)": 72.72, "step": 5900, "token_acc": 0.8678732978111068, "train_speed(iter/s)": 0.089136 }, { "epoch": 2.540888602704443, "grad_norm": 0.11523660272359848, "learning_rate": 3.9918638558745966e-05, "loss": 0.3741061449050903, "memory(GiB)": 72.72, "step": 5920, "token_acc": 0.8660318303612676, "train_speed(iter/s)": 0.089136 }, { "epoch": 2.549474136080704, "grad_norm": 0.11144141107797623, "learning_rate": 3.9848833184562056e-05, "loss": 0.3695514440536499, "memory(GiB)": 72.72, "step": 5940, "token_acc": 0.8587312382845311, "train_speed(iter/s)": 0.089141 }, { "epoch": 2.558059669456965, "grad_norm": 0.10469717532396317, "learning_rate": 3.9778848479623656e-05, "loss": 0.3754448413848877, "memory(GiB)": 72.72, "step": 5960, "token_acc": 0.856428029145263, "train_speed(iter/s)": 0.089145 }, { "epoch": 2.566645202833226, "grad_norm": 0.09304027259349823, "learning_rate": 3.970868528914264e-05, "loss": 0.3713753938674927, "memory(GiB)": 72.72, "step": 5980, "token_acc": 0.8559786330639638, "train_speed(iter/s)": 0.089145 }, { "epoch": 2.575230736209487, "grad_norm": 0.10925323516130447, "learning_rate": 3.963834446048644e-05, "loss": 0.3693029165267944, "memory(GiB)": 72.72, "step": 6000, "token_acc": 0.8629965592743197, "train_speed(iter/s)": 0.089148 }, { "epoch": 2.575230736209487, "eval_loss": 0.4436081647872925, "eval_runtime": 70.1538, "eval_samples_per_second": 53.654, "eval_steps_per_second": 0.684, "eval_token_acc": 0.84051665623243, "step": 6000 }, { "epoch": 2.583816269585748, "grad_norm": 0.1098393052816391, "learning_rate": 3.956782684316788e-05, "loss": 0.37103126049041746, "memory(GiB)": 72.72, "step": 6020, "token_acc": 0.8471303763965553, "train_speed(iter/s)": 0.089007 }, { "epoch": 2.592401802962009, "grad_norm": 0.10387935489416122, "learning_rate": 3.949713328883483e-05, "loss": 0.36882970333099363, "memory(GiB)": 72.72, "step": 6040, "token_acc": 0.8459705942755437, "train_speed(iter/s)": 0.088998 }, { "epoch": 2.60098733633827, "grad_norm": 0.10209009051322937, "learning_rate": 3.942626465126001e-05, "loss": 0.36882977485656737, "memory(GiB)": 72.72, "step": 6060, "token_acc": 0.8655902503061222, "train_speed(iter/s)": 0.088991 }, { "epoch": 2.609572869714531, "grad_norm": 0.10415869951248169, "learning_rate": 3.935522178633062e-05, "loss": 0.3759881258010864, "memory(GiB)": 72.72, "step": 6080, "token_acc": 0.8581398082906834, "train_speed(iter/s)": 0.08899 }, { "epoch": 2.618158403090792, "grad_norm": 0.11114171892404556, "learning_rate": 3.928400555203801e-05, "loss": 0.37210090160369874, "memory(GiB)": 72.72, "step": 6100, "token_acc": 0.8736639992402806, "train_speed(iter/s)": 0.088988 }, { "epoch": 2.626743936467053, "grad_norm": 0.10994569212198257, "learning_rate": 3.921261680846734e-05, "loss": 0.3746177673339844, "memory(GiB)": 72.72, "step": 6120, "token_acc": 0.8693309992064365, "train_speed(iter/s)": 0.088985 }, { "epoch": 2.635329469843314, "grad_norm": 0.096384197473526, "learning_rate": 3.914105641778718e-05, "loss": 0.3694021701812744, "memory(GiB)": 72.72, "step": 6140, "token_acc": 0.8684860314899538, "train_speed(iter/s)": 0.088985 }, { "epoch": 2.643915003219575, "grad_norm": 0.10146961361169815, "learning_rate": 3.9069325244239095e-05, "loss": 0.36874828338623045, "memory(GiB)": 72.72, "step": 6160, "token_acc": 0.8668048776320361, "train_speed(iter/s)": 0.08898 }, { "epoch": 2.652500536595836, "grad_norm": 0.0965966135263443, "learning_rate": 3.899742415412722e-05, "loss": 0.36864802837371824, "memory(GiB)": 72.72, "step": 6180, "token_acc": 0.8755646100841697, "train_speed(iter/s)": 0.088979 }, { "epoch": 2.661086069972097, "grad_norm": 0.09827576577663422, "learning_rate": 3.892535401580776e-05, "loss": 0.36760308742523196, "memory(GiB)": 72.72, "step": 6200, "token_acc": 0.8648272017837235, "train_speed(iter/s)": 0.088982 }, { "epoch": 2.669671603348358, "grad_norm": 0.09901771694421768, "learning_rate": 3.885311569967858e-05, "loss": 0.37281830310821534, "memory(GiB)": 72.72, "step": 6220, "token_acc": 0.8820075603884335, "train_speed(iter/s)": 0.088983 }, { "epoch": 2.678257136724619, "grad_norm": 0.1107199490070343, "learning_rate": 3.878071007816859e-05, "loss": 0.37139651775360105, "memory(GiB)": 72.72, "step": 6240, "token_acc": 0.8544989601044564, "train_speed(iter/s)": 0.088985 }, { "epoch": 2.68684267010088, "grad_norm": 0.10703787952661514, "learning_rate": 3.87081380257273e-05, "loss": 0.3727452039718628, "memory(GiB)": 72.72, "step": 6260, "token_acc": 0.8577885712183096, "train_speed(iter/s)": 0.088987 }, { "epoch": 2.695428203477141, "grad_norm": 0.10672149062156677, "learning_rate": 3.8635400418814214e-05, "loss": 0.36861019134521483, "memory(GiB)": 72.72, "step": 6280, "token_acc": 0.8560803640097792, "train_speed(iter/s)": 0.088987 }, { "epoch": 2.704013736853402, "grad_norm": 0.09396067261695862, "learning_rate": 3.856249813588824e-05, "loss": 0.36811778545379636, "memory(GiB)": 72.72, "step": 6300, "token_acc": 0.868624502647731, "train_speed(iter/s)": 0.088988 }, { "epoch": 2.712599270229663, "grad_norm": 0.1063656210899353, "learning_rate": 3.848943205739711e-05, "loss": 0.369048547744751, "memory(GiB)": 72.72, "step": 6320, "token_acc": 0.8519738843659109, "train_speed(iter/s)": 0.088991 }, { "epoch": 2.721184803605924, "grad_norm": 0.10474120825529099, "learning_rate": 3.841620306576673e-05, "loss": 0.3731086730957031, "memory(GiB)": 72.72, "step": 6340, "token_acc": 0.8653390159502418, "train_speed(iter/s)": 0.088991 }, { "epoch": 2.729770336982185, "grad_norm": 0.10354544967412949, "learning_rate": 3.834281204539051e-05, "loss": 0.37295677661895754, "memory(GiB)": 72.72, "step": 6360, "token_acc": 0.8547169188263978, "train_speed(iter/s)": 0.088993 }, { "epoch": 2.738355870358446, "grad_norm": 0.10440333932638168, "learning_rate": 3.82692598826187e-05, "loss": 0.3712725877761841, "memory(GiB)": 72.72, "step": 6380, "token_acc": 0.8820346020559459, "train_speed(iter/s)": 0.088996 }, { "epoch": 2.746941403734707, "grad_norm": 0.09520816057920456, "learning_rate": 3.8195547465747685e-05, "loss": 0.3697003602981567, "memory(GiB)": 72.72, "step": 6400, "token_acc": 0.8595229803723984, "train_speed(iter/s)": 0.088997 }, { "epoch": 2.755526937110968, "grad_norm": 0.09628502279520035, "learning_rate": 3.812167568500927e-05, "loss": 0.3673550128936768, "memory(GiB)": 72.72, "step": 6420, "token_acc": 0.8689877572158957, "train_speed(iter/s)": 0.089 }, { "epoch": 2.764112470487229, "grad_norm": 0.09505701065063477, "learning_rate": 3.804764543255987e-05, "loss": 0.36903977394104004, "memory(GiB)": 72.72, "step": 6440, "token_acc": 0.8750163024104382, "train_speed(iter/s)": 0.089003 }, { "epoch": 2.77269800386349, "grad_norm": 0.092622309923172, "learning_rate": 3.797345760246982e-05, "loss": 0.3679107666015625, "memory(GiB)": 72.72, "step": 6460, "token_acc": 0.8469855548827956, "train_speed(iter/s)": 0.089006 }, { "epoch": 2.781283537239751, "grad_norm": 0.10573814809322357, "learning_rate": 3.7899113090712526e-05, "loss": 0.3690340042114258, "memory(GiB)": 72.72, "step": 6480, "token_acc": 0.8704448664825046, "train_speed(iter/s)": 0.089008 }, { "epoch": 2.789869070616012, "grad_norm": 0.1018662378191948, "learning_rate": 3.782461279515363e-05, "loss": 0.3682270050048828, "memory(GiB)": 72.72, "step": 6500, "token_acc": 0.8687270373931054, "train_speed(iter/s)": 0.089011 }, { "epoch": 2.798454603992273, "grad_norm": 0.09893783926963806, "learning_rate": 3.7749957615540224e-05, "loss": 0.371025824546814, "memory(GiB)": 72.72, "step": 6520, "token_acc": 0.8599820821280371, "train_speed(iter/s)": 0.089014 }, { "epoch": 2.807040137368534, "grad_norm": 0.1044677123427391, "learning_rate": 3.767514845348992e-05, "loss": 0.37092270851135256, "memory(GiB)": 72.72, "step": 6540, "token_acc": 0.860977069485444, "train_speed(iter/s)": 0.089016 }, { "epoch": 2.815625670744795, "grad_norm": 0.10815873742103577, "learning_rate": 3.760018621248e-05, "loss": 0.36874244213104246, "memory(GiB)": 72.72, "step": 6560, "token_acc": 0.8586234130381737, "train_speed(iter/s)": 0.089019 }, { "epoch": 2.824211204121056, "grad_norm": 0.08873378485441208, "learning_rate": 3.75250717978365e-05, "loss": 0.36833083629608154, "memory(GiB)": 72.72, "step": 6580, "token_acc": 0.8633419814445173, "train_speed(iter/s)": 0.089022 }, { "epoch": 2.832796737497317, "grad_norm": 0.09121917188167572, "learning_rate": 3.7449806116723266e-05, "loss": 0.3694983720779419, "memory(GiB)": 72.72, "step": 6600, "token_acc": 0.8697197272952701, "train_speed(iter/s)": 0.089026 }, { "epoch": 2.841382270873578, "grad_norm": 0.09253229945898056, "learning_rate": 3.7374390078131015e-05, "loss": 0.37108821868896485, "memory(GiB)": 72.72, "step": 6620, "token_acc": 0.8706145844516814, "train_speed(iter/s)": 0.089031 }, { "epoch": 2.849967804249839, "grad_norm": 0.09768302738666534, "learning_rate": 3.729882459286632e-05, "loss": 0.3706928253173828, "memory(GiB)": 72.72, "step": 6640, "token_acc": 0.8605562350922205, "train_speed(iter/s)": 0.089033 }, { "epoch": 2.8585533376261, "grad_norm": 0.09809901565313339, "learning_rate": 3.722311057354067e-05, "loss": 0.3715434312820435, "memory(GiB)": 72.72, "step": 6660, "token_acc": 0.8687115200037456, "train_speed(iter/s)": 0.089037 }, { "epoch": 2.867138871002361, "grad_norm": 0.10311082750558853, "learning_rate": 3.714724893455938e-05, "loss": 0.3686758756637573, "memory(GiB)": 72.72, "step": 6680, "token_acc": 0.8536554098061117, "train_speed(iter/s)": 0.089035 }, { "epoch": 2.875724404378622, "grad_norm": 0.0951702892780304, "learning_rate": 3.7071240592110604e-05, "loss": 0.37487409114837644, "memory(GiB)": 72.72, "step": 6700, "token_acc": 0.8619494831493575, "train_speed(iter/s)": 0.089039 }, { "epoch": 2.884309937754883, "grad_norm": 0.10398156195878983, "learning_rate": 3.699508646415424e-05, "loss": 0.3755856275558472, "memory(GiB)": 72.72, "step": 6720, "token_acc": 0.8719096505699291, "train_speed(iter/s)": 0.089043 }, { "epoch": 2.892895471131144, "grad_norm": 0.09801426529884338, "learning_rate": 3.691878747041084e-05, "loss": 0.36969609260559083, "memory(GiB)": 72.72, "step": 6740, "token_acc": 0.8539101926900138, "train_speed(iter/s)": 0.089046 }, { "epoch": 2.901481004507405, "grad_norm": 0.10008656978607178, "learning_rate": 3.684234453235054e-05, "loss": 0.3719330310821533, "memory(GiB)": 72.72, "step": 6760, "token_acc": 0.8648975749697432, "train_speed(iter/s)": 0.08905 }, { "epoch": 2.910066537883666, "grad_norm": 0.12179595977067947, "learning_rate": 3.676575857318189e-05, "loss": 0.37140851020812987, "memory(GiB)": 72.72, "step": 6780, "token_acc": 0.8577137651213464, "train_speed(iter/s)": 0.089052 }, { "epoch": 2.918652071259927, "grad_norm": 0.09753546863794327, "learning_rate": 3.66890305178407e-05, "loss": 0.3708536624908447, "memory(GiB)": 72.72, "step": 6800, "token_acc": 0.8671116019269858, "train_speed(iter/s)": 0.089056 }, { "epoch": 2.927237604636188, "grad_norm": 0.09348613768815994, "learning_rate": 3.661216129297894e-05, "loss": 0.3709095001220703, "memory(GiB)": 72.72, "step": 6820, "token_acc": 0.8573097173193669, "train_speed(iter/s)": 0.089061 }, { "epoch": 2.935823138012449, "grad_norm": 0.0905463695526123, "learning_rate": 3.653515182695344e-05, "loss": 0.3767134189605713, "memory(GiB)": 72.72, "step": 6840, "token_acc": 0.8626287415238181, "train_speed(iter/s)": 0.089065 }, { "epoch": 2.94440867138871, "grad_norm": 0.10822242498397827, "learning_rate": 3.645800304981477e-05, "loss": 0.3709308385848999, "memory(GiB)": 72.72, "step": 6860, "token_acc": 0.8577446782413705, "train_speed(iter/s)": 0.089069 }, { "epoch": 2.952994204764971, "grad_norm": 0.1089344173669815, "learning_rate": 3.638071589329597e-05, "loss": 0.3755086660385132, "memory(GiB)": 72.72, "step": 6880, "token_acc": 0.8570769973171867, "train_speed(iter/s)": 0.089072 }, { "epoch": 2.961579738141232, "grad_norm": 0.10646732896566391, "learning_rate": 3.630329129080129e-05, "loss": 0.36852853298187255, "memory(GiB)": 72.72, "step": 6900, "token_acc": 0.8610651132070156, "train_speed(iter/s)": 0.089077 }, { "epoch": 2.970165271517493, "grad_norm": 0.10016820579767227, "learning_rate": 3.622573017739495e-05, "loss": 0.37330124378204343, "memory(GiB)": 72.72, "step": 6920, "token_acc": 0.8775841748626209, "train_speed(iter/s)": 0.08908 }, { "epoch": 2.978750804893754, "grad_norm": 0.1020449697971344, "learning_rate": 3.6148033489789765e-05, "loss": 0.3684419631958008, "memory(GiB)": 72.72, "step": 6940, "token_acc": 0.8642162515149019, "train_speed(iter/s)": 0.089084 }, { "epoch": 2.987336338270015, "grad_norm": 0.0974557027220726, "learning_rate": 3.607020216633599e-05, "loss": 0.37378945350646975, "memory(GiB)": 72.72, "step": 6960, "token_acc": 0.858156359329171, "train_speed(iter/s)": 0.089087 }, { "epoch": 2.995921871646276, "grad_norm": 0.09330358356237411, "learning_rate": 3.59922371470098e-05, "loss": 0.36865170001983644, "memory(GiB)": 72.72, "step": 6980, "token_acc": 0.8638886721914512, "train_speed(iter/s)": 0.089091 }, { "epoch": 3.0042927666881303, "grad_norm": 0.1193256601691246, "learning_rate": 3.591413937340208e-05, "loss": 0.3534395694732666, "memory(GiB)": 72.72, "step": 7000, "token_acc": 0.8802663670407237, "train_speed(iter/s)": 0.089092 }, { "epoch": 3.0042927666881303, "eval_loss": 0.4485101103782654, "eval_runtime": 74.3969, "eval_samples_per_second": 50.593, "eval_steps_per_second": 0.645, "eval_token_acc": 0.8402603254517357, "step": 7000 }, { "epoch": 3.0128783000643913, "grad_norm": 0.1156892329454422, "learning_rate": 3.583590978870699e-05, "loss": 0.3319342851638794, "memory(GiB)": 72.72, "step": 7020, "token_acc": 0.8532470204427854, "train_speed(iter/s)": 0.088961 }, { "epoch": 3.0214638334406523, "grad_norm": 0.10194379091262817, "learning_rate": 3.5757549337710564e-05, "loss": 0.33723247051239014, "memory(GiB)": 72.72, "step": 7040, "token_acc": 0.8831583445244781, "train_speed(iter/s)": 0.088954 }, { "epoch": 3.0300493668169133, "grad_norm": 0.10132017731666565, "learning_rate": 3.5679058966779344e-05, "loss": 0.336438250541687, "memory(GiB)": 72.72, "step": 7060, "token_acc": 0.8769756994854098, "train_speed(iter/s)": 0.08895 }, { "epoch": 3.0386349001931743, "grad_norm": 0.1068112775683403, "learning_rate": 3.560043962384891e-05, "loss": 0.3355576753616333, "memory(GiB)": 72.72, "step": 7080, "token_acc": 0.8759380793584041, "train_speed(iter/s)": 0.088949 }, { "epoch": 3.0472204335694353, "grad_norm": 0.10327329486608505, "learning_rate": 3.552169225841248e-05, "loss": 0.3344245195388794, "memory(GiB)": 72.72, "step": 7100, "token_acc": 0.8749370992739901, "train_speed(iter/s)": 0.088945 }, { "epoch": 3.0558059669456963, "grad_norm": 0.10621868073940277, "learning_rate": 3.544281782150936e-05, "loss": 0.33667793273925783, "memory(GiB)": 72.72, "step": 7120, "token_acc": 0.8698413495330857, "train_speed(iter/s)": 0.088946 }, { "epoch": 3.0643915003219577, "grad_norm": 0.09647602587938309, "learning_rate": 3.536381726571358e-05, "loss": 0.33697144985198973, "memory(GiB)": 72.72, "step": 7140, "token_acc": 0.879177233267265, "train_speed(iter/s)": 0.088946 }, { "epoch": 3.0729770336982183, "grad_norm": 0.1008361279964447, "learning_rate": 3.528469154512224e-05, "loss": 0.3379324674606323, "memory(GiB)": 72.72, "step": 7160, "token_acc": 0.881303225060136, "train_speed(iter/s)": 0.088946 }, { "epoch": 3.0815625670744797, "grad_norm": 0.09905105084180832, "learning_rate": 3.520544161534413e-05, "loss": 0.33641412258148196, "memory(GiB)": 72.72, "step": 7180, "token_acc": 0.8765279938577173, "train_speed(iter/s)": 0.088947 }, { "epoch": 3.0901481004507403, "grad_norm": 0.09547468274831772, "learning_rate": 3.51260684334881e-05, "loss": 0.33444535732269287, "memory(GiB)": 72.72, "step": 7200, "token_acc": 0.8740168402536957, "train_speed(iter/s)": 0.088949 }, { "epoch": 3.0987336338270013, "grad_norm": 0.091608926653862, "learning_rate": 3.504657295815153e-05, "loss": 0.33458809852600097, "memory(GiB)": 72.72, "step": 7220, "token_acc": 0.8822041996574748, "train_speed(iter/s)": 0.088951 }, { "epoch": 3.1073191672032623, "grad_norm": 0.095795176923275, "learning_rate": 3.496695614940875e-05, "loss": 0.3341191053390503, "memory(GiB)": 72.72, "step": 7240, "token_acc": 0.8863122055178043, "train_speed(iter/s)": 0.088952 }, { "epoch": 3.1159047005795233, "grad_norm": 0.11027920246124268, "learning_rate": 3.488721896879943e-05, "loss": 0.3351098299026489, "memory(GiB)": 72.72, "step": 7260, "token_acc": 0.8802774242498409, "train_speed(iter/s)": 0.088955 }, { "epoch": 3.1244902339557843, "grad_norm": 0.09548976272344589, "learning_rate": 3.4807362379317025e-05, "loss": 0.3381031513214111, "memory(GiB)": 72.72, "step": 7280, "token_acc": 0.8777537505068252, "train_speed(iter/s)": 0.088954 }, { "epoch": 3.1330757673320457, "grad_norm": 0.1054491475224495, "learning_rate": 3.472738734539706e-05, "loss": 0.33547115325927734, "memory(GiB)": 72.72, "step": 7300, "token_acc": 0.8795795912347567, "train_speed(iter/s)": 0.088956 }, { "epoch": 3.1416613007083063, "grad_norm": 0.09988971799612045, "learning_rate": 3.464729483290553e-05, "loss": 0.3418281555175781, "memory(GiB)": 72.72, "step": 7320, "token_acc": 0.8629133179032032, "train_speed(iter/s)": 0.088958 }, { "epoch": 3.1502468340845677, "grad_norm": 0.09766259044408798, "learning_rate": 3.456708580912725e-05, "loss": 0.3392175674438477, "memory(GiB)": 72.72, "step": 7340, "token_acc": 0.8706737594562531, "train_speed(iter/s)": 0.088951 }, { "epoch": 3.1588323674608283, "grad_norm": 0.09341710805892944, "learning_rate": 3.448676124275414e-05, "loss": 0.3362084150314331, "memory(GiB)": 72.72, "step": 7360, "token_acc": 0.8706982003587074, "train_speed(iter/s)": 0.088954 }, { "epoch": 3.1674179008370897, "grad_norm": 0.0969720259308815, "learning_rate": 3.440632210387354e-05, "loss": 0.3380004644393921, "memory(GiB)": 72.72, "step": 7380, "token_acc": 0.8738021476597112, "train_speed(iter/s)": 0.088957 }, { "epoch": 3.1760034342133503, "grad_norm": 0.09787522256374359, "learning_rate": 3.432576936395648e-05, "loss": 0.3357203245162964, "memory(GiB)": 72.72, "step": 7400, "token_acc": 0.8912336656741101, "train_speed(iter/s)": 0.088961 }, { "epoch": 3.1845889675896113, "grad_norm": 0.10224709659814835, "learning_rate": 3.424510399584601e-05, "loss": 0.33477025032043456, "memory(GiB)": 72.72, "step": 7420, "token_acc": 0.8561189105937783, "train_speed(iter/s)": 0.088965 }, { "epoch": 3.1931745009658723, "grad_norm": 0.10669636726379395, "learning_rate": 3.416432697374533e-05, "loss": 0.33573341369628906, "memory(GiB)": 72.72, "step": 7440, "token_acc": 0.874112458982316, "train_speed(iter/s)": 0.088968 }, { "epoch": 3.2017600343421333, "grad_norm": 0.1014070212841034, "learning_rate": 3.408343927320613e-05, "loss": 0.3380695343017578, "memory(GiB)": 72.72, "step": 7460, "token_acc": 0.8848022091860703, "train_speed(iter/s)": 0.088972 }, { "epoch": 3.2103455677183943, "grad_norm": 0.09528549015522003, "learning_rate": 3.40024418711168e-05, "loss": 0.33952438831329346, "memory(GiB)": 72.72, "step": 7480, "token_acc": 0.8705726760778868, "train_speed(iter/s)": 0.088975 }, { "epoch": 3.2189311010946553, "grad_norm": 0.10318120568990707, "learning_rate": 3.392133574569057e-05, "loss": 0.3406086444854736, "memory(GiB)": 72.72, "step": 7500, "token_acc": 0.8733639567077774, "train_speed(iter/s)": 0.088978 }, { "epoch": 3.2275166344709163, "grad_norm": 0.11275230348110199, "learning_rate": 3.3840121876453734e-05, "loss": 0.33986356258392336, "memory(GiB)": 72.72, "step": 7520, "token_acc": 0.8619126202517206, "train_speed(iter/s)": 0.088978 }, { "epoch": 3.2361021678471773, "grad_norm": 0.10118957608938217, "learning_rate": 3.375880124423383e-05, "loss": 0.3386232852935791, "memory(GiB)": 72.72, "step": 7540, "token_acc": 0.8710604646623604, "train_speed(iter/s)": 0.088981 }, { "epoch": 3.2446877012234383, "grad_norm": 0.10550114512443542, "learning_rate": 3.367737483114779e-05, "loss": 0.3421770572662354, "memory(GiB)": 72.72, "step": 7560, "token_acc": 0.8851797047121107, "train_speed(iter/s)": 0.088985 }, { "epoch": 3.2532732345996997, "grad_norm": 0.1023048609495163, "learning_rate": 3.359584362059004e-05, "loss": 0.33796124458312987, "memory(GiB)": 72.72, "step": 7580, "token_acc": 0.8739776940178287, "train_speed(iter/s)": 0.088985 }, { "epoch": 3.2618587679759603, "grad_norm": 0.09559116512537003, "learning_rate": 3.3514208597220705e-05, "loss": 0.3409781217575073, "memory(GiB)": 72.72, "step": 7600, "token_acc": 0.874609344576846, "train_speed(iter/s)": 0.088989 }, { "epoch": 3.2704443013522217, "grad_norm": 0.09580449014902115, "learning_rate": 3.3432470746953606e-05, "loss": 0.33773849010467527, "memory(GiB)": 72.72, "step": 7620, "token_acc": 0.8727284510693454, "train_speed(iter/s)": 0.088993 }, { "epoch": 3.2790298347284823, "grad_norm": 0.10818155109882355, "learning_rate": 3.335063105694447e-05, "loss": 0.3401022434234619, "memory(GiB)": 72.72, "step": 7640, "token_acc": 0.8764802837026362, "train_speed(iter/s)": 0.088996 }, { "epoch": 3.2876153681047438, "grad_norm": 0.10184460878372192, "learning_rate": 3.326869051557891e-05, "loss": 0.3434968709945679, "memory(GiB)": 72.72, "step": 7660, "token_acc": 0.8761705077978165, "train_speed(iter/s)": 0.088919 }, { "epoch": 3.2962009014810043, "grad_norm": 0.09505783021450043, "learning_rate": 3.318665011246056e-05, "loss": 0.3408296346664429, "memory(GiB)": 72.72, "step": 7680, "token_acc": 0.8661087384073535, "train_speed(iter/s)": 0.088905 }, { "epoch": 3.3047864348572658, "grad_norm": 0.10040104389190674, "learning_rate": 3.310451083839908e-05, "loss": 0.3423358678817749, "memory(GiB)": 72.72, "step": 7700, "token_acc": 0.861539109557306, "train_speed(iter/s)": 0.088907 }, { "epoch": 3.3133719682335263, "grad_norm": 0.10616692155599594, "learning_rate": 3.30222736853982e-05, "loss": 0.34503300189971925, "memory(GiB)": 72.72, "step": 7720, "token_acc": 0.8724093642360908, "train_speed(iter/s)": 0.08891 }, { "epoch": 3.3219575016097878, "grad_norm": 0.10949140787124634, "learning_rate": 3.293993964664376e-05, "loss": 0.3432727098464966, "memory(GiB)": 72.72, "step": 7740, "token_acc": 0.8669582519497799, "train_speed(iter/s)": 0.088914 }, { "epoch": 3.3305430349860483, "grad_norm": 0.09881085902452469, "learning_rate": 3.285750971649167e-05, "loss": 0.3427408695220947, "memory(GiB)": 72.72, "step": 7760, "token_acc": 0.8689034982030741, "train_speed(iter/s)": 0.088917 }, { "epoch": 3.3391285683623098, "grad_norm": 0.09140335768461227, "learning_rate": 3.2774984890455976e-05, "loss": 0.3475862979888916, "memory(GiB)": 72.72, "step": 7780, "token_acc": 0.8685826593182928, "train_speed(iter/s)": 0.088921 }, { "epoch": 3.3477141017385703, "grad_norm": 0.1024077907204628, "learning_rate": 3.2692366165196727e-05, "loss": 0.3404365539550781, "memory(GiB)": 72.72, "step": 7800, "token_acc": 0.8840015739822477, "train_speed(iter/s)": 0.088925 }, { "epoch": 3.3562996351148318, "grad_norm": 0.09467454254627228, "learning_rate": 3.260965453850806e-05, "loss": 0.34421525001525877, "memory(GiB)": 72.72, "step": 7820, "token_acc": 0.8758503166590742, "train_speed(iter/s)": 0.088929 }, { "epoch": 3.3648851684910923, "grad_norm": 0.10136840492486954, "learning_rate": 3.252685100930605e-05, "loss": 0.3386892795562744, "memory(GiB)": 72.72, "step": 7840, "token_acc": 0.85672288931185, "train_speed(iter/s)": 0.088932 }, { "epoch": 3.3734707018673533, "grad_norm": 0.09780098497867584, "learning_rate": 3.244395657761671e-05, "loss": 0.3428237199783325, "memory(GiB)": 72.72, "step": 7860, "token_acc": 0.868161995980711, "train_speed(iter/s)": 0.088935 }, { "epoch": 3.3820562352436143, "grad_norm": 0.1032358855009079, "learning_rate": 3.23609722445639e-05, "loss": 0.3407264709472656, "memory(GiB)": 72.72, "step": 7880, "token_acc": 0.8630356105896284, "train_speed(iter/s)": 0.088936 }, { "epoch": 3.3906417686198753, "grad_norm": 0.09920444339513779, "learning_rate": 3.2277899012357196e-05, "loss": 0.34147114753723146, "memory(GiB)": 72.72, "step": 7900, "token_acc": 0.8645183518911774, "train_speed(iter/s)": 0.088941 }, { "epoch": 3.3992273019961363, "grad_norm": 0.1050969585776329, "learning_rate": 3.219473788427984e-05, "loss": 0.3448856115341187, "memory(GiB)": 72.72, "step": 7920, "token_acc": 0.8714814655549509, "train_speed(iter/s)": 0.088944 }, { "epoch": 3.4078128353723973, "grad_norm": 0.10028455406427383, "learning_rate": 3.211148986467659e-05, "loss": 0.3422698974609375, "memory(GiB)": 72.72, "step": 7940, "token_acc": 0.8711220342714154, "train_speed(iter/s)": 0.088948 }, { "epoch": 3.4163983687486583, "grad_norm": 0.09475808590650558, "learning_rate": 3.2028155958941615e-05, "loss": 0.3451426029205322, "memory(GiB)": 72.72, "step": 7960, "token_acc": 0.8738779982122461, "train_speed(iter/s)": 0.088952 }, { "epoch": 3.4249839021249193, "grad_norm": 0.09882804751396179, "learning_rate": 3.1944737173506324e-05, "loss": 0.3444493532180786, "memory(GiB)": 72.72, "step": 7980, "token_acc": 0.8827410911702268, "train_speed(iter/s)": 0.088955 }, { "epoch": 3.4335694355011803, "grad_norm": 0.10227163881063461, "learning_rate": 3.186123451582723e-05, "loss": 0.339670729637146, "memory(GiB)": 72.72, "step": 8000, "token_acc": 0.8807350762593477, "train_speed(iter/s)": 0.08896 }, { "epoch": 3.4335694355011803, "eval_loss": 0.44718989729881287, "eval_runtime": 69.1311, "eval_samples_per_second": 54.447, "eval_steps_per_second": 0.694, "eval_token_acc": 0.8404557155339724, "step": 8000 }, { "epoch": 3.4421549688774413, "grad_norm": 0.0968368649482727, "learning_rate": 3.177764899437378e-05, "loss": 0.34265289306640623, "memory(GiB)": 72.72, "step": 8020, "token_acc": 0.854713276154318, "train_speed(iter/s)": 0.088852 }, { "epoch": 3.4507405022537023, "grad_norm": 0.09359851479530334, "learning_rate": 3.169398161861618e-05, "loss": 0.33971107006073, "memory(GiB)": 72.72, "step": 8040, "token_acc": 0.8740548416277094, "train_speed(iter/s)": 0.08884 }, { "epoch": 3.4593260356299633, "grad_norm": 0.09218861162662506, "learning_rate": 3.1610233399013194e-05, "loss": 0.34025261402130125, "memory(GiB)": 72.72, "step": 8060, "token_acc": 0.8837196272437907, "train_speed(iter/s)": 0.088833 }, { "epoch": 3.4679115690062243, "grad_norm": 0.09785692393779755, "learning_rate": 3.1526405346999946e-05, "loss": 0.34408791065216066, "memory(GiB)": 72.72, "step": 8080, "token_acc": 0.8632519203232839, "train_speed(iter/s)": 0.088829 }, { "epoch": 3.4764971023824853, "grad_norm": 0.0918072834610939, "learning_rate": 3.1442498474975694e-05, "loss": 0.3405976057052612, "memory(GiB)": 72.72, "step": 8100, "token_acc": 0.8723113057185948, "train_speed(iter/s)": 0.088832 }, { "epoch": 3.4850826357587463, "grad_norm": 0.10397264361381531, "learning_rate": 3.1358513796291625e-05, "loss": 0.3404028654098511, "memory(GiB)": 72.72, "step": 8120, "token_acc": 0.8617087474123225, "train_speed(iter/s)": 0.088834 }, { "epoch": 3.4936681691350073, "grad_norm": 0.10147637873888016, "learning_rate": 3.1274452325238604e-05, "loss": 0.3449804067611694, "memory(GiB)": 72.72, "step": 8140, "token_acc": 0.881801972466236, "train_speed(iter/s)": 0.088832 }, { "epoch": 3.5022537025112683, "grad_norm": 0.10313740372657776, "learning_rate": 3.119031507703491e-05, "loss": 0.34123189449310304, "memory(GiB)": 72.72, "step": 8160, "token_acc": 0.8796580674904387, "train_speed(iter/s)": 0.088831 }, { "epoch": 3.5108392358875298, "grad_norm": 0.10292479395866394, "learning_rate": 3.1106103067814005e-05, "loss": 0.342661452293396, "memory(GiB)": 72.72, "step": 8180, "token_acc": 0.8706844817024822, "train_speed(iter/s)": 0.088834 }, { "epoch": 3.5194247692637903, "grad_norm": 0.10231835395097733, "learning_rate": 3.102181731461225e-05, "loss": 0.3427009344100952, "memory(GiB)": 72.72, "step": 8200, "token_acc": 0.8746011467506197, "train_speed(iter/s)": 0.088833 }, { "epoch": 3.5280103026400518, "grad_norm": 0.09958157688379288, "learning_rate": 3.09374588353566e-05, "loss": 0.34229106903076173, "memory(GiB)": 72.72, "step": 8220, "token_acc": 0.8835202199767901, "train_speed(iter/s)": 0.088833 }, { "epoch": 3.5365958360163123, "grad_norm": 0.10157457739114761, "learning_rate": 3.085302864885235e-05, "loss": 0.3417761564254761, "memory(GiB)": 72.72, "step": 8240, "token_acc": 0.8649101475499108, "train_speed(iter/s)": 0.088834 }, { "epoch": 3.545181369392574, "grad_norm": 0.0995817556977272, "learning_rate": 3.076852777477079e-05, "loss": 0.34410881996154785, "memory(GiB)": 72.72, "step": 8260, "token_acc": 0.8783496646486048, "train_speed(iter/s)": 0.088836 }, { "epoch": 3.5537669027688343, "grad_norm": 0.09822899103164673, "learning_rate": 3.068395723363694e-05, "loss": 0.34146294593811033, "memory(GiB)": 72.72, "step": 8280, "token_acc": 0.8781757426389024, "train_speed(iter/s)": 0.088836 }, { "epoch": 3.562352436145096, "grad_norm": 0.10480652749538422, "learning_rate": 3.0599318046817144e-05, "loss": 0.34048995971679685, "memory(GiB)": 72.72, "step": 8300, "token_acc": 0.8748031260669741, "train_speed(iter/s)": 0.088838 }, { "epoch": 3.5709379695213563, "grad_norm": 0.09434372186660767, "learning_rate": 3.051461123650685e-05, "loss": 0.33703758716583254, "memory(GiB)": 72.72, "step": 8320, "token_acc": 0.8765604747936422, "train_speed(iter/s)": 0.088842 }, { "epoch": 3.579523502897618, "grad_norm": 0.09659520536661148, "learning_rate": 3.0429837825718162e-05, "loss": 0.3348528385162354, "memory(GiB)": 72.72, "step": 8340, "token_acc": 0.8765401382308406, "train_speed(iter/s)": 0.088834 }, { "epoch": 3.5881090362738783, "grad_norm": 0.09309985488653183, "learning_rate": 3.0344998838267525e-05, "loss": 0.3402057647705078, "memory(GiB)": 72.72, "step": 8360, "token_acc": 0.8651010368553427, "train_speed(iter/s)": 0.088836 }, { "epoch": 3.59669456965014, "grad_norm": 0.0929030105471611, "learning_rate": 3.0260095298763376e-05, "loss": 0.34411866664886476, "memory(GiB)": 72.72, "step": 8380, "token_acc": 0.8811669848458061, "train_speed(iter/s)": 0.088838 }, { "epoch": 3.6052801030264003, "grad_norm": 0.0983252078294754, "learning_rate": 3.017512823259373e-05, "loss": 0.34260566234588624, "memory(GiB)": 72.72, "step": 8400, "token_acc": 0.8748058346767034, "train_speed(iter/s)": 0.088837 }, { "epoch": 3.613865636402662, "grad_norm": 0.10412958264350891, "learning_rate": 3.0090098665913857e-05, "loss": 0.3410640716552734, "memory(GiB)": 72.72, "step": 8420, "token_acc": 0.8833422403080311, "train_speed(iter/s)": 0.088839 }, { "epoch": 3.6224511697789223, "grad_norm": 0.1032663881778717, "learning_rate": 3.0005007625633806e-05, "loss": 0.3369549512863159, "memory(GiB)": 72.72, "step": 8440, "token_acc": 0.8744007729088683, "train_speed(iter/s)": 0.088841 }, { "epoch": 3.631036703155184, "grad_norm": 0.09928712248802185, "learning_rate": 2.9919856139406093e-05, "loss": 0.3410694122314453, "memory(GiB)": 72.72, "step": 8460, "token_acc": 0.868446777131458, "train_speed(iter/s)": 0.088844 }, { "epoch": 3.6396222365314443, "grad_norm": 0.10240930318832397, "learning_rate": 2.9834645235613202e-05, "loss": 0.34042160511016845, "memory(GiB)": 72.72, "step": 8480, "token_acc": 0.8746132434983096, "train_speed(iter/s)": 0.088845 }, { "epoch": 3.648207769907706, "grad_norm": 0.09757622331380844, "learning_rate": 2.9749375943355245e-05, "loss": 0.3391597032546997, "memory(GiB)": 72.72, "step": 8500, "token_acc": 0.8870871533336139, "train_speed(iter/s)": 0.088847 }, { "epoch": 3.6567933032839663, "grad_norm": 0.10708373039960861, "learning_rate": 2.966404929243746e-05, "loss": 0.3418737888336182, "memory(GiB)": 72.72, "step": 8520, "token_acc": 0.8803486188795007, "train_speed(iter/s)": 0.08885 }, { "epoch": 3.665378836660228, "grad_norm": 0.09238722175359726, "learning_rate": 2.9578666313357866e-05, "loss": 0.3395582675933838, "memory(GiB)": 72.72, "step": 8540, "token_acc": 0.8617492297025544, "train_speed(iter/s)": 0.088851 }, { "epoch": 3.6739643700364883, "grad_norm": 0.0982414111495018, "learning_rate": 2.9493228037294702e-05, "loss": 0.339850926399231, "memory(GiB)": 72.72, "step": 8560, "token_acc": 0.872913510605142, "train_speed(iter/s)": 0.088854 }, { "epoch": 3.68254990341275, "grad_norm": 0.09378170222043991, "learning_rate": 2.9407735496094074e-05, "loss": 0.3445668935775757, "memory(GiB)": 72.72, "step": 8580, "token_acc": 0.8608412452277943, "train_speed(iter/s)": 0.088857 }, { "epoch": 3.6911354367890103, "grad_norm": 0.10139860957860947, "learning_rate": 2.9322189722257437e-05, "loss": 0.33951511383056643, "memory(GiB)": 72.72, "step": 8600, "token_acc": 0.8813381599903551, "train_speed(iter/s)": 0.088858 }, { "epoch": 3.699720970165272, "grad_norm": 0.10095764696598053, "learning_rate": 2.9236591748929143e-05, "loss": 0.3414825201034546, "memory(GiB)": 72.72, "step": 8620, "token_acc": 0.8747491060455407, "train_speed(iter/s)": 0.088861 }, { "epoch": 3.7083065035415324, "grad_norm": 0.09368202835321426, "learning_rate": 2.915094260988397e-05, "loss": 0.3400054216384888, "memory(GiB)": 72.72, "step": 8640, "token_acc": 0.8603559177014007, "train_speed(iter/s)": 0.088863 }, { "epoch": 3.7168920369177934, "grad_norm": 0.09599091857671738, "learning_rate": 2.906524333951461e-05, "loss": 0.33973557949066163, "memory(GiB)": 72.72, "step": 8660, "token_acc": 0.8864862275305668, "train_speed(iter/s)": 0.088865 }, { "epoch": 3.7254775702940544, "grad_norm": 0.0969940647482872, "learning_rate": 2.8979494972819227e-05, "loss": 0.3434182405471802, "memory(GiB)": 72.72, "step": 8680, "token_acc": 0.8716537070538549, "train_speed(iter/s)": 0.088858 }, { "epoch": 3.7340631036703154, "grad_norm": 0.10267031192779541, "learning_rate": 2.8893698545388887e-05, "loss": 0.3440374851226807, "memory(GiB)": 72.72, "step": 8700, "token_acc": 0.8709150326797386, "train_speed(iter/s)": 0.088861 }, { "epoch": 3.7426486370465764, "grad_norm": 0.09835559874773026, "learning_rate": 2.8807855093395126e-05, "loss": 0.34554252624511717, "memory(GiB)": 72.72, "step": 8720, "token_acc": 0.8670599046959998, "train_speed(iter/s)": 0.088863 }, { "epoch": 3.7512341704228374, "grad_norm": 0.0885239914059639, "learning_rate": 2.8721965653577386e-05, "loss": 0.3446002244949341, "memory(GiB)": 72.72, "step": 8740, "token_acc": 0.8721490695849959, "train_speed(iter/s)": 0.088867 }, { "epoch": 3.7598197037990984, "grad_norm": 0.09081339836120605, "learning_rate": 2.86360312632305e-05, "loss": 0.33843419551849363, "memory(GiB)": 72.72, "step": 8760, "token_acc": 0.8680939478458125, "train_speed(iter/s)": 0.088869 }, { "epoch": 3.7684052371753594, "grad_norm": 0.09640111774206161, "learning_rate": 2.855005296019218e-05, "loss": 0.340420126914978, "memory(GiB)": 72.72, "step": 8780, "token_acc": 0.8749122556452666, "train_speed(iter/s)": 0.088872 }, { "epoch": 3.7769907705516204, "grad_norm": 0.0949261263012886, "learning_rate": 2.8464031782830474e-05, "loss": 0.3449671983718872, "memory(GiB)": 72.72, "step": 8800, "token_acc": 0.8710775436891774, "train_speed(iter/s)": 0.088876 }, { "epoch": 3.7855763039278814, "grad_norm": 0.09448053687810898, "learning_rate": 2.837796877003124e-05, "loss": 0.3435060977935791, "memory(GiB)": 72.72, "step": 8820, "token_acc": 0.884149136577708, "train_speed(iter/s)": 0.08888 }, { "epoch": 3.7941618373041424, "grad_norm": 0.09816328436136246, "learning_rate": 2.8291864961185566e-05, "loss": 0.34175992012023926, "memory(GiB)": 72.72, "step": 8840, "token_acc": 0.8704215639701488, "train_speed(iter/s)": 0.088882 }, { "epoch": 3.8027473706804034, "grad_norm": 0.09840340167284012, "learning_rate": 2.820572139617725e-05, "loss": 0.3442914247512817, "memory(GiB)": 72.72, "step": 8860, "token_acc": 0.8852563932460973, "train_speed(iter/s)": 0.088885 }, { "epoch": 3.8113329040566644, "grad_norm": 0.09052480757236481, "learning_rate": 2.8119539115370218e-05, "loss": 0.3354163408279419, "memory(GiB)": 72.72, "step": 8880, "token_acc": 0.8710054027589692, "train_speed(iter/s)": 0.088887 }, { "epoch": 3.8199184374329254, "grad_norm": 0.09055832773447037, "learning_rate": 2.803331915959599e-05, "loss": 0.341020393371582, "memory(GiB)": 72.72, "step": 8900, "token_acc": 0.8775211583840608, "train_speed(iter/s)": 0.088889 }, { "epoch": 3.8285039708091864, "grad_norm": 0.09606460481882095, "learning_rate": 2.7947062570141073e-05, "loss": 0.34467277526855467, "memory(GiB)": 72.72, "step": 8920, "token_acc": 0.8684845089446742, "train_speed(iter/s)": 0.088892 }, { "epoch": 3.8370895041854474, "grad_norm": 0.0941082313656807, "learning_rate": 2.7860770388734408e-05, "loss": 0.34154183864593507, "memory(GiB)": 72.72, "step": 8940, "token_acc": 0.8651064878551884, "train_speed(iter/s)": 0.088895 }, { "epoch": 3.8456750375617084, "grad_norm": 0.08800920099020004, "learning_rate": 2.7774443657534788e-05, "loss": 0.34454681873321535, "memory(GiB)": 72.72, "step": 8960, "token_acc": 0.884229596704054, "train_speed(iter/s)": 0.088899 }, { "epoch": 3.8542605709379694, "grad_norm": 0.0993284210562706, "learning_rate": 2.7688083419118255e-05, "loss": 0.3417619466781616, "memory(GiB)": 72.72, "step": 8980, "token_acc": 0.8696293253324922, "train_speed(iter/s)": 0.088902 }, { "epoch": 3.8628461043142304, "grad_norm": 0.10383660346269608, "learning_rate": 2.760169071646553e-05, "loss": 0.34536774158477784, "memory(GiB)": 72.72, "step": 9000, "token_acc": 0.8775136024730062, "train_speed(iter/s)": 0.088905 }, { "epoch": 3.8628461043142304, "eval_loss": 0.4432525634765625, "eval_runtime": 69.6489, "eval_samples_per_second": 54.042, "eval_steps_per_second": 0.689, "eval_token_acc": 0.8414275958111643, "step": 9000 }, { "epoch": 3.8714316376904914, "grad_norm": 0.0947885811328888, "learning_rate": 2.7515266592949407e-05, "loss": 0.3397974491119385, "memory(GiB)": 72.72, "step": 9020, "token_acc": 0.8571858554733831, "train_speed(iter/s)": 0.08881 }, { "epoch": 3.8800171710667524, "grad_norm": 0.09524156153202057, "learning_rate": 2.742881209232215e-05, "loss": 0.3427132129669189, "memory(GiB)": 72.72, "step": 9040, "token_acc": 0.868957431040566, "train_speed(iter/s)": 0.088802 }, { "epoch": 3.8886027044430134, "grad_norm": 0.08956858515739441, "learning_rate": 2.7342328258702894e-05, "loss": 0.34703960418701174, "memory(GiB)": 72.72, "step": 9060, "token_acc": 0.8717364607638463, "train_speed(iter/s)": 0.088797 }, { "epoch": 3.8971882378192744, "grad_norm": 0.09309873729944229, "learning_rate": 2.7255816136565026e-05, "loss": 0.34093830585479734, "memory(GiB)": 72.72, "step": 9080, "token_acc": 0.8860340449246085, "train_speed(iter/s)": 0.088797 }, { "epoch": 3.9057737711955354, "grad_norm": 0.09236317873001099, "learning_rate": 2.7169276770723585e-05, "loss": 0.3432276248931885, "memory(GiB)": 72.72, "step": 9100, "token_acc": 0.8692972431017865, "train_speed(iter/s)": 0.088797 }, { "epoch": 3.9143593045717964, "grad_norm": 0.09957270324230194, "learning_rate": 2.708271120632262e-05, "loss": 0.34100799560546874, "memory(GiB)": 72.72, "step": 9120, "token_acc": 0.8780453295762229, "train_speed(iter/s)": 0.088796 }, { "epoch": 3.9229448379480574, "grad_norm": 0.09253112971782684, "learning_rate": 2.69961204888226e-05, "loss": 0.344201922416687, "memory(GiB)": 72.72, "step": 9140, "token_acc": 0.893788044699683, "train_speed(iter/s)": 0.088799 }, { "epoch": 3.9315303713243184, "grad_norm": 0.09970075637102127, "learning_rate": 2.6909505663987756e-05, "loss": 0.34385430812835693, "memory(GiB)": 72.72, "step": 9160, "token_acc": 0.884597342165496, "train_speed(iter/s)": 0.088801 }, { "epoch": 3.9401159047005794, "grad_norm": 0.0891101136803627, "learning_rate": 2.682286777787348e-05, "loss": 0.3451590299606323, "memory(GiB)": 72.72, "step": 9180, "token_acc": 0.8716154630632927, "train_speed(iter/s)": 0.0888 }, { "epoch": 3.9487014380768404, "grad_norm": 0.09408137947320938, "learning_rate": 2.6736207876813646e-05, "loss": 0.34462172985076905, "memory(GiB)": 72.72, "step": 9200, "token_acc": 0.8778122218028758, "train_speed(iter/s)": 0.088802 }, { "epoch": 3.9572869714531014, "grad_norm": 0.09145346283912659, "learning_rate": 2.664952700740806e-05, "loss": 0.34248254299163816, "memory(GiB)": 72.72, "step": 9220, "token_acc": 0.872891004579533, "train_speed(iter/s)": 0.088803 }, { "epoch": 3.9658725048293624, "grad_norm": 0.09725998342037201, "learning_rate": 2.6562826216509696e-05, "loss": 0.34380669593811036, "memory(GiB)": 72.72, "step": 9240, "token_acc": 0.8909276331759067, "train_speed(iter/s)": 0.088804 }, { "epoch": 3.9744580382056234, "grad_norm": 0.10166844725608826, "learning_rate": 2.6476106551212188e-05, "loss": 0.34403514862060547, "memory(GiB)": 72.72, "step": 9260, "token_acc": 0.8776962289782687, "train_speed(iter/s)": 0.088806 }, { "epoch": 3.9830435715818844, "grad_norm": 0.09070953726768494, "learning_rate": 2.6389369058837077e-05, "loss": 0.341811990737915, "memory(GiB)": 72.72, "step": 9280, "token_acc": 0.8719749437415167, "train_speed(iter/s)": 0.088808 }, { "epoch": 3.9916291049581454, "grad_norm": 0.09677760303020477, "learning_rate": 2.6302614786921204e-05, "loss": 0.3442156553268433, "memory(GiB)": 72.72, "step": 9300, "token_acc": 0.882238909204825, "train_speed(iter/s)": 0.088808 }, { "epoch": 4.0, "grad_norm": 0.15741688013076782, "learning_rate": 2.621584478320408e-05, "loss": 0.3397855758666992, "memory(GiB)": 72.72, "step": 9320, "token_acc": 0.8889204303051386, "train_speed(iter/s)": 0.088814 }, { "epoch": 4.008585533376261, "grad_norm": 0.10205920785665512, "learning_rate": 2.6129060095615187e-05, "loss": 0.29747543334960935, "memory(GiB)": 72.72, "step": 9340, "token_acc": 0.8900451968067217, "train_speed(iter/s)": 0.0888 }, { "epoch": 4.017171066752522, "grad_norm": 0.10247659683227539, "learning_rate": 2.604226177226137e-05, "loss": 0.30353684425354005, "memory(GiB)": 72.72, "step": 9360, "token_acc": 0.886528226098631, "train_speed(iter/s)": 0.088801 }, { "epoch": 4.025756600128783, "grad_norm": 0.10435572266578674, "learning_rate": 2.5955450861414126e-05, "loss": 0.30368824005126954, "memory(GiB)": 72.72, "step": 9380, "token_acc": 0.8827944824311919, "train_speed(iter/s)": 0.088803 }, { "epoch": 4.034342133505044, "grad_norm": 0.1014116182923317, "learning_rate": 2.586862841149701e-05, "loss": 0.3020852327346802, "memory(GiB)": 72.72, "step": 9400, "token_acc": 0.8891262896776423, "train_speed(iter/s)": 0.088805 }, { "epoch": 4.042927666881305, "grad_norm": 0.10401485115289688, "learning_rate": 2.5781795471072885e-05, "loss": 0.3056429386138916, "memory(GiB)": 72.72, "step": 9420, "token_acc": 0.8829484753143999, "train_speed(iter/s)": 0.088807 }, { "epoch": 4.051513200257566, "grad_norm": 0.10134406387805939, "learning_rate": 2.5694953088831352e-05, "loss": 0.30531723499298097, "memory(GiB)": 72.72, "step": 9440, "token_acc": 0.8840279216629264, "train_speed(iter/s)": 0.088808 }, { "epoch": 4.060098733633827, "grad_norm": 0.10662077367305756, "learning_rate": 2.5608102313576027e-05, "loss": 0.3047459363937378, "memory(GiB)": 72.72, "step": 9460, "token_acc": 0.9002080243657248, "train_speed(iter/s)": 0.088811 }, { "epoch": 4.068684267010088, "grad_norm": 0.10326355695724487, "learning_rate": 2.5521244194211884e-05, "loss": 0.30735197067260744, "memory(GiB)": 72.72, "step": 9480, "token_acc": 0.8819828054997908, "train_speed(iter/s)": 0.088814 }, { "epoch": 4.077269800386349, "grad_norm": 0.10981076210737228, "learning_rate": 2.5434379779732603e-05, "loss": 0.30461032390594484, "memory(GiB)": 72.72, "step": 9500, "token_acc": 0.882671980207554, "train_speed(iter/s)": 0.088816 }, { "epoch": 4.08585533376261, "grad_norm": 0.09967193752527237, "learning_rate": 2.5347510119207878e-05, "loss": 0.3016824722290039, "memory(GiB)": 72.72, "step": 9520, "token_acc": 0.8960580499977037, "train_speed(iter/s)": 0.088818 }, { "epoch": 4.094440867138871, "grad_norm": 0.10693041980266571, "learning_rate": 2.5260636261770777e-05, "loss": 0.3073539972305298, "memory(GiB)": 72.72, "step": 9540, "token_acc": 0.890892156523979, "train_speed(iter/s)": 0.08882 }, { "epoch": 4.103026400515132, "grad_norm": 0.10553585737943649, "learning_rate": 2.5173759256605027e-05, "loss": 0.30216293334960936, "memory(GiB)": 72.72, "step": 9560, "token_acc": 0.8891749049597542, "train_speed(iter/s)": 0.088822 }, { "epoch": 4.111611933891393, "grad_norm": 0.10220309346914291, "learning_rate": 2.5086880152932402e-05, "loss": 0.3027711153030396, "memory(GiB)": 72.72, "step": 9580, "token_acc": 0.8892161871654268, "train_speed(iter/s)": 0.088824 }, { "epoch": 4.120197467267654, "grad_norm": 0.10086795687675476, "learning_rate": 2.5e-05, "loss": 0.30600886344909667, "memory(GiB)": 72.72, "step": 9600, "token_acc": 0.8824066390041494, "train_speed(iter/s)": 0.088825 }, { "epoch": 4.1287830006439155, "grad_norm": 0.10636570304632187, "learning_rate": 2.4913119847067603e-05, "loss": 0.30425918102264404, "memory(GiB)": 72.72, "step": 9620, "token_acc": 0.8838457920573797, "train_speed(iter/s)": 0.088829 }, { "epoch": 4.137368534020176, "grad_norm": 0.10464228689670563, "learning_rate": 2.4826240743394982e-05, "loss": 0.3025052070617676, "memory(GiB)": 72.72, "step": 9640, "token_acc": 0.8769574601853707, "train_speed(iter/s)": 0.088832 }, { "epoch": 4.145954067396437, "grad_norm": 0.1083202064037323, "learning_rate": 2.4739363738229232e-05, "loss": 0.30380189418792725, "memory(GiB)": 72.72, "step": 9660, "token_acc": 0.8893545408707838, "train_speed(iter/s)": 0.088834 }, { "epoch": 4.154539600772698, "grad_norm": 0.10492519289255142, "learning_rate": 2.4652489880792128e-05, "loss": 0.30443031787872316, "memory(GiB)": 72.72, "step": 9680, "token_acc": 0.8797012712026356, "train_speed(iter/s)": 0.088831 }, { "epoch": 4.1631251341489595, "grad_norm": 0.09974920004606247, "learning_rate": 2.4565620220267396e-05, "loss": 0.3066636800765991, "memory(GiB)": 72.72, "step": 9700, "token_acc": 0.8844553871840214, "train_speed(iter/s)": 0.088833 }, { "epoch": 4.17171066752522, "grad_norm": 0.0984271839261055, "learning_rate": 2.447875580578812e-05, "loss": 0.3007610082626343, "memory(GiB)": 72.72, "step": 9720, "token_acc": 0.8761438976087101, "train_speed(iter/s)": 0.088836 }, { "epoch": 4.180296200901481, "grad_norm": 0.10344758629798889, "learning_rate": 2.439189768642398e-05, "loss": 0.3055333375930786, "memory(GiB)": 72.72, "step": 9740, "token_acc": 0.8798825324153172, "train_speed(iter/s)": 0.088839 }, { "epoch": 4.188881734277742, "grad_norm": 0.10062626749277115, "learning_rate": 2.4305046911168653e-05, "loss": 0.30226128101348876, "memory(GiB)": 72.72, "step": 9760, "token_acc": 0.877849069049261, "train_speed(iter/s)": 0.088842 }, { "epoch": 4.197467267654003, "grad_norm": 0.1044364646077156, "learning_rate": 2.4218204528927117e-05, "loss": 0.3027973175048828, "memory(GiB)": 72.72, "step": 9780, "token_acc": 0.8901094903786694, "train_speed(iter/s)": 0.088844 }, { "epoch": 4.206052801030264, "grad_norm": 0.09792552888393402, "learning_rate": 2.4131371588503003e-05, "loss": 0.30410778522491455, "memory(GiB)": 72.72, "step": 9800, "token_acc": 0.8904304675100755, "train_speed(iter/s)": 0.088846 }, { "epoch": 4.214638334406525, "grad_norm": 0.11304216086864471, "learning_rate": 2.4044549138585877e-05, "loss": 0.3036644697189331, "memory(GiB)": 72.72, "step": 9820, "token_acc": 0.8800798395927938, "train_speed(iter/s)": 0.088849 }, { "epoch": 4.223223867782786, "grad_norm": 0.10015735030174255, "learning_rate": 2.395773822773863e-05, "loss": 0.30791220664978025, "memory(GiB)": 72.72, "step": 9840, "token_acc": 0.8848758135171705, "train_speed(iter/s)": 0.088852 }, { "epoch": 4.231809401159047, "grad_norm": 0.09757008403539658, "learning_rate": 2.3870939904384815e-05, "loss": 0.30361478328704833, "memory(GiB)": 72.72, "step": 9860, "token_acc": 0.8940831985400854, "train_speed(iter/s)": 0.088855 }, { "epoch": 4.240394934535308, "grad_norm": 0.09704037755727768, "learning_rate": 2.378415521679593e-05, "loss": 0.3088146924972534, "memory(GiB)": 72.72, "step": 9880, "token_acc": 0.887872541700794, "train_speed(iter/s)": 0.088857 }, { "epoch": 4.248980467911569, "grad_norm": 0.10431049019098282, "learning_rate": 2.3697385213078805e-05, "loss": 0.30578904151916503, "memory(GiB)": 72.72, "step": 9900, "token_acc": 0.8816228300017872, "train_speed(iter/s)": 0.088859 }, { "epoch": 4.25756600128783, "grad_norm": 0.09818245470523834, "learning_rate": 2.361063094116293e-05, "loss": 0.3096456527709961, "memory(GiB)": 72.72, "step": 9920, "token_acc": 0.8843864415701027, "train_speed(iter/s)": 0.088863 }, { "epoch": 4.2661515346640915, "grad_norm": 0.09961646795272827, "learning_rate": 2.3523893448787818e-05, "loss": 0.30978071689605713, "memory(GiB)": 72.72, "step": 9940, "token_acc": 0.8849045058887656, "train_speed(iter/s)": 0.088865 }, { "epoch": 4.274737068040352, "grad_norm": 0.10661664605140686, "learning_rate": 2.3437173783490307e-05, "loss": 0.30757110118865966, "memory(GiB)": 72.72, "step": 9960, "token_acc": 0.8822289688850337, "train_speed(iter/s)": 0.088868 }, { "epoch": 4.283322601416613, "grad_norm": 0.1005556732416153, "learning_rate": 2.3350472992591947e-05, "loss": 0.30759055614471437, "memory(GiB)": 72.72, "step": 9980, "token_acc": 0.8835728408590111, "train_speed(iter/s)": 0.088871 }, { "epoch": 4.291908134792874, "grad_norm": 0.09660108387470245, "learning_rate": 2.3263792123186353e-05, "loss": 0.30487823486328125, "memory(GiB)": 72.72, "step": 10000, "token_acc": 0.8812157065140277, "train_speed(iter/s)": 0.088874 }, { "epoch": 4.291908134792874, "eval_loss": 0.45848873257637024, "eval_runtime": 74.5961, "eval_samples_per_second": 50.458, "eval_steps_per_second": 0.643, "eval_token_acc": 0.8390728406167212, "step": 10000 }, { "epoch": 4.3004936681691355, "grad_norm": 0.10211784392595291, "learning_rate": 2.3177132222126536e-05, "loss": 0.3054050922393799, "memory(GiB)": 72.72, "step": 10020, "token_acc": 0.861507260950951, "train_speed(iter/s)": 0.088783 }, { "epoch": 4.309079201545396, "grad_norm": 0.1039443388581276, "learning_rate": 2.3090494336012253e-05, "loss": 0.3065175533294678, "memory(GiB)": 72.72, "step": 10040, "token_acc": 0.8864880616836895, "train_speed(iter/s)": 0.088777 }, { "epoch": 4.317664734921657, "grad_norm": 0.1060820147395134, "learning_rate": 2.3003879511177405e-05, "loss": 0.31085891723632814, "memory(GiB)": 72.72, "step": 10060, "token_acc": 0.8897265286253574, "train_speed(iter/s)": 0.088773 }, { "epoch": 4.326250268297918, "grad_norm": 0.10298410803079605, "learning_rate": 2.2917288793677382e-05, "loss": 0.31043663024902346, "memory(GiB)": 72.72, "step": 10080, "token_acc": 0.8748683362897243, "train_speed(iter/s)": 0.088769 }, { "epoch": 4.3348358016741795, "grad_norm": 0.1114133968949318, "learning_rate": 2.2830723229276424e-05, "loss": 0.31448495388031006, "memory(GiB)": 72.72, "step": 10100, "token_acc": 0.8866603970434808, "train_speed(iter/s)": 0.088766 }, { "epoch": 4.34342133505044, "grad_norm": 0.10426465421915054, "learning_rate": 2.2744183863434976e-05, "loss": 0.31032671928405764, "memory(GiB)": 72.72, "step": 10120, "token_acc": 0.8818581792950851, "train_speed(iter/s)": 0.088765 }, { "epoch": 4.352006868426701, "grad_norm": 0.10287055373191833, "learning_rate": 2.265767174129711e-05, "loss": 0.3112910747528076, "memory(GiB)": 72.72, "step": 10140, "token_acc": 0.8739920728492123, "train_speed(iter/s)": 0.088763 }, { "epoch": 4.360592401802962, "grad_norm": 0.10366437584161758, "learning_rate": 2.2571187907677853e-05, "loss": 0.31062612533569334, "memory(GiB)": 72.72, "step": 10160, "token_acc": 0.8771409538302638, "train_speed(iter/s)": 0.088761 }, { "epoch": 4.369177935179223, "grad_norm": 0.10374686121940613, "learning_rate": 2.2484733407050602e-05, "loss": 0.31010420322418214, "memory(GiB)": 72.72, "step": 10180, "token_acc": 0.8837488220680202, "train_speed(iter/s)": 0.088762 }, { "epoch": 4.377763468555484, "grad_norm": 0.10094033926725388, "learning_rate": 2.2398309283534477e-05, "loss": 0.3080222845077515, "memory(GiB)": 72.72, "step": 10200, "token_acc": 0.8891878281040166, "train_speed(iter/s)": 0.088764 }, { "epoch": 4.386349001931745, "grad_norm": 0.10435180366039276, "learning_rate": 2.2311916580881754e-05, "loss": 0.30961949825286866, "memory(GiB)": 72.72, "step": 10220, "token_acc": 0.8952461985350648, "train_speed(iter/s)": 0.088764 }, { "epoch": 4.394934535308006, "grad_norm": 0.0953126922249794, "learning_rate": 2.222555634246521e-05, "loss": 0.3070392608642578, "memory(GiB)": 72.72, "step": 10240, "token_acc": 0.8863533099042126, "train_speed(iter/s)": 0.088766 }, { "epoch": 4.403520068684267, "grad_norm": 0.10288111865520477, "learning_rate": 2.2139229611265594e-05, "loss": 0.30999772548675536, "memory(GiB)": 72.72, "step": 10260, "token_acc": 0.8752241865231873, "train_speed(iter/s)": 0.088766 }, { "epoch": 4.412105602060528, "grad_norm": 0.10298358649015427, "learning_rate": 2.205293742985893e-05, "loss": 0.310498046875, "memory(GiB)": 72.72, "step": 10280, "token_acc": 0.895062097103973, "train_speed(iter/s)": 0.088768 }, { "epoch": 4.420691135436789, "grad_norm": 0.10269106179475784, "learning_rate": 2.1966680840404013e-05, "loss": 0.31382122039794924, "memory(GiB)": 72.72, "step": 10300, "token_acc": 0.8826629491356146, "train_speed(iter/s)": 0.088769 }, { "epoch": 4.42927666881305, "grad_norm": 0.09890419244766235, "learning_rate": 2.188046088462979e-05, "loss": 0.31236202716827394, "memory(GiB)": 72.72, "step": 10320, "token_acc": 0.877004450607206, "train_speed(iter/s)": 0.08877 }, { "epoch": 4.437862202189311, "grad_norm": 0.1035868227481842, "learning_rate": 2.179427860382276e-05, "loss": 0.31030888557434083, "memory(GiB)": 72.72, "step": 10340, "token_acc": 0.88265658710238, "train_speed(iter/s)": 0.08877 }, { "epoch": 4.446447735565572, "grad_norm": 0.10644908994436264, "learning_rate": 2.170813503881444e-05, "loss": 0.31080482006072996, "memory(GiB)": 72.72, "step": 10360, "token_acc": 0.8680257223302367, "train_speed(iter/s)": 0.088772 }, { "epoch": 4.455033268941833, "grad_norm": 0.10393664985895157, "learning_rate": 2.162203122996876e-05, "loss": 0.3072603702545166, "memory(GiB)": 72.72, "step": 10380, "token_acc": 0.8879988357215192, "train_speed(iter/s)": 0.088774 }, { "epoch": 4.463618802318094, "grad_norm": 0.09875033795833588, "learning_rate": 2.1535968217169535e-05, "loss": 0.308307147026062, "memory(GiB)": 72.72, "step": 10400, "token_acc": 0.8760545062481376, "train_speed(iter/s)": 0.088777 }, { "epoch": 4.472204335694355, "grad_norm": 0.10074667632579803, "learning_rate": 2.1449947039807826e-05, "loss": 0.3109966039657593, "memory(GiB)": 72.72, "step": 10420, "token_acc": 0.8947892374351213, "train_speed(iter/s)": 0.088778 }, { "epoch": 4.480789869070616, "grad_norm": 0.09881151467561722, "learning_rate": 2.1363968736769508e-05, "loss": 0.3046985626220703, "memory(GiB)": 72.72, "step": 10440, "token_acc": 0.8952631152568657, "train_speed(iter/s)": 0.08878 }, { "epoch": 4.489375402446877, "grad_norm": 0.09804583340883255, "learning_rate": 2.1278034346422616e-05, "loss": 0.31377933025360105, "memory(GiB)": 72.72, "step": 10460, "token_acc": 0.8788372867424049, "train_speed(iter/s)": 0.088782 }, { "epoch": 4.497960935823138, "grad_norm": 0.10384197533130646, "learning_rate": 2.1192144906604876e-05, "loss": 0.3103285312652588, "memory(GiB)": 72.72, "step": 10480, "token_acc": 0.891363222526985, "train_speed(iter/s)": 0.088783 }, { "epoch": 4.5065464691993995, "grad_norm": 0.10672769695520401, "learning_rate": 2.110630145461112e-05, "loss": 0.3111438512802124, "memory(GiB)": 72.72, "step": 10500, "token_acc": 0.88412093531313, "train_speed(iter/s)": 0.088785 }, { "epoch": 4.51513200257566, "grad_norm": 0.10372064262628555, "learning_rate": 2.102050502718078e-05, "loss": 0.3104998111724854, "memory(GiB)": 72.72, "step": 10520, "token_acc": 0.8813348577961984, "train_speed(iter/s)": 0.088787 }, { "epoch": 4.523717535951921, "grad_norm": 0.1009448915719986, "learning_rate": 2.093475666048539e-05, "loss": 0.30964412689208987, "memory(GiB)": 72.72, "step": 10540, "token_acc": 0.8954398710496272, "train_speed(iter/s)": 0.088788 }, { "epoch": 4.532303069328182, "grad_norm": 0.10434540361166, "learning_rate": 2.0849057390116042e-05, "loss": 0.30902681350708006, "memory(GiB)": 72.72, "step": 10560, "token_acc": 0.8803101400044141, "train_speed(iter/s)": 0.088789 }, { "epoch": 4.5408886027044435, "grad_norm": 0.10229279845952988, "learning_rate": 2.0763408251070866e-05, "loss": 0.3061969757080078, "memory(GiB)": 72.72, "step": 10580, "token_acc": 0.8930533404217614, "train_speed(iter/s)": 0.088791 }, { "epoch": 4.549474136080704, "grad_norm": 0.09319902211427689, "learning_rate": 2.0677810277742565e-05, "loss": 0.3094120740890503, "memory(GiB)": 72.72, "step": 10600, "token_acc": 0.8876524522036789, "train_speed(iter/s)": 0.088793 }, { "epoch": 4.558059669456965, "grad_norm": 0.09506496042013168, "learning_rate": 2.0592264503905932e-05, "loss": 0.3105063199996948, "memory(GiB)": 72.72, "step": 10620, "token_acc": 0.8743828338452405, "train_speed(iter/s)": 0.088795 }, { "epoch": 4.566645202833226, "grad_norm": 0.09979739040136337, "learning_rate": 2.0506771962705304e-05, "loss": 0.30733799934387207, "memory(GiB)": 72.72, "step": 10640, "token_acc": 0.889678967341867, "train_speed(iter/s)": 0.088798 }, { "epoch": 4.5752307362094875, "grad_norm": 0.0996963307261467, "learning_rate": 2.0421333686642137e-05, "loss": 0.30787818431854247, "memory(GiB)": 72.72, "step": 10660, "token_acc": 0.8724791602710936, "train_speed(iter/s)": 0.0888 }, { "epoch": 4.583816269585748, "grad_norm": 0.10467605292797089, "learning_rate": 2.0335950707562535e-05, "loss": 0.30961976051330564, "memory(GiB)": 72.72, "step": 10680, "token_acc": 0.8865601551069852, "train_speed(iter/s)": 0.088797 }, { "epoch": 4.592401802962009, "grad_norm": 0.10287564992904663, "learning_rate": 2.0250624056644767e-05, "loss": 0.30673904418945314, "memory(GiB)": 72.72, "step": 10700, "token_acc": 0.8888166591838771, "train_speed(iter/s)": 0.088799 }, { "epoch": 4.60098733633827, "grad_norm": 0.10342861711978912, "learning_rate": 2.0165354764386807e-05, "loss": 0.3080348253250122, "memory(GiB)": 72.72, "step": 10720, "token_acc": 0.8935362282980741, "train_speed(iter/s)": 0.088801 }, { "epoch": 4.6095728697145315, "grad_norm": 0.09834201633930206, "learning_rate": 2.0080143860593913e-05, "loss": 0.30832786560058595, "memory(GiB)": 72.72, "step": 10740, "token_acc": 0.8824297207331616, "train_speed(iter/s)": 0.088803 }, { "epoch": 4.618158403090792, "grad_norm": 0.10289661586284637, "learning_rate": 1.9994992374366193e-05, "loss": 0.3109771251678467, "memory(GiB)": 72.72, "step": 10760, "token_acc": 0.8895210650649608, "train_speed(iter/s)": 0.088805 }, { "epoch": 4.626743936467053, "grad_norm": 0.09662512689828873, "learning_rate": 1.9909901334086152e-05, "loss": 0.31307733058929443, "memory(GiB)": 72.72, "step": 10780, "token_acc": 0.8865438146287556, "train_speed(iter/s)": 0.088807 }, { "epoch": 4.635329469843314, "grad_norm": 0.10243885219097137, "learning_rate": 1.982487176740627e-05, "loss": 0.31298274993896485, "memory(GiB)": 72.72, "step": 10800, "token_acc": 0.8782184863693918, "train_speed(iter/s)": 0.088808 }, { "epoch": 4.6439150032195755, "grad_norm": 0.10350590944290161, "learning_rate": 1.973990470123663e-05, "loss": 0.309729266166687, "memory(GiB)": 72.72, "step": 10820, "token_acc": 0.8839905751216937, "train_speed(iter/s)": 0.088809 }, { "epoch": 4.652500536595836, "grad_norm": 0.10676155984401703, "learning_rate": 1.9655001161732478e-05, "loss": 0.3093304395675659, "memory(GiB)": 72.72, "step": 10840, "token_acc": 0.8909490610287415, "train_speed(iter/s)": 0.088812 }, { "epoch": 4.661086069972097, "grad_norm": 0.09464031457901001, "learning_rate": 1.9570162174281847e-05, "loss": 0.3070455312728882, "memory(GiB)": 72.72, "step": 10860, "token_acc": 0.8747045411759784, "train_speed(iter/s)": 0.088813 }, { "epoch": 4.669671603348358, "grad_norm": 0.09355127811431885, "learning_rate": 1.9485388763493153e-05, "loss": 0.30823278427124023, "memory(GiB)": 72.72, "step": 10880, "token_acc": 0.9008269805356058, "train_speed(iter/s)": 0.088815 }, { "epoch": 4.6782571367246195, "grad_norm": 0.0956326350569725, "learning_rate": 1.9400681953182855e-05, "loss": 0.30865190029144285, "memory(GiB)": 72.72, "step": 10900, "token_acc": 0.8869463759204074, "train_speed(iter/s)": 0.088817 }, { "epoch": 4.68684267010088, "grad_norm": 0.10339660942554474, "learning_rate": 1.9316042766363075e-05, "loss": 0.3091820955276489, "memory(GiB)": 72.72, "step": 10920, "token_acc": 0.8778233411535858, "train_speed(iter/s)": 0.088819 }, { "epoch": 4.695428203477141, "grad_norm": 0.0986744612455368, "learning_rate": 1.9231472225229216e-05, "loss": 0.31184089183807373, "memory(GiB)": 72.72, "step": 10940, "token_acc": 0.8970116747089019, "train_speed(iter/s)": 0.088821 }, { "epoch": 4.704013736853402, "grad_norm": 0.10175996273756027, "learning_rate": 1.9146971351147655e-05, "loss": 0.3101097583770752, "memory(GiB)": 72.72, "step": 10960, "token_acc": 0.8852196976340255, "train_speed(iter/s)": 0.088823 }, { "epoch": 4.7125992702296635, "grad_norm": 0.7812356948852539, "learning_rate": 1.9062541164643403e-05, "loss": 0.3123283863067627, "memory(GiB)": 72.72, "step": 10980, "token_acc": 0.8854708801840979, "train_speed(iter/s)": 0.088825 }, { "epoch": 4.721184803605924, "grad_norm": 0.10162019729614258, "learning_rate": 1.897818268538776e-05, "loss": 0.31052777767181394, "memory(GiB)": 72.72, "step": 11000, "token_acc": 0.8801016226848057, "train_speed(iter/s)": 0.088825 }, { "epoch": 4.721184803605924, "eval_loss": 0.45501717925071716, "eval_runtime": 70.0969, "eval_samples_per_second": 53.697, "eval_steps_per_second": 0.685, "eval_token_acc": 0.8396253696160709, "step": 11000 }, { "epoch": 4.729770336982185, "grad_norm": 0.10113983601331711, "learning_rate": 1.8893896932185994e-05, "loss": 0.30813672542572024, "memory(GiB)": 72.72, "step": 11020, "token_acc": 0.8546351539786743, "train_speed(iter/s)": 0.088747 }, { "epoch": 4.738355870358446, "grad_norm": 0.103078193962574, "learning_rate": 1.8809684922965097e-05, "loss": 0.30388219356536866, "memory(GiB)": 72.72, "step": 11040, "token_acc": 0.8948337756570212, "train_speed(iter/s)": 0.088741 }, { "epoch": 4.746941403734707, "grad_norm": 0.09970963001251221, "learning_rate": 1.87255476747614e-05, "loss": 0.3133774042129517, "memory(GiB)": 72.72, "step": 11060, "token_acc": 0.8968223367439061, "train_speed(iter/s)": 0.088738 }, { "epoch": 4.755526937110968, "grad_norm": 0.10380697250366211, "learning_rate": 1.8641486203708387e-05, "loss": 0.30957233905792236, "memory(GiB)": 72.72, "step": 11080, "token_acc": 0.8824741415108899, "train_speed(iter/s)": 0.088734 }, { "epoch": 4.764112470487229, "grad_norm": 0.1037619411945343, "learning_rate": 1.855750152502431e-05, "loss": 0.3057359457015991, "memory(GiB)": 72.72, "step": 11100, "token_acc": 0.8958725033279122, "train_speed(iter/s)": 0.088733 }, { "epoch": 4.77269800386349, "grad_norm": 0.10157765448093414, "learning_rate": 1.847359465300006e-05, "loss": 0.30702900886535645, "memory(GiB)": 72.72, "step": 11120, "token_acc": 0.876122716238661, "train_speed(iter/s)": 0.088732 }, { "epoch": 4.781283537239751, "grad_norm": 0.09982700645923615, "learning_rate": 1.83897666009868e-05, "loss": 0.3116676092147827, "memory(GiB)": 72.72, "step": 11140, "token_acc": 0.8806798775281173, "train_speed(iter/s)": 0.088731 }, { "epoch": 4.789869070616012, "grad_norm": 0.10041019320487976, "learning_rate": 1.830601838138382e-05, "loss": 0.30963037014007566, "memory(GiB)": 72.72, "step": 11160, "token_acc": 0.8754533556507809, "train_speed(iter/s)": 0.088732 }, { "epoch": 4.798454603992273, "grad_norm": 0.09908230602741241, "learning_rate": 1.8222351005626226e-05, "loss": 0.31059741973876953, "memory(GiB)": 72.72, "step": 11180, "token_acc": 0.8883735287189193, "train_speed(iter/s)": 0.088733 }, { "epoch": 4.807040137368534, "grad_norm": 0.10159313678741455, "learning_rate": 1.8138765484172775e-05, "loss": 0.3082897186279297, "memory(GiB)": 72.72, "step": 11200, "token_acc": 0.8837576612751032, "train_speed(iter/s)": 0.088733 }, { "epoch": 4.815625670744795, "grad_norm": 0.09573191404342651, "learning_rate": 1.805526282649369e-05, "loss": 0.31205048561096194, "memory(GiB)": 72.72, "step": 11220, "token_acc": 0.8822307222234796, "train_speed(iter/s)": 0.088734 }, { "epoch": 4.824211204121056, "grad_norm": 0.10527610033750534, "learning_rate": 1.797184404105839e-05, "loss": 0.3125370264053345, "memory(GiB)": 72.72, "step": 11240, "token_acc": 0.877393258829162, "train_speed(iter/s)": 0.088733 }, { "epoch": 4.832796737497317, "grad_norm": 0.09318065643310547, "learning_rate": 1.7888510135323414e-05, "loss": 0.30796611309051514, "memory(GiB)": 72.72, "step": 11260, "token_acc": 0.8781843195222971, "train_speed(iter/s)": 0.088735 }, { "epoch": 4.841382270873578, "grad_norm": 0.09891670942306519, "learning_rate": 1.780526211572016e-05, "loss": 0.31104702949523927, "memory(GiB)": 72.72, "step": 11280, "token_acc": 0.8761593749911606, "train_speed(iter/s)": 0.088737 }, { "epoch": 4.849967804249839, "grad_norm": 0.09686607122421265, "learning_rate": 1.772210098764281e-05, "loss": 0.3131218433380127, "memory(GiB)": 72.72, "step": 11300, "token_acc": 0.8879271267617395, "train_speed(iter/s)": 0.088736 }, { "epoch": 4.8585533376261, "grad_norm": 0.09879806637763977, "learning_rate": 1.7639027755436104e-05, "loss": 0.30540714263916013, "memory(GiB)": 72.72, "step": 11320, "token_acc": 0.8883117608455857, "train_speed(iter/s)": 0.088738 }, { "epoch": 4.867138871002361, "grad_norm": 0.09865374863147736, "learning_rate": 1.7556043422383293e-05, "loss": 0.3053091287612915, "memory(GiB)": 72.72, "step": 11340, "token_acc": 0.8942929802909607, "train_speed(iter/s)": 0.088737 }, { "epoch": 4.875724404378622, "grad_norm": 0.10021404922008514, "learning_rate": 1.7473148990693955e-05, "loss": 0.31073627471923826, "memory(GiB)": 72.72, "step": 11360, "token_acc": 0.8850116031551548, "train_speed(iter/s)": 0.088738 }, { "epoch": 4.884309937754883, "grad_norm": 0.10069513320922852, "learning_rate": 1.7390345461491954e-05, "loss": 0.3094152927398682, "memory(GiB)": 72.72, "step": 11380, "token_acc": 0.8841858526281734, "train_speed(iter/s)": 0.088739 }, { "epoch": 4.892895471131144, "grad_norm": 0.1061035767197609, "learning_rate": 1.730763383480328e-05, "loss": 0.30918545722961427, "memory(GiB)": 72.72, "step": 11400, "token_acc": 0.8904189361026621, "train_speed(iter/s)": 0.08874 }, { "epoch": 4.901481004507405, "grad_norm": 0.0995524600148201, "learning_rate": 1.722501510954403e-05, "loss": 0.3127927541732788, "memory(GiB)": 72.72, "step": 11420, "token_acc": 0.8850685685523632, "train_speed(iter/s)": 0.088743 }, { "epoch": 4.910066537883666, "grad_norm": 0.09735783189535141, "learning_rate": 1.7142490283508324e-05, "loss": 0.30820300579071047, "memory(GiB)": 72.72, "step": 11440, "token_acc": 0.8748503235514588, "train_speed(iter/s)": 0.088745 }, { "epoch": 4.918652071259927, "grad_norm": 0.10155107080936432, "learning_rate": 1.706006035335625e-05, "loss": 0.3070305109024048, "memory(GiB)": 72.72, "step": 11460, "token_acc": 0.886489278720699, "train_speed(iter/s)": 0.088748 }, { "epoch": 4.927237604636188, "grad_norm": 0.11103896051645279, "learning_rate": 1.6977726314601806e-05, "loss": 0.31273181438446046, "memory(GiB)": 72.72, "step": 11480, "token_acc": 0.8837625376784819, "train_speed(iter/s)": 0.08875 }, { "epoch": 4.935823138012449, "grad_norm": 0.09665607661008835, "learning_rate": 1.6895489161600924e-05, "loss": 0.30753934383392334, "memory(GiB)": 72.72, "step": 11500, "token_acc": 0.8802015271291028, "train_speed(iter/s)": 0.088752 }, { "epoch": 4.94440867138871, "grad_norm": 0.0969487726688385, "learning_rate": 1.6813349887539443e-05, "loss": 0.3144726514816284, "memory(GiB)": 72.72, "step": 11520, "token_acc": 0.8802431565821507, "train_speed(iter/s)": 0.088753 }, { "epoch": 4.952994204764971, "grad_norm": 0.09839560836553574, "learning_rate": 1.67313094844211e-05, "loss": 0.30981805324554446, "memory(GiB)": 72.72, "step": 11540, "token_acc": 0.8963515858448547, "train_speed(iter/s)": 0.088755 }, { "epoch": 4.961579738141232, "grad_norm": 0.10357420891523361, "learning_rate": 1.664936894305554e-05, "loss": 0.3088369846343994, "memory(GiB)": 72.72, "step": 11560, "token_acc": 0.8838528141659493, "train_speed(iter/s)": 0.088757 }, { "epoch": 4.970165271517493, "grad_norm": 0.09701311588287354, "learning_rate": 1.65675292530464e-05, "loss": 0.31214241981506347, "memory(GiB)": 72.72, "step": 11580, "token_acc": 0.8797079209755736, "train_speed(iter/s)": 0.088759 }, { "epoch": 4.978750804893754, "grad_norm": 0.09698698669672012, "learning_rate": 1.648579140277931e-05, "loss": 0.3103867292404175, "memory(GiB)": 72.72, "step": 11600, "token_acc": 0.874222062607426, "train_speed(iter/s)": 0.088759 }, { "epoch": 4.987336338270015, "grad_norm": 0.09471474587917328, "learning_rate": 1.640415637940996e-05, "loss": 0.31050570011138917, "memory(GiB)": 72.72, "step": 11620, "token_acc": 0.891306756689066, "train_speed(iter/s)": 0.088762 }, { "epoch": 4.995921871646276, "grad_norm": 0.09426256269216537, "learning_rate": 1.6322625168852217e-05, "loss": 0.31265413761138916, "memory(GiB)": 72.72, "step": 11640, "token_acc": 0.8955581978003312, "train_speed(iter/s)": 0.088764 }, { "epoch": 5.00429276668813, "grad_norm": 0.11454425007104874, "learning_rate": 1.6241198755766175e-05, "loss": 0.2891073703765869, "memory(GiB)": 72.72, "step": 11660, "token_acc": 0.8897775721320687, "train_speed(iter/s)": 0.088768 }, { "epoch": 5.012878300064392, "grad_norm": 0.10882110148668289, "learning_rate": 1.6159878123546275e-05, "loss": 0.2693314790725708, "memory(GiB)": 72.72, "step": 11680, "token_acc": 0.8982194210665359, "train_speed(iter/s)": 0.088762 }, { "epoch": 5.021463833440652, "grad_norm": 0.10319822281599045, "learning_rate": 1.6078664254309436e-05, "loss": 0.27081449031829835, "memory(GiB)": 72.72, "step": 11700, "token_acc": 0.9001353267268086, "train_speed(iter/s)": 0.088765 }, { "epoch": 5.030049366816914, "grad_norm": 0.11284149438142776, "learning_rate": 1.59975581288832e-05, "loss": 0.27441935539245604, "memory(GiB)": 72.72, "step": 11720, "token_acc": 0.8984662917082801, "train_speed(iter/s)": 0.088767 }, { "epoch": 5.038634900193174, "grad_norm": 0.10425851494073868, "learning_rate": 1.591656072679387e-05, "loss": 0.2715555906295776, "memory(GiB)": 72.72, "step": 11740, "token_acc": 0.890478422247908, "train_speed(iter/s)": 0.088768 }, { "epoch": 5.047220433569436, "grad_norm": 0.11284064501523972, "learning_rate": 1.583567302625469e-05, "loss": 0.2725609540939331, "memory(GiB)": 72.72, "step": 11760, "token_acc": 0.897063681945232, "train_speed(iter/s)": 0.088769 }, { "epoch": 5.055805966945696, "grad_norm": 0.11659186333417892, "learning_rate": 1.5754896004154e-05, "loss": 0.2763663291931152, "memory(GiB)": 72.72, "step": 11780, "token_acc": 0.8892781727292928, "train_speed(iter/s)": 0.088771 }, { "epoch": 5.064391500321958, "grad_norm": 0.10959072411060333, "learning_rate": 1.567423063604352e-05, "loss": 0.27177045345306394, "memory(GiB)": 72.72, "step": 11800, "token_acc": 0.8928949946338838, "train_speed(iter/s)": 0.088772 }, { "epoch": 5.072977033698218, "grad_norm": 0.11139950156211853, "learning_rate": 1.5593677896126462e-05, "loss": 0.2721517086029053, "memory(GiB)": 72.72, "step": 11820, "token_acc": 0.895870023109786, "train_speed(iter/s)": 0.088773 }, { "epoch": 5.08156256707448, "grad_norm": 0.10203303396701813, "learning_rate": 1.551323875724587e-05, "loss": 0.27356884479522703, "memory(GiB)": 72.72, "step": 11840, "token_acc": 0.9059859374397118, "train_speed(iter/s)": 0.088775 }, { "epoch": 5.09014810045074, "grad_norm": 0.10516630858182907, "learning_rate": 1.5432914190872757e-05, "loss": 0.2754658222198486, "memory(GiB)": 72.72, "step": 11860, "token_acc": 0.8908944849786643, "train_speed(iter/s)": 0.088776 }, { "epoch": 5.098733633827002, "grad_norm": 0.10858064144849777, "learning_rate": 1.5352705167094477e-05, "loss": 0.2734870672225952, "memory(GiB)": 72.72, "step": 11880, "token_acc": 0.8974409839317595, "train_speed(iter/s)": 0.088777 }, { "epoch": 5.107319167203262, "grad_norm": 0.11306975036859512, "learning_rate": 1.527261265460296e-05, "loss": 0.27300803661346434, "memory(GiB)": 72.72, "step": 11900, "token_acc": 0.9109106165341432, "train_speed(iter/s)": 0.08878 }, { "epoch": 5.115904700579524, "grad_norm": 0.10477675497531891, "learning_rate": 1.5192637620682981e-05, "loss": 0.2717351198196411, "memory(GiB)": 72.72, "step": 11920, "token_acc": 0.8888284413313953, "train_speed(iter/s)": 0.088782 }, { "epoch": 5.124490233955784, "grad_norm": 0.11272590607404709, "learning_rate": 1.5112781031200569e-05, "loss": 0.2693598508834839, "memory(GiB)": 72.72, "step": 11940, "token_acc": 0.8937451291948688, "train_speed(iter/s)": 0.088784 }, { "epoch": 5.133075767332046, "grad_norm": 0.1111418828368187, "learning_rate": 1.5033043850591256e-05, "loss": 0.2743582487106323, "memory(GiB)": 72.72, "step": 11960, "token_acc": 0.8922390332455552, "train_speed(iter/s)": 0.088786 }, { "epoch": 5.141661300708306, "grad_norm": 0.10565278679132462, "learning_rate": 1.4953427041848473e-05, "loss": 0.2750978946685791, "memory(GiB)": 72.72, "step": 11980, "token_acc": 0.8900847655801997, "train_speed(iter/s)": 0.088787 }, { "epoch": 5.150246834084568, "grad_norm": 0.10609643161296844, "learning_rate": 1.4873931566511901e-05, "loss": 0.27565574645996094, "memory(GiB)": 72.72, "step": 12000, "token_acc": 0.9002406831246359, "train_speed(iter/s)": 0.088788 }, { "epoch": 5.150246834084568, "eval_loss": 0.47488316893577576, "eval_runtime": 70.9516, "eval_samples_per_second": 53.05, "eval_steps_per_second": 0.677, "eval_token_acc": 0.8365712662327689, "step": 12000 } ], "logging_steps": 20, "max_steps": 18640, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.567507230772429e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }