{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 704, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007106057914372002, "grad_norm": 48.15382880127834, "learning_rate": 3.6363636363636366e-06, "loss": 4.3877, "mean_token_accuracy": 0.4402003187686205, "num_tokens": 5475610.0, "step": 5 }, { "epoch": 0.014212115828744005, "grad_norm": 14.120135940330632, "learning_rate": 8.181818181818183e-06, "loss": 3.149, "mean_token_accuracy": 0.49750813096761703, "num_tokens": 10990469.0, "step": 10 }, { "epoch": 0.021318173743116006, "grad_norm": 4.9582246488048884, "learning_rate": 1.2727272727272728e-05, "loss": 1.6042, "mean_token_accuracy": 0.6544640183448791, "num_tokens": 16509954.0, "step": 15 }, { "epoch": 0.02842423165748801, "grad_norm": 2.3061348987678967, "learning_rate": 1.7272727272727274e-05, "loss": 1.0542, "mean_token_accuracy": 0.7447574809193611, "num_tokens": 22025438.0, "step": 20 }, { "epoch": 0.03553028957186001, "grad_norm": 3.4964571027817337, "learning_rate": 1.999961805535155e-05, "loss": 0.8969, "mean_token_accuracy": 0.7685870260000229, "num_tokens": 27537030.0, "step": 25 }, { "epoch": 0.04263634748623201, "grad_norm": 3.3071652925849335, "learning_rate": 1.9995321550350065e-05, "loss": 0.8074, "mean_token_accuracy": 0.7807081520557404, "num_tokens": 33069854.0, "step": 30 }, { "epoch": 0.04974240540060401, "grad_norm": 3.924372429122354, "learning_rate": 1.998625339625423e-05, "loss": 0.7719, "mean_token_accuracy": 0.783011856675148, "num_tokens": 38592423.0, "step": 35 }, { "epoch": 0.05684846331497602, "grad_norm": 0.9496737705645568, "learning_rate": 1.9972418403347817e-05, "loss": 0.7245, "mean_token_accuracy": 0.7904581762850285, "num_tokens": 44102692.0, "step": 40 }, { "epoch": 0.06395452122934801, "grad_norm": 0.4437647446141135, "learning_rate": 1.9953823910527057e-05, "loss": 0.6875, "mean_token_accuracy": 0.8015773832798004, "num_tokens": 49606158.0, "step": 45 }, { "epoch": 0.07106057914372002, "grad_norm": 0.36104953638366394, "learning_rate": 1.993047978140764e-05, "loss": 0.6681, "mean_token_accuracy": 0.8048019059002399, "num_tokens": 55124799.0, "step": 50 }, { "epoch": 0.07816663705809203, "grad_norm": 0.3451124617137935, "learning_rate": 1.9902398399092494e-05, "loss": 0.6356, "mean_token_accuracy": 0.8128186449408531, "num_tokens": 60636117.0, "step": 55 }, { "epoch": 0.08527269497246402, "grad_norm": 0.389660261004574, "learning_rate": 1.9869594659603032e-05, "loss": 0.6398, "mean_token_accuracy": 0.8115979641675949, "num_tokens": 66154262.0, "step": 60 }, { "epoch": 0.09237875288683603, "grad_norm": 0.31317999931398266, "learning_rate": 1.9832085963977445e-05, "loss": 0.6337, "mean_token_accuracy": 0.8128221824765205, "num_tokens": 71679661.0, "step": 65 }, { "epoch": 0.09948481080120802, "grad_norm": 0.3128254105868865, "learning_rate": 1.978989220904016e-05, "loss": 0.6227, "mean_token_accuracy": 0.8144343480467796, "num_tokens": 77204132.0, "step": 70 }, { "epoch": 0.10659086871558003, "grad_norm": 0.3000404040639256, "learning_rate": 1.9743035776847377e-05, "loss": 0.6166, "mean_token_accuracy": 0.8157493658363819, "num_tokens": 82747984.0, "step": 75 }, { "epoch": 0.11369692662995204, "grad_norm": 0.3139627449464346, "learning_rate": 1.9691541522814327e-05, "loss": 0.5988, "mean_token_accuracy": 0.8197014890611172, "num_tokens": 88267430.0, "step": 80 }, { "epoch": 0.12080298454432403, "grad_norm": 0.3039876079367156, "learning_rate": 1.963543676253048e-05, "loss": 0.6051, "mean_token_accuracy": 0.8184983253479003, "num_tokens": 93790180.0, "step": 85 }, { "epoch": 0.12790904245869603, "grad_norm": 0.3409273981011121, "learning_rate": 1.9574751257269748e-05, "loss": 0.5978, "mean_token_accuracy": 0.8195111580193043, "num_tokens": 99314107.0, "step": 90 }, { "epoch": 0.13501510037306805, "grad_norm": 0.2958717785828003, "learning_rate": 1.950951719820335e-05, "loss": 0.5902, "mean_token_accuracy": 0.8219340682029724, "num_tokens": 104809146.0, "step": 95 }, { "epoch": 0.14212115828744004, "grad_norm": 0.2994112787306848, "learning_rate": 1.9439769189323727e-05, "loss": 0.5989, "mean_token_accuracy": 0.820228873193264, "num_tokens": 110323602.0, "step": 100 }, { "epoch": 0.14212115828744004, "eval_loss": 0.5710137486457825, "eval_mean_token_accuracy": 0.8228120437839574, "eval_num_tokens": 110323602.0, "eval_runtime": 160.5102, "eval_samples_per_second": 22.671, "eval_steps_per_second": 0.71, "step": 100 }, { "epoch": 0.14922721620181204, "grad_norm": 0.30064641734602476, "learning_rate": 1.9365544229088517e-05, "loss": 0.5944, "mean_token_accuracy": 0.8212312825024128, "num_tokens": 115858100.0, "step": 105 }, { "epoch": 0.15633327411618406, "grad_norm": 0.2894891012090971, "learning_rate": 1.9286881690794425e-05, "loss": 0.5832, "mean_token_accuracy": 0.8244290247559547, "num_tokens": 121393955.0, "step": 110 }, { "epoch": 0.16343933203055605, "grad_norm": 0.3177791926774768, "learning_rate": 1.9203823301691272e-05, "loss": 0.5946, "mean_token_accuracy": 0.8195516988635063, "num_tokens": 126941522.0, "step": 115 }, { "epoch": 0.17054538994492804, "grad_norm": 0.3271845261422823, "learning_rate": 1.9116413120847425e-05, "loss": 0.5853, "mean_token_accuracy": 0.823321682959795, "num_tokens": 132467661.0, "step": 120 }, { "epoch": 0.17765144785930007, "grad_norm": 0.30578663346709106, "learning_rate": 1.902469751577826e-05, "loss": 0.5787, "mean_token_accuracy": 0.8247792065143585, "num_tokens": 137980426.0, "step": 125 }, { "epoch": 0.18475750577367206, "grad_norm": 0.28728878535567604, "learning_rate": 1.892872513785008e-05, "loss": 0.568, "mean_token_accuracy": 0.8274045430123806, "num_tokens": 143490685.0, "step": 130 }, { "epoch": 0.19186356368804405, "grad_norm": 0.3081153281219037, "learning_rate": 1.88285468964726e-05, "loss": 0.5728, "mean_token_accuracy": 0.8262367367744445, "num_tokens": 149018034.0, "step": 135 }, { "epoch": 0.19896962160241605, "grad_norm": 0.27248779720052707, "learning_rate": 1.872421593209355e-05, "loss": 0.5679, "mean_token_accuracy": 0.8270156674087048, "num_tokens": 154549711.0, "step": 140 }, { "epoch": 0.20607567951678807, "grad_norm": 0.2876029562240766, "learning_rate": 1.861578758800989e-05, "loss": 0.5739, "mean_token_accuracy": 0.825926473736763, "num_tokens": 160056919.0, "step": 145 }, { "epoch": 0.21318173743116006, "grad_norm": 0.2592081911074665, "learning_rate": 1.8503319381010414e-05, "loss": 0.5688, "mean_token_accuracy": 0.826562087237835, "num_tokens": 165584428.0, "step": 150 }, { "epoch": 0.22028779534553206, "grad_norm": 0.2921770757113811, "learning_rate": 1.8386870970865488e-05, "loss": 0.5615, "mean_token_accuracy": 0.8287016794085502, "num_tokens": 171107691.0, "step": 155 }, { "epoch": 0.22739385325990408, "grad_norm": 0.25717865089543174, "learning_rate": 1.8266504128679988e-05, "loss": 0.5619, "mean_token_accuracy": 0.8289386563003063, "num_tokens": 176626158.0, "step": 160 }, { "epoch": 0.23449991117427607, "grad_norm": 0.27820799246988237, "learning_rate": 1.814228270412624e-05, "loss": 0.5772, "mean_token_accuracy": 0.8250645868480205, "num_tokens": 182151744.0, "step": 165 }, { "epoch": 0.24160596908864806, "grad_norm": 0.2821779366904109, "learning_rate": 1.8014272591574405e-05, "loss": 0.5707, "mean_token_accuracy": 0.8270999036729336, "num_tokens": 187670198.0, "step": 170 }, { "epoch": 0.2487120270030201, "grad_norm": 0.3275959246759839, "learning_rate": 1.7882541695138224e-05, "loss": 0.5579, "mean_token_accuracy": 0.8302620410919189, "num_tokens": 193189696.0, "step": 175 }, { "epoch": 0.25581808491739205, "grad_norm": 0.29306437619143894, "learning_rate": 1.7747159892654646e-05, "loss": 0.5564, "mean_token_accuracy": 0.8296913146972656, "num_tokens": 198706595.0, "step": 180 }, { "epoch": 0.2629241428317641, "grad_norm": 0.2977278839411146, "learning_rate": 1.7608198998616533e-05, "loss": 0.5635, "mean_token_accuracy": 0.8275744579732418, "num_tokens": 204263565.0, "step": 185 }, { "epoch": 0.2700302007461361, "grad_norm": 0.3317727367369204, "learning_rate": 1.7465732726077993e-05, "loss": 0.559, "mean_token_accuracy": 0.8284266702830791, "num_tokens": 209753822.0, "step": 190 }, { "epoch": 0.27713625866050806, "grad_norm": 0.27234470597159066, "learning_rate": 1.731983664755264e-05, "loss": 0.5613, "mean_token_accuracy": 0.8291354134678841, "num_tokens": 215275472.0, "step": 195 }, { "epoch": 0.2842423165748801, "grad_norm": 0.26544343779539864, "learning_rate": 1.717058815492548e-05, "loss": 0.5613, "mean_token_accuracy": 0.8280082412064076, "num_tokens": 220789450.0, "step": 200 }, { "epoch": 0.2842423165748801, "eval_loss": 0.5400242805480957, "eval_mean_token_accuracy": 0.8298156413069943, "eval_num_tokens": 220789450.0, "eval_runtime": 161.1614, "eval_samples_per_second": 22.58, "eval_steps_per_second": 0.707, "step": 200 }, { "epoch": 0.2913483744892521, "grad_norm": 0.2813762656625426, "learning_rate": 1.701806641839967e-05, "loss": 0.5615, "mean_token_accuracy": 0.8296020865440369, "num_tokens": 226318335.0, "step": 205 }, { "epoch": 0.29845443240362407, "grad_norm": 0.2738481182825542, "learning_rate": 1.6862352344500004e-05, "loss": 0.5604, "mean_token_accuracy": 0.8292979046702385, "num_tokens": 231830330.0, "step": 210 }, { "epoch": 0.3055604903179961, "grad_norm": 0.2724828799767463, "learning_rate": 1.6703528533155283e-05, "loss": 0.5555, "mean_token_accuracy": 0.8297731988132, "num_tokens": 237344871.0, "step": 215 }, { "epoch": 0.3126665482323681, "grad_norm": 0.26306657580185255, "learning_rate": 1.6541679233882477e-05, "loss": 0.5516, "mean_token_accuracy": 0.8313047230243683, "num_tokens": 242868508.0, "step": 220 }, { "epoch": 0.3197726061467401, "grad_norm": 0.2918451339288788, "learning_rate": 1.63768903010958e-05, "loss": 0.5549, "mean_token_accuracy": 0.8298390731215477, "num_tokens": 248407721.0, "step": 225 }, { "epoch": 0.3268786640611121, "grad_norm": 0.2778992598258234, "learning_rate": 1.6209249148564437e-05, "loss": 0.5503, "mean_token_accuracy": 0.831806804984808, "num_tokens": 253911671.0, "step": 230 }, { "epoch": 0.3339847219754841, "grad_norm": 0.29816424123268476, "learning_rate": 1.603884470304318e-05, "loss": 0.5625, "mean_token_accuracy": 0.8286240585148335, "num_tokens": 259442838.0, "step": 235 }, { "epoch": 0.3410907798898561, "grad_norm": 0.3434968878817897, "learning_rate": 1.5865767357100383e-05, "loss": 0.5454, "mean_token_accuracy": 0.8329082369804383, "num_tokens": 264975290.0, "step": 240 }, { "epoch": 0.3481968378042281, "grad_norm": 0.28342056849119923, "learning_rate": 1.5690108921168428e-05, "loss": 0.5509, "mean_token_accuracy": 0.831651521474123, "num_tokens": 270498217.0, "step": 245 }, { "epoch": 0.35530289571860013, "grad_norm": 0.2801391925223262, "learning_rate": 1.5511962574842073e-05, "loss": 0.5506, "mean_token_accuracy": 0.8316259779036045, "num_tokens": 276014342.0, "step": 250 }, { "epoch": 0.3624089536329721, "grad_norm": 0.275450732541475, "learning_rate": 1.5331422817450485e-05, "loss": 0.5533, "mean_token_accuracy": 0.830129113048315, "num_tokens": 281549490.0, "step": 255 }, { "epoch": 0.3695150115473441, "grad_norm": 0.30780371017473934, "learning_rate": 1.5148585417929212e-05, "loss": 0.5497, "mean_token_accuracy": 0.8321508087217808, "num_tokens": 287067865.0, "step": 260 }, { "epoch": 0.37662106946171614, "grad_norm": 0.32141139168463606, "learning_rate": 1.4963547364018711e-05, "loss": 0.546, "mean_token_accuracy": 0.8327535085380078, "num_tokens": 292575698.0, "step": 265 }, { "epoch": 0.3837271273760881, "grad_norm": 0.2941090649445171, "learning_rate": 1.477640681081632e-05, "loss": 0.5495, "mean_token_accuracy": 0.8316998913884163, "num_tokens": 298104461.0, "step": 270 }, { "epoch": 0.39083318529046013, "grad_norm": 0.3156113844180264, "learning_rate": 1.4587263028709013e-05, "loss": 0.546, "mean_token_accuracy": 0.8325068384408951, "num_tokens": 303615295.0, "step": 275 }, { "epoch": 0.3979392432048321, "grad_norm": 0.27848176976199285, "learning_rate": 1.4396216350714512e-05, "loss": 0.5473, "mean_token_accuracy": 0.8326819702982903, "num_tokens": 309131939.0, "step": 280 }, { "epoch": 0.4050453011192041, "grad_norm": 0.28930577025792253, "learning_rate": 1.4203368119258759e-05, "loss": 0.5433, "mean_token_accuracy": 0.8328328810632228, "num_tokens": 314661318.0, "step": 285 }, { "epoch": 0.41215135903357614, "grad_norm": 0.3660686775923689, "learning_rate": 1.4008820632417906e-05, "loss": 0.5401, "mean_token_accuracy": 0.8341327331960201, "num_tokens": 320185466.0, "step": 290 }, { "epoch": 0.4192574169479481, "grad_norm": 0.30653539266099966, "learning_rate": 1.381267708965339e-05, "loss": 0.5437, "mean_token_accuracy": 0.8333369344472885, "num_tokens": 325709519.0, "step": 295 }, { "epoch": 0.4263634748623201, "grad_norm": 0.2716446207789635, "learning_rate": 1.3615041537068831e-05, "loss": 0.5515, "mean_token_accuracy": 0.8306186564266682, "num_tokens": 331248121.0, "step": 300 }, { "epoch": 0.4263634748623201, "eval_loss": 0.5254271030426025, "eval_mean_token_accuracy": 0.8335580465040708, "eval_num_tokens": 331248121.0, "eval_runtime": 160.8696, "eval_samples_per_second": 22.621, "eval_steps_per_second": 0.709, "step": 300 }, { "epoch": 0.43346953277669215, "grad_norm": 0.27817622720791757, "learning_rate": 1.3416018812217866e-05, "loss": 0.5495, "mean_token_accuracy": 0.8312446601688862, "num_tokens": 336772706.0, "step": 305 }, { "epoch": 0.4405755906910641, "grad_norm": 0.2770013355984933, "learning_rate": 1.3215714488492121e-05, "loss": 0.535, "mean_token_accuracy": 0.8355906143784523, "num_tokens": 342302933.0, "step": 310 }, { "epoch": 0.44768164860543613, "grad_norm": 0.2774944414170579, "learning_rate": 1.3014234819118846e-05, "loss": 0.533, "mean_token_accuracy": 0.8360061995685101, "num_tokens": 347832122.0, "step": 315 }, { "epoch": 0.45478770651980815, "grad_norm": 0.2822504430152847, "learning_rate": 1.2811686680797942e-05, "loss": 0.5464, "mean_token_accuracy": 0.8318669557571411, "num_tokens": 353357495.0, "step": 320 }, { "epoch": 0.4618937644341801, "grad_norm": 0.25912651097009326, "learning_rate": 1.2608177517008268e-05, "loss": 0.5374, "mean_token_accuracy": 0.8345673441886902, "num_tokens": 358876427.0, "step": 325 }, { "epoch": 0.46899982234855214, "grad_norm": 0.2899391004734366, "learning_rate": 1.240381528101327e-05, "loss": 0.5315, "mean_token_accuracy": 0.8358384124934674, "num_tokens": 364395480.0, "step": 330 }, { "epoch": 0.47610588026292416, "grad_norm": 0.2885199318810423, "learning_rate": 1.2198708378596198e-05, "loss": 0.5276, "mean_token_accuracy": 0.8363832160830498, "num_tokens": 369904006.0, "step": 335 }, { "epoch": 0.48321193817729613, "grad_norm": 0.2803340014077296, "learning_rate": 1.19929656105553e-05, "loss": 0.5313, "mean_token_accuracy": 0.8354136534035206, "num_tokens": 375415817.0, "step": 340 }, { "epoch": 0.49031799609166815, "grad_norm": 0.2841127075165474, "learning_rate": 1.1786696114989455e-05, "loss": 0.5324, "mean_token_accuracy": 0.8360840916633606, "num_tokens": 380931196.0, "step": 345 }, { "epoch": 0.4974240540060402, "grad_norm": 0.27070169414461726, "learning_rate": 1.1580009309404887e-05, "loss": 0.5339, "mean_token_accuracy": 0.8355580635368824, "num_tokens": 386462024.0, "step": 350 }, { "epoch": 0.5045301119204122, "grad_norm": 0.27595636054491235, "learning_rate": 1.1373014832673661e-05, "loss": 0.5363, "mean_token_accuracy": 0.8354052670300007, "num_tokens": 391971484.0, "step": 355 }, { "epoch": 0.5116361698347841, "grad_norm": 0.2526139742391355, "learning_rate": 1.1165822486874773e-05, "loss": 0.5288, "mean_token_accuracy": 0.8361794017255306, "num_tokens": 397503191.0, "step": 360 }, { "epoch": 0.5187422277491561, "grad_norm": 0.28153871306210954, "learning_rate": 1.0958542179048637e-05, "loss": 0.5307, "mean_token_accuracy": 0.8368273265659809, "num_tokens": 403001146.0, "step": 365 }, { "epoch": 0.5258482856635281, "grad_norm": 0.3145297412780038, "learning_rate": 1.0751283862895914e-05, "loss": 0.5422, "mean_token_accuracy": 0.8336515955626964, "num_tokens": 408526858.0, "step": 370 }, { "epoch": 0.5329543435779002, "grad_norm": 0.28928430258415916, "learning_rate": 1.0544157480451586e-05, "loss": 0.5402, "mean_token_accuracy": 0.833558625727892, "num_tokens": 414051639.0, "step": 375 }, { "epoch": 0.5400604014922722, "grad_norm": 0.26142431075193084, "learning_rate": 1.033727290376522e-05, "loss": 0.542, "mean_token_accuracy": 0.8336416870355606, "num_tokens": 419571564.0, "step": 380 }, { "epoch": 0.5471664594066442, "grad_norm": 0.31125146975357, "learning_rate": 1.013073987661834e-05, "loss": 0.54, "mean_token_accuracy": 0.8338716469705105, "num_tokens": 425096738.0, "step": 385 }, { "epoch": 0.5542725173210161, "grad_norm": 0.2668746904615073, "learning_rate": 9.924667956309862e-06, "loss": 0.5312, "mean_token_accuracy": 0.836452516913414, "num_tokens": 430620174.0, "step": 390 }, { "epoch": 0.5613785752353881, "grad_norm": 0.40741674797587885, "learning_rate": 9.719166455540437e-06, "loss": 0.5374, "mean_token_accuracy": 0.8344319149851799, "num_tokens": 436140181.0, "step": 395 }, { "epoch": 0.5684846331497602, "grad_norm": 0.27348406881195864, "learning_rate": 9.51434438442655e-06, "loss": 0.5348, "mean_token_accuracy": 0.8360598988831043, "num_tokens": 441662675.0, "step": 400 }, { "epoch": 0.5684846331497602, "eval_loss": 0.5157341361045837, "eval_mean_token_accuracy": 0.8361675456950539, "eval_num_tokens": 441662675.0, "eval_runtime": 161.4058, "eval_samples_per_second": 22.546, "eval_steps_per_second": 0.706, "step": 400 }, { "epoch": 0.5755906910641322, "grad_norm": 0.27295040196213805, "learning_rate": 9.310310392675132e-06, "loss": 0.5232, "mean_token_accuracy": 0.8382402293384075, "num_tokens": 447154493.0, "step": 405 }, { "epoch": 0.5826967489785042, "grad_norm": 0.26519161043459566, "learning_rate": 9.107172711949324e-06, "loss": 0.5382, "mean_token_accuracy": 0.8342290692031383, "num_tokens": 452684121.0, "step": 410 }, { "epoch": 0.5898028068928762, "grad_norm": 0.2629255148134165, "learning_rate": 8.905039098456049e-06, "loss": 0.53, "mean_token_accuracy": 0.835949394851923, "num_tokens": 458209573.0, "step": 415 }, { "epoch": 0.5969088648072481, "grad_norm": 0.2762404944964492, "learning_rate": 8.704016775785742e-06, "loss": 0.5345, "mean_token_accuracy": 0.8355151884257793, "num_tokens": 463743007.0, "step": 420 }, { "epoch": 0.6040149227216202, "grad_norm": 0.3862082589618725, "learning_rate": 8.50421237803464e-06, "loss": 0.529, "mean_token_accuracy": 0.8362281493842602, "num_tokens": 469259672.0, "step": 425 }, { "epoch": 0.6111209806359922, "grad_norm": 0.29912793555607287, "learning_rate": 8.30573189323978e-06, "loss": 0.523, "mean_token_accuracy": 0.8389238156378269, "num_tokens": 474761309.0, "step": 430 }, { "epoch": 0.6182270385503642, "grad_norm": 0.28697574980998686, "learning_rate": 8.108680607156669e-06, "loss": 0.5373, "mean_token_accuracy": 0.8346886426210404, "num_tokens": 480283239.0, "step": 435 }, { "epoch": 0.6253330964647362, "grad_norm": 0.2981386592415112, "learning_rate": 7.913163047409533e-06, "loss": 0.5299, "mean_token_accuracy": 0.8359711997210979, "num_tokens": 485803053.0, "step": 440 }, { "epoch": 0.6324391543791081, "grad_norm": 0.2811128507955913, "learning_rate": 7.719282928043688e-06, "loss": 0.5312, "mean_token_accuracy": 0.8358238264918327, "num_tokens": 491312112.0, "step": 445 }, { "epoch": 0.6395452122934802, "grad_norm": 0.2538506418412949, "learning_rate": 7.527143094509492e-06, "loss": 0.5292, "mean_token_accuracy": 0.8370420016348362, "num_tokens": 496828932.0, "step": 450 }, { "epoch": 0.6466512702078522, "grad_norm": 0.26664943478343317, "learning_rate": 7.336845469107061e-06, "loss": 0.5295, "mean_token_accuracy": 0.8362418174743652, "num_tokens": 502329483.0, "step": 455 }, { "epoch": 0.6537573281222242, "grad_norm": 0.2822285091829132, "learning_rate": 7.148490996920661e-06, "loss": 0.5315, "mean_token_accuracy": 0.8359686724841595, "num_tokens": 507853922.0, "step": 460 }, { "epoch": 0.6608633860365962, "grad_norm": 0.26891344059858413, "learning_rate": 6.9621795922714805e-06, "loss": 0.5283, "mean_token_accuracy": 0.837210227549076, "num_tokens": 513366033.0, "step": 465 }, { "epoch": 0.6679694439509682, "grad_norm": 0.27887938299077314, "learning_rate": 6.778010085717202e-06, "loss": 0.5275, "mean_token_accuracy": 0.8373380437493324, "num_tokens": 518888798.0, "step": 470 }, { "epoch": 0.6750755018653402, "grad_norm": 0.25953435317326623, "learning_rate": 6.596080171626409e-06, "loss": 0.5307, "mean_token_accuracy": 0.8357150256633759, "num_tokens": 524392285.0, "step": 475 }, { "epoch": 0.6821815597797122, "grad_norm": 0.2665111343594617, "learning_rate": 6.416486356355769e-06, "loss": 0.5365, "mean_token_accuracy": 0.8344089902937413, "num_tokens": 529904641.0, "step": 480 }, { "epoch": 0.6892876176940842, "grad_norm": 0.26605568148767894, "learning_rate": 6.239323907057342e-06, "loss": 0.5334, "mean_token_accuracy": 0.8357222154736519, "num_tokens": 535417362.0, "step": 485 }, { "epoch": 0.6963936756084562, "grad_norm": 0.2636227512736411, "learning_rate": 6.064686801143271e-06, "loss": 0.5166, "mean_token_accuracy": 0.8401257589459419, "num_tokens": 540908836.0, "step": 490 }, { "epoch": 0.7034997335228282, "grad_norm": 0.26826679232229517, "learning_rate": 5.892667676434633e-06, "loss": 0.5238, "mean_token_accuracy": 0.8382793001830577, "num_tokens": 546444796.0, "step": 495 }, { "epoch": 0.7106057914372003, "grad_norm": 0.3599829471194239, "learning_rate": 5.723357782020867e-06, "loss": 0.5225, "mean_token_accuracy": 0.838120236247778, "num_tokens": 551952947.0, "step": 500 }, { "epoch": 0.7106057914372003, "eval_loss": 0.5095566511154175, "eval_mean_token_accuracy": 0.8375082057819032, "eval_num_tokens": 551952947.0, "eval_runtime": 161.2359, "eval_samples_per_second": 22.569, "eval_steps_per_second": 0.707, "step": 500 }, { "epoch": 0.7177118493515722, "grad_norm": 0.26704698034936375, "learning_rate": 5.556846929855857e-06, "loss": 0.5203, "mean_token_accuracy": 0.8389924250543117, "num_tokens": 557467246.0, "step": 505 }, { "epoch": 0.7248179072659442, "grad_norm": 0.26539658794683185, "learning_rate": 5.393223447116409e-06, "loss": 0.5344, "mean_token_accuracy": 0.8358133606612682, "num_tokens": 562988077.0, "step": 510 }, { "epoch": 0.7319239651803162, "grad_norm": 0.24829603150024088, "learning_rate": 5.232574129348278e-06, "loss": 0.5235, "mean_token_accuracy": 0.8386510953307151, "num_tokens": 568506583.0, "step": 515 }, { "epoch": 0.7390300230946882, "grad_norm": 0.2605719227696348, "learning_rate": 5.0749841944247e-06, "loss": 0.5338, "mean_token_accuracy": 0.83480354398489, "num_tokens": 574039672.0, "step": 520 }, { "epoch": 0.7461360810090603, "grad_norm": 0.26778891488350126, "learning_rate": 4.92053723734182e-06, "loss": 0.5314, "mean_token_accuracy": 0.8356986902654171, "num_tokens": 579544417.0, "step": 525 }, { "epoch": 0.7532421389234323, "grad_norm": 0.42601785989143315, "learning_rate": 4.769315185874951e-06, "loss": 0.5269, "mean_token_accuracy": 0.8374913208186626, "num_tokens": 585066703.0, "step": 530 }, { "epoch": 0.7603481968378042, "grad_norm": 0.2541554106960908, "learning_rate": 4.621398257119266e-06, "loss": 0.5262, "mean_token_accuracy": 0.837667242437601, "num_tokens": 590596017.0, "step": 535 }, { "epoch": 0.7674542547521762, "grad_norm": 0.2695567911282263, "learning_rate": 4.476864914937923e-06, "loss": 0.5195, "mean_token_accuracy": 0.8392265714704991, "num_tokens": 596113106.0, "step": 540 }, { "epoch": 0.7745603126665482, "grad_norm": 0.25052432871031344, "learning_rate": 4.335791828340183e-06, "loss": 0.5296, "mean_token_accuracy": 0.837313498556614, "num_tokens": 601659151.0, "step": 545 }, { "epoch": 0.7816663705809203, "grad_norm": 0.26020461836934194, "learning_rate": 4.1982538308116775e-06, "loss": 0.5251, "mean_token_accuracy": 0.8361369468271732, "num_tokens": 607175134.0, "step": 550 }, { "epoch": 0.7887724284952923, "grad_norm": 0.24684211829794916, "learning_rate": 4.064323880618279e-06, "loss": 0.5274, "mean_token_accuracy": 0.8380102440714836, "num_tokens": 612699036.0, "step": 555 }, { "epoch": 0.7958784864096642, "grad_norm": 0.2745271550137196, "learning_rate": 3.934073022104759e-06, "loss": 0.5233, "mean_token_accuracy": 0.8380540162324905, "num_tokens": 618232472.0, "step": 560 }, { "epoch": 0.8029845443240362, "grad_norm": 0.2695298031436579, "learning_rate": 3.807570348008672e-06, "loss": 0.5243, "mean_token_accuracy": 0.8377192810177803, "num_tokens": 623766598.0, "step": 565 }, { "epoch": 0.8100906022384082, "grad_norm": 0.2646862071287686, "learning_rate": 3.684882962809484e-06, "loss": 0.5304, "mean_token_accuracy": 0.8363631062209607, "num_tokens": 629297697.0, "step": 570 }, { "epoch": 0.8171966601527803, "grad_norm": 0.23795317969691135, "learning_rate": 3.5660759471324037e-06, "loss": 0.5287, "mean_token_accuracy": 0.836943382024765, "num_tokens": 634808630.0, "step": 575 }, { "epoch": 0.8243027180671523, "grad_norm": 0.25754712205145064, "learning_rate": 3.451212323225786e-06, "loss": 0.52, "mean_token_accuracy": 0.8387916676700116, "num_tokens": 640315396.0, "step": 580 }, { "epoch": 0.8314087759815243, "grad_norm": 0.24240514105833283, "learning_rate": 3.340353021530409e-06, "loss": 0.5214, "mean_token_accuracy": 0.8374654315412045, "num_tokens": 645841425.0, "step": 585 }, { "epoch": 0.8385148338958962, "grad_norm": 0.26461850588888575, "learning_rate": 3.2335568483583708e-06, "loss": 0.516, "mean_token_accuracy": 0.8414874106645585, "num_tokens": 651349089.0, "step": 590 }, { "epoch": 0.8456208918102682, "grad_norm": 0.25434414665474275, "learning_rate": 3.1308804546987615e-06, "loss": 0.5305, "mean_token_accuracy": 0.8366403698921203, "num_tokens": 656881938.0, "step": 595 }, { "epoch": 0.8527269497246402, "grad_norm": 0.25710193414257565, "learning_rate": 3.0323783061666307e-06, "loss": 0.5218, "mean_token_accuracy": 0.8387804657220841, "num_tokens": 662399294.0, "step": 600 }, { "epoch": 0.8527269497246402, "eval_loss": 0.5056362748146057, "eval_mean_token_accuracy": 0.8385671649062842, "eval_num_tokens": 662399294.0, "eval_runtime": 161.5872, "eval_samples_per_second": 22.52, "eval_steps_per_second": 0.706, "step": 600 }, { "epoch": 0.8598330076390123, "grad_norm": 0.24729771911584983, "learning_rate": 2.9381026541112145e-06, "loss": 0.5237, "mean_token_accuracy": 0.8380155004560947, "num_tokens": 667931430.0, "step": 605 }, { "epoch": 0.8669390655533843, "grad_norm": 0.25660228503672733, "learning_rate": 2.848103507898745e-06, "loss": 0.5269, "mean_token_accuracy": 0.8368470750749111, "num_tokens": 673461286.0, "step": 610 }, { "epoch": 0.8740451234677563, "grad_norm": 0.2856740804187853, "learning_rate": 2.7624286083845187e-06, "loss": 0.5218, "mean_token_accuracy": 0.8375120624899864, "num_tokens": 678984176.0, "step": 615 }, { "epoch": 0.8811511813821282, "grad_norm": 0.2678064141268326, "learning_rate": 2.6811234025883457e-06, "loss": 0.5172, "mean_token_accuracy": 0.839438085258007, "num_tokens": 684519341.0, "step": 620 }, { "epoch": 0.8882572392965002, "grad_norm": 0.26981981962318713, "learning_rate": 2.604231019586761e-06, "loss": 0.5207, "mean_token_accuracy": 0.839027612656355, "num_tokens": 690037943.0, "step": 625 }, { "epoch": 0.8953632972108723, "grad_norm": 0.2653800927885472, "learning_rate": 2.5317922476348194e-06, "loss": 0.523, "mean_token_accuracy": 0.8382897555828095, "num_tokens": 695563755.0, "step": 630 }, { "epoch": 0.9024693551252443, "grad_norm": 0.2909130920485432, "learning_rate": 2.4638455125296043e-06, "loss": 0.5246, "mean_token_accuracy": 0.837975486367941, "num_tokens": 701088593.0, "step": 635 }, { "epoch": 0.9095754130396163, "grad_norm": 0.2538310488730525, "learning_rate": 2.400426857226914e-06, "loss": 0.5181, "mean_token_accuracy": 0.8393405571579933, "num_tokens": 706621161.0, "step": 640 }, { "epoch": 0.9166814709539882, "grad_norm": 0.2569434114852769, "learning_rate": 2.3415699227219517e-06, "loss": 0.5295, "mean_token_accuracy": 0.8363286212086678, "num_tokens": 712135507.0, "step": 645 }, { "epoch": 0.9237875288683602, "grad_norm": 0.26172097069920536, "learning_rate": 2.2873059302041627e-06, "loss": 0.52, "mean_token_accuracy": 0.8396071724593639, "num_tokens": 717654276.0, "step": 650 }, { "epoch": 0.9308935867827323, "grad_norm": 0.26549742257601705, "learning_rate": 2.2376636644956656e-06, "loss": 0.5175, "mean_token_accuracy": 0.8391963638365268, "num_tokens": 723171964.0, "step": 655 }, { "epoch": 0.9379996446971043, "grad_norm": 0.2556259295868791, "learning_rate": 2.192669458782096e-06, "loss": 0.5255, "mean_token_accuracy": 0.8376853354275227, "num_tokens": 728678520.0, "step": 660 }, { "epoch": 0.9451057026114763, "grad_norm": 0.2511199627055941, "learning_rate": 2.1523471806439205e-06, "loss": 0.5344, "mean_token_accuracy": 0.8351106189191342, "num_tokens": 734210332.0, "step": 665 }, { "epoch": 0.9522117605258483, "grad_norm": 0.2744720323508251, "learning_rate": 2.1167182193956738e-06, "loss": 0.5156, "mean_token_accuracy": 0.8407034426927567, "num_tokens": 739702586.0, "step": 670 }, { "epoch": 0.9593178184402202, "grad_norm": 0.2996101632719976, "learning_rate": 2.0858014747397952e-06, "loss": 0.5245, "mean_token_accuracy": 0.8386490792036057, "num_tokens": 745219812.0, "step": 675 }, { "epoch": 0.9664238763545923, "grad_norm": 0.2615861429363619, "learning_rate": 2.0596133467411213e-06, "loss": 0.5175, "mean_token_accuracy": 0.8394487954676151, "num_tokens": 750717984.0, "step": 680 }, { "epoch": 0.9735299342689643, "grad_norm": 0.2670223147061545, "learning_rate": 2.0381677271273177e-06, "loss": 0.5209, "mean_token_accuracy": 0.8379806369543076, "num_tokens": 756252357.0, "step": 685 }, { "epoch": 0.9806359921833363, "grad_norm": 0.260083835300812, "learning_rate": 2.0214759919198904e-06, "loss": 0.5149, "mean_token_accuracy": 0.839554088562727, "num_tokens": 761750059.0, "step": 690 }, { "epoch": 0.9877420500977083, "grad_norm": 0.3230971032122484, "learning_rate": 2.0095469953996724e-06, "loss": 0.5244, "mean_token_accuracy": 0.8375246554613114, "num_tokens": 767275567.0, "step": 695 }, { "epoch": 0.9948481080120803, "grad_norm": 0.26793724105887723, "learning_rate": 2.002387065409989e-06, "loss": 0.527, "mean_token_accuracy": 0.8372870542109012, "num_tokens": 772815373.0, "step": 700 }, { "epoch": 0.9948481080120803, "eval_loss": 0.5037879943847656, "eval_mean_token_accuracy": 0.8392795601434875, "eval_num_tokens": 772815373.0, "eval_runtime": 162.0902, "eval_samples_per_second": 22.45, "eval_steps_per_second": 0.703, "step": 700 }, { "epoch": 1.0, "mean_token_accuracy": 0.8380202034424091, "num_tokens": 776817701.0, "step": 704, "total_flos": 6035679775555584.0, "train_loss": 0.6108774590221319, "train_runtime": 28363.9285, "train_samples_per_second": 12.701, "train_steps_per_second": 0.025 } ], "logging_steps": 5, "max_steps": 704, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6035679775555584.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }