{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 1408, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007109215320359016, "grad_norm": 22.657577985866105, "learning_rate": 9.302325581395349e-06, "loss": 2.574, "mean_token_accuracy": 0.5464246176183224, "num_tokens": 4589382.0, "step": 5 }, { "epoch": 0.014218430640718031, "grad_norm": 2.3543289370923013, "learning_rate": 2.0930232558139536e-05, "loss": 1.4882, "mean_token_accuracy": 0.6589333653450012, "num_tokens": 9171524.0, "step": 10 }, { "epoch": 0.021327645961077047, "grad_norm": 0.8063547574982903, "learning_rate": 3.2558139534883724e-05, "loss": 1.0174, "mean_token_accuracy": 0.7330243036150932, "num_tokens": 13765157.0, "step": 15 }, { "epoch": 0.028436861281436063, "grad_norm": 0.572573905518242, "learning_rate": 4.418604651162791e-05, "loss": 0.8773, "mean_token_accuracy": 0.7569610200822353, "num_tokens": 18369874.0, "step": 20 }, { "epoch": 0.035546076601795075, "grad_norm": 0.5738482260117446, "learning_rate": 5.5813953488372095e-05, "loss": 0.7975, "mean_token_accuracy": 0.7729738861322403, "num_tokens": 22960290.0, "step": 25 }, { "epoch": 0.042655291922154094, "grad_norm": 0.5016568944917689, "learning_rate": 6.744186046511628e-05, "loss": 0.7632, "mean_token_accuracy": 0.778630904853344, "num_tokens": 27556623.0, "step": 30 }, { "epoch": 0.049764507242513106, "grad_norm": 0.4845613474361907, "learning_rate": 7.906976744186047e-05, "loss": 0.7326, "mean_token_accuracy": 0.7872321248054505, "num_tokens": 32158408.0, "step": 35 }, { "epoch": 0.056873722562872125, "grad_norm": 0.4270154516127363, "learning_rate": 9.069767441860465e-05, "loss": 0.7095, "mean_token_accuracy": 0.7919960044324398, "num_tokens": 36742233.0, "step": 40 }, { "epoch": 0.06398293788323114, "grad_norm": 0.499498695141066, "learning_rate": 9.9999880816326e-05, "loss": 0.6973, "mean_token_accuracy": 0.7952379912137986, "num_tokens": 41335670.0, "step": 45 }, { "epoch": 0.07109215320359015, "grad_norm": 0.4645180201543763, "learning_rate": 9.999570945402425e-05, "loss": 0.6853, "mean_token_accuracy": 0.7981184311211109, "num_tokens": 45940079.0, "step": 50 }, { "epoch": 0.07820136852394917, "grad_norm": 0.434255531179794, "learning_rate": 9.998557953932929e-05, "loss": 0.6688, "mean_token_accuracy": 0.8012012615799904, "num_tokens": 50533771.0, "step": 55 }, { "epoch": 0.08531058384430819, "grad_norm": 0.393754634337621, "learning_rate": 9.99694924136941e-05, "loss": 0.6725, "mean_token_accuracy": 0.800255061686039, "num_tokens": 55133444.0, "step": 60 }, { "epoch": 0.0924197991646672, "grad_norm": 0.49718727212066355, "learning_rate": 9.99474502074547e-05, "loss": 0.6664, "mean_token_accuracy": 0.801218880712986, "num_tokens": 59726447.0, "step": 65 }, { "epoch": 0.09952901448502621, "grad_norm": 0.4005142024066312, "learning_rate": 9.991945583954808e-05, "loss": 0.6549, "mean_token_accuracy": 0.8056452445685863, "num_tokens": 64319917.0, "step": 70 }, { "epoch": 0.10663822980538523, "grad_norm": 0.3774090383980249, "learning_rate": 9.988551301712567e-05, "loss": 0.6454, "mean_token_accuracy": 0.806719920784235, "num_tokens": 68898868.0, "step": 75 }, { "epoch": 0.11374744512574425, "grad_norm": 0.3995895890256704, "learning_rate": 9.984562623506235e-05, "loss": 0.6464, "mean_token_accuracy": 0.8064703330397606, "num_tokens": 73481972.0, "step": 80 }, { "epoch": 0.12085666044610326, "grad_norm": 0.3801619159341505, "learning_rate": 9.979980077536136e-05, "loss": 0.6462, "mean_token_accuracy": 0.8080633491277694, "num_tokens": 78079419.0, "step": 85 }, { "epoch": 0.1279658757664623, "grad_norm": 0.37074794226689833, "learning_rate": 9.974804270645462e-05, "loss": 0.6362, "mean_token_accuracy": 0.8091117829084397, "num_tokens": 82670195.0, "step": 90 }, { "epoch": 0.13507509108682128, "grad_norm": 0.37193721608812236, "learning_rate": 9.969035888239937e-05, "loss": 0.635, "mean_token_accuracy": 0.8079991653561592, "num_tokens": 87257953.0, "step": 95 }, { "epoch": 0.1421843064071803, "grad_norm": 0.36251703620037773, "learning_rate": 9.96267569419703e-05, "loss": 0.6315, "mean_token_accuracy": 0.8096475720405578, "num_tokens": 91838382.0, "step": 100 }, { "epoch": 0.1421843064071803, "eval_loss": 0.5971412062644958, "eval_mean_token_accuracy": 0.8093206621052926, "eval_num_tokens": 91838382.0, "eval_runtime": 141.8153, "eval_samples_per_second": 25.653, "eval_steps_per_second": 0.804, "step": 100 }, { "epoch": 0.14929352172753932, "grad_norm": 0.41583625971563776, "learning_rate": 9.955724530764809e-05, "loss": 0.6381, "mean_token_accuracy": 0.8077230393886566, "num_tokens": 96431755.0, "step": 105 }, { "epoch": 0.15640273704789834, "grad_norm": 0.3705693803444073, "learning_rate": 9.948183318450413e-05, "loss": 0.6197, "mean_token_accuracy": 0.8116156131029129, "num_tokens": 101027690.0, "step": 110 }, { "epoch": 0.16351195236825736, "grad_norm": 0.3214510651452395, "learning_rate": 9.940053055898133e-05, "loss": 0.6313, "mean_token_accuracy": 0.8089181430637836, "num_tokens": 105628547.0, "step": 115 }, { "epoch": 0.17062116768861638, "grad_norm": 0.34220731720085373, "learning_rate": 9.93133481975719e-05, "loss": 0.6077, "mean_token_accuracy": 0.814984206855297, "num_tokens": 110243592.0, "step": 120 }, { "epoch": 0.1777303830089754, "grad_norm": 0.35675802487560043, "learning_rate": 9.922029764539148e-05, "loss": 0.6263, "mean_token_accuracy": 0.8096928559243679, "num_tokens": 114832845.0, "step": 125 }, { "epoch": 0.1848395983293344, "grad_norm": 0.3422296936678833, "learning_rate": 9.912139122465027e-05, "loss": 0.6116, "mean_token_accuracy": 0.8140982151031494, "num_tokens": 119435421.0, "step": 130 }, { "epoch": 0.1919488136496934, "grad_norm": 0.3599918244922273, "learning_rate": 9.901664203302126e-05, "loss": 0.6052, "mean_token_accuracy": 0.8154805108904839, "num_tokens": 124028647.0, "step": 135 }, { "epoch": 0.19905802897005243, "grad_norm": 0.3595154303423279, "learning_rate": 9.890606394190588e-05, "loss": 0.6126, "mean_token_accuracy": 0.8132404424250126, "num_tokens": 128628413.0, "step": 140 }, { "epoch": 0.20616724429041144, "grad_norm": 0.3711012466200944, "learning_rate": 9.878967159459693e-05, "loss": 0.6068, "mean_token_accuracy": 0.8164977565407753, "num_tokens": 133219422.0, "step": 145 }, { "epoch": 0.21327645961077046, "grad_norm": 0.35910926284617867, "learning_rate": 9.866748040433956e-05, "loss": 0.6099, "mean_token_accuracy": 0.8152773261070252, "num_tokens": 137825952.0, "step": 150 }, { "epoch": 0.22038567493112948, "grad_norm": 0.4205439208166243, "learning_rate": 9.853950655229009e-05, "loss": 0.6064, "mean_token_accuracy": 0.815191026777029, "num_tokens": 142422368.0, "step": 155 }, { "epoch": 0.2274948902514885, "grad_norm": 0.32091150374802263, "learning_rate": 9.840576698537329e-05, "loss": 0.6093, "mean_token_accuracy": 0.8135301224887371, "num_tokens": 147015990.0, "step": 160 }, { "epoch": 0.23460410557184752, "grad_norm": 0.32627028158119226, "learning_rate": 9.826627941403811e-05, "loss": 0.5969, "mean_token_accuracy": 0.8182829037308693, "num_tokens": 151627096.0, "step": 165 }, { "epoch": 0.2417133208922065, "grad_norm": 0.32405674248273, "learning_rate": 9.812106230991248e-05, "loss": 0.6068, "mean_token_accuracy": 0.8159149341285229, "num_tokens": 156218968.0, "step": 170 }, { "epoch": 0.24882253621256553, "grad_norm": 0.3206982540127891, "learning_rate": 9.79701349033571e-05, "loss": 0.6039, "mean_token_accuracy": 0.8161494679749012, "num_tokens": 160797401.0, "step": 175 }, { "epoch": 0.2559317515329246, "grad_norm": 0.3360732448004463, "learning_rate": 9.78135171809189e-05, "loss": 0.6068, "mean_token_accuracy": 0.8159954428672791, "num_tokens": 165402684.0, "step": 180 }, { "epoch": 0.26304096685328354, "grad_norm": 0.33789233259366286, "learning_rate": 9.76512298826844e-05, "loss": 0.6026, "mean_token_accuracy": 0.8167447924613953, "num_tokens": 169997282.0, "step": 185 }, { "epoch": 0.27015018217364256, "grad_norm": 0.3089560668153988, "learning_rate": 9.748329449953302e-05, "loss": 0.5904, "mean_token_accuracy": 0.8193566597998142, "num_tokens": 174589836.0, "step": 190 }, { "epoch": 0.2772593974940016, "grad_norm": 0.32060053414915524, "learning_rate": 9.73097332702914e-05, "loss": 0.6044, "mean_token_accuracy": 0.8175870932638645, "num_tokens": 179181747.0, "step": 195 }, { "epoch": 0.2843686128143606, "grad_norm": 0.32004664048912745, "learning_rate": 9.713056917878818e-05, "loss": 0.5888, "mean_token_accuracy": 0.8192018747329712, "num_tokens": 183760367.0, "step": 200 }, { "epoch": 0.2843686128143606, "eval_loss": 0.5599971413612366, "eval_mean_token_accuracy": 0.8188303473748659, "eval_num_tokens": 183760367.0, "eval_runtime": 145.8536, "eval_samples_per_second": 24.943, "eval_steps_per_second": 0.782, "step": 200 }, { "epoch": 0.2914778281347196, "grad_norm": 0.3094551492752116, "learning_rate": 9.694582595081057e-05, "loss": 0.5872, "mean_token_accuracy": 0.819921114295721, "num_tokens": 188360903.0, "step": 205 }, { "epoch": 0.29858704345507864, "grad_norm": 0.36254147904822126, "learning_rate": 9.67555280509623e-05, "loss": 0.5942, "mean_token_accuracy": 0.817745155096054, "num_tokens": 192932381.0, "step": 210 }, { "epoch": 0.30569625877543766, "grad_norm": 0.3377909564779145, "learning_rate": 9.655970067942405e-05, "loss": 0.5994, "mean_token_accuracy": 0.8163805276155471, "num_tokens": 197505985.0, "step": 215 }, { "epoch": 0.3128054740957967, "grad_norm": 0.30751780494672465, "learning_rate": 9.63583697686162e-05, "loss": 0.5902, "mean_token_accuracy": 0.8196643941104412, "num_tokens": 202105424.0, "step": 220 }, { "epoch": 0.3199146894161557, "grad_norm": 0.34345028355301316, "learning_rate": 9.615156197976477e-05, "loss": 0.582, "mean_token_accuracy": 0.8217154465615749, "num_tokens": 206686951.0, "step": 225 }, { "epoch": 0.3270239047365147, "grad_norm": 0.3216135018716631, "learning_rate": 9.593930469937087e-05, "loss": 0.5708, "mean_token_accuracy": 0.8250658005475998, "num_tokens": 211278788.0, "step": 230 }, { "epoch": 0.33413312005687373, "grad_norm": 0.32564659909940696, "learning_rate": 9.572162603558393e-05, "loss": 0.5928, "mean_token_accuracy": 0.819525595754385, "num_tokens": 215877205.0, "step": 235 }, { "epoch": 0.34124233537723275, "grad_norm": 0.4839583335140069, "learning_rate": 9.549855481447954e-05, "loss": 0.5882, "mean_token_accuracy": 0.8204580388963223, "num_tokens": 220486454.0, "step": 240 }, { "epoch": 0.34835155069759177, "grad_norm": 0.3268671171699921, "learning_rate": 9.527012057624224e-05, "loss": 0.5836, "mean_token_accuracy": 0.8208626843988895, "num_tokens": 225080225.0, "step": 245 }, { "epoch": 0.3554607660179508, "grad_norm": 0.3244498327733708, "learning_rate": 9.50363535712535e-05, "loss": 0.586, "mean_token_accuracy": 0.8207595020532608, "num_tokens": 229657012.0, "step": 250 }, { "epoch": 0.3625699813383098, "grad_norm": 0.29889265357291406, "learning_rate": 9.479728475608593e-05, "loss": 0.5919, "mean_token_accuracy": 0.8190862230956555, "num_tokens": 234248976.0, "step": 255 }, { "epoch": 0.3696791966586688, "grad_norm": 0.34636883393423384, "learning_rate": 9.455294578940384e-05, "loss": 0.5765, "mean_token_accuracy": 0.8226364821195602, "num_tokens": 238829734.0, "step": 260 }, { "epoch": 0.3767884119790278, "grad_norm": 0.3092592234408446, "learning_rate": 9.430336902777083e-05, "loss": 0.576, "mean_token_accuracy": 0.821333235502243, "num_tokens": 243418989.0, "step": 265 }, { "epoch": 0.3838976272993868, "grad_norm": 0.30454136223380207, "learning_rate": 9.404858752136499e-05, "loss": 0.5771, "mean_token_accuracy": 0.8237294301390647, "num_tokens": 248015701.0, "step": 270 }, { "epoch": 0.39100684261974583, "grad_norm": 0.30289215095264577, "learning_rate": 9.378863500960222e-05, "loss": 0.5709, "mean_token_accuracy": 0.8236084163188935, "num_tokens": 252613191.0, "step": 275 }, { "epoch": 0.39811605794010485, "grad_norm": 0.3010273864601919, "learning_rate": 9.352354591666827e-05, "loss": 0.5861, "mean_token_accuracy": 0.820894256979227, "num_tokens": 257210808.0, "step": 280 }, { "epoch": 0.40522527326046387, "grad_norm": 0.30175911100812025, "learning_rate": 9.325335534696017e-05, "loss": 0.5753, "mean_token_accuracy": 0.8225005254149437, "num_tokens": 261790131.0, "step": 285 }, { "epoch": 0.4123344885808229, "grad_norm": 0.28871941798325856, "learning_rate": 9.29780990804375e-05, "loss": 0.5799, "mean_token_accuracy": 0.821347926557064, "num_tokens": 266377324.0, "step": 290 }, { "epoch": 0.4194437039011819, "grad_norm": 0.28095014086273895, "learning_rate": 9.269781356788424e-05, "loss": 0.581, "mean_token_accuracy": 0.8209108576178551, "num_tokens": 270967910.0, "step": 295 }, { "epoch": 0.4265529192215409, "grad_norm": 0.2893211807696515, "learning_rate": 9.241253592608183e-05, "loss": 0.5755, "mean_token_accuracy": 0.8242007777094841, "num_tokens": 275570273.0, "step": 300 }, { "epoch": 0.4265529192215409, "eval_loss": 0.5416839122772217, "eval_mean_token_accuracy": 0.8231211885025627, "eval_num_tokens": 275570273.0, "eval_runtime": 145.5254, "eval_samples_per_second": 24.999, "eval_steps_per_second": 0.783, "step": 300 }, { "epoch": 0.43366213454189995, "grad_norm": 0.30733885282429685, "learning_rate": 9.212230393289385e-05, "loss": 0.5781, "mean_token_accuracy": 0.8230207331478596, "num_tokens": 280172533.0, "step": 305 }, { "epoch": 0.44077134986225897, "grad_norm": 0.2682470819307261, "learning_rate": 9.182715602226341e-05, "loss": 0.5625, "mean_token_accuracy": 0.8270745746791363, "num_tokens": 284763929.0, "step": 310 }, { "epoch": 0.447880565182618, "grad_norm": 0.2962012849994535, "learning_rate": 9.152713127912355e-05, "loss": 0.5848, "mean_token_accuracy": 0.8201167277991772, "num_tokens": 289376903.0, "step": 315 }, { "epoch": 0.454989780502977, "grad_norm": 0.28564514411407316, "learning_rate": 9.12222694342213e-05, "loss": 0.5732, "mean_token_accuracy": 0.8246621482074261, "num_tokens": 293966796.0, "step": 320 }, { "epoch": 0.462098995823336, "grad_norm": 0.30020425973519915, "learning_rate": 9.091261085885646e-05, "loss": 0.5606, "mean_token_accuracy": 0.826822079718113, "num_tokens": 298540346.0, "step": 325 }, { "epoch": 0.46920821114369504, "grad_norm": 0.2887047887642146, "learning_rate": 9.059819655953536e-05, "loss": 0.5738, "mean_token_accuracy": 0.823461939394474, "num_tokens": 303112604.0, "step": 330 }, { "epoch": 0.476317426464054, "grad_norm": 0.3180269352697689, "learning_rate": 9.027906817254063e-05, "loss": 0.5654, "mean_token_accuracy": 0.8256018176674843, "num_tokens": 307694241.0, "step": 335 }, { "epoch": 0.483426641784413, "grad_norm": 0.29567931374872014, "learning_rate": 8.995526795841753e-05, "loss": 0.558, "mean_token_accuracy": 0.8256605207920075, "num_tokens": 312289299.0, "step": 340 }, { "epoch": 0.49053585710477204, "grad_norm": 0.3336504103662035, "learning_rate": 8.962683879637747e-05, "loss": 0.5617, "mean_token_accuracy": 0.8257805988192558, "num_tokens": 316884766.0, "step": 345 }, { "epoch": 0.49764507242513106, "grad_norm": 0.3705167375534613, "learning_rate": 8.929382417861991e-05, "loss": 0.561, "mean_token_accuracy": 0.8267210200428963, "num_tokens": 321461198.0, "step": 350 }, { "epoch": 0.5047542877454901, "grad_norm": 0.2946584460529412, "learning_rate": 8.895626820457283e-05, "loss": 0.557, "mean_token_accuracy": 0.828194110840559, "num_tokens": 326064722.0, "step": 355 }, { "epoch": 0.5118635030658492, "grad_norm": 0.31227448766803945, "learning_rate": 8.861421557505282e-05, "loss": 0.5522, "mean_token_accuracy": 0.8295037761330605, "num_tokens": 330652094.0, "step": 360 }, { "epoch": 0.5189727183862082, "grad_norm": 1.0759474066945163, "learning_rate": 8.826771158634567e-05, "loss": 0.5629, "mean_token_accuracy": 0.8260238766670227, "num_tokens": 335255835.0, "step": 365 }, { "epoch": 0.5260819337065671, "grad_norm": 0.2758992633553522, "learning_rate": 8.791680212420797e-05, "loss": 0.5502, "mean_token_accuracy": 0.828965923935175, "num_tokens": 339843476.0, "step": 370 }, { "epoch": 0.5331911490269261, "grad_norm": 0.29696149610793166, "learning_rate": 8.756153365779066e-05, "loss": 0.5542, "mean_token_accuracy": 0.8278730027377605, "num_tokens": 344420533.0, "step": 375 }, { "epoch": 0.5403003643472851, "grad_norm": 0.284706804181623, "learning_rate": 8.720195323348545e-05, "loss": 0.559, "mean_token_accuracy": 0.8278782211244107, "num_tokens": 349010370.0, "step": 380 }, { "epoch": 0.5474095796676441, "grad_norm": 0.3046957362601185, "learning_rate": 8.68381084686946e-05, "loss": 0.5576, "mean_token_accuracy": 0.8258513130247593, "num_tokens": 353598451.0, "step": 385 }, { "epoch": 0.5545187949880032, "grad_norm": 0.3134773718519533, "learning_rate": 8.647004754552526e-05, "loss": 0.5612, "mean_token_accuracy": 0.8255665130913258, "num_tokens": 358195615.0, "step": 390 }, { "epoch": 0.5616280103083622, "grad_norm": 0.33349640254961, "learning_rate": 8.609781920440891e-05, "loss": 0.552, "mean_token_accuracy": 0.8278413727879524, "num_tokens": 362764034.0, "step": 395 }, { "epoch": 0.5687372256287212, "grad_norm": 0.32034152048464726, "learning_rate": 8.5721472737647e-05, "loss": 0.5534, "mean_token_accuracy": 0.8273369200527668, "num_tokens": 367350265.0, "step": 400 }, { "epoch": 0.5687372256287212, "eval_loss": 0.5274047255516052, "eval_mean_token_accuracy": 0.8264030280866121, "eval_num_tokens": 367350265.0, "eval_runtime": 146.0134, "eval_samples_per_second": 24.916, "eval_steps_per_second": 0.781, "step": 400 }, { "epoch": 0.5758464409490802, "grad_norm": 0.29085093151843905, "learning_rate": 8.534105798288331e-05, "loss": 0.5506, "mean_token_accuracy": 0.830031219124794, "num_tokens": 371939618.0, "step": 405 }, { "epoch": 0.5829556562694392, "grad_norm": 0.27710417408529203, "learning_rate": 8.49566253165043e-05, "loss": 0.5439, "mean_token_accuracy": 0.8304261237382888, "num_tokens": 376519800.0, "step": 410 }, { "epoch": 0.5900648715897983, "grad_norm": 0.2611394917691902, "learning_rate": 8.456822564696789e-05, "loss": 0.5409, "mean_token_accuracy": 0.832954341173172, "num_tokens": 381102299.0, "step": 415 }, { "epoch": 0.5971740869101573, "grad_norm": 0.42771473321829473, "learning_rate": 8.417591040806213e-05, "loss": 0.5504, "mean_token_accuracy": 0.8300940133631229, "num_tokens": 385700779.0, "step": 420 }, { "epoch": 0.6042833022305163, "grad_norm": 0.28194050483515865, "learning_rate": 8.377973155209387e-05, "loss": 0.5553, "mean_token_accuracy": 0.8270630918443203, "num_tokens": 390294365.0, "step": 425 }, { "epoch": 0.6113925175508753, "grad_norm": 0.27563889901609234, "learning_rate": 8.337974154300913e-05, "loss": 0.5427, "mean_token_accuracy": 0.8309814311563969, "num_tokens": 394889149.0, "step": 430 }, { "epoch": 0.6185017328712343, "grad_norm": 0.27875362292884753, "learning_rate": 8.297599334944542e-05, "loss": 0.5561, "mean_token_accuracy": 0.8275676898658275, "num_tokens": 399459807.0, "step": 435 }, { "epoch": 0.6256109481915934, "grad_norm": 0.7336148967265075, "learning_rate": 8.256854043771754e-05, "loss": 0.5507, "mean_token_accuracy": 0.8285100273787975, "num_tokens": 404034333.0, "step": 440 }, { "epoch": 0.6327201635119524, "grad_norm": 0.3259646654441019, "learning_rate": 8.215743676473719e-05, "loss": 0.5503, "mean_token_accuracy": 0.8290993146598339, "num_tokens": 408627270.0, "step": 445 }, { "epoch": 0.6398293788323114, "grad_norm": 0.3012299941832976, "learning_rate": 8.174273677086779e-05, "loss": 0.552, "mean_token_accuracy": 0.8279682919383049, "num_tokens": 413222911.0, "step": 450 }, { "epoch": 0.6469385941526704, "grad_norm": 0.30771992691522176, "learning_rate": 8.132449537271519e-05, "loss": 0.552, "mean_token_accuracy": 0.8296807646751404, "num_tokens": 417806274.0, "step": 455 }, { "epoch": 0.6540478094730294, "grad_norm": 0.2810763807856677, "learning_rate": 8.090276795585531e-05, "loss": 0.5414, "mean_token_accuracy": 0.8314659893512726, "num_tokens": 422401434.0, "step": 460 }, { "epoch": 0.6611570247933884, "grad_norm": 0.2672336811508722, "learning_rate": 8.047761036749985e-05, "loss": 0.5564, "mean_token_accuracy": 0.8265900291502476, "num_tokens": 426986385.0, "step": 465 }, { "epoch": 0.6682662401137475, "grad_norm": 0.25924906311163326, "learning_rate": 8.004907890910055e-05, "loss": 0.5452, "mean_token_accuracy": 0.8297064855694771, "num_tokens": 431585703.0, "step": 470 }, { "epoch": 0.6753754554341065, "grad_norm": 0.2772688573388134, "learning_rate": 7.961723032889358e-05, "loss": 0.5292, "mean_token_accuracy": 0.8346129797399044, "num_tokens": 436150194.0, "step": 475 }, { "epoch": 0.6824846707544655, "grad_norm": 0.25573353155086187, "learning_rate": 7.918212181438467e-05, "loss": 0.5397, "mean_token_accuracy": 0.8314497999846935, "num_tokens": 440736901.0, "step": 480 }, { "epoch": 0.6895938860748245, "grad_norm": 0.2640386419783165, "learning_rate": 7.874381098477599e-05, "loss": 0.5359, "mean_token_accuracy": 0.8328767582774163, "num_tokens": 445334774.0, "step": 485 }, { "epoch": 0.6967031013951835, "grad_norm": 0.2662269663206075, "learning_rate": 7.830235588333597e-05, "loss": 0.5578, "mean_token_accuracy": 0.8268053226172924, "num_tokens": 449908855.0, "step": 490 }, { "epoch": 0.7038123167155426, "grad_norm": 0.2756351015892551, "learning_rate": 7.785781496971297e-05, "loss": 0.5503, "mean_token_accuracy": 0.8284729138016701, "num_tokens": 454513487.0, "step": 495 }, { "epoch": 0.7109215320359016, "grad_norm": 0.4547105928976161, "learning_rate": 7.741024711219366e-05, "loss": 0.5431, "mean_token_accuracy": 0.8298681430518627, "num_tokens": 459106365.0, "step": 500 }, { "epoch": 0.7109215320359016, "eval_loss": 0.5168540477752686, "eval_mean_token_accuracy": 0.8290872861418808, "eval_num_tokens": 459106365.0, "eval_runtime": 146.2066, "eval_samples_per_second": 24.883, "eval_steps_per_second": 0.78, "step": 500 }, { "epoch": 0.7180307473562606, "grad_norm": 1.6021704699780053, "learning_rate": 7.695971157990754e-05, "loss": 0.5646, "mean_token_accuracy": 0.8263038910925389, "num_tokens": 463703240.0, "step": 505 }, { "epoch": 0.7251399626766196, "grad_norm": 4.625968090811763, "learning_rate": 7.650626803497806e-05, "loss": 0.5581, "mean_token_accuracy": 0.8270722553133965, "num_tokens": 468295660.0, "step": 510 }, { "epoch": 0.7322491779969785, "grad_norm": 0.27503115183353516, "learning_rate": 7.604997652462205e-05, "loss": 0.5492, "mean_token_accuracy": 0.8294327199459076, "num_tokens": 472896751.0, "step": 515 }, { "epoch": 0.7393583933173375, "grad_norm": 0.267416722991217, "learning_rate": 7.55908974731978e-05, "loss": 0.5418, "mean_token_accuracy": 0.8326966613531113, "num_tokens": 477480918.0, "step": 520 }, { "epoch": 0.7464676086376966, "grad_norm": 0.25628361203172423, "learning_rate": 7.512909167420347e-05, "loss": 0.5404, "mean_token_accuracy": 0.8324044570326805, "num_tokens": 482064392.0, "step": 525 }, { "epoch": 0.7535768239580556, "grad_norm": 0.24597696845219366, "learning_rate": 7.466462028222654e-05, "loss": 0.5353, "mean_token_accuracy": 0.8331540204584599, "num_tokens": 486649806.0, "step": 530 }, { "epoch": 0.7606860392784146, "grad_norm": 0.2497969231256322, "learning_rate": 7.419754480484536e-05, "loss": 0.5378, "mean_token_accuracy": 0.8323175966739654, "num_tokens": 491217398.0, "step": 535 }, { "epoch": 0.7677952545987736, "grad_norm": 0.27136426093422567, "learning_rate": 7.3727927094484e-05, "loss": 0.5303, "mean_token_accuracy": 0.8346898458898068, "num_tokens": 495798334.0, "step": 540 }, { "epoch": 0.7749044699191326, "grad_norm": 0.263928683082665, "learning_rate": 7.32558293402215e-05, "loss": 0.5193, "mean_token_accuracy": 0.8367893837392331, "num_tokens": 500382331.0, "step": 545 }, { "epoch": 0.7820136852394917, "grad_norm": 0.2697485453052082, "learning_rate": 7.27813140595565e-05, "loss": 0.5249, "mean_token_accuracy": 0.836308328807354, "num_tokens": 504972961.0, "step": 550 }, { "epoch": 0.7891229005598507, "grad_norm": 0.47577994241811294, "learning_rate": 7.23044440901283e-05, "loss": 0.5386, "mean_token_accuracy": 0.832004614919424, "num_tokens": 509556175.0, "step": 555 }, { "epoch": 0.7962321158802097, "grad_norm": 0.26812210950339255, "learning_rate": 7.182528258139563e-05, "loss": 0.5327, "mean_token_accuracy": 0.8331871695816517, "num_tokens": 514159170.0, "step": 560 }, { "epoch": 0.8033413312005687, "grad_norm": 0.2590503131411491, "learning_rate": 7.13438929862741e-05, "loss": 0.5447, "mean_token_accuracy": 0.8303000062704087, "num_tokens": 518758083.0, "step": 565 }, { "epoch": 0.8104505465209277, "grad_norm": 0.2700164600845211, "learning_rate": 7.086033905273344e-05, "loss": 0.5367, "mean_token_accuracy": 0.8323484763503075, "num_tokens": 523345629.0, "step": 570 }, { "epoch": 0.8175597618412868, "grad_norm": 0.26967028018820877, "learning_rate": 7.037468481535567e-05, "loss": 0.5212, "mean_token_accuracy": 0.8371426187455654, "num_tokens": 527940592.0, "step": 575 }, { "epoch": 0.8246689771616458, "grad_norm": 0.3154368910279167, "learning_rate": 6.988699458685537e-05, "loss": 0.5275, "mean_token_accuracy": 0.8351783238351345, "num_tokens": 532516910.0, "step": 580 }, { "epoch": 0.8317781924820048, "grad_norm": 0.26226153440650185, "learning_rate": 6.9397332949563e-05, "loss": 0.5335, "mean_token_accuracy": 0.8329351760447026, "num_tokens": 537121758.0, "step": 585 }, { "epoch": 0.8388874078023638, "grad_norm": 0.31223173870328286, "learning_rate": 6.890576474687263e-05, "loss": 0.5458, "mean_token_accuracy": 0.829648780822754, "num_tokens": 541734519.0, "step": 590 }, { "epoch": 0.8459966231227228, "grad_norm": 0.2565970150956528, "learning_rate": 6.841235507465515e-05, "loss": 0.5415, "mean_token_accuracy": 0.8324811846017838, "num_tokens": 546326546.0, "step": 595 }, { "epoch": 0.8531058384430819, "grad_norm": 0.29462309278409743, "learning_rate": 6.791716927263778e-05, "loss": 0.5354, "mean_token_accuracy": 0.8325764186680317, "num_tokens": 550923667.0, "step": 600 }, { "epoch": 0.8531058384430819, "eval_loss": 0.5030205249786377, "eval_mean_token_accuracy": 0.8328834866222582, "eval_num_tokens": 550923667.0, "eval_runtime": 145.5099, "eval_samples_per_second": 25.002, "eval_steps_per_second": 0.783, "step": 600 }, { "epoch": 0.8602150537634409, "grad_norm": 0.2995740161508053, "learning_rate": 6.742027291575156e-05, "loss": 0.5351, "mean_token_accuracy": 0.8337548352777958, "num_tokens": 555521300.0, "step": 605 }, { "epoch": 0.8673242690837999, "grad_norm": 0.256895454866442, "learning_rate": 6.692173180544768e-05, "loss": 0.527, "mean_token_accuracy": 0.8346491247415543, "num_tokens": 560114622.0, "step": 610 }, { "epoch": 0.8744334844041589, "grad_norm": 0.26124663621839667, "learning_rate": 6.642161196098351e-05, "loss": 0.5299, "mean_token_accuracy": 0.835064522176981, "num_tokens": 564707120.0, "step": 615 }, { "epoch": 0.8815426997245179, "grad_norm": 0.30629789668279445, "learning_rate": 6.591997961068024e-05, "loss": 0.5391, "mean_token_accuracy": 0.8325687229633332, "num_tokens": 569285949.0, "step": 620 }, { "epoch": 0.888651915044877, "grad_norm": 0.2517010032545197, "learning_rate": 6.541690118315245e-05, "loss": 0.528, "mean_token_accuracy": 0.834906804561615, "num_tokens": 573871769.0, "step": 625 }, { "epoch": 0.895761130365236, "grad_norm": 0.3714356282666368, "learning_rate": 6.491244329851133e-05, "loss": 0.521, "mean_token_accuracy": 0.8374850310385227, "num_tokens": 578461250.0, "step": 630 }, { "epoch": 0.902870345685595, "grad_norm": 0.2513550517622928, "learning_rate": 6.440667275954262e-05, "loss": 0.5151, "mean_token_accuracy": 0.8384780243039132, "num_tokens": 583046607.0, "step": 635 }, { "epoch": 0.909979561005954, "grad_norm": 0.2790784344252937, "learning_rate": 6.389965654286011e-05, "loss": 0.5287, "mean_token_accuracy": 0.8349935576319695, "num_tokens": 587648232.0, "step": 640 }, { "epoch": 0.917088776326313, "grad_norm": 0.27767689120972117, "learning_rate": 6.339146179003636e-05, "loss": 0.5207, "mean_token_accuracy": 0.837136809527874, "num_tokens": 592239729.0, "step": 645 }, { "epoch": 0.924197991646672, "grad_norm": 0.2805149976836277, "learning_rate": 6.288215579871148e-05, "loss": 0.5229, "mean_token_accuracy": 0.8374404884874821, "num_tokens": 596831306.0, "step": 650 }, { "epoch": 0.9313072069670311, "grad_norm": 0.24703194529226574, "learning_rate": 6.23718060136812e-05, "loss": 0.5152, "mean_token_accuracy": 0.8385937295854091, "num_tokens": 601427733.0, "step": 655 }, { "epoch": 0.9384164222873901, "grad_norm": 0.33949011504626453, "learning_rate": 6.186048001796556e-05, "loss": 0.5204, "mean_token_accuracy": 0.8384438544511795, "num_tokens": 606006466.0, "step": 660 }, { "epoch": 0.945525637607749, "grad_norm": 0.24749318396547174, "learning_rate": 6.134824552385915e-05, "loss": 0.5256, "mean_token_accuracy": 0.8357278972864151, "num_tokens": 610597552.0, "step": 665 }, { "epoch": 0.952634852928108, "grad_norm": 0.26267746218214755, "learning_rate": 6.0835170363964434e-05, "loss": 0.528, "mean_token_accuracy": 0.8351906433701515, "num_tokens": 615193994.0, "step": 670 }, { "epoch": 0.959744068248467, "grad_norm": 0.25519090759528035, "learning_rate": 6.032132248220893e-05, "loss": 0.518, "mean_token_accuracy": 0.8378535941243171, "num_tokens": 619786315.0, "step": 675 }, { "epoch": 0.966853283568826, "grad_norm": 0.25149430173186577, "learning_rate": 5.9806769924847784e-05, "loss": 0.5175, "mean_token_accuracy": 0.8372136250138282, "num_tokens": 624383919.0, "step": 680 }, { "epoch": 0.9739624988891851, "grad_norm": 0.2669872598294479, "learning_rate": 5.929158083145271e-05, "loss": 0.5166, "mean_token_accuracy": 0.8380297608673573, "num_tokens": 628976906.0, "step": 685 }, { "epoch": 0.9810717142095441, "grad_norm": 0.3079990980800955, "learning_rate": 5.8775823425888664e-05, "loss": 0.5171, "mean_token_accuracy": 0.8365243822336197, "num_tokens": 633557562.0, "step": 690 }, { "epoch": 0.9881809295299031, "grad_norm": 0.26934237379344833, "learning_rate": 5.825956600727932e-05, "loss": 0.5176, "mean_token_accuracy": 0.8371751248836518, "num_tokens": 638143938.0, "step": 695 }, { "epoch": 0.9952901448502621, "grad_norm": 0.24892879578477203, "learning_rate": 5.774287694096246e-05, "loss": 0.5203, "mean_token_accuracy": 0.8368992209434509, "num_tokens": 642760408.0, "step": 700 }, { "epoch": 0.9952901448502621, "eval_loss": 0.49169814586639404, "eval_mean_token_accuracy": 0.8366760449451313, "eval_num_tokens": 642760408.0, "eval_runtime": 148.141, "eval_samples_per_second": 24.558, "eval_steps_per_second": 0.77, "step": 700 }, { "epoch": 1.0014218430640718, "grad_norm": 0.5358904769553885, "learning_rate": 5.72258246494368e-05, "loss": 0.4893, "mean_token_accuracy": 0.8436046752376832, "num_tokens": 646718128.0, "step": 705 }, { "epoch": 1.008531058384431, "grad_norm": 0.25743890956382126, "learning_rate": 5.6708477603301146e-05, "loss": 0.461, "mean_token_accuracy": 0.8506338618695736, "num_tokens": 651304404.0, "step": 710 }, { "epoch": 1.0156402737047898, "grad_norm": 0.2648866270558085, "learning_rate": 5.6190904312187154e-05, "loss": 0.4544, "mean_token_accuracy": 0.8519260853528976, "num_tokens": 655879909.0, "step": 715 }, { "epoch": 1.022749489025149, "grad_norm": 0.27694330822934976, "learning_rate": 5.567317331568687e-05, "loss": 0.4474, "mean_token_accuracy": 0.8545098066329956, "num_tokens": 660449626.0, "step": 720 }, { "epoch": 1.0298587043455079, "grad_norm": 0.24825528169946715, "learning_rate": 5.515535317427657e-05, "loss": 0.4517, "mean_token_accuracy": 0.8533940657973289, "num_tokens": 665058163.0, "step": 725 }, { "epoch": 1.0369679196658668, "grad_norm": 0.24464581183689546, "learning_rate": 5.463751246023746e-05, "loss": 0.4559, "mean_token_accuracy": 0.8523735709488391, "num_tokens": 669654595.0, "step": 730 }, { "epoch": 1.044077134986226, "grad_norm": 0.24930171479148333, "learning_rate": 5.4119719748575106e-05, "loss": 0.4487, "mean_token_accuracy": 0.8542089037597179, "num_tokens": 674232882.0, "step": 735 }, { "epoch": 1.0511863503065848, "grad_norm": 0.23303088594874635, "learning_rate": 5.360204360793836e-05, "loss": 0.4436, "mean_token_accuracy": 0.8547257304191589, "num_tokens": 678813498.0, "step": 740 }, { "epoch": 1.058295565626944, "grad_norm": 0.317097982341769, "learning_rate": 5.308455259153915e-05, "loss": 0.458, "mean_token_accuracy": 0.8515614397823811, "num_tokens": 683401148.0, "step": 745 }, { "epoch": 1.0654047809473028, "grad_norm": 0.24160258781744343, "learning_rate": 5.256731522807436e-05, "loss": 0.4506, "mean_token_accuracy": 0.8526393964886665, "num_tokens": 687982154.0, "step": 750 }, { "epoch": 1.072513996267662, "grad_norm": 0.23602108922437653, "learning_rate": 5.205040001265094e-05, "loss": 0.4515, "mean_token_accuracy": 0.8521531477570534, "num_tokens": 692583016.0, "step": 755 }, { "epoch": 1.0796232115880209, "grad_norm": 0.2431546567595459, "learning_rate": 5.1533875397715345e-05, "loss": 0.455, "mean_token_accuracy": 0.8529531605541706, "num_tokens": 697183950.0, "step": 760 }, { "epoch": 1.08673242690838, "grad_norm": 0.27597324346348756, "learning_rate": 5.101780978398888e-05, "loss": 0.4518, "mean_token_accuracy": 0.8528432317078114, "num_tokens": 701785548.0, "step": 765 }, { "epoch": 1.093841642228739, "grad_norm": 0.26932926236063864, "learning_rate": 5.050227151140958e-05, "loss": 0.4536, "mean_token_accuracy": 0.852679468691349, "num_tokens": 706364188.0, "step": 770 }, { "epoch": 1.100950857549098, "grad_norm": 0.2587220894683173, "learning_rate": 4.998732885008244e-05, "loss": 0.4503, "mean_token_accuracy": 0.8526183031499386, "num_tokens": 710949271.0, "step": 775 }, { "epoch": 1.108060072869457, "grad_norm": 0.24430696998738718, "learning_rate": 4.947304999123867e-05, "loss": 0.4357, "mean_token_accuracy": 0.8572968378663063, "num_tokens": 715539336.0, "step": 780 }, { "epoch": 1.115169288189816, "grad_norm": 0.24614402366250857, "learning_rate": 4.895950303820552e-05, "loss": 0.4525, "mean_token_accuracy": 0.8526603005826473, "num_tokens": 720147357.0, "step": 785 }, { "epoch": 1.122278503510175, "grad_norm": 0.23262198319374294, "learning_rate": 4.844675599738765e-05, "loss": 0.4523, "mean_token_accuracy": 0.852922348678112, "num_tokens": 724741149.0, "step": 790 }, { "epoch": 1.1293877188305341, "grad_norm": 0.2551816873924689, "learning_rate": 4.793487676926142e-05, "loss": 0.4562, "mean_token_accuracy": 0.8518377915024757, "num_tokens": 729327424.0, "step": 795 }, { "epoch": 1.136496934150893, "grad_norm": 0.23754167080648592, "learning_rate": 4.742393313938327e-05, "loss": 0.445, "mean_token_accuracy": 0.8547273397445678, "num_tokens": 733921218.0, "step": 800 }, { "epoch": 1.136496934150893, "eval_loss": 0.4879998564720154, "eval_mean_token_accuracy": 0.8380277005203983, "eval_num_tokens": 733921218.0, "eval_runtime": 146.7948, "eval_samples_per_second": 24.783, "eval_steps_per_second": 0.777, "step": 800 }, { "epoch": 1.1436061494712522, "grad_norm": 0.25050469601877845, "learning_rate": 4.6913992769413026e-05, "loss": 0.4552, "mean_token_accuracy": 0.8521495588123799, "num_tokens": 738503816.0, "step": 805 }, { "epoch": 1.150715364791611, "grad_norm": 0.24476661787598053, "learning_rate": 4.6405123188153966e-05, "loss": 0.4506, "mean_token_accuracy": 0.8532384999096394, "num_tokens": 743095770.0, "step": 810 }, { "epoch": 1.1578245801119702, "grad_norm": 0.24115136773182058, "learning_rate": 4.589739178261028e-05, "loss": 0.4471, "mean_token_accuracy": 0.8549422182142734, "num_tokens": 747676184.0, "step": 815 }, { "epoch": 1.1649337954323291, "grad_norm": 0.24283949811905522, "learning_rate": 4.5390865789063344e-05, "loss": 0.448, "mean_token_accuracy": 0.8543575026094914, "num_tokens": 752274534.0, "step": 820 }, { "epoch": 1.1720430107526882, "grad_norm": 0.2701107129425895, "learning_rate": 4.4885612284167955e-05, "loss": 0.4411, "mean_token_accuracy": 0.8565104402601719, "num_tokens": 756863683.0, "step": 825 }, { "epoch": 1.1791522260730471, "grad_norm": 0.2886054721404824, "learning_rate": 4.4381698176069754e-05, "loss": 0.4379, "mean_token_accuracy": 0.8567862503230572, "num_tokens": 761453110.0, "step": 830 }, { "epoch": 1.1862614413934063, "grad_norm": 0.2561982737144238, "learning_rate": 4.387919019554487e-05, "loss": 0.4532, "mean_token_accuracy": 0.8531202852725983, "num_tokens": 766041248.0, "step": 835 }, { "epoch": 1.1933706567137652, "grad_norm": 0.26412588441218454, "learning_rate": 4.3378154887163144e-05, "loss": 0.4453, "mean_token_accuracy": 0.853339533507824, "num_tokens": 770624920.0, "step": 840 }, { "epoch": 1.2004798720341243, "grad_norm": 0.25032821222177587, "learning_rate": 4.287865860047596e-05, "loss": 0.4558, "mean_token_accuracy": 0.8522251404821872, "num_tokens": 775225729.0, "step": 845 }, { "epoch": 1.2075890873544832, "grad_norm": 0.23998083533004458, "learning_rate": 4.2380767481229886e-05, "loss": 0.4418, "mean_token_accuracy": 0.8569207176566124, "num_tokens": 779811918.0, "step": 850 }, { "epoch": 1.2146983026748424, "grad_norm": 0.2456015755421057, "learning_rate": 4.1884547462607326e-05, "loss": 0.4454, "mean_token_accuracy": 0.8553664483129978, "num_tokens": 784391305.0, "step": 855 }, { "epoch": 1.2218075179952013, "grad_norm": 0.25612737416807746, "learning_rate": 4.139006425649541e-05, "loss": 0.4504, "mean_token_accuracy": 0.8527485050261021, "num_tokens": 788981682.0, "step": 860 }, { "epoch": 1.2289167333155602, "grad_norm": 0.24215144672428524, "learning_rate": 4.089738334478399e-05, "loss": 0.4466, "mean_token_accuracy": 0.8540120802819728, "num_tokens": 793548878.0, "step": 865 }, { "epoch": 1.2360259486359193, "grad_norm": 0.251956160570565, "learning_rate": 4.0406569970694285e-05, "loss": 0.4514, "mean_token_accuracy": 0.8536942526698112, "num_tokens": 798145090.0, "step": 870 }, { "epoch": 1.2431351639562784, "grad_norm": 0.24137828427946414, "learning_rate": 3.991768913013904e-05, "loss": 0.4408, "mean_token_accuracy": 0.8566184468567372, "num_tokens": 802721141.0, "step": 875 }, { "epoch": 1.2502443792766373, "grad_norm": 0.3769699788745637, "learning_rate": 3.943080556311536e-05, "loss": 0.438, "mean_token_accuracy": 0.8581221453845501, "num_tokens": 807303824.0, "step": 880 }, { "epoch": 1.2573535945969962, "grad_norm": 0.251278759950789, "learning_rate": 3.894598374513174e-05, "loss": 0.4485, "mean_token_accuracy": 0.8541063219308853, "num_tokens": 811911762.0, "step": 885 }, { "epoch": 1.2644628099173554, "grad_norm": 0.24068163801342848, "learning_rate": 3.846328787866964e-05, "loss": 0.4339, "mean_token_accuracy": 0.859130322188139, "num_tokens": 816508640.0, "step": 890 }, { "epoch": 1.2715720252377145, "grad_norm": 0.23232711368022352, "learning_rate": 3.798278188468164e-05, "loss": 0.4445, "mean_token_accuracy": 0.8543654963374138, "num_tokens": 821100737.0, "step": 895 }, { "epoch": 1.2786812405580734, "grad_norm": 0.2368572559014999, "learning_rate": 3.750452939412667e-05, "loss": 0.4434, "mean_token_accuracy": 0.8547687388956546, "num_tokens": 825694727.0, "step": 900 }, { "epoch": 1.2786812405580734, "eval_loss": 0.4800785183906555, "eval_mean_token_accuracy": 0.8407511988229919, "eval_num_tokens": 825694727.0, "eval_runtime": 146.4602, "eval_samples_per_second": 24.84, "eval_steps_per_second": 0.778, "step": 900 }, { "epoch": 1.2857904558784323, "grad_norm": 0.26166517034573067, "learning_rate": 3.7028593739543715e-05, "loss": 0.4475, "mean_token_accuracy": 0.854764747619629, "num_tokens": 830291180.0, "step": 905 }, { "epoch": 1.2928996711987915, "grad_norm": 0.24015937616460478, "learning_rate": 3.6555037946664926e-05, "loss": 0.4455, "mean_token_accuracy": 0.8552566647529602, "num_tokens": 834892125.0, "step": 910 }, { "epoch": 1.3000088865191506, "grad_norm": 0.252313420976958, "learning_rate": 3.608392472606956e-05, "loss": 0.4441, "mean_token_accuracy": 0.8559129044413567, "num_tokens": 839486375.0, "step": 915 }, { "epoch": 1.3071181018395095, "grad_norm": 0.256487918121681, "learning_rate": 3.5615316464879445e-05, "loss": 0.4401, "mean_token_accuracy": 0.8565216913819313, "num_tokens": 844107444.0, "step": 920 }, { "epoch": 1.3142273171598684, "grad_norm": 0.23448215102314007, "learning_rate": 3.5149275218497445e-05, "loss": 0.4383, "mean_token_accuracy": 0.8571599997580052, "num_tokens": 848704492.0, "step": 925 }, { "epoch": 1.3213365324802275, "grad_norm": 0.24419792529251788, "learning_rate": 3.4685862702389714e-05, "loss": 0.4429, "mean_token_accuracy": 0.855844734609127, "num_tokens": 853292585.0, "step": 930 }, { "epoch": 1.3284457478005864, "grad_norm": 0.23566825561303636, "learning_rate": 3.422514028391304e-05, "loss": 0.4354, "mean_token_accuracy": 0.8570930063724518, "num_tokens": 857867604.0, "step": 935 }, { "epoch": 1.3355549631209456, "grad_norm": 0.2454162982602229, "learning_rate": 3.376716897418831e-05, "loss": 0.4447, "mean_token_accuracy": 0.8552064374089241, "num_tokens": 862460961.0, "step": 940 }, { "epoch": 1.3426641784413045, "grad_norm": 0.2524163496767361, "learning_rate": 3.331200942002113e-05, "loss": 0.4525, "mean_token_accuracy": 0.8537895001471043, "num_tokens": 867058298.0, "step": 945 }, { "epoch": 1.3497733937616636, "grad_norm": 0.23190520165291026, "learning_rate": 3.2859721895870635e-05, "loss": 0.44, "mean_token_accuracy": 0.8565752863883972, "num_tokens": 871661806.0, "step": 950 }, { "epoch": 1.3568826090820225, "grad_norm": 0.24782970977401894, "learning_rate": 3.2410366295867664e-05, "loss": 0.4352, "mean_token_accuracy": 0.8579383887350559, "num_tokens": 876250262.0, "step": 955 }, { "epoch": 1.3639918244023816, "grad_norm": 0.22786025696468146, "learning_rate": 3.19640021258833e-05, "loss": 0.444, "mean_token_accuracy": 0.8550498209893703, "num_tokens": 880839029.0, "step": 960 }, { "epoch": 1.3711010397227406, "grad_norm": 0.2265711418699179, "learning_rate": 3.152068849564879e-05, "loss": 0.4435, "mean_token_accuracy": 0.8563594095408916, "num_tokens": 885417939.0, "step": 965 }, { "epoch": 1.3782102550430997, "grad_norm": 0.23977507514952898, "learning_rate": 3.1080484110927954e-05, "loss": 0.4325, "mean_token_accuracy": 0.8590381443500519, "num_tokens": 890005207.0, "step": 970 }, { "epoch": 1.3853194703634586, "grad_norm": 0.24689756755824815, "learning_rate": 3.0643447265743096e-05, "loss": 0.44, "mean_token_accuracy": 0.85642144754529, "num_tokens": 894591297.0, "step": 975 }, { "epoch": 1.3924286856838177, "grad_norm": 0.24051873631020942, "learning_rate": 3.0209635834655392e-05, "loss": 0.435, "mean_token_accuracy": 0.8576522074639797, "num_tokens": 899178832.0, "step": 980 }, { "epoch": 1.3995379010041766, "grad_norm": 0.2413492029135495, "learning_rate": 2.9779107265100892e-05, "loss": 0.4369, "mean_token_accuracy": 0.857710150629282, "num_tokens": 903773147.0, "step": 985 }, { "epoch": 1.4066471163245358, "grad_norm": 0.23506138046697497, "learning_rate": 2.9351918569783006e-05, "loss": 0.4364, "mean_token_accuracy": 0.8576699584722519, "num_tokens": 908371284.0, "step": 990 }, { "epoch": 1.4137563316448947, "grad_norm": 0.25438867805085685, "learning_rate": 2.892812631912265e-05, "loss": 0.4349, "mean_token_accuracy": 0.8586409255862236, "num_tokens": 912978481.0, "step": 995 }, { "epoch": 1.4208655469652536, "grad_norm": 0.24429497699288996, "learning_rate": 2.8507786633766877e-05, "loss": 0.4354, "mean_token_accuracy": 0.8573046490550041, "num_tokens": 917574029.0, "step": 1000 }, { "epoch": 1.4208655469652536, "eval_loss": 0.47304314374923706, "eval_mean_token_accuracy": 0.842672534156264, "eval_num_tokens": 917574029.0, "eval_runtime": 145.3562, "eval_samples_per_second": 25.028, "eval_steps_per_second": 0.784, "step": 1000 }, { "epoch": 1.4279747622856127, "grad_norm": 0.24463063083449332, "learning_rate": 2.809095517715713e-05, "loss": 0.4303, "mean_token_accuracy": 0.858917984366417, "num_tokens": 922160147.0, "step": 1005 }, { "epoch": 1.4350839776059718, "grad_norm": 0.24348846567727375, "learning_rate": 2.7677687148157998e-05, "loss": 0.4367, "mean_token_accuracy": 0.8577364660799504, "num_tokens": 926746028.0, "step": 1010 }, { "epoch": 1.4421931929263307, "grad_norm": 0.24745049020205356, "learning_rate": 2.7268037273747525e-05, "loss": 0.4368, "mean_token_accuracy": 0.857840034365654, "num_tokens": 931337261.0, "step": 1015 }, { "epoch": 1.4493024082466897, "grad_norm": 0.2439587698234042, "learning_rate": 2.686205980176998e-05, "loss": 0.4447, "mean_token_accuracy": 0.8548872321844101, "num_tokens": 935941769.0, "step": 1020 }, { "epoch": 1.4564116235670488, "grad_norm": 0.25142114078442956, "learning_rate": 2.6459808493752102e-05, "loss": 0.4284, "mean_token_accuracy": 0.8603815868496895, "num_tokens": 940535643.0, "step": 1025 }, { "epoch": 1.463520838887408, "grad_norm": 0.2444154895688051, "learning_rate": 2.606133661778377e-05, "loss": 0.4368, "mean_token_accuracy": 0.8575351513922215, "num_tokens": 945124519.0, "step": 1030 }, { "epoch": 1.4706300542077668, "grad_norm": 0.2397327728518288, "learning_rate": 2.5666696941463885e-05, "loss": 0.4307, "mean_token_accuracy": 0.8594269149005413, "num_tokens": 949709974.0, "step": 1035 }, { "epoch": 1.4777392695281257, "grad_norm": 0.3077470484547689, "learning_rate": 2.5275941724912743e-05, "loss": 0.4288, "mean_token_accuracy": 0.8588724002242089, "num_tokens": 954294899.0, "step": 1040 }, { "epoch": 1.4848484848484849, "grad_norm": 0.24584716924955974, "learning_rate": 2.4889122713851394e-05, "loss": 0.4304, "mean_token_accuracy": 0.8590269833803177, "num_tokens": 958889833.0, "step": 1045 }, { "epoch": 1.491957700168844, "grad_norm": 0.24260820183680837, "learning_rate": 2.4506291132749272e-05, "loss": 0.4322, "mean_token_accuracy": 0.8588926158845425, "num_tokens": 963479630.0, "step": 1050 }, { "epoch": 1.499066915489203, "grad_norm": 0.2512439219193439, "learning_rate": 2.4127497678040846e-05, "loss": 0.4338, "mean_token_accuracy": 0.8590321697294712, "num_tokens": 968086693.0, "step": 1055 }, { "epoch": 1.5061761308095618, "grad_norm": 0.25788120133019554, "learning_rate": 2.375279251141201e-05, "loss": 0.4302, "mean_token_accuracy": 0.8599278099834919, "num_tokens": 972668807.0, "step": 1060 }, { "epoch": 1.513285346129921, "grad_norm": 0.24857387974370135, "learning_rate": 2.338222525315758e-05, "loss": 0.4371, "mean_token_accuracy": 0.8579599760472775, "num_tokens": 977267842.0, "step": 1065 }, { "epoch": 1.52039456145028, "grad_norm": 0.24022880991860499, "learning_rate": 2.301584497561024e-05, "loss": 0.4234, "mean_token_accuracy": 0.862085721641779, "num_tokens": 981857003.0, "step": 1070 }, { "epoch": 1.527503776770639, "grad_norm": 0.27120541109477303, "learning_rate": 2.2653700196642134e-05, "loss": 0.4396, "mean_token_accuracy": 0.857264555990696, "num_tokens": 986456929.0, "step": 1075 }, { "epoch": 1.5346129920909979, "grad_norm": 0.24114703590240177, "learning_rate": 2.2295838873239965e-05, "loss": 0.4296, "mean_token_accuracy": 0.8604548752307892, "num_tokens": 991061372.0, "step": 1080 }, { "epoch": 1.541722207411357, "grad_norm": 0.23963844839444817, "learning_rate": 2.194230839515425e-05, "loss": 0.4336, "mean_token_accuracy": 0.8584208697080612, "num_tokens": 995660319.0, "step": 1085 }, { "epoch": 1.5488314227317161, "grad_norm": 0.24314988814533856, "learning_rate": 2.1593155578623702e-05, "loss": 0.4306, "mean_token_accuracy": 0.8601135425269604, "num_tokens": 1000236933.0, "step": 1090 }, { "epoch": 1.555940638052075, "grad_norm": 0.2566886574453899, "learning_rate": 2.1248426660175713e-05, "loss": 0.4384, "mean_token_accuracy": 0.8573588460683823, "num_tokens": 1004820862.0, "step": 1095 }, { "epoch": 1.563049853372434, "grad_norm": 0.2621075128506793, "learning_rate": 2.0908167290503326e-05, "loss": 0.4298, "mean_token_accuracy": 0.8607131637632847, "num_tokens": 1009411521.0, "step": 1100 }, { "epoch": 1.563049853372434, "eval_loss": 0.4672245681285858, "eval_mean_token_accuracy": 0.844007690747579, "eval_num_tokens": 1009411521.0, "eval_runtime": 146.3617, "eval_samples_per_second": 24.856, "eval_steps_per_second": 0.779, "step": 1100 }, { "epoch": 1.570159068692793, "grad_norm": 0.23570827346042514, "learning_rate": 2.0572422528420095e-05, "loss": 0.4206, "mean_token_accuracy": 0.8622309692203999, "num_tokens": 1013995376.0, "step": 1105 }, { "epoch": 1.577268284013152, "grad_norm": 0.28786088105829327, "learning_rate": 2.024123683489303e-05, "loss": 0.4195, "mean_token_accuracy": 0.8634026922285557, "num_tokens": 1018562407.0, "step": 1110 }, { "epoch": 1.584377499333511, "grad_norm": 0.22477409346403396, "learning_rate": 1.9914654067154996e-05, "loss": 0.4345, "mean_token_accuracy": 0.8584335811436177, "num_tokens": 1023168118.0, "step": 1115 }, { "epoch": 1.59148671465387, "grad_norm": 0.24599345473106599, "learning_rate": 1.959271747289686e-05, "loss": 0.4278, "mean_token_accuracy": 0.8616135574877262, "num_tokens": 1027754848.0, "step": 1120 }, { "epoch": 1.5985959299742292, "grad_norm": 0.24491593894054278, "learning_rate": 1.9275469684540404e-05, "loss": 0.4294, "mean_token_accuracy": 0.8590353332459927, "num_tokens": 1032347251.0, "step": 1125 }, { "epoch": 1.605705145294588, "grad_norm": 0.2540751338276317, "learning_rate": 1.8962952713592752e-05, "loss": 0.4242, "mean_token_accuracy": 0.8608104437589645, "num_tokens": 1036931829.0, "step": 1130 }, { "epoch": 1.612814360614947, "grad_norm": 0.2510287685288083, "learning_rate": 1.8655207945083e-05, "loss": 0.4239, "mean_token_accuracy": 0.8617179103195667, "num_tokens": 1041532224.0, "step": 1135 }, { "epoch": 1.6199235759353061, "grad_norm": 0.2693350827409704, "learning_rate": 1.8352276132081847e-05, "loss": 0.4357, "mean_token_accuracy": 0.8589904353022575, "num_tokens": 1046120676.0, "step": 1140 }, { "epoch": 1.6270327912556652, "grad_norm": 0.24443054034299724, "learning_rate": 1.8054197390304755e-05, "loss": 0.4275, "mean_token_accuracy": 0.8615889854729175, "num_tokens": 1050708153.0, "step": 1145 }, { "epoch": 1.6341420065760242, "grad_norm": 0.24588007040764026, "learning_rate": 1.7761011192799764e-05, "loss": 0.4238, "mean_token_accuracy": 0.8622479006648064, "num_tokens": 1055294826.0, "step": 1150 }, { "epoch": 1.641251221896383, "grad_norm": 0.24561473837992528, "learning_rate": 1.7472756364720206e-05, "loss": 0.4243, "mean_token_accuracy": 0.8616314500570297, "num_tokens": 1059896792.0, "step": 1155 }, { "epoch": 1.6483604372167422, "grad_norm": 0.23202476301237993, "learning_rate": 1.7189471078183302e-05, "loss": 0.4313, "mean_token_accuracy": 0.860023857653141, "num_tokens": 1064504870.0, "step": 1160 }, { "epoch": 1.6554696525371013, "grad_norm": 0.2403111932989795, "learning_rate": 1.6911192847215225e-05, "loss": 0.4315, "mean_token_accuracy": 0.85991101115942, "num_tokens": 1069092813.0, "step": 1165 }, { "epoch": 1.6625788678574602, "grad_norm": 0.23285052418281263, "learning_rate": 1.6637958522783298e-05, "loss": 0.4286, "mean_token_accuracy": 0.8603983536362648, "num_tokens": 1073673087.0, "step": 1170 }, { "epoch": 1.6696880831778191, "grad_norm": 0.23644436345090544, "learning_rate": 1.6369804287916028e-05, "loss": 0.4237, "mean_token_accuracy": 0.8625174552202225, "num_tokens": 1078263989.0, "step": 1175 }, { "epoch": 1.6767972984981783, "grad_norm": 0.2283809036559784, "learning_rate": 1.6106765652911563e-05, "loss": 0.4196, "mean_token_accuracy": 0.8629219397902489, "num_tokens": 1082858600.0, "step": 1180 }, { "epoch": 1.6839065138185374, "grad_norm": 0.2437421457507895, "learning_rate": 1.5848877450635237e-05, "loss": 0.431, "mean_token_accuracy": 0.8596989519894123, "num_tokens": 1087463215.0, "step": 1185 }, { "epoch": 1.6910157291388963, "grad_norm": 0.24997191755310427, "learning_rate": 1.559617383190684e-05, "loss": 0.4258, "mean_token_accuracy": 0.8600839108228684, "num_tokens": 1092046691.0, "step": 1190 }, { "epoch": 1.6981249444592552, "grad_norm": 0.24275510902589129, "learning_rate": 1.5348688260978188e-05, "loss": 0.4198, "mean_token_accuracy": 0.8634254619479179, "num_tokens": 1096635412.0, "step": 1195 }, { "epoch": 1.7052341597796143, "grad_norm": 0.25771028141912433, "learning_rate": 1.5106453511101657e-05, "loss": 0.4198, "mean_token_accuracy": 0.8630197443068027, "num_tokens": 1101239957.0, "step": 1200 }, { "epoch": 1.7052341597796143, "eval_loss": 0.4617161452770233, "eval_mean_token_accuracy": 0.8460459296117749, "eval_num_tokens": 1101239957.0, "eval_runtime": 143.0225, "eval_samples_per_second": 25.437, "eval_steps_per_second": 0.797, "step": 1200 }, { "epoch": 1.7123433750999735, "grad_norm": 0.2465846462175401, "learning_rate": 1.4869501660190118e-05, "loss": 0.4269, "mean_token_accuracy": 0.8613091327250004, "num_tokens": 1105835727.0, "step": 1205 }, { "epoch": 1.7194525904203324, "grad_norm": 0.24343231445496366, "learning_rate": 1.4637864086569114e-05, "loss": 0.4189, "mean_token_accuracy": 0.8625466778874398, "num_tokens": 1110431832.0, "step": 1210 }, { "epoch": 1.7265618057406913, "grad_norm": 0.24500024608031826, "learning_rate": 1.4411571464821522e-05, "loss": 0.4178, "mean_token_accuracy": 0.8632443450391293, "num_tokens": 1115003545.0, "step": 1215 }, { "epoch": 1.7336710210610504, "grad_norm": 0.24384954499049283, "learning_rate": 1.4190653761725458e-05, "loss": 0.4331, "mean_token_accuracy": 0.8595723591744899, "num_tokens": 1119594038.0, "step": 1220 }, { "epoch": 1.7407802363814096, "grad_norm": 0.24988962843301607, "learning_rate": 1.3975140232286033e-05, "loss": 0.4292, "mean_token_accuracy": 0.8610283821821213, "num_tokens": 1124191272.0, "step": 1225 }, { "epoch": 1.7478894517017685, "grad_norm": 0.23666630913921613, "learning_rate": 1.3765059415861142e-05, "loss": 0.4256, "mean_token_accuracy": 0.8612963631749153, "num_tokens": 1128787024.0, "step": 1230 }, { "epoch": 1.7549986670221274, "grad_norm": 0.24377997978707636, "learning_rate": 1.3560439132382218e-05, "loss": 0.4249, "mean_token_accuracy": 0.8616208277642727, "num_tokens": 1133369468.0, "step": 1235 }, { "epoch": 1.7621078823424865, "grad_norm": 0.24473326280197544, "learning_rate": 1.336130647867015e-05, "loss": 0.4233, "mean_token_accuracy": 0.8611096739768982, "num_tokens": 1137960753.0, "step": 1240 }, { "epoch": 1.7692170976628456, "grad_norm": 0.2814923829698822, "learning_rate": 1.3167687824846988e-05, "loss": 0.4345, "mean_token_accuracy": 0.8590093135833741, "num_tokens": 1142557989.0, "step": 1245 }, { "epoch": 1.7763263129832043, "grad_norm": 0.24671237642090413, "learning_rate": 1.297960881084391e-05, "loss": 0.4136, "mean_token_accuracy": 0.8641826197504997, "num_tokens": 1147139033.0, "step": 1250 }, { "epoch": 1.7834355283035634, "grad_norm": 0.23802525665842986, "learning_rate": 1.2797094343005807e-05, "loss": 0.4212, "mean_token_accuracy": 0.8627298250794411, "num_tokens": 1151728912.0, "step": 1255 }, { "epoch": 1.7905447436239226, "grad_norm": 0.24514167574215462, "learning_rate": 1.2620168590793105e-05, "loss": 0.4243, "mean_token_accuracy": 0.8623115479946136, "num_tokens": 1156315343.0, "step": 1260 }, { "epoch": 1.7976539589442815, "grad_norm": 0.24177052216503225, "learning_rate": 1.2448854983581134e-05, "loss": 0.4205, "mean_token_accuracy": 0.8636125177145004, "num_tokens": 1160905222.0, "step": 1265 }, { "epoch": 1.8047631742646404, "grad_norm": 0.25623340057701793, "learning_rate": 1.2283176207557455e-05, "loss": 0.4204, "mean_token_accuracy": 0.863289151340723, "num_tokens": 1165469584.0, "step": 1270 }, { "epoch": 1.8118723895849995, "grad_norm": 0.2366529819101992, "learning_rate": 1.2123154202717656e-05, "loss": 0.4205, "mean_token_accuracy": 0.8623673833906651, "num_tokens": 1170087058.0, "step": 1275 }, { "epoch": 1.8189816049053587, "grad_norm": 0.23815408906221286, "learning_rate": 1.1968810159959982e-05, "loss": 0.4167, "mean_token_accuracy": 0.8636409521102906, "num_tokens": 1174675450.0, "step": 1280 }, { "epoch": 1.8260908202257176, "grad_norm": 0.25161717096488057, "learning_rate": 1.1820164518279083e-05, "loss": 0.4308, "mean_token_accuracy": 0.8603747352957726, "num_tokens": 1179252086.0, "step": 1285 }, { "epoch": 1.8332000355460765, "grad_norm": 0.23828924023109987, "learning_rate": 1.1677236962059421e-05, "loss": 0.4161, "mean_token_accuracy": 0.8636845953762531, "num_tokens": 1183846581.0, "step": 1290 }, { "epoch": 1.8403092508664356, "grad_norm": 0.2389439298878492, "learning_rate": 1.1540046418468561e-05, "loss": 0.4093, "mean_token_accuracy": 0.8666847251355648, "num_tokens": 1188439447.0, "step": 1295 }, { "epoch": 1.8474184661867947, "grad_norm": 0.26036762406039, "learning_rate": 1.1408611054950722e-05, "loss": 0.4187, "mean_token_accuracy": 0.8630855195224285, "num_tokens": 1193031482.0, "step": 1300 }, { "epoch": 1.8474184661867947, "eval_loss": 0.45738622546195984, "eval_mean_token_accuracy": 0.847679163803134, "eval_num_tokens": 1193031482.0, "eval_runtime": 143.6355, "eval_samples_per_second": 25.328, "eval_steps_per_second": 0.794, "step": 1300 }, { "epoch": 1.8545276815071536, "grad_norm": 0.2419491832206913, "learning_rate": 1.1282948276820963e-05, "loss": 0.4223, "mean_token_accuracy": 0.8626484178006649, "num_tokens": 1197621510.0, "step": 1305 }, { "epoch": 1.8616368968275125, "grad_norm": 0.2366717377397619, "learning_rate": 1.1163074724960326e-05, "loss": 0.4202, "mean_token_accuracy": 0.8629304811358451, "num_tokens": 1202214988.0, "step": 1310 }, { "epoch": 1.8687461121478717, "grad_norm": 0.24750576690261594, "learning_rate": 1.10490062736121e-05, "loss": 0.4159, "mean_token_accuracy": 0.8640658937394619, "num_tokens": 1206801749.0, "step": 1315 }, { "epoch": 1.8758553274682308, "grad_norm": 0.2754980560042937, "learning_rate": 1.094075802827971e-05, "loss": 0.4224, "mean_token_accuracy": 0.8619605071842671, "num_tokens": 1211394066.0, "step": 1320 }, { "epoch": 1.8829645427885897, "grad_norm": 0.2441756409539309, "learning_rate": 1.0838344323726395e-05, "loss": 0.4159, "mean_token_accuracy": 0.8641899891197682, "num_tokens": 1215982389.0, "step": 1325 }, { "epoch": 1.8900737581089486, "grad_norm": 0.25017331261640485, "learning_rate": 1.0741778722076896e-05, "loss": 0.4141, "mean_token_accuracy": 0.864534319192171, "num_tokens": 1220561480.0, "step": 1330 }, { "epoch": 1.8971829734293078, "grad_norm": 0.24928323459761015, "learning_rate": 1.0651074011021495e-05, "loss": 0.4148, "mean_token_accuracy": 0.8647311642765999, "num_tokens": 1225151015.0, "step": 1335 }, { "epoch": 1.9042921887496669, "grad_norm": 0.26117744577378244, "learning_rate": 1.056624220212263e-05, "loss": 0.4227, "mean_token_accuracy": 0.8627439729869366, "num_tokens": 1229753553.0, "step": 1340 }, { "epoch": 1.9114014040700258, "grad_norm": 0.250926981430339, "learning_rate": 1.048729452922423e-05, "loss": 0.4118, "mean_token_accuracy": 0.8654024370014668, "num_tokens": 1234324722.0, "step": 1345 }, { "epoch": 1.9185106193903847, "grad_norm": 0.26445464932369295, "learning_rate": 1.0414241446964102e-05, "loss": 0.4176, "mean_token_accuracy": 0.8638374984264374, "num_tokens": 1238945254.0, "step": 1350 }, { "epoch": 1.9256198347107438, "grad_norm": 0.24942959940503223, "learning_rate": 1.0347092629389484e-05, "loss": 0.4098, "mean_token_accuracy": 0.8681537143886089, "num_tokens": 1243530120.0, "step": 1355 }, { "epoch": 1.932729050031103, "grad_norm": 0.25517475920539473, "learning_rate": 1.0285856968675917e-05, "loss": 0.4104, "mean_token_accuracy": 0.8657238759100437, "num_tokens": 1248126495.0, "step": 1360 }, { "epoch": 1.9398382653514619, "grad_norm": 0.24624704699692396, "learning_rate": 1.0230542573949747e-05, "loss": 0.4053, "mean_token_accuracy": 0.8677756235003471, "num_tokens": 1252728208.0, "step": 1365 }, { "epoch": 1.9469474806718208, "grad_norm": 0.24811417447193737, "learning_rate": 1.0181156770214243e-05, "loss": 0.4193, "mean_token_accuracy": 0.8637429274618625, "num_tokens": 1257314007.0, "step": 1370 }, { "epoch": 1.95405669599218, "grad_norm": 0.2553291480205661, "learning_rate": 1.013770609737961e-05, "loss": 0.4153, "mean_token_accuracy": 0.8649327427148819, "num_tokens": 1261908378.0, "step": 1375 }, { "epoch": 1.961165911312539, "grad_norm": 0.24846642652489853, "learning_rate": 1.010019630939691e-05, "loss": 0.4204, "mean_token_accuracy": 0.8626691080629826, "num_tokens": 1266492690.0, "step": 1380 }, { "epoch": 1.968275126632898, "grad_norm": 0.24853442428779762, "learning_rate": 1.0068632373496125e-05, "loss": 0.4213, "mean_token_accuracy": 0.862095658481121, "num_tokens": 1271089050.0, "step": 1385 }, { "epoch": 1.9753843419532569, "grad_norm": 0.25447008393745496, "learning_rate": 1.0043018469528365e-05, "loss": 0.4186, "mean_token_accuracy": 0.8638553529977798, "num_tokens": 1275693685.0, "step": 1390 }, { "epoch": 1.982493557273616, "grad_norm": 0.25146974784680387, "learning_rate": 1.0023357989412332e-05, "loss": 0.4132, "mean_token_accuracy": 0.8654829584062099, "num_tokens": 1280282291.0, "step": 1395 }, { "epoch": 1.9896027725939749, "grad_norm": 0.25186861166219776, "learning_rate": 1.000965353668517e-05, "loss": 0.4097, "mean_token_accuracy": 0.8660168826580048, "num_tokens": 1284878893.0, "step": 1400 }, { "epoch": 1.9896027725939749, "eval_loss": 0.45450538396835327, "eval_mean_token_accuracy": 0.8486974662856052, "eval_num_tokens": 1284878893.0, "eval_runtime": 143.4865, "eval_samples_per_second": 25.354, "eval_steps_per_second": 0.794, "step": 1400 }, { "epoch": 1.9967119879143338, "grad_norm": 0.2548741967506241, "learning_rate": 1.0001906926157681e-05, "loss": 0.4088, "mean_token_accuracy": 0.8670746453106404, "num_tokens": 1289465244.0, "step": 1405 }, { "epoch": 2.0, "mean_token_accuracy": 0.8681698522052249, "num_tokens": 1291584473.0, "step": 1408, "total_flos": 9795365997903872.0, "train_loss": 0.5166227378120477, "train_runtime": 48333.5779, "train_samples_per_second": 14.899, "train_steps_per_second": 0.029 } ], "logging_steps": 5, "max_steps": 1408, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9795365997903872.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }