{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.928, "eval_steps": 500, "global_step": 310, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 13.215581893920898, "learning_rate": 6.25e-07, "loss": 1.6218, "mean_token_accuracy": 0.6110803186893463, "step": 1 }, { "epoch": 0.032, "grad_norm": 13.990567207336426, "learning_rate": 1.25e-06, "loss": 1.7041, "mean_token_accuracy": 0.6301040947437286, "step": 2 }, { "epoch": 0.048, "grad_norm": 13.211037635803223, "learning_rate": 1.8750000000000003e-06, "loss": 1.7223, "mean_token_accuracy": 0.5843513906002045, "step": 3 }, { "epoch": 0.064, "grad_norm": 14.67287826538086, "learning_rate": 2.5e-06, "loss": 1.7409, "mean_token_accuracy": 0.5931073725223541, "step": 4 }, { "epoch": 0.08, "grad_norm": 11.63578987121582, "learning_rate": 3.125e-06, "loss": 1.6771, "mean_token_accuracy": 0.59913170337677, "step": 5 }, { "epoch": 0.096, "grad_norm": 10.64076042175293, "learning_rate": 3.7500000000000005e-06, "loss": 1.594, "mean_token_accuracy": 0.604899674654007, "step": 6 }, { "epoch": 0.112, "grad_norm": 6.09752893447876, "learning_rate": 4.3750000000000005e-06, "loss": 1.458, "mean_token_accuracy": 0.6177553832530975, "step": 7 }, { "epoch": 0.128, "grad_norm": 6.3267412185668945, "learning_rate": 5e-06, "loss": 1.4365, "mean_token_accuracy": 0.625266432762146, "step": 8 }, { "epoch": 0.144, "grad_norm": 7.116683006286621, "learning_rate": 5.625e-06, "loss": 1.4384, "mean_token_accuracy": 0.6151327192783356, "step": 9 }, { "epoch": 0.16, "grad_norm": 5.78098726272583, "learning_rate": 6.25e-06, "loss": 1.5083, "mean_token_accuracy": 0.5956140756607056, "step": 10 }, { "epoch": 0.176, "grad_norm": 6.24675989151001, "learning_rate": 6.875e-06, "loss": 1.4437, "mean_token_accuracy": 0.6152209639549255, "step": 11 }, { "epoch": 0.192, "grad_norm": 5.379809856414795, "learning_rate": 7.500000000000001e-06, "loss": 1.3736, "mean_token_accuracy": 0.6294114291667938, "step": 12 }, { "epoch": 0.208, "grad_norm": 6.314828872680664, "learning_rate": 8.125000000000001e-06, "loss": 1.2994, "mean_token_accuracy": 0.6413589417934418, "step": 13 }, { "epoch": 0.224, "grad_norm": 5.699032783508301, "learning_rate": 8.750000000000001e-06, "loss": 1.4117, "mean_token_accuracy": 0.6173716485500336, "step": 14 }, { "epoch": 0.24, "grad_norm": 5.353813171386719, "learning_rate": 9.375000000000001e-06, "loss": 1.3486, "mean_token_accuracy": 0.6315614581108093, "step": 15 }, { "epoch": 0.256, "grad_norm": 4.132171630859375, "learning_rate": 1e-05, "loss": 1.2788, "mean_token_accuracy": 0.6412823796272278, "step": 16 }, { "epoch": 0.272, "grad_norm": 3.82682204246521, "learning_rate": 9.999714542826806e-06, "loss": 1.299, "mean_token_accuracy": 0.6437126398086548, "step": 17 }, { "epoch": 0.288, "grad_norm": 3.6106579303741455, "learning_rate": 9.99885820390154e-06, "loss": 1.2665, "mean_token_accuracy": 0.6511238217353821, "step": 18 }, { "epoch": 0.304, "grad_norm": 3.483157157897949, "learning_rate": 9.99743108100344e-06, "loss": 1.182, "mean_token_accuracy": 0.6675288081169128, "step": 19 }, { "epoch": 0.32, "grad_norm": 3.896629571914673, "learning_rate": 9.995433337085492e-06, "loss": 1.2832, "mean_token_accuracy": 0.6452755331993103, "step": 20 }, { "epoch": 0.336, "grad_norm": 3.7331128120422363, "learning_rate": 9.992865200255829e-06, "loss": 1.2423, "mean_token_accuracy": 0.6492677927017212, "step": 21 }, { "epoch": 0.352, "grad_norm": 2.90647292137146, "learning_rate": 9.989726963751683e-06, "loss": 1.1555, "mean_token_accuracy": 0.6576137840747833, "step": 22 }, { "epoch": 0.368, "grad_norm": 3.1129422187805176, "learning_rate": 9.986018985905901e-06, "loss": 1.1522, "mean_token_accuracy": 0.6654780805110931, "step": 23 }, { "epoch": 0.384, "grad_norm": 2.943026304244995, "learning_rate": 9.981741690106035e-06, "loss": 1.2178, "mean_token_accuracy": 0.6560894250869751, "step": 24 }, { "epoch": 0.4, "grad_norm": 2.891770362854004, "learning_rate": 9.976895564745993e-06, "loss": 1.1376, "mean_token_accuracy": 0.6699535846710205, "step": 25 }, { "epoch": 0.416, "grad_norm": 3.3815524578094482, "learning_rate": 9.97148116317027e-06, "loss": 1.0781, "mean_token_accuracy": 0.6821956038475037, "step": 26 }, { "epoch": 0.432, "grad_norm": 2.6959567070007324, "learning_rate": 9.965499103610775e-06, "loss": 1.186, "mean_token_accuracy": 0.6514892578125, "step": 27 }, { "epoch": 0.448, "grad_norm": 2.619265079498291, "learning_rate": 9.95895006911623e-06, "loss": 1.1389, "mean_token_accuracy": 0.6814388632774353, "step": 28 }, { "epoch": 0.464, "grad_norm": 2.3042593002319336, "learning_rate": 9.951834807474191e-06, "loss": 1.2518, "mean_token_accuracy": 0.6459318101406097, "step": 29 }, { "epoch": 0.48, "grad_norm": 2.0451290607452393, "learning_rate": 9.944154131125643e-06, "loss": 1.2551, "mean_token_accuracy": 0.6489538252353668, "step": 30 }, { "epoch": 0.496, "grad_norm": 2.3282535076141357, "learning_rate": 9.935908917072253e-06, "loss": 1.1706, "mean_token_accuracy": 0.6664490401744843, "step": 31 }, { "epoch": 0.512, "grad_norm": 2.491532802581787, "learning_rate": 9.927100106776213e-06, "loss": 1.1587, "mean_token_accuracy": 0.6678481698036194, "step": 32 }, { "epoch": 0.528, "grad_norm": 2.270688533782959, "learning_rate": 9.917728706052765e-06, "loss": 1.2636, "mean_token_accuracy": 0.6454881429672241, "step": 33 }, { "epoch": 0.544, "grad_norm": 2.3011410236358643, "learning_rate": 9.907795784955327e-06, "loss": 1.1625, "mean_token_accuracy": 0.6705195307731628, "step": 34 }, { "epoch": 0.56, "grad_norm": 2.0805764198303223, "learning_rate": 9.897302477653334e-06, "loss": 1.1351, "mean_token_accuracy": 0.6750719845294952, "step": 35 }, { "epoch": 0.576, "grad_norm": 2.144771099090576, "learning_rate": 9.88624998230272e-06, "loss": 1.1632, "mean_token_accuracy": 0.6594493687152863, "step": 36 }, { "epoch": 0.592, "grad_norm": 2.3651621341705322, "learning_rate": 9.874639560909118e-06, "loss": 1.1155, "mean_token_accuracy": 0.6659645736217499, "step": 37 }, { "epoch": 0.608, "grad_norm": 2.438082695007324, "learning_rate": 9.862472539183757e-06, "loss": 1.049, "mean_token_accuracy": 0.6966923773288727, "step": 38 }, { "epoch": 0.624, "grad_norm": 2.3827691078186035, "learning_rate": 9.849750306392085e-06, "loss": 1.1359, "mean_token_accuracy": 0.6755817234516144, "step": 39 }, { "epoch": 0.64, "grad_norm": 2.284531831741333, "learning_rate": 9.836474315195148e-06, "loss": 1.2714, "mean_token_accuracy": 0.6394982039928436, "step": 40 }, { "epoch": 0.656, "grad_norm": 2.040752410888672, "learning_rate": 9.822646081483713e-06, "loss": 1.1401, "mean_token_accuracy": 0.6642443835735321, "step": 41 }, { "epoch": 0.672, "grad_norm": 2.416327476501465, "learning_rate": 9.808267184205182e-06, "loss": 1.1629, "mean_token_accuracy": 0.6591512560844421, "step": 42 }, { "epoch": 0.688, "grad_norm": 2.5612552165985107, "learning_rate": 9.793339265183303e-06, "loss": 1.0806, "mean_token_accuracy": 0.6875375807285309, "step": 43 }, { "epoch": 0.704, "grad_norm": 2.396414041519165, "learning_rate": 9.777864028930705e-06, "loss": 1.143, "mean_token_accuracy": 0.6633529961109161, "step": 44 }, { "epoch": 0.72, "grad_norm": 2.2041707038879395, "learning_rate": 9.761843242454261e-06, "loss": 1.1267, "mean_token_accuracy": 0.6633928120136261, "step": 45 }, { "epoch": 0.736, "grad_norm": 2.2253987789154053, "learning_rate": 9.745278735053345e-06, "loss": 1.052, "mean_token_accuracy": 0.6914463341236115, "step": 46 }, { "epoch": 0.752, "grad_norm": 2.0851356983184814, "learning_rate": 9.728172398110935e-06, "loss": 1.2451, "mean_token_accuracy": 0.6509403884410858, "step": 47 }, { "epoch": 0.768, "grad_norm": 2.110045909881592, "learning_rate": 9.710526184877667e-06, "loss": 1.1558, "mean_token_accuracy": 0.6611768901348114, "step": 48 }, { "epoch": 0.784, "grad_norm": 2.5329489707946777, "learning_rate": 9.692342110248802e-06, "loss": 1.0322, "mean_token_accuracy": 0.6901764273643494, "step": 49 }, { "epoch": 0.8, "grad_norm": 2.5721609592437744, "learning_rate": 9.673622250534155e-06, "loss": 1.1132, "mean_token_accuracy": 0.6760377287864685, "step": 50 }, { "epoch": 0.816, "grad_norm": 2.4259121417999268, "learning_rate": 9.654368743221022e-06, "loss": 1.1227, "mean_token_accuracy": 0.6690613925457001, "step": 51 }, { "epoch": 0.832, "grad_norm": 2.1539769172668457, "learning_rate": 9.63458378673011e-06, "loss": 1.1727, "mean_token_accuracy": 0.6630356311798096, "step": 52 }, { "epoch": 0.848, "grad_norm": 2.281749963760376, "learning_rate": 9.61426964016452e-06, "loss": 1.1745, "mean_token_accuracy": 0.6740401685237885, "step": 53 }, { "epoch": 0.864, "grad_norm": 2.3648617267608643, "learning_rate": 9.593428623051793e-06, "loss": 1.1393, "mean_token_accuracy": 0.6692823767662048, "step": 54 }, { "epoch": 0.88, "grad_norm": 2.0223169326782227, "learning_rate": 9.572063115079063e-06, "loss": 1.1872, "mean_token_accuracy": 0.6564280390739441, "step": 55 }, { "epoch": 0.896, "grad_norm": 2.4743919372558594, "learning_rate": 9.550175555821333e-06, "loss": 1.0744, "mean_token_accuracy": 0.6807240545749664, "step": 56 }, { "epoch": 0.912, "grad_norm": 2.407332181930542, "learning_rate": 9.527768444462922e-06, "loss": 1.0236, "mean_token_accuracy": 0.6924389898777008, "step": 57 }, { "epoch": 0.928, "grad_norm": 2.3066141605377197, "learning_rate": 9.504844339512096e-06, "loss": 1.08, "mean_token_accuracy": 0.6853623390197754, "step": 58 }, { "epoch": 0.944, "grad_norm": 2.3197851181030273, "learning_rate": 9.481405858508935e-06, "loss": 1.1733, "mean_token_accuracy": 0.6676386296749115, "step": 59 }, { "epoch": 0.96, "grad_norm": 2.277989625930786, "learning_rate": 9.457455677726447e-06, "loss": 1.112, "mean_token_accuracy": 0.6790435910224915, "step": 60 }, { "epoch": 0.976, "grad_norm": 2.2761218547821045, "learning_rate": 9.432996531865001e-06, "loss": 1.1124, "mean_token_accuracy": 0.6608242690563202, "step": 61 }, { "epoch": 0.992, "grad_norm": 2.4564669132232666, "learning_rate": 9.408031213740045e-06, "loss": 1.0301, "mean_token_accuracy": 0.6902466714382172, "step": 62 }, { "epoch": 1.0, "grad_norm": 2.897603988647461, "learning_rate": 9.382562573963238e-06, "loss": 1.1236, "mean_token_accuracy": 0.6699228882789612, "step": 63 }, { "epoch": 1.016, "grad_norm": 2.5035905838012695, "learning_rate": 9.356593520616948e-06, "loss": 0.9954, "mean_token_accuracy": 0.6969379484653473, "step": 64 }, { "epoch": 1.032, "grad_norm": 2.1372523307800293, "learning_rate": 9.330127018922195e-06, "loss": 0.8832, "mean_token_accuracy": 0.7084084153175354, "step": 65 }, { "epoch": 1.048, "grad_norm": 2.1107141971588135, "learning_rate": 9.303166090900082e-06, "loss": 0.997, "mean_token_accuracy": 0.6974039673805237, "step": 66 }, { "epoch": 1.064, "grad_norm": 2.203361988067627, "learning_rate": 9.275713815026732e-06, "loss": 0.8926, "mean_token_accuracy": 0.7100537419319153, "step": 67 }, { "epoch": 1.08, "grad_norm": 2.280200481414795, "learning_rate": 9.24777332588177e-06, "loss": 0.7885, "mean_token_accuracy": 0.7500935792922974, "step": 68 }, { "epoch": 1.096, "grad_norm": 1.9387831687927246, "learning_rate": 9.219347813790416e-06, "loss": 0.7684, "mean_token_accuracy": 0.7537571489810944, "step": 69 }, { "epoch": 1.112, "grad_norm": 1.8483513593673706, "learning_rate": 9.190440524459203e-06, "loss": 0.9308, "mean_token_accuracy": 0.7234339416027069, "step": 70 }, { "epoch": 1.1280000000000001, "grad_norm": 2.085538148880005, "learning_rate": 9.16105475860537e-06, "loss": 0.892, "mean_token_accuracy": 0.7284388244152069, "step": 71 }, { "epoch": 1.144, "grad_norm": 2.351925849914551, "learning_rate": 9.131193871579975e-06, "loss": 0.9014, "mean_token_accuracy": 0.709541916847229, "step": 72 }, { "epoch": 1.16, "grad_norm": 2.246936798095703, "learning_rate": 9.10086127298478e-06, "loss": 0.797, "mean_token_accuracy": 0.7530255913734436, "step": 73 }, { "epoch": 1.176, "grad_norm": 2.2428550720214844, "learning_rate": 9.070060426282924e-06, "loss": 0.891, "mean_token_accuracy": 0.7251231074333191, "step": 74 }, { "epoch": 1.192, "grad_norm": 2.2173142433166504, "learning_rate": 9.038794848403463e-06, "loss": 0.8948, "mean_token_accuracy": 0.7141964137554169, "step": 75 }, { "epoch": 1.208, "grad_norm": 2.357853889465332, "learning_rate": 9.007068109339783e-06, "loss": 0.8253, "mean_token_accuracy": 0.737085372209549, "step": 76 }, { "epoch": 1.224, "grad_norm": 2.5355238914489746, "learning_rate": 8.97488383174199e-06, "loss": 0.9135, "mean_token_accuracy": 0.7259043753147125, "step": 77 }, { "epoch": 1.24, "grad_norm": 2.1808676719665527, "learning_rate": 8.94224569050324e-06, "loss": 0.9373, "mean_token_accuracy": 0.718153566122055, "step": 78 }, { "epoch": 1.256, "grad_norm": 2.065774917602539, "learning_rate": 8.90915741234015e-06, "loss": 0.9066, "mean_token_accuracy": 0.7267594933509827, "step": 79 }, { "epoch": 1.272, "grad_norm": 2.277780055999756, "learning_rate": 8.87562277536726e-06, "loss": 0.84, "mean_token_accuracy": 0.7230830788612366, "step": 80 }, { "epoch": 1.288, "grad_norm": 2.232370615005493, "learning_rate": 8.84164560866564e-06, "loss": 0.8702, "mean_token_accuracy": 0.7305114567279816, "step": 81 }, { "epoch": 1.304, "grad_norm": 2.038029193878174, "learning_rate": 8.807229791845673e-06, "loss": 0.9693, "mean_token_accuracy": 0.71137934923172, "step": 82 }, { "epoch": 1.32, "grad_norm": 2.3167836666107178, "learning_rate": 8.772379254604074e-06, "loss": 0.9381, "mean_token_accuracy": 0.7007491886615753, "step": 83 }, { "epoch": 1.336, "grad_norm": 2.048931121826172, "learning_rate": 8.737097976275177e-06, "loss": 0.9307, "mean_token_accuracy": 0.7027971148490906, "step": 84 }, { "epoch": 1.3519999999999999, "grad_norm": 1.9068111181259155, "learning_rate": 8.701389985376578e-06, "loss": 0.8459, "mean_token_accuracy": 0.7333859205245972, "step": 85 }, { "epoch": 1.3679999999999999, "grad_norm": 2.2652273178100586, "learning_rate": 8.665259359149132e-06, "loss": 0.8011, "mean_token_accuracy": 0.7324046790599823, "step": 86 }, { "epoch": 1.384, "grad_norm": 2.2550220489501953, "learning_rate": 8.62871022309141e-06, "loss": 0.894, "mean_token_accuracy": 0.7118227481842041, "step": 87 }, { "epoch": 1.4, "grad_norm": 2.1859939098358154, "learning_rate": 8.591746750488639e-06, "loss": 0.9182, "mean_token_accuracy": 0.7196886539459229, "step": 88 }, { "epoch": 1.416, "grad_norm": 2.1244003772735596, "learning_rate": 8.554373161936176e-06, "loss": 0.8451, "mean_token_accuracy": 0.7242709100246429, "step": 89 }, { "epoch": 1.432, "grad_norm": 2.1248416900634766, "learning_rate": 8.516593724857598e-06, "loss": 0.7697, "mean_token_accuracy": 0.7540797293186188, "step": 90 }, { "epoch": 1.448, "grad_norm": 2.0870182514190674, "learning_rate": 8.478412753017433e-06, "loss": 0.8992, "mean_token_accuracy": 0.7169869244098663, "step": 91 }, { "epoch": 1.464, "grad_norm": 2.2304399013519287, "learning_rate": 8.439834606028594e-06, "loss": 0.9942, "mean_token_accuracy": 0.7090178430080414, "step": 92 }, { "epoch": 1.48, "grad_norm": 2.376088857650757, "learning_rate": 8.400863688854598e-06, "loss": 0.82, "mean_token_accuracy": 0.7365303039550781, "step": 93 }, { "epoch": 1.496, "grad_norm": 2.288710594177246, "learning_rate": 8.361504451306585e-06, "loss": 0.8563, "mean_token_accuracy": 0.730265200138092, "step": 94 }, { "epoch": 1.512, "grad_norm": 2.1162161827087402, "learning_rate": 8.321761387535231e-06, "loss": 0.8894, "mean_token_accuracy": 0.7278172373771667, "step": 95 }, { "epoch": 1.528, "grad_norm": 2.040531635284424, "learning_rate": 8.281639035517591e-06, "loss": 0.8451, "mean_token_accuracy": 0.7346065938472748, "step": 96 }, { "epoch": 1.544, "grad_norm": 2.139214277267456, "learning_rate": 8.241141976538944e-06, "loss": 0.9186, "mean_token_accuracy": 0.7127924859523773, "step": 97 }, { "epoch": 1.56, "grad_norm": 2.0378589630126953, "learning_rate": 8.200274834669675e-06, "loss": 0.8343, "mean_token_accuracy": 0.7423592507839203, "step": 98 }, { "epoch": 1.576, "grad_norm": 2.0469090938568115, "learning_rate": 8.159042276237308e-06, "loss": 0.9495, "mean_token_accuracy": 0.7150463759899139, "step": 99 }, { "epoch": 1.592, "grad_norm": 1.8930435180664062, "learning_rate": 8.117449009293668e-06, "loss": 0.7888, "mean_token_accuracy": 0.7434406578540802, "step": 100 }, { "epoch": 1.608, "grad_norm": 2.1599249839782715, "learning_rate": 8.075499783077321e-06, "loss": 0.8726, "mean_token_accuracy": 0.7206392288208008, "step": 101 }, { "epoch": 1.624, "grad_norm": 2.1455466747283936, "learning_rate": 8.033199387471278e-06, "loss": 0.9613, "mean_token_accuracy": 0.7104770839214325, "step": 102 }, { "epoch": 1.6400000000000001, "grad_norm": 2.0764999389648438, "learning_rate": 7.99055265245608e-06, "loss": 0.9353, "mean_token_accuracy": 0.7238904535770416, "step": 103 }, { "epoch": 1.6560000000000001, "grad_norm": 1.9134161472320557, "learning_rate": 7.9475644475583e-06, "loss": 0.8277, "mean_token_accuracy": 0.7398611009120941, "step": 104 }, { "epoch": 1.6720000000000002, "grad_norm": 2.059666872024536, "learning_rate": 7.904239681294515e-06, "loss": 0.809, "mean_token_accuracy": 0.7348803877830505, "step": 105 }, { "epoch": 1.688, "grad_norm": 1.903882622718811, "learning_rate": 7.860583300610849e-06, "loss": 0.779, "mean_token_accuracy": 0.7487615942955017, "step": 106 }, { "epoch": 1.704, "grad_norm": 1.921822428703308, "learning_rate": 7.81660029031811e-06, "loss": 0.8543, "mean_token_accuracy": 0.7314419448375702, "step": 107 }, { "epoch": 1.72, "grad_norm": 1.9682213068008423, "learning_rate": 7.772295672522615e-06, "loss": 0.8275, "mean_token_accuracy": 0.7274343073368073, "step": 108 }, { "epoch": 1.736, "grad_norm": 2.098949909210205, "learning_rate": 7.727674506052744e-06, "loss": 0.862, "mean_token_accuracy": 0.7319101393222809, "step": 109 }, { "epoch": 1.752, "grad_norm": 1.8602185249328613, "learning_rate": 7.682741885881314e-06, "loss": 0.8919, "mean_token_accuracy": 0.7283230125904083, "step": 110 }, { "epoch": 1.768, "grad_norm": 1.8391119241714478, "learning_rate": 7.637502942543825e-06, "loss": 0.9563, "mean_token_accuracy": 0.716068685054779, "step": 111 }, { "epoch": 1.784, "grad_norm": 1.818740725517273, "learning_rate": 7.591962841552627e-06, "loss": 0.8362, "mean_token_accuracy": 0.7313212752342224, "step": 112 }, { "epoch": 1.8, "grad_norm": 1.9765996932983398, "learning_rate": 7.546126782807117e-06, "loss": 0.9427, "mean_token_accuracy": 0.7000788450241089, "step": 113 }, { "epoch": 1.8159999999999998, "grad_norm": 1.8508985042572021, "learning_rate": 7.500000000000001e-06, "loss": 0.8859, "mean_token_accuracy": 0.7213517427444458, "step": 114 }, { "epoch": 1.8319999999999999, "grad_norm": 1.936930537223816, "learning_rate": 7.453587760019691e-06, "loss": 0.8006, "mean_token_accuracy": 0.7494409084320068, "step": 115 }, { "epoch": 1.8479999999999999, "grad_norm": 1.8357962369918823, "learning_rate": 7.406895362348916e-06, "loss": 0.9697, "mean_token_accuracy": 0.7061353325843811, "step": 116 }, { "epoch": 1.8639999999999999, "grad_norm": 1.9790865182876587, "learning_rate": 7.359928138459615e-06, "loss": 0.9303, "mean_token_accuracy": 0.7191019952297211, "step": 117 }, { "epoch": 1.88, "grad_norm": 1.857360601425171, "learning_rate": 7.312691451204178e-06, "loss": 0.8748, "mean_token_accuracy": 0.7341600954532623, "step": 118 }, { "epoch": 1.896, "grad_norm": 1.8874846696853638, "learning_rate": 7.265190694203086e-06, "loss": 0.9276, "mean_token_accuracy": 0.7085016965866089, "step": 119 }, { "epoch": 1.912, "grad_norm": 1.8215382099151611, "learning_rate": 7.217431291229068e-06, "loss": 0.8696, "mean_token_accuracy": 0.712285041809082, "step": 120 }, { "epoch": 1.928, "grad_norm": 2.019747257232666, "learning_rate": 7.169418695587791e-06, "loss": 0.8699, "mean_token_accuracy": 0.7117200493812561, "step": 121 }, { "epoch": 1.944, "grad_norm": 2.0499608516693115, "learning_rate": 7.121158389495187e-06, "loss": 0.8163, "mean_token_accuracy": 0.7351702451705933, "step": 122 }, { "epoch": 1.96, "grad_norm": 1.9624381065368652, "learning_rate": 7.072655883451478e-06, "loss": 0.8318, "mean_token_accuracy": 0.7394059598445892, "step": 123 }, { "epoch": 1.976, "grad_norm": 2.178957223892212, "learning_rate": 7.023916715611969e-06, "loss": 0.8883, "mean_token_accuracy": 0.7379129827022552, "step": 124 }, { "epoch": 1.992, "grad_norm": 2.058216094970703, "learning_rate": 6.974946451154694e-06, "loss": 0.8657, "mean_token_accuracy": 0.7464525401592255, "step": 125 }, { "epoch": 2.0, "grad_norm": 2.7633073329925537, "learning_rate": 6.925750681644954e-06, "loss": 0.8745, "mean_token_accuracy": 0.6934740543365479, "step": 126 }, { "epoch": 2.016, "grad_norm": 2.234250783920288, "learning_rate": 6.876335024396872e-06, "loss": 0.7128, "mean_token_accuracy": 0.7859170734882355, "step": 127 }, { "epoch": 2.032, "grad_norm": 1.9746248722076416, "learning_rate": 6.8267051218319766e-06, "loss": 0.7826, "mean_token_accuracy": 0.7330600023269653, "step": 128 }, { "epoch": 2.048, "grad_norm": 1.6777163743972778, "learning_rate": 6.7768666408349445e-06, "loss": 0.7145, "mean_token_accuracy": 0.7724449634552002, "step": 129 }, { "epoch": 2.064, "grad_norm": 1.8287721872329712, "learning_rate": 6.726825272106539e-06, "loss": 0.7962, "mean_token_accuracy": 0.7566362023353577, "step": 130 }, { "epoch": 2.08, "grad_norm": 1.8211966753005981, "learning_rate": 6.676586729513823e-06, "loss": 0.603, "mean_token_accuracy": 0.7968634068965912, "step": 131 }, { "epoch": 2.096, "grad_norm": 2.150343894958496, "learning_rate": 6.626156749437736e-06, "loss": 0.5516, "mean_token_accuracy": 0.8278916776180267, "step": 132 }, { "epoch": 2.112, "grad_norm": 2.1101906299591064, "learning_rate": 6.575541090118105e-06, "loss": 0.7889, "mean_token_accuracy": 0.7510809600353241, "step": 133 }, { "epoch": 2.128, "grad_norm": 1.85099196434021, "learning_rate": 6.524745530996137e-06, "loss": 0.6569, "mean_token_accuracy": 0.7761998474597931, "step": 134 }, { "epoch": 2.144, "grad_norm": 1.9975664615631104, "learning_rate": 6.473775872054522e-06, "loss": 0.6174, "mean_token_accuracy": 0.7907389402389526, "step": 135 }, { "epoch": 2.16, "grad_norm": 1.923912763595581, "learning_rate": 6.4226379331551625e-06, "loss": 0.6157, "mean_token_accuracy": 0.7957707643508911, "step": 136 }, { "epoch": 2.176, "grad_norm": 2.096576452255249, "learning_rate": 6.3713375533746525e-06, "loss": 0.5704, "mean_token_accuracy": 0.8159550130367279, "step": 137 }, { "epoch": 2.192, "grad_norm": 2.174253463745117, "learning_rate": 6.319880590337549e-06, "loss": 0.5904, "mean_token_accuracy": 0.7935393750667572, "step": 138 }, { "epoch": 2.208, "grad_norm": 1.8152769804000854, "learning_rate": 6.268272919547537e-06, "loss": 0.6239, "mean_token_accuracy": 0.7914057075977325, "step": 139 }, { "epoch": 2.224, "grad_norm": 1.8336701393127441, "learning_rate": 6.216520433716544e-06, "loss": 0.7358, "mean_token_accuracy": 0.7601068317890167, "step": 140 }, { "epoch": 2.24, "grad_norm": 1.7724854946136475, "learning_rate": 6.164629042091894e-06, "loss": 0.5178, "mean_token_accuracy": 0.8328815996646881, "step": 141 }, { "epoch": 2.2560000000000002, "grad_norm": 2.001527786254883, "learning_rate": 6.112604669781572e-06, "loss": 0.7341, "mean_token_accuracy": 0.7661413848400116, "step": 142 }, { "epoch": 2.2720000000000002, "grad_norm": 1.973062515258789, "learning_rate": 6.060453257077686e-06, "loss": 0.6315, "mean_token_accuracy": 0.7816351056098938, "step": 143 }, { "epoch": 2.288, "grad_norm": 1.9068118333816528, "learning_rate": 6.008180758778167e-06, "loss": 0.6673, "mean_token_accuracy": 0.7819712162017822, "step": 144 }, { "epoch": 2.304, "grad_norm": 1.9064576625823975, "learning_rate": 5.955793143506863e-06, "loss": 0.5872, "mean_token_accuracy": 0.8068342804908752, "step": 145 }, { "epoch": 2.32, "grad_norm": 1.783179759979248, "learning_rate": 5.903296393031996e-06, "loss": 0.6182, "mean_token_accuracy": 0.7834264039993286, "step": 146 }, { "epoch": 2.336, "grad_norm": 1.8228886127471924, "learning_rate": 5.850696501583164e-06, "loss": 0.6316, "mean_token_accuracy": 0.7983745634555817, "step": 147 }, { "epoch": 2.352, "grad_norm": 1.7698777914047241, "learning_rate": 5.797999475166897e-06, "loss": 0.7002, "mean_token_accuracy": 0.7744604349136353, "step": 148 }, { "epoch": 2.368, "grad_norm": 2.0236709117889404, "learning_rate": 5.745211330880872e-06, "loss": 0.6289, "mean_token_accuracy": 0.7845793068408966, "step": 149 }, { "epoch": 2.384, "grad_norm": 2.0701353549957275, "learning_rate": 5.69233809622687e-06, "loss": 0.4853, "mean_token_accuracy": 0.8342052102088928, "step": 150 }, { "epoch": 2.4, "grad_norm": 1.841230034828186, "learning_rate": 5.6393858084225305e-06, "loss": 0.6612, "mean_token_accuracy": 0.7836865186691284, "step": 151 }, { "epoch": 2.416, "grad_norm": 1.8939555883407593, "learning_rate": 5.586360513712011e-06, "loss": 0.645, "mean_token_accuracy": 0.779130607843399, "step": 152 }, { "epoch": 2.432, "grad_norm": 1.9395414590835571, "learning_rate": 5.533268266675601e-06, "loss": 0.6672, "mean_token_accuracy": 0.767756462097168, "step": 153 }, { "epoch": 2.448, "grad_norm": 1.8063210248947144, "learning_rate": 5.480115129538409e-06, "loss": 0.6267, "mean_token_accuracy": 0.7938161492347717, "step": 154 }, { "epoch": 2.464, "grad_norm": 1.9447354078292847, "learning_rate": 5.426907171478143e-06, "loss": 0.458, "mean_token_accuracy": 0.8398927450180054, "step": 155 }, { "epoch": 2.48, "grad_norm": 1.9445098638534546, "learning_rate": 5.373650467932122e-06, "loss": 0.5893, "mean_token_accuracy": 0.8068392276763916, "step": 156 }, { "epoch": 2.496, "grad_norm": 1.9469696283340454, "learning_rate": 5.320351099903565e-06, "loss": 0.7112, "mean_token_accuracy": 0.7803526222705841, "step": 157 }, { "epoch": 2.512, "grad_norm": 1.8126373291015625, "learning_rate": 5.267015153267246e-06, "loss": 0.5232, "mean_token_accuracy": 0.8252148330211639, "step": 158 }, { "epoch": 2.528, "grad_norm": 1.9050959348678589, "learning_rate": 5.213648718074584e-06, "loss": 0.5775, "mean_token_accuracy": 0.7865356504917145, "step": 159 }, { "epoch": 2.544, "grad_norm": 1.998955488204956, "learning_rate": 5.160257887858278e-06, "loss": 0.703, "mean_token_accuracy": 0.7390948235988617, "step": 160 }, { "epoch": 2.56, "grad_norm": 1.973913311958313, "learning_rate": 5.106848758936508e-06, "loss": 0.6627, "mean_token_accuracy": 0.7834351360797882, "step": 161 }, { "epoch": 2.576, "grad_norm": 1.9610992670059204, "learning_rate": 5.053427429716867e-06, "loss": 0.676, "mean_token_accuracy": 0.7891885936260223, "step": 162 }, { "epoch": 2.592, "grad_norm": 1.9601593017578125, "learning_rate": 5e-06, "loss": 0.6207, "mean_token_accuracy": 0.7994289994239807, "step": 163 }, { "epoch": 2.608, "grad_norm": 1.98080575466156, "learning_rate": 4.946572570283135e-06, "loss": 0.5899, "mean_token_accuracy": 0.7986661791801453, "step": 164 }, { "epoch": 2.624, "grad_norm": 1.8518565893173218, "learning_rate": 4.893151241063493e-06, "loss": 0.6278, "mean_token_accuracy": 0.7949875593185425, "step": 165 }, { "epoch": 2.64, "grad_norm": 1.865482211112976, "learning_rate": 4.839742112141725e-06, "loss": 0.6787, "mean_token_accuracy": 0.7608937621116638, "step": 166 }, { "epoch": 2.656, "grad_norm": 2.0454063415527344, "learning_rate": 4.786351281925417e-06, "loss": 0.7028, "mean_token_accuracy": 0.7533181011676788, "step": 167 }, { "epoch": 2.672, "grad_norm": 1.9324195384979248, "learning_rate": 4.732984846732755e-06, "loss": 0.5646, "mean_token_accuracy": 0.8008407652378082, "step": 168 }, { "epoch": 2.6879999999999997, "grad_norm": 1.8085490465164185, "learning_rate": 4.679648900096436e-06, "loss": 0.6662, "mean_token_accuracy": 0.7686671316623688, "step": 169 }, { "epoch": 2.7039999999999997, "grad_norm": 1.8703786134719849, "learning_rate": 4.626349532067879e-06, "loss": 0.7152, "mean_token_accuracy": 0.7892328798770905, "step": 170 }, { "epoch": 2.7199999999999998, "grad_norm": 2.0019490718841553, "learning_rate": 4.573092828521857e-06, "loss": 0.569, "mean_token_accuracy": 0.8060936331748962, "step": 171 }, { "epoch": 2.7359999999999998, "grad_norm": 1.706632137298584, "learning_rate": 4.5198848704615915e-06, "loss": 0.6882, "mean_token_accuracy": 0.7672425508499146, "step": 172 }, { "epoch": 2.752, "grad_norm": 1.881160855293274, "learning_rate": 4.466731733324399e-06, "loss": 0.6283, "mean_token_accuracy": 0.8031312525272369, "step": 173 }, { "epoch": 2.768, "grad_norm": 1.8949018716812134, "learning_rate": 4.413639486287992e-06, "loss": 0.5711, "mean_token_accuracy": 0.8097128570079803, "step": 174 }, { "epoch": 2.784, "grad_norm": 1.7667330503463745, "learning_rate": 4.3606141915774695e-06, "loss": 0.8263, "mean_token_accuracy": 0.7537855207920074, "step": 175 }, { "epoch": 2.8, "grad_norm": 1.8455729484558105, "learning_rate": 4.307661903773129e-06, "loss": 0.5854, "mean_token_accuracy": 0.8064002692699432, "step": 176 }, { "epoch": 2.816, "grad_norm": 1.8596807718276978, "learning_rate": 4.254788669119127e-06, "loss": 0.6915, "mean_token_accuracy": 0.7887419760227203, "step": 177 }, { "epoch": 2.832, "grad_norm": 1.7176703214645386, "learning_rate": 4.2020005248331056e-06, "loss": 0.6683, "mean_token_accuracy": 0.7633313834667206, "step": 178 }, { "epoch": 2.848, "grad_norm": 1.7099390029907227, "learning_rate": 4.149303498416838e-06, "loss": 0.5278, "mean_token_accuracy": 0.7912513315677643, "step": 179 }, { "epoch": 2.864, "grad_norm": 1.9386448860168457, "learning_rate": 4.096703606968007e-06, "loss": 0.6205, "mean_token_accuracy": 0.787754625082016, "step": 180 }, { "epoch": 2.88, "grad_norm": 1.731827974319458, "learning_rate": 4.04420685649314e-06, "loss": 0.6155, "mean_token_accuracy": 0.8019371926784515, "step": 181 }, { "epoch": 2.896, "grad_norm": 1.9536575078964233, "learning_rate": 3.991819241221836e-06, "loss": 0.6656, "mean_token_accuracy": 0.7672460973262787, "step": 182 }, { "epoch": 2.912, "grad_norm": 1.7277284860610962, "learning_rate": 3.939546742922318e-06, "loss": 0.6124, "mean_token_accuracy": 0.7833640873432159, "step": 183 }, { "epoch": 2.928, "grad_norm": 1.7660479545593262, "learning_rate": 3.887395330218429e-06, "loss": 0.5523, "mean_token_accuracy": 0.8151686191558838, "step": 184 }, { "epoch": 2.944, "grad_norm": 1.7463651895523071, "learning_rate": 3.835370957908108e-06, "loss": 0.5803, "mean_token_accuracy": 0.8114446699619293, "step": 185 }, { "epoch": 2.96, "grad_norm": 1.9581048488616943, "learning_rate": 3.783479566283457e-06, "loss": 0.7512, "mean_token_accuracy": 0.7575298547744751, "step": 186 }, { "epoch": 2.976, "grad_norm": 2.0337820053100586, "learning_rate": 3.731727080452464e-06, "loss": 0.4586, "mean_token_accuracy": 0.8477471768856049, "step": 187 }, { "epoch": 2.992, "grad_norm": 2.0532712936401367, "learning_rate": 3.6801194096624515e-06, "loss": 0.7017, "mean_token_accuracy": 0.7881054580211639, "step": 188 }, { "epoch": 3.0, "grad_norm": 2.616175651550293, "learning_rate": 3.6286624466253496e-06, "loss": 0.4702, "mean_token_accuracy": 0.8436923027038574, "step": 189 }, { "epoch": 3.016, "grad_norm": 1.8989686965942383, "learning_rate": 3.5773620668448384e-06, "loss": 0.4499, "mean_token_accuracy": 0.8539375066757202, "step": 190 }, { "epoch": 3.032, "grad_norm": 1.7433695793151855, "learning_rate": 3.526224127945479e-06, "loss": 0.44, "mean_token_accuracy": 0.834608644247055, "step": 191 }, { "epoch": 3.048, "grad_norm": 1.5851784944534302, "learning_rate": 3.475254469003865e-06, "loss": 0.526, "mean_token_accuracy": 0.8261672258377075, "step": 192 }, { "epoch": 3.064, "grad_norm": 1.618119239807129, "learning_rate": 3.424458909881897e-06, "loss": 0.4833, "mean_token_accuracy": 0.8181418478488922, "step": 193 }, { "epoch": 3.08, "grad_norm": 1.6358258724212646, "learning_rate": 3.3738432505622653e-06, "loss": 0.3926, "mean_token_accuracy": 0.8705029785633087, "step": 194 }, { "epoch": 3.096, "grad_norm": 1.7759870290756226, "learning_rate": 3.3234132704861786e-06, "loss": 0.6084, "mean_token_accuracy": 0.8052189946174622, "step": 195 }, { "epoch": 3.112, "grad_norm": 1.9421766996383667, "learning_rate": 3.273174727893463e-06, "loss": 0.4258, "mean_token_accuracy": 0.8420732915401459, "step": 196 }, { "epoch": 3.128, "grad_norm": 2.129655361175537, "learning_rate": 3.2231333591650567e-06, "loss": 0.4995, "mean_token_accuracy": 0.8285854160785675, "step": 197 }, { "epoch": 3.144, "grad_norm": 1.9939950704574585, "learning_rate": 3.173294878168025e-06, "loss": 0.4836, "mean_token_accuracy": 0.8304620087146759, "step": 198 }, { "epoch": 3.16, "grad_norm": 2.111128330230713, "learning_rate": 3.12366497560313e-06, "loss": 0.4879, "mean_token_accuracy": 0.8432117104530334, "step": 199 }, { "epoch": 3.176, "grad_norm": 1.8537310361862183, "learning_rate": 3.074249318355046e-06, "loss": 0.4127, "mean_token_accuracy": 0.850278377532959, "step": 200 }, { "epoch": 3.192, "grad_norm": 1.8146088123321533, "learning_rate": 3.0250535488453077e-06, "loss": 0.4438, "mean_token_accuracy": 0.830359548330307, "step": 201 }, { "epoch": 3.208, "grad_norm": 1.8513424396514893, "learning_rate": 2.976083284388031e-06, "loss": 0.4974, "mean_token_accuracy": 0.8160843849182129, "step": 202 }, { "epoch": 3.224, "grad_norm": 1.7857319116592407, "learning_rate": 2.9273441165485227e-06, "loss": 0.4141, "mean_token_accuracy": 0.8569373190402985, "step": 203 }, { "epoch": 3.24, "grad_norm": 1.8783490657806396, "learning_rate": 2.8788416105048124e-06, "loss": 0.3886, "mean_token_accuracy": 0.850079745054245, "step": 204 }, { "epoch": 3.2560000000000002, "grad_norm": 1.798640489578247, "learning_rate": 2.83058130441221e-06, "loss": 0.434, "mean_token_accuracy": 0.8334758579730988, "step": 205 }, { "epoch": 3.2720000000000002, "grad_norm": 1.7705386877059937, "learning_rate": 2.782568708770933e-06, "loss": 0.4141, "mean_token_accuracy": 0.8566360175609589, "step": 206 }, { "epoch": 3.288, "grad_norm": 1.741195797920227, "learning_rate": 2.734809305796915e-06, "loss": 0.4204, "mean_token_accuracy": 0.8755166530609131, "step": 207 }, { "epoch": 3.304, "grad_norm": 1.6184072494506836, "learning_rate": 2.687308548795825e-06, "loss": 0.3762, "mean_token_accuracy": 0.8658173084259033, "step": 208 }, { "epoch": 3.32, "grad_norm": 1.733109712600708, "learning_rate": 2.6400718615403852e-06, "loss": 0.5744, "mean_token_accuracy": 0.8061816394329071, "step": 209 }, { "epoch": 3.336, "grad_norm": 1.9283976554870605, "learning_rate": 2.5931046376510875e-06, "loss": 0.4062, "mean_token_accuracy": 0.8531254231929779, "step": 210 }, { "epoch": 3.352, "grad_norm": 1.7725756168365479, "learning_rate": 2.5464122399803126e-06, "loss": 0.5576, "mean_token_accuracy": 0.800176590681076, "step": 211 }, { "epoch": 3.368, "grad_norm": 1.6289314031600952, "learning_rate": 2.5000000000000015e-06, "loss": 0.4405, "mean_token_accuracy": 0.8461686670780182, "step": 212 }, { "epoch": 3.384, "grad_norm": 1.7887394428253174, "learning_rate": 2.4538732171928847e-06, "loss": 0.4359, "mean_token_accuracy": 0.8430683016777039, "step": 213 }, { "epoch": 3.4, "grad_norm": 1.781262993812561, "learning_rate": 2.408037158447375e-06, "loss": 0.6237, "mean_token_accuracy": 0.8007213473320007, "step": 214 }, { "epoch": 3.416, "grad_norm": 1.6724776029586792, "learning_rate": 2.3624970574561773e-06, "loss": 0.4734, "mean_token_accuracy": 0.8019916415214539, "step": 215 }, { "epoch": 3.432, "grad_norm": 1.624058485031128, "learning_rate": 2.317258114118686e-06, "loss": 0.5263, "mean_token_accuracy": 0.8112485110759735, "step": 216 }, { "epoch": 3.448, "grad_norm": 1.5671225786209106, "learning_rate": 2.272325493947257e-06, "loss": 0.5377, "mean_token_accuracy": 0.8191773295402527, "step": 217 }, { "epoch": 3.464, "grad_norm": 1.6635698080062866, "learning_rate": 2.2277043274773856e-06, "loss": 0.4164, "mean_token_accuracy": 0.8735319077968597, "step": 218 }, { "epoch": 3.48, "grad_norm": 1.5347254276275635, "learning_rate": 2.1833997096818897e-06, "loss": 0.5414, "mean_token_accuracy": 0.8047713041305542, "step": 219 }, { "epoch": 3.496, "grad_norm": 1.5895464420318604, "learning_rate": 2.139416699389153e-06, "loss": 0.431, "mean_token_accuracy": 0.8373080492019653, "step": 220 }, { "epoch": 3.512, "grad_norm": 1.728340983390808, "learning_rate": 2.095760318705487e-06, "loss": 0.4073, "mean_token_accuracy": 0.8578878045082092, "step": 221 }, { "epoch": 3.528, "grad_norm": 1.8697818517684937, "learning_rate": 2.0524355524417017e-06, "loss": 0.4038, "mean_token_accuracy": 0.8781362175941467, "step": 222 }, { "epoch": 3.544, "grad_norm": 1.7442781925201416, "learning_rate": 2.00944734754392e-06, "loss": 0.3679, "mean_token_accuracy": 0.8618011176586151, "step": 223 }, { "epoch": 3.56, "grad_norm": 1.6816269159317017, "learning_rate": 1.966800612528723e-06, "loss": 0.5316, "mean_token_accuracy": 0.7882444560527802, "step": 224 }, { "epoch": 3.576, "grad_norm": 1.6920692920684814, "learning_rate": 1.9245002169226814e-06, "loss": 0.4844, "mean_token_accuracy": 0.8364316821098328, "step": 225 }, { "epoch": 3.592, "grad_norm": 1.9367053508758545, "learning_rate": 1.8825509907063328e-06, "loss": 0.3735, "mean_token_accuracy": 0.86876380443573, "step": 226 }, { "epoch": 3.608, "grad_norm": 1.8703553676605225, "learning_rate": 1.8409577237626935e-06, "loss": 0.4888, "mean_token_accuracy": 0.8111458122730255, "step": 227 }, { "epoch": 3.624, "grad_norm": 1.7826961278915405, "learning_rate": 1.7997251653303249e-06, "loss": 0.4856, "mean_token_accuracy": 0.801908940076828, "step": 228 }, { "epoch": 3.64, "grad_norm": 1.846787929534912, "learning_rate": 1.7588580234610592e-06, "loss": 0.5526, "mean_token_accuracy": 0.8281767964363098, "step": 229 }, { "epoch": 3.656, "grad_norm": 1.5192652940750122, "learning_rate": 1.7183609644824096e-06, "loss": 0.4609, "mean_token_accuracy": 0.8234535455703735, "step": 230 }, { "epoch": 3.672, "grad_norm": 1.6722280979156494, "learning_rate": 1.67823861246477e-06, "loss": 0.5246, "mean_token_accuracy": 0.7935002446174622, "step": 231 }, { "epoch": 3.6879999999999997, "grad_norm": 1.5522016286849976, "learning_rate": 1.6384955486934157e-06, "loss": 0.4526, "mean_token_accuracy": 0.8310949802398682, "step": 232 }, { "epoch": 3.7039999999999997, "grad_norm": 1.7835192680358887, "learning_rate": 1.5991363111454023e-06, "loss": 0.4427, "mean_token_accuracy": 0.8425520956516266, "step": 233 }, { "epoch": 3.7199999999999998, "grad_norm": 1.6655128002166748, "learning_rate": 1.5601653939714073e-06, "loss": 0.5791, "mean_token_accuracy": 0.8191401064395905, "step": 234 }, { "epoch": 3.7359999999999998, "grad_norm": 1.6413390636444092, "learning_rate": 1.5215872469825682e-06, "loss": 0.4886, "mean_token_accuracy": 0.8357130289077759, "step": 235 }, { "epoch": 3.752, "grad_norm": 1.6971957683563232, "learning_rate": 1.4834062751424018e-06, "loss": 0.5311, "mean_token_accuracy": 0.8187433481216431, "step": 236 }, { "epoch": 3.768, "grad_norm": 1.489127516746521, "learning_rate": 1.4456268380638262e-06, "loss": 0.4431, "mean_token_accuracy": 0.8484348654747009, "step": 237 }, { "epoch": 3.784, "grad_norm": 1.6830918788909912, "learning_rate": 1.4082532495113627e-06, "loss": 0.4008, "mean_token_accuracy": 0.8682940900325775, "step": 238 }, { "epoch": 3.8, "grad_norm": 1.8044171333312988, "learning_rate": 1.3712897769085903e-06, "loss": 0.3821, "mean_token_accuracy": 0.8439038991928101, "step": 239 }, { "epoch": 3.816, "grad_norm": 1.6815401315689087, "learning_rate": 1.3347406408508695e-06, "loss": 0.3555, "mean_token_accuracy": 0.8827997744083405, "step": 240 }, { "epoch": 3.832, "grad_norm": 1.5818198919296265, "learning_rate": 1.298610014623423e-06, "loss": 0.3805, "mean_token_accuracy": 0.8539446592330933, "step": 241 }, { "epoch": 3.848, "grad_norm": 1.6825566291809082, "learning_rate": 1.2629020237248241e-06, "loss": 0.4691, "mean_token_accuracy": 0.8389621675014496, "step": 242 }, { "epoch": 3.864, "grad_norm": 1.8937445878982544, "learning_rate": 1.2276207453959283e-06, "loss": 0.4858, "mean_token_accuracy": 0.8229635059833527, "step": 243 }, { "epoch": 3.88, "grad_norm": 1.7236624956130981, "learning_rate": 1.1927702081543279e-06, "loss": 0.5053, "mean_token_accuracy": 0.8316462635993958, "step": 244 }, { "epoch": 3.896, "grad_norm": 1.700505256652832, "learning_rate": 1.158354391334362e-06, "loss": 0.3465, "mean_token_accuracy": 0.8770395815372467, "step": 245 }, { "epoch": 3.912, "grad_norm": 1.7477822303771973, "learning_rate": 1.1243772246327416e-06, "loss": 0.4043, "mean_token_accuracy": 0.8705323040485382, "step": 246 }, { "epoch": 3.928, "grad_norm": 1.6631226539611816, "learning_rate": 1.0908425876598512e-06, "loss": 0.446, "mean_token_accuracy": 0.8587776124477386, "step": 247 }, { "epoch": 3.944, "grad_norm": 1.7363426685333252, "learning_rate": 1.0577543094967613e-06, "loss": 0.4081, "mean_token_accuracy": 0.871842622756958, "step": 248 }, { "epoch": 3.96, "grad_norm": 1.7819398641586304, "learning_rate": 1.0251161682580125e-06, "loss": 0.3599, "mean_token_accuracy": 0.8662829697132111, "step": 249 }, { "epoch": 3.976, "grad_norm": 1.8177067041397095, "learning_rate": 9.929318906602176e-07, "loss": 0.4463, "mean_token_accuracy": 0.8615152835845947, "step": 250 }, { "epoch": 3.992, "grad_norm": 1.6980502605438232, "learning_rate": 9.612051515965388e-07, "loss": 0.5262, "mean_token_accuracy": 0.7997405827045441, "step": 251 }, { "epoch": 4.0, "grad_norm": 2.324087619781494, "learning_rate": 9.299395737170758e-07, "loss": 0.3421, "mean_token_accuracy": 0.8514267802238464, "step": 252 }, { "epoch": 4.016, "grad_norm": 1.6846673488616943, "learning_rate": 8.991387270152202e-07, "loss": 0.4377, "mean_token_accuracy": 0.8406525552272797, "step": 253 }, { "epoch": 4.032, "grad_norm": 1.7401528358459473, "learning_rate": 8.688061284200266e-07, "loss": 0.3745, "mean_token_accuracy": 0.862852543592453, "step": 254 }, { "epoch": 4.048, "grad_norm": 1.5129587650299072, "learning_rate": 8.389452413946314e-07, "loss": 0.4453, "mean_token_accuracy": 0.8313189148902893, "step": 255 }, { "epoch": 4.064, "grad_norm": 1.4720897674560547, "learning_rate": 8.095594755407971e-07, "loss": 0.5333, "mean_token_accuracy": 0.8177091777324677, "step": 256 }, { "epoch": 4.08, "grad_norm": 1.5798670053482056, "learning_rate": 7.806521862095834e-07, "loss": 0.3594, "mean_token_accuracy": 0.878955602645874, "step": 257 }, { "epoch": 4.096, "grad_norm": 1.5988901853561401, "learning_rate": 7.522266741182305e-07, "loss": 0.3372, "mean_token_accuracy": 0.8858175873756409, "step": 258 }, { "epoch": 4.112, "grad_norm": 1.3429925441741943, "learning_rate": 7.242861849732696e-07, "loss": 0.3053, "mean_token_accuracy": 0.8911112546920776, "step": 259 }, { "epoch": 4.128, "grad_norm": 1.392916202545166, "learning_rate": 6.968339090999188e-07, "loss": 0.3566, "mean_token_accuracy": 0.8763693869113922, "step": 260 }, { "epoch": 4.144, "grad_norm": 1.3282548189163208, "learning_rate": 6.698729810778065e-07, "loss": 0.3934, "mean_token_accuracy": 0.8518996834754944, "step": 261 }, { "epoch": 4.16, "grad_norm": 1.3634921312332153, "learning_rate": 6.43406479383053e-07, "loss": 0.2876, "mean_token_accuracy": 0.8888527750968933, "step": 262 }, { "epoch": 4.176, "grad_norm": 1.5951544046401978, "learning_rate": 6.174374260367611e-07, "loss": 0.3306, "mean_token_accuracy": 0.8580814898014069, "step": 263 }, { "epoch": 4.192, "grad_norm": 1.8386245965957642, "learning_rate": 5.919687862599549e-07, "loss": 0.5825, "mean_token_accuracy": 0.8090504109859467, "step": 264 }, { "epoch": 4.208, "grad_norm": 1.6140440702438354, "learning_rate": 5.670034681349995e-07, "loss": 0.3189, "mean_token_accuracy": 0.8884587585926056, "step": 265 }, { "epoch": 4.224, "grad_norm": 1.5564498901367188, "learning_rate": 5.425443222735527e-07, "loss": 0.3539, "mean_token_accuracy": 0.8893531560897827, "step": 266 }, { "epoch": 4.24, "grad_norm": 1.6134988069534302, "learning_rate": 5.185941414910673e-07, "loss": 0.3285, "mean_token_accuracy": 0.8698451220989227, "step": 267 }, { "epoch": 4.256, "grad_norm": 1.657450556755066, "learning_rate": 4.951556604879049e-07, "loss": 0.3152, "mean_token_accuracy": 0.8656443655490875, "step": 268 }, { "epoch": 4.272, "grad_norm": 1.6017976999282837, "learning_rate": 4.722315555370793e-07, "loss": 0.3683, "mean_token_accuracy": 0.8707852065563202, "step": 269 }, { "epoch": 4.288, "grad_norm": 1.5347394943237305, "learning_rate": 4.4982444417866753e-07, "loss": 0.3373, "mean_token_accuracy": 0.8773435652256012, "step": 270 }, { "epoch": 4.304, "grad_norm": 1.658043384552002, "learning_rate": 4.279368849209381e-07, "loss": 0.4898, "mean_token_accuracy": 0.8278676569461823, "step": 271 }, { "epoch": 4.32, "grad_norm": 1.503208875656128, "learning_rate": 4.0657137694820826e-07, "loss": 0.2716, "mean_token_accuracy": 0.9029871225357056, "step": 272 }, { "epoch": 4.336, "grad_norm": 1.4751423597335815, "learning_rate": 3.8573035983548167e-07, "loss": 0.4335, "mean_token_accuracy": 0.8453833758831024, "step": 273 }, { "epoch": 4.352, "grad_norm": 1.5802851915359497, "learning_rate": 3.6541621326989183e-07, "loss": 0.372, "mean_token_accuracy": 0.8641645312309265, "step": 274 }, { "epoch": 4.368, "grad_norm": 1.4285887479782104, "learning_rate": 3.4563125677897936e-07, "loss": 0.2716, "mean_token_accuracy": 0.8914371728897095, "step": 275 }, { "epoch": 4.384, "grad_norm": 1.543737769126892, "learning_rate": 3.263777494658449e-07, "loss": 0.4701, "mean_token_accuracy": 0.8330877125263214, "step": 276 }, { "epoch": 4.4, "grad_norm": 1.5941988229751587, "learning_rate": 3.076578897511978e-07, "loss": 0.4733, "mean_token_accuracy": 0.8405623137950897, "step": 277 }, { "epoch": 4.416, "grad_norm": 1.4298094511032104, "learning_rate": 2.894738151223331e-07, "loss": 0.3411, "mean_token_accuracy": 0.8751503527164459, "step": 278 }, { "epoch": 4.432, "grad_norm": 1.4553662538528442, "learning_rate": 2.71827601889067e-07, "loss": 0.4069, "mean_token_accuracy": 0.8369295597076416, "step": 279 }, { "epoch": 4.448, "grad_norm": 1.3951661586761475, "learning_rate": 2.547212649466568e-07, "loss": 0.3843, "mean_token_accuracy": 0.846329540014267, "step": 280 }, { "epoch": 4.464, "grad_norm": 1.5225402116775513, "learning_rate": 2.3815675754573885e-07, "loss": 0.3576, "mean_token_accuracy": 0.8736283481121063, "step": 281 }, { "epoch": 4.48, "grad_norm": 1.6082041263580322, "learning_rate": 2.2213597106929608e-07, "loss": 0.2575, "mean_token_accuracy": 0.9190754592418671, "step": 282 }, { "epoch": 4.496, "grad_norm": 1.5779746770858765, "learning_rate": 2.0666073481669714e-07, "loss": 0.3829, "mean_token_accuracy": 0.8619149625301361, "step": 283 }, { "epoch": 4.5120000000000005, "grad_norm": 1.5220720767974854, "learning_rate": 1.9173281579481896e-07, "loss": 0.2685, "mean_token_accuracy": 0.9106524586677551, "step": 284 }, { "epoch": 4.5280000000000005, "grad_norm": 1.500473976135254, "learning_rate": 1.7735391851628814e-07, "loss": 0.2973, "mean_token_accuracy": 0.892973393201828, "step": 285 }, { "epoch": 4.5440000000000005, "grad_norm": 1.5466065406799316, "learning_rate": 1.6352568480485277e-07, "loss": 0.3419, "mean_token_accuracy": 0.8905702233314514, "step": 286 }, { "epoch": 4.5600000000000005, "grad_norm": 1.4613832235336304, "learning_rate": 1.5024969360791564e-07, "loss": 0.3889, "mean_token_accuracy": 0.8405336439609528, "step": 287 }, { "epoch": 4.576, "grad_norm": 1.484372615814209, "learning_rate": 1.375274608162447e-07, "loss": 0.2586, "mean_token_accuracy": 0.9031639397144318, "step": 288 }, { "epoch": 4.592, "grad_norm": 1.6078850030899048, "learning_rate": 1.253604390908819e-07, "loss": 0.3391, "mean_token_accuracy": 0.8819546401500702, "step": 289 }, { "epoch": 4.608, "grad_norm": 1.6001267433166504, "learning_rate": 1.1375001769728e-07, "loss": 0.311, "mean_token_accuracy": 0.8772971332073212, "step": 290 }, { "epoch": 4.624, "grad_norm": 1.572523593902588, "learning_rate": 1.0269752234666642e-07, "loss": 0.5671, "mean_token_accuracy": 0.8123734891414642, "step": 291 }, { "epoch": 4.64, "grad_norm": 1.4202545881271362, "learning_rate": 9.22042150446728e-08, "loss": 0.3798, "mean_token_accuracy": 0.8508090078830719, "step": 292 }, { "epoch": 4.656, "grad_norm": 1.5495851039886475, "learning_rate": 8.227129394723643e-08, "loss": 0.3317, "mean_token_accuracy": 0.8867059946060181, "step": 293 }, { "epoch": 4.672, "grad_norm": 1.4975632429122925, "learning_rate": 7.289989322378732e-08, "loss": 0.3868, "mean_token_accuracy": 0.8555810451507568, "step": 294 }, { "epoch": 4.688, "grad_norm": 1.413699984550476, "learning_rate": 6.409108292774912e-08, "loss": 0.3873, "mean_token_accuracy": 0.8520547449588776, "step": 295 }, { "epoch": 4.704, "grad_norm": 1.5418516397476196, "learning_rate": 5.584586887435739e-08, "loss": 0.3496, "mean_token_accuracy": 0.863271564245224, "step": 296 }, { "epoch": 4.72, "grad_norm": 1.517801284790039, "learning_rate": 4.8165192525809754e-08, "loss": 0.402, "mean_token_accuracy": 0.867528647184372, "step": 297 }, { "epoch": 4.736, "grad_norm": 1.3901314735412598, "learning_rate": 4.104993088376974e-08, "loss": 0.4927, "mean_token_accuracy": 0.8101126849651337, "step": 298 }, { "epoch": 4.752, "grad_norm": 1.6048532724380493, "learning_rate": 3.450089638922738e-08, "loss": 0.4841, "mean_token_accuracy": 0.836152046918869, "step": 299 }, { "epoch": 4.768, "grad_norm": 1.39896821975708, "learning_rate": 2.8518836829732332e-08, "loss": 0.4559, "mean_token_accuracy": 0.8460139036178589, "step": 300 }, { "epoch": 4.784, "grad_norm": 1.40359365940094, "learning_rate": 2.3104435254008852e-08, "loss": 0.2905, "mean_token_accuracy": 0.8796796500682831, "step": 301 }, { "epoch": 4.8, "grad_norm": 1.392991065979004, "learning_rate": 1.8258309893965375e-08, "loss": 0.3089, "mean_token_accuracy": 0.8802558779716492, "step": 302 }, { "epoch": 4.816, "grad_norm": 1.6868401765823364, "learning_rate": 1.3981014094099354e-08, "loss": 0.2923, "mean_token_accuracy": 0.8579813539981842, "step": 303 }, { "epoch": 4.832, "grad_norm": 1.4592156410217285, "learning_rate": 1.0273036248318325e-08, "loss": 0.3833, "mean_token_accuracy": 0.8635309934616089, "step": 304 }, { "epoch": 4.848, "grad_norm": 1.4995806217193604, "learning_rate": 7.13479974417175e-09, "loss": 0.4708, "mean_token_accuracy": 0.8479401469230652, "step": 305 }, { "epoch": 4.864, "grad_norm": 1.4440363645553589, "learning_rate": 4.56666291450858e-09, "loss": 0.2393, "mean_token_accuracy": 0.9273587167263031, "step": 306 }, { "epoch": 4.88, "grad_norm": 1.4790477752685547, "learning_rate": 2.568918996560532e-09, "loss": 0.3455, "mean_token_accuracy": 0.8849050998687744, "step": 307 }, { "epoch": 4.896, "grad_norm": 1.658073902130127, "learning_rate": 1.1417960984605459e-09, "loss": 0.2981, "mean_token_accuracy": 0.8946259915828705, "step": 308 }, { "epoch": 4.912, "grad_norm": 1.4868714809417725, "learning_rate": 2.854571731947253e-10, "loss": 0.3557, "mean_token_accuracy": 0.8737443387508392, "step": 309 }, { "epoch": 4.928, "grad_norm": 1.462461233139038, "learning_rate": 0.0, "loss": 0.3918, "mean_token_accuracy": 0.8558708727359772, "step": 310 } ], "logging_steps": 1.0, "max_steps": 310, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.75424678173737e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }