diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,36266 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.9994207375941304, + "eval_steps": 500, + "global_step": 5176, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007723498744931454, + "grad_norm": 2.1837761402130127, + "learning_rate": 0.0, + "loss": 0.1449, + "step": 1 + }, + { + "epoch": 0.0015446997489862908, + "grad_norm": 1.8389215469360352, + "learning_rate": 1.5455950540958268e-07, + "loss": 0.1331, + "step": 2 + }, + { + "epoch": 0.0023170496234794363, + "grad_norm": 1.9792014360427856, + "learning_rate": 3.0911901081916536e-07, + "loss": 0.1353, + "step": 3 + }, + { + "epoch": 0.0030893994979725816, + "grad_norm": 1.6553817987442017, + "learning_rate": 4.636785162287481e-07, + "loss": 0.1451, + "step": 4 + }, + { + "epoch": 0.003861749372465727, + "grad_norm": 2.098407745361328, + "learning_rate": 6.182380216383307e-07, + "loss": 0.1415, + "step": 5 + }, + { + "epoch": 0.004634099246958873, + "grad_norm": 1.5416218042373657, + "learning_rate": 7.727975270479134e-07, + "loss": 0.1229, + "step": 6 + }, + { + "epoch": 0.0054064491214520175, + "grad_norm": 1.732174038887024, + "learning_rate": 9.273570324574961e-07, + "loss": 0.1284, + "step": 7 + }, + { + "epoch": 0.006178798995945163, + "grad_norm": 1.7713077068328857, + "learning_rate": 1.0819165378670788e-06, + "loss": 0.1171, + "step": 8 + }, + { + "epoch": 0.006951148870438309, + "grad_norm": 1.6853700876235962, + "learning_rate": 1.2364760432766615e-06, + "loss": 0.123, + "step": 9 + }, + { + "epoch": 0.007723498744931454, + "grad_norm": 1.5785168409347534, + "learning_rate": 1.3910355486862442e-06, + "loss": 0.1232, + "step": 10 + }, + { + "epoch": 0.008495848619424599, + "grad_norm": 1.4248701333999634, + "learning_rate": 1.5455950540958269e-06, + "loss": 0.118, + "step": 11 + }, + { + "epoch": 0.009268198493917745, + "grad_norm": 1.0691180229187012, + "learning_rate": 1.7001545595054098e-06, + "loss": 0.0743, + "step": 12 + }, + { + "epoch": 0.01004054836841089, + "grad_norm": 0.9088413119316101, + "learning_rate": 1.8547140649149923e-06, + "loss": 0.077, + "step": 13 + }, + { + "epoch": 0.010812898242904035, + "grad_norm": 0.9975239634513855, + "learning_rate": 2.009273570324575e-06, + "loss": 0.0719, + "step": 14 + }, + { + "epoch": 0.011585248117397182, + "grad_norm": 0.7566964626312256, + "learning_rate": 2.1638330757341575e-06, + "loss": 0.0665, + "step": 15 + }, + { + "epoch": 0.012357597991890326, + "grad_norm": 0.4156074523925781, + "learning_rate": 2.3183925811437404e-06, + "loss": 0.0436, + "step": 16 + }, + { + "epoch": 0.013129947866383471, + "grad_norm": 0.5193804502487183, + "learning_rate": 2.472952086553323e-06, + "loss": 0.0428, + "step": 17 + }, + { + "epoch": 0.013902297740876618, + "grad_norm": 0.4834356904029846, + "learning_rate": 2.627511591962906e-06, + "loss": 0.0419, + "step": 18 + }, + { + "epoch": 0.014674647615369763, + "grad_norm": 0.44509536027908325, + "learning_rate": 2.7820710973724883e-06, + "loss": 0.0392, + "step": 19 + }, + { + "epoch": 0.015446997489862908, + "grad_norm": 0.4127598702907562, + "learning_rate": 2.9366306027820713e-06, + "loss": 0.0372, + "step": 20 + }, + { + "epoch": 0.016219347364356054, + "grad_norm": 0.3719303607940674, + "learning_rate": 3.0911901081916538e-06, + "loss": 0.0352, + "step": 21 + }, + { + "epoch": 0.016991697238849197, + "grad_norm": 0.2237921953201294, + "learning_rate": 3.2457496136012367e-06, + "loss": 0.03, + "step": 22 + }, + { + "epoch": 0.017764047113342344, + "grad_norm": 0.2025945633649826, + "learning_rate": 3.4003091190108196e-06, + "loss": 0.0254, + "step": 23 + }, + { + "epoch": 0.01853639698783549, + "grad_norm": 0.25628677010536194, + "learning_rate": 3.554868624420402e-06, + "loss": 0.0223, + "step": 24 + }, + { + "epoch": 0.019308746862328634, + "grad_norm": 0.21528829634189606, + "learning_rate": 3.7094281298299846e-06, + "loss": 0.0245, + "step": 25 + }, + { + "epoch": 0.02008109673682178, + "grad_norm": 0.17562495172023773, + "learning_rate": 3.863987635239567e-06, + "loss": 0.0215, + "step": 26 + }, + { + "epoch": 0.020853446611314927, + "grad_norm": 0.15615582466125488, + "learning_rate": 4.01854714064915e-06, + "loss": 0.0226, + "step": 27 + }, + { + "epoch": 0.02162579648580807, + "grad_norm": 0.17141973972320557, + "learning_rate": 4.173106646058733e-06, + "loss": 0.0211, + "step": 28 + }, + { + "epoch": 0.022398146360301217, + "grad_norm": 0.14127251505851746, + "learning_rate": 4.327666151468315e-06, + "loss": 0.0202, + "step": 29 + }, + { + "epoch": 0.023170496234794363, + "grad_norm": 0.12122655659914017, + "learning_rate": 4.482225656877898e-06, + "loss": 0.0181, + "step": 30 + }, + { + "epoch": 0.023942846109287506, + "grad_norm": 0.1447962075471878, + "learning_rate": 4.636785162287481e-06, + "loss": 0.0175, + "step": 31 + }, + { + "epoch": 0.024715195983780653, + "grad_norm": 0.11168921738862991, + "learning_rate": 4.791344667697063e-06, + "loss": 0.0175, + "step": 32 + }, + { + "epoch": 0.0254875458582738, + "grad_norm": 0.17067140340805054, + "learning_rate": 4.945904173106646e-06, + "loss": 0.0172, + "step": 33 + }, + { + "epoch": 0.026259895732766943, + "grad_norm": 0.21642448008060455, + "learning_rate": 5.100463678516229e-06, + "loss": 0.016, + "step": 34 + }, + { + "epoch": 0.02703224560726009, + "grad_norm": 0.1376991868019104, + "learning_rate": 5.255023183925812e-06, + "loss": 0.0172, + "step": 35 + }, + { + "epoch": 0.027804595481753236, + "grad_norm": 0.1457221657037735, + "learning_rate": 5.409582689335394e-06, + "loss": 0.0147, + "step": 36 + }, + { + "epoch": 0.02857694535624638, + "grad_norm": 0.12801788747310638, + "learning_rate": 5.564142194744977e-06, + "loss": 0.0137, + "step": 37 + }, + { + "epoch": 0.029349295230739526, + "grad_norm": 0.12587693333625793, + "learning_rate": 5.71870170015456e-06, + "loss": 0.0135, + "step": 38 + }, + { + "epoch": 0.03012164510523267, + "grad_norm": 0.16374649107456207, + "learning_rate": 5.8732612055641425e-06, + "loss": 0.0143, + "step": 39 + }, + { + "epoch": 0.030893994979725815, + "grad_norm": 0.09200599044561386, + "learning_rate": 6.0278207109737254e-06, + "loss": 0.0137, + "step": 40 + }, + { + "epoch": 0.03166634485421896, + "grad_norm": 0.15025416016578674, + "learning_rate": 6.1823802163833075e-06, + "loss": 0.0124, + "step": 41 + }, + { + "epoch": 0.03243869472871211, + "grad_norm": 0.09628334641456604, + "learning_rate": 6.3369397217928904e-06, + "loss": 0.0122, + "step": 42 + }, + { + "epoch": 0.03321104460320525, + "grad_norm": 0.1101807951927185, + "learning_rate": 6.491499227202473e-06, + "loss": 0.0134, + "step": 43 + }, + { + "epoch": 0.033983394477698395, + "grad_norm": 0.2027738243341446, + "learning_rate": 6.646058732612056e-06, + "loss": 0.0127, + "step": 44 + }, + { + "epoch": 0.034755744352191545, + "grad_norm": 0.06605881452560425, + "learning_rate": 6.800618238021639e-06, + "loss": 0.0119, + "step": 45 + }, + { + "epoch": 0.03552809422668469, + "grad_norm": 0.07640182971954346, + "learning_rate": 6.955177743431221e-06, + "loss": 0.0109, + "step": 46 + }, + { + "epoch": 0.03630044410117783, + "grad_norm": 0.16407518088817596, + "learning_rate": 7.109737248840804e-06, + "loss": 0.0121, + "step": 47 + }, + { + "epoch": 0.03707279397567098, + "grad_norm": 0.09163492918014526, + "learning_rate": 7.264296754250387e-06, + "loss": 0.01, + "step": 48 + }, + { + "epoch": 0.037845143850164124, + "grad_norm": 0.05860032141208649, + "learning_rate": 7.418856259659969e-06, + "loss": 0.0119, + "step": 49 + }, + { + "epoch": 0.03861749372465727, + "grad_norm": 0.10120034962892532, + "learning_rate": 7.573415765069553e-06, + "loss": 0.012, + "step": 50 + }, + { + "epoch": 0.03938984359915042, + "grad_norm": 0.08744475245475769, + "learning_rate": 7.727975270479134e-06, + "loss": 0.0105, + "step": 51 + }, + { + "epoch": 0.04016219347364356, + "grad_norm": 0.10905009508132935, + "learning_rate": 7.882534775888716e-06, + "loss": 0.0113, + "step": 52 + }, + { + "epoch": 0.040934543348136704, + "grad_norm": 0.08753761649131775, + "learning_rate": 8.0370942812983e-06, + "loss": 0.0105, + "step": 53 + }, + { + "epoch": 0.041706893222629854, + "grad_norm": 0.07518894970417023, + "learning_rate": 8.191653786707882e-06, + "loss": 0.0104, + "step": 54 + }, + { + "epoch": 0.042479243097123, + "grad_norm": 0.10365454107522964, + "learning_rate": 8.346213292117466e-06, + "loss": 0.0112, + "step": 55 + }, + { + "epoch": 0.04325159297161614, + "grad_norm": 0.0678032711148262, + "learning_rate": 8.500772797527048e-06, + "loss": 0.0108, + "step": 56 + }, + { + "epoch": 0.04402394284610929, + "grad_norm": 0.10868978500366211, + "learning_rate": 8.65533230293663e-06, + "loss": 0.0102, + "step": 57 + }, + { + "epoch": 0.04479629272060243, + "grad_norm": 0.1246650293469429, + "learning_rate": 8.809891808346214e-06, + "loss": 0.0098, + "step": 58 + }, + { + "epoch": 0.045568642595095576, + "grad_norm": 0.06522880494594574, + "learning_rate": 8.964451313755796e-06, + "loss": 0.0101, + "step": 59 + }, + { + "epoch": 0.04634099246958873, + "grad_norm": 0.053914472460746765, + "learning_rate": 9.119010819165378e-06, + "loss": 0.0093, + "step": 60 + }, + { + "epoch": 0.04711334234408187, + "grad_norm": 0.062161996960639954, + "learning_rate": 9.273570324574962e-06, + "loss": 0.0089, + "step": 61 + }, + { + "epoch": 0.04788569221857501, + "grad_norm": 0.055369071662425995, + "learning_rate": 9.428129829984544e-06, + "loss": 0.0085, + "step": 62 + }, + { + "epoch": 0.04865804209306816, + "grad_norm": 0.08797524124383926, + "learning_rate": 9.582689335394126e-06, + "loss": 0.0095, + "step": 63 + }, + { + "epoch": 0.049430391967561306, + "grad_norm": 0.06364715099334717, + "learning_rate": 9.73724884080371e-06, + "loss": 0.0101, + "step": 64 + }, + { + "epoch": 0.05020274184205445, + "grad_norm": 0.09843257814645767, + "learning_rate": 9.891808346213292e-06, + "loss": 0.009, + "step": 65 + }, + { + "epoch": 0.0509750917165476, + "grad_norm": 0.06403323262929916, + "learning_rate": 1.0046367851622875e-05, + "loss": 0.0087, + "step": 66 + }, + { + "epoch": 0.05174744159104074, + "grad_norm": 0.05529443547129631, + "learning_rate": 1.0200927357032458e-05, + "loss": 0.0089, + "step": 67 + }, + { + "epoch": 0.052519791465533885, + "grad_norm": 0.08181367814540863, + "learning_rate": 1.035548686244204e-05, + "loss": 0.0093, + "step": 68 + }, + { + "epoch": 0.053292141340027036, + "grad_norm": 0.060869909822940826, + "learning_rate": 1.0510046367851623e-05, + "loss": 0.0089, + "step": 69 + }, + { + "epoch": 0.05406449121452018, + "grad_norm": 0.06601981818675995, + "learning_rate": 1.0664605873261205e-05, + "loss": 0.0091, + "step": 70 + }, + { + "epoch": 0.05483684108901332, + "grad_norm": 0.10082995146512985, + "learning_rate": 1.0819165378670788e-05, + "loss": 0.0096, + "step": 71 + }, + { + "epoch": 0.05560919096350647, + "grad_norm": 0.046674828976392746, + "learning_rate": 1.0973724884080371e-05, + "loss": 0.0081, + "step": 72 + }, + { + "epoch": 0.056381540837999615, + "grad_norm": 0.06928626447916031, + "learning_rate": 1.1128284389489953e-05, + "loss": 0.0091, + "step": 73 + }, + { + "epoch": 0.05715389071249276, + "grad_norm": 0.1223256066441536, + "learning_rate": 1.1282843894899537e-05, + "loss": 0.0088, + "step": 74 + }, + { + "epoch": 0.0579262405869859, + "grad_norm": 0.06588178128004074, + "learning_rate": 1.143740340030912e-05, + "loss": 0.0086, + "step": 75 + }, + { + "epoch": 0.05869859046147905, + "grad_norm": 0.06649603694677353, + "learning_rate": 1.1591962905718701e-05, + "loss": 0.0082, + "step": 76 + }, + { + "epoch": 0.059470940335972194, + "grad_norm": 0.11596790701150894, + "learning_rate": 1.1746522411128285e-05, + "loss": 0.0092, + "step": 77 + }, + { + "epoch": 0.06024329021046534, + "grad_norm": 0.05574265122413635, + "learning_rate": 1.1901081916537867e-05, + "loss": 0.0088, + "step": 78 + }, + { + "epoch": 0.06101564008495849, + "grad_norm": 0.1380206048488617, + "learning_rate": 1.2055641421947451e-05, + "loss": 0.0081, + "step": 79 + }, + { + "epoch": 0.06178798995945163, + "grad_norm": 0.09319712966680527, + "learning_rate": 1.2210200927357033e-05, + "loss": 0.0092, + "step": 80 + }, + { + "epoch": 0.06256033983394478, + "grad_norm": 0.04295811802148819, + "learning_rate": 1.2364760432766615e-05, + "loss": 0.0085, + "step": 81 + }, + { + "epoch": 0.06333268970843792, + "grad_norm": 0.18484055995941162, + "learning_rate": 1.2519319938176199e-05, + "loss": 0.0089, + "step": 82 + }, + { + "epoch": 0.06410503958293107, + "grad_norm": 0.059900783002376556, + "learning_rate": 1.2673879443585781e-05, + "loss": 0.0079, + "step": 83 + }, + { + "epoch": 0.06487738945742422, + "grad_norm": 0.1664331704378128, + "learning_rate": 1.2828438948995365e-05, + "loss": 0.0086, + "step": 84 + }, + { + "epoch": 0.06564973933191735, + "grad_norm": 0.05938958376646042, + "learning_rate": 1.2982998454404947e-05, + "loss": 0.0083, + "step": 85 + }, + { + "epoch": 0.0664220892064105, + "grad_norm": 0.13517208397388458, + "learning_rate": 1.3137557959814529e-05, + "loss": 0.0086, + "step": 86 + }, + { + "epoch": 0.06719443908090365, + "grad_norm": 0.06283359974622726, + "learning_rate": 1.3292117465224113e-05, + "loss": 0.0081, + "step": 87 + }, + { + "epoch": 0.06796678895539679, + "grad_norm": 0.1026938259601593, + "learning_rate": 1.3446676970633695e-05, + "loss": 0.0077, + "step": 88 + }, + { + "epoch": 0.06873913882988994, + "grad_norm": 0.05881831422448158, + "learning_rate": 1.3601236476043278e-05, + "loss": 0.0085, + "step": 89 + }, + { + "epoch": 0.06951148870438309, + "grad_norm": 0.06477896869182587, + "learning_rate": 1.375579598145286e-05, + "loss": 0.0074, + "step": 90 + }, + { + "epoch": 0.07028383857887623, + "grad_norm": 0.10522706806659698, + "learning_rate": 1.3910355486862443e-05, + "loss": 0.0083, + "step": 91 + }, + { + "epoch": 0.07105618845336938, + "grad_norm": 0.05898185446858406, + "learning_rate": 1.4064914992272025e-05, + "loss": 0.0086, + "step": 92 + }, + { + "epoch": 0.07182853832786253, + "grad_norm": 0.09011103957891464, + "learning_rate": 1.4219474497681608e-05, + "loss": 0.0075, + "step": 93 + }, + { + "epoch": 0.07260088820235566, + "grad_norm": 0.06274860352277756, + "learning_rate": 1.4374034003091192e-05, + "loss": 0.0067, + "step": 94 + }, + { + "epoch": 0.07337323807684881, + "grad_norm": 0.05327846109867096, + "learning_rate": 1.4528593508500774e-05, + "loss": 0.008, + "step": 95 + }, + { + "epoch": 0.07414558795134196, + "grad_norm": 0.1118331104516983, + "learning_rate": 1.4683153013910356e-05, + "loss": 0.0081, + "step": 96 + }, + { + "epoch": 0.0749179378258351, + "grad_norm": 0.05044018477201462, + "learning_rate": 1.4837712519319938e-05, + "loss": 0.0072, + "step": 97 + }, + { + "epoch": 0.07569028770032825, + "grad_norm": 0.11157859861850739, + "learning_rate": 1.4992272024729522e-05, + "loss": 0.0075, + "step": 98 + }, + { + "epoch": 0.0764626375748214, + "grad_norm": 0.0844530537724495, + "learning_rate": 1.5146831530139106e-05, + "loss": 0.0073, + "step": 99 + }, + { + "epoch": 0.07723498744931453, + "grad_norm": 0.06750793009996414, + "learning_rate": 1.5301391035548686e-05, + "loss": 0.0082, + "step": 100 + }, + { + "epoch": 0.07800733732380769, + "grad_norm": 0.09606797993183136, + "learning_rate": 1.545595054095827e-05, + "loss": 0.0079, + "step": 101 + }, + { + "epoch": 0.07877968719830084, + "grad_norm": 0.06051672250032425, + "learning_rate": 1.561051004636785e-05, + "loss": 0.0075, + "step": 102 + }, + { + "epoch": 0.07955203707279397, + "grad_norm": 0.05748724564909935, + "learning_rate": 1.5765069551777432e-05, + "loss": 0.0077, + "step": 103 + }, + { + "epoch": 0.08032438694728712, + "grad_norm": 0.034181009978055954, + "learning_rate": 1.5919629057187018e-05, + "loss": 0.007, + "step": 104 + }, + { + "epoch": 0.08109673682178027, + "grad_norm": 0.0854635238647461, + "learning_rate": 1.60741885625966e-05, + "loss": 0.0075, + "step": 105 + }, + { + "epoch": 0.08186908669627341, + "grad_norm": 0.049009062349796295, + "learning_rate": 1.6228748068006182e-05, + "loss": 0.0074, + "step": 106 + }, + { + "epoch": 0.08264143657076656, + "grad_norm": 0.0906350389122963, + "learning_rate": 1.6383307573415764e-05, + "loss": 0.0079, + "step": 107 + }, + { + "epoch": 0.08341378644525971, + "grad_norm": 0.05115756392478943, + "learning_rate": 1.6537867078825346e-05, + "loss": 0.0072, + "step": 108 + }, + { + "epoch": 0.08418613631975284, + "grad_norm": 0.05932699888944626, + "learning_rate": 1.6692426584234932e-05, + "loss": 0.0071, + "step": 109 + }, + { + "epoch": 0.084958486194246, + "grad_norm": 0.0629432424902916, + "learning_rate": 1.6846986089644514e-05, + "loss": 0.0068, + "step": 110 + }, + { + "epoch": 0.08573083606873914, + "grad_norm": 0.07591287791728973, + "learning_rate": 1.7001545595054096e-05, + "loss": 0.0067, + "step": 111 + }, + { + "epoch": 0.08650318594323228, + "grad_norm": 0.03689542040228844, + "learning_rate": 1.7156105100463678e-05, + "loss": 0.0073, + "step": 112 + }, + { + "epoch": 0.08727553581772543, + "grad_norm": 0.10098189860582352, + "learning_rate": 1.731066460587326e-05, + "loss": 0.0071, + "step": 113 + }, + { + "epoch": 0.08804788569221858, + "grad_norm": 0.05658024176955223, + "learning_rate": 1.7465224111282842e-05, + "loss": 0.0063, + "step": 114 + }, + { + "epoch": 0.08882023556671172, + "grad_norm": 0.05307658389210701, + "learning_rate": 1.7619783616692428e-05, + "loss": 0.0063, + "step": 115 + }, + { + "epoch": 0.08959258544120487, + "grad_norm": 0.08351606130599976, + "learning_rate": 1.777434312210201e-05, + "loss": 0.0075, + "step": 116 + }, + { + "epoch": 0.09036493531569802, + "grad_norm": 0.10230844467878342, + "learning_rate": 1.792890262751159e-05, + "loss": 0.007, + "step": 117 + }, + { + "epoch": 0.09113728519019115, + "grad_norm": 0.03776973858475685, + "learning_rate": 1.8083462132921174e-05, + "loss": 0.0068, + "step": 118 + }, + { + "epoch": 0.0919096350646843, + "grad_norm": 0.0671847015619278, + "learning_rate": 1.8238021638330756e-05, + "loss": 0.0059, + "step": 119 + }, + { + "epoch": 0.09268198493917745, + "grad_norm": 0.03916226327419281, + "learning_rate": 1.839258114374034e-05, + "loss": 0.0066, + "step": 120 + }, + { + "epoch": 0.09345433481367059, + "grad_norm": 0.04165134206414223, + "learning_rate": 1.8547140649149923e-05, + "loss": 0.0076, + "step": 121 + }, + { + "epoch": 0.09422668468816374, + "grad_norm": 0.06339821964502335, + "learning_rate": 1.8701700154559505e-05, + "loss": 0.0071, + "step": 122 + }, + { + "epoch": 0.09499903456265689, + "grad_norm": 0.03353743627667427, + "learning_rate": 1.8856259659969088e-05, + "loss": 0.0074, + "step": 123 + }, + { + "epoch": 0.09577138443715003, + "grad_norm": 0.09131729602813721, + "learning_rate": 1.901081916537867e-05, + "loss": 0.0074, + "step": 124 + }, + { + "epoch": 0.09654373431164318, + "grad_norm": 0.04216662794351578, + "learning_rate": 1.916537867078825e-05, + "loss": 0.0066, + "step": 125 + }, + { + "epoch": 0.09731608418613633, + "grad_norm": 0.11223969608545303, + "learning_rate": 1.9319938176197837e-05, + "loss": 0.0076, + "step": 126 + }, + { + "epoch": 0.09808843406062946, + "grad_norm": 0.04586590453982353, + "learning_rate": 1.947449768160742e-05, + "loss": 0.0065, + "step": 127 + }, + { + "epoch": 0.09886078393512261, + "grad_norm": 0.13881778717041016, + "learning_rate": 1.9629057187017e-05, + "loss": 0.0069, + "step": 128 + }, + { + "epoch": 0.09963313380961576, + "grad_norm": 0.042186565697193146, + "learning_rate": 1.9783616692426583e-05, + "loss": 0.0071, + "step": 129 + }, + { + "epoch": 0.1004054836841089, + "grad_norm": 0.11093481630086899, + "learning_rate": 1.9938176197836165e-05, + "loss": 0.0074, + "step": 130 + }, + { + "epoch": 0.10117783355860205, + "grad_norm": 0.053297363221645355, + "learning_rate": 2.009273570324575e-05, + "loss": 0.0057, + "step": 131 + }, + { + "epoch": 0.1019501834330952, + "grad_norm": 0.10928916931152344, + "learning_rate": 2.0247295208655333e-05, + "loss": 0.0071, + "step": 132 + }, + { + "epoch": 0.10272253330758833, + "grad_norm": 0.06282244622707367, + "learning_rate": 2.0401854714064915e-05, + "loss": 0.0071, + "step": 133 + }, + { + "epoch": 0.10349488318208148, + "grad_norm": 0.08216606825590134, + "learning_rate": 2.0556414219474497e-05, + "loss": 0.0066, + "step": 134 + }, + { + "epoch": 0.10426723305657463, + "grad_norm": 0.03244255110621452, + "learning_rate": 2.071097372488408e-05, + "loss": 0.0064, + "step": 135 + }, + { + "epoch": 0.10503958293106777, + "grad_norm": 0.11407410353422165, + "learning_rate": 2.086553323029366e-05, + "loss": 0.0068, + "step": 136 + }, + { + "epoch": 0.10581193280556092, + "grad_norm": 0.05050407722592354, + "learning_rate": 2.1020092735703247e-05, + "loss": 0.0059, + "step": 137 + }, + { + "epoch": 0.10658428268005407, + "grad_norm": 0.07172229140996933, + "learning_rate": 2.117465224111283e-05, + "loss": 0.0064, + "step": 138 + }, + { + "epoch": 0.10735663255454721, + "grad_norm": 0.0523892417550087, + "learning_rate": 2.132921174652241e-05, + "loss": 0.0064, + "step": 139 + }, + { + "epoch": 0.10812898242904036, + "grad_norm": 0.06238657608628273, + "learning_rate": 2.1483771251931993e-05, + "loss": 0.0063, + "step": 140 + }, + { + "epoch": 0.10890133230353351, + "grad_norm": 0.12138430774211884, + "learning_rate": 2.1638330757341575e-05, + "loss": 0.0074, + "step": 141 + }, + { + "epoch": 0.10967368217802664, + "grad_norm": 0.06384667754173279, + "learning_rate": 2.179289026275116e-05, + "loss": 0.0069, + "step": 142 + }, + { + "epoch": 0.1104460320525198, + "grad_norm": 0.17014510929584503, + "learning_rate": 2.1947449768160743e-05, + "loss": 0.0077, + "step": 143 + }, + { + "epoch": 0.11121838192701294, + "grad_norm": 0.0351419635117054, + "learning_rate": 2.2102009273570325e-05, + "loss": 0.0067, + "step": 144 + }, + { + "epoch": 0.11199073180150608, + "grad_norm": 0.2364615947008133, + "learning_rate": 2.2256568778979907e-05, + "loss": 0.0083, + "step": 145 + }, + { + "epoch": 0.11276308167599923, + "grad_norm": 0.05774795636534691, + "learning_rate": 2.241112828438949e-05, + "loss": 0.0068, + "step": 146 + }, + { + "epoch": 0.11353543155049237, + "grad_norm": 0.23769724369049072, + "learning_rate": 2.2565687789799074e-05, + "loss": 0.0076, + "step": 147 + }, + { + "epoch": 0.11430778142498552, + "grad_norm": 0.048934392631053925, + "learning_rate": 2.2720247295208656e-05, + "loss": 0.0066, + "step": 148 + }, + { + "epoch": 0.11508013129947867, + "grad_norm": 0.22198757529258728, + "learning_rate": 2.287480680061824e-05, + "loss": 0.0083, + "step": 149 + }, + { + "epoch": 0.1158524811739718, + "grad_norm": 0.0632915124297142, + "learning_rate": 2.302936630602782e-05, + "loss": 0.007, + "step": 150 + }, + { + "epoch": 0.11662483104846495, + "grad_norm": 0.17810183763504028, + "learning_rate": 2.3183925811437403e-05, + "loss": 0.008, + "step": 151 + }, + { + "epoch": 0.1173971809229581, + "grad_norm": 0.11858183145523071, + "learning_rate": 2.3338485316846988e-05, + "loss": 0.0073, + "step": 152 + }, + { + "epoch": 0.11816953079745124, + "grad_norm": 0.12697774171829224, + "learning_rate": 2.349304482225657e-05, + "loss": 0.0077, + "step": 153 + }, + { + "epoch": 0.11894188067194439, + "grad_norm": 0.08780393749475479, + "learning_rate": 2.3647604327666152e-05, + "loss": 0.0071, + "step": 154 + }, + { + "epoch": 0.11971423054643754, + "grad_norm": 0.038997404277324677, + "learning_rate": 2.3802163833075734e-05, + "loss": 0.0068, + "step": 155 + }, + { + "epoch": 0.12048658042093068, + "grad_norm": 0.044348638504743576, + "learning_rate": 2.3956723338485316e-05, + "loss": 0.007, + "step": 156 + }, + { + "epoch": 0.12125893029542383, + "grad_norm": 0.09429153054952621, + "learning_rate": 2.4111282843894902e-05, + "loss": 0.008, + "step": 157 + }, + { + "epoch": 0.12203128016991698, + "grad_norm": 0.04546340927481651, + "learning_rate": 2.4265842349304484e-05, + "loss": 0.0062, + "step": 158 + }, + { + "epoch": 0.12280363004441011, + "grad_norm": 0.09436047822237015, + "learning_rate": 2.4420401854714066e-05, + "loss": 0.0065, + "step": 159 + }, + { + "epoch": 0.12357597991890326, + "grad_norm": 0.08909037709236145, + "learning_rate": 2.4574961360123648e-05, + "loss": 0.0069, + "step": 160 + }, + { + "epoch": 0.12434832979339641, + "grad_norm": 0.10167445987462997, + "learning_rate": 2.472952086553323e-05, + "loss": 0.0067, + "step": 161 + }, + { + "epoch": 0.12512067966788956, + "grad_norm": 0.10640830546617508, + "learning_rate": 2.4884080370942815e-05, + "loss": 0.0068, + "step": 162 + }, + { + "epoch": 0.1258930295423827, + "grad_norm": 0.04076463729143143, + "learning_rate": 2.5038639876352398e-05, + "loss": 0.0073, + "step": 163 + }, + { + "epoch": 0.12666537941687583, + "grad_norm": 0.08261235058307648, + "learning_rate": 2.519319938176198e-05, + "loss": 0.007, + "step": 164 + }, + { + "epoch": 0.12743772929136898, + "grad_norm": 0.13118483126163483, + "learning_rate": 2.5347758887171562e-05, + "loss": 0.0074, + "step": 165 + }, + { + "epoch": 0.12821007916586213, + "grad_norm": 0.07147473096847534, + "learning_rate": 2.5502318392581144e-05, + "loss": 0.0064, + "step": 166 + }, + { + "epoch": 0.12898242904035528, + "grad_norm": 0.11924638599157333, + "learning_rate": 2.565687789799073e-05, + "loss": 0.0065, + "step": 167 + }, + { + "epoch": 0.12975477891484843, + "grad_norm": 0.07254443317651749, + "learning_rate": 2.581143740340031e-05, + "loss": 0.0068, + "step": 168 + }, + { + "epoch": 0.13052712878934158, + "grad_norm": 0.1017264872789383, + "learning_rate": 2.5965996908809893e-05, + "loss": 0.0073, + "step": 169 + }, + { + "epoch": 0.1312994786638347, + "grad_norm": 0.10837958008050919, + "learning_rate": 2.6120556414219475e-05, + "loss": 0.0065, + "step": 170 + }, + { + "epoch": 0.13207182853832786, + "grad_norm": 0.043897368013858795, + "learning_rate": 2.6275115919629058e-05, + "loss": 0.0068, + "step": 171 + }, + { + "epoch": 0.132844178412821, + "grad_norm": 0.08516182005405426, + "learning_rate": 2.6429675425038643e-05, + "loss": 0.0071, + "step": 172 + }, + { + "epoch": 0.13361652828731416, + "grad_norm": 0.08410457521677017, + "learning_rate": 2.6584234930448225e-05, + "loss": 0.0072, + "step": 173 + }, + { + "epoch": 0.1343888781618073, + "grad_norm": 0.04611523821949959, + "learning_rate": 2.6738794435857807e-05, + "loss": 0.0058, + "step": 174 + }, + { + "epoch": 0.13516122803630046, + "grad_norm": 0.08548586070537567, + "learning_rate": 2.689335394126739e-05, + "loss": 0.0068, + "step": 175 + }, + { + "epoch": 0.13593357791079358, + "grad_norm": 0.0654478371143341, + "learning_rate": 2.704791344667697e-05, + "loss": 0.0076, + "step": 176 + }, + { + "epoch": 0.13670592778528673, + "grad_norm": 0.09992239624261856, + "learning_rate": 2.7202472952086557e-05, + "loss": 0.0068, + "step": 177 + }, + { + "epoch": 0.13747827765977988, + "grad_norm": 0.06172487512230873, + "learning_rate": 2.735703245749614e-05, + "loss": 0.0073, + "step": 178 + }, + { + "epoch": 0.13825062753427303, + "grad_norm": 0.09481542557477951, + "learning_rate": 2.751159196290572e-05, + "loss": 0.0075, + "step": 179 + }, + { + "epoch": 0.13902297740876618, + "grad_norm": 0.055736932903528214, + "learning_rate": 2.7666151468315303e-05, + "loss": 0.0055, + "step": 180 + }, + { + "epoch": 0.13979532728325933, + "grad_norm": 0.05875783413648605, + "learning_rate": 2.7820710973724885e-05, + "loss": 0.0067, + "step": 181 + }, + { + "epoch": 0.14056767715775245, + "grad_norm": 0.07550349831581116, + "learning_rate": 2.797527047913447e-05, + "loss": 0.0066, + "step": 182 + }, + { + "epoch": 0.1413400270322456, + "grad_norm": 0.03867774456739426, + "learning_rate": 2.812982998454405e-05, + "loss": 0.0063, + "step": 183 + }, + { + "epoch": 0.14211237690673875, + "grad_norm": 0.05367950350046158, + "learning_rate": 2.8284389489953635e-05, + "loss": 0.0065, + "step": 184 + }, + { + "epoch": 0.1428847267812319, + "grad_norm": 0.06944329291582108, + "learning_rate": 2.8438948995363217e-05, + "loss": 0.0065, + "step": 185 + }, + { + "epoch": 0.14365707665572505, + "grad_norm": 0.044338397681713104, + "learning_rate": 2.85935085007728e-05, + "loss": 0.0059, + "step": 186 + }, + { + "epoch": 0.1444294265302182, + "grad_norm": 0.07662731409072876, + "learning_rate": 2.8748068006182384e-05, + "loss": 0.0057, + "step": 187 + }, + { + "epoch": 0.14520177640471132, + "grad_norm": 0.03504624217748642, + "learning_rate": 2.8902627511591963e-05, + "loss": 0.0051, + "step": 188 + }, + { + "epoch": 0.14597412627920447, + "grad_norm": 0.05816268175840378, + "learning_rate": 2.905718701700155e-05, + "loss": 0.0063, + "step": 189 + }, + { + "epoch": 0.14674647615369762, + "grad_norm": 0.05528225749731064, + "learning_rate": 2.921174652241113e-05, + "loss": 0.0063, + "step": 190 + }, + { + "epoch": 0.14751882602819077, + "grad_norm": 0.03650132939219475, + "learning_rate": 2.9366306027820713e-05, + "loss": 0.0063, + "step": 191 + }, + { + "epoch": 0.14829117590268392, + "grad_norm": 0.03839438036084175, + "learning_rate": 2.9520865533230298e-05, + "loss": 0.0056, + "step": 192 + }, + { + "epoch": 0.14906352577717707, + "grad_norm": 0.02672256901860237, + "learning_rate": 2.9675425038639877e-05, + "loss": 0.0063, + "step": 193 + }, + { + "epoch": 0.1498358756516702, + "grad_norm": 0.04782088100910187, + "learning_rate": 2.9829984544049462e-05, + "loss": 0.0059, + "step": 194 + }, + { + "epoch": 0.15060822552616335, + "grad_norm": 0.03413840010762215, + "learning_rate": 2.9984544049459044e-05, + "loss": 0.0053, + "step": 195 + }, + { + "epoch": 0.1513805754006565, + "grad_norm": 0.05395448952913284, + "learning_rate": 3.0139103554868626e-05, + "loss": 0.006, + "step": 196 + }, + { + "epoch": 0.15215292527514965, + "grad_norm": 0.04196440801024437, + "learning_rate": 3.0293663060278212e-05, + "loss": 0.0064, + "step": 197 + }, + { + "epoch": 0.1529252751496428, + "grad_norm": 0.06425611674785614, + "learning_rate": 3.044822256568779e-05, + "loss": 0.0064, + "step": 198 + }, + { + "epoch": 0.15369762502413592, + "grad_norm": 0.053349222987890244, + "learning_rate": 3.060278207109737e-05, + "loss": 0.0058, + "step": 199 + }, + { + "epoch": 0.15446997489862907, + "grad_norm": 0.05441872030496597, + "learning_rate": 3.075734157650695e-05, + "loss": 0.0058, + "step": 200 + }, + { + "epoch": 0.15524232477312222, + "grad_norm": 0.03474319726228714, + "learning_rate": 3.091190108191654e-05, + "loss": 0.006, + "step": 201 + }, + { + "epoch": 0.15601467464761537, + "grad_norm": 0.03509390726685524, + "learning_rate": 3.106646058732612e-05, + "loss": 0.0058, + "step": 202 + }, + { + "epoch": 0.15678702452210852, + "grad_norm": 0.03735740855336189, + "learning_rate": 3.12210200927357e-05, + "loss": 0.0057, + "step": 203 + }, + { + "epoch": 0.15755937439660167, + "grad_norm": 0.049221672117710114, + "learning_rate": 3.1375579598145286e-05, + "loss": 0.0068, + "step": 204 + }, + { + "epoch": 0.1583317242710948, + "grad_norm": 0.056076932698488235, + "learning_rate": 3.1530139103554865e-05, + "loss": 0.0063, + "step": 205 + }, + { + "epoch": 0.15910407414558794, + "grad_norm": 0.033057134598493576, + "learning_rate": 3.168469860896445e-05, + "loss": 0.0057, + "step": 206 + }, + { + "epoch": 0.1598764240200811, + "grad_norm": 0.030060870572924614, + "learning_rate": 3.1839258114374036e-05, + "loss": 0.0062, + "step": 207 + }, + { + "epoch": 0.16064877389457424, + "grad_norm": 0.0711582824587822, + "learning_rate": 3.1993817619783615e-05, + "loss": 0.006, + "step": 208 + }, + { + "epoch": 0.1614211237690674, + "grad_norm": 0.03385183587670326, + "learning_rate": 3.21483771251932e-05, + "loss": 0.0053, + "step": 209 + }, + { + "epoch": 0.16219347364356054, + "grad_norm": 0.08179536461830139, + "learning_rate": 3.230293663060278e-05, + "loss": 0.0056, + "step": 210 + }, + { + "epoch": 0.16296582351805367, + "grad_norm": 0.03334837406873703, + "learning_rate": 3.2457496136012364e-05, + "loss": 0.005, + "step": 211 + }, + { + "epoch": 0.16373817339254682, + "grad_norm": 0.0625922828912735, + "learning_rate": 3.261205564142195e-05, + "loss": 0.006, + "step": 212 + }, + { + "epoch": 0.16451052326703997, + "grad_norm": 0.03503840044140816, + "learning_rate": 3.276661514683153e-05, + "loss": 0.0064, + "step": 213 + }, + { + "epoch": 0.16528287314153312, + "grad_norm": 0.07841236144304276, + "learning_rate": 3.2921174652241114e-05, + "loss": 0.0064, + "step": 214 + }, + { + "epoch": 0.16605522301602627, + "grad_norm": 0.03800879791378975, + "learning_rate": 3.307573415765069e-05, + "loss": 0.0061, + "step": 215 + }, + { + "epoch": 0.16682757289051942, + "grad_norm": 0.04207073524594307, + "learning_rate": 3.323029366306028e-05, + "loss": 0.0055, + "step": 216 + }, + { + "epoch": 0.16759992276501254, + "grad_norm": 0.03547835350036621, + "learning_rate": 3.3384853168469863e-05, + "loss": 0.0059, + "step": 217 + }, + { + "epoch": 0.1683722726395057, + "grad_norm": 0.04894952476024628, + "learning_rate": 3.353941267387944e-05, + "loss": 0.0053, + "step": 218 + }, + { + "epoch": 0.16914462251399884, + "grad_norm": 0.03328663110733032, + "learning_rate": 3.369397217928903e-05, + "loss": 0.0062, + "step": 219 + }, + { + "epoch": 0.169916972388492, + "grad_norm": 0.04965837672352791, + "learning_rate": 3.3848531684698606e-05, + "loss": 0.0062, + "step": 220 + }, + { + "epoch": 0.17068932226298514, + "grad_norm": 0.04981934279203415, + "learning_rate": 3.400309119010819e-05, + "loss": 0.006, + "step": 221 + }, + { + "epoch": 0.1714616721374783, + "grad_norm": 0.07044146209955215, + "learning_rate": 3.415765069551777e-05, + "loss": 0.0054, + "step": 222 + }, + { + "epoch": 0.1722340220119714, + "grad_norm": 0.03921792656183243, + "learning_rate": 3.4312210200927356e-05, + "loss": 0.0051, + "step": 223 + }, + { + "epoch": 0.17300637188646456, + "grad_norm": 0.038964755833148956, + "learning_rate": 3.446676970633694e-05, + "loss": 0.0066, + "step": 224 + }, + { + "epoch": 0.1737787217609577, + "grad_norm": 0.03377487137913704, + "learning_rate": 3.462132921174652e-05, + "loss": 0.0061, + "step": 225 + }, + { + "epoch": 0.17455107163545086, + "grad_norm": 0.04873465746641159, + "learning_rate": 3.4775888717156105e-05, + "loss": 0.0067, + "step": 226 + }, + { + "epoch": 0.175323421509944, + "grad_norm": 0.041648928076028824, + "learning_rate": 3.4930448222565684e-05, + "loss": 0.0064, + "step": 227 + }, + { + "epoch": 0.17609577138443716, + "grad_norm": 0.0782196968793869, + "learning_rate": 3.508500772797527e-05, + "loss": 0.0069, + "step": 228 + }, + { + "epoch": 0.17686812125893028, + "grad_norm": 0.05687787011265755, + "learning_rate": 3.5239567233384855e-05, + "loss": 0.0064, + "step": 229 + }, + { + "epoch": 0.17764047113342343, + "grad_norm": 0.07424864917993546, + "learning_rate": 3.5394126738794434e-05, + "loss": 0.006, + "step": 230 + }, + { + "epoch": 0.17841282100791658, + "grad_norm": 0.09180114418268204, + "learning_rate": 3.554868624420402e-05, + "loss": 0.0058, + "step": 231 + }, + { + "epoch": 0.17918517088240973, + "grad_norm": 0.045950617641210556, + "learning_rate": 3.57032457496136e-05, + "loss": 0.0055, + "step": 232 + }, + { + "epoch": 0.17995752075690288, + "grad_norm": 0.053155139088630676, + "learning_rate": 3.585780525502318e-05, + "loss": 0.0059, + "step": 233 + }, + { + "epoch": 0.18072987063139603, + "grad_norm": 0.03958377242088318, + "learning_rate": 3.601236476043277e-05, + "loss": 0.0062, + "step": 234 + }, + { + "epoch": 0.18150222050588916, + "grad_norm": 0.09630659967660904, + "learning_rate": 3.616692426584235e-05, + "loss": 0.0054, + "step": 235 + }, + { + "epoch": 0.1822745703803823, + "grad_norm": 0.06993494927883148, + "learning_rate": 3.632148377125193e-05, + "loss": 0.0062, + "step": 236 + }, + { + "epoch": 0.18304692025487546, + "grad_norm": 0.07463036477565765, + "learning_rate": 3.647604327666151e-05, + "loss": 0.0065, + "step": 237 + }, + { + "epoch": 0.1838192701293686, + "grad_norm": 0.08071650564670563, + "learning_rate": 3.66306027820711e-05, + "loss": 0.0069, + "step": 238 + }, + { + "epoch": 0.18459162000386176, + "grad_norm": 0.07690216600894928, + "learning_rate": 3.678516228748068e-05, + "loss": 0.0062, + "step": 239 + }, + { + "epoch": 0.1853639698783549, + "grad_norm": 0.02988547831773758, + "learning_rate": 3.693972179289026e-05, + "loss": 0.0058, + "step": 240 + }, + { + "epoch": 0.18613631975284803, + "grad_norm": 0.059305574744939804, + "learning_rate": 3.709428129829985e-05, + "loss": 0.0067, + "step": 241 + }, + { + "epoch": 0.18690866962734118, + "grad_norm": 0.04826758801937103, + "learning_rate": 3.7248840803709425e-05, + "loss": 0.0067, + "step": 242 + }, + { + "epoch": 0.18768101950183433, + "grad_norm": 0.0522642657160759, + "learning_rate": 3.740340030911901e-05, + "loss": 0.0065, + "step": 243 + }, + { + "epoch": 0.18845336937632748, + "grad_norm": 0.10103264451026917, + "learning_rate": 3.755795981452859e-05, + "loss": 0.006, + "step": 244 + }, + { + "epoch": 0.18922571925082063, + "grad_norm": 0.03253243863582611, + "learning_rate": 3.7712519319938175e-05, + "loss": 0.0053, + "step": 245 + }, + { + "epoch": 0.18999806912531378, + "grad_norm": 0.06003435328602791, + "learning_rate": 3.786707882534776e-05, + "loss": 0.0064, + "step": 246 + }, + { + "epoch": 0.1907704189998069, + "grad_norm": 0.08122384548187256, + "learning_rate": 3.802163833075734e-05, + "loss": 0.0062, + "step": 247 + }, + { + "epoch": 0.19154276887430005, + "grad_norm": 0.08827490359544754, + "learning_rate": 3.8176197836166925e-05, + "loss": 0.0067, + "step": 248 + }, + { + "epoch": 0.1923151187487932, + "grad_norm": 0.09226624667644501, + "learning_rate": 3.83307573415765e-05, + "loss": 0.0066, + "step": 249 + }, + { + "epoch": 0.19308746862328635, + "grad_norm": 0.05763600394129753, + "learning_rate": 3.848531684698609e-05, + "loss": 0.0061, + "step": 250 + }, + { + "epoch": 0.1938598184977795, + "grad_norm": 0.02883163094520569, + "learning_rate": 3.8639876352395674e-05, + "loss": 0.0057, + "step": 251 + }, + { + "epoch": 0.19463216837227265, + "grad_norm": 0.09011675417423248, + "learning_rate": 3.879443585780525e-05, + "loss": 0.006, + "step": 252 + }, + { + "epoch": 0.19540451824676577, + "grad_norm": 0.03947719186544418, + "learning_rate": 3.894899536321484e-05, + "loss": 0.0061, + "step": 253 + }, + { + "epoch": 0.19617686812125892, + "grad_norm": 0.10283095389604568, + "learning_rate": 3.910355486862442e-05, + "loss": 0.007, + "step": 254 + }, + { + "epoch": 0.19694921799575207, + "grad_norm": 0.05724601447582245, + "learning_rate": 3.9258114374034e-05, + "loss": 0.0062, + "step": 255 + }, + { + "epoch": 0.19772156787024522, + "grad_norm": 0.10304121673107147, + "learning_rate": 3.941267387944359e-05, + "loss": 0.0067, + "step": 256 + }, + { + "epoch": 0.19849391774473837, + "grad_norm": 0.04029000177979469, + "learning_rate": 3.956723338485317e-05, + "loss": 0.0062, + "step": 257 + }, + { + "epoch": 0.19926626761923152, + "grad_norm": 0.07087238132953644, + "learning_rate": 3.972179289026275e-05, + "loss": 0.0067, + "step": 258 + }, + { + "epoch": 0.20003861749372465, + "grad_norm": 0.02597912587225437, + "learning_rate": 3.987635239567233e-05, + "loss": 0.0054, + "step": 259 + }, + { + "epoch": 0.2008109673682178, + "grad_norm": 0.11193391680717468, + "learning_rate": 4.0030911901081916e-05, + "loss": 0.0071, + "step": 260 + }, + { + "epoch": 0.20158331724271095, + "grad_norm": 0.03230161592364311, + "learning_rate": 4.01854714064915e-05, + "loss": 0.0058, + "step": 261 + }, + { + "epoch": 0.2023556671172041, + "grad_norm": 0.056707777082920074, + "learning_rate": 4.034003091190108e-05, + "loss": 0.0063, + "step": 262 + }, + { + "epoch": 0.20312801699169725, + "grad_norm": 0.07985185086727142, + "learning_rate": 4.0494590417310666e-05, + "loss": 0.0066, + "step": 263 + }, + { + "epoch": 0.2039003668661904, + "grad_norm": 0.06168767064809799, + "learning_rate": 4.0649149922720245e-05, + "loss": 0.0059, + "step": 264 + }, + { + "epoch": 0.20467271674068352, + "grad_norm": 0.06294318288564682, + "learning_rate": 4.080370942812983e-05, + "loss": 0.006, + "step": 265 + }, + { + "epoch": 0.20544506661517667, + "grad_norm": 0.0714600458741188, + "learning_rate": 4.095826893353941e-05, + "loss": 0.0063, + "step": 266 + }, + { + "epoch": 0.20621741648966982, + "grad_norm": 0.04438761621713638, + "learning_rate": 4.1112828438948994e-05, + "loss": 0.0057, + "step": 267 + }, + { + "epoch": 0.20698976636416297, + "grad_norm": 0.06038789451122284, + "learning_rate": 4.126738794435858e-05, + "loss": 0.0067, + "step": 268 + }, + { + "epoch": 0.20776211623865612, + "grad_norm": 0.044002216309309006, + "learning_rate": 4.142194744976816e-05, + "loss": 0.0063, + "step": 269 + }, + { + "epoch": 0.20853446611314927, + "grad_norm": 0.06005591154098511, + "learning_rate": 4.1576506955177744e-05, + "loss": 0.0056, + "step": 270 + }, + { + "epoch": 0.2093068159876424, + "grad_norm": 0.026720965281128883, + "learning_rate": 4.173106646058732e-05, + "loss": 0.0057, + "step": 271 + }, + { + "epoch": 0.21007916586213554, + "grad_norm": 0.032197173684835434, + "learning_rate": 4.188562596599691e-05, + "loss": 0.0063, + "step": 272 + }, + { + "epoch": 0.2108515157366287, + "grad_norm": 0.02827630750834942, + "learning_rate": 4.2040185471406493e-05, + "loss": 0.0059, + "step": 273 + }, + { + "epoch": 0.21162386561112184, + "grad_norm": 0.03269730508327484, + "learning_rate": 4.219474497681607e-05, + "loss": 0.0063, + "step": 274 + }, + { + "epoch": 0.212396215485615, + "grad_norm": 0.03561446815729141, + "learning_rate": 4.234930448222566e-05, + "loss": 0.0062, + "step": 275 + }, + { + "epoch": 0.21316856536010814, + "grad_norm": 0.0301744993776083, + "learning_rate": 4.2503863987635236e-05, + "loss": 0.0061, + "step": 276 + }, + { + "epoch": 0.21394091523460126, + "grad_norm": 0.02276015095412731, + "learning_rate": 4.265842349304482e-05, + "loss": 0.0059, + "step": 277 + }, + { + "epoch": 0.21471326510909441, + "grad_norm": 0.025467311963438988, + "learning_rate": 4.281298299845441e-05, + "loss": 0.0053, + "step": 278 + }, + { + "epoch": 0.21548561498358756, + "grad_norm": 0.03545083850622177, + "learning_rate": 4.2967542503863986e-05, + "loss": 0.0051, + "step": 279 + }, + { + "epoch": 0.21625796485808071, + "grad_norm": 0.06959273666143417, + "learning_rate": 4.312210200927357e-05, + "loss": 0.0066, + "step": 280 + }, + { + "epoch": 0.21703031473257386, + "grad_norm": 0.03239291533827782, + "learning_rate": 4.327666151468315e-05, + "loss": 0.0053, + "step": 281 + }, + { + "epoch": 0.21780266460706701, + "grad_norm": 0.03332449123263359, + "learning_rate": 4.3431221020092735e-05, + "loss": 0.0065, + "step": 282 + }, + { + "epoch": 0.21857501448156014, + "grad_norm": 0.026551589369773865, + "learning_rate": 4.358578052550232e-05, + "loss": 0.0062, + "step": 283 + }, + { + "epoch": 0.2193473643560533, + "grad_norm": 0.0345880500972271, + "learning_rate": 4.37403400309119e-05, + "loss": 0.0057, + "step": 284 + }, + { + "epoch": 0.22011971423054644, + "grad_norm": 0.05873372405767441, + "learning_rate": 4.3894899536321485e-05, + "loss": 0.0059, + "step": 285 + }, + { + "epoch": 0.2208920641050396, + "grad_norm": 0.03387603163719177, + "learning_rate": 4.4049459041731064e-05, + "loss": 0.0056, + "step": 286 + }, + { + "epoch": 0.22166441397953274, + "grad_norm": 0.08423435688018799, + "learning_rate": 4.420401854714065e-05, + "loss": 0.0064, + "step": 287 + }, + { + "epoch": 0.2224367638540259, + "grad_norm": 0.04661883786320686, + "learning_rate": 4.4358578052550235e-05, + "loss": 0.0061, + "step": 288 + }, + { + "epoch": 0.223209113728519, + "grad_norm": 0.09756273031234741, + "learning_rate": 4.451313755795981e-05, + "loss": 0.0062, + "step": 289 + }, + { + "epoch": 0.22398146360301216, + "grad_norm": 0.0334688164293766, + "learning_rate": 4.46676970633694e-05, + "loss": 0.0056, + "step": 290 + }, + { + "epoch": 0.2247538134775053, + "grad_norm": 0.037672027945518494, + "learning_rate": 4.482225656877898e-05, + "loss": 0.0056, + "step": 291 + }, + { + "epoch": 0.22552616335199846, + "grad_norm": 0.05709156394004822, + "learning_rate": 4.497681607418856e-05, + "loss": 0.0064, + "step": 292 + }, + { + "epoch": 0.2262985132264916, + "grad_norm": 0.06096167117357254, + "learning_rate": 4.513137557959815e-05, + "loss": 0.0061, + "step": 293 + }, + { + "epoch": 0.22707086310098473, + "grad_norm": 0.04873086139559746, + "learning_rate": 4.528593508500773e-05, + "loss": 0.006, + "step": 294 + }, + { + "epoch": 0.22784321297547788, + "grad_norm": 0.08215577900409698, + "learning_rate": 4.544049459041731e-05, + "loss": 0.0065, + "step": 295 + }, + { + "epoch": 0.22861556284997103, + "grad_norm": 0.05225319415330887, + "learning_rate": 4.559505409582689e-05, + "loss": 0.0061, + "step": 296 + }, + { + "epoch": 0.22938791272446418, + "grad_norm": 0.0716412216424942, + "learning_rate": 4.574961360123648e-05, + "loss": 0.0056, + "step": 297 + }, + { + "epoch": 0.23016026259895733, + "grad_norm": 0.03296257182955742, + "learning_rate": 4.590417310664606e-05, + "loss": 0.0054, + "step": 298 + }, + { + "epoch": 0.23093261247345048, + "grad_norm": 0.06722760200500488, + "learning_rate": 4.605873261205564e-05, + "loss": 0.0055, + "step": 299 + }, + { + "epoch": 0.2317049623479436, + "grad_norm": 0.04291321709752083, + "learning_rate": 4.6213292117465226e-05, + "loss": 0.0063, + "step": 300 + }, + { + "epoch": 0.23247731222243675, + "grad_norm": 0.07270011305809021, + "learning_rate": 4.6367851622874805e-05, + "loss": 0.0063, + "step": 301 + }, + { + "epoch": 0.2332496620969299, + "grad_norm": 0.10372505336999893, + "learning_rate": 4.652241112828439e-05, + "loss": 0.0061, + "step": 302 + }, + { + "epoch": 0.23402201197142306, + "grad_norm": 0.034952372312545776, + "learning_rate": 4.6676970633693976e-05, + "loss": 0.0052, + "step": 303 + }, + { + "epoch": 0.2347943618459162, + "grad_norm": 0.12165629863739014, + "learning_rate": 4.6831530139103555e-05, + "loss": 0.0072, + "step": 304 + }, + { + "epoch": 0.23556671172040936, + "grad_norm": 0.03678225353360176, + "learning_rate": 4.698608964451314e-05, + "loss": 0.0062, + "step": 305 + }, + { + "epoch": 0.23633906159490248, + "grad_norm": 0.14635953307151794, + "learning_rate": 4.714064914992272e-05, + "loss": 0.0071, + "step": 306 + }, + { + "epoch": 0.23711141146939563, + "grad_norm": 0.05414648354053497, + "learning_rate": 4.7295208655332304e-05, + "loss": 0.0056, + "step": 307 + }, + { + "epoch": 0.23788376134388878, + "grad_norm": 0.09111734479665756, + "learning_rate": 4.744976816074189e-05, + "loss": 0.007, + "step": 308 + }, + { + "epoch": 0.23865611121838193, + "grad_norm": 0.01853596605360508, + "learning_rate": 4.760432766615147e-05, + "loss": 0.0055, + "step": 309 + }, + { + "epoch": 0.23942846109287508, + "grad_norm": 0.09702561795711517, + "learning_rate": 4.7758887171561054e-05, + "loss": 0.0063, + "step": 310 + }, + { + "epoch": 0.24020081096736823, + "grad_norm": 0.04423897713422775, + "learning_rate": 4.791344667697063e-05, + "loss": 0.0058, + "step": 311 + }, + { + "epoch": 0.24097316084186135, + "grad_norm": 0.1399751603603363, + "learning_rate": 4.806800618238022e-05, + "loss": 0.0068, + "step": 312 + }, + { + "epoch": 0.2417455107163545, + "grad_norm": 0.03153667598962784, + "learning_rate": 4.8222565687789803e-05, + "loss": 0.0056, + "step": 313 + }, + { + "epoch": 0.24251786059084765, + "grad_norm": 0.11411945521831512, + "learning_rate": 4.837712519319938e-05, + "loss": 0.0069, + "step": 314 + }, + { + "epoch": 0.2432902104653408, + "grad_norm": 0.031384337693452835, + "learning_rate": 4.853168469860897e-05, + "loss": 0.0052, + "step": 315 + }, + { + "epoch": 0.24406256033983395, + "grad_norm": 0.07609619945287704, + "learning_rate": 4.8686244204018546e-05, + "loss": 0.0064, + "step": 316 + }, + { + "epoch": 0.2448349102143271, + "grad_norm": 0.04893430694937706, + "learning_rate": 4.884080370942813e-05, + "loss": 0.0058, + "step": 317 + }, + { + "epoch": 0.24560726008882022, + "grad_norm": 0.09350036829710007, + "learning_rate": 4.899536321483772e-05, + "loss": 0.0059, + "step": 318 + }, + { + "epoch": 0.24637960996331337, + "grad_norm": 0.09150069952011108, + "learning_rate": 4.9149922720247296e-05, + "loss": 0.0059, + "step": 319 + }, + { + "epoch": 0.24715195983780652, + "grad_norm": 0.04132336005568504, + "learning_rate": 4.930448222565688e-05, + "loss": 0.0054, + "step": 320 + }, + { + "epoch": 0.24792430971229967, + "grad_norm": 0.07091164588928223, + "learning_rate": 4.945904173106646e-05, + "loss": 0.0053, + "step": 321 + }, + { + "epoch": 0.24869665958679282, + "grad_norm": 0.042904872447252274, + "learning_rate": 4.9613601236476046e-05, + "loss": 0.0059, + "step": 322 + }, + { + "epoch": 0.24946900946128597, + "grad_norm": 0.07893645018339157, + "learning_rate": 4.976816074188563e-05, + "loss": 0.0065, + "step": 323 + }, + { + "epoch": 0.2502413593357791, + "grad_norm": 0.03633784502744675, + "learning_rate": 4.992272024729521e-05, + "loss": 0.0062, + "step": 324 + }, + { + "epoch": 0.2510137092102723, + "grad_norm": 0.047179654240608215, + "learning_rate": 5.0077279752704795e-05, + "loss": 0.0057, + "step": 325 + }, + { + "epoch": 0.2517860590847654, + "grad_norm": 0.03592117503285408, + "learning_rate": 5.0231839258114374e-05, + "loss": 0.0066, + "step": 326 + }, + { + "epoch": 0.2525584089592585, + "grad_norm": 0.026356182992458344, + "learning_rate": 5.038639876352396e-05, + "loss": 0.0053, + "step": 327 + }, + { + "epoch": 0.25333075883375167, + "grad_norm": 0.05666210874915123, + "learning_rate": 5.0540958268933545e-05, + "loss": 0.006, + "step": 328 + }, + { + "epoch": 0.2541031087082448, + "grad_norm": 0.02799471653997898, + "learning_rate": 5.0695517774343123e-05, + "loss": 0.0053, + "step": 329 + }, + { + "epoch": 0.25487545858273797, + "grad_norm": 0.035501424223184586, + "learning_rate": 5.085007727975271e-05, + "loss": 0.0054, + "step": 330 + }, + { + "epoch": 0.2556478084572311, + "grad_norm": 0.03726429119706154, + "learning_rate": 5.100463678516229e-05, + "loss": 0.006, + "step": 331 + }, + { + "epoch": 0.25642015833172427, + "grad_norm": 0.04929223656654358, + "learning_rate": 5.115919629057187e-05, + "loss": 0.0062, + "step": 332 + }, + { + "epoch": 0.2571925082062174, + "grad_norm": 0.044612541794776917, + "learning_rate": 5.131375579598146e-05, + "loss": 0.0062, + "step": 333 + }, + { + "epoch": 0.25796485808071057, + "grad_norm": 0.05634415149688721, + "learning_rate": 5.146831530139104e-05, + "loss": 0.0066, + "step": 334 + }, + { + "epoch": 0.2587372079552037, + "grad_norm": 0.02783166617155075, + "learning_rate": 5.162287480680062e-05, + "loss": 0.0062, + "step": 335 + }, + { + "epoch": 0.25950955782969687, + "grad_norm": 0.07673410326242447, + "learning_rate": 5.17774343122102e-05, + "loss": 0.0068, + "step": 336 + }, + { + "epoch": 0.26028190770419, + "grad_norm": 0.03702232986688614, + "learning_rate": 5.193199381761979e-05, + "loss": 0.0061, + "step": 337 + }, + { + "epoch": 0.26105425757868317, + "grad_norm": 0.042356688529253006, + "learning_rate": 5.2086553323029365e-05, + "loss": 0.0053, + "step": 338 + }, + { + "epoch": 0.26182660745317626, + "grad_norm": 0.031067850068211555, + "learning_rate": 5.224111282843895e-05, + "loss": 0.0057, + "step": 339 + }, + { + "epoch": 0.2625989573276694, + "grad_norm": 0.03430347144603729, + "learning_rate": 5.2395672333848536e-05, + "loss": 0.0059, + "step": 340 + }, + { + "epoch": 0.26337130720216256, + "grad_norm": 0.076685830950737, + "learning_rate": 5.2550231839258115e-05, + "loss": 0.0061, + "step": 341 + }, + { + "epoch": 0.2641436570766557, + "grad_norm": 0.02771041728556156, + "learning_rate": 5.27047913446677e-05, + "loss": 0.0053, + "step": 342 + }, + { + "epoch": 0.26491600695114886, + "grad_norm": 0.08005037903785706, + "learning_rate": 5.2859350850077286e-05, + "loss": 0.0057, + "step": 343 + }, + { + "epoch": 0.265688356825642, + "grad_norm": 0.04103340208530426, + "learning_rate": 5.3013910355486865e-05, + "loss": 0.0054, + "step": 344 + }, + { + "epoch": 0.26646070670013516, + "grad_norm": 0.06979218870401382, + "learning_rate": 5.316846986089645e-05, + "loss": 0.0057, + "step": 345 + }, + { + "epoch": 0.2672330565746283, + "grad_norm": 0.060539260506629944, + "learning_rate": 5.332302936630603e-05, + "loss": 0.0058, + "step": 346 + }, + { + "epoch": 0.26800540644912146, + "grad_norm": 0.034149207174777985, + "learning_rate": 5.3477588871715614e-05, + "loss": 0.0056, + "step": 347 + }, + { + "epoch": 0.2687777563236146, + "grad_norm": 0.04316283389925957, + "learning_rate": 5.363214837712519e-05, + "loss": 0.0059, + "step": 348 + }, + { + "epoch": 0.26955010619810776, + "grad_norm": 0.07623675465583801, + "learning_rate": 5.378670788253478e-05, + "loss": 0.0067, + "step": 349 + }, + { + "epoch": 0.2703224560726009, + "grad_norm": 0.07627623528242111, + "learning_rate": 5.3941267387944364e-05, + "loss": 0.006, + "step": 350 + }, + { + "epoch": 0.271094805947094, + "grad_norm": 0.026273977011442184, + "learning_rate": 5.409582689335394e-05, + "loss": 0.0052, + "step": 351 + }, + { + "epoch": 0.27186715582158716, + "grad_norm": 0.06081831455230713, + "learning_rate": 5.425038639876353e-05, + "loss": 0.0058, + "step": 352 + }, + { + "epoch": 0.2726395056960803, + "grad_norm": 0.024972470477223396, + "learning_rate": 5.4404945904173114e-05, + "loss": 0.0052, + "step": 353 + }, + { + "epoch": 0.27341185557057346, + "grad_norm": 0.033721551299095154, + "learning_rate": 5.455950540958269e-05, + "loss": 0.006, + "step": 354 + }, + { + "epoch": 0.2741842054450666, + "grad_norm": 0.03825666382908821, + "learning_rate": 5.471406491499228e-05, + "loss": 0.0068, + "step": 355 + }, + { + "epoch": 0.27495655531955976, + "grad_norm": 0.019865261390805244, + "learning_rate": 5.4868624420401856e-05, + "loss": 0.0057, + "step": 356 + }, + { + "epoch": 0.2757289051940529, + "grad_norm": 0.05228486657142639, + "learning_rate": 5.502318392581144e-05, + "loss": 0.0056, + "step": 357 + }, + { + "epoch": 0.27650125506854606, + "grad_norm": 0.022940287366509438, + "learning_rate": 5.517774343122102e-05, + "loss": 0.0056, + "step": 358 + }, + { + "epoch": 0.2772736049430392, + "grad_norm": 0.031132381409406662, + "learning_rate": 5.5332302936630606e-05, + "loss": 0.0052, + "step": 359 + }, + { + "epoch": 0.27804595481753236, + "grad_norm": 0.029625840485095978, + "learning_rate": 5.548686244204019e-05, + "loss": 0.0062, + "step": 360 + }, + { + "epoch": 0.2788183046920255, + "grad_norm": 0.03403817117214203, + "learning_rate": 5.564142194744977e-05, + "loss": 0.0057, + "step": 361 + }, + { + "epoch": 0.27959065456651866, + "grad_norm": 0.020263448357582092, + "learning_rate": 5.5795981452859356e-05, + "loss": 0.006, + "step": 362 + }, + { + "epoch": 0.28036300444101175, + "grad_norm": 0.029121456667780876, + "learning_rate": 5.595054095826894e-05, + "loss": 0.0053, + "step": 363 + }, + { + "epoch": 0.2811353543155049, + "grad_norm": 0.053699836134910583, + "learning_rate": 5.610510046367852e-05, + "loss": 0.0048, + "step": 364 + }, + { + "epoch": 0.28190770418999805, + "grad_norm": 0.04384114220738411, + "learning_rate": 5.62596599690881e-05, + "loss": 0.006, + "step": 365 + }, + { + "epoch": 0.2826800540644912, + "grad_norm": 0.024282528087496758, + "learning_rate": 5.6414219474497684e-05, + "loss": 0.0052, + "step": 366 + }, + { + "epoch": 0.28345240393898435, + "grad_norm": 0.02922219969332218, + "learning_rate": 5.656877897990727e-05, + "loss": 0.005, + "step": 367 + }, + { + "epoch": 0.2842247538134775, + "grad_norm": 0.09684975445270538, + "learning_rate": 5.672333848531685e-05, + "loss": 0.0055, + "step": 368 + }, + { + "epoch": 0.28499710368797065, + "grad_norm": 0.03495265543460846, + "learning_rate": 5.6877897990726433e-05, + "loss": 0.0055, + "step": 369 + }, + { + "epoch": 0.2857694535624638, + "grad_norm": 0.18816547095775604, + "learning_rate": 5.703245749613602e-05, + "loss": 0.0084, + "step": 370 + }, + { + "epoch": 0.28654180343695695, + "grad_norm": 0.022471094503998756, + "learning_rate": 5.71870170015456e-05, + "loss": 0.0054, + "step": 371 + }, + { + "epoch": 0.2873141533114501, + "grad_norm": 0.8913330435752869, + "learning_rate": 5.734157650695518e-05, + "loss": 0.0188, + "step": 372 + }, + { + "epoch": 0.28808650318594325, + "grad_norm": 8.70426082611084, + "learning_rate": 5.749613601236477e-05, + "loss": 0.4367, + "step": 373 + }, + { + "epoch": 0.2888588530604364, + "grad_norm": 2.9028074741363525, + "learning_rate": 5.765069551777435e-05, + "loss": 0.2147, + "step": 374 + }, + { + "epoch": 0.2896312029349295, + "grad_norm": 2.0662808418273926, + "learning_rate": 5.7805255023183926e-05, + "loss": 0.0749, + "step": 375 + }, + { + "epoch": 0.29040355280942265, + "grad_norm": 11.876626968383789, + "learning_rate": 5.795981452859351e-05, + "loss": 0.3195, + "step": 376 + }, + { + "epoch": 0.2911759026839158, + "grad_norm": 8.665094375610352, + "learning_rate": 5.81143740340031e-05, + "loss": 0.2859, + "step": 377 + }, + { + "epoch": 0.29194825255840895, + "grad_norm": 4.957003593444824, + "learning_rate": 5.8268933539412676e-05, + "loss": 0.2408, + "step": 378 + }, + { + "epoch": 0.2927206024329021, + "grad_norm": 10.162996292114258, + "learning_rate": 5.842349304482226e-05, + "loss": 0.4644, + "step": 379 + }, + { + "epoch": 0.29349295230739525, + "grad_norm": 44.583221435546875, + "learning_rate": 5.8578052550231846e-05, + "loss": 1.6812, + "step": 380 + }, + { + "epoch": 0.2942653021818884, + "grad_norm": 5.381219863891602, + "learning_rate": 5.8732612055641425e-05, + "loss": 0.456, + "step": 381 + }, + { + "epoch": 0.29503765205638155, + "grad_norm": 5.965335369110107, + "learning_rate": 5.8887171561051004e-05, + "loss": 0.4624, + "step": 382 + }, + { + "epoch": 0.2958100019308747, + "grad_norm": 1.930649757385254, + "learning_rate": 5.9041731066460596e-05, + "loss": 0.1439, + "step": 383 + }, + { + "epoch": 0.29658235180536785, + "grad_norm": 1.6111472845077515, + "learning_rate": 5.9196290571870175e-05, + "loss": 0.256, + "step": 384 + }, + { + "epoch": 0.297354701679861, + "grad_norm": 3.4821295738220215, + "learning_rate": 5.9350850077279753e-05, + "loss": 0.3274, + "step": 385 + }, + { + "epoch": 0.29812705155435415, + "grad_norm": 1.2926265001296997, + "learning_rate": 5.950540958268934e-05, + "loss": 0.1754, + "step": 386 + }, + { + "epoch": 0.29889940142884724, + "grad_norm": 0.8014867305755615, + "learning_rate": 5.9659969088098924e-05, + "loss": 0.142, + "step": 387 + }, + { + "epoch": 0.2996717513033404, + "grad_norm": 8.201684951782227, + "learning_rate": 5.98145285935085e-05, + "loss": 0.4574, + "step": 388 + }, + { + "epoch": 0.30044410117783354, + "grad_norm": 0.9916243553161621, + "learning_rate": 5.996908809891809e-05, + "loss": 0.1185, + "step": 389 + }, + { + "epoch": 0.3012164510523267, + "grad_norm": 1.5478551387786865, + "learning_rate": 6.0123647604327674e-05, + "loss": 0.152, + "step": 390 + }, + { + "epoch": 0.30198880092681984, + "grad_norm": 0.656897783279419, + "learning_rate": 6.027820710973725e-05, + "loss": 0.1092, + "step": 391 + }, + { + "epoch": 0.302761150801313, + "grad_norm": 1.1630992889404297, + "learning_rate": 6.043276661514683e-05, + "loss": 0.1185, + "step": 392 + }, + { + "epoch": 0.30353350067580614, + "grad_norm": 0.7383008003234863, + "learning_rate": 6.0587326120556424e-05, + "loss": 0.0938, + "step": 393 + }, + { + "epoch": 0.3043058505502993, + "grad_norm": 1.26111900806427, + "learning_rate": 6.0741885625966e-05, + "loss": 0.1211, + "step": 394 + }, + { + "epoch": 0.30507820042479245, + "grad_norm": 0.7731469869613647, + "learning_rate": 6.089644513137558e-05, + "loss": 0.1022, + "step": 395 + }, + { + "epoch": 0.3058505502992856, + "grad_norm": 0.7344601154327393, + "learning_rate": 6.105100463678517e-05, + "loss": 0.1251, + "step": 396 + }, + { + "epoch": 0.30662290017377875, + "grad_norm": 0.5411269068717957, + "learning_rate": 6.120556414219475e-05, + "loss": 0.0928, + "step": 397 + }, + { + "epoch": 0.30739525004827184, + "grad_norm": 0.5494950413703918, + "learning_rate": 6.136012364760433e-05, + "loss": 0.0914, + "step": 398 + }, + { + "epoch": 0.308167599922765, + "grad_norm": 0.4900193512439728, + "learning_rate": 6.15146831530139e-05, + "loss": 0.1169, + "step": 399 + }, + { + "epoch": 0.30893994979725814, + "grad_norm": 0.43041563034057617, + "learning_rate": 6.16692426584235e-05, + "loss": 0.094, + "step": 400 + }, + { + "epoch": 0.3097122996717513, + "grad_norm": 0.5949247479438782, + "learning_rate": 6.182380216383307e-05, + "loss": 0.0922, + "step": 401 + }, + { + "epoch": 0.31048464954624444, + "grad_norm": 0.3937532305717468, + "learning_rate": 6.197836166924266e-05, + "loss": 0.0831, + "step": 402 + }, + { + "epoch": 0.3112569994207376, + "grad_norm": 0.3710464537143707, + "learning_rate": 6.213292117465224e-05, + "loss": 0.0836, + "step": 403 + }, + { + "epoch": 0.31202934929523074, + "grad_norm": 0.1914016306400299, + "learning_rate": 6.228748068006183e-05, + "loss": 0.0889, + "step": 404 + }, + { + "epoch": 0.3128016991697239, + "grad_norm": 0.3411383330821991, + "learning_rate": 6.24420401854714e-05, + "loss": 0.0853, + "step": 405 + }, + { + "epoch": 0.31357404904421704, + "grad_norm": 0.27261802554130554, + "learning_rate": 6.2596599690881e-05, + "loss": 0.0863, + "step": 406 + }, + { + "epoch": 0.3143463989187102, + "grad_norm": 0.26665183901786804, + "learning_rate": 6.275115919629057e-05, + "loss": 0.0803, + "step": 407 + }, + { + "epoch": 0.31511874879320334, + "grad_norm": 0.437089204788208, + "learning_rate": 6.290571870170016e-05, + "loss": 0.0794, + "step": 408 + }, + { + "epoch": 0.3158910986676965, + "grad_norm": 0.18276162445545197, + "learning_rate": 6.306027820710973e-05, + "loss": 0.0807, + "step": 409 + }, + { + "epoch": 0.3166634485421896, + "grad_norm": 0.35369428992271423, + "learning_rate": 6.321483771251933e-05, + "loss": 0.0796, + "step": 410 + }, + { + "epoch": 0.31743579841668274, + "grad_norm": 0.020447123795747757, + "learning_rate": 6.33693972179289e-05, + "loss": 0.0816, + "step": 411 + }, + { + "epoch": 0.3182081482911759, + "grad_norm": 0.34988123178482056, + "learning_rate": 6.352395672333849e-05, + "loss": 0.0843, + "step": 412 + }, + { + "epoch": 0.31898049816566904, + "grad_norm": 0.044847674667835236, + "learning_rate": 6.367851622874807e-05, + "loss": 0.0807, + "step": 413 + }, + { + "epoch": 0.3197528480401622, + "grad_norm": 0.2252042591571808, + "learning_rate": 6.383307573415766e-05, + "loss": 0.0786, + "step": 414 + }, + { + "epoch": 0.32052519791465534, + "grad_norm": 0.13536381721496582, + "learning_rate": 6.398763523956723e-05, + "loss": 0.0789, + "step": 415 + }, + { + "epoch": 0.3212975477891485, + "grad_norm": 0.1970210075378418, + "learning_rate": 6.414219474497683e-05, + "loss": 0.071, + "step": 416 + }, + { + "epoch": 0.32206989766364164, + "grad_norm": 0.13441815972328186, + "learning_rate": 6.42967542503864e-05, + "loss": 0.0782, + "step": 417 + }, + { + "epoch": 0.3228422475381348, + "grad_norm": 0.14775612950325012, + "learning_rate": 6.445131375579599e-05, + "loss": 0.0758, + "step": 418 + }, + { + "epoch": 0.32361459741262794, + "grad_norm": 0.0626678317785263, + "learning_rate": 6.460587326120556e-05, + "loss": 0.0745, + "step": 419 + }, + { + "epoch": 0.3243869472871211, + "grad_norm": 0.05171412229537964, + "learning_rate": 6.476043276661516e-05, + "loss": 0.0756, + "step": 420 + }, + { + "epoch": 0.32515929716161424, + "grad_norm": 0.14175881445407867, + "learning_rate": 6.491499227202473e-05, + "loss": 0.0746, + "step": 421 + }, + { + "epoch": 0.32593164703610733, + "grad_norm": 0.05274312198162079, + "learning_rate": 6.506955177743431e-05, + "loss": 0.0744, + "step": 422 + }, + { + "epoch": 0.3267039969106005, + "grad_norm": 0.1517016589641571, + "learning_rate": 6.52241112828439e-05, + "loss": 0.0766, + "step": 423 + }, + { + "epoch": 0.32747634678509363, + "grad_norm": 0.05368823930621147, + "learning_rate": 6.537867078825348e-05, + "loss": 0.0749, + "step": 424 + }, + { + "epoch": 0.3282486966595868, + "grad_norm": 0.14097322523593903, + "learning_rate": 6.553323029366306e-05, + "loss": 0.0764, + "step": 425 + }, + { + "epoch": 0.32902104653407993, + "grad_norm": 0.06039084121584892, + "learning_rate": 6.568778979907264e-05, + "loss": 0.0709, + "step": 426 + }, + { + "epoch": 0.3297933964085731, + "grad_norm": 0.07845164835453033, + "learning_rate": 6.584234930448223e-05, + "loss": 0.0743, + "step": 427 + }, + { + "epoch": 0.33056574628306623, + "grad_norm": 0.06691066920757294, + "learning_rate": 6.599690880989181e-05, + "loss": 0.0735, + "step": 428 + }, + { + "epoch": 0.3313380961575594, + "grad_norm": 0.02390989474952221, + "learning_rate": 6.615146831530138e-05, + "loss": 0.0713, + "step": 429 + }, + { + "epoch": 0.33211044603205253, + "grad_norm": 0.029864365234971046, + "learning_rate": 6.630602782071098e-05, + "loss": 0.0752, + "step": 430 + }, + { + "epoch": 0.3328827959065457, + "grad_norm": 0.09867202490568161, + "learning_rate": 6.646058732612056e-05, + "loss": 0.0742, + "step": 431 + }, + { + "epoch": 0.33365514578103883, + "grad_norm": 0.2730984687805176, + "learning_rate": 6.661514683153014e-05, + "loss": 0.0733, + "step": 432 + }, + { + "epoch": 0.334427495655532, + "grad_norm": 0.1675487458705902, + "learning_rate": 6.676970633693973e-05, + "loss": 0.0799, + "step": 433 + }, + { + "epoch": 0.3351998455300251, + "grad_norm": 0.25640159845352173, + "learning_rate": 6.692426584234931e-05, + "loss": 0.0725, + "step": 434 + }, + { + "epoch": 0.3359721954045182, + "grad_norm": 0.32263967394828796, + "learning_rate": 6.707882534775888e-05, + "loss": 0.0745, + "step": 435 + }, + { + "epoch": 0.3367445452790114, + "grad_norm": 0.10832744091749191, + "learning_rate": 6.723338485316847e-05, + "loss": 0.0689, + "step": 436 + }, + { + "epoch": 0.3375168951535045, + "grad_norm": 0.2080148458480835, + "learning_rate": 6.738794435857806e-05, + "loss": 0.0779, + "step": 437 + }, + { + "epoch": 0.3382892450279977, + "grad_norm": 0.14392602443695068, + "learning_rate": 6.754250386398764e-05, + "loss": 0.0709, + "step": 438 + }, + { + "epoch": 0.3390615949024908, + "grad_norm": 0.1571575552225113, + "learning_rate": 6.769706336939721e-05, + "loss": 0.0731, + "step": 439 + }, + { + "epoch": 0.339833944776984, + "grad_norm": 0.15750271081924438, + "learning_rate": 6.785162287480681e-05, + "loss": 0.0756, + "step": 440 + }, + { + "epoch": 0.3406062946514771, + "grad_norm": 0.1180683821439743, + "learning_rate": 6.800618238021638e-05, + "loss": 0.0737, + "step": 441 + }, + { + "epoch": 0.3413786445259703, + "grad_norm": 0.16251571476459503, + "learning_rate": 6.816074188562597e-05, + "loss": 0.0692, + "step": 442 + }, + { + "epoch": 0.3421509944004634, + "grad_norm": 0.07380519807338715, + "learning_rate": 6.831530139103554e-05, + "loss": 0.0722, + "step": 443 + }, + { + "epoch": 0.3429233442749566, + "grad_norm": 0.22383369505405426, + "learning_rate": 6.846986089644514e-05, + "loss": 0.0681, + "step": 444 + }, + { + "epoch": 0.3436956941494497, + "grad_norm": 0.1768750697374344, + "learning_rate": 6.862442040185471e-05, + "loss": 0.0693, + "step": 445 + }, + { + "epoch": 0.3444680440239428, + "grad_norm": 0.0998828187584877, + "learning_rate": 6.87789799072643e-05, + "loss": 0.076, + "step": 446 + }, + { + "epoch": 0.34524039389843597, + "grad_norm": 0.1407008320093155, + "learning_rate": 6.893353941267388e-05, + "loss": 0.0734, + "step": 447 + }, + { + "epoch": 0.3460127437729291, + "grad_norm": 0.24386470019817352, + "learning_rate": 6.908809891808347e-05, + "loss": 0.069, + "step": 448 + }, + { + "epoch": 0.34678509364742227, + "grad_norm": 0.17217887938022614, + "learning_rate": 6.924265842349304e-05, + "loss": 0.0718, + "step": 449 + }, + { + "epoch": 0.3475574435219154, + "grad_norm": 0.1998705416917801, + "learning_rate": 6.939721792890264e-05, + "loss": 0.0779, + "step": 450 + }, + { + "epoch": 0.34832979339640857, + "grad_norm": 0.15912608802318573, + "learning_rate": 6.955177743431221e-05, + "loss": 0.0738, + "step": 451 + }, + { + "epoch": 0.3491021432709017, + "grad_norm": 0.7292158603668213, + "learning_rate": 6.97063369397218e-05, + "loss": 0.0815, + "step": 452 + }, + { + "epoch": 0.34987449314539487, + "grad_norm": 0.16888517141342163, + "learning_rate": 6.986089644513137e-05, + "loss": 0.0757, + "step": 453 + }, + { + "epoch": 0.350646843019888, + "grad_norm": 0.22534602880477905, + "learning_rate": 7.001545595054097e-05, + "loss": 0.0881, + "step": 454 + }, + { + "epoch": 0.35141919289438117, + "grad_norm": 0.2260252833366394, + "learning_rate": 7.017001545595054e-05, + "loss": 0.0936, + "step": 455 + }, + { + "epoch": 0.3521915427688743, + "grad_norm": 0.16626620292663574, + "learning_rate": 7.032457496136012e-05, + "loss": 0.0894, + "step": 456 + }, + { + "epoch": 0.35296389264336747, + "grad_norm": 0.1906137317419052, + "learning_rate": 7.047913446676971e-05, + "loss": 0.0864, + "step": 457 + }, + { + "epoch": 0.35373624251786057, + "grad_norm": 0.16349031031131744, + "learning_rate": 7.06336939721793e-05, + "loss": 0.0781, + "step": 458 + }, + { + "epoch": 0.3545085923923537, + "grad_norm": 0.07731667160987854, + "learning_rate": 7.078825347758887e-05, + "loss": 0.0671, + "step": 459 + }, + { + "epoch": 0.35528094226684687, + "grad_norm": 0.818091094493866, + "learning_rate": 7.094281298299847e-05, + "loss": 0.0816, + "step": 460 + }, + { + "epoch": 0.35605329214134, + "grad_norm": 0.04961278289556503, + "learning_rate": 7.109737248840804e-05, + "loss": 0.0618, + "step": 461 + }, + { + "epoch": 0.35682564201583317, + "grad_norm": 0.14381299912929535, + "learning_rate": 7.125193199381762e-05, + "loss": 0.0698, + "step": 462 + }, + { + "epoch": 0.3575979918903263, + "grad_norm": 0.16918033361434937, + "learning_rate": 7.14064914992272e-05, + "loss": 0.0746, + "step": 463 + }, + { + "epoch": 0.35837034176481947, + "grad_norm": 0.14989601075649261, + "learning_rate": 7.15610510046368e-05, + "loss": 0.075, + "step": 464 + }, + { + "epoch": 0.3591426916393126, + "grad_norm": 0.15754370391368866, + "learning_rate": 7.171561051004637e-05, + "loss": 0.0706, + "step": 465 + }, + { + "epoch": 0.35991504151380577, + "grad_norm": 0.14635930955410004, + "learning_rate": 7.187017001545595e-05, + "loss": 0.0695, + "step": 466 + }, + { + "epoch": 0.3606873913882989, + "grad_norm": 0.12360066920518875, + "learning_rate": 7.202472952086554e-05, + "loss": 0.07, + "step": 467 + }, + { + "epoch": 0.36145974126279207, + "grad_norm": 0.0649237334728241, + "learning_rate": 7.217928902627512e-05, + "loss": 0.0675, + "step": 468 + }, + { + "epoch": 0.3622320911372852, + "grad_norm": 0.294392466545105, + "learning_rate": 7.23338485316847e-05, + "loss": 0.0634, + "step": 469 + }, + { + "epoch": 0.3630044410117783, + "grad_norm": 0.2768172323703766, + "learning_rate": 7.248840803709428e-05, + "loss": 0.0642, + "step": 470 + }, + { + "epoch": 0.36377679088627146, + "grad_norm": 0.054166581481695175, + "learning_rate": 7.264296754250387e-05, + "loss": 0.0653, + "step": 471 + }, + { + "epoch": 0.3645491407607646, + "grad_norm": 0.16986453533172607, + "learning_rate": 7.279752704791345e-05, + "loss": 0.059, + "step": 472 + }, + { + "epoch": 0.36532149063525776, + "grad_norm": 0.15569911897182465, + "learning_rate": 7.295208655332302e-05, + "loss": 0.063, + "step": 473 + }, + { + "epoch": 0.3660938405097509, + "grad_norm": 0.1545466035604477, + "learning_rate": 7.310664605873262e-05, + "loss": 0.0622, + "step": 474 + }, + { + "epoch": 0.36686619038424406, + "grad_norm": 0.1770275980234146, + "learning_rate": 7.32612055641422e-05, + "loss": 0.0692, + "step": 475 + }, + { + "epoch": 0.3676385402587372, + "grad_norm": 0.13061058521270752, + "learning_rate": 7.341576506955178e-05, + "loss": 0.0645, + "step": 476 + }, + { + "epoch": 0.36841089013323036, + "grad_norm": 0.19434262812137604, + "learning_rate": 7.357032457496137e-05, + "loss": 0.0619, + "step": 477 + }, + { + "epoch": 0.3691832400077235, + "grad_norm": 0.12423071265220642, + "learning_rate": 7.372488408037095e-05, + "loss": 0.0614, + "step": 478 + }, + { + "epoch": 0.36995558988221666, + "grad_norm": 0.1717289686203003, + "learning_rate": 7.387944358578052e-05, + "loss": 0.0652, + "step": 479 + }, + { + "epoch": 0.3707279397567098, + "grad_norm": 0.19073888659477234, + "learning_rate": 7.403400309119011e-05, + "loss": 0.0623, + "step": 480 + }, + { + "epoch": 0.37150028963120296, + "grad_norm": 0.09556687623262405, + "learning_rate": 7.41885625965997e-05, + "loss": 0.0585, + "step": 481 + }, + { + "epoch": 0.37227263950569606, + "grad_norm": 0.25466054677963257, + "learning_rate": 7.434312210200928e-05, + "loss": 0.0615, + "step": 482 + }, + { + "epoch": 0.3730449893801892, + "grad_norm": 0.11340200901031494, + "learning_rate": 7.449768160741885e-05, + "loss": 0.0597, + "step": 483 + }, + { + "epoch": 0.37381733925468236, + "grad_norm": 0.13622435927391052, + "learning_rate": 7.465224111282845e-05, + "loss": 0.0593, + "step": 484 + }, + { + "epoch": 0.3745896891291755, + "grad_norm": 0.04559488967061043, + "learning_rate": 7.480680061823802e-05, + "loss": 0.06, + "step": 485 + }, + { + "epoch": 0.37536203900366866, + "grad_norm": 0.20303906500339508, + "learning_rate": 7.496136012364761e-05, + "loss": 0.0539, + "step": 486 + }, + { + "epoch": 0.3761343888781618, + "grad_norm": 0.10952377319335938, + "learning_rate": 7.511591962905718e-05, + "loss": 0.0569, + "step": 487 + }, + { + "epoch": 0.37690673875265496, + "grad_norm": 0.09587670862674713, + "learning_rate": 7.527047913446678e-05, + "loss": 0.0555, + "step": 488 + }, + { + "epoch": 0.3776790886271481, + "grad_norm": 0.15245388448238373, + "learning_rate": 7.542503863987635e-05, + "loss": 0.0591, + "step": 489 + }, + { + "epoch": 0.37845143850164126, + "grad_norm": 0.03817014768719673, + "learning_rate": 7.557959814528594e-05, + "loss": 0.0539, + "step": 490 + }, + { + "epoch": 0.3792237883761344, + "grad_norm": 0.1699414998292923, + "learning_rate": 7.573415765069552e-05, + "loss": 0.0601, + "step": 491 + }, + { + "epoch": 0.37999613825062756, + "grad_norm": 0.14948895573616028, + "learning_rate": 7.58887171561051e-05, + "loss": 0.0614, + "step": 492 + }, + { + "epoch": 0.38076848812512065, + "grad_norm": 0.06657677888870239, + "learning_rate": 7.604327666151468e-05, + "loss": 0.0528, + "step": 493 + }, + { + "epoch": 0.3815408379996138, + "grad_norm": 0.07747476547956467, + "learning_rate": 7.619783616692428e-05, + "loss": 0.0536, + "step": 494 + }, + { + "epoch": 0.38231318787410695, + "grad_norm": 0.061604093760252, + "learning_rate": 7.635239567233385e-05, + "loss": 0.0495, + "step": 495 + }, + { + "epoch": 0.3830855377486001, + "grad_norm": 0.23023541271686554, + "learning_rate": 7.650695517774343e-05, + "loss": 0.0517, + "step": 496 + }, + { + "epoch": 0.38385788762309325, + "grad_norm": 0.3549231290817261, + "learning_rate": 7.6661514683153e-05, + "loss": 0.0531, + "step": 497 + }, + { + "epoch": 0.3846302374975864, + "grad_norm": 0.2388257384300232, + "learning_rate": 7.68160741885626e-05, + "loss": 0.0482, + "step": 498 + }, + { + "epoch": 0.38540258737207955, + "grad_norm": 0.05481262877583504, + "learning_rate": 7.697063369397218e-05, + "loss": 0.0466, + "step": 499 + }, + { + "epoch": 0.3861749372465727, + "grad_norm": 0.20280222594738007, + "learning_rate": 7.712519319938176e-05, + "loss": 0.0489, + "step": 500 + }, + { + "epoch": 0.38694728712106585, + "grad_norm": 0.10731515288352966, + "learning_rate": 7.727975270479135e-05, + "loss": 0.0402, + "step": 501 + }, + { + "epoch": 0.387719636995559, + "grad_norm": 0.14593768119812012, + "learning_rate": 7.743431221020093e-05, + "loss": 0.0402, + "step": 502 + }, + { + "epoch": 0.38849198687005215, + "grad_norm": 0.14538267254829407, + "learning_rate": 7.75888717156105e-05, + "loss": 0.0383, + "step": 503 + }, + { + "epoch": 0.3892643367445453, + "grad_norm": 0.07397224009037018, + "learning_rate": 7.774343122102009e-05, + "loss": 0.0332, + "step": 504 + }, + { + "epoch": 0.3900366866190384, + "grad_norm": 0.13446685671806335, + "learning_rate": 7.789799072642968e-05, + "loss": 0.0338, + "step": 505 + }, + { + "epoch": 0.39080903649353155, + "grad_norm": 0.1309434473514557, + "learning_rate": 7.805255023183926e-05, + "loss": 0.0336, + "step": 506 + }, + { + "epoch": 0.3915813863680247, + "grad_norm": 0.08193838596343994, + "learning_rate": 7.820710973724883e-05, + "loss": 0.0308, + "step": 507 + }, + { + "epoch": 0.39235373624251785, + "grad_norm": 0.06123901903629303, + "learning_rate": 7.836166924265843e-05, + "loss": 0.0282, + "step": 508 + }, + { + "epoch": 0.393126086117011, + "grad_norm": 0.08192218840122223, + "learning_rate": 7.8516228748068e-05, + "loss": 0.0263, + "step": 509 + }, + { + "epoch": 0.39389843599150415, + "grad_norm": 0.08452457934617996, + "learning_rate": 7.867078825347759e-05, + "loss": 0.024, + "step": 510 + }, + { + "epoch": 0.3946707858659973, + "grad_norm": 0.0647989958524704, + "learning_rate": 7.882534775888718e-05, + "loss": 0.0221, + "step": 511 + }, + { + "epoch": 0.39544313574049045, + "grad_norm": 0.07992154359817505, + "learning_rate": 7.897990726429676e-05, + "loss": 0.0218, + "step": 512 + }, + { + "epoch": 0.3962154856149836, + "grad_norm": 0.06557576358318329, + "learning_rate": 7.913446676970633e-05, + "loss": 0.0218, + "step": 513 + }, + { + "epoch": 0.39698783548947675, + "grad_norm": 0.07514405995607376, + "learning_rate": 7.928902627511592e-05, + "loss": 0.017, + "step": 514 + }, + { + "epoch": 0.3977601853639699, + "grad_norm": 0.05783172324299812, + "learning_rate": 7.94435857805255e-05, + "loss": 0.0162, + "step": 515 + }, + { + "epoch": 0.39853253523846305, + "grad_norm": 0.13857033848762512, + "learning_rate": 7.959814528593509e-05, + "loss": 0.0173, + "step": 516 + }, + { + "epoch": 0.39930488511295614, + "grad_norm": 0.26674067974090576, + "learning_rate": 7.975270479134466e-05, + "loss": 0.0206, + "step": 517 + }, + { + "epoch": 0.4000772349874493, + "grad_norm": 0.29622969031333923, + "learning_rate": 7.990726429675426e-05, + "loss": 0.0362, + "step": 518 + }, + { + "epoch": 0.40084958486194244, + "grad_norm": 0.07281139492988586, + "learning_rate": 8.006182380216383e-05, + "loss": 0.0168, + "step": 519 + }, + { + "epoch": 0.4016219347364356, + "grad_norm": 0.22175332903862, + "learning_rate": 8.021638330757342e-05, + "loss": 0.0215, + "step": 520 + }, + { + "epoch": 0.40239428461092874, + "grad_norm": 0.08515631407499313, + "learning_rate": 8.0370942812983e-05, + "loss": 0.0173, + "step": 521 + }, + { + "epoch": 0.4031666344854219, + "grad_norm": 0.056558165699243546, + "learning_rate": 8.052550231839259e-05, + "loss": 0.0143, + "step": 522 + }, + { + "epoch": 0.40393898435991504, + "grad_norm": 0.06582767516374588, + "learning_rate": 8.068006182380216e-05, + "loss": 0.0131, + "step": 523 + }, + { + "epoch": 0.4047113342344082, + "grad_norm": 0.04354550689458847, + "learning_rate": 8.083462132921175e-05, + "loss": 0.0135, + "step": 524 + }, + { + "epoch": 0.40548368410890134, + "grad_norm": 0.04626228287816048, + "learning_rate": 8.098918083462133e-05, + "loss": 0.0134, + "step": 525 + }, + { + "epoch": 0.4062560339833945, + "grad_norm": 0.056485529989004135, + "learning_rate": 8.114374034003092e-05, + "loss": 0.0114, + "step": 526 + }, + { + "epoch": 0.40702838385788764, + "grad_norm": 0.048203691840171814, + "learning_rate": 8.129829984544049e-05, + "loss": 0.0114, + "step": 527 + }, + { + "epoch": 0.4078007337323808, + "grad_norm": 0.04264072701334953, + "learning_rate": 8.145285935085009e-05, + "loss": 0.0114, + "step": 528 + }, + { + "epoch": 0.4085730836068739, + "grad_norm": 0.06230396404862404, + "learning_rate": 8.160741885625966e-05, + "loss": 0.0112, + "step": 529 + }, + { + "epoch": 0.40934543348136704, + "grad_norm": 0.048966314643621445, + "learning_rate": 8.176197836166925e-05, + "loss": 0.0098, + "step": 530 + }, + { + "epoch": 0.4101177833558602, + "grad_norm": 0.03883390873670578, + "learning_rate": 8.191653786707882e-05, + "loss": 0.0109, + "step": 531 + }, + { + "epoch": 0.41089013323035334, + "grad_norm": 0.0660616010427475, + "learning_rate": 8.207109737248842e-05, + "loss": 0.0109, + "step": 532 + }, + { + "epoch": 0.4116624831048465, + "grad_norm": 0.049630846828222275, + "learning_rate": 8.222565687789799e-05, + "loss": 0.0097, + "step": 533 + }, + { + "epoch": 0.41243483297933964, + "grad_norm": 0.051477570086717606, + "learning_rate": 8.238021638330757e-05, + "loss": 0.0101, + "step": 534 + }, + { + "epoch": 0.4132071828538328, + "grad_norm": 0.04098783805966377, + "learning_rate": 8.253477588871716e-05, + "loss": 0.0095, + "step": 535 + }, + { + "epoch": 0.41397953272832594, + "grad_norm": 0.03531495854258537, + "learning_rate": 8.268933539412674e-05, + "loss": 0.0093, + "step": 536 + }, + { + "epoch": 0.4147518826028191, + "grad_norm": 0.03453240916132927, + "learning_rate": 8.284389489953632e-05, + "loss": 0.01, + "step": 537 + }, + { + "epoch": 0.41552423247731224, + "grad_norm": 0.07704520225524902, + "learning_rate": 8.299845440494592e-05, + "loss": 0.0106, + "step": 538 + }, + { + "epoch": 0.4162965823518054, + "grad_norm": 0.05514024570584297, + "learning_rate": 8.315301391035549e-05, + "loss": 0.0088, + "step": 539 + }, + { + "epoch": 0.41706893222629854, + "grad_norm": 0.042679984122514725, + "learning_rate": 8.330757341576507e-05, + "loss": 0.0093, + "step": 540 + }, + { + "epoch": 0.41784128210079163, + "grad_norm": 0.059344884008169174, + "learning_rate": 8.346213292117464e-05, + "loss": 0.0092, + "step": 541 + }, + { + "epoch": 0.4186136319752848, + "grad_norm": 0.0364481545984745, + "learning_rate": 8.361669242658424e-05, + "loss": 0.0083, + "step": 542 + }, + { + "epoch": 0.41938598184977793, + "grad_norm": 0.03970944508910179, + "learning_rate": 8.377125193199382e-05, + "loss": 0.0091, + "step": 543 + }, + { + "epoch": 0.4201583317242711, + "grad_norm": 0.06593722850084305, + "learning_rate": 8.39258114374034e-05, + "loss": 0.0086, + "step": 544 + }, + { + "epoch": 0.42093068159876423, + "grad_norm": 0.054119762033224106, + "learning_rate": 8.408037094281299e-05, + "loss": 0.0089, + "step": 545 + }, + { + "epoch": 0.4217030314732574, + "grad_norm": 0.07380783557891846, + "learning_rate": 8.423493044822257e-05, + "loss": 0.0097, + "step": 546 + }, + { + "epoch": 0.42247538134775053, + "grad_norm": 0.04153510928153992, + "learning_rate": 8.438948995363214e-05, + "loss": 0.0083, + "step": 547 + }, + { + "epoch": 0.4232477312222437, + "grad_norm": 0.09071889519691467, + "learning_rate": 8.454404945904173e-05, + "loss": 0.0085, + "step": 548 + }, + { + "epoch": 0.42402008109673683, + "grad_norm": 0.02221975289285183, + "learning_rate": 8.469860896445132e-05, + "loss": 0.0076, + "step": 549 + }, + { + "epoch": 0.42479243097123, + "grad_norm": 0.0676327645778656, + "learning_rate": 8.48531684698609e-05, + "loss": 0.0095, + "step": 550 + }, + { + "epoch": 0.42556478084572313, + "grad_norm": 0.06018376350402832, + "learning_rate": 8.500772797527047e-05, + "loss": 0.0085, + "step": 551 + }, + { + "epoch": 0.4263371307202163, + "grad_norm": 0.03183077275753021, + "learning_rate": 8.516228748068007e-05, + "loss": 0.0077, + "step": 552 + }, + { + "epoch": 0.4271094805947094, + "grad_norm": 0.06584025919437408, + "learning_rate": 8.531684698608964e-05, + "loss": 0.009, + "step": 553 + }, + { + "epoch": 0.42788183046920253, + "grad_norm": 0.039773181080818176, + "learning_rate": 8.547140649149923e-05, + "loss": 0.0088, + "step": 554 + }, + { + "epoch": 0.4286541803436957, + "grad_norm": 0.05208640173077583, + "learning_rate": 8.562596599690881e-05, + "loss": 0.0076, + "step": 555 + }, + { + "epoch": 0.42942653021818883, + "grad_norm": 0.060254037380218506, + "learning_rate": 8.57805255023184e-05, + "loss": 0.0083, + "step": 556 + }, + { + "epoch": 0.430198880092682, + "grad_norm": 0.019793977960944176, + "learning_rate": 8.593508500772797e-05, + "loss": 0.0076, + "step": 557 + }, + { + "epoch": 0.43097122996717513, + "grad_norm": 0.04959573224186897, + "learning_rate": 8.608964451313756e-05, + "loss": 0.0087, + "step": 558 + }, + { + "epoch": 0.4317435798416683, + "grad_norm": 0.027046391740441322, + "learning_rate": 8.624420401854714e-05, + "loss": 0.0076, + "step": 559 + }, + { + "epoch": 0.43251592971616143, + "grad_norm": 0.030857374891638756, + "learning_rate": 8.639876352395673e-05, + "loss": 0.0077, + "step": 560 + }, + { + "epoch": 0.4332882795906546, + "grad_norm": 0.01885489746928215, + "learning_rate": 8.65533230293663e-05, + "loss": 0.0075, + "step": 561 + }, + { + "epoch": 0.43406062946514773, + "grad_norm": 0.020508000627160072, + "learning_rate": 8.67078825347759e-05, + "loss": 0.0082, + "step": 562 + }, + { + "epoch": 0.4348329793396409, + "grad_norm": 0.037663884460926056, + "learning_rate": 8.686244204018547e-05, + "loss": 0.0077, + "step": 563 + }, + { + "epoch": 0.43560532921413403, + "grad_norm": 0.020937500521540642, + "learning_rate": 8.701700154559506e-05, + "loss": 0.0078, + "step": 564 + }, + { + "epoch": 0.4363776790886271, + "grad_norm": 0.021033072844147682, + "learning_rate": 8.717156105100464e-05, + "loss": 0.0065, + "step": 565 + }, + { + "epoch": 0.4371500289631203, + "grad_norm": 0.029200492426753044, + "learning_rate": 8.732612055641423e-05, + "loss": 0.0079, + "step": 566 + }, + { + "epoch": 0.4379223788376134, + "grad_norm": 0.029047193005681038, + "learning_rate": 8.74806800618238e-05, + "loss": 0.0073, + "step": 567 + }, + { + "epoch": 0.4386947287121066, + "grad_norm": 0.02560579404234886, + "learning_rate": 8.763523956723338e-05, + "loss": 0.0073, + "step": 568 + }, + { + "epoch": 0.4394670785865997, + "grad_norm": 0.03898506984114647, + "learning_rate": 8.778979907264297e-05, + "loss": 0.0078, + "step": 569 + }, + { + "epoch": 0.4402394284610929, + "grad_norm": 0.055788278579711914, + "learning_rate": 8.794435857805256e-05, + "loss": 0.0083, + "step": 570 + }, + { + "epoch": 0.441011778335586, + "grad_norm": 0.026275061070919037, + "learning_rate": 8.809891808346213e-05, + "loss": 0.0076, + "step": 571 + }, + { + "epoch": 0.4417841282100792, + "grad_norm": 0.040119849145412445, + "learning_rate": 8.825347758887173e-05, + "loss": 0.007, + "step": 572 + }, + { + "epoch": 0.4425564780845723, + "grad_norm": 0.04549155756831169, + "learning_rate": 8.84080370942813e-05, + "loss": 0.0079, + "step": 573 + }, + { + "epoch": 0.4433288279590655, + "grad_norm": 0.025678303092718124, + "learning_rate": 8.856259659969088e-05, + "loss": 0.0068, + "step": 574 + }, + { + "epoch": 0.4441011778335586, + "grad_norm": 0.0724676251411438, + "learning_rate": 8.871715610510047e-05, + "loss": 0.008, + "step": 575 + }, + { + "epoch": 0.4448735277080518, + "grad_norm": 0.018808679655194283, + "learning_rate": 8.887171561051005e-05, + "loss": 0.0069, + "step": 576 + }, + { + "epoch": 0.44564587758254487, + "grad_norm": 0.044109608978033066, + "learning_rate": 8.902627511591963e-05, + "loss": 0.0066, + "step": 577 + }, + { + "epoch": 0.446418227457038, + "grad_norm": 0.04183276370167732, + "learning_rate": 8.918083462132921e-05, + "loss": 0.0067, + "step": 578 + }, + { + "epoch": 0.44719057733153117, + "grad_norm": 0.018278077244758606, + "learning_rate": 8.93353941267388e-05, + "loss": 0.0068, + "step": 579 + }, + { + "epoch": 0.4479629272060243, + "grad_norm": 0.040254537016153336, + "learning_rate": 8.948995363214838e-05, + "loss": 0.0073, + "step": 580 + }, + { + "epoch": 0.44873527708051747, + "grad_norm": 0.04067078232765198, + "learning_rate": 8.964451313755796e-05, + "loss": 0.0071, + "step": 581 + }, + { + "epoch": 0.4495076269550106, + "grad_norm": 0.021396074444055557, + "learning_rate": 8.979907264296755e-05, + "loss": 0.0064, + "step": 582 + }, + { + "epoch": 0.45027997682950377, + "grad_norm": 0.05812348425388336, + "learning_rate": 8.995363214837713e-05, + "loss": 0.0072, + "step": 583 + }, + { + "epoch": 0.4510523267039969, + "grad_norm": 0.052907440811395645, + "learning_rate": 9.010819165378671e-05, + "loss": 0.0075, + "step": 584 + }, + { + "epoch": 0.45182467657849007, + "grad_norm": 0.04013500362634659, + "learning_rate": 9.02627511591963e-05, + "loss": 0.007, + "step": 585 + }, + { + "epoch": 0.4525970264529832, + "grad_norm": 0.062313806265592575, + "learning_rate": 9.041731066460588e-05, + "loss": 0.0073, + "step": 586 + }, + { + "epoch": 0.45336937632747637, + "grad_norm": 0.020526016131043434, + "learning_rate": 9.057187017001545e-05, + "loss": 0.0073, + "step": 587 + }, + { + "epoch": 0.45414172620196946, + "grad_norm": 0.03165501728653908, + "learning_rate": 9.072642967542504e-05, + "loss": 0.0065, + "step": 588 + }, + { + "epoch": 0.4549140760764626, + "grad_norm": 0.08332403749227524, + "learning_rate": 9.088098918083463e-05, + "loss": 0.007, + "step": 589 + }, + { + "epoch": 0.45568642595095576, + "grad_norm": 0.031461749225854874, + "learning_rate": 9.103554868624421e-05, + "loss": 0.0066, + "step": 590 + }, + { + "epoch": 0.4564587758254489, + "grad_norm": 0.08399269729852676, + "learning_rate": 9.119010819165378e-05, + "loss": 0.0076, + "step": 591 + }, + { + "epoch": 0.45723112569994206, + "grad_norm": 0.07166516035795212, + "learning_rate": 9.134466769706337e-05, + "loss": 0.0081, + "step": 592 + }, + { + "epoch": 0.4580034755744352, + "grad_norm": 0.07119960337877274, + "learning_rate": 9.149922720247295e-05, + "loss": 0.0069, + "step": 593 + }, + { + "epoch": 0.45877582544892836, + "grad_norm": 0.06339021027088165, + "learning_rate": 9.165378670788254e-05, + "loss": 0.0071, + "step": 594 + }, + { + "epoch": 0.4595481753234215, + "grad_norm": 0.02564193122088909, + "learning_rate": 9.180834621329212e-05, + "loss": 0.0058, + "step": 595 + }, + { + "epoch": 0.46032052519791467, + "grad_norm": 0.022058872506022453, + "learning_rate": 9.196290571870171e-05, + "loss": 0.0062, + "step": 596 + }, + { + "epoch": 0.4610928750724078, + "grad_norm": 0.06470519304275513, + "learning_rate": 9.211746522411128e-05, + "loss": 0.0069, + "step": 597 + }, + { + "epoch": 0.46186522494690097, + "grad_norm": 0.04460231587290764, + "learning_rate": 9.227202472952087e-05, + "loss": 0.0066, + "step": 598 + }, + { + "epoch": 0.4626375748213941, + "grad_norm": 0.04173488914966583, + "learning_rate": 9.242658423493045e-05, + "loss": 0.0069, + "step": 599 + }, + { + "epoch": 0.4634099246958872, + "grad_norm": 0.06716244667768478, + "learning_rate": 9.258114374034004e-05, + "loss": 0.0074, + "step": 600 + }, + { + "epoch": 0.46418227457038036, + "grad_norm": 0.025468017905950546, + "learning_rate": 9.273570324574961e-05, + "loss": 0.0066, + "step": 601 + }, + { + "epoch": 0.4649546244448735, + "grad_norm": 0.045318953692913055, + "learning_rate": 9.28902627511592e-05, + "loss": 0.0069, + "step": 602 + }, + { + "epoch": 0.46572697431936666, + "grad_norm": 0.06828133761882782, + "learning_rate": 9.304482225656878e-05, + "loss": 0.008, + "step": 603 + }, + { + "epoch": 0.4664993241938598, + "grad_norm": 0.018730657175183296, + "learning_rate": 9.319938176197837e-05, + "loss": 0.0067, + "step": 604 + }, + { + "epoch": 0.46727167406835296, + "grad_norm": 0.052807360887527466, + "learning_rate": 9.335394126738795e-05, + "loss": 0.0064, + "step": 605 + }, + { + "epoch": 0.4680440239428461, + "grad_norm": 0.03708384931087494, + "learning_rate": 9.350850077279754e-05, + "loss": 0.0074, + "step": 606 + }, + { + "epoch": 0.46881637381733926, + "grad_norm": 0.034401170909404755, + "learning_rate": 9.366306027820711e-05, + "loss": 0.0071, + "step": 607 + }, + { + "epoch": 0.4695887236918324, + "grad_norm": 0.03672698140144348, + "learning_rate": 9.38176197836167e-05, + "loss": 0.0063, + "step": 608 + }, + { + "epoch": 0.47036107356632556, + "grad_norm": 0.04110847786068916, + "learning_rate": 9.397217928902628e-05, + "loss": 0.0066, + "step": 609 + }, + { + "epoch": 0.4711334234408187, + "grad_norm": 0.020947515964508057, + "learning_rate": 9.412673879443587e-05, + "loss": 0.0065, + "step": 610 + }, + { + "epoch": 0.47190577331531186, + "grad_norm": 0.023696739226579666, + "learning_rate": 9.428129829984544e-05, + "loss": 0.0062, + "step": 611 + }, + { + "epoch": 0.47267812318980496, + "grad_norm": 0.02793034166097641, + "learning_rate": 9.443585780525502e-05, + "loss": 0.0066, + "step": 612 + }, + { + "epoch": 0.4734504730642981, + "grad_norm": 0.036380648612976074, + "learning_rate": 9.459041731066461e-05, + "loss": 0.0065, + "step": 613 + }, + { + "epoch": 0.47422282293879126, + "grad_norm": 0.016912082210183144, + "learning_rate": 9.47449768160742e-05, + "loss": 0.0067, + "step": 614 + }, + { + "epoch": 0.4749951728132844, + "grad_norm": 0.0291866697371006, + "learning_rate": 9.489953632148378e-05, + "loss": 0.0067, + "step": 615 + }, + { + "epoch": 0.47576752268777756, + "grad_norm": 0.024140650406479836, + "learning_rate": 9.505409582689336e-05, + "loss": 0.0061, + "step": 616 + }, + { + "epoch": 0.4765398725622707, + "grad_norm": 0.018938008695840836, + "learning_rate": 9.520865533230294e-05, + "loss": 0.0069, + "step": 617 + }, + { + "epoch": 0.47731222243676386, + "grad_norm": 0.034017808735370636, + "learning_rate": 9.536321483771252e-05, + "loss": 0.0061, + "step": 618 + }, + { + "epoch": 0.478084572311257, + "grad_norm": 0.024729734286665916, + "learning_rate": 9.551777434312211e-05, + "loss": 0.0069, + "step": 619 + }, + { + "epoch": 0.47885692218575016, + "grad_norm": 0.02578096278011799, + "learning_rate": 9.567233384853169e-05, + "loss": 0.0056, + "step": 620 + }, + { + "epoch": 0.4796292720602433, + "grad_norm": 0.024975182488560677, + "learning_rate": 9.582689335394127e-05, + "loss": 0.0061, + "step": 621 + }, + { + "epoch": 0.48040162193473646, + "grad_norm": 0.016552217304706573, + "learning_rate": 9.598145285935085e-05, + "loss": 0.0066, + "step": 622 + }, + { + "epoch": 0.4811739718092296, + "grad_norm": 0.024094808846712112, + "learning_rate": 9.613601236476044e-05, + "loss": 0.0061, + "step": 623 + }, + { + "epoch": 0.4819463216837227, + "grad_norm": 0.03026052936911583, + "learning_rate": 9.629057187017002e-05, + "loss": 0.0063, + "step": 624 + }, + { + "epoch": 0.48271867155821585, + "grad_norm": 0.02954074554145336, + "learning_rate": 9.644513137557961e-05, + "loss": 0.0064, + "step": 625 + }, + { + "epoch": 0.483491021432709, + "grad_norm": 0.01834062486886978, + "learning_rate": 9.659969088098919e-05, + "loss": 0.0061, + "step": 626 + }, + { + "epoch": 0.48426337130720215, + "grad_norm": 0.026609912514686584, + "learning_rate": 9.675425038639876e-05, + "loss": 0.0067, + "step": 627 + }, + { + "epoch": 0.4850357211816953, + "grad_norm": 0.03172049671411514, + "learning_rate": 9.690880989180835e-05, + "loss": 0.0058, + "step": 628 + }, + { + "epoch": 0.48580807105618845, + "grad_norm": 0.03625147417187691, + "learning_rate": 9.706336939721794e-05, + "loss": 0.0063, + "step": 629 + }, + { + "epoch": 0.4865804209306816, + "grad_norm": 0.020125100389122963, + "learning_rate": 9.721792890262752e-05, + "loss": 0.0058, + "step": 630 + }, + { + "epoch": 0.48735277080517475, + "grad_norm": 0.07479507476091385, + "learning_rate": 9.737248840803709e-05, + "loss": 0.0074, + "step": 631 + }, + { + "epoch": 0.4881251206796679, + "grad_norm": 0.028870578855276108, + "learning_rate": 9.752704791344668e-05, + "loss": 0.0061, + "step": 632 + }, + { + "epoch": 0.48889747055416105, + "grad_norm": 0.04097558557987213, + "learning_rate": 9.768160741885626e-05, + "loss": 0.0066, + "step": 633 + }, + { + "epoch": 0.4896698204286542, + "grad_norm": 0.023863254114985466, + "learning_rate": 9.783616692426585e-05, + "loss": 0.0061, + "step": 634 + }, + { + "epoch": 0.49044217030314735, + "grad_norm": 0.037251200526952744, + "learning_rate": 9.799072642967543e-05, + "loss": 0.0064, + "step": 635 + }, + { + "epoch": 0.49121452017764045, + "grad_norm": 0.017656605690717697, + "learning_rate": 9.8145285935085e-05, + "loss": 0.0066, + "step": 636 + }, + { + "epoch": 0.4919868700521336, + "grad_norm": 0.013435768894851208, + "learning_rate": 9.829984544049459e-05, + "loss": 0.0064, + "step": 637 + }, + { + "epoch": 0.49275921992662675, + "grad_norm": 0.01997395232319832, + "learning_rate": 9.845440494590418e-05, + "loss": 0.0063, + "step": 638 + }, + { + "epoch": 0.4935315698011199, + "grad_norm": 0.049601081758737564, + "learning_rate": 9.860896445131376e-05, + "loss": 0.0065, + "step": 639 + }, + { + "epoch": 0.49430391967561305, + "grad_norm": 0.02850998379290104, + "learning_rate": 9.876352395672335e-05, + "loss": 0.0062, + "step": 640 + }, + { + "epoch": 0.4950762695501062, + "grad_norm": 0.0246591754257679, + "learning_rate": 9.891808346213292e-05, + "loss": 0.0062, + "step": 641 + }, + { + "epoch": 0.49584861942459935, + "grad_norm": 0.02419452928006649, + "learning_rate": 9.90726429675425e-05, + "loss": 0.0059, + "step": 642 + }, + { + "epoch": 0.4966209692990925, + "grad_norm": 0.024134468287229538, + "learning_rate": 9.922720247295209e-05, + "loss": 0.0066, + "step": 643 + }, + { + "epoch": 0.49739331917358565, + "grad_norm": 0.020593956112861633, + "learning_rate": 9.938176197836168e-05, + "loss": 0.0068, + "step": 644 + }, + { + "epoch": 0.4981656690480788, + "grad_norm": 0.030068911612033844, + "learning_rate": 9.953632148377126e-05, + "loss": 0.0058, + "step": 645 + }, + { + "epoch": 0.49893801892257195, + "grad_norm": 0.03695107623934746, + "learning_rate": 9.969088098918083e-05, + "loss": 0.0059, + "step": 646 + }, + { + "epoch": 0.4997103687970651, + "grad_norm": 0.01950264722108841, + "learning_rate": 9.984544049459042e-05, + "loss": 0.0058, + "step": 647 + }, + { + "epoch": 0.5004827186715582, + "grad_norm": 0.019529715180397034, + "learning_rate": 0.0001, + "loss": 0.0064, + "step": 648 + }, + { + "epoch": 0.5012550685460514, + "grad_norm": 0.01743905618786812, + "learning_rate": 9.999999272310408e-05, + "loss": 0.0069, + "step": 649 + }, + { + "epoch": 0.5020274184205445, + "grad_norm": 0.017296746373176575, + "learning_rate": 9.999997089241844e-05, + "loss": 0.0063, + "step": 650 + }, + { + "epoch": 0.5027997682950377, + "grad_norm": 0.021065376698970795, + "learning_rate": 9.999993450794945e-05, + "loss": 0.0065, + "step": 651 + }, + { + "epoch": 0.5035721181695308, + "grad_norm": 0.017609447240829468, + "learning_rate": 9.999988356970765e-05, + "loss": 0.0061, + "step": 652 + }, + { + "epoch": 0.504344468044024, + "grad_norm": 0.014331743121147156, + "learning_rate": 9.999981807770793e-05, + "loss": 0.0058, + "step": 653 + }, + { + "epoch": 0.505116817918517, + "grad_norm": 0.02973123826086521, + "learning_rate": 9.999973803196931e-05, + "loss": 0.0059, + "step": 654 + }, + { + "epoch": 0.5058891677930102, + "grad_norm": 0.06875317543745041, + "learning_rate": 9.99996434325151e-05, + "loss": 0.0066, + "step": 655 + }, + { + "epoch": 0.5066615176675033, + "grad_norm": 0.029321495443582535, + "learning_rate": 9.999953427937285e-05, + "loss": 0.0065, + "step": 656 + }, + { + "epoch": 0.5074338675419965, + "grad_norm": 0.02221604809165001, + "learning_rate": 9.999941057257431e-05, + "loss": 0.0061, + "step": 657 + }, + { + "epoch": 0.5082062174164896, + "grad_norm": 0.024562258273363113, + "learning_rate": 9.999927231215551e-05, + "loss": 0.0061, + "step": 658 + }, + { + "epoch": 0.5089785672909828, + "grad_norm": 0.05312330275774002, + "learning_rate": 9.999911949815668e-05, + "loss": 0.0057, + "step": 659 + }, + { + "epoch": 0.5097509171654759, + "grad_norm": 0.02948874980211258, + "learning_rate": 9.99989521306223e-05, + "loss": 0.007, + "step": 660 + }, + { + "epoch": 0.5105232670399691, + "grad_norm": 0.03101520985364914, + "learning_rate": 9.99987702096011e-05, + "loss": 0.0057, + "step": 661 + }, + { + "epoch": 0.5112956169144622, + "grad_norm": 0.027225030586123466, + "learning_rate": 9.999857373514601e-05, + "loss": 0.0069, + "step": 662 + }, + { + "epoch": 0.5120679667889554, + "grad_norm": 0.04047227278351784, + "learning_rate": 9.999836270731423e-05, + "loss": 0.0066, + "step": 663 + }, + { + "epoch": 0.5128403166634485, + "grad_norm": 0.0271292757242918, + "learning_rate": 9.999813712616719e-05, + "loss": 0.0066, + "step": 664 + }, + { + "epoch": 0.5136126665379417, + "grad_norm": 0.044453106820583344, + "learning_rate": 9.999789699177056e-05, + "loss": 0.0068, + "step": 665 + }, + { + "epoch": 0.5143850164124348, + "grad_norm": 0.03188185393810272, + "learning_rate": 9.999764230419422e-05, + "loss": 0.0062, + "step": 666 + }, + { + "epoch": 0.515157366286928, + "grad_norm": 0.033734098076820374, + "learning_rate": 9.999737306351232e-05, + "loss": 0.007, + "step": 667 + }, + { + "epoch": 0.5159297161614211, + "grad_norm": 0.035705890506505966, + "learning_rate": 9.99970892698032e-05, + "loss": 0.0058, + "step": 668 + }, + { + "epoch": 0.5167020660359143, + "grad_norm": 0.0311118196696043, + "learning_rate": 9.999679092314948e-05, + "loss": 0.0067, + "step": 669 + }, + { + "epoch": 0.5174744159104074, + "grad_norm": 0.04391526058316231, + "learning_rate": 9.999647802363803e-05, + "loss": 0.006, + "step": 670 + }, + { + "epoch": 0.5182467657849006, + "grad_norm": 0.01930868998169899, + "learning_rate": 9.999615057135989e-05, + "loss": 0.0063, + "step": 671 + }, + { + "epoch": 0.5190191156593937, + "grad_norm": 0.033360958099365234, + "learning_rate": 9.999580856641038e-05, + "loss": 0.0059, + "step": 672 + }, + { + "epoch": 0.5197914655338869, + "grad_norm": 0.020999480038881302, + "learning_rate": 9.999545200888907e-05, + "loss": 0.0063, + "step": 673 + }, + { + "epoch": 0.52056381540838, + "grad_norm": 0.05861787870526314, + "learning_rate": 9.999508089889971e-05, + "loss": 0.0062, + "step": 674 + }, + { + "epoch": 0.5213361652828732, + "grad_norm": 0.026002857834100723, + "learning_rate": 9.999469523655036e-05, + "loss": 0.0059, + "step": 675 + }, + { + "epoch": 0.5221085151573663, + "grad_norm": 0.017863426357507706, + "learning_rate": 9.999429502195326e-05, + "loss": 0.0058, + "step": 676 + }, + { + "epoch": 0.5228808650318595, + "grad_norm": 0.02639612928032875, + "learning_rate": 9.999388025522489e-05, + "loss": 0.0069, + "step": 677 + }, + { + "epoch": 0.5236532149063525, + "grad_norm": 0.025559836998581886, + "learning_rate": 9.9993450936486e-05, + "loss": 0.0054, + "step": 678 + }, + { + "epoch": 0.5244255647808457, + "grad_norm": 0.022854767739772797, + "learning_rate": 9.999300706586154e-05, + "loss": 0.0063, + "step": 679 + }, + { + "epoch": 0.5251979146553388, + "grad_norm": 0.016021009534597397, + "learning_rate": 9.999254864348073e-05, + "loss": 0.0055, + "step": 680 + }, + { + "epoch": 0.525970264529832, + "grad_norm": 0.016422376036643982, + "learning_rate": 9.999207566947698e-05, + "loss": 0.0061, + "step": 681 + }, + { + "epoch": 0.5267426144043251, + "grad_norm": 0.019435180351138115, + "learning_rate": 9.999158814398796e-05, + "loss": 0.0065, + "step": 682 + }, + { + "epoch": 0.5275149642788183, + "grad_norm": 0.022698726505041122, + "learning_rate": 9.999108606715561e-05, + "loss": 0.0061, + "step": 683 + }, + { + "epoch": 0.5282873141533114, + "grad_norm": 0.017409764230251312, + "learning_rate": 9.999056943912603e-05, + "loss": 0.0053, + "step": 684 + }, + { + "epoch": 0.5290596640278046, + "grad_norm": 0.019013019278645515, + "learning_rate": 9.999003826004964e-05, + "loss": 0.0061, + "step": 685 + }, + { + "epoch": 0.5298320139022977, + "grad_norm": 0.018017224967479706, + "learning_rate": 9.998949253008103e-05, + "loss": 0.0062, + "step": 686 + }, + { + "epoch": 0.5306043637767909, + "grad_norm": 0.042635902762413025, + "learning_rate": 9.998893224937904e-05, + "loss": 0.0063, + "step": 687 + }, + { + "epoch": 0.531376713651284, + "grad_norm": 0.01615188457071781, + "learning_rate": 9.998835741810677e-05, + "loss": 0.0056, + "step": 688 + }, + { + "epoch": 0.5321490635257772, + "grad_norm": 0.022535262629389763, + "learning_rate": 9.998776803643155e-05, + "loss": 0.0065, + "step": 689 + }, + { + "epoch": 0.5329214134002703, + "grad_norm": 0.034012194722890854, + "learning_rate": 9.99871641045249e-05, + "loss": 0.0061, + "step": 690 + }, + { + "epoch": 0.5336937632747635, + "grad_norm": 0.020539700984954834, + "learning_rate": 9.998654562256265e-05, + "loss": 0.005, + "step": 691 + }, + { + "epoch": 0.5344661131492566, + "grad_norm": 0.03489365801215172, + "learning_rate": 9.998591259072479e-05, + "loss": 0.0062, + "step": 692 + }, + { + "epoch": 0.5352384630237498, + "grad_norm": 0.016758328303694725, + "learning_rate": 9.998526500919558e-05, + "loss": 0.0057, + "step": 693 + }, + { + "epoch": 0.5360108128982429, + "grad_norm": 0.04345661774277687, + "learning_rate": 9.998460287816355e-05, + "loss": 0.0062, + "step": 694 + }, + { + "epoch": 0.5367831627727361, + "grad_norm": 0.028332583606243134, + "learning_rate": 9.998392619782142e-05, + "loss": 0.0061, + "step": 695 + }, + { + "epoch": 0.5375555126472292, + "grad_norm": 0.017350120469927788, + "learning_rate": 9.998323496836613e-05, + "loss": 0.0066, + "step": 696 + }, + { + "epoch": 0.5383278625217224, + "grad_norm": 0.05254344269633293, + "learning_rate": 9.99825291899989e-05, + "loss": 0.0069, + "step": 697 + }, + { + "epoch": 0.5391002123962155, + "grad_norm": 0.02194729447364807, + "learning_rate": 9.998180886292517e-05, + "loss": 0.0059, + "step": 698 + }, + { + "epoch": 0.5398725622707087, + "grad_norm": 0.02504734694957733, + "learning_rate": 9.998107398735459e-05, + "loss": 0.0061, + "step": 699 + }, + { + "epoch": 0.5406449121452018, + "grad_norm": 0.048992741852998734, + "learning_rate": 9.998032456350108e-05, + "loss": 0.0066, + "step": 700 + }, + { + "epoch": 0.5414172620196949, + "grad_norm": 0.016226578503847122, + "learning_rate": 9.997956059158278e-05, + "loss": 0.0062, + "step": 701 + }, + { + "epoch": 0.542189611894188, + "grad_norm": 0.03624337911605835, + "learning_rate": 9.997878207182205e-05, + "loss": 0.0065, + "step": 702 + }, + { + "epoch": 0.5429619617686812, + "grad_norm": 0.05948558449745178, + "learning_rate": 9.99779890044455e-05, + "loss": 0.0061, + "step": 703 + }, + { + "epoch": 0.5437343116431743, + "grad_norm": 0.01626715436577797, + "learning_rate": 9.9977181389684e-05, + "loss": 0.0058, + "step": 704 + }, + { + "epoch": 0.5445066615176675, + "grad_norm": 0.041707947850227356, + "learning_rate": 9.99763592277726e-05, + "loss": 0.0063, + "step": 705 + }, + { + "epoch": 0.5452790113921606, + "grad_norm": 0.05031125247478485, + "learning_rate": 9.997552251895061e-05, + "loss": 0.0062, + "step": 706 + }, + { + "epoch": 0.5460513612666538, + "grad_norm": 0.022725578397512436, + "learning_rate": 9.99746712634616e-05, + "loss": 0.0054, + "step": 707 + }, + { + "epoch": 0.5468237111411469, + "grad_norm": 0.042997218668460846, + "learning_rate": 9.997380546155333e-05, + "loss": 0.0067, + "step": 708 + }, + { + "epoch": 0.5475960610156401, + "grad_norm": 0.03495806083083153, + "learning_rate": 9.99729251134778e-05, + "loss": 0.006, + "step": 709 + }, + { + "epoch": 0.5483684108901332, + "grad_norm": 0.01554971281439066, + "learning_rate": 9.99720302194913e-05, + "loss": 0.0061, + "step": 710 + }, + { + "epoch": 0.5491407607646264, + "grad_norm": 0.027845371514558792, + "learning_rate": 9.997112077985428e-05, + "loss": 0.0064, + "step": 711 + }, + { + "epoch": 0.5499131106391195, + "grad_norm": 0.06307762861251831, + "learning_rate": 9.997019679483145e-05, + "loss": 0.0065, + "step": 712 + }, + { + "epoch": 0.5506854605136127, + "grad_norm": 0.024832166731357574, + "learning_rate": 9.99692582646918e-05, + "loss": 0.0057, + "step": 713 + }, + { + "epoch": 0.5514578103881058, + "grad_norm": 0.055046964436769485, + "learning_rate": 9.996830518970847e-05, + "loss": 0.0065, + "step": 714 + }, + { + "epoch": 0.552230160262599, + "grad_norm": 0.025708051398396492, + "learning_rate": 9.99673375701589e-05, + "loss": 0.006, + "step": 715 + }, + { + "epoch": 0.5530025101370921, + "grad_norm": 0.02331097424030304, + "learning_rate": 9.996635540632473e-05, + "loss": 0.006, + "step": 716 + }, + { + "epoch": 0.5537748600115853, + "grad_norm": 0.03324016556143761, + "learning_rate": 9.996535869849186e-05, + "loss": 0.0063, + "step": 717 + }, + { + "epoch": 0.5545472098860784, + "grad_norm": 0.03681979700922966, + "learning_rate": 9.996434744695038e-05, + "loss": 0.0061, + "step": 718 + }, + { + "epoch": 0.5553195597605716, + "grad_norm": 0.016809049993753433, + "learning_rate": 9.996332165199466e-05, + "loss": 0.0061, + "step": 719 + }, + { + "epoch": 0.5560919096350647, + "grad_norm": 0.0409129299223423, + "learning_rate": 9.996228131392329e-05, + "loss": 0.0058, + "step": 720 + }, + { + "epoch": 0.5568642595095579, + "grad_norm": 0.02070578932762146, + "learning_rate": 9.99612264330391e-05, + "loss": 0.0057, + "step": 721 + }, + { + "epoch": 0.557636609384051, + "grad_norm": 0.022130563855171204, + "learning_rate": 9.996015700964908e-05, + "loss": 0.0056, + "step": 722 + }, + { + "epoch": 0.5584089592585442, + "grad_norm": 0.04914633929729462, + "learning_rate": 9.995907304406457e-05, + "loss": 0.0066, + "step": 723 + }, + { + "epoch": 0.5591813091330373, + "grad_norm": 0.02824830636382103, + "learning_rate": 9.995797453660107e-05, + "loss": 0.0057, + "step": 724 + }, + { + "epoch": 0.5599536590075304, + "grad_norm": 0.01932264119386673, + "learning_rate": 9.995686148757833e-05, + "loss": 0.0058, + "step": 725 + }, + { + "epoch": 0.5607260088820235, + "grad_norm": 0.034268081188201904, + "learning_rate": 9.995573389732032e-05, + "loss": 0.0058, + "step": 726 + }, + { + "epoch": 0.5614983587565167, + "grad_norm": 0.018359249457716942, + "learning_rate": 9.995459176615527e-05, + "loss": 0.0065, + "step": 727 + }, + { + "epoch": 0.5622707086310098, + "grad_norm": 0.021127983927726746, + "learning_rate": 9.995343509441561e-05, + "loss": 0.0062, + "step": 728 + }, + { + "epoch": 0.563043058505503, + "grad_norm": 0.026875965297222137, + "learning_rate": 9.995226388243804e-05, + "loss": 0.006, + "step": 729 + }, + { + "epoch": 0.5638154083799961, + "grad_norm": 0.022183291614055634, + "learning_rate": 9.995107813056347e-05, + "loss": 0.0056, + "step": 730 + }, + { + "epoch": 0.5645877582544893, + "grad_norm": 0.025123730301856995, + "learning_rate": 9.994987783913704e-05, + "loss": 0.0061, + "step": 731 + }, + { + "epoch": 0.5653601081289824, + "grad_norm": 0.03460489585995674, + "learning_rate": 9.994866300850809e-05, + "loss": 0.0054, + "step": 732 + }, + { + "epoch": 0.5661324580034756, + "grad_norm": 0.012985138222575188, + "learning_rate": 9.994743363903028e-05, + "loss": 0.0052, + "step": 733 + }, + { + "epoch": 0.5669048078779687, + "grad_norm": 0.01988379657268524, + "learning_rate": 9.994618973106142e-05, + "loss": 0.0053, + "step": 734 + }, + { + "epoch": 0.5676771577524619, + "grad_norm": 0.03403865545988083, + "learning_rate": 9.994493128496359e-05, + "loss": 0.0057, + "step": 735 + }, + { + "epoch": 0.568449507626955, + "grad_norm": 0.019713513553142548, + "learning_rate": 9.994365830110311e-05, + "loss": 0.0052, + "step": 736 + }, + { + "epoch": 0.5692218575014482, + "grad_norm": 0.02123085968196392, + "learning_rate": 9.994237077985048e-05, + "loss": 0.0059, + "step": 737 + }, + { + "epoch": 0.5699942073759413, + "grad_norm": 0.021393541246652603, + "learning_rate": 9.99410687215805e-05, + "loss": 0.0047, + "step": 738 + }, + { + "epoch": 0.5707665572504345, + "grad_norm": 0.019972041249275208, + "learning_rate": 9.993975212667212e-05, + "loss": 0.0054, + "step": 739 + }, + { + "epoch": 0.5715389071249276, + "grad_norm": 0.01621190644800663, + "learning_rate": 9.993842099550863e-05, + "loss": 0.0054, + "step": 740 + }, + { + "epoch": 0.5723112569994208, + "grad_norm": 0.0192471481859684, + "learning_rate": 9.993707532847745e-05, + "loss": 0.005, + "step": 741 + }, + { + "epoch": 0.5730836068739139, + "grad_norm": 0.026484373956918716, + "learning_rate": 9.993571512597028e-05, + "loss": 0.0055, + "step": 742 + }, + { + "epoch": 0.5738559567484071, + "grad_norm": 0.01591717079281807, + "learning_rate": 9.993434038838306e-05, + "loss": 0.0055, + "step": 743 + }, + { + "epoch": 0.5746283066229002, + "grad_norm": 0.020151285454630852, + "learning_rate": 9.993295111611592e-05, + "loss": 0.0054, + "step": 744 + }, + { + "epoch": 0.5754006564973934, + "grad_norm": 0.015028299763798714, + "learning_rate": 9.993154730957326e-05, + "loss": 0.0059, + "step": 745 + }, + { + "epoch": 0.5761730063718865, + "grad_norm": 0.012499259784817696, + "learning_rate": 9.993012896916368e-05, + "loss": 0.0049, + "step": 746 + }, + { + "epoch": 0.5769453562463797, + "grad_norm": 0.019595742225646973, + "learning_rate": 9.992869609530001e-05, + "loss": 0.0053, + "step": 747 + }, + { + "epoch": 0.5777177061208728, + "grad_norm": 0.026846617460250854, + "learning_rate": 9.992724868839935e-05, + "loss": 0.0059, + "step": 748 + }, + { + "epoch": 0.5784900559953658, + "grad_norm": 0.043270599097013474, + "learning_rate": 9.992578674888302e-05, + "loss": 0.0059, + "step": 749 + }, + { + "epoch": 0.579262405869859, + "grad_norm": 0.03132093697786331, + "learning_rate": 9.992431027717652e-05, + "loss": 0.0055, + "step": 750 + }, + { + "epoch": 0.5800347557443521, + "grad_norm": 0.04177276790142059, + "learning_rate": 9.992281927370963e-05, + "loss": 0.0055, + "step": 751 + }, + { + "epoch": 0.5808071056188453, + "grad_norm": 0.018702374771237373, + "learning_rate": 9.992131373891635e-05, + "loss": 0.0056, + "step": 752 + }, + { + "epoch": 0.5815794554933384, + "grad_norm": 0.031128762289881706, + "learning_rate": 9.991979367323491e-05, + "loss": 0.0058, + "step": 753 + }, + { + "epoch": 0.5823518053678316, + "grad_norm": 0.028675846755504608, + "learning_rate": 9.991825907710775e-05, + "loss": 0.0051, + "step": 754 + }, + { + "epoch": 0.5831241552423247, + "grad_norm": 0.01913781464099884, + "learning_rate": 9.991670995098155e-05, + "loss": 0.0052, + "step": 755 + }, + { + "epoch": 0.5838965051168179, + "grad_norm": 0.02314949221909046, + "learning_rate": 9.991514629530723e-05, + "loss": 0.0058, + "step": 756 + }, + { + "epoch": 0.584668854991311, + "grad_norm": 0.0678134635090828, + "learning_rate": 9.991356811053994e-05, + "loss": 0.0058, + "step": 757 + }, + { + "epoch": 0.5854412048658042, + "grad_norm": 0.013736193999648094, + "learning_rate": 9.991197539713903e-05, + "loss": 0.0047, + "step": 758 + }, + { + "epoch": 0.5862135547402973, + "grad_norm": 0.059933267533779144, + "learning_rate": 9.991036815556814e-05, + "loss": 0.0062, + "step": 759 + }, + { + "epoch": 0.5869859046147905, + "grad_norm": 0.04519271478056908, + "learning_rate": 9.990874638629506e-05, + "loss": 0.006, + "step": 760 + }, + { + "epoch": 0.5877582544892836, + "grad_norm": 0.018265997990965843, + "learning_rate": 9.990711008979187e-05, + "loss": 0.0057, + "step": 761 + }, + { + "epoch": 0.5885306043637768, + "grad_norm": 0.04235892370343208, + "learning_rate": 9.990545926653485e-05, + "loss": 0.0055, + "step": 762 + }, + { + "epoch": 0.58930295423827, + "grad_norm": 0.04719007760286331, + "learning_rate": 9.990379391700451e-05, + "loss": 0.0057, + "step": 763 + }, + { + "epoch": 0.5900753041127631, + "grad_norm": 0.03028462827205658, + "learning_rate": 9.990211404168561e-05, + "loss": 0.0056, + "step": 764 + }, + { + "epoch": 0.5908476539872562, + "grad_norm": 0.02930932678282261, + "learning_rate": 9.990041964106708e-05, + "loss": 0.0054, + "step": 765 + }, + { + "epoch": 0.5916200038617494, + "grad_norm": 0.023444976657629013, + "learning_rate": 9.989871071564217e-05, + "loss": 0.0051, + "step": 766 + }, + { + "epoch": 0.5923923537362425, + "grad_norm": 0.021648840978741646, + "learning_rate": 9.989698726590829e-05, + "loss": 0.0056, + "step": 767 + }, + { + "epoch": 0.5931647036107357, + "grad_norm": 0.03562987968325615, + "learning_rate": 9.989524929236707e-05, + "loss": 0.0054, + "step": 768 + }, + { + "epoch": 0.5939370534852288, + "grad_norm": 0.0237098541110754, + "learning_rate": 9.989349679552441e-05, + "loss": 0.0056, + "step": 769 + }, + { + "epoch": 0.594709403359722, + "grad_norm": 0.032227423042058945, + "learning_rate": 9.989172977589043e-05, + "loss": 0.0057, + "step": 770 + }, + { + "epoch": 0.5954817532342151, + "grad_norm": 0.03126399964094162, + "learning_rate": 9.988994823397946e-05, + "loss": 0.0054, + "step": 771 + }, + { + "epoch": 0.5962541031087083, + "grad_norm": 0.012847068719565868, + "learning_rate": 9.988815217031005e-05, + "loss": 0.0054, + "step": 772 + }, + { + "epoch": 0.5970264529832013, + "grad_norm": 0.014000168070197105, + "learning_rate": 9.988634158540501e-05, + "loss": 0.005, + "step": 773 + }, + { + "epoch": 0.5977988028576945, + "grad_norm": 0.0342111699283123, + "learning_rate": 9.988451647979134e-05, + "loss": 0.0054, + "step": 774 + }, + { + "epoch": 0.5985711527321876, + "grad_norm": 0.017097115516662598, + "learning_rate": 9.98826768540003e-05, + "loss": 0.0056, + "step": 775 + }, + { + "epoch": 0.5993435026066808, + "grad_norm": 0.02337076887488365, + "learning_rate": 9.988082270856735e-05, + "loss": 0.0058, + "step": 776 + }, + { + "epoch": 0.6001158524811739, + "grad_norm": 0.020811082795262337, + "learning_rate": 9.98789540440322e-05, + "loss": 0.0048, + "step": 777 + }, + { + "epoch": 0.6008882023556671, + "grad_norm": 0.021415896713733673, + "learning_rate": 9.987707086093876e-05, + "loss": 0.0056, + "step": 778 + }, + { + "epoch": 0.6016605522301602, + "grad_norm": 0.022532187402248383, + "learning_rate": 9.987517315983517e-05, + "loss": 0.0052, + "step": 779 + }, + { + "epoch": 0.6024329021046534, + "grad_norm": 0.015836693346500397, + "learning_rate": 9.987326094127383e-05, + "loss": 0.0053, + "step": 780 + }, + { + "epoch": 0.6032052519791465, + "grad_norm": 0.026889432221651077, + "learning_rate": 9.987133420581133e-05, + "loss": 0.0051, + "step": 781 + }, + { + "epoch": 0.6039776018536397, + "grad_norm": 0.016825655475258827, + "learning_rate": 9.98693929540085e-05, + "loss": 0.0057, + "step": 782 + }, + { + "epoch": 0.6047499517281328, + "grad_norm": 0.013712005689740181, + "learning_rate": 9.986743718643037e-05, + "loss": 0.0052, + "step": 783 + }, + { + "epoch": 0.605522301602626, + "grad_norm": 0.014379930682480335, + "learning_rate": 9.986546690364625e-05, + "loss": 0.0055, + "step": 784 + }, + { + "epoch": 0.6062946514771191, + "grad_norm": 0.015972580760717392, + "learning_rate": 9.986348210622961e-05, + "loss": 0.0051, + "step": 785 + }, + { + "epoch": 0.6070670013516123, + "grad_norm": 0.012665828689932823, + "learning_rate": 9.98614827947582e-05, + "loss": 0.0052, + "step": 786 + }, + { + "epoch": 0.6078393512261054, + "grad_norm": 0.015817679464817047, + "learning_rate": 9.985946896981396e-05, + "loss": 0.0064, + "step": 787 + }, + { + "epoch": 0.6086117011005986, + "grad_norm": 0.02141885831952095, + "learning_rate": 9.985744063198305e-05, + "loss": 0.0057, + "step": 788 + }, + { + "epoch": 0.6093840509750917, + "grad_norm": 0.019856026396155357, + "learning_rate": 9.985539778185591e-05, + "loss": 0.0056, + "step": 789 + }, + { + "epoch": 0.6101564008495849, + "grad_norm": 0.014455040916800499, + "learning_rate": 9.985334042002714e-05, + "loss": 0.0058, + "step": 790 + }, + { + "epoch": 0.610928750724078, + "grad_norm": 0.012989351525902748, + "learning_rate": 9.985126854709559e-05, + "loss": 0.0055, + "step": 791 + }, + { + "epoch": 0.6117011005985712, + "grad_norm": 0.0151575468480587, + "learning_rate": 9.984918216366435e-05, + "loss": 0.0053, + "step": 792 + }, + { + "epoch": 0.6124734504730643, + "grad_norm": 0.018324896693229675, + "learning_rate": 9.984708127034067e-05, + "loss": 0.0052, + "step": 793 + }, + { + "epoch": 0.6132458003475575, + "grad_norm": 0.016394076868891716, + "learning_rate": 9.984496586773611e-05, + "loss": 0.0056, + "step": 794 + }, + { + "epoch": 0.6140181502220506, + "grad_norm": 0.016955038532614708, + "learning_rate": 9.98428359564664e-05, + "loss": 0.0058, + "step": 795 + }, + { + "epoch": 0.6147905000965437, + "grad_norm": 0.01672198437154293, + "learning_rate": 9.98406915371515e-05, + "loss": 0.005, + "step": 796 + }, + { + "epoch": 0.6155628499710368, + "grad_norm": 0.018940173089504242, + "learning_rate": 9.983853261041561e-05, + "loss": 0.0055, + "step": 797 + }, + { + "epoch": 0.61633519984553, + "grad_norm": 0.015106343664228916, + "learning_rate": 9.983635917688714e-05, + "loss": 0.0046, + "step": 798 + }, + { + "epoch": 0.6171075497200231, + "grad_norm": 0.03357703983783722, + "learning_rate": 9.983417123719872e-05, + "loss": 0.0055, + "step": 799 + }, + { + "epoch": 0.6178798995945163, + "grad_norm": 0.02359483204782009, + "learning_rate": 9.983196879198721e-05, + "loss": 0.005, + "step": 800 + }, + { + "epoch": 0.6186522494690094, + "grad_norm": 0.02851145900785923, + "learning_rate": 9.982975184189367e-05, + "loss": 0.0057, + "step": 801 + }, + { + "epoch": 0.6194245993435026, + "grad_norm": 0.05479593202471733, + "learning_rate": 9.982752038756344e-05, + "loss": 0.0054, + "step": 802 + }, + { + "epoch": 0.6201969492179957, + "grad_norm": 0.030343232676386833, + "learning_rate": 9.9825274429646e-05, + "loss": 0.0059, + "step": 803 + }, + { + "epoch": 0.6209692990924889, + "grad_norm": 0.03817446529865265, + "learning_rate": 9.982301396879512e-05, + "loss": 0.0053, + "step": 804 + }, + { + "epoch": 0.621741648966982, + "grad_norm": 0.02437039092183113, + "learning_rate": 9.982073900566876e-05, + "loss": 0.0058, + "step": 805 + }, + { + "epoch": 0.6225139988414752, + "grad_norm": 0.015008049085736275, + "learning_rate": 9.98184495409291e-05, + "loss": 0.0055, + "step": 806 + }, + { + "epoch": 0.6232863487159683, + "grad_norm": 0.03220735862851143, + "learning_rate": 9.981614557524254e-05, + "loss": 0.0056, + "step": 807 + }, + { + "epoch": 0.6240586985904615, + "grad_norm": 0.03285316750407219, + "learning_rate": 9.981382710927974e-05, + "loss": 0.0054, + "step": 808 + }, + { + "epoch": 0.6248310484649546, + "grad_norm": 0.028768053278326988, + "learning_rate": 9.981149414371553e-05, + "loss": 0.0049, + "step": 809 + }, + { + "epoch": 0.6256033983394478, + "grad_norm": 0.01450389251112938, + "learning_rate": 9.980914667922898e-05, + "loss": 0.0053, + "step": 810 + }, + { + "epoch": 0.6263757482139409, + "grad_norm": 0.025595488026738167, + "learning_rate": 9.980678471650337e-05, + "loss": 0.0054, + "step": 811 + }, + { + "epoch": 0.6271480980884341, + "grad_norm": 0.02557152882218361, + "learning_rate": 9.980440825622622e-05, + "loss": 0.0048, + "step": 812 + }, + { + "epoch": 0.6279204479629272, + "grad_norm": 0.013808295130729675, + "learning_rate": 9.980201729908926e-05, + "loss": 0.0052, + "step": 813 + }, + { + "epoch": 0.6286927978374204, + "grad_norm": 0.015615541487932205, + "learning_rate": 9.979961184578847e-05, + "loss": 0.0059, + "step": 814 + }, + { + "epoch": 0.6294651477119135, + "grad_norm": 0.01616467721760273, + "learning_rate": 9.979719189702397e-05, + "loss": 0.0051, + "step": 815 + }, + { + "epoch": 0.6302374975864067, + "grad_norm": 0.01760113425552845, + "learning_rate": 9.979475745350018e-05, + "loss": 0.005, + "step": 816 + }, + { + "epoch": 0.6310098474608998, + "grad_norm": 0.014776479452848434, + "learning_rate": 9.979230851592567e-05, + "loss": 0.005, + "step": 817 + }, + { + "epoch": 0.631782197335393, + "grad_norm": 0.014517302624881268, + "learning_rate": 9.978984508501332e-05, + "loss": 0.0053, + "step": 818 + }, + { + "epoch": 0.6325545472098861, + "grad_norm": 0.013414164073765278, + "learning_rate": 9.978736716148013e-05, + "loss": 0.0051, + "step": 819 + }, + { + "epoch": 0.6333268970843792, + "grad_norm": 0.018927980214357376, + "learning_rate": 9.978487474604741e-05, + "loss": 0.0053, + "step": 820 + }, + { + "epoch": 0.6340992469588723, + "grad_norm": 0.01567482389509678, + "learning_rate": 9.978236783944059e-05, + "loss": 0.0055, + "step": 821 + }, + { + "epoch": 0.6348715968333655, + "grad_norm": 0.015856629237532616, + "learning_rate": 9.97798464423894e-05, + "loss": 0.0057, + "step": 822 + }, + { + "epoch": 0.6356439467078586, + "grad_norm": 0.019539091736078262, + "learning_rate": 9.977731055562775e-05, + "loss": 0.0058, + "step": 823 + }, + { + "epoch": 0.6364162965823518, + "grad_norm": 0.015069976449012756, + "learning_rate": 9.977476017989377e-05, + "loss": 0.0058, + "step": 824 + }, + { + "epoch": 0.6371886464568449, + "grad_norm": 0.01712663099169731, + "learning_rate": 9.977219531592984e-05, + "loss": 0.0058, + "step": 825 + }, + { + "epoch": 0.6379609963313381, + "grad_norm": 0.015889057889580727, + "learning_rate": 9.97696159644825e-05, + "loss": 0.0053, + "step": 826 + }, + { + "epoch": 0.6387333462058312, + "grad_norm": 0.024051977321505547, + "learning_rate": 9.976702212630255e-05, + "loss": 0.0055, + "step": 827 + }, + { + "epoch": 0.6395056960803244, + "grad_norm": 0.018556464463472366, + "learning_rate": 9.976441380214499e-05, + "loss": 0.0054, + "step": 828 + }, + { + "epoch": 0.6402780459548175, + "grad_norm": 0.013441438786685467, + "learning_rate": 9.976179099276903e-05, + "loss": 0.0053, + "step": 829 + }, + { + "epoch": 0.6410503958293107, + "grad_norm": 0.010628352873027325, + "learning_rate": 9.975915369893813e-05, + "loss": 0.0051, + "step": 830 + }, + { + "epoch": 0.6418227457038038, + "grad_norm": 0.019367951899766922, + "learning_rate": 9.975650192141992e-05, + "loss": 0.0051, + "step": 831 + }, + { + "epoch": 0.642595095578297, + "grad_norm": 0.013856122270226479, + "learning_rate": 9.975383566098628e-05, + "loss": 0.0049, + "step": 832 + }, + { + "epoch": 0.6433674454527901, + "grad_norm": 0.02479681186378002, + "learning_rate": 9.975115491841329e-05, + "loss": 0.0054, + "step": 833 + }, + { + "epoch": 0.6441397953272833, + "grad_norm": 0.01671089604496956, + "learning_rate": 9.974845969448127e-05, + "loss": 0.0055, + "step": 834 + }, + { + "epoch": 0.6449121452017764, + "grad_norm": 0.03032640554010868, + "learning_rate": 9.974574998997471e-05, + "loss": 0.0061, + "step": 835 + }, + { + "epoch": 0.6456844950762696, + "grad_norm": 0.036496784538030624, + "learning_rate": 9.974302580568232e-05, + "loss": 0.0054, + "step": 836 + }, + { + "epoch": 0.6464568449507627, + "grad_norm": 0.014118451625108719, + "learning_rate": 9.974028714239709e-05, + "loss": 0.0054, + "step": 837 + }, + { + "epoch": 0.6472291948252559, + "grad_norm": 0.046305056661367416, + "learning_rate": 9.973753400091616e-05, + "loss": 0.0058, + "step": 838 + }, + { + "epoch": 0.648001544699749, + "grad_norm": 0.020002854987978935, + "learning_rate": 9.97347663820409e-05, + "loss": 0.0059, + "step": 839 + }, + { + "epoch": 0.6487738945742422, + "grad_norm": 0.03489231690764427, + "learning_rate": 9.973198428657688e-05, + "loss": 0.0056, + "step": 840 + }, + { + "epoch": 0.6495462444487353, + "grad_norm": 0.03892209753394127, + "learning_rate": 9.972918771533394e-05, + "loss": 0.0059, + "step": 841 + }, + { + "epoch": 0.6503185943232285, + "grad_norm": 0.011789434589445591, + "learning_rate": 9.972637666912607e-05, + "loss": 0.0049, + "step": 842 + }, + { + "epoch": 0.6510909441977216, + "grad_norm": 0.032112717628479004, + "learning_rate": 9.97235511487715e-05, + "loss": 0.0054, + "step": 843 + }, + { + "epoch": 0.6518632940722147, + "grad_norm": 0.050589319318532944, + "learning_rate": 9.972071115509266e-05, + "loss": 0.0058, + "step": 844 + }, + { + "epoch": 0.6526356439467078, + "grad_norm": 0.012880904600024223, + "learning_rate": 9.971785668891623e-05, + "loss": 0.005, + "step": 845 + }, + { + "epoch": 0.653407993821201, + "grad_norm": 0.046898335218429565, + "learning_rate": 9.971498775107305e-05, + "loss": 0.0057, + "step": 846 + }, + { + "epoch": 0.6541803436956941, + "grad_norm": 0.03865799680352211, + "learning_rate": 9.971210434239822e-05, + "loss": 0.0052, + "step": 847 + }, + { + "epoch": 0.6549526935701873, + "grad_norm": 0.015834303572773933, + "learning_rate": 9.9709206463731e-05, + "loss": 0.005, + "step": 848 + }, + { + "epoch": 0.6557250434446804, + "grad_norm": 0.016615983098745346, + "learning_rate": 9.970629411591494e-05, + "loss": 0.005, + "step": 849 + }, + { + "epoch": 0.6564973933191736, + "grad_norm": 0.0366445891559124, + "learning_rate": 9.970336729979772e-05, + "loss": 0.0053, + "step": 850 + }, + { + "epoch": 0.6572697431936667, + "grad_norm": 0.02086579240858555, + "learning_rate": 9.970042601623127e-05, + "loss": 0.0052, + "step": 851 + }, + { + "epoch": 0.6580420930681599, + "grad_norm": 0.021980177611112595, + "learning_rate": 9.969747026607172e-05, + "loss": 0.0053, + "step": 852 + }, + { + "epoch": 0.658814442942653, + "grad_norm": 0.029278066009283066, + "learning_rate": 9.969450005017944e-05, + "loss": 0.0052, + "step": 853 + }, + { + "epoch": 0.6595867928171462, + "grad_norm": 0.020770156756043434, + "learning_rate": 9.969151536941897e-05, + "loss": 0.0054, + "step": 854 + }, + { + "epoch": 0.6603591426916393, + "grad_norm": 0.014317609369754791, + "learning_rate": 9.968851622465907e-05, + "loss": 0.0049, + "step": 855 + }, + { + "epoch": 0.6611314925661325, + "grad_norm": 0.036996036767959595, + "learning_rate": 9.968550261677274e-05, + "loss": 0.0059, + "step": 856 + }, + { + "epoch": 0.6619038424406256, + "grad_norm": 0.017970040440559387, + "learning_rate": 9.968247454663717e-05, + "loss": 0.0048, + "step": 857 + }, + { + "epoch": 0.6626761923151188, + "grad_norm": 0.014349796809256077, + "learning_rate": 9.967943201513374e-05, + "loss": 0.0052, + "step": 858 + }, + { + "epoch": 0.6634485421896119, + "grad_norm": 0.023744618520140648, + "learning_rate": 9.967637502314806e-05, + "loss": 0.0049, + "step": 859 + }, + { + "epoch": 0.6642208920641051, + "grad_norm": 0.03156176209449768, + "learning_rate": 9.967330357156996e-05, + "loss": 0.0054, + "step": 860 + }, + { + "epoch": 0.6649932419385982, + "grad_norm": 0.019208727404475212, + "learning_rate": 9.967021766129345e-05, + "loss": 0.0052, + "step": 861 + }, + { + "epoch": 0.6657655918130914, + "grad_norm": 0.037500545382499695, + "learning_rate": 9.966711729321679e-05, + "loss": 0.005, + "step": 862 + }, + { + "epoch": 0.6665379416875845, + "grad_norm": 0.040097303688526154, + "learning_rate": 9.966400246824238e-05, + "loss": 0.0058, + "step": 863 + }, + { + "epoch": 0.6673102915620777, + "grad_norm": 0.020316198468208313, + "learning_rate": 9.966087318727691e-05, + "loss": 0.0049, + "step": 864 + }, + { + "epoch": 0.6680826414365708, + "grad_norm": 0.04022366181015968, + "learning_rate": 9.96577294512312e-05, + "loss": 0.0057, + "step": 865 + }, + { + "epoch": 0.668854991311064, + "grad_norm": 0.017211418598890305, + "learning_rate": 9.965457126102036e-05, + "loss": 0.0049, + "step": 866 + }, + { + "epoch": 0.6696273411855571, + "grad_norm": 0.018000515177845955, + "learning_rate": 9.965139861756362e-05, + "loss": 0.0047, + "step": 867 + }, + { + "epoch": 0.6703996910600502, + "grad_norm": 0.02290462702512741, + "learning_rate": 9.964821152178451e-05, + "loss": 0.0051, + "step": 868 + }, + { + "epoch": 0.6711720409345433, + "grad_norm": 0.020106492564082146, + "learning_rate": 9.964500997461065e-05, + "loss": 0.005, + "step": 869 + }, + { + "epoch": 0.6719443908090365, + "grad_norm": 0.05181401968002319, + "learning_rate": 9.9641793976974e-05, + "loss": 0.0054, + "step": 870 + }, + { + "epoch": 0.6727167406835296, + "grad_norm": 0.03370905667543411, + "learning_rate": 9.963856352981062e-05, + "loss": 0.0057, + "step": 871 + }, + { + "epoch": 0.6734890905580228, + "grad_norm": 0.02146386355161667, + "learning_rate": 9.963531863406082e-05, + "loss": 0.0056, + "step": 872 + }, + { + "epoch": 0.6742614404325159, + "grad_norm": 0.023943284526467323, + "learning_rate": 9.963205929066912e-05, + "loss": 0.0057, + "step": 873 + }, + { + "epoch": 0.675033790307009, + "grad_norm": 0.023869581520557404, + "learning_rate": 9.962878550058422e-05, + "loss": 0.0053, + "step": 874 + }, + { + "epoch": 0.6758061401815022, + "grad_norm": 0.025636158883571625, + "learning_rate": 9.962549726475906e-05, + "loss": 0.0055, + "step": 875 + }, + { + "epoch": 0.6765784900559954, + "grad_norm": 0.016352355480194092, + "learning_rate": 9.962219458415077e-05, + "loss": 0.0051, + "step": 876 + }, + { + "epoch": 0.6773508399304885, + "grad_norm": 0.017515957355499268, + "learning_rate": 9.961887745972065e-05, + "loss": 0.005, + "step": 877 + }, + { + "epoch": 0.6781231898049817, + "grad_norm": 0.021827103570103645, + "learning_rate": 9.961554589243424e-05, + "loss": 0.0051, + "step": 878 + }, + { + "epoch": 0.6788955396794748, + "grad_norm": 0.02612106315791607, + "learning_rate": 9.961219988326132e-05, + "loss": 0.0056, + "step": 879 + }, + { + "epoch": 0.679667889553968, + "grad_norm": 0.017447635531425476, + "learning_rate": 9.960883943317579e-05, + "loss": 0.0052, + "step": 880 + }, + { + "epoch": 0.6804402394284611, + "grad_norm": 0.023314133286476135, + "learning_rate": 9.960546454315582e-05, + "loss": 0.0051, + "step": 881 + }, + { + "epoch": 0.6812125893029543, + "grad_norm": 0.02142924815416336, + "learning_rate": 9.960207521418374e-05, + "loss": 0.0053, + "step": 882 + }, + { + "epoch": 0.6819849391774474, + "grad_norm": 0.022927336394786835, + "learning_rate": 9.959867144724611e-05, + "loss": 0.006, + "step": 883 + }, + { + "epoch": 0.6827572890519406, + "grad_norm": 0.01784409210085869, + "learning_rate": 9.95952532433337e-05, + "loss": 0.005, + "step": 884 + }, + { + "epoch": 0.6835296389264337, + "grad_norm": 0.017598478123545647, + "learning_rate": 9.959182060344144e-05, + "loss": 0.0057, + "step": 885 + }, + { + "epoch": 0.6843019888009269, + "grad_norm": 0.019970614463090897, + "learning_rate": 9.958837352856852e-05, + "loss": 0.0057, + "step": 886 + }, + { + "epoch": 0.68507433867542, + "grad_norm": 0.016816386952996254, + "learning_rate": 9.958491201971825e-05, + "loss": 0.0052, + "step": 887 + }, + { + "epoch": 0.6858466885499132, + "grad_norm": 0.02354409731924534, + "learning_rate": 9.958143607789823e-05, + "loss": 0.0057, + "step": 888 + }, + { + "epoch": 0.6866190384244063, + "grad_norm": 0.01257417444139719, + "learning_rate": 9.957794570412022e-05, + "loss": 0.0049, + "step": 889 + }, + { + "epoch": 0.6873913882988995, + "grad_norm": 0.015461335889995098, + "learning_rate": 9.957444089940018e-05, + "loss": 0.0051, + "step": 890 + }, + { + "epoch": 0.6881637381733925, + "grad_norm": 0.016879508271813393, + "learning_rate": 9.957092166475828e-05, + "loss": 0.0057, + "step": 891 + }, + { + "epoch": 0.6889360880478856, + "grad_norm": 0.020782092586159706, + "learning_rate": 9.956738800121886e-05, + "loss": 0.0051, + "step": 892 + }, + { + "epoch": 0.6897084379223788, + "grad_norm": 0.013319279067218304, + "learning_rate": 9.95638399098105e-05, + "loss": 0.0049, + "step": 893 + }, + { + "epoch": 0.6904807877968719, + "grad_norm": 0.039818745106458664, + "learning_rate": 9.956027739156596e-05, + "loss": 0.0055, + "step": 894 + }, + { + "epoch": 0.6912531376713651, + "grad_norm": 0.030000966042280197, + "learning_rate": 9.955670044752223e-05, + "loss": 0.0054, + "step": 895 + }, + { + "epoch": 0.6920254875458582, + "grad_norm": 0.019784117117524147, + "learning_rate": 9.955310907872044e-05, + "loss": 0.006, + "step": 896 + }, + { + "epoch": 0.6927978374203514, + "grad_norm": 0.03611806407570839, + "learning_rate": 9.954950328620596e-05, + "loss": 0.0055, + "step": 897 + }, + { + "epoch": 0.6935701872948445, + "grad_norm": 0.03365226835012436, + "learning_rate": 9.954588307102834e-05, + "loss": 0.0054, + "step": 898 + }, + { + "epoch": 0.6943425371693377, + "grad_norm": 0.016417738050222397, + "learning_rate": 9.954224843424136e-05, + "loss": 0.0049, + "step": 899 + }, + { + "epoch": 0.6951148870438308, + "grad_norm": 0.028995562344789505, + "learning_rate": 9.953859937690295e-05, + "loss": 0.005, + "step": 900 + }, + { + "epoch": 0.695887236918324, + "grad_norm": 0.03609693422913551, + "learning_rate": 9.953493590007528e-05, + "loss": 0.0053, + "step": 901 + }, + { + "epoch": 0.6966595867928171, + "grad_norm": 0.01953314244747162, + "learning_rate": 9.953125800482469e-05, + "loss": 0.0052, + "step": 902 + }, + { + "epoch": 0.6974319366673103, + "grad_norm": 0.03401225060224533, + "learning_rate": 9.952756569222173e-05, + "loss": 0.0046, + "step": 903 + }, + { + "epoch": 0.6982042865418034, + "grad_norm": 0.027537843212485313, + "learning_rate": 9.952385896334114e-05, + "loss": 0.005, + "step": 904 + }, + { + "epoch": 0.6989766364162966, + "grad_norm": 0.02060253545641899, + "learning_rate": 9.952013781926186e-05, + "loss": 0.006, + "step": 905 + }, + { + "epoch": 0.6997489862907897, + "grad_norm": 0.017162665724754333, + "learning_rate": 9.951640226106704e-05, + "loss": 0.0053, + "step": 906 + }, + { + "epoch": 0.7005213361652829, + "grad_norm": 0.016031889244914055, + "learning_rate": 9.951265228984398e-05, + "loss": 0.005, + "step": 907 + }, + { + "epoch": 0.701293686039776, + "grad_norm": 0.01761656627058983, + "learning_rate": 9.950888790668424e-05, + "loss": 0.005, + "step": 908 + }, + { + "epoch": 0.7020660359142692, + "grad_norm": 0.015069528482854366, + "learning_rate": 9.950510911268352e-05, + "loss": 0.0049, + "step": 909 + }, + { + "epoch": 0.7028383857887623, + "grad_norm": 0.019050147384405136, + "learning_rate": 9.950131590894173e-05, + "loss": 0.0056, + "step": 910 + }, + { + "epoch": 0.7036107356632555, + "grad_norm": 0.03884793817996979, + "learning_rate": 9.949750829656299e-05, + "loss": 0.0046, + "step": 911 + }, + { + "epoch": 0.7043830855377486, + "grad_norm": 0.01480827759951353, + "learning_rate": 9.949368627665561e-05, + "loss": 0.0053, + "step": 912 + }, + { + "epoch": 0.7051554354122418, + "grad_norm": 0.015345815569162369, + "learning_rate": 9.948984985033208e-05, + "loss": 0.0047, + "step": 913 + }, + { + "epoch": 0.7059277852867349, + "grad_norm": 0.014812378212809563, + "learning_rate": 9.94859990187091e-05, + "loss": 0.0051, + "step": 914 + }, + { + "epoch": 0.706700135161228, + "grad_norm": 0.013499701395630836, + "learning_rate": 9.948213378290754e-05, + "loss": 0.0052, + "step": 915 + }, + { + "epoch": 0.7074724850357211, + "grad_norm": 0.010988417081534863, + "learning_rate": 9.947825414405248e-05, + "loss": 0.0044, + "step": 916 + }, + { + "epoch": 0.7082448349102143, + "grad_norm": 0.016281627118587494, + "learning_rate": 9.94743601032732e-05, + "loss": 0.0048, + "step": 917 + }, + { + "epoch": 0.7090171847847074, + "grad_norm": 0.017756443470716476, + "learning_rate": 9.947045166170315e-05, + "loss": 0.005, + "step": 918 + }, + { + "epoch": 0.7097895346592006, + "grad_norm": 0.01533447951078415, + "learning_rate": 9.946652882047999e-05, + "loss": 0.0056, + "step": 919 + }, + { + "epoch": 0.7105618845336937, + "grad_norm": 0.01683143526315689, + "learning_rate": 9.946259158074556e-05, + "loss": 0.0054, + "step": 920 + }, + { + "epoch": 0.7113342344081869, + "grad_norm": 0.01568206399679184, + "learning_rate": 9.945863994364588e-05, + "loss": 0.0055, + "step": 921 + }, + { + "epoch": 0.71210658428268, + "grad_norm": 0.015111725777387619, + "learning_rate": 9.945467391033121e-05, + "loss": 0.0055, + "step": 922 + }, + { + "epoch": 0.7128789341571732, + "grad_norm": 0.02060648240149021, + "learning_rate": 9.945069348195595e-05, + "loss": 0.0052, + "step": 923 + }, + { + "epoch": 0.7136512840316663, + "grad_norm": 0.014100771397352219, + "learning_rate": 9.94466986596787e-05, + "loss": 0.0055, + "step": 924 + }, + { + "epoch": 0.7144236339061595, + "grad_norm": 0.012977558188140392, + "learning_rate": 9.944268944466226e-05, + "loss": 0.0043, + "step": 925 + }, + { + "epoch": 0.7151959837806526, + "grad_norm": 0.027847349643707275, + "learning_rate": 9.943866583807362e-05, + "loss": 0.0052, + "step": 926 + }, + { + "epoch": 0.7159683336551458, + "grad_norm": 0.021106727421283722, + "learning_rate": 9.943462784108396e-05, + "loss": 0.0058, + "step": 927 + }, + { + "epoch": 0.7167406835296389, + "grad_norm": 0.016337329521775246, + "learning_rate": 9.943057545486863e-05, + "loss": 0.0052, + "step": 928 + }, + { + "epoch": 0.7175130334041321, + "grad_norm": 0.01878957264125347, + "learning_rate": 9.942650868060716e-05, + "loss": 0.0051, + "step": 929 + }, + { + "epoch": 0.7182853832786252, + "grad_norm": 0.030198317021131516, + "learning_rate": 9.942242751948335e-05, + "loss": 0.0051, + "step": 930 + }, + { + "epoch": 0.7190577331531184, + "grad_norm": 0.027476893737912178, + "learning_rate": 9.941833197268509e-05, + "loss": 0.0059, + "step": 931 + }, + { + "epoch": 0.7198300830276115, + "grad_norm": 0.025040531530976295, + "learning_rate": 9.941422204140449e-05, + "loss": 0.0055, + "step": 932 + }, + { + "epoch": 0.7206024329021047, + "grad_norm": 0.024198127910494804, + "learning_rate": 9.941009772683786e-05, + "loss": 0.005, + "step": 933 + }, + { + "epoch": 0.7213747827765978, + "grad_norm": 0.015231741592288017, + "learning_rate": 9.94059590301857e-05, + "loss": 0.0047, + "step": 934 + }, + { + "epoch": 0.722147132651091, + "grad_norm": 0.014660035260021687, + "learning_rate": 9.940180595265266e-05, + "loss": 0.0053, + "step": 935 + }, + { + "epoch": 0.7229194825255841, + "grad_norm": 0.025344964116811752, + "learning_rate": 9.939763849544762e-05, + "loss": 0.0057, + "step": 936 + }, + { + "epoch": 0.7236918324000773, + "grad_norm": 0.011590663343667984, + "learning_rate": 9.939345665978361e-05, + "loss": 0.0049, + "step": 937 + }, + { + "epoch": 0.7244641822745704, + "grad_norm": 0.023815158754587173, + "learning_rate": 9.938926044687788e-05, + "loss": 0.0054, + "step": 938 + }, + { + "epoch": 0.7252365321490635, + "grad_norm": 0.023053664714097977, + "learning_rate": 9.938504985795184e-05, + "loss": 0.0044, + "step": 939 + }, + { + "epoch": 0.7260088820235566, + "grad_norm": 0.018284421414136887, + "learning_rate": 9.93808248942311e-05, + "loss": 0.0055, + "step": 940 + }, + { + "epoch": 0.7267812318980498, + "grad_norm": 0.031033355742692947, + "learning_rate": 9.937658555694541e-05, + "loss": 0.0052, + "step": 941 + }, + { + "epoch": 0.7275535817725429, + "grad_norm": 0.020948749035596848, + "learning_rate": 9.937233184732877e-05, + "loss": 0.0058, + "step": 942 + }, + { + "epoch": 0.7283259316470361, + "grad_norm": 0.016316091641783714, + "learning_rate": 9.936806376661932e-05, + "loss": 0.0056, + "step": 943 + }, + { + "epoch": 0.7290982815215292, + "grad_norm": 0.02747221291065216, + "learning_rate": 9.936378131605941e-05, + "loss": 0.0052, + "step": 944 + }, + { + "epoch": 0.7298706313960224, + "grad_norm": 0.03303531929850578, + "learning_rate": 9.935948449689553e-05, + "loss": 0.0049, + "step": 945 + }, + { + "epoch": 0.7306429812705155, + "grad_norm": 0.027890313416719437, + "learning_rate": 9.935517331037842e-05, + "loss": 0.0048, + "step": 946 + }, + { + "epoch": 0.7314153311450087, + "grad_norm": 0.0356268435716629, + "learning_rate": 9.935084775776292e-05, + "loss": 0.0053, + "step": 947 + }, + { + "epoch": 0.7321876810195018, + "grad_norm": 0.02592495270073414, + "learning_rate": 9.934650784030812e-05, + "loss": 0.0051, + "step": 948 + }, + { + "epoch": 0.732960030893995, + "grad_norm": 0.01498821098357439, + "learning_rate": 9.934215355927724e-05, + "loss": 0.0048, + "step": 949 + }, + { + "epoch": 0.7337323807684881, + "grad_norm": 0.020218007266521454, + "learning_rate": 9.933778491593776e-05, + "loss": 0.005, + "step": 950 + }, + { + "epoch": 0.7345047306429813, + "grad_norm": 0.040983304381370544, + "learning_rate": 9.933340191156123e-05, + "loss": 0.0053, + "step": 951 + }, + { + "epoch": 0.7352770805174744, + "grad_norm": 0.02097119390964508, + "learning_rate": 9.932900454742347e-05, + "loss": 0.0053, + "step": 952 + }, + { + "epoch": 0.7360494303919676, + "grad_norm": 0.05633474886417389, + "learning_rate": 9.932459282480442e-05, + "loss": 0.0053, + "step": 953 + }, + { + "epoch": 0.7368217802664607, + "grad_norm": 0.03562900051474571, + "learning_rate": 9.932016674498822e-05, + "loss": 0.0058, + "step": 954 + }, + { + "epoch": 0.7375941301409539, + "grad_norm": 0.023267168551683426, + "learning_rate": 9.931572630926324e-05, + "loss": 0.0046, + "step": 955 + }, + { + "epoch": 0.738366480015447, + "grad_norm": 0.030092468485236168, + "learning_rate": 9.931127151892197e-05, + "loss": 0.0054, + "step": 956 + }, + { + "epoch": 0.7391388298899402, + "grad_norm": 0.02708488330245018, + "learning_rate": 9.930680237526107e-05, + "loss": 0.0056, + "step": 957 + }, + { + "epoch": 0.7399111797644333, + "grad_norm": 0.011751326732337475, + "learning_rate": 9.93023188795814e-05, + "loss": 0.0047, + "step": 958 + }, + { + "epoch": 0.7406835296389265, + "grad_norm": 0.028150459751486778, + "learning_rate": 9.9297821033188e-05, + "loss": 0.0054, + "step": 959 + }, + { + "epoch": 0.7414558795134196, + "grad_norm": 0.017759088426828384, + "learning_rate": 9.929330883739011e-05, + "loss": 0.0054, + "step": 960 + }, + { + "epoch": 0.7422282293879128, + "grad_norm": 0.015162650495767593, + "learning_rate": 9.92887822935011e-05, + "loss": 0.005, + "step": 961 + }, + { + "epoch": 0.7430005792624059, + "grad_norm": 0.016975706443190575, + "learning_rate": 9.928424140283854e-05, + "loss": 0.0047, + "step": 962 + }, + { + "epoch": 0.743772929136899, + "grad_norm": 0.017186246812343597, + "learning_rate": 9.927968616672416e-05, + "loss": 0.0052, + "step": 963 + }, + { + "epoch": 0.7445452790113921, + "grad_norm": 0.010945098474621773, + "learning_rate": 9.927511658648389e-05, + "loss": 0.0044, + "step": 964 + }, + { + "epoch": 0.7453176288858853, + "grad_norm": 0.024373026564717293, + "learning_rate": 9.927053266344784e-05, + "loss": 0.0056, + "step": 965 + }, + { + "epoch": 0.7460899787603784, + "grad_norm": 0.03226197138428688, + "learning_rate": 9.926593439895027e-05, + "loss": 0.0053, + "step": 966 + }, + { + "epoch": 0.7468623286348716, + "grad_norm": 0.024539737030863762, + "learning_rate": 9.926132179432962e-05, + "loss": 0.0052, + "step": 967 + }, + { + "epoch": 0.7476346785093647, + "grad_norm": 0.04279707744717598, + "learning_rate": 9.92566948509285e-05, + "loss": 0.0055, + "step": 968 + }, + { + "epoch": 0.7484070283838579, + "grad_norm": 0.024123720824718475, + "learning_rate": 9.92520535700937e-05, + "loss": 0.0048, + "step": 969 + }, + { + "epoch": 0.749179378258351, + "grad_norm": 0.03971217945218086, + "learning_rate": 9.924739795317621e-05, + "loss": 0.0051, + "step": 970 + }, + { + "epoch": 0.7499517281328442, + "grad_norm": 0.04534037783741951, + "learning_rate": 9.924272800153117e-05, + "loss": 0.0058, + "step": 971 + }, + { + "epoch": 0.7507240780073373, + "grad_norm": 0.014068394899368286, + "learning_rate": 9.923804371651783e-05, + "loss": 0.0055, + "step": 972 + }, + { + "epoch": 0.7514964278818305, + "grad_norm": 0.020580369979143143, + "learning_rate": 9.923334509949973e-05, + "loss": 0.0048, + "step": 973 + }, + { + "epoch": 0.7522687777563236, + "grad_norm": 0.03742160275578499, + "learning_rate": 9.922863215184452e-05, + "loss": 0.005, + "step": 974 + }, + { + "epoch": 0.7530411276308168, + "grad_norm": 0.043946683406829834, + "learning_rate": 9.9223904874924e-05, + "loss": 0.0055, + "step": 975 + }, + { + "epoch": 0.7538134775053099, + "grad_norm": 0.04362013563513756, + "learning_rate": 9.921916327011418e-05, + "loss": 0.0049, + "step": 976 + }, + { + "epoch": 0.7545858273798031, + "grad_norm": 0.03981248289346695, + "learning_rate": 9.921440733879524e-05, + "loss": 0.0052, + "step": 977 + }, + { + "epoch": 0.7553581772542962, + "grad_norm": 0.023527968674898148, + "learning_rate": 9.920963708235148e-05, + "loss": 0.0051, + "step": 978 + }, + { + "epoch": 0.7561305271287894, + "grad_norm": 0.01712462492287159, + "learning_rate": 9.920485250217144e-05, + "loss": 0.0048, + "step": 979 + }, + { + "epoch": 0.7569028770032825, + "grad_norm": 0.020297260954976082, + "learning_rate": 9.920005359964778e-05, + "loss": 0.0055, + "step": 980 + }, + { + "epoch": 0.7576752268777757, + "grad_norm": 0.048822131007909775, + "learning_rate": 9.919524037617735e-05, + "loss": 0.0057, + "step": 981 + }, + { + "epoch": 0.7584475767522688, + "grad_norm": 0.015059332363307476, + "learning_rate": 9.919041283316116e-05, + "loss": 0.0051, + "step": 982 + }, + { + "epoch": 0.759219926626762, + "grad_norm": 0.021079909056425095, + "learning_rate": 9.918557097200441e-05, + "loss": 0.006, + "step": 983 + }, + { + "epoch": 0.7599922765012551, + "grad_norm": 0.04429293051362038, + "learning_rate": 9.918071479411642e-05, + "loss": 0.0057, + "step": 984 + }, + { + "epoch": 0.7607646263757483, + "grad_norm": 0.05578068643808365, + "learning_rate": 9.91758443009107e-05, + "loss": 0.0056, + "step": 985 + }, + { + "epoch": 0.7615369762502413, + "grad_norm": 0.039931271225214005, + "learning_rate": 9.917095949380497e-05, + "loss": 0.005, + "step": 986 + }, + { + "epoch": 0.7623093261247345, + "grad_norm": 0.07003474980592728, + "learning_rate": 9.916606037422105e-05, + "loss": 0.0056, + "step": 987 + }, + { + "epoch": 0.7630816759992276, + "grad_norm": 0.03523440286517143, + "learning_rate": 9.916114694358498e-05, + "loss": 0.0051, + "step": 988 + }, + { + "epoch": 0.7638540258737208, + "grad_norm": 0.028762444853782654, + "learning_rate": 9.915621920332691e-05, + "loss": 0.0053, + "step": 989 + }, + { + "epoch": 0.7646263757482139, + "grad_norm": 0.0738329142332077, + "learning_rate": 9.915127715488121e-05, + "loss": 0.0066, + "step": 990 + }, + { + "epoch": 0.765398725622707, + "grad_norm": 0.03267653286457062, + "learning_rate": 9.914632079968639e-05, + "loss": 0.0052, + "step": 991 + }, + { + "epoch": 0.7661710754972002, + "grad_norm": 0.02162010595202446, + "learning_rate": 9.914135013918511e-05, + "loss": 0.0051, + "step": 992 + }, + { + "epoch": 0.7669434253716934, + "grad_norm": 0.05322584509849548, + "learning_rate": 9.913636517482423e-05, + "loss": 0.0063, + "step": 993 + }, + { + "epoch": 0.7677157752461865, + "grad_norm": 0.04363081231713295, + "learning_rate": 9.913136590805472e-05, + "loss": 0.0053, + "step": 994 + }, + { + "epoch": 0.7684881251206797, + "grad_norm": 0.013890145346522331, + "learning_rate": 9.912635234033178e-05, + "loss": 0.0051, + "step": 995 + }, + { + "epoch": 0.7692604749951728, + "grad_norm": 0.044813696295022964, + "learning_rate": 9.912132447311472e-05, + "loss": 0.0061, + "step": 996 + }, + { + "epoch": 0.770032824869666, + "grad_norm": 0.06689228862524033, + "learning_rate": 9.911628230786703e-05, + "loss": 0.0056, + "step": 997 + }, + { + "epoch": 0.7708051747441591, + "grad_norm": 0.019164199009537697, + "learning_rate": 9.911122584605638e-05, + "loss": 0.0051, + "step": 998 + }, + { + "epoch": 0.7715775246186523, + "grad_norm": 0.039524856954813004, + "learning_rate": 9.910615508915457e-05, + "loss": 0.005, + "step": 999 + }, + { + "epoch": 0.7723498744931454, + "grad_norm": 0.07112497836351395, + "learning_rate": 9.910107003863755e-05, + "loss": 0.0063, + "step": 1000 + }, + { + "epoch": 0.7731222243676386, + "grad_norm": 0.018947944045066833, + "learning_rate": 9.909597069598552e-05, + "loss": 0.0049, + "step": 1001 + }, + { + "epoch": 0.7738945742421317, + "grad_norm": 0.035670891404151917, + "learning_rate": 9.909085706268272e-05, + "loss": 0.0053, + "step": 1002 + }, + { + "epoch": 0.7746669241166249, + "grad_norm": 0.062375158071517944, + "learning_rate": 9.908572914021762e-05, + "loss": 0.0055, + "step": 1003 + }, + { + "epoch": 0.775439273991118, + "grad_norm": 0.01835649274289608, + "learning_rate": 9.908058693008284e-05, + "loss": 0.0053, + "step": 1004 + }, + { + "epoch": 0.7762116238656112, + "grad_norm": 0.0214702058583498, + "learning_rate": 9.907543043377514e-05, + "loss": 0.0055, + "step": 1005 + }, + { + "epoch": 0.7769839737401043, + "grad_norm": 0.04028837010264397, + "learning_rate": 9.907025965279548e-05, + "loss": 0.0051, + "step": 1006 + }, + { + "epoch": 0.7777563236145975, + "grad_norm": 0.03422695770859718, + "learning_rate": 9.906507458864891e-05, + "loss": 0.0056, + "step": 1007 + }, + { + "epoch": 0.7785286734890906, + "grad_norm": 0.013215802609920502, + "learning_rate": 9.905987524284471e-05, + "loss": 0.0051, + "step": 1008 + }, + { + "epoch": 0.7793010233635838, + "grad_norm": 0.030311673879623413, + "learning_rate": 9.905466161689627e-05, + "loss": 0.0055, + "step": 1009 + }, + { + "epoch": 0.7800733732380768, + "grad_norm": 0.041229937225580215, + "learning_rate": 9.904943371232116e-05, + "loss": 0.0054, + "step": 1010 + }, + { + "epoch": 0.78084572311257, + "grad_norm": 0.0180194228887558, + "learning_rate": 9.904419153064107e-05, + "loss": 0.0051, + "step": 1011 + }, + { + "epoch": 0.7816180729870631, + "grad_norm": 0.027759356424212456, + "learning_rate": 9.90389350733819e-05, + "loss": 0.0052, + "step": 1012 + }, + { + "epoch": 0.7823904228615562, + "grad_norm": 0.02440088428556919, + "learning_rate": 9.903366434207367e-05, + "loss": 0.0054, + "step": 1013 + }, + { + "epoch": 0.7831627727360494, + "grad_norm": 0.017409170046448708, + "learning_rate": 9.902837933825055e-05, + "loss": 0.0052, + "step": 1014 + }, + { + "epoch": 0.7839351226105425, + "grad_norm": 0.018347326666116714, + "learning_rate": 9.902308006345091e-05, + "loss": 0.0059, + "step": 1015 + }, + { + "epoch": 0.7847074724850357, + "grad_norm": 0.030546607449650764, + "learning_rate": 9.90177665192172e-05, + "loss": 0.0052, + "step": 1016 + }, + { + "epoch": 0.7854798223595288, + "grad_norm": 0.029116885736584663, + "learning_rate": 9.901243870709609e-05, + "loss": 0.0049, + "step": 1017 + }, + { + "epoch": 0.786252172234022, + "grad_norm": 0.022032609209418297, + "learning_rate": 9.900709662863837e-05, + "loss": 0.0052, + "step": 1018 + }, + { + "epoch": 0.7870245221085151, + "grad_norm": 0.04383152723312378, + "learning_rate": 9.900174028539899e-05, + "loss": 0.0056, + "step": 1019 + }, + { + "epoch": 0.7877968719830083, + "grad_norm": 0.01003220397979021, + "learning_rate": 9.899636967893706e-05, + "loss": 0.0053, + "step": 1020 + }, + { + "epoch": 0.7885692218575014, + "grad_norm": 0.01930188573896885, + "learning_rate": 9.899098481081582e-05, + "loss": 0.0047, + "step": 1021 + }, + { + "epoch": 0.7893415717319946, + "grad_norm": 0.016947351396083832, + "learning_rate": 9.898558568260268e-05, + "loss": 0.0045, + "step": 1022 + }, + { + "epoch": 0.7901139216064877, + "grad_norm": 0.019604964181780815, + "learning_rate": 9.89801722958692e-05, + "loss": 0.0054, + "step": 1023 + }, + { + "epoch": 0.7908862714809809, + "grad_norm": 0.012611506506800652, + "learning_rate": 9.897474465219108e-05, + "loss": 0.0053, + "step": 1024 + }, + { + "epoch": 0.791658621355474, + "grad_norm": 0.022115463390946388, + "learning_rate": 9.896930275314819e-05, + "loss": 0.0055, + "step": 1025 + }, + { + "epoch": 0.7924309712299672, + "grad_norm": 0.010714237578213215, + "learning_rate": 9.896384660032452e-05, + "loss": 0.0041, + "step": 1026 + }, + { + "epoch": 0.7932033211044603, + "grad_norm": 0.021694796159863472, + "learning_rate": 9.895837619530822e-05, + "loss": 0.0048, + "step": 1027 + }, + { + "epoch": 0.7939756709789535, + "grad_norm": 0.026115277782082558, + "learning_rate": 9.895289153969161e-05, + "loss": 0.0049, + "step": 1028 + }, + { + "epoch": 0.7947480208534466, + "grad_norm": 0.0178262647241354, + "learning_rate": 9.894739263507113e-05, + "loss": 0.0056, + "step": 1029 + }, + { + "epoch": 0.7955203707279398, + "grad_norm": 0.0118311932310462, + "learning_rate": 9.894187948304737e-05, + "loss": 0.0055, + "step": 1030 + }, + { + "epoch": 0.796292720602433, + "grad_norm": 0.012592138722538948, + "learning_rate": 9.893635208522509e-05, + "loss": 0.0046, + "step": 1031 + }, + { + "epoch": 0.7970650704769261, + "grad_norm": 0.01537346187978983, + "learning_rate": 9.89308104432132e-05, + "loss": 0.005, + "step": 1032 + }, + { + "epoch": 0.7978374203514192, + "grad_norm": 0.013634511269629002, + "learning_rate": 9.892525455862469e-05, + "loss": 0.0052, + "step": 1033 + }, + { + "epoch": 0.7986097702259123, + "grad_norm": 0.017275096848607063, + "learning_rate": 9.891968443307678e-05, + "loss": 0.0052, + "step": 1034 + }, + { + "epoch": 0.7993821201004054, + "grad_norm": 0.014000173658132553, + "learning_rate": 9.891410006819079e-05, + "loss": 0.0051, + "step": 1035 + }, + { + "epoch": 0.8001544699748986, + "grad_norm": 0.01269017904996872, + "learning_rate": 9.890850146559219e-05, + "loss": 0.0041, + "step": 1036 + }, + { + "epoch": 0.8009268198493917, + "grad_norm": 0.013115121982991695, + "learning_rate": 9.890288862691059e-05, + "loss": 0.0052, + "step": 1037 + }, + { + "epoch": 0.8016991697238849, + "grad_norm": 0.0130803557112813, + "learning_rate": 9.889726155377979e-05, + "loss": 0.0052, + "step": 1038 + }, + { + "epoch": 0.802471519598378, + "grad_norm": 0.020898472517728806, + "learning_rate": 9.889162024783764e-05, + "loss": 0.0045, + "step": 1039 + }, + { + "epoch": 0.8032438694728712, + "grad_norm": 0.01687263883650303, + "learning_rate": 9.888596471072622e-05, + "loss": 0.005, + "step": 1040 + }, + { + "epoch": 0.8040162193473643, + "grad_norm": 0.016980089247226715, + "learning_rate": 9.888029494409172e-05, + "loss": 0.0051, + "step": 1041 + }, + { + "epoch": 0.8047885692218575, + "grad_norm": 0.029596013948321342, + "learning_rate": 9.887461094958445e-05, + "loss": 0.0051, + "step": 1042 + }, + { + "epoch": 0.8055609190963506, + "grad_norm": 0.010340928100049496, + "learning_rate": 9.886891272885893e-05, + "loss": 0.0047, + "step": 1043 + }, + { + "epoch": 0.8063332689708438, + "grad_norm": 0.01951519399881363, + "learning_rate": 9.886320028357372e-05, + "loss": 0.0049, + "step": 1044 + }, + { + "epoch": 0.8071056188453369, + "grad_norm": 0.013048755936324596, + "learning_rate": 9.885747361539162e-05, + "loss": 0.0055, + "step": 1045 + }, + { + "epoch": 0.8078779687198301, + "grad_norm": 0.015198012813925743, + "learning_rate": 9.885173272597949e-05, + "loss": 0.0053, + "step": 1046 + }, + { + "epoch": 0.8086503185943232, + "grad_norm": 0.015721959993243217, + "learning_rate": 9.884597761700838e-05, + "loss": 0.005, + "step": 1047 + }, + { + "epoch": 0.8094226684688164, + "grad_norm": 0.01544812973588705, + "learning_rate": 9.884020829015347e-05, + "loss": 0.005, + "step": 1048 + }, + { + "epoch": 0.8101950183433095, + "grad_norm": 0.013755733147263527, + "learning_rate": 9.883442474709406e-05, + "loss": 0.0043, + "step": 1049 + }, + { + "epoch": 0.8109673682178027, + "grad_norm": 0.024196283891797066, + "learning_rate": 9.882862698951361e-05, + "loss": 0.0047, + "step": 1050 + }, + { + "epoch": 0.8117397180922958, + "grad_norm": 0.01150796189904213, + "learning_rate": 9.882281501909968e-05, + "loss": 0.0048, + "step": 1051 + }, + { + "epoch": 0.812512067966789, + "grad_norm": 0.01957443729043007, + "learning_rate": 9.881698883754402e-05, + "loss": 0.005, + "step": 1052 + }, + { + "epoch": 0.8132844178412821, + "grad_norm": 0.029252223670482635, + "learning_rate": 9.881114844654249e-05, + "loss": 0.0052, + "step": 1053 + }, + { + "epoch": 0.8140567677157753, + "grad_norm": 0.014108984731137753, + "learning_rate": 9.880529384779508e-05, + "loss": 0.0052, + "step": 1054 + }, + { + "epoch": 0.8148291175902684, + "grad_norm": 0.025499513372778893, + "learning_rate": 9.879942504300593e-05, + "loss": 0.0052, + "step": 1055 + }, + { + "epoch": 0.8156014674647616, + "grad_norm": 0.018272900953888893, + "learning_rate": 9.87935420338833e-05, + "loss": 0.0045, + "step": 1056 + }, + { + "epoch": 0.8163738173392547, + "grad_norm": 0.014985686168074608, + "learning_rate": 9.878764482213959e-05, + "loss": 0.0049, + "step": 1057 + }, + { + "epoch": 0.8171461672137478, + "grad_norm": 0.020466111600399017, + "learning_rate": 9.878173340949136e-05, + "loss": 0.0049, + "step": 1058 + }, + { + "epoch": 0.8179185170882409, + "grad_norm": 0.013434127904474735, + "learning_rate": 9.877580779765922e-05, + "loss": 0.0044, + "step": 1059 + }, + { + "epoch": 0.8186908669627341, + "grad_norm": 0.014021494425833225, + "learning_rate": 9.876986798836803e-05, + "loss": 0.005, + "step": 1060 + }, + { + "epoch": 0.8194632168372272, + "grad_norm": 0.02708282321691513, + "learning_rate": 9.87639139833467e-05, + "loss": 0.0052, + "step": 1061 + }, + { + "epoch": 0.8202355667117204, + "grad_norm": 0.016625193879008293, + "learning_rate": 9.875794578432831e-05, + "loss": 0.0051, + "step": 1062 + }, + { + "epoch": 0.8210079165862135, + "grad_norm": 0.010641127824783325, + "learning_rate": 9.875196339305004e-05, + "loss": 0.0049, + "step": 1063 + }, + { + "epoch": 0.8217802664607067, + "grad_norm": 0.025936421006917953, + "learning_rate": 9.874596681125324e-05, + "loss": 0.0053, + "step": 1064 + }, + { + "epoch": 0.8225526163351998, + "grad_norm": 0.017523979768157005, + "learning_rate": 9.873995604068335e-05, + "loss": 0.0056, + "step": 1065 + }, + { + "epoch": 0.823324966209693, + "grad_norm": 0.018067866563796997, + "learning_rate": 9.873393108308999e-05, + "loss": 0.0048, + "step": 1066 + }, + { + "epoch": 0.8240973160841861, + "grad_norm": 0.026911109685897827, + "learning_rate": 9.872789194022684e-05, + "loss": 0.0047, + "step": 1067 + }, + { + "epoch": 0.8248696659586793, + "grad_norm": 0.02376358024775982, + "learning_rate": 9.872183861385177e-05, + "loss": 0.0049, + "step": 1068 + }, + { + "epoch": 0.8256420158331724, + "grad_norm": 0.023630045354366302, + "learning_rate": 9.871577110572679e-05, + "loss": 0.0054, + "step": 1069 + }, + { + "epoch": 0.8264143657076656, + "grad_norm": 0.03594619780778885, + "learning_rate": 9.870968941761793e-05, + "loss": 0.0054, + "step": 1070 + }, + { + "epoch": 0.8271867155821587, + "grad_norm": 0.013311091810464859, + "learning_rate": 9.870359355129548e-05, + "loss": 0.0047, + "step": 1071 + }, + { + "epoch": 0.8279590654566519, + "grad_norm": 0.018972985446453094, + "learning_rate": 9.869748350853378e-05, + "loss": 0.0053, + "step": 1072 + }, + { + "epoch": 0.828731415331145, + "grad_norm": 0.014069105498492718, + "learning_rate": 9.869135929111133e-05, + "loss": 0.0048, + "step": 1073 + }, + { + "epoch": 0.8295037652056382, + "grad_norm": 0.013836747035384178, + "learning_rate": 9.868522090081071e-05, + "loss": 0.0046, + "step": 1074 + }, + { + "epoch": 0.8302761150801313, + "grad_norm": 0.014035874046385288, + "learning_rate": 9.86790683394187e-05, + "loss": 0.0061, + "step": 1075 + }, + { + "epoch": 0.8310484649546245, + "grad_norm": 0.015665380284190178, + "learning_rate": 9.867290160872613e-05, + "loss": 0.005, + "step": 1076 + }, + { + "epoch": 0.8318208148291176, + "grad_norm": 0.010867977514863014, + "learning_rate": 9.866672071052798e-05, + "loss": 0.0045, + "step": 1077 + }, + { + "epoch": 0.8325931647036108, + "grad_norm": 0.010544631630182266, + "learning_rate": 9.86605256466234e-05, + "loss": 0.0045, + "step": 1078 + }, + { + "epoch": 0.8333655145781039, + "grad_norm": 0.018490837886929512, + "learning_rate": 9.865431641881558e-05, + "loss": 0.005, + "step": 1079 + }, + { + "epoch": 0.8341378644525971, + "grad_norm": 0.011083516292273998, + "learning_rate": 9.864809302891192e-05, + "loss": 0.0051, + "step": 1080 + }, + { + "epoch": 0.8349102143270901, + "grad_norm": 0.014008168131113052, + "learning_rate": 9.864185547872385e-05, + "loss": 0.0052, + "step": 1081 + }, + { + "epoch": 0.8356825642015833, + "grad_norm": 0.01850767433643341, + "learning_rate": 9.8635603770067e-05, + "loss": 0.0051, + "step": 1082 + }, + { + "epoch": 0.8364549140760764, + "grad_norm": 0.011455691419541836, + "learning_rate": 9.862933790476108e-05, + "loss": 0.0052, + "step": 1083 + }, + { + "epoch": 0.8372272639505696, + "grad_norm": 0.014069200493395329, + "learning_rate": 9.862305788462996e-05, + "loss": 0.005, + "step": 1084 + }, + { + "epoch": 0.8379996138250627, + "grad_norm": 0.02683812938630581, + "learning_rate": 9.861676371150154e-05, + "loss": 0.0047, + "step": 1085 + }, + { + "epoch": 0.8387719636995559, + "grad_norm": 0.016436012461781502, + "learning_rate": 9.861045538720798e-05, + "loss": 0.0055, + "step": 1086 + }, + { + "epoch": 0.839544313574049, + "grad_norm": 0.01961844600737095, + "learning_rate": 9.860413291358542e-05, + "loss": 0.0049, + "step": 1087 + }, + { + "epoch": 0.8403166634485422, + "grad_norm": 0.013608099892735481, + "learning_rate": 9.859779629247421e-05, + "loss": 0.0043, + "step": 1088 + }, + { + "epoch": 0.8410890133230353, + "grad_norm": 0.012145274318754673, + "learning_rate": 9.859144552571877e-05, + "loss": 0.0048, + "step": 1089 + }, + { + "epoch": 0.8418613631975285, + "grad_norm": 0.014922577887773514, + "learning_rate": 9.858508061516766e-05, + "loss": 0.0052, + "step": 1090 + }, + { + "epoch": 0.8426337130720216, + "grad_norm": 0.015595716424286366, + "learning_rate": 9.857870156267357e-05, + "loss": 0.0053, + "step": 1091 + }, + { + "epoch": 0.8434060629465148, + "grad_norm": 0.01833524741232395, + "learning_rate": 9.857230837009329e-05, + "loss": 0.0055, + "step": 1092 + }, + { + "epoch": 0.8441784128210079, + "grad_norm": 0.01634645089507103, + "learning_rate": 9.856590103928767e-05, + "loss": 0.0061, + "step": 1093 + }, + { + "epoch": 0.8449507626955011, + "grad_norm": 0.016310440376400948, + "learning_rate": 9.855947957212178e-05, + "loss": 0.0049, + "step": 1094 + }, + { + "epoch": 0.8457231125699942, + "grad_norm": 0.016290003433823586, + "learning_rate": 9.855304397046474e-05, + "loss": 0.005, + "step": 1095 + }, + { + "epoch": 0.8464954624444874, + "grad_norm": 0.013762637041509151, + "learning_rate": 9.85465942361898e-05, + "loss": 0.0049, + "step": 1096 + }, + { + "epoch": 0.8472678123189805, + "grad_norm": 0.017816467210650444, + "learning_rate": 9.854013037117431e-05, + "loss": 0.0052, + "step": 1097 + }, + { + "epoch": 0.8480401621934737, + "grad_norm": 0.019573671743273735, + "learning_rate": 9.853365237729976e-05, + "loss": 0.0046, + "step": 1098 + }, + { + "epoch": 0.8488125120679668, + "grad_norm": 0.013996942900121212, + "learning_rate": 9.852716025645175e-05, + "loss": 0.0049, + "step": 1099 + }, + { + "epoch": 0.84958486194246, + "grad_norm": 0.039246659725904465, + "learning_rate": 9.852065401051993e-05, + "loss": 0.0055, + "step": 1100 + }, + { + "epoch": 0.8503572118169531, + "grad_norm": 0.01461487915366888, + "learning_rate": 9.851413364139817e-05, + "loss": 0.0057, + "step": 1101 + }, + { + "epoch": 0.8511295616914463, + "grad_norm": 0.02236500196158886, + "learning_rate": 9.850759915098434e-05, + "loss": 0.0053, + "step": 1102 + }, + { + "epoch": 0.8519019115659394, + "grad_norm": 0.020510345697402954, + "learning_rate": 9.850105054118052e-05, + "loss": 0.0047, + "step": 1103 + }, + { + "epoch": 0.8526742614404326, + "grad_norm": 0.018675046041607857, + "learning_rate": 9.84944878138928e-05, + "loss": 0.0052, + "step": 1104 + }, + { + "epoch": 0.8534466113149256, + "grad_norm": 0.017054179683327675, + "learning_rate": 9.84879109710315e-05, + "loss": 0.0046, + "step": 1105 + }, + { + "epoch": 0.8542189611894188, + "grad_norm": 0.026352353394031525, + "learning_rate": 9.848132001451091e-05, + "loss": 0.0052, + "step": 1106 + }, + { + "epoch": 0.8549913110639119, + "grad_norm": 0.02260519377887249, + "learning_rate": 9.847471494624953e-05, + "loss": 0.005, + "step": 1107 + }, + { + "epoch": 0.8557636609384051, + "grad_norm": 0.03695909306406975, + "learning_rate": 9.846809576816996e-05, + "loss": 0.0051, + "step": 1108 + }, + { + "epoch": 0.8565360108128982, + "grad_norm": 0.030567850917577744, + "learning_rate": 9.846146248219882e-05, + "loss": 0.0049, + "step": 1109 + }, + { + "epoch": 0.8573083606873914, + "grad_norm": 0.010855203494429588, + "learning_rate": 9.845481509026697e-05, + "loss": 0.0051, + "step": 1110 + }, + { + "epoch": 0.8580807105618845, + "grad_norm": 0.017507346346974373, + "learning_rate": 9.844815359430926e-05, + "loss": 0.0054, + "step": 1111 + }, + { + "epoch": 0.8588530604363777, + "grad_norm": 0.03854590281844139, + "learning_rate": 9.84414779962647e-05, + "loss": 0.0051, + "step": 1112 + }, + { + "epoch": 0.8596254103108708, + "grad_norm": 0.014054159633815289, + "learning_rate": 9.843478829807639e-05, + "loss": 0.0048, + "step": 1113 + }, + { + "epoch": 0.860397760185364, + "grad_norm": 0.02449275553226471, + "learning_rate": 9.842808450169156e-05, + "loss": 0.0048, + "step": 1114 + }, + { + "epoch": 0.8611701100598571, + "grad_norm": 0.03688691556453705, + "learning_rate": 9.84213666090615e-05, + "loss": 0.0055, + "step": 1115 + }, + { + "epoch": 0.8619424599343503, + "grad_norm": 0.011802013963460922, + "learning_rate": 9.841463462214165e-05, + "loss": 0.0045, + "step": 1116 + }, + { + "epoch": 0.8627148098088434, + "grad_norm": 0.023805933073163033, + "learning_rate": 9.84078885428915e-05, + "loss": 0.005, + "step": 1117 + }, + { + "epoch": 0.8634871596833366, + "grad_norm": 0.01294466108083725, + "learning_rate": 9.840112837327469e-05, + "loss": 0.0045, + "step": 1118 + }, + { + "epoch": 0.8642595095578297, + "grad_norm": 0.017138268798589706, + "learning_rate": 9.839435411525892e-05, + "loss": 0.0051, + "step": 1119 + }, + { + "epoch": 0.8650318594323229, + "grad_norm": 0.00986329186707735, + "learning_rate": 9.838756577081605e-05, + "loss": 0.0043, + "step": 1120 + }, + { + "epoch": 0.865804209306816, + "grad_norm": 0.014220822602510452, + "learning_rate": 9.838076334192198e-05, + "loss": 0.005, + "step": 1121 + }, + { + "epoch": 0.8665765591813092, + "grad_norm": 0.017122695222496986, + "learning_rate": 9.837394683055675e-05, + "loss": 0.0044, + "step": 1122 + }, + { + "epoch": 0.8673489090558023, + "grad_norm": 0.01046262588351965, + "learning_rate": 9.836711623870445e-05, + "loss": 0.0043, + "step": 1123 + }, + { + "epoch": 0.8681212589302955, + "grad_norm": 0.014623397961258888, + "learning_rate": 9.836027156835332e-05, + "loss": 0.0049, + "step": 1124 + }, + { + "epoch": 0.8688936088047886, + "grad_norm": 0.01762223243713379, + "learning_rate": 9.835341282149568e-05, + "loss": 0.0041, + "step": 1125 + }, + { + "epoch": 0.8696659586792818, + "grad_norm": 0.02136484906077385, + "learning_rate": 9.834654000012796e-05, + "loss": 0.0053, + "step": 1126 + }, + { + "epoch": 0.8704383085537749, + "grad_norm": 0.03846302628517151, + "learning_rate": 9.833965310625063e-05, + "loss": 0.005, + "step": 1127 + }, + { + "epoch": 0.8712106584282681, + "grad_norm": 0.023591913282871246, + "learning_rate": 9.833275214186833e-05, + "loss": 0.0049, + "step": 1128 + }, + { + "epoch": 0.8719830083027611, + "grad_norm": 0.011188088916242123, + "learning_rate": 9.832583710898974e-05, + "loss": 0.0047, + "step": 1129 + }, + { + "epoch": 0.8727553581772542, + "grad_norm": 0.031315676867961884, + "learning_rate": 9.831890800962771e-05, + "loss": 0.005, + "step": 1130 + }, + { + "epoch": 0.8735277080517474, + "grad_norm": 0.02841937728226185, + "learning_rate": 9.831196484579908e-05, + "loss": 0.0052, + "step": 1131 + }, + { + "epoch": 0.8743000579262405, + "grad_norm": 0.016970042139291763, + "learning_rate": 9.830500761952484e-05, + "loss": 0.0043, + "step": 1132 + }, + { + "epoch": 0.8750724078007337, + "grad_norm": 0.017414122819900513, + "learning_rate": 9.82980363328301e-05, + "loss": 0.005, + "step": 1133 + }, + { + "epoch": 0.8758447576752268, + "grad_norm": 0.03681933134794235, + "learning_rate": 9.829105098774403e-05, + "loss": 0.0054, + "step": 1134 + }, + { + "epoch": 0.87661710754972, + "grad_norm": 0.03045729361474514, + "learning_rate": 9.828405158629987e-05, + "loss": 0.0046, + "step": 1135 + }, + { + "epoch": 0.8773894574242131, + "grad_norm": 0.02646372653543949, + "learning_rate": 9.827703813053499e-05, + "loss": 0.0051, + "step": 1136 + }, + { + "epoch": 0.8781618072987063, + "grad_norm": 0.0201679989695549, + "learning_rate": 9.827001062249086e-05, + "loss": 0.0049, + "step": 1137 + }, + { + "epoch": 0.8789341571731994, + "grad_norm": 0.022596973925828934, + "learning_rate": 9.826296906421298e-05, + "loss": 0.0051, + "step": 1138 + }, + { + "epoch": 0.8797065070476926, + "grad_norm": 0.01095764059573412, + "learning_rate": 9.8255913457751e-05, + "loss": 0.0045, + "step": 1139 + }, + { + "epoch": 0.8804788569221857, + "grad_norm": 0.02390890382230282, + "learning_rate": 9.824884380515862e-05, + "loss": 0.0051, + "step": 1140 + }, + { + "epoch": 0.8812512067966789, + "grad_norm": 0.024027172476053238, + "learning_rate": 9.824176010849367e-05, + "loss": 0.0043, + "step": 1141 + }, + { + "epoch": 0.882023556671172, + "grad_norm": 0.011508166790008545, + "learning_rate": 9.823466236981802e-05, + "loss": 0.0044, + "step": 1142 + }, + { + "epoch": 0.8827959065456652, + "grad_norm": 0.01273355819284916, + "learning_rate": 9.822755059119765e-05, + "loss": 0.0044, + "step": 1143 + }, + { + "epoch": 0.8835682564201583, + "grad_norm": 0.018790990114212036, + "learning_rate": 9.822042477470265e-05, + "loss": 0.0048, + "step": 1144 + }, + { + "epoch": 0.8843406062946515, + "grad_norm": 0.023220032453536987, + "learning_rate": 9.821328492240715e-05, + "loss": 0.005, + "step": 1145 + }, + { + "epoch": 0.8851129561691446, + "grad_norm": 0.029522253200411797, + "learning_rate": 9.820613103638941e-05, + "loss": 0.0045, + "step": 1146 + }, + { + "epoch": 0.8858853060436378, + "grad_norm": 0.014386980794370174, + "learning_rate": 9.819896311873174e-05, + "loss": 0.0045, + "step": 1147 + }, + { + "epoch": 0.886657655918131, + "grad_norm": 0.010787330567836761, + "learning_rate": 9.819178117152053e-05, + "loss": 0.0045, + "step": 1148 + }, + { + "epoch": 0.8874300057926241, + "grad_norm": 0.013695201836526394, + "learning_rate": 9.81845851968463e-05, + "loss": 0.0052, + "step": 1149 + }, + { + "epoch": 0.8882023556671172, + "grad_norm": 0.013468950055539608, + "learning_rate": 9.817737519680362e-05, + "loss": 0.004, + "step": 1150 + }, + { + "epoch": 0.8889747055416104, + "grad_norm": 0.013891457580029964, + "learning_rate": 9.817015117349113e-05, + "loss": 0.0051, + "step": 1151 + }, + { + "epoch": 0.8897470554161035, + "grad_norm": 0.011809339746832848, + "learning_rate": 9.816291312901159e-05, + "loss": 0.0043, + "step": 1152 + }, + { + "epoch": 0.8905194052905966, + "grad_norm": 0.021739143878221512, + "learning_rate": 9.81556610654718e-05, + "loss": 0.0047, + "step": 1153 + }, + { + "epoch": 0.8912917551650897, + "grad_norm": 0.013957880437374115, + "learning_rate": 9.814839498498268e-05, + "loss": 0.0047, + "step": 1154 + }, + { + "epoch": 0.8920641050395829, + "grad_norm": 0.011841529048979282, + "learning_rate": 9.814111488965918e-05, + "loss": 0.0047, + "step": 1155 + }, + { + "epoch": 0.892836454914076, + "grad_norm": 0.02467193454504013, + "learning_rate": 9.81338207816204e-05, + "loss": 0.0048, + "step": 1156 + }, + { + "epoch": 0.8936088047885692, + "grad_norm": 0.020380204543471336, + "learning_rate": 9.812651266298944e-05, + "loss": 0.0046, + "step": 1157 + }, + { + "epoch": 0.8943811546630623, + "grad_norm": 0.026750663295388222, + "learning_rate": 9.811919053589355e-05, + "loss": 0.0049, + "step": 1158 + }, + { + "epoch": 0.8951535045375555, + "grad_norm": 0.016972428187727928, + "learning_rate": 9.8111854402464e-05, + "loss": 0.0042, + "step": 1159 + }, + { + "epoch": 0.8959258544120486, + "grad_norm": 0.014942123554646969, + "learning_rate": 9.810450426483618e-05, + "loss": 0.0046, + "step": 1160 + }, + { + "epoch": 0.8966982042865418, + "grad_norm": 0.013228869996964931, + "learning_rate": 9.809714012514953e-05, + "loss": 0.0047, + "step": 1161 + }, + { + "epoch": 0.8974705541610349, + "grad_norm": 0.015319743193686008, + "learning_rate": 9.808976198554755e-05, + "loss": 0.0049, + "step": 1162 + }, + { + "epoch": 0.8982429040355281, + "grad_norm": 0.01305676344782114, + "learning_rate": 9.80823698481779e-05, + "loss": 0.0048, + "step": 1163 + }, + { + "epoch": 0.8990152539100212, + "grad_norm": 0.023310331627726555, + "learning_rate": 9.807496371519219e-05, + "loss": 0.0051, + "step": 1164 + }, + { + "epoch": 0.8997876037845144, + "grad_norm": 0.020194826647639275, + "learning_rate": 9.806754358874617e-05, + "loss": 0.0051, + "step": 1165 + }, + { + "epoch": 0.9005599536590075, + "grad_norm": 0.01612170971930027, + "learning_rate": 9.806010947099971e-05, + "loss": 0.004, + "step": 1166 + }, + { + "epoch": 0.9013323035335007, + "grad_norm": 0.02737812139093876, + "learning_rate": 9.805266136411663e-05, + "loss": 0.0049, + "step": 1167 + }, + { + "epoch": 0.9021046534079938, + "grad_norm": 0.011050846427679062, + "learning_rate": 9.804519927026496e-05, + "loss": 0.0045, + "step": 1168 + }, + { + "epoch": 0.902877003282487, + "grad_norm": 0.013702969066798687, + "learning_rate": 9.803772319161672e-05, + "loss": 0.0047, + "step": 1169 + }, + { + "epoch": 0.9036493531569801, + "grad_norm": 0.018601300194859505, + "learning_rate": 9.803023313034797e-05, + "loss": 0.0047, + "step": 1170 + }, + { + "epoch": 0.9044217030314733, + "grad_norm": 0.034467071294784546, + "learning_rate": 9.802272908863897e-05, + "loss": 0.0054, + "step": 1171 + }, + { + "epoch": 0.9051940529059664, + "grad_norm": 0.025881033390760422, + "learning_rate": 9.801521106867388e-05, + "loss": 0.0047, + "step": 1172 + }, + { + "epoch": 0.9059664027804596, + "grad_norm": 0.05163715407252312, + "learning_rate": 9.800767907264105e-05, + "loss": 0.0049, + "step": 1173 + }, + { + "epoch": 0.9067387526549527, + "grad_norm": 0.03238176926970482, + "learning_rate": 9.800013310273288e-05, + "loss": 0.0049, + "step": 1174 + }, + { + "epoch": 0.9075111025294459, + "grad_norm": 0.022428739815950394, + "learning_rate": 9.799257316114579e-05, + "loss": 0.0046, + "step": 1175 + }, + { + "epoch": 0.9082834524039389, + "grad_norm": 0.04021529480814934, + "learning_rate": 9.79849992500803e-05, + "loss": 0.0046, + "step": 1176 + }, + { + "epoch": 0.9090558022784321, + "grad_norm": 0.04667231813073158, + "learning_rate": 9.797741137174102e-05, + "loss": 0.0053, + "step": 1177 + }, + { + "epoch": 0.9098281521529252, + "grad_norm": 0.01686747372150421, + "learning_rate": 9.796980952833656e-05, + "loss": 0.0052, + "step": 1178 + }, + { + "epoch": 0.9106005020274184, + "grad_norm": 0.028423011302947998, + "learning_rate": 9.796219372207966e-05, + "loss": 0.0055, + "step": 1179 + }, + { + "epoch": 0.9113728519019115, + "grad_norm": 0.04666873812675476, + "learning_rate": 9.795456395518709e-05, + "loss": 0.0052, + "step": 1180 + }, + { + "epoch": 0.9121452017764047, + "grad_norm": 0.019545091316103935, + "learning_rate": 9.79469202298797e-05, + "loss": 0.0051, + "step": 1181 + }, + { + "epoch": 0.9129175516508978, + "grad_norm": 0.021862614899873734, + "learning_rate": 9.793926254838237e-05, + "loss": 0.0051, + "step": 1182 + }, + { + "epoch": 0.913689901525391, + "grad_norm": 0.0275779590010643, + "learning_rate": 9.793159091292408e-05, + "loss": 0.0054, + "step": 1183 + }, + { + "epoch": 0.9144622513998841, + "grad_norm": 0.04146139696240425, + "learning_rate": 9.792390532573786e-05, + "loss": 0.0051, + "step": 1184 + }, + { + "epoch": 0.9152346012743773, + "grad_norm": 0.010904265567660332, + "learning_rate": 9.791620578906079e-05, + "loss": 0.0043, + "step": 1185 + }, + { + "epoch": 0.9160069511488704, + "grad_norm": 0.03226253017783165, + "learning_rate": 9.790849230513402e-05, + "loss": 0.0047, + "step": 1186 + }, + { + "epoch": 0.9167793010233636, + "grad_norm": 0.03302263468503952, + "learning_rate": 9.790076487620276e-05, + "loss": 0.0048, + "step": 1187 + }, + { + "epoch": 0.9175516508978567, + "grad_norm": 0.016255497932434082, + "learning_rate": 9.78930235045163e-05, + "loss": 0.005, + "step": 1188 + }, + { + "epoch": 0.9183240007723499, + "grad_norm": 0.03711313381791115, + "learning_rate": 9.788526819232795e-05, + "loss": 0.0055, + "step": 1189 + }, + { + "epoch": 0.919096350646843, + "grad_norm": 0.0345834456384182, + "learning_rate": 9.787749894189507e-05, + "loss": 0.0059, + "step": 1190 + }, + { + "epoch": 0.9198687005213362, + "grad_norm": 0.01827995665371418, + "learning_rate": 9.786971575547914e-05, + "loss": 0.0047, + "step": 1191 + }, + { + "epoch": 0.9206410503958293, + "grad_norm": 0.02552584744989872, + "learning_rate": 9.786191863534563e-05, + "loss": 0.0044, + "step": 1192 + }, + { + "epoch": 0.9214134002703225, + "grad_norm": 0.05019800364971161, + "learning_rate": 9.78541075837641e-05, + "loss": 0.0055, + "step": 1193 + }, + { + "epoch": 0.9221857501448156, + "grad_norm": 0.012669118121266365, + "learning_rate": 9.784628260300817e-05, + "loss": 0.0045, + "step": 1194 + }, + { + "epoch": 0.9229581000193088, + "grad_norm": 0.022647053003311157, + "learning_rate": 9.783844369535549e-05, + "loss": 0.0044, + "step": 1195 + }, + { + "epoch": 0.9237304498938019, + "grad_norm": 0.02889241650700569, + "learning_rate": 9.783059086308779e-05, + "loss": 0.0049, + "step": 1196 + }, + { + "epoch": 0.9245027997682951, + "grad_norm": 0.03231604024767876, + "learning_rate": 9.782272410849083e-05, + "loss": 0.005, + "step": 1197 + }, + { + "epoch": 0.9252751496427882, + "grad_norm": 0.015890056267380714, + "learning_rate": 9.781484343385442e-05, + "loss": 0.0046, + "step": 1198 + }, + { + "epoch": 0.9260474995172814, + "grad_norm": 0.031044993549585342, + "learning_rate": 9.780694884147245e-05, + "loss": 0.0049, + "step": 1199 + }, + { + "epoch": 0.9268198493917744, + "grad_norm": 0.024014852941036224, + "learning_rate": 9.779904033364284e-05, + "loss": 0.0048, + "step": 1200 + }, + { + "epoch": 0.9275921992662676, + "grad_norm": 0.010703184641897678, + "learning_rate": 9.779111791266757e-05, + "loss": 0.0041, + "step": 1201 + }, + { + "epoch": 0.9283645491407607, + "grad_norm": 0.01483464427292347, + "learning_rate": 9.778318158085268e-05, + "loss": 0.0048, + "step": 1202 + }, + { + "epoch": 0.9291368990152539, + "grad_norm": 0.01868997886776924, + "learning_rate": 9.777523134050821e-05, + "loss": 0.0049, + "step": 1203 + }, + { + "epoch": 0.929909248889747, + "grad_norm": 0.0175771564245224, + "learning_rate": 9.776726719394831e-05, + "loss": 0.0043, + "step": 1204 + }, + { + "epoch": 0.9306815987642402, + "grad_norm": 0.010368075221776962, + "learning_rate": 9.775928914349113e-05, + "loss": 0.0042, + "step": 1205 + }, + { + "epoch": 0.9314539486387333, + "grad_norm": 0.017229681834578514, + "learning_rate": 9.775129719145891e-05, + "loss": 0.0047, + "step": 1206 + }, + { + "epoch": 0.9322262985132265, + "grad_norm": 0.02685217559337616, + "learning_rate": 9.774329134017788e-05, + "loss": 0.0049, + "step": 1207 + }, + { + "epoch": 0.9329986483877196, + "grad_norm": 0.01164172776043415, + "learning_rate": 9.77352715919784e-05, + "loss": 0.0049, + "step": 1208 + }, + { + "epoch": 0.9337709982622128, + "grad_norm": 0.027054371312260628, + "learning_rate": 9.772723794919478e-05, + "loss": 0.0045, + "step": 1209 + }, + { + "epoch": 0.9345433481367059, + "grad_norm": 0.022101471200585365, + "learning_rate": 9.771919041416544e-05, + "loss": 0.0054, + "step": 1210 + }, + { + "epoch": 0.9353156980111991, + "grad_norm": 0.015406518243253231, + "learning_rate": 9.771112898923283e-05, + "loss": 0.0045, + "step": 1211 + }, + { + "epoch": 0.9360880478856922, + "grad_norm": 0.028950640931725502, + "learning_rate": 9.770305367674341e-05, + "loss": 0.0047, + "step": 1212 + }, + { + "epoch": 0.9368603977601854, + "grad_norm": 0.01438950840383768, + "learning_rate": 9.769496447904774e-05, + "loss": 0.0044, + "step": 1213 + }, + { + "epoch": 0.9376327476346785, + "grad_norm": 0.011271512135863304, + "learning_rate": 9.768686139850037e-05, + "loss": 0.0045, + "step": 1214 + }, + { + "epoch": 0.9384050975091717, + "grad_norm": 0.02822425588965416, + "learning_rate": 9.76787444374599e-05, + "loss": 0.0049, + "step": 1215 + }, + { + "epoch": 0.9391774473836648, + "grad_norm": 0.014436332508921623, + "learning_rate": 9.767061359828899e-05, + "loss": 0.0051, + "step": 1216 + }, + { + "epoch": 0.939949797258158, + "grad_norm": 0.013423058204352856, + "learning_rate": 9.766246888335437e-05, + "loss": 0.0048, + "step": 1217 + }, + { + "epoch": 0.9407221471326511, + "grad_norm": 0.01095044706016779, + "learning_rate": 9.765431029502672e-05, + "loss": 0.0046, + "step": 1218 + }, + { + "epoch": 0.9414944970071443, + "grad_norm": 0.01047494262456894, + "learning_rate": 9.764613783568082e-05, + "loss": 0.0044, + "step": 1219 + }, + { + "epoch": 0.9422668468816374, + "grad_norm": 0.020056355744600296, + "learning_rate": 9.763795150769548e-05, + "loss": 0.0045, + "step": 1220 + }, + { + "epoch": 0.9430391967561306, + "grad_norm": 0.014146436005830765, + "learning_rate": 9.762975131345356e-05, + "loss": 0.005, + "step": 1221 + }, + { + "epoch": 0.9438115466306237, + "grad_norm": 0.02325545623898506, + "learning_rate": 9.76215372553419e-05, + "loss": 0.0056, + "step": 1222 + }, + { + "epoch": 0.9445838965051169, + "grad_norm": 0.023392075672745705, + "learning_rate": 9.761330933575145e-05, + "loss": 0.0056, + "step": 1223 + }, + { + "epoch": 0.9453562463796099, + "grad_norm": 0.012304428964853287, + "learning_rate": 9.760506755707713e-05, + "loss": 0.0051, + "step": 1224 + }, + { + "epoch": 0.9461285962541031, + "grad_norm": 0.023236092180013657, + "learning_rate": 9.759681192171795e-05, + "loss": 0.0045, + "step": 1225 + }, + { + "epoch": 0.9469009461285962, + "grad_norm": 0.020777378231287003, + "learning_rate": 9.758854243207689e-05, + "loss": 0.0046, + "step": 1226 + }, + { + "epoch": 0.9476732960030894, + "grad_norm": 0.013657379895448685, + "learning_rate": 9.758025909056103e-05, + "loss": 0.0047, + "step": 1227 + }, + { + "epoch": 0.9484456458775825, + "grad_norm": 0.024491876363754272, + "learning_rate": 9.757196189958145e-05, + "loss": 0.0048, + "step": 1228 + }, + { + "epoch": 0.9492179957520757, + "grad_norm": 0.015795817598700523, + "learning_rate": 9.756365086155325e-05, + "loss": 0.0045, + "step": 1229 + }, + { + "epoch": 0.9499903456265688, + "grad_norm": 0.011541806161403656, + "learning_rate": 9.755532597889558e-05, + "loss": 0.0049, + "step": 1230 + }, + { + "epoch": 0.950762695501062, + "grad_norm": 0.02976318635046482, + "learning_rate": 9.75469872540316e-05, + "loss": 0.0048, + "step": 1231 + }, + { + "epoch": 0.9515350453755551, + "grad_norm": 0.01473459042608738, + "learning_rate": 9.75386346893885e-05, + "loss": 0.0044, + "step": 1232 + }, + { + "epoch": 0.9523073952500483, + "grad_norm": 0.014375674538314342, + "learning_rate": 9.753026828739756e-05, + "loss": 0.0047, + "step": 1233 + }, + { + "epoch": 0.9530797451245414, + "grad_norm": 0.03567088767886162, + "learning_rate": 9.7521888050494e-05, + "loss": 0.0051, + "step": 1234 + }, + { + "epoch": 0.9538520949990346, + "grad_norm": 0.018942657858133316, + "learning_rate": 9.75134939811171e-05, + "loss": 0.0042, + "step": 1235 + }, + { + "epoch": 0.9546244448735277, + "grad_norm": 0.024158213287591934, + "learning_rate": 9.750508608171018e-05, + "loss": 0.0048, + "step": 1236 + }, + { + "epoch": 0.9553967947480209, + "grad_norm": 0.04132556542754173, + "learning_rate": 9.749666435472059e-05, + "loss": 0.0049, + "step": 1237 + }, + { + "epoch": 0.956169144622514, + "grad_norm": 0.013448309153318405, + "learning_rate": 9.748822880259967e-05, + "loss": 0.005, + "step": 1238 + }, + { + "epoch": 0.9569414944970072, + "grad_norm": 0.034693315625190735, + "learning_rate": 9.747977942780281e-05, + "loss": 0.0047, + "step": 1239 + }, + { + "epoch": 0.9577138443715003, + "grad_norm": 0.019551943987607956, + "learning_rate": 9.747131623278943e-05, + "loss": 0.0053, + "step": 1240 + }, + { + "epoch": 0.9584861942459935, + "grad_norm": 0.01473858579993248, + "learning_rate": 9.746283922002295e-05, + "loss": 0.0052, + "step": 1241 + }, + { + "epoch": 0.9592585441204866, + "grad_norm": 0.035735420882701874, + "learning_rate": 9.745434839197082e-05, + "loss": 0.004, + "step": 1242 + }, + { + "epoch": 0.9600308939949798, + "grad_norm": 0.03530476614832878, + "learning_rate": 9.744584375110453e-05, + "loss": 0.0049, + "step": 1243 + }, + { + "epoch": 0.9608032438694729, + "grad_norm": 0.013710936531424522, + "learning_rate": 9.743732529989958e-05, + "loss": 0.0047, + "step": 1244 + }, + { + "epoch": 0.9615755937439661, + "grad_norm": 0.037209782749414444, + "learning_rate": 9.742879304083546e-05, + "loss": 0.0052, + "step": 1245 + }, + { + "epoch": 0.9623479436184592, + "grad_norm": 0.027198487892746925, + "learning_rate": 9.742024697639573e-05, + "loss": 0.0049, + "step": 1246 + }, + { + "epoch": 0.9631202934929524, + "grad_norm": 0.009933719411492348, + "learning_rate": 9.741168710906792e-05, + "loss": 0.0046, + "step": 1247 + }, + { + "epoch": 0.9638926433674454, + "grad_norm": 0.013061133213341236, + "learning_rate": 9.74031134413436e-05, + "loss": 0.0048, + "step": 1248 + }, + { + "epoch": 0.9646649932419386, + "grad_norm": 0.04397881403565407, + "learning_rate": 9.739452597571839e-05, + "loss": 0.005, + "step": 1249 + }, + { + "epoch": 0.9654373431164317, + "grad_norm": 0.01409953273832798, + "learning_rate": 9.738592471469188e-05, + "loss": 0.0049, + "step": 1250 + }, + { + "epoch": 0.9662096929909249, + "grad_norm": 0.027452891692519188, + "learning_rate": 9.737730966076766e-05, + "loss": 0.0043, + "step": 1251 + }, + { + "epoch": 0.966982042865418, + "grad_norm": 0.040638476610183716, + "learning_rate": 9.736868081645339e-05, + "loss": 0.0053, + "step": 1252 + }, + { + "epoch": 0.9677543927399112, + "grad_norm": 0.0253109373152256, + "learning_rate": 9.736003818426073e-05, + "loss": 0.0051, + "step": 1253 + }, + { + "epoch": 0.9685267426144043, + "grad_norm": 0.012483520433306694, + "learning_rate": 9.735138176670531e-05, + "loss": 0.0044, + "step": 1254 + }, + { + "epoch": 0.9692990924888975, + "grad_norm": 0.0266621932387352, + "learning_rate": 9.734271156630683e-05, + "loss": 0.005, + "step": 1255 + }, + { + "epoch": 0.9700714423633906, + "grad_norm": 0.037746869027614594, + "learning_rate": 9.733402758558896e-05, + "loss": 0.0053, + "step": 1256 + }, + { + "epoch": 0.9708437922378838, + "grad_norm": 0.012163268402218819, + "learning_rate": 9.73253298270794e-05, + "loss": 0.0046, + "step": 1257 + }, + { + "epoch": 0.9716161421123769, + "grad_norm": 0.017105646431446075, + "learning_rate": 9.731661829330986e-05, + "loss": 0.0049, + "step": 1258 + }, + { + "epoch": 0.97238849198687, + "grad_norm": 0.05115858465433121, + "learning_rate": 9.730789298681607e-05, + "loss": 0.0052, + "step": 1259 + }, + { + "epoch": 0.9731608418613632, + "grad_norm": 0.014376375824213028, + "learning_rate": 9.72991539101377e-05, + "loss": 0.0052, + "step": 1260 + }, + { + "epoch": 0.9739331917358564, + "grad_norm": 0.022228136658668518, + "learning_rate": 9.729040106581858e-05, + "loss": 0.0051, + "step": 1261 + }, + { + "epoch": 0.9747055416103495, + "grad_norm": 0.04539450630545616, + "learning_rate": 9.728163445640636e-05, + "loss": 0.0056, + "step": 1262 + }, + { + "epoch": 0.9754778914848427, + "grad_norm": 0.015013706870377064, + "learning_rate": 9.727285408445285e-05, + "loss": 0.005, + "step": 1263 + }, + { + "epoch": 0.9762502413593358, + "grad_norm": 0.02358063869178295, + "learning_rate": 9.726405995251377e-05, + "loss": 0.0049, + "step": 1264 + }, + { + "epoch": 0.977022591233829, + "grad_norm": 0.02099069207906723, + "learning_rate": 9.72552520631489e-05, + "loss": 0.0049, + "step": 1265 + }, + { + "epoch": 0.9777949411083221, + "grad_norm": 0.031068965792655945, + "learning_rate": 9.724643041892199e-05, + "loss": 0.0048, + "step": 1266 + }, + { + "epoch": 0.9785672909828153, + "grad_norm": 0.019800299778580666, + "learning_rate": 9.72375950224008e-05, + "loss": 0.0043, + "step": 1267 + }, + { + "epoch": 0.9793396408573084, + "grad_norm": 0.015887726098299026, + "learning_rate": 9.722874587615711e-05, + "loss": 0.0042, + "step": 1268 + }, + { + "epoch": 0.9801119907318016, + "grad_norm": 0.020139768719673157, + "learning_rate": 9.72198829827667e-05, + "loss": 0.0052, + "step": 1269 + }, + { + "epoch": 0.9808843406062947, + "grad_norm": 0.012581721879541874, + "learning_rate": 9.721100634480934e-05, + "loss": 0.0047, + "step": 1270 + }, + { + "epoch": 0.9816566904807877, + "grad_norm": 0.014556423760950565, + "learning_rate": 9.72021159648688e-05, + "loss": 0.0043, + "step": 1271 + }, + { + "epoch": 0.9824290403552809, + "grad_norm": 0.01289236731827259, + "learning_rate": 9.719321184553286e-05, + "loss": 0.0048, + "step": 1272 + }, + { + "epoch": 0.983201390229774, + "grad_norm": 0.013001179322600365, + "learning_rate": 9.718429398939329e-05, + "loss": 0.0049, + "step": 1273 + }, + { + "epoch": 0.9839737401042672, + "grad_norm": 0.010636583901941776, + "learning_rate": 9.717536239904586e-05, + "loss": 0.0049, + "step": 1274 + }, + { + "epoch": 0.9847460899787603, + "grad_norm": 0.014941484667360783, + "learning_rate": 9.716641707709035e-05, + "loss": 0.0047, + "step": 1275 + }, + { + "epoch": 0.9855184398532535, + "grad_norm": 0.01230227667838335, + "learning_rate": 9.715745802613052e-05, + "loss": 0.0046, + "step": 1276 + }, + { + "epoch": 0.9862907897277466, + "grad_norm": 0.019549479708075523, + "learning_rate": 9.714848524877413e-05, + "loss": 0.0058, + "step": 1277 + }, + { + "epoch": 0.9870631396022398, + "grad_norm": 0.015148761682212353, + "learning_rate": 9.713949874763296e-05, + "loss": 0.0046, + "step": 1278 + }, + { + "epoch": 0.9878354894767329, + "grad_norm": 0.012779118493199348, + "learning_rate": 9.713049852532275e-05, + "loss": 0.0047, + "step": 1279 + }, + { + "epoch": 0.9886078393512261, + "grad_norm": 0.02783294767141342, + "learning_rate": 9.712148458446322e-05, + "loss": 0.005, + "step": 1280 + }, + { + "epoch": 0.9893801892257192, + "grad_norm": 0.017038974910974503, + "learning_rate": 9.711245692767814e-05, + "loss": 0.0042, + "step": 1281 + }, + { + "epoch": 0.9901525391002124, + "grad_norm": 0.015469277277588844, + "learning_rate": 9.710341555759523e-05, + "loss": 0.0042, + "step": 1282 + }, + { + "epoch": 0.9909248889747055, + "grad_norm": 0.015973426401615143, + "learning_rate": 9.709436047684624e-05, + "loss": 0.0045, + "step": 1283 + }, + { + "epoch": 0.9916972388491987, + "grad_norm": 0.02046266943216324, + "learning_rate": 9.708529168806686e-05, + "loss": 0.0051, + "step": 1284 + }, + { + "epoch": 0.9924695887236918, + "grad_norm": 0.01801963895559311, + "learning_rate": 9.70762091938968e-05, + "loss": 0.0048, + "step": 1285 + }, + { + "epoch": 0.993241938598185, + "grad_norm": 0.013758906163275242, + "learning_rate": 9.706711299697976e-05, + "loss": 0.0051, + "step": 1286 + }, + { + "epoch": 0.9940142884726781, + "grad_norm": 0.021490855142474174, + "learning_rate": 9.70580030999634e-05, + "loss": 0.0047, + "step": 1287 + }, + { + "epoch": 0.9947866383471713, + "grad_norm": 0.01400856114923954, + "learning_rate": 9.704887950549943e-05, + "loss": 0.0054, + "step": 1288 + }, + { + "epoch": 0.9955589882216644, + "grad_norm": 0.01435021311044693, + "learning_rate": 9.703974221624351e-05, + "loss": 0.0045, + "step": 1289 + }, + { + "epoch": 0.9963313380961576, + "grad_norm": 0.014206396415829659, + "learning_rate": 9.703059123485523e-05, + "loss": 0.0047, + "step": 1290 + }, + { + "epoch": 0.9971036879706507, + "grad_norm": 0.012883146293461323, + "learning_rate": 9.702142656399824e-05, + "loss": 0.0049, + "step": 1291 + }, + { + "epoch": 0.9978760378451439, + "grad_norm": 0.022087296470999718, + "learning_rate": 9.701224820634019e-05, + "loss": 0.0042, + "step": 1292 + }, + { + "epoch": 0.998648387719637, + "grad_norm": 0.01635371334850788, + "learning_rate": 9.700305616455266e-05, + "loss": 0.0043, + "step": 1293 + }, + { + "epoch": 0.9994207375941302, + "grad_norm": 0.016066590324044228, + "learning_rate": 9.69938504413112e-05, + "loss": 0.0054, + "step": 1294 + }, + { + "epoch": 1.0007723498744932, + "grad_norm": 0.03909831866621971, + "learning_rate": 9.698463103929542e-05, + "loss": 0.0095, + "step": 1295 + }, + { + "epoch": 1.0015446997489863, + "grad_norm": 0.009909950196743011, + "learning_rate": 9.697539796118884e-05, + "loss": 0.0042, + "step": 1296 + }, + { + "epoch": 1.0023170496234795, + "grad_norm": 0.011213305406272411, + "learning_rate": 9.6966151209679e-05, + "loss": 0.0047, + "step": 1297 + }, + { + "epoch": 1.0030893994979726, + "grad_norm": 0.03059287928044796, + "learning_rate": 9.695689078745737e-05, + "loss": 0.0049, + "step": 1298 + }, + { + "epoch": 1.0038617493724658, + "grad_norm": 0.030008280649781227, + "learning_rate": 9.694761669721947e-05, + "loss": 0.0049, + "step": 1299 + }, + { + "epoch": 1.004634099246959, + "grad_norm": 0.022881874814629555, + "learning_rate": 9.693832894166479e-05, + "loss": 0.0049, + "step": 1300 + }, + { + "epoch": 1.005406449121452, + "grad_norm": 0.021256916224956512, + "learning_rate": 9.69290275234967e-05, + "loss": 0.0048, + "step": 1301 + }, + { + "epoch": 1.0061787989959452, + "grad_norm": 0.020482858642935753, + "learning_rate": 9.691971244542266e-05, + "loss": 0.0049, + "step": 1302 + }, + { + "epoch": 1.0069511488704384, + "grad_norm": 0.015014131553471088, + "learning_rate": 9.691038371015406e-05, + "loss": 0.0043, + "step": 1303 + }, + { + "epoch": 1.0077234987449315, + "grad_norm": 0.022472646087408066, + "learning_rate": 9.690104132040627e-05, + "loss": 0.0047, + "step": 1304 + }, + { + "epoch": 1.0084958486194247, + "grad_norm": 0.01419077068567276, + "learning_rate": 9.689168527889863e-05, + "loss": 0.0048, + "step": 1305 + }, + { + "epoch": 1.0092681984939178, + "grad_norm": 0.016903279349207878, + "learning_rate": 9.688231558835445e-05, + "loss": 0.0044, + "step": 1306 + }, + { + "epoch": 1.010040548368411, + "grad_norm": 0.014411347918212414, + "learning_rate": 9.687293225150104e-05, + "loss": 0.0045, + "step": 1307 + }, + { + "epoch": 1.010812898242904, + "grad_norm": 0.010093619115650654, + "learning_rate": 9.686353527106967e-05, + "loss": 0.005, + "step": 1308 + }, + { + "epoch": 1.0115852481173973, + "grad_norm": 0.010465381667017937, + "learning_rate": 9.685412464979554e-05, + "loss": 0.0042, + "step": 1309 + }, + { + "epoch": 1.0123575979918904, + "grad_norm": 0.012797888368368149, + "learning_rate": 9.684470039041786e-05, + "loss": 0.0047, + "step": 1310 + }, + { + "epoch": 1.0131299478663836, + "grad_norm": 0.011513960547745228, + "learning_rate": 9.683526249567982e-05, + "loss": 0.0046, + "step": 1311 + }, + { + "epoch": 1.0139022977408767, + "grad_norm": 0.013557342812418938, + "learning_rate": 9.682581096832856e-05, + "loss": 0.0043, + "step": 1312 + }, + { + "epoch": 1.0146746476153699, + "grad_norm": 0.012859497219324112, + "learning_rate": 9.681634581111519e-05, + "loss": 0.0044, + "step": 1313 + }, + { + "epoch": 1.015446997489863, + "grad_norm": 0.018372129648923874, + "learning_rate": 9.68068670267948e-05, + "loss": 0.0043, + "step": 1314 + }, + { + "epoch": 1.0162193473643562, + "grad_norm": 0.012753891758620739, + "learning_rate": 9.679737461812641e-05, + "loss": 0.004, + "step": 1315 + }, + { + "epoch": 1.0169916972388493, + "grad_norm": 0.010481827892363071, + "learning_rate": 9.678786858787306e-05, + "loss": 0.0041, + "step": 1316 + }, + { + "epoch": 1.0177640471133425, + "grad_norm": 0.018584148958325386, + "learning_rate": 9.677834893880168e-05, + "loss": 0.0049, + "step": 1317 + }, + { + "epoch": 1.0185363969878356, + "grad_norm": 0.012177852913737297, + "learning_rate": 9.676881567368325e-05, + "loss": 0.0045, + "step": 1318 + }, + { + "epoch": 1.0193087468623285, + "grad_norm": 0.011105979792773724, + "learning_rate": 9.675926879529268e-05, + "loss": 0.0042, + "step": 1319 + }, + { + "epoch": 1.0200810967368217, + "grad_norm": 0.01322960201650858, + "learning_rate": 9.674970830640881e-05, + "loss": 0.0042, + "step": 1320 + }, + { + "epoch": 1.0208534466113148, + "grad_norm": 0.0165097676217556, + "learning_rate": 9.674013420981447e-05, + "loss": 0.0046, + "step": 1321 + }, + { + "epoch": 1.021625796485808, + "grad_norm": 0.011996297165751457, + "learning_rate": 9.673054650829645e-05, + "loss": 0.0038, + "step": 1322 + }, + { + "epoch": 1.0223981463603011, + "grad_norm": 0.011866395361721516, + "learning_rate": 9.672094520464552e-05, + "loss": 0.0044, + "step": 1323 + }, + { + "epoch": 1.0231704962347943, + "grad_norm": 0.010129679925739765, + "learning_rate": 9.671133030165635e-05, + "loss": 0.0041, + "step": 1324 + }, + { + "epoch": 1.0239428461092874, + "grad_norm": 0.01069657877087593, + "learning_rate": 9.670170180212764e-05, + "loss": 0.0041, + "step": 1325 + }, + { + "epoch": 1.0247151959837806, + "grad_norm": 0.014417791739106178, + "learning_rate": 9.669205970886197e-05, + "loss": 0.0048, + "step": 1326 + }, + { + "epoch": 1.0254875458582737, + "grad_norm": 0.011403602547943592, + "learning_rate": 9.668240402466597e-05, + "loss": 0.0046, + "step": 1327 + }, + { + "epoch": 1.0262598957327669, + "grad_norm": 0.0127126919105649, + "learning_rate": 9.667273475235017e-05, + "loss": 0.0048, + "step": 1328 + }, + { + "epoch": 1.02703224560726, + "grad_norm": 0.0228350218385458, + "learning_rate": 9.666305189472903e-05, + "loss": 0.0048, + "step": 1329 + }, + { + "epoch": 1.0278045954817532, + "grad_norm": 0.012411186471581459, + "learning_rate": 9.665335545462102e-05, + "loss": 0.0042, + "step": 1330 + }, + { + "epoch": 1.0285769453562463, + "grad_norm": 0.01859738491475582, + "learning_rate": 9.664364543484851e-05, + "loss": 0.0047, + "step": 1331 + }, + { + "epoch": 1.0293492952307395, + "grad_norm": 0.01865040697157383, + "learning_rate": 9.66339218382379e-05, + "loss": 0.0045, + "step": 1332 + }, + { + "epoch": 1.0301216451052326, + "grad_norm": 0.02105005457997322, + "learning_rate": 9.662418466761947e-05, + "loss": 0.0046, + "step": 1333 + }, + { + "epoch": 1.0308939949797258, + "grad_norm": 0.010967560112476349, + "learning_rate": 9.661443392582746e-05, + "loss": 0.0047, + "step": 1334 + }, + { + "epoch": 1.031666344854219, + "grad_norm": 0.040137216448783875, + "learning_rate": 9.66046696157001e-05, + "loss": 0.0046, + "step": 1335 + }, + { + "epoch": 1.032438694728712, + "grad_norm": 0.018981043249368668, + "learning_rate": 9.659489174007951e-05, + "loss": 0.0046, + "step": 1336 + }, + { + "epoch": 1.0332110446032052, + "grad_norm": 0.01990450546145439, + "learning_rate": 9.658510030181184e-05, + "loss": 0.0046, + "step": 1337 + }, + { + "epoch": 1.0339833944776984, + "grad_norm": 0.01513310894370079, + "learning_rate": 9.657529530374713e-05, + "loss": 0.0039, + "step": 1338 + }, + { + "epoch": 1.0347557443521915, + "grad_norm": 0.024439837783575058, + "learning_rate": 9.656547674873934e-05, + "loss": 0.0045, + "step": 1339 + }, + { + "epoch": 1.0355280942266847, + "grad_norm": 0.01174076646566391, + "learning_rate": 9.655564463964646e-05, + "loss": 0.0047, + "step": 1340 + }, + { + "epoch": 1.0363004441011778, + "grad_norm": 0.019211484119296074, + "learning_rate": 9.654579897933033e-05, + "loss": 0.0044, + "step": 1341 + }, + { + "epoch": 1.037072793975671, + "grad_norm": 0.01669737510383129, + "learning_rate": 9.653593977065685e-05, + "loss": 0.0045, + "step": 1342 + }, + { + "epoch": 1.0378451438501641, + "grad_norm": 0.01965293660759926, + "learning_rate": 9.652606701649574e-05, + "loss": 0.0045, + "step": 1343 + }, + { + "epoch": 1.0386174937246573, + "grad_norm": 0.013414140790700912, + "learning_rate": 9.651618071972075e-05, + "loss": 0.0044, + "step": 1344 + }, + { + "epoch": 1.0393898435991504, + "grad_norm": 0.016711724922060966, + "learning_rate": 9.650628088320953e-05, + "loss": 0.0045, + "step": 1345 + }, + { + "epoch": 1.0401621934736436, + "grad_norm": 0.025946568697690964, + "learning_rate": 9.649636750984368e-05, + "loss": 0.0045, + "step": 1346 + }, + { + "epoch": 1.0409345433481367, + "grad_norm": 0.010373227298259735, + "learning_rate": 9.648644060250875e-05, + "loss": 0.0044, + "step": 1347 + }, + { + "epoch": 1.0417068932226299, + "grad_norm": 0.015023860149085522, + "learning_rate": 9.647650016409421e-05, + "loss": 0.0043, + "step": 1348 + }, + { + "epoch": 1.042479243097123, + "grad_norm": 0.02561667561531067, + "learning_rate": 9.646654619749352e-05, + "loss": 0.0046, + "step": 1349 + }, + { + "epoch": 1.0432515929716162, + "grad_norm": 0.011588500812649727, + "learning_rate": 9.645657870560401e-05, + "loss": 0.0049, + "step": 1350 + }, + { + "epoch": 1.0440239428461093, + "grad_norm": 0.018138093873858452, + "learning_rate": 9.644659769132696e-05, + "loss": 0.0043, + "step": 1351 + }, + { + "epoch": 1.0447962927206025, + "grad_norm": 0.013827506452798843, + "learning_rate": 9.643660315756764e-05, + "loss": 0.0043, + "step": 1352 + }, + { + "epoch": 1.0455686425950956, + "grad_norm": 0.015662331134080887, + "learning_rate": 9.64265951072352e-05, + "loss": 0.0041, + "step": 1353 + }, + { + "epoch": 1.0463409924695888, + "grad_norm": 0.01205469761043787, + "learning_rate": 9.641657354324273e-05, + "loss": 0.004, + "step": 1354 + }, + { + "epoch": 1.047113342344082, + "grad_norm": 0.017667818814516068, + "learning_rate": 9.640653846850728e-05, + "loss": 0.0043, + "step": 1355 + }, + { + "epoch": 1.047885692218575, + "grad_norm": 0.039402734488248825, + "learning_rate": 9.63964898859498e-05, + "loss": 0.0049, + "step": 1356 + }, + { + "epoch": 1.0486580420930682, + "grad_norm": 0.01787903904914856, + "learning_rate": 9.638642779849523e-05, + "loss": 0.004, + "step": 1357 + }, + { + "epoch": 1.0494303919675614, + "grad_norm": 0.02071666158735752, + "learning_rate": 9.637635220907235e-05, + "loss": 0.0048, + "step": 1358 + }, + { + "epoch": 1.0502027418420545, + "grad_norm": 0.016656925901770592, + "learning_rate": 9.636626312061395e-05, + "loss": 0.0047, + "step": 1359 + }, + { + "epoch": 1.0509750917165477, + "grad_norm": 0.011745874769985676, + "learning_rate": 9.635616053605672e-05, + "loss": 0.0045, + "step": 1360 + }, + { + "epoch": 1.0517474415910408, + "grad_norm": 0.018410833552479744, + "learning_rate": 9.634604445834127e-05, + "loss": 0.0049, + "step": 1361 + }, + { + "epoch": 1.052519791465534, + "grad_norm": 0.01569872908294201, + "learning_rate": 9.633591489041213e-05, + "loss": 0.0047, + "step": 1362 + }, + { + "epoch": 1.0532921413400271, + "grad_norm": 0.014584588818252087, + "learning_rate": 9.632577183521782e-05, + "loss": 0.0046, + "step": 1363 + }, + { + "epoch": 1.0540644912145203, + "grad_norm": 0.023064883425831795, + "learning_rate": 9.631561529571069e-05, + "loss": 0.0047, + "step": 1364 + }, + { + "epoch": 1.0548368410890134, + "grad_norm": 0.015406741760671139, + "learning_rate": 9.630544527484708e-05, + "loss": 0.0042, + "step": 1365 + }, + { + "epoch": 1.0556091909635064, + "grad_norm": 0.014026117511093616, + "learning_rate": 9.629526177558725e-05, + "loss": 0.0046, + "step": 1366 + }, + { + "epoch": 1.0563815408379995, + "grad_norm": 0.025568680837750435, + "learning_rate": 9.628506480089535e-05, + "loss": 0.005, + "step": 1367 + }, + { + "epoch": 1.0571538907124927, + "grad_norm": 0.015331593342125416, + "learning_rate": 9.627485435373948e-05, + "loss": 0.0043, + "step": 1368 + }, + { + "epoch": 1.0579262405869858, + "grad_norm": 0.01583258807659149, + "learning_rate": 9.626463043709168e-05, + "loss": 0.0046, + "step": 1369 + }, + { + "epoch": 1.058698590461479, + "grad_norm": 0.018904291093349457, + "learning_rate": 9.625439305392784e-05, + "loss": 0.0044, + "step": 1370 + }, + { + "epoch": 1.0594709403359721, + "grad_norm": 0.01609903946518898, + "learning_rate": 9.624414220722784e-05, + "loss": 0.0044, + "step": 1371 + }, + { + "epoch": 1.0602432902104653, + "grad_norm": 0.019533008337020874, + "learning_rate": 9.623387789997547e-05, + "loss": 0.0045, + "step": 1372 + }, + { + "epoch": 1.0610156400849584, + "grad_norm": 0.021502437070012093, + "learning_rate": 9.622360013515838e-05, + "loss": 0.0046, + "step": 1373 + }, + { + "epoch": 1.0617879899594516, + "grad_norm": 0.033865705132484436, + "learning_rate": 9.62133089157682e-05, + "loss": 0.0051, + "step": 1374 + }, + { + "epoch": 1.0625603398339447, + "grad_norm": 0.021266119554638863, + "learning_rate": 9.620300424480046e-05, + "loss": 0.0048, + "step": 1375 + }, + { + "epoch": 1.0633326897084379, + "grad_norm": 0.029526852071285248, + "learning_rate": 9.619268612525461e-05, + "loss": 0.0044, + "step": 1376 + }, + { + "epoch": 1.064105039582931, + "grad_norm": 0.02488149330019951, + "learning_rate": 9.618235456013397e-05, + "loss": 0.0047, + "step": 1377 + }, + { + "epoch": 1.0648773894574242, + "grad_norm": 0.020261965692043304, + "learning_rate": 9.617200955244586e-05, + "loss": 0.0046, + "step": 1378 + }, + { + "epoch": 1.0656497393319173, + "grad_norm": 0.01376634556800127, + "learning_rate": 9.616165110520143e-05, + "loss": 0.0044, + "step": 1379 + }, + { + "epoch": 1.0664220892064105, + "grad_norm": 0.03926656395196915, + "learning_rate": 9.615127922141576e-05, + "loss": 0.0044, + "step": 1380 + }, + { + "epoch": 1.0671944390809036, + "grad_norm": 0.022135786712169647, + "learning_rate": 9.614089390410788e-05, + "loss": 0.0049, + "step": 1381 + }, + { + "epoch": 1.0679667889553968, + "grad_norm": 0.017217369750142097, + "learning_rate": 9.61304951563007e-05, + "loss": 0.0046, + "step": 1382 + }, + { + "epoch": 1.06873913882989, + "grad_norm": 0.02201268821954727, + "learning_rate": 9.612008298102104e-05, + "loss": 0.0046, + "step": 1383 + }, + { + "epoch": 1.069511488704383, + "grad_norm": 0.024450382217764854, + "learning_rate": 9.610965738129963e-05, + "loss": 0.0047, + "step": 1384 + }, + { + "epoch": 1.0702838385788762, + "grad_norm": 0.01855049841105938, + "learning_rate": 9.609921836017113e-05, + "loss": 0.0049, + "step": 1385 + }, + { + "epoch": 1.0710561884533694, + "grad_norm": 0.032038744539022446, + "learning_rate": 9.608876592067404e-05, + "loss": 0.0045, + "step": 1386 + }, + { + "epoch": 1.0718285383278625, + "grad_norm": 0.011334764771163464, + "learning_rate": 9.607830006585087e-05, + "loss": 0.0044, + "step": 1387 + }, + { + "epoch": 1.0726008882023557, + "grad_norm": 0.01075075939297676, + "learning_rate": 9.606782079874794e-05, + "loss": 0.0046, + "step": 1388 + }, + { + "epoch": 1.0733732380768488, + "grad_norm": 0.01240682415664196, + "learning_rate": 9.605732812241553e-05, + "loss": 0.0043, + "step": 1389 + }, + { + "epoch": 1.074145587951342, + "grad_norm": 0.014968945644795895, + "learning_rate": 9.604682203990778e-05, + "loss": 0.0039, + "step": 1390 + }, + { + "epoch": 1.0749179378258351, + "grad_norm": 0.014436611905694008, + "learning_rate": 9.60363025542828e-05, + "loss": 0.0041, + "step": 1391 + }, + { + "epoch": 1.0756902877003283, + "grad_norm": 0.011069850996136665, + "learning_rate": 9.602576966860251e-05, + "loss": 0.004, + "step": 1392 + }, + { + "epoch": 1.0764626375748214, + "grad_norm": 0.01463300921022892, + "learning_rate": 9.60152233859328e-05, + "loss": 0.0047, + "step": 1393 + }, + { + "epoch": 1.0772349874493146, + "grad_norm": 0.020886411890387535, + "learning_rate": 9.600466370934345e-05, + "loss": 0.004, + "step": 1394 + }, + { + "epoch": 1.0780073373238077, + "grad_norm": 0.013697480782866478, + "learning_rate": 9.599409064190811e-05, + "loss": 0.0046, + "step": 1395 + }, + { + "epoch": 1.0787796871983009, + "grad_norm": 0.01161142997443676, + "learning_rate": 9.598350418670434e-05, + "loss": 0.0039, + "step": 1396 + }, + { + "epoch": 1.079552037072794, + "grad_norm": 0.013734615407884121, + "learning_rate": 9.597290434681363e-05, + "loss": 0.0049, + "step": 1397 + }, + { + "epoch": 1.0803243869472872, + "grad_norm": 0.014882289804518223, + "learning_rate": 9.596229112532132e-05, + "loss": 0.0043, + "step": 1398 + }, + { + "epoch": 1.0810967368217803, + "grad_norm": 0.011839316226541996, + "learning_rate": 9.595166452531663e-05, + "loss": 0.0044, + "step": 1399 + }, + { + "epoch": 1.0818690866962735, + "grad_norm": 0.015712959691882133, + "learning_rate": 9.594102454989275e-05, + "loss": 0.0044, + "step": 1400 + }, + { + "epoch": 1.0826414365707666, + "grad_norm": 0.008323220536112785, + "learning_rate": 9.593037120214672e-05, + "loss": 0.0044, + "step": 1401 + }, + { + "epoch": 1.0834137864452598, + "grad_norm": 0.023797884583473206, + "learning_rate": 9.591970448517946e-05, + "loss": 0.0048, + "step": 1402 + }, + { + "epoch": 1.084186136319753, + "grad_norm": 0.021375704556703568, + "learning_rate": 9.590902440209577e-05, + "loss": 0.0051, + "step": 1403 + }, + { + "epoch": 1.084958486194246, + "grad_norm": 0.012305250391364098, + "learning_rate": 9.58983309560044e-05, + "loss": 0.0048, + "step": 1404 + }, + { + "epoch": 1.0857308360687392, + "grad_norm": 0.03434315323829651, + "learning_rate": 9.588762415001795e-05, + "loss": 0.0043, + "step": 1405 + }, + { + "epoch": 1.0865031859432324, + "grad_norm": 0.018410326912999153, + "learning_rate": 9.587690398725288e-05, + "loss": 0.005, + "step": 1406 + }, + { + "epoch": 1.0872755358177255, + "grad_norm": 0.02015906572341919, + "learning_rate": 9.586617047082962e-05, + "loss": 0.004, + "step": 1407 + }, + { + "epoch": 1.0880478856922187, + "grad_norm": 0.06614932417869568, + "learning_rate": 9.585542360387238e-05, + "loss": 0.0047, + "step": 1408 + }, + { + "epoch": 1.0888202355667118, + "grad_norm": 0.014235509559512138, + "learning_rate": 9.584466338950937e-05, + "loss": 0.0051, + "step": 1409 + }, + { + "epoch": 1.089592585441205, + "grad_norm": 0.01355188898742199, + "learning_rate": 9.583388983087258e-05, + "loss": 0.0045, + "step": 1410 + }, + { + "epoch": 1.0903649353156981, + "grad_norm": 0.022596238180994987, + "learning_rate": 9.582310293109798e-05, + "loss": 0.0045, + "step": 1411 + }, + { + "epoch": 1.0911372851901913, + "grad_norm": 0.02476441115140915, + "learning_rate": 9.581230269332533e-05, + "loss": 0.0049, + "step": 1412 + }, + { + "epoch": 1.0919096350646842, + "grad_norm": 0.013642823323607445, + "learning_rate": 9.580148912069836e-05, + "loss": 0.0044, + "step": 1413 + }, + { + "epoch": 1.0926819849391776, + "grad_norm": 0.026510091498494148, + "learning_rate": 9.579066221636459e-05, + "loss": 0.0038, + "step": 1414 + }, + { + "epoch": 1.0934543348136705, + "grad_norm": 0.036978624761104584, + "learning_rate": 9.57798219834755e-05, + "loss": 0.0052, + "step": 1415 + }, + { + "epoch": 1.0942266846881636, + "grad_norm": 0.030871255323290825, + "learning_rate": 9.576896842518643e-05, + "loss": 0.0047, + "step": 1416 + }, + { + "epoch": 1.0949990345626568, + "grad_norm": 0.028220554813742638, + "learning_rate": 9.575810154465658e-05, + "loss": 0.0047, + "step": 1417 + }, + { + "epoch": 1.09577138443715, + "grad_norm": 0.02838347852230072, + "learning_rate": 9.574722134504904e-05, + "loss": 0.0053, + "step": 1418 + }, + { + "epoch": 1.096543734311643, + "grad_norm": 0.017904307693243027, + "learning_rate": 9.573632782953075e-05, + "loss": 0.0046, + "step": 1419 + }, + { + "epoch": 1.0973160841861362, + "grad_norm": 0.018795479089021683, + "learning_rate": 9.572542100127258e-05, + "loss": 0.0049, + "step": 1420 + }, + { + "epoch": 1.0980884340606294, + "grad_norm": 0.021671850234270096, + "learning_rate": 9.571450086344922e-05, + "loss": 0.0045, + "step": 1421 + }, + { + "epoch": 1.0988607839351225, + "grad_norm": 0.014755095355212688, + "learning_rate": 9.570356741923927e-05, + "loss": 0.0043, + "step": 1422 + }, + { + "epoch": 1.0996331338096157, + "grad_norm": 0.03559665009379387, + "learning_rate": 9.569262067182518e-05, + "loss": 0.0048, + "step": 1423 + }, + { + "epoch": 1.1004054836841088, + "grad_norm": 0.021603044122457504, + "learning_rate": 9.56816606243933e-05, + "loss": 0.0048, + "step": 1424 + }, + { + "epoch": 1.101177833558602, + "grad_norm": 0.012603303417563438, + "learning_rate": 9.567068728013384e-05, + "loss": 0.0047, + "step": 1425 + }, + { + "epoch": 1.1019501834330951, + "grad_norm": 0.021283099427819252, + "learning_rate": 9.565970064224085e-05, + "loss": 0.005, + "step": 1426 + }, + { + "epoch": 1.1027225333075883, + "grad_norm": 0.028067365288734436, + "learning_rate": 9.56487007139123e-05, + "loss": 0.0046, + "step": 1427 + }, + { + "epoch": 1.1034948831820814, + "grad_norm": 0.013425813987851143, + "learning_rate": 9.563768749834998e-05, + "loss": 0.0042, + "step": 1428 + }, + { + "epoch": 1.1042672330565746, + "grad_norm": 0.022721700370311737, + "learning_rate": 9.562666099875959e-05, + "loss": 0.0043, + "step": 1429 + }, + { + "epoch": 1.1050395829310677, + "grad_norm": 0.03619357943534851, + "learning_rate": 9.561562121835066e-05, + "loss": 0.0051, + "step": 1430 + }, + { + "epoch": 1.105811932805561, + "grad_norm": 0.013147195801138878, + "learning_rate": 9.560456816033662e-05, + "loss": 0.0047, + "step": 1431 + }, + { + "epoch": 1.106584282680054, + "grad_norm": 0.03599505499005318, + "learning_rate": 9.559350182793475e-05, + "loss": 0.0047, + "step": 1432 + }, + { + "epoch": 1.1073566325545472, + "grad_norm": 0.01628255285322666, + "learning_rate": 9.558242222436617e-05, + "loss": 0.0045, + "step": 1433 + }, + { + "epoch": 1.1081289824290403, + "grad_norm": 0.019326291978359222, + "learning_rate": 9.557132935285591e-05, + "loss": 0.0045, + "step": 1434 + }, + { + "epoch": 1.1089013323035335, + "grad_norm": 0.01495333295315504, + "learning_rate": 9.556022321663283e-05, + "loss": 0.0046, + "step": 1435 + }, + { + "epoch": 1.1096736821780266, + "grad_norm": 0.014609484933316708, + "learning_rate": 9.554910381892964e-05, + "loss": 0.0045, + "step": 1436 + }, + { + "epoch": 1.1104460320525198, + "grad_norm": 0.01904207468032837, + "learning_rate": 9.553797116298295e-05, + "loss": 0.0035, + "step": 1437 + }, + { + "epoch": 1.111218381927013, + "grad_norm": 0.012693614698946476, + "learning_rate": 9.552682525203319e-05, + "loss": 0.0044, + "step": 1438 + }, + { + "epoch": 1.111990731801506, + "grad_norm": 0.010756377130746841, + "learning_rate": 9.551566608932467e-05, + "loss": 0.0043, + "step": 1439 + }, + { + "epoch": 1.1127630816759992, + "grad_norm": 0.013393844477832317, + "learning_rate": 9.550449367810557e-05, + "loss": 0.0049, + "step": 1440 + }, + { + "epoch": 1.1135354315504924, + "grad_norm": 0.0180951077491045, + "learning_rate": 9.549330802162789e-05, + "loss": 0.0055, + "step": 1441 + }, + { + "epoch": 1.1143077814249855, + "grad_norm": 0.013738658279180527, + "learning_rate": 9.54821091231475e-05, + "loss": 0.0052, + "step": 1442 + }, + { + "epoch": 1.1150801312994787, + "grad_norm": 0.01545040961354971, + "learning_rate": 9.547089698592416e-05, + "loss": 0.0044, + "step": 1443 + }, + { + "epoch": 1.1158524811739718, + "grad_norm": 0.012175020761787891, + "learning_rate": 9.545967161322141e-05, + "loss": 0.0045, + "step": 1444 + }, + { + "epoch": 1.116624831048465, + "grad_norm": 0.018132036551833153, + "learning_rate": 9.544843300830671e-05, + "loss": 0.0048, + "step": 1445 + }, + { + "epoch": 1.1173971809229581, + "grad_norm": 0.015559297055006027, + "learning_rate": 9.543718117445135e-05, + "loss": 0.0043, + "step": 1446 + }, + { + "epoch": 1.1181695307974513, + "grad_norm": 0.017296213656663895, + "learning_rate": 9.542591611493046e-05, + "loss": 0.0048, + "step": 1447 + }, + { + "epoch": 1.1189418806719444, + "grad_norm": 0.015066379681229591, + "learning_rate": 9.541463783302303e-05, + "loss": 0.004, + "step": 1448 + }, + { + "epoch": 1.1197142305464376, + "grad_norm": 0.010292529128491879, + "learning_rate": 9.540334633201186e-05, + "loss": 0.0046, + "step": 1449 + }, + { + "epoch": 1.1204865804209307, + "grad_norm": 0.0397217720746994, + "learning_rate": 9.53920416151837e-05, + "loss": 0.0041, + "step": 1450 + }, + { + "epoch": 1.121258930295424, + "grad_norm": 0.016435086727142334, + "learning_rate": 9.538072368582902e-05, + "loss": 0.0042, + "step": 1451 + }, + { + "epoch": 1.122031280169917, + "grad_norm": 0.0129328528419137, + "learning_rate": 9.536939254724222e-05, + "loss": 0.004, + "step": 1452 + }, + { + "epoch": 1.1228036300444102, + "grad_norm": 0.013893014751374722, + "learning_rate": 9.535804820272152e-05, + "loss": 0.0045, + "step": 1453 + }, + { + "epoch": 1.1235759799189033, + "grad_norm": 0.017391353845596313, + "learning_rate": 9.534669065556901e-05, + "loss": 0.0042, + "step": 1454 + }, + { + "epoch": 1.1243483297933965, + "grad_norm": 0.0244289580732584, + "learning_rate": 9.533531990909055e-05, + "loss": 0.0046, + "step": 1455 + }, + { + "epoch": 1.1251206796678896, + "grad_norm": 0.011779186315834522, + "learning_rate": 9.53239359665959e-05, + "loss": 0.0041, + "step": 1456 + }, + { + "epoch": 1.1258930295423828, + "grad_norm": 0.02714175544679165, + "learning_rate": 9.531253883139869e-05, + "loss": 0.0046, + "step": 1457 + }, + { + "epoch": 1.1266653794168757, + "grad_norm": 0.01168652344495058, + "learning_rate": 9.53011285068163e-05, + "loss": 0.0049, + "step": 1458 + }, + { + "epoch": 1.127437729291369, + "grad_norm": 0.021335959434509277, + "learning_rate": 9.528970499617003e-05, + "loss": 0.0041, + "step": 1459 + }, + { + "epoch": 1.128210079165862, + "grad_norm": 0.013770773075520992, + "learning_rate": 9.5278268302785e-05, + "loss": 0.004, + "step": 1460 + }, + { + "epoch": 1.1289824290403554, + "grad_norm": 0.011553505435585976, + "learning_rate": 9.526681842999011e-05, + "loss": 0.0046, + "step": 1461 + }, + { + "epoch": 1.1297547789148483, + "grad_norm": 0.018769564107060432, + "learning_rate": 9.525535538111818e-05, + "loss": 0.0046, + "step": 1462 + }, + { + "epoch": 1.1305271287893417, + "grad_norm": 0.014158394187688828, + "learning_rate": 9.524387915950581e-05, + "loss": 0.0044, + "step": 1463 + }, + { + "epoch": 1.1312994786638346, + "grad_norm": 0.021640509366989136, + "learning_rate": 9.523238976849344e-05, + "loss": 0.0043, + "step": 1464 + }, + { + "epoch": 1.1320718285383278, + "grad_norm": 0.015402225777506828, + "learning_rate": 9.522088721142539e-05, + "loss": 0.0047, + "step": 1465 + }, + { + "epoch": 1.132844178412821, + "grad_norm": 0.012613057158887386, + "learning_rate": 9.520937149164975e-05, + "loss": 0.0042, + "step": 1466 + }, + { + "epoch": 1.133616528287314, + "grad_norm": 0.010349423624575138, + "learning_rate": 9.519784261251847e-05, + "loss": 0.0047, + "step": 1467 + }, + { + "epoch": 1.1343888781618072, + "grad_norm": 0.014359497465193272, + "learning_rate": 9.518630057738733e-05, + "loss": 0.0054, + "step": 1468 + }, + { + "epoch": 1.1351612280363004, + "grad_norm": 0.02081706002354622, + "learning_rate": 9.517474538961595e-05, + "loss": 0.0043, + "step": 1469 + }, + { + "epoch": 1.1359335779107935, + "grad_norm": 0.015621309168636799, + "learning_rate": 9.516317705256774e-05, + "loss": 0.0046, + "step": 1470 + }, + { + "epoch": 1.1367059277852867, + "grad_norm": 0.01698329485952854, + "learning_rate": 9.515159556960998e-05, + "loss": 0.0046, + "step": 1471 + }, + { + "epoch": 1.1374782776597798, + "grad_norm": 0.03221715986728668, + "learning_rate": 9.514000094411377e-05, + "loss": 0.0049, + "step": 1472 + }, + { + "epoch": 1.138250627534273, + "grad_norm": 0.013817714527249336, + "learning_rate": 9.5128393179454e-05, + "loss": 0.0049, + "step": 1473 + }, + { + "epoch": 1.1390229774087661, + "grad_norm": 0.016748011112213135, + "learning_rate": 9.511677227900942e-05, + "loss": 0.0046, + "step": 1474 + }, + { + "epoch": 1.1397953272832593, + "grad_norm": 0.031057976186275482, + "learning_rate": 9.510513824616261e-05, + "loss": 0.0052, + "step": 1475 + }, + { + "epoch": 1.1405676771577524, + "grad_norm": 0.03200233355164528, + "learning_rate": 9.509349108429993e-05, + "loss": 0.0049, + "step": 1476 + }, + { + "epoch": 1.1413400270322456, + "grad_norm": 0.01874365098774433, + "learning_rate": 9.50818307968116e-05, + "loss": 0.0049, + "step": 1477 + }, + { + "epoch": 1.1421123769067387, + "grad_norm": 0.02029556781053543, + "learning_rate": 9.507015738709165e-05, + "loss": 0.0051, + "step": 1478 + }, + { + "epoch": 1.1428847267812319, + "grad_norm": 0.010294657200574875, + "learning_rate": 9.505847085853792e-05, + "loss": 0.0043, + "step": 1479 + }, + { + "epoch": 1.143657076655725, + "grad_norm": 0.011460382491350174, + "learning_rate": 9.504677121455208e-05, + "loss": 0.0047, + "step": 1480 + }, + { + "epoch": 1.1444294265302182, + "grad_norm": 0.015220578759908676, + "learning_rate": 9.503505845853963e-05, + "loss": 0.0046, + "step": 1481 + }, + { + "epoch": 1.1452017764047113, + "grad_norm": 0.07611880451440811, + "learning_rate": 9.502333259390984e-05, + "loss": 0.0049, + "step": 1482 + }, + { + "epoch": 1.1459741262792045, + "grad_norm": 0.01241256482899189, + "learning_rate": 9.501159362407584e-05, + "loss": 0.0048, + "step": 1483 + }, + { + "epoch": 1.1467464761536976, + "grad_norm": 0.16832369565963745, + "learning_rate": 9.499984155245457e-05, + "loss": 0.0067, + "step": 1484 + }, + { + "epoch": 1.1475188260281908, + "grad_norm": 0.011174335144460201, + "learning_rate": 9.498807638246676e-05, + "loss": 0.0041, + "step": 1485 + }, + { + "epoch": 1.148291175902684, + "grad_norm": 0.022033747285604477, + "learning_rate": 9.497629811753697e-05, + "loss": 0.0049, + "step": 1486 + }, + { + "epoch": 1.149063525777177, + "grad_norm": 0.018059583380818367, + "learning_rate": 9.496450676109359e-05, + "loss": 0.006, + "step": 1487 + }, + { + "epoch": 1.1498358756516702, + "grad_norm": 0.044133514165878296, + "learning_rate": 9.495270231656875e-05, + "loss": 0.0065, + "step": 1488 + }, + { + "epoch": 1.1506082255261634, + "grad_norm": 0.056028980761766434, + "learning_rate": 9.494088478739848e-05, + "loss": 0.0056, + "step": 1489 + }, + { + "epoch": 1.1513805754006565, + "grad_norm": 0.02036936953663826, + "learning_rate": 9.492905417702255e-05, + "loss": 0.0045, + "step": 1490 + }, + { + "epoch": 1.1521529252751497, + "grad_norm": 0.01621861197054386, + "learning_rate": 9.491721048888461e-05, + "loss": 0.0047, + "step": 1491 + }, + { + "epoch": 1.1529252751496428, + "grad_norm": 0.016505256295204163, + "learning_rate": 9.490535372643203e-05, + "loss": 0.0048, + "step": 1492 + }, + { + "epoch": 1.153697625024136, + "grad_norm": 0.019400065764784813, + "learning_rate": 9.489348389311603e-05, + "loss": 0.0038, + "step": 1493 + }, + { + "epoch": 1.1544699748986291, + "grad_norm": 0.03492862358689308, + "learning_rate": 9.488160099239164e-05, + "loss": 0.0051, + "step": 1494 + }, + { + "epoch": 1.1552423247731223, + "grad_norm": 0.03151748329401016, + "learning_rate": 9.486970502771769e-05, + "loss": 0.005, + "step": 1495 + }, + { + "epoch": 1.1560146746476154, + "grad_norm": 0.013358225114643574, + "learning_rate": 9.48577960025568e-05, + "loss": 0.0052, + "step": 1496 + }, + { + "epoch": 1.1567870245221086, + "grad_norm": 0.024633124470710754, + "learning_rate": 9.48458739203754e-05, + "loss": 0.005, + "step": 1497 + }, + { + "epoch": 1.1575593743966017, + "grad_norm": 0.030730850994586945, + "learning_rate": 9.483393878464372e-05, + "loss": 0.0051, + "step": 1498 + }, + { + "epoch": 1.1583317242710949, + "grad_norm": 0.021617265418171883, + "learning_rate": 9.482199059883581e-05, + "loss": 0.0049, + "step": 1499 + }, + { + "epoch": 1.159104074145588, + "grad_norm": 0.022437119856476784, + "learning_rate": 9.481002936642946e-05, + "loss": 0.0046, + "step": 1500 + }, + { + "epoch": 1.1598764240200812, + "grad_norm": 0.04322676733136177, + "learning_rate": 9.479805509090633e-05, + "loss": 0.005, + "step": 1501 + }, + { + "epoch": 1.1606487738945743, + "grad_norm": 0.019928233698010445, + "learning_rate": 9.478606777575183e-05, + "loss": 0.0048, + "step": 1502 + }, + { + "epoch": 1.1614211237690675, + "grad_norm": 0.029807768762111664, + "learning_rate": 9.477406742445516e-05, + "loss": 0.005, + "step": 1503 + }, + { + "epoch": 1.1621934736435606, + "grad_norm": 0.02389885112643242, + "learning_rate": 9.476205404050936e-05, + "loss": 0.0045, + "step": 1504 + }, + { + "epoch": 1.1629658235180536, + "grad_norm": 0.01405603438615799, + "learning_rate": 9.475002762741122e-05, + "loss": 0.0045, + "step": 1505 + }, + { + "epoch": 1.163738173392547, + "grad_norm": 0.02136818692088127, + "learning_rate": 9.473798818866134e-05, + "loss": 0.0049, + "step": 1506 + }, + { + "epoch": 1.1645105232670399, + "grad_norm": 0.026121748611330986, + "learning_rate": 9.472593572776411e-05, + "loss": 0.0048, + "step": 1507 + }, + { + "epoch": 1.1652828731415332, + "grad_norm": 0.022823212668299675, + "learning_rate": 9.471387024822773e-05, + "loss": 0.0049, + "step": 1508 + }, + { + "epoch": 1.1660552230160262, + "grad_norm": 0.019243579357862473, + "learning_rate": 9.470179175356413e-05, + "loss": 0.0049, + "step": 1509 + }, + { + "epoch": 1.1668275728905195, + "grad_norm": 0.014758034609258175, + "learning_rate": 9.468970024728911e-05, + "loss": 0.0047, + "step": 1510 + }, + { + "epoch": 1.1675999227650125, + "grad_norm": 0.013576776720583439, + "learning_rate": 9.467759573292217e-05, + "loss": 0.0051, + "step": 1511 + }, + { + "epoch": 1.1683722726395056, + "grad_norm": 0.03590305894613266, + "learning_rate": 9.466547821398668e-05, + "loss": 0.0049, + "step": 1512 + }, + { + "epoch": 1.1691446225139988, + "grad_norm": 0.0228645708411932, + "learning_rate": 9.465334769400975e-05, + "loss": 0.0047, + "step": 1513 + }, + { + "epoch": 1.169916972388492, + "grad_norm": 0.0239447969943285, + "learning_rate": 9.464120417652226e-05, + "loss": 0.0046, + "step": 1514 + }, + { + "epoch": 1.170689322262985, + "grad_norm": 0.038972727954387665, + "learning_rate": 9.462904766505893e-05, + "loss": 0.0051, + "step": 1515 + }, + { + "epoch": 1.1714616721374782, + "grad_norm": 0.01209209393709898, + "learning_rate": 9.46168781631582e-05, + "loss": 0.0051, + "step": 1516 + }, + { + "epoch": 1.1722340220119714, + "grad_norm": 0.01610351912677288, + "learning_rate": 9.460469567436232e-05, + "loss": 0.0046, + "step": 1517 + }, + { + "epoch": 1.1730063718864645, + "grad_norm": 0.036079291254282, + "learning_rate": 9.459250020221731e-05, + "loss": 0.005, + "step": 1518 + }, + { + "epoch": 1.1737787217609577, + "grad_norm": 0.01571802981197834, + "learning_rate": 9.458029175027301e-05, + "loss": 0.0049, + "step": 1519 + }, + { + "epoch": 1.1745510716354508, + "grad_norm": 0.018590154126286507, + "learning_rate": 9.456807032208298e-05, + "loss": 0.0047, + "step": 1520 + }, + { + "epoch": 1.175323421509944, + "grad_norm": 0.04737241193652153, + "learning_rate": 9.455583592120458e-05, + "loss": 0.005, + "step": 1521 + }, + { + "epoch": 1.176095771384437, + "grad_norm": 0.023734835907816887, + "learning_rate": 9.454358855119895e-05, + "loss": 0.0047, + "step": 1522 + }, + { + "epoch": 1.1768681212589303, + "grad_norm": 0.04579580947756767, + "learning_rate": 9.453132821563102e-05, + "loss": 0.0051, + "step": 1523 + }, + { + "epoch": 1.1776404711334234, + "grad_norm": 0.05348771810531616, + "learning_rate": 9.451905491806946e-05, + "loss": 0.0053, + "step": 1524 + }, + { + "epoch": 1.1784128210079166, + "grad_norm": 0.013545769266784191, + "learning_rate": 9.450676866208675e-05, + "loss": 0.0047, + "step": 1525 + }, + { + "epoch": 1.1791851708824097, + "grad_norm": 0.04317854717373848, + "learning_rate": 9.44944694512591e-05, + "loss": 0.0049, + "step": 1526 + }, + { + "epoch": 1.1799575207569029, + "grad_norm": 0.027513282373547554, + "learning_rate": 9.448215728916652e-05, + "loss": 0.0045, + "step": 1527 + }, + { + "epoch": 1.180729870631396, + "grad_norm": 0.020986218005418777, + "learning_rate": 9.446983217939278e-05, + "loss": 0.0046, + "step": 1528 + }, + { + "epoch": 1.1815022205058892, + "grad_norm": 0.03395180404186249, + "learning_rate": 9.445749412552544e-05, + "loss": 0.005, + "step": 1529 + }, + { + "epoch": 1.1822745703803823, + "grad_norm": 0.01142895594239235, + "learning_rate": 9.44451431311558e-05, + "loss": 0.0048, + "step": 1530 + }, + { + "epoch": 1.1830469202548755, + "grad_norm": 0.016486341133713722, + "learning_rate": 9.443277919987892e-05, + "loss": 0.0047, + "step": 1531 + }, + { + "epoch": 1.1838192701293686, + "grad_norm": 0.010691473260521889, + "learning_rate": 9.442040233529366e-05, + "loss": 0.0051, + "step": 1532 + }, + { + "epoch": 1.1845916200038618, + "grad_norm": 0.015164146199822426, + "learning_rate": 9.440801254100261e-05, + "loss": 0.0044, + "step": 1533 + }, + { + "epoch": 1.185363969878355, + "grad_norm": 0.01204696111381054, + "learning_rate": 9.439560982061215e-05, + "loss": 0.0049, + "step": 1534 + }, + { + "epoch": 1.186136319752848, + "grad_norm": 0.012924645096063614, + "learning_rate": 9.438319417773243e-05, + "loss": 0.0046, + "step": 1535 + }, + { + "epoch": 1.1869086696273412, + "grad_norm": 0.016666272655129433, + "learning_rate": 9.43707656159773e-05, + "loss": 0.0044, + "step": 1536 + }, + { + "epoch": 1.1876810195018344, + "grad_norm": 0.012084845453500748, + "learning_rate": 9.435832413896446e-05, + "loss": 0.0042, + "step": 1537 + }, + { + "epoch": 1.1884533693763275, + "grad_norm": 0.017669981345534325, + "learning_rate": 9.43458697503153e-05, + "loss": 0.0046, + "step": 1538 + }, + { + "epoch": 1.1892257192508207, + "grad_norm": 0.0243675597012043, + "learning_rate": 9.433340245365499e-05, + "loss": 0.0041, + "step": 1539 + }, + { + "epoch": 1.1899980691253138, + "grad_norm": 0.014637123793363571, + "learning_rate": 9.432092225261246e-05, + "loss": 0.0044, + "step": 1540 + }, + { + "epoch": 1.190770418999807, + "grad_norm": 0.01418989896774292, + "learning_rate": 9.430842915082042e-05, + "loss": 0.0042, + "step": 1541 + }, + { + "epoch": 1.1915427688743, + "grad_norm": 0.02326418273150921, + "learning_rate": 9.429592315191527e-05, + "loss": 0.0043, + "step": 1542 + }, + { + "epoch": 1.1923151187487933, + "grad_norm": 0.010195459239184856, + "learning_rate": 9.428340425953723e-05, + "loss": 0.0046, + "step": 1543 + }, + { + "epoch": 1.1930874686232864, + "grad_norm": 0.0273361224681139, + "learning_rate": 9.427087247733023e-05, + "loss": 0.0052, + "step": 1544 + }, + { + "epoch": 1.1938598184977796, + "grad_norm": 0.01398895401507616, + "learning_rate": 9.425832780894198e-05, + "loss": 0.0048, + "step": 1545 + }, + { + "epoch": 1.1946321683722727, + "grad_norm": 0.01838972046971321, + "learning_rate": 9.424577025802394e-05, + "loss": 0.0049, + "step": 1546 + }, + { + "epoch": 1.1954045182467659, + "grad_norm": 0.028748217970132828, + "learning_rate": 9.423319982823129e-05, + "loss": 0.0045, + "step": 1547 + }, + { + "epoch": 1.196176868121259, + "grad_norm": 0.012720394879579544, + "learning_rate": 9.422061652322298e-05, + "loss": 0.0038, + "step": 1548 + }, + { + "epoch": 1.1969492179957522, + "grad_norm": 0.011654634028673172, + "learning_rate": 9.420802034666172e-05, + "loss": 0.004, + "step": 1549 + }, + { + "epoch": 1.1977215678702453, + "grad_norm": 0.015816690400242805, + "learning_rate": 9.419541130221394e-05, + "loss": 0.0043, + "step": 1550 + }, + { + "epoch": 1.1984939177447385, + "grad_norm": 0.014582616277039051, + "learning_rate": 9.418278939354984e-05, + "loss": 0.0045, + "step": 1551 + }, + { + "epoch": 1.1992662676192316, + "grad_norm": 0.011749477125704288, + "learning_rate": 9.417015462434336e-05, + "loss": 0.0044, + "step": 1552 + }, + { + "epoch": 1.2000386174937248, + "grad_norm": 0.014693439938127995, + "learning_rate": 9.415750699827213e-05, + "loss": 0.0045, + "step": 1553 + }, + { + "epoch": 1.2008109673682177, + "grad_norm": 0.014372429810464382, + "learning_rate": 9.414484651901763e-05, + "loss": 0.0043, + "step": 1554 + }, + { + "epoch": 1.201583317242711, + "grad_norm": 0.01311397459357977, + "learning_rate": 9.413217319026497e-05, + "loss": 0.0042, + "step": 1555 + }, + { + "epoch": 1.202355667117204, + "grad_norm": 0.01853027381002903, + "learning_rate": 9.411948701570307e-05, + "loss": 0.0047, + "step": 1556 + }, + { + "epoch": 1.2031280169916974, + "grad_norm": 0.00959880743175745, + "learning_rate": 9.410678799902458e-05, + "loss": 0.0043, + "step": 1557 + }, + { + "epoch": 1.2039003668661903, + "grad_norm": 0.011557672172784805, + "learning_rate": 9.409407614392585e-05, + "loss": 0.0048, + "step": 1558 + }, + { + "epoch": 1.2046727167406834, + "grad_norm": 0.022469764575362206, + "learning_rate": 9.408135145410701e-05, + "loss": 0.0045, + "step": 1559 + }, + { + "epoch": 1.2054450666151766, + "grad_norm": 0.012665819376707077, + "learning_rate": 9.406861393327193e-05, + "loss": 0.0042, + "step": 1560 + }, + { + "epoch": 1.2062174164896697, + "grad_norm": 0.016224119812250137, + "learning_rate": 9.405586358512817e-05, + "loss": 0.0047, + "step": 1561 + }, + { + "epoch": 1.2069897663641629, + "grad_norm": 0.021541811525821686, + "learning_rate": 9.404310041338704e-05, + "loss": 0.0046, + "step": 1562 + }, + { + "epoch": 1.207762116238656, + "grad_norm": 0.017033765092492104, + "learning_rate": 9.40303244217636e-05, + "loss": 0.0043, + "step": 1563 + }, + { + "epoch": 1.2085344661131492, + "grad_norm": 0.02095952443778515, + "learning_rate": 9.401753561397664e-05, + "loss": 0.0043, + "step": 1564 + }, + { + "epoch": 1.2093068159876423, + "grad_norm": 0.013313321396708488, + "learning_rate": 9.400473399374868e-05, + "loss": 0.0039, + "step": 1565 + }, + { + "epoch": 1.2100791658621355, + "grad_norm": 0.016337791457772255, + "learning_rate": 9.399191956480594e-05, + "loss": 0.0042, + "step": 1566 + }, + { + "epoch": 1.2108515157366286, + "grad_norm": 0.01406900305300951, + "learning_rate": 9.397909233087839e-05, + "loss": 0.0049, + "step": 1567 + }, + { + "epoch": 1.2116238656111218, + "grad_norm": 0.013073080219328403, + "learning_rate": 9.396625229569975e-05, + "loss": 0.0042, + "step": 1568 + }, + { + "epoch": 1.212396215485615, + "grad_norm": 0.013312644325196743, + "learning_rate": 9.395339946300743e-05, + "loss": 0.0042, + "step": 1569 + }, + { + "epoch": 1.213168565360108, + "grad_norm": 0.014997152611613274, + "learning_rate": 9.394053383654258e-05, + "loss": 0.0049, + "step": 1570 + }, + { + "epoch": 1.2139409152346012, + "grad_norm": 0.014144647866487503, + "learning_rate": 9.392765542005008e-05, + "loss": 0.0043, + "step": 1571 + }, + { + "epoch": 1.2147132651090944, + "grad_norm": 0.01141285989433527, + "learning_rate": 9.391476421727853e-05, + "loss": 0.0044, + "step": 1572 + }, + { + "epoch": 1.2154856149835875, + "grad_norm": 0.020384816452860832, + "learning_rate": 9.390186023198022e-05, + "loss": 0.0043, + "step": 1573 + }, + { + "epoch": 1.2162579648580807, + "grad_norm": 0.01110097672790289, + "learning_rate": 9.388894346791121e-05, + "loss": 0.0044, + "step": 1574 + }, + { + "epoch": 1.2170303147325738, + "grad_norm": 0.015966853126883507, + "learning_rate": 9.387601392883128e-05, + "loss": 0.0044, + "step": 1575 + }, + { + "epoch": 1.217802664607067, + "grad_norm": 0.009524564258754253, + "learning_rate": 9.386307161850384e-05, + "loss": 0.004, + "step": 1576 + }, + { + "epoch": 1.2185750144815601, + "grad_norm": 0.01066622231155634, + "learning_rate": 9.385011654069615e-05, + "loss": 0.0044, + "step": 1577 + }, + { + "epoch": 1.2193473643560533, + "grad_norm": 0.010826393030583858, + "learning_rate": 9.38371486991791e-05, + "loss": 0.0045, + "step": 1578 + }, + { + "epoch": 1.2201197142305464, + "grad_norm": 0.01694653183221817, + "learning_rate": 9.38241680977273e-05, + "loss": 0.0048, + "step": 1579 + }, + { + "epoch": 1.2208920641050396, + "grad_norm": 0.022291971370577812, + "learning_rate": 9.38111747401191e-05, + "loss": 0.0045, + "step": 1580 + }, + { + "epoch": 1.2216644139795327, + "grad_norm": 0.01632075384259224, + "learning_rate": 9.379816863013655e-05, + "loss": 0.0045, + "step": 1581 + }, + { + "epoch": 1.2224367638540259, + "grad_norm": 0.01566249504685402, + "learning_rate": 9.378514977156543e-05, + "loss": 0.0047, + "step": 1582 + }, + { + "epoch": 1.223209113728519, + "grad_norm": 0.016212694346904755, + "learning_rate": 9.377211816819518e-05, + "loss": 0.0044, + "step": 1583 + }, + { + "epoch": 1.2239814636030122, + "grad_norm": 0.009922226890921593, + "learning_rate": 9.375907382381903e-05, + "loss": 0.0039, + "step": 1584 + }, + { + "epoch": 1.2247538134775053, + "grad_norm": 0.011142924427986145, + "learning_rate": 9.374601674223383e-05, + "loss": 0.0044, + "step": 1585 + }, + { + "epoch": 1.2255261633519985, + "grad_norm": 0.012572258710861206, + "learning_rate": 9.373294692724022e-05, + "loss": 0.0044, + "step": 1586 + }, + { + "epoch": 1.2262985132264916, + "grad_norm": 0.012604492716491222, + "learning_rate": 9.371986438264246e-05, + "loss": 0.0044, + "step": 1587 + }, + { + "epoch": 1.2270708631009848, + "grad_norm": 0.017465921118855476, + "learning_rate": 9.370676911224862e-05, + "loss": 0.0047, + "step": 1588 + }, + { + "epoch": 1.227843212975478, + "grad_norm": 0.019798975437879562, + "learning_rate": 9.369366111987037e-05, + "loss": 0.0048, + "step": 1589 + }, + { + "epoch": 1.228615562849971, + "grad_norm": 0.013204040005803108, + "learning_rate": 9.368054040932315e-05, + "loss": 0.0044, + "step": 1590 + }, + { + "epoch": 1.2293879127244642, + "grad_norm": 0.013707738369703293, + "learning_rate": 9.366740698442608e-05, + "loss": 0.0043, + "step": 1591 + }, + { + "epoch": 1.2301602625989574, + "grad_norm": 0.014138542115688324, + "learning_rate": 9.3654260849002e-05, + "loss": 0.0045, + "step": 1592 + }, + { + "epoch": 1.2309326124734505, + "grad_norm": 0.011442754417657852, + "learning_rate": 9.364110200687738e-05, + "loss": 0.0044, + "step": 1593 + }, + { + "epoch": 1.2317049623479437, + "grad_norm": 0.013380014337599277, + "learning_rate": 9.36279304618825e-05, + "loss": 0.0045, + "step": 1594 + }, + { + "epoch": 1.2324773122224368, + "grad_norm": 0.01104913279414177, + "learning_rate": 9.361474621785125e-05, + "loss": 0.004, + "step": 1595 + }, + { + "epoch": 1.23324966209693, + "grad_norm": 0.010025731287896633, + "learning_rate": 9.360154927862123e-05, + "loss": 0.0039, + "step": 1596 + }, + { + "epoch": 1.2340220119714231, + "grad_norm": 0.009694824926555157, + "learning_rate": 9.358833964803379e-05, + "loss": 0.004, + "step": 1597 + }, + { + "epoch": 1.2347943618459163, + "grad_norm": 0.012553083710372448, + "learning_rate": 9.357511732993392e-05, + "loss": 0.0047, + "step": 1598 + }, + { + "epoch": 1.2355667117204094, + "grad_norm": 0.012948554940521717, + "learning_rate": 9.356188232817029e-05, + "loss": 0.0048, + "step": 1599 + }, + { + "epoch": 1.2363390615949026, + "grad_norm": 0.013301286846399307, + "learning_rate": 9.354863464659532e-05, + "loss": 0.0049, + "step": 1600 + }, + { + "epoch": 1.2371114114693955, + "grad_norm": 0.0150552187114954, + "learning_rate": 9.353537428906508e-05, + "loss": 0.0043, + "step": 1601 + }, + { + "epoch": 1.2378837613438889, + "grad_norm": 0.01858876273036003, + "learning_rate": 9.352210125943934e-05, + "loss": 0.0048, + "step": 1602 + }, + { + "epoch": 1.2386561112183818, + "grad_norm": 0.016063978895545006, + "learning_rate": 9.350881556158155e-05, + "loss": 0.0051, + "step": 1603 + }, + { + "epoch": 1.2394284610928752, + "grad_norm": 0.010043537244200706, + "learning_rate": 9.349551719935887e-05, + "loss": 0.0039, + "step": 1604 + }, + { + "epoch": 1.2402008109673681, + "grad_norm": 0.009146731346845627, + "learning_rate": 9.348220617664212e-05, + "loss": 0.0038, + "step": 1605 + }, + { + "epoch": 1.2409731608418613, + "grad_norm": 0.011488694697618484, + "learning_rate": 9.346888249730583e-05, + "loss": 0.0048, + "step": 1606 + }, + { + "epoch": 1.2417455107163544, + "grad_norm": 0.014834541827440262, + "learning_rate": 9.345554616522818e-05, + "loss": 0.0049, + "step": 1607 + }, + { + "epoch": 1.2425178605908476, + "grad_norm": 0.010890133678913116, + "learning_rate": 9.344219718429108e-05, + "loss": 0.0043, + "step": 1608 + }, + { + "epoch": 1.2432902104653407, + "grad_norm": 0.019118288531899452, + "learning_rate": 9.342883555838007e-05, + "loss": 0.004, + "step": 1609 + }, + { + "epoch": 1.2440625603398339, + "grad_norm": 0.025927064940333366, + "learning_rate": 9.341546129138442e-05, + "loss": 0.0045, + "step": 1610 + }, + { + "epoch": 1.244834910214327, + "grad_norm": 0.019404688850045204, + "learning_rate": 9.340207438719703e-05, + "loss": 0.0044, + "step": 1611 + }, + { + "epoch": 1.2456072600888202, + "grad_norm": 0.01408322062343359, + "learning_rate": 9.338867484971454e-05, + "loss": 0.0046, + "step": 1612 + }, + { + "epoch": 1.2463796099633133, + "grad_norm": 0.01599975675344467, + "learning_rate": 9.33752626828372e-05, + "loss": 0.0045, + "step": 1613 + }, + { + "epoch": 1.2471519598378065, + "grad_norm": 0.012950134463608265, + "learning_rate": 9.336183789046899e-05, + "loss": 0.0042, + "step": 1614 + }, + { + "epoch": 1.2479243097122996, + "grad_norm": 0.012142792344093323, + "learning_rate": 9.334840047651752e-05, + "loss": 0.0043, + "step": 1615 + }, + { + "epoch": 1.2486966595867928, + "grad_norm": 0.011769046075642109, + "learning_rate": 9.33349504448941e-05, + "loss": 0.0042, + "step": 1616 + }, + { + "epoch": 1.249469009461286, + "grad_norm": 0.019216034561395645, + "learning_rate": 9.332148779951375e-05, + "loss": 0.0047, + "step": 1617 + }, + { + "epoch": 1.250241359335779, + "grad_norm": 0.012334473431110382, + "learning_rate": 9.330801254429507e-05, + "loss": 0.0044, + "step": 1618 + }, + { + "epoch": 1.2510137092102722, + "grad_norm": 0.01532732229679823, + "learning_rate": 9.32945246831604e-05, + "loss": 0.0049, + "step": 1619 + }, + { + "epoch": 1.2517860590847654, + "grad_norm": 0.01669018529355526, + "learning_rate": 9.328102422003572e-05, + "loss": 0.0044, + "step": 1620 + }, + { + "epoch": 1.2525584089592585, + "grad_norm": 0.011733302846550941, + "learning_rate": 9.326751115885071e-05, + "loss": 0.0043, + "step": 1621 + }, + { + "epoch": 1.2533307588337517, + "grad_norm": 0.011070352047681808, + "learning_rate": 9.325398550353868e-05, + "loss": 0.0043, + "step": 1622 + }, + { + "epoch": 1.2541031087082448, + "grad_norm": 0.00988784246146679, + "learning_rate": 9.324044725803662e-05, + "loss": 0.004, + "step": 1623 + }, + { + "epoch": 1.254875458582738, + "grad_norm": 0.009769883938133717, + "learning_rate": 9.322689642628519e-05, + "loss": 0.0039, + "step": 1624 + }, + { + "epoch": 1.2556478084572311, + "grad_norm": 0.009687105193734169, + "learning_rate": 9.321333301222872e-05, + "loss": 0.0042, + "step": 1625 + }, + { + "epoch": 1.2564201583317243, + "grad_norm": 0.014387411065399647, + "learning_rate": 9.319975701981519e-05, + "loss": 0.0046, + "step": 1626 + }, + { + "epoch": 1.2571925082062174, + "grad_norm": 0.009872864000499249, + "learning_rate": 9.318616845299622e-05, + "loss": 0.004, + "step": 1627 + }, + { + "epoch": 1.2579648580807106, + "grad_norm": 0.01052020862698555, + "learning_rate": 9.317256731572713e-05, + "loss": 0.0042, + "step": 1628 + }, + { + "epoch": 1.2587372079552037, + "grad_norm": 0.010644960217177868, + "learning_rate": 9.315895361196689e-05, + "loss": 0.0041, + "step": 1629 + }, + { + "epoch": 1.2595095578296969, + "grad_norm": 0.021633964031934738, + "learning_rate": 9.314532734567811e-05, + "loss": 0.0045, + "step": 1630 + }, + { + "epoch": 1.26028190770419, + "grad_norm": 0.011609155684709549, + "learning_rate": 9.313168852082708e-05, + "loss": 0.0046, + "step": 1631 + }, + { + "epoch": 1.2610542575786832, + "grad_norm": 0.016867250204086304, + "learning_rate": 9.311803714138372e-05, + "loss": 0.0047, + "step": 1632 + }, + { + "epoch": 1.2618266074531763, + "grad_norm": 0.026296664029359818, + "learning_rate": 9.310437321132161e-05, + "loss": 0.0044, + "step": 1633 + }, + { + "epoch": 1.2625989573276695, + "grad_norm": 0.010850663296878338, + "learning_rate": 9.3090696734618e-05, + "loss": 0.005, + "step": 1634 + }, + { + "epoch": 1.2633713072021626, + "grad_norm": 0.025731515139341354, + "learning_rate": 9.307700771525379e-05, + "loss": 0.0051, + "step": 1635 + }, + { + "epoch": 1.2641436570766558, + "grad_norm": 0.01730777695775032, + "learning_rate": 9.30633061572135e-05, + "loss": 0.0041, + "step": 1636 + }, + { + "epoch": 1.264916006951149, + "grad_norm": 0.014599094167351723, + "learning_rate": 9.304959206448534e-05, + "loss": 0.0047, + "step": 1637 + }, + { + "epoch": 1.265688356825642, + "grad_norm": 0.03559194132685661, + "learning_rate": 9.303586544106115e-05, + "loss": 0.0044, + "step": 1638 + }, + { + "epoch": 1.2664607067001352, + "grad_norm": 0.01836187206208706, + "learning_rate": 9.302212629093641e-05, + "loss": 0.0043, + "step": 1639 + }, + { + "epoch": 1.2672330565746284, + "grad_norm": 0.016208026558160782, + "learning_rate": 9.300837461811027e-05, + "loss": 0.0049, + "step": 1640 + }, + { + "epoch": 1.2680054064491215, + "grad_norm": 0.01692858897149563, + "learning_rate": 9.299461042658548e-05, + "loss": 0.0043, + "step": 1641 + }, + { + "epoch": 1.2687777563236147, + "grad_norm": 0.012160011567175388, + "learning_rate": 9.29808337203685e-05, + "loss": 0.0044, + "step": 1642 + }, + { + "epoch": 1.2695501061981078, + "grad_norm": 0.022362643852829933, + "learning_rate": 9.296704450346938e-05, + "loss": 0.0044, + "step": 1643 + }, + { + "epoch": 1.270322456072601, + "grad_norm": 0.010011281818151474, + "learning_rate": 9.295324277990183e-05, + "loss": 0.0041, + "step": 1644 + }, + { + "epoch": 1.2710948059470941, + "grad_norm": 0.020944936200976372, + "learning_rate": 9.293942855368318e-05, + "loss": 0.005, + "step": 1645 + }, + { + "epoch": 1.271867155821587, + "grad_norm": 0.012940247543156147, + "learning_rate": 9.292560182883444e-05, + "loss": 0.0051, + "step": 1646 + }, + { + "epoch": 1.2726395056960804, + "grad_norm": 0.010885242372751236, + "learning_rate": 9.291176260938023e-05, + "loss": 0.005, + "step": 1647 + }, + { + "epoch": 1.2734118555705733, + "grad_norm": 0.02365632727742195, + "learning_rate": 9.28979108993488e-05, + "loss": 0.0048, + "step": 1648 + }, + { + "epoch": 1.2741842054450667, + "grad_norm": 0.018599843606352806, + "learning_rate": 9.288404670277208e-05, + "loss": 0.004, + "step": 1649 + }, + { + "epoch": 1.2749565553195596, + "grad_norm": 0.0399547703564167, + "learning_rate": 9.287017002368557e-05, + "loss": 0.0053, + "step": 1650 + }, + { + "epoch": 1.275728905194053, + "grad_norm": 0.01006177719682455, + "learning_rate": 9.285628086612844e-05, + "loss": 0.0042, + "step": 1651 + }, + { + "epoch": 1.276501255068546, + "grad_norm": 0.009012104943394661, + "learning_rate": 9.284237923414351e-05, + "loss": 0.0042, + "step": 1652 + }, + { + "epoch": 1.2772736049430393, + "grad_norm": 0.010311353951692581, + "learning_rate": 9.282846513177718e-05, + "loss": 0.0049, + "step": 1653 + }, + { + "epoch": 1.2780459548175322, + "grad_norm": 0.020534196868538857, + "learning_rate": 9.281453856307953e-05, + "loss": 0.0048, + "step": 1654 + }, + { + "epoch": 1.2788183046920256, + "grad_norm": 0.020206743851304054, + "learning_rate": 9.280059953210425e-05, + "loss": 0.0051, + "step": 1655 + }, + { + "epoch": 1.2795906545665185, + "grad_norm": 0.021070044487714767, + "learning_rate": 9.278664804290864e-05, + "loss": 0.0049, + "step": 1656 + }, + { + "epoch": 1.2803630044410117, + "grad_norm": 0.014754528179764748, + "learning_rate": 9.277268409955364e-05, + "loss": 0.0044, + "step": 1657 + }, + { + "epoch": 1.2811353543155048, + "grad_norm": 0.012802411802113056, + "learning_rate": 9.275870770610382e-05, + "loss": 0.0044, + "step": 1658 + }, + { + "epoch": 1.281907704189998, + "grad_norm": 0.009704644791781902, + "learning_rate": 9.274471886662739e-05, + "loss": 0.0045, + "step": 1659 + }, + { + "epoch": 1.2826800540644911, + "grad_norm": 0.02821960113942623, + "learning_rate": 9.273071758519615e-05, + "loss": 0.0048, + "step": 1660 + }, + { + "epoch": 1.2834524039389843, + "grad_norm": 0.010638576932251453, + "learning_rate": 9.271670386588552e-05, + "loss": 0.0045, + "step": 1661 + }, + { + "epoch": 1.2842247538134774, + "grad_norm": 0.016380267217755318, + "learning_rate": 9.270267771277458e-05, + "loss": 0.0046, + "step": 1662 + }, + { + "epoch": 1.2849971036879706, + "grad_norm": 0.021698275581002235, + "learning_rate": 9.268863912994599e-05, + "loss": 0.0045, + "step": 1663 + }, + { + "epoch": 1.2857694535624637, + "grad_norm": 0.016744716092944145, + "learning_rate": 9.267458812148604e-05, + "loss": 0.0043, + "step": 1664 + }, + { + "epoch": 1.286541803436957, + "grad_norm": 0.018823161721229553, + "learning_rate": 9.266052469148463e-05, + "loss": 0.0043, + "step": 1665 + }, + { + "epoch": 1.28731415331145, + "grad_norm": 0.015203451737761497, + "learning_rate": 9.264644884403532e-05, + "loss": 0.0046, + "step": 1666 + }, + { + "epoch": 1.2880865031859432, + "grad_norm": 0.013647705316543579, + "learning_rate": 9.263236058323522e-05, + "loss": 0.0046, + "step": 1667 + }, + { + "epoch": 1.2888588530604363, + "grad_norm": 0.014722113497555256, + "learning_rate": 9.261825991318509e-05, + "loss": 0.0041, + "step": 1668 + }, + { + "epoch": 1.2896312029349295, + "grad_norm": 0.02481660805642605, + "learning_rate": 9.260414683798929e-05, + "loss": 0.0053, + "step": 1669 + }, + { + "epoch": 1.2904035528094226, + "grad_norm": 0.017250480130314827, + "learning_rate": 9.25900213617558e-05, + "loss": 0.0048, + "step": 1670 + }, + { + "epoch": 1.2911759026839158, + "grad_norm": 0.00951310619711876, + "learning_rate": 9.25758834885962e-05, + "loss": 0.004, + "step": 1671 + }, + { + "epoch": 1.291948252558409, + "grad_norm": 0.02184438891708851, + "learning_rate": 9.256173322262569e-05, + "loss": 0.0051, + "step": 1672 + }, + { + "epoch": 1.292720602432902, + "grad_norm": 0.01367892511188984, + "learning_rate": 9.254757056796305e-05, + "loss": 0.0048, + "step": 1673 + }, + { + "epoch": 1.2934929523073952, + "grad_norm": 0.017672359943389893, + "learning_rate": 9.253339552873074e-05, + "loss": 0.0046, + "step": 1674 + }, + { + "epoch": 1.2942653021818884, + "grad_norm": 0.011920403689146042, + "learning_rate": 9.251920810905473e-05, + "loss": 0.0102, + "step": 1675 + }, + { + "epoch": 1.2950376520563815, + "grad_norm": 0.016817884519696236, + "learning_rate": 9.250500831306462e-05, + "loss": 0.0043, + "step": 1676 + }, + { + "epoch": 1.2958100019308747, + "grad_norm": 0.015963738784193993, + "learning_rate": 9.249079614489364e-05, + "loss": 0.004, + "step": 1677 + }, + { + "epoch": 1.2965823518053678, + "grad_norm": 0.07387775182723999, + "learning_rate": 9.247657160867864e-05, + "loss": 0.0053, + "step": 1678 + }, + { + "epoch": 1.297354701679861, + "grad_norm": 0.30068767070770264, + "learning_rate": 9.246233470856e-05, + "loss": 0.006, + "step": 1679 + }, + { + "epoch": 1.2981270515543541, + "grad_norm": 0.06857229024171829, + "learning_rate": 9.244808544868177e-05, + "loss": 0.0045, + "step": 1680 + }, + { + "epoch": 1.2988994014288473, + "grad_norm": 0.012670093216001987, + "learning_rate": 9.243382383319154e-05, + "loss": 0.0043, + "step": 1681 + }, + { + "epoch": 1.2996717513033405, + "grad_norm": 0.03482682630419731, + "learning_rate": 9.241954986624052e-05, + "loss": 0.0047, + "step": 1682 + }, + { + "epoch": 1.3004441011778336, + "grad_norm": 0.03190697729587555, + "learning_rate": 9.240526355198353e-05, + "loss": 0.0126, + "step": 1683 + }, + { + "epoch": 1.3012164510523268, + "grad_norm": 0.14594252407550812, + "learning_rate": 9.239096489457898e-05, + "loss": 0.0054, + "step": 1684 + }, + { + "epoch": 1.30198880092682, + "grad_norm": 0.029185067862272263, + "learning_rate": 9.237665389818885e-05, + "loss": 0.006, + "step": 1685 + }, + { + "epoch": 1.302761150801313, + "grad_norm": 0.04388967528939247, + "learning_rate": 9.236233056697872e-05, + "loss": 0.0054, + "step": 1686 + }, + { + "epoch": 1.3035335006758062, + "grad_norm": 0.1524173617362976, + "learning_rate": 9.234799490511778e-05, + "loss": 0.0117, + "step": 1687 + }, + { + "epoch": 1.3043058505502994, + "grad_norm": 0.04995962977409363, + "learning_rate": 9.233364691677877e-05, + "loss": 0.0056, + "step": 1688 + }, + { + "epoch": 1.3050782004247925, + "grad_norm": 0.03565353527665138, + "learning_rate": 9.231928660613807e-05, + "loss": 0.0053, + "step": 1689 + }, + { + "epoch": 1.3058505502992857, + "grad_norm": 0.06701270490884781, + "learning_rate": 9.230491397737562e-05, + "loss": 0.0066, + "step": 1690 + }, + { + "epoch": 1.3066229001737788, + "grad_norm": 0.05576106533408165, + "learning_rate": 9.229052903467493e-05, + "loss": 0.0067, + "step": 1691 + }, + { + "epoch": 1.307395250048272, + "grad_norm": 0.4241020083427429, + "learning_rate": 9.22761317822231e-05, + "loss": 0.0152, + "step": 1692 + }, + { + "epoch": 1.3081675999227649, + "grad_norm": 0.02562202699482441, + "learning_rate": 9.226172222421083e-05, + "loss": 0.0045, + "step": 1693 + }, + { + "epoch": 1.3089399497972583, + "grad_norm": 0.06259538233280182, + "learning_rate": 9.224730036483241e-05, + "loss": 0.0065, + "step": 1694 + }, + { + "epoch": 1.3097122996717512, + "grad_norm": 0.055991485714912415, + "learning_rate": 9.223286620828569e-05, + "loss": 0.006, + "step": 1695 + }, + { + "epoch": 1.3104846495462446, + "grad_norm": 0.12314442545175552, + "learning_rate": 9.22184197587721e-05, + "loss": 0.0213, + "step": 1696 + }, + { + "epoch": 1.3112569994207375, + "grad_norm": 0.034334950149059296, + "learning_rate": 9.220396102049665e-05, + "loss": 0.0055, + "step": 1697 + }, + { + "epoch": 1.3120293492952309, + "grad_norm": 0.05128410831093788, + "learning_rate": 9.218948999766792e-05, + "loss": 0.0055, + "step": 1698 + }, + { + "epoch": 1.3128016991697238, + "grad_norm": 0.12215114384889603, + "learning_rate": 9.21750066944981e-05, + "loss": 0.0058, + "step": 1699 + }, + { + "epoch": 1.3135740490442172, + "grad_norm": 0.03655848279595375, + "learning_rate": 9.21605111152029e-05, + "loss": 0.0058, + "step": 1700 + }, + { + "epoch": 1.31434639891871, + "grad_norm": 0.07133016735315323, + "learning_rate": 9.214600326400165e-05, + "loss": 0.0062, + "step": 1701 + }, + { + "epoch": 1.3151187487932035, + "grad_norm": 0.03918299451470375, + "learning_rate": 9.213148314511723e-05, + "loss": 0.0053, + "step": 1702 + }, + { + "epoch": 1.3158910986676964, + "grad_norm": 0.01166351418942213, + "learning_rate": 9.211695076277611e-05, + "loss": 0.0053, + "step": 1703 + }, + { + "epoch": 1.3166634485421895, + "grad_norm": 0.03695222735404968, + "learning_rate": 9.210240612120831e-05, + "loss": 0.0053, + "step": 1704 + }, + { + "epoch": 1.3174357984166827, + "grad_norm": 0.07988899946212769, + "learning_rate": 9.208784922464742e-05, + "loss": 0.0057, + "step": 1705 + }, + { + "epoch": 1.3182081482911758, + "grad_norm": 0.023410560563206673, + "learning_rate": 9.207328007733059e-05, + "loss": 0.0059, + "step": 1706 + }, + { + "epoch": 1.318980498165669, + "grad_norm": 0.018493063747882843, + "learning_rate": 9.205869868349854e-05, + "loss": 0.0055, + "step": 1707 + }, + { + "epoch": 1.3197528480401621, + "grad_norm": 0.06907300651073456, + "learning_rate": 9.204410504739559e-05, + "loss": 0.0055, + "step": 1708 + }, + { + "epoch": 1.3205251979146553, + "grad_norm": 0.04984796419739723, + "learning_rate": 9.202949917326957e-05, + "loss": 0.0057, + "step": 1709 + }, + { + "epoch": 1.3212975477891484, + "grad_norm": 0.02336471527814865, + "learning_rate": 9.201488106537192e-05, + "loss": 0.005, + "step": 1710 + }, + { + "epoch": 1.3220698976636416, + "grad_norm": 0.026698529720306396, + "learning_rate": 9.200025072795762e-05, + "loss": 0.0059, + "step": 1711 + }, + { + "epoch": 1.3228422475381347, + "grad_norm": 0.020736917853355408, + "learning_rate": 9.198560816528519e-05, + "loss": 0.0058, + "step": 1712 + }, + { + "epoch": 1.3236145974126279, + "grad_norm": 0.030118491500616074, + "learning_rate": 9.197095338161671e-05, + "loss": 0.0058, + "step": 1713 + }, + { + "epoch": 1.324386947287121, + "grad_norm": 0.02043141797184944, + "learning_rate": 9.195628638121786e-05, + "loss": 0.0054, + "step": 1714 + }, + { + "epoch": 1.3251592971616142, + "grad_norm": 0.017679326236248016, + "learning_rate": 9.194160716835786e-05, + "loss": 0.0053, + "step": 1715 + }, + { + "epoch": 1.3259316470361073, + "grad_norm": 0.02373124659061432, + "learning_rate": 9.192691574730944e-05, + "loss": 0.0052, + "step": 1716 + }, + { + "epoch": 1.3267039969106005, + "grad_norm": 0.1194394901394844, + "learning_rate": 9.191221212234895e-05, + "loss": 0.0051, + "step": 1717 + }, + { + "epoch": 1.3274763467850936, + "grad_norm": 0.02503488026559353, + "learning_rate": 9.189749629775622e-05, + "loss": 0.0048, + "step": 1718 + }, + { + "epoch": 1.3282486966595868, + "grad_norm": 0.04794839769601822, + "learning_rate": 9.188276827781472e-05, + "loss": 0.0061, + "step": 1719 + }, + { + "epoch": 1.32902104653408, + "grad_norm": 0.021881457418203354, + "learning_rate": 9.186802806681139e-05, + "loss": 0.0048, + "step": 1720 + }, + { + "epoch": 1.329793396408573, + "grad_norm": 0.023368775844573975, + "learning_rate": 9.185327566903675e-05, + "loss": 0.0052, + "step": 1721 + }, + { + "epoch": 1.3305657462830662, + "grad_norm": 0.0751650407910347, + "learning_rate": 9.183851108878488e-05, + "loss": 0.0057, + "step": 1722 + }, + { + "epoch": 1.3313380961575594, + "grad_norm": 0.020163100212812424, + "learning_rate": 9.182373433035338e-05, + "loss": 0.0051, + "step": 1723 + }, + { + "epoch": 1.3321104460320525, + "grad_norm": 0.0644889622926712, + "learning_rate": 9.180894539804342e-05, + "loss": 0.0059, + "step": 1724 + }, + { + "epoch": 1.3328827959065457, + "grad_norm": 0.05060422047972679, + "learning_rate": 9.179414429615969e-05, + "loss": 0.0063, + "step": 1725 + }, + { + "epoch": 1.3336551457810388, + "grad_norm": 0.016556408256292343, + "learning_rate": 9.177933102901044e-05, + "loss": 0.0046, + "step": 1726 + }, + { + "epoch": 1.334427495655532, + "grad_norm": 0.054401129484176636, + "learning_rate": 9.176450560090745e-05, + "loss": 0.0052, + "step": 1727 + }, + { + "epoch": 1.3351998455300251, + "grad_norm": 0.014559631235897541, + "learning_rate": 9.174966801616603e-05, + "loss": 0.0047, + "step": 1728 + }, + { + "epoch": 1.3359721954045183, + "grad_norm": 0.02533547952771187, + "learning_rate": 9.173481827910508e-05, + "loss": 0.0057, + "step": 1729 + }, + { + "epoch": 1.3367445452790114, + "grad_norm": 0.021959854289889336, + "learning_rate": 9.171995639404696e-05, + "loss": 0.0052, + "step": 1730 + }, + { + "epoch": 1.3375168951535046, + "grad_norm": 0.017806829884648323, + "learning_rate": 9.170508236531763e-05, + "loss": 0.0053, + "step": 1731 + }, + { + "epoch": 1.3382892450279977, + "grad_norm": 0.032244615256786346, + "learning_rate": 9.169019619724654e-05, + "loss": 0.0049, + "step": 1732 + }, + { + "epoch": 1.3390615949024909, + "grad_norm": 0.01540327351540327, + "learning_rate": 9.167529789416671e-05, + "loss": 0.0051, + "step": 1733 + }, + { + "epoch": 1.339833944776984, + "grad_norm": 0.023344023153185844, + "learning_rate": 9.166038746041468e-05, + "loss": 0.0051, + "step": 1734 + }, + { + "epoch": 1.3406062946514772, + "grad_norm": 0.015337632037699223, + "learning_rate": 9.164546490033051e-05, + "loss": 0.0051, + "step": 1735 + }, + { + "epoch": 1.3413786445259703, + "grad_norm": 0.015382025390863419, + "learning_rate": 9.16305302182578e-05, + "loss": 0.0057, + "step": 1736 + }, + { + "epoch": 1.3421509944004635, + "grad_norm": 0.021464722231030464, + "learning_rate": 9.161558341854366e-05, + "loss": 0.0052, + "step": 1737 + }, + { + "epoch": 1.3429233442749566, + "grad_norm": 0.027189360931515694, + "learning_rate": 9.160062450553874e-05, + "loss": 0.0055, + "step": 1738 + }, + { + "epoch": 1.3436956941494498, + "grad_norm": 0.012121383100748062, + "learning_rate": 9.158565348359727e-05, + "loss": 0.0049, + "step": 1739 + }, + { + "epoch": 1.3444680440239427, + "grad_norm": 0.014514083042740822, + "learning_rate": 9.157067035707689e-05, + "loss": 0.0051, + "step": 1740 + }, + { + "epoch": 1.345240393898436, + "grad_norm": 0.01988913305103779, + "learning_rate": 9.155567513033884e-05, + "loss": 0.0048, + "step": 1741 + }, + { + "epoch": 1.346012743772929, + "grad_norm": 0.013310214504599571, + "learning_rate": 9.154066780774791e-05, + "loss": 0.0051, + "step": 1742 + }, + { + "epoch": 1.3467850936474224, + "grad_norm": 0.014470972120761871, + "learning_rate": 9.15256483936723e-05, + "loss": 0.0044, + "step": 1743 + }, + { + "epoch": 1.3475574435219153, + "grad_norm": 0.013322905637323856, + "learning_rate": 9.151061689248386e-05, + "loss": 0.004, + "step": 1744 + }, + { + "epoch": 1.3483297933964087, + "grad_norm": 0.013611961156129837, + "learning_rate": 9.149557330855787e-05, + "loss": 0.0048, + "step": 1745 + }, + { + "epoch": 1.3491021432709016, + "grad_norm": 0.026355788111686707, + "learning_rate": 9.148051764627315e-05, + "loss": 0.0047, + "step": 1746 + }, + { + "epoch": 1.349874493145395, + "grad_norm": 0.023001806810498238, + "learning_rate": 9.146544991001204e-05, + "loss": 0.0053, + "step": 1747 + }, + { + "epoch": 1.350646843019888, + "grad_norm": 0.016485080122947693, + "learning_rate": 9.145037010416043e-05, + "loss": 0.0045, + "step": 1748 + }, + { + "epoch": 1.3514191928943813, + "grad_norm": 0.024821486324071884, + "learning_rate": 9.143527823310762e-05, + "loss": 0.005, + "step": 1749 + }, + { + "epoch": 1.3521915427688742, + "grad_norm": 0.015309068374335766, + "learning_rate": 9.142017430124655e-05, + "loss": 0.0046, + "step": 1750 + }, + { + "epoch": 1.3529638926433676, + "grad_norm": 0.0249322522431612, + "learning_rate": 9.140505831297357e-05, + "loss": 0.005, + "step": 1751 + }, + { + "epoch": 1.3537362425178605, + "grad_norm": 0.016517311334609985, + "learning_rate": 9.138993027268861e-05, + "loss": 0.0045, + "step": 1752 + }, + { + "epoch": 1.3545085923923537, + "grad_norm": 0.01045146957039833, + "learning_rate": 9.137479018479506e-05, + "loss": 0.0047, + "step": 1753 + }, + { + "epoch": 1.3552809422668468, + "grad_norm": 0.01968018338084221, + "learning_rate": 9.135963805369983e-05, + "loss": 0.005, + "step": 1754 + }, + { + "epoch": 1.35605329214134, + "grad_norm": 0.023560628294944763, + "learning_rate": 9.134447388381335e-05, + "loss": 0.0044, + "step": 1755 + }, + { + "epoch": 1.3568256420158331, + "grad_norm": 0.017931949347257614, + "learning_rate": 9.132929767954951e-05, + "loss": 0.0048, + "step": 1756 + }, + { + "epoch": 1.3575979918903263, + "grad_norm": 0.011551735922694206, + "learning_rate": 9.13141094453258e-05, + "loss": 0.0046, + "step": 1757 + }, + { + "epoch": 1.3583703417648194, + "grad_norm": 0.015296884812414646, + "learning_rate": 9.129890918556309e-05, + "loss": 0.0049, + "step": 1758 + }, + { + "epoch": 1.3591426916393126, + "grad_norm": 0.011915626004338264, + "learning_rate": 9.128369690468586e-05, + "loss": 0.0044, + "step": 1759 + }, + { + "epoch": 1.3599150415138057, + "grad_norm": 0.014855817891657352, + "learning_rate": 9.126847260712198e-05, + "loss": 0.0047, + "step": 1760 + }, + { + "epoch": 1.3606873913882989, + "grad_norm": 0.012156839482486248, + "learning_rate": 9.125323629730291e-05, + "loss": 0.0049, + "step": 1761 + }, + { + "epoch": 1.361459741262792, + "grad_norm": 0.01043495163321495, + "learning_rate": 9.123798797966357e-05, + "loss": 0.0045, + "step": 1762 + }, + { + "epoch": 1.3622320911372852, + "grad_norm": 0.015351502224802971, + "learning_rate": 9.122272765864237e-05, + "loss": 0.0045, + "step": 1763 + }, + { + "epoch": 1.3630044410117783, + "grad_norm": 0.017676282674074173, + "learning_rate": 9.120745533868121e-05, + "loss": 0.0048, + "step": 1764 + }, + { + "epoch": 1.3637767908862715, + "grad_norm": 0.0142407501116395, + "learning_rate": 9.119217102422552e-05, + "loss": 0.0047, + "step": 1765 + }, + { + "epoch": 1.3645491407607646, + "grad_norm": 0.013388961553573608, + "learning_rate": 9.117687471972418e-05, + "loss": 0.0049, + "step": 1766 + }, + { + "epoch": 1.3653214906352578, + "grad_norm": 0.015762466937303543, + "learning_rate": 9.116156642962956e-05, + "loss": 0.005, + "step": 1767 + }, + { + "epoch": 1.366093840509751, + "grad_norm": 0.020082663744688034, + "learning_rate": 9.114624615839756e-05, + "loss": 0.0049, + "step": 1768 + }, + { + "epoch": 1.366866190384244, + "grad_norm": 0.011602682061493397, + "learning_rate": 9.113091391048753e-05, + "loss": 0.0044, + "step": 1769 + }, + { + "epoch": 1.3676385402587372, + "grad_norm": 0.011811558157205582, + "learning_rate": 9.111556969036232e-05, + "loss": 0.0049, + "step": 1770 + }, + { + "epoch": 1.3684108901332304, + "grad_norm": 0.010968919843435287, + "learning_rate": 9.110021350248825e-05, + "loss": 0.0048, + "step": 1771 + }, + { + "epoch": 1.3691832400077235, + "grad_norm": 0.01981322281062603, + "learning_rate": 9.108484535133514e-05, + "loss": 0.0054, + "step": 1772 + }, + { + "epoch": 1.3699555898822167, + "grad_norm": 0.012514528818428516, + "learning_rate": 9.10694652413763e-05, + "loss": 0.0048, + "step": 1773 + }, + { + "epoch": 1.3707279397567098, + "grad_norm": 0.01699264906346798, + "learning_rate": 9.105407317708849e-05, + "loss": 0.005, + "step": 1774 + }, + { + "epoch": 1.371500289631203, + "grad_norm": 0.009253366850316525, + "learning_rate": 9.103866916295198e-05, + "loss": 0.0043, + "step": 1775 + }, + { + "epoch": 1.3722726395056961, + "grad_norm": 0.010722543112933636, + "learning_rate": 9.102325320345052e-05, + "loss": 0.0044, + "step": 1776 + }, + { + "epoch": 1.3730449893801893, + "grad_norm": 0.0193585567176342, + "learning_rate": 9.100782530307128e-05, + "loss": 0.0047, + "step": 1777 + }, + { + "epoch": 1.3738173392546824, + "grad_norm": 0.013146034441888332, + "learning_rate": 9.099238546630498e-05, + "loss": 0.0046, + "step": 1778 + }, + { + "epoch": 1.3745896891291756, + "grad_norm": 0.013135528191924095, + "learning_rate": 9.097693369764579e-05, + "loss": 0.0048, + "step": 1779 + }, + { + "epoch": 1.3753620390036687, + "grad_norm": 0.02093779295682907, + "learning_rate": 9.096147000159132e-05, + "loss": 0.005, + "step": 1780 + }, + { + "epoch": 1.3761343888781619, + "grad_norm": 0.010460708290338516, + "learning_rate": 9.09459943826427e-05, + "loss": 0.0044, + "step": 1781 + }, + { + "epoch": 1.376906738752655, + "grad_norm": 0.021889355033636093, + "learning_rate": 9.093050684530451e-05, + "loss": 0.0048, + "step": 1782 + }, + { + "epoch": 1.3776790886271482, + "grad_norm": 0.017091799527406693, + "learning_rate": 9.091500739408478e-05, + "loss": 0.0043, + "step": 1783 + }, + { + "epoch": 1.3784514385016413, + "grad_norm": 0.021311407908797264, + "learning_rate": 9.089949603349505e-05, + "loss": 0.0049, + "step": 1784 + }, + { + "epoch": 1.3792237883761345, + "grad_norm": 0.012659488245844841, + "learning_rate": 9.088397276805028e-05, + "loss": 0.0044, + "step": 1785 + }, + { + "epoch": 1.3799961382506276, + "grad_norm": 0.01769060641527176, + "learning_rate": 9.086843760226891e-05, + "loss": 0.0048, + "step": 1786 + }, + { + "epoch": 1.3807684881251205, + "grad_norm": 0.010824068449437618, + "learning_rate": 9.085289054067289e-05, + "loss": 0.0048, + "step": 1787 + }, + { + "epoch": 1.381540837999614, + "grad_norm": 0.021509883925318718, + "learning_rate": 9.083733158778755e-05, + "loss": 0.005, + "step": 1788 + }, + { + "epoch": 1.3823131878741068, + "grad_norm": 0.026273906230926514, + "learning_rate": 9.082176074814177e-05, + "loss": 0.0048, + "step": 1789 + }, + { + "epoch": 1.3830855377486002, + "grad_norm": 0.025889763608574867, + "learning_rate": 9.080617802626781e-05, + "loss": 0.0042, + "step": 1790 + }, + { + "epoch": 1.3838578876230931, + "grad_norm": 0.011952931992709637, + "learning_rate": 9.079058342670143e-05, + "loss": 0.0043, + "step": 1791 + }, + { + "epoch": 1.3846302374975865, + "grad_norm": 0.043131519109010696, + "learning_rate": 9.077497695398185e-05, + "loss": 0.0048, + "step": 1792 + }, + { + "epoch": 1.3854025873720794, + "grad_norm": 0.01599467732012272, + "learning_rate": 9.075935861265174e-05, + "loss": 0.0045, + "step": 1793 + }, + { + "epoch": 1.3861749372465728, + "grad_norm": 0.01082384493201971, + "learning_rate": 9.074372840725721e-05, + "loss": 0.0043, + "step": 1794 + }, + { + "epoch": 1.3869472871210657, + "grad_norm": 0.028997058048844337, + "learning_rate": 9.072808634234784e-05, + "loss": 0.005, + "step": 1795 + }, + { + "epoch": 1.3877196369955591, + "grad_norm": 0.02237372286617756, + "learning_rate": 9.071243242247667e-05, + "loss": 0.0047, + "step": 1796 + }, + { + "epoch": 1.388491986870052, + "grad_norm": 0.016084033995866776, + "learning_rate": 9.069676665220015e-05, + "loss": 0.0049, + "step": 1797 + }, + { + "epoch": 1.3892643367445454, + "grad_norm": 0.037538450211286545, + "learning_rate": 9.068108903607821e-05, + "loss": 0.0047, + "step": 1798 + }, + { + "epoch": 1.3900366866190383, + "grad_norm": 0.014534000307321548, + "learning_rate": 9.066539957867425e-05, + "loss": 0.0046, + "step": 1799 + }, + { + "epoch": 1.3908090364935315, + "grad_norm": 0.01424864400178194, + "learning_rate": 9.064969828455509e-05, + "loss": 0.0046, + "step": 1800 + }, + { + "epoch": 1.3915813863680246, + "grad_norm": 0.021501099690794945, + "learning_rate": 9.063398515829097e-05, + "loss": 0.0046, + "step": 1801 + }, + { + "epoch": 1.3923537362425178, + "grad_norm": 0.01063248235732317, + "learning_rate": 9.061826020445564e-05, + "loss": 0.0043, + "step": 1802 + }, + { + "epoch": 1.393126086117011, + "grad_norm": 0.10095581412315369, + "learning_rate": 9.060252342762622e-05, + "loss": 0.0045, + "step": 1803 + }, + { + "epoch": 1.393898435991504, + "grad_norm": 0.017239512875676155, + "learning_rate": 9.058677483238332e-05, + "loss": 0.0046, + "step": 1804 + }, + { + "epoch": 1.3946707858659972, + "grad_norm": 0.017307721078395844, + "learning_rate": 9.057101442331097e-05, + "loss": 0.0052, + "step": 1805 + }, + { + "epoch": 1.3954431357404904, + "grad_norm": 0.014669974334537983, + "learning_rate": 9.055524220499665e-05, + "loss": 0.0051, + "step": 1806 + }, + { + "epoch": 1.3962154856149835, + "grad_norm": 0.10470427572727203, + "learning_rate": 9.053945818203126e-05, + "loss": 0.0059, + "step": 1807 + }, + { + "epoch": 1.3969878354894767, + "grad_norm": 0.01636355370283127, + "learning_rate": 9.052366235900918e-05, + "loss": 0.0045, + "step": 1808 + }, + { + "epoch": 1.3977601853639698, + "grad_norm": 0.016303371638059616, + "learning_rate": 9.050785474052814e-05, + "loss": 0.0052, + "step": 1809 + }, + { + "epoch": 1.398532535238463, + "grad_norm": 0.022861136123538017, + "learning_rate": 9.04920353311894e-05, + "loss": 0.005, + "step": 1810 + }, + { + "epoch": 1.3993048851129561, + "grad_norm": 0.02645104192197323, + "learning_rate": 9.047620413559759e-05, + "loss": 0.0051, + "step": 1811 + }, + { + "epoch": 1.4000772349874493, + "grad_norm": 0.038855042308568954, + "learning_rate": 9.046036115836081e-05, + "loss": 0.0058, + "step": 1812 + }, + { + "epoch": 1.4008495848619424, + "grad_norm": 0.018047073855996132, + "learning_rate": 9.044450640409053e-05, + "loss": 0.0054, + "step": 1813 + }, + { + "epoch": 1.4016219347364356, + "grad_norm": 0.034481655806303024, + "learning_rate": 9.042863987740171e-05, + "loss": 0.005, + "step": 1814 + }, + { + "epoch": 1.4023942846109287, + "grad_norm": 0.01587347313761711, + "learning_rate": 9.04127615829127e-05, + "loss": 0.0052, + "step": 1815 + }, + { + "epoch": 1.403166634485422, + "grad_norm": 0.024790631607174873, + "learning_rate": 9.03968715252453e-05, + "loss": 0.0057, + "step": 1816 + }, + { + "epoch": 1.403938984359915, + "grad_norm": 0.01651730015873909, + "learning_rate": 9.038096970902472e-05, + "loss": 0.0054, + "step": 1817 + }, + { + "epoch": 1.4047113342344082, + "grad_norm": 0.018351811915636063, + "learning_rate": 9.03650561388796e-05, + "loss": 0.0047, + "step": 1818 + }, + { + "epoch": 1.4054836841089013, + "grad_norm": 0.03518560901284218, + "learning_rate": 9.034913081944199e-05, + "loss": 0.0049, + "step": 1819 + }, + { + "epoch": 1.4062560339833945, + "grad_norm": 0.01802041381597519, + "learning_rate": 9.033319375534734e-05, + "loss": 0.005, + "step": 1820 + }, + { + "epoch": 1.4070283838578876, + "grad_norm": 0.015685800462961197, + "learning_rate": 9.031724495123458e-05, + "loss": 0.0053, + "step": 1821 + }, + { + "epoch": 1.4078007337323808, + "grad_norm": 0.019203344359993935, + "learning_rate": 9.030128441174601e-05, + "loss": 0.0047, + "step": 1822 + }, + { + "epoch": 1.408573083606874, + "grad_norm": 0.018631575629115105, + "learning_rate": 9.028531214152735e-05, + "loss": 0.0056, + "step": 1823 + }, + { + "epoch": 1.409345433481367, + "grad_norm": 0.020659970119595528, + "learning_rate": 9.026932814522776e-05, + "loss": 0.0054, + "step": 1824 + }, + { + "epoch": 1.4101177833558602, + "grad_norm": 0.025972822681069374, + "learning_rate": 9.025333242749978e-05, + "loss": 0.0053, + "step": 1825 + }, + { + "epoch": 1.4108901332303534, + "grad_norm": 0.015437978319823742, + "learning_rate": 9.023732499299937e-05, + "loss": 0.0051, + "step": 1826 + }, + { + "epoch": 1.4116624831048465, + "grad_norm": 0.016610266640782356, + "learning_rate": 9.022130584638593e-05, + "loss": 0.0045, + "step": 1827 + }, + { + "epoch": 1.4124348329793397, + "grad_norm": 0.011608053930103779, + "learning_rate": 9.020527499232223e-05, + "loss": 0.0049, + "step": 1828 + }, + { + "epoch": 1.4132071828538328, + "grad_norm": 0.02084183320403099, + "learning_rate": 9.018923243547449e-05, + "loss": 0.0046, + "step": 1829 + }, + { + "epoch": 1.413979532728326, + "grad_norm": 0.013906857930123806, + "learning_rate": 9.017317818051225e-05, + "loss": 0.0055, + "step": 1830 + }, + { + "epoch": 1.4147518826028191, + "grad_norm": 0.0152976606041193, + "learning_rate": 9.015711223210857e-05, + "loss": 0.0045, + "step": 1831 + }, + { + "epoch": 1.4155242324773123, + "grad_norm": 0.016641858965158463, + "learning_rate": 9.014103459493986e-05, + "loss": 0.0046, + "step": 1832 + }, + { + "epoch": 1.4162965823518054, + "grad_norm": 0.012137793004512787, + "learning_rate": 9.012494527368588e-05, + "loss": 0.0048, + "step": 1833 + }, + { + "epoch": 1.4170689322262986, + "grad_norm": 0.031864847987890244, + "learning_rate": 9.010884427302993e-05, + "loss": 0.0047, + "step": 1834 + }, + { + "epoch": 1.4178412821007917, + "grad_norm": 0.01978926546871662, + "learning_rate": 9.009273159765853e-05, + "loss": 0.005, + "step": 1835 + }, + { + "epoch": 1.4186136319752847, + "grad_norm": 0.011707760393619537, + "learning_rate": 9.007660725226175e-05, + "loss": 0.0052, + "step": 1836 + }, + { + "epoch": 1.419385981849778, + "grad_norm": 0.03232321888208389, + "learning_rate": 9.006047124153297e-05, + "loss": 0.0052, + "step": 1837 + }, + { + "epoch": 1.420158331724271, + "grad_norm": 0.01921793818473816, + "learning_rate": 9.004432357016901e-05, + "loss": 0.0051, + "step": 1838 + }, + { + "epoch": 1.4209306815987643, + "grad_norm": 0.031026741489768028, + "learning_rate": 9.002816424287004e-05, + "loss": 0.0045, + "step": 1839 + }, + { + "epoch": 1.4217030314732573, + "grad_norm": 0.03284458816051483, + "learning_rate": 9.001199326433969e-05, + "loss": 0.0045, + "step": 1840 + }, + { + "epoch": 1.4224753813477506, + "grad_norm": 0.02016862854361534, + "learning_rate": 8.99958106392849e-05, + "loss": 0.0051, + "step": 1841 + }, + { + "epoch": 1.4232477312222436, + "grad_norm": 0.030963387340307236, + "learning_rate": 8.997961637241608e-05, + "loss": 0.0043, + "step": 1842 + }, + { + "epoch": 1.424020081096737, + "grad_norm": 0.011597777716815472, + "learning_rate": 8.996341046844696e-05, + "loss": 0.0054, + "step": 1843 + }, + { + "epoch": 1.4247924309712299, + "grad_norm": 0.014895058237016201, + "learning_rate": 8.994719293209471e-05, + "loss": 0.0047, + "step": 1844 + }, + { + "epoch": 1.4255647808457232, + "grad_norm": 0.013554844073951244, + "learning_rate": 8.993096376807983e-05, + "loss": 0.0055, + "step": 1845 + }, + { + "epoch": 1.4263371307202162, + "grad_norm": 0.016599401831626892, + "learning_rate": 8.991472298112627e-05, + "loss": 0.0047, + "step": 1846 + }, + { + "epoch": 1.4271094805947093, + "grad_norm": 0.011927340179681778, + "learning_rate": 8.989847057596131e-05, + "loss": 0.0046, + "step": 1847 + }, + { + "epoch": 1.4278818304692025, + "grad_norm": 0.015175368636846542, + "learning_rate": 8.988220655731565e-05, + "loss": 0.0049, + "step": 1848 + }, + { + "epoch": 1.4286541803436956, + "grad_norm": 0.02604135498404503, + "learning_rate": 8.986593092992334e-05, + "loss": 0.0048, + "step": 1849 + }, + { + "epoch": 1.4294265302181888, + "grad_norm": 0.012300568632781506, + "learning_rate": 8.984964369852183e-05, + "loss": 0.0049, + "step": 1850 + }, + { + "epoch": 1.430198880092682, + "grad_norm": 0.020940154790878296, + "learning_rate": 8.983334486785192e-05, + "loss": 0.0046, + "step": 1851 + }, + { + "epoch": 1.430971229967175, + "grad_norm": 0.011929292231798172, + "learning_rate": 8.981703444265783e-05, + "loss": 0.005, + "step": 1852 + }, + { + "epoch": 1.4317435798416682, + "grad_norm": 0.01556863822042942, + "learning_rate": 8.980071242768713e-05, + "loss": 0.0045, + "step": 1853 + }, + { + "epoch": 1.4325159297161614, + "grad_norm": 0.017990630120038986, + "learning_rate": 8.978437882769074e-05, + "loss": 0.0048, + "step": 1854 + }, + { + "epoch": 1.4332882795906545, + "grad_norm": 0.019159870222210884, + "learning_rate": 8.9768033647423e-05, + "loss": 0.0044, + "step": 1855 + }, + { + "epoch": 1.4340606294651477, + "grad_norm": 0.022200316190719604, + "learning_rate": 8.975167689164159e-05, + "loss": 0.0058, + "step": 1856 + }, + { + "epoch": 1.4348329793396408, + "grad_norm": 0.02062351442873478, + "learning_rate": 8.973530856510757e-05, + "loss": 0.0045, + "step": 1857 + }, + { + "epoch": 1.435605329214134, + "grad_norm": 0.04572264105081558, + "learning_rate": 8.971892867258535e-05, + "loss": 0.0054, + "step": 1858 + }, + { + "epoch": 1.4363776790886271, + "grad_norm": 0.04120631515979767, + "learning_rate": 8.970253721884272e-05, + "loss": 0.005, + "step": 1859 + }, + { + "epoch": 1.4371500289631203, + "grad_norm": 0.042257945984601974, + "learning_rate": 8.968613420865087e-05, + "loss": 0.0058, + "step": 1860 + }, + { + "epoch": 1.4379223788376134, + "grad_norm": 0.013364771381020546, + "learning_rate": 8.966971964678429e-05, + "loss": 0.0047, + "step": 1861 + }, + { + "epoch": 1.4386947287121066, + "grad_norm": 0.02408026158809662, + "learning_rate": 8.965329353802087e-05, + "loss": 0.0045, + "step": 1862 + }, + { + "epoch": 1.4394670785865997, + "grad_norm": 0.04472861811518669, + "learning_rate": 8.963685588714185e-05, + "loss": 0.0051, + "step": 1863 + }, + { + "epoch": 1.4402394284610929, + "grad_norm": 0.011410464532673359, + "learning_rate": 8.962040669893184e-05, + "loss": 0.0051, + "step": 1864 + }, + { + "epoch": 1.441011778335586, + "grad_norm": 0.024275388568639755, + "learning_rate": 8.960394597817878e-05, + "loss": 0.0046, + "step": 1865 + }, + { + "epoch": 1.4417841282100792, + "grad_norm": 0.029177196323871613, + "learning_rate": 8.958747372967403e-05, + "loss": 0.0046, + "step": 1866 + }, + { + "epoch": 1.4425564780845723, + "grad_norm": 0.011070268228650093, + "learning_rate": 8.95709899582122e-05, + "loss": 0.0046, + "step": 1867 + }, + { + "epoch": 1.4433288279590655, + "grad_norm": 0.01813647709786892, + "learning_rate": 8.955449466859138e-05, + "loss": 0.0042, + "step": 1868 + }, + { + "epoch": 1.4441011778335586, + "grad_norm": 0.03172139450907707, + "learning_rate": 8.953798786561294e-05, + "loss": 0.0053, + "step": 1869 + }, + { + "epoch": 1.4448735277080518, + "grad_norm": 0.02178853377699852, + "learning_rate": 8.952146955408157e-05, + "loss": 0.0048, + "step": 1870 + }, + { + "epoch": 1.445645877582545, + "grad_norm": 0.013043739832937717, + "learning_rate": 8.95049397388054e-05, + "loss": 0.0051, + "step": 1871 + }, + { + "epoch": 1.446418227457038, + "grad_norm": 0.016747932881116867, + "learning_rate": 8.948839842459583e-05, + "loss": 0.0053, + "step": 1872 + }, + { + "epoch": 1.4471905773315312, + "grad_norm": 0.012382641434669495, + "learning_rate": 8.947184561626765e-05, + "loss": 0.0048, + "step": 1873 + }, + { + "epoch": 1.4479629272060244, + "grad_norm": 0.024634165689349174, + "learning_rate": 8.945528131863896e-05, + "loss": 0.0043, + "step": 1874 + }, + { + "epoch": 1.4487352770805175, + "grad_norm": 0.015179039910435677, + "learning_rate": 8.943870553653126e-05, + "loss": 0.0048, + "step": 1875 + }, + { + "epoch": 1.4495076269550107, + "grad_norm": 0.0111264418810606, + "learning_rate": 8.942211827476934e-05, + "loss": 0.005, + "step": 1876 + }, + { + "epoch": 1.4502799768295038, + "grad_norm": 0.012447085231542587, + "learning_rate": 8.940551953818136e-05, + "loss": 0.0048, + "step": 1877 + }, + { + "epoch": 1.451052326703997, + "grad_norm": 0.009181671775877476, + "learning_rate": 8.938890933159881e-05, + "loss": 0.0044, + "step": 1878 + }, + { + "epoch": 1.4518246765784901, + "grad_norm": 0.009875455871224403, + "learning_rate": 8.93722876598565e-05, + "loss": 0.0046, + "step": 1879 + }, + { + "epoch": 1.4525970264529833, + "grad_norm": 0.014886122196912766, + "learning_rate": 8.935565452779263e-05, + "loss": 0.0051, + "step": 1880 + }, + { + "epoch": 1.4533693763274764, + "grad_norm": 0.013239393942058086, + "learning_rate": 8.933900994024868e-05, + "loss": 0.0051, + "step": 1881 + }, + { + "epoch": 1.4541417262019696, + "grad_norm": 0.012474555522203445, + "learning_rate": 8.932235390206948e-05, + "loss": 0.0049, + "step": 1882 + }, + { + "epoch": 1.4549140760764625, + "grad_norm": 0.012886752374470234, + "learning_rate": 8.930568641810324e-05, + "loss": 0.0049, + "step": 1883 + }, + { + "epoch": 1.4556864259509559, + "grad_norm": 0.012491632252931595, + "learning_rate": 8.928900749320143e-05, + "loss": 0.0046, + "step": 1884 + }, + { + "epoch": 1.4564587758254488, + "grad_norm": 0.02662251889705658, + "learning_rate": 8.927231713221886e-05, + "loss": 0.0046, + "step": 1885 + }, + { + "epoch": 1.4572311256999422, + "grad_norm": 0.01857464201748371, + "learning_rate": 8.925561534001374e-05, + "loss": 0.0046, + "step": 1886 + }, + { + "epoch": 1.458003475574435, + "grad_norm": 0.02623864635825157, + "learning_rate": 8.923890212144755e-05, + "loss": 0.0049, + "step": 1887 + }, + { + "epoch": 1.4587758254489285, + "grad_norm": 0.023755142465233803, + "learning_rate": 8.922217748138508e-05, + "loss": 0.0046, + "step": 1888 + }, + { + "epoch": 1.4595481753234214, + "grad_norm": 0.019182506948709488, + "learning_rate": 8.920544142469447e-05, + "loss": 0.0042, + "step": 1889 + }, + { + "epoch": 1.4603205251979148, + "grad_norm": 0.014418594539165497, + "learning_rate": 8.918869395624719e-05, + "loss": 0.0045, + "step": 1890 + }, + { + "epoch": 1.4610928750724077, + "grad_norm": 0.029968436807394028, + "learning_rate": 8.917193508091803e-05, + "loss": 0.0048, + "step": 1891 + }, + { + "epoch": 1.461865224946901, + "grad_norm": 0.015648595988750458, + "learning_rate": 8.91551648035851e-05, + "loss": 0.0047, + "step": 1892 + }, + { + "epoch": 1.462637574821394, + "grad_norm": 0.023029394447803497, + "learning_rate": 8.91383831291298e-05, + "loss": 0.0046, + "step": 1893 + }, + { + "epoch": 1.4634099246958872, + "grad_norm": 0.016863130033016205, + "learning_rate": 8.912159006243688e-05, + "loss": 0.0047, + "step": 1894 + }, + { + "epoch": 1.4641822745703803, + "grad_norm": 0.01065768487751484, + "learning_rate": 8.91047856083944e-05, + "loss": 0.0043, + "step": 1895 + }, + { + "epoch": 1.4649546244448735, + "grad_norm": 0.012436062097549438, + "learning_rate": 8.908796977189371e-05, + "loss": 0.0054, + "step": 1896 + }, + { + "epoch": 1.4657269743193666, + "grad_norm": 0.008846023119986057, + "learning_rate": 8.907114255782953e-05, + "loss": 0.0043, + "step": 1897 + }, + { + "epoch": 1.4664993241938598, + "grad_norm": 0.017335759475827217, + "learning_rate": 8.905430397109981e-05, + "loss": 0.0039, + "step": 1898 + }, + { + "epoch": 1.467271674068353, + "grad_norm": 0.00891080778092146, + "learning_rate": 8.903745401660591e-05, + "loss": 0.0042, + "step": 1899 + }, + { + "epoch": 1.468044023942846, + "grad_norm": 0.01016610860824585, + "learning_rate": 8.90205926992524e-05, + "loss": 0.0043, + "step": 1900 + }, + { + "epoch": 1.4688163738173392, + "grad_norm": 0.021171605214476585, + "learning_rate": 8.900372002394723e-05, + "loss": 0.0046, + "step": 1901 + }, + { + "epoch": 1.4695887236918324, + "grad_norm": 0.013704544864594936, + "learning_rate": 8.898683599560162e-05, + "loss": 0.0046, + "step": 1902 + }, + { + "epoch": 1.4703610735663255, + "grad_norm": 0.010161432437598705, + "learning_rate": 8.896994061913009e-05, + "loss": 0.0043, + "step": 1903 + }, + { + "epoch": 1.4711334234408187, + "grad_norm": 0.018764277920126915, + "learning_rate": 8.89530338994505e-05, + "loss": 0.0047, + "step": 1904 + }, + { + "epoch": 1.4719057733153118, + "grad_norm": 0.014022842980921268, + "learning_rate": 8.893611584148395e-05, + "loss": 0.0047, + "step": 1905 + }, + { + "epoch": 1.472678123189805, + "grad_norm": 0.01479887031018734, + "learning_rate": 8.891918645015491e-05, + "loss": 0.0042, + "step": 1906 + }, + { + "epoch": 1.473450473064298, + "grad_norm": 0.014984914101660252, + "learning_rate": 8.89022457303911e-05, + "loss": 0.0039, + "step": 1907 + }, + { + "epoch": 1.4742228229387913, + "grad_norm": 0.014987524598836899, + "learning_rate": 8.888529368712357e-05, + "loss": 0.0046, + "step": 1908 + }, + { + "epoch": 1.4749951728132844, + "grad_norm": 0.018837671726942062, + "learning_rate": 8.886833032528665e-05, + "loss": 0.0042, + "step": 1909 + }, + { + "epoch": 1.4757675226877776, + "grad_norm": 0.016406618058681488, + "learning_rate": 8.885135564981794e-05, + "loss": 0.0049, + "step": 1910 + }, + { + "epoch": 1.4765398725622707, + "grad_norm": 0.011555914767086506, + "learning_rate": 8.883436966565836e-05, + "loss": 0.0048, + "step": 1911 + }, + { + "epoch": 1.4773122224367639, + "grad_norm": 0.023820480331778526, + "learning_rate": 8.881737237775216e-05, + "loss": 0.0046, + "step": 1912 + }, + { + "epoch": 1.478084572311257, + "grad_norm": 0.0240377988666296, + "learning_rate": 8.880036379104681e-05, + "loss": 0.0048, + "step": 1913 + }, + { + "epoch": 1.4788569221857502, + "grad_norm": 0.017000969499349594, + "learning_rate": 8.87833439104931e-05, + "loss": 0.0051, + "step": 1914 + }, + { + "epoch": 1.4796292720602433, + "grad_norm": 0.03072609193623066, + "learning_rate": 8.876631274104511e-05, + "loss": 0.0048, + "step": 1915 + }, + { + "epoch": 1.4804016219347365, + "grad_norm": 0.04257432371377945, + "learning_rate": 8.87492702876602e-05, + "loss": 0.0047, + "step": 1916 + }, + { + "epoch": 1.4811739718092296, + "grad_norm": 0.016059063374996185, + "learning_rate": 8.873221655529902e-05, + "loss": 0.0049, + "step": 1917 + }, + { + "epoch": 1.4819463216837228, + "grad_norm": 0.028160041198134422, + "learning_rate": 8.871515154892549e-05, + "loss": 0.0053, + "step": 1918 + }, + { + "epoch": 1.482718671558216, + "grad_norm": 0.022643111646175385, + "learning_rate": 8.869807527350683e-05, + "loss": 0.0046, + "step": 1919 + }, + { + "epoch": 1.483491021432709, + "grad_norm": 0.01096571795642376, + "learning_rate": 8.868098773401352e-05, + "loss": 0.0045, + "step": 1920 + }, + { + "epoch": 1.4842633713072022, + "grad_norm": 0.01421598345041275, + "learning_rate": 8.866388893541932e-05, + "loss": 0.0043, + "step": 1921 + }, + { + "epoch": 1.4850357211816954, + "grad_norm": 0.03223200514912605, + "learning_rate": 8.864677888270133e-05, + "loss": 0.0046, + "step": 1922 + }, + { + "epoch": 1.4858080710561885, + "grad_norm": 0.011875314638018608, + "learning_rate": 8.862965758083983e-05, + "loss": 0.0046, + "step": 1923 + }, + { + "epoch": 1.4865804209306817, + "grad_norm": 0.01744246669113636, + "learning_rate": 8.861252503481842e-05, + "loss": 0.0047, + "step": 1924 + }, + { + "epoch": 1.4873527708051748, + "grad_norm": 0.011263393796980381, + "learning_rate": 8.859538124962397e-05, + "loss": 0.0041, + "step": 1925 + }, + { + "epoch": 1.488125120679668, + "grad_norm": 0.016610687598586082, + "learning_rate": 8.857822623024663e-05, + "loss": 0.0044, + "step": 1926 + }, + { + "epoch": 1.488897470554161, + "grad_norm": 0.010531166568398476, + "learning_rate": 8.85610599816798e-05, + "loss": 0.0043, + "step": 1927 + }, + { + "epoch": 1.4896698204286543, + "grad_norm": 0.021132631227374077, + "learning_rate": 8.854388250892019e-05, + "loss": 0.0047, + "step": 1928 + }, + { + "epoch": 1.4904421703031474, + "grad_norm": 0.01411630492657423, + "learning_rate": 8.852669381696772e-05, + "loss": 0.0044, + "step": 1929 + }, + { + "epoch": 1.4912145201776403, + "grad_norm": 0.012702484615147114, + "learning_rate": 8.85094939108256e-05, + "loss": 0.0047, + "step": 1930 + }, + { + "epoch": 1.4919868700521337, + "grad_norm": 0.009157133288681507, + "learning_rate": 8.849228279550032e-05, + "loss": 0.0041, + "step": 1931 + }, + { + "epoch": 1.4927592199266266, + "grad_norm": 0.020086420699954033, + "learning_rate": 8.847506047600162e-05, + "loss": 0.0045, + "step": 1932 + }, + { + "epoch": 1.49353156980112, + "grad_norm": 0.009814996272325516, + "learning_rate": 8.845782695734248e-05, + "loss": 0.0039, + "step": 1933 + }, + { + "epoch": 1.494303919675613, + "grad_norm": 0.011432342231273651, + "learning_rate": 8.844058224453919e-05, + "loss": 0.0048, + "step": 1934 + }, + { + "epoch": 1.4950762695501063, + "grad_norm": 0.01218743808567524, + "learning_rate": 8.842332634261126e-05, + "loss": 0.0042, + "step": 1935 + }, + { + "epoch": 1.4958486194245992, + "grad_norm": 0.01667974330484867, + "learning_rate": 8.840605925658145e-05, + "loss": 0.0045, + "step": 1936 + }, + { + "epoch": 1.4966209692990926, + "grad_norm": 0.0116344029083848, + "learning_rate": 8.838878099147583e-05, + "loss": 0.0042, + "step": 1937 + }, + { + "epoch": 1.4973933191735855, + "grad_norm": 0.013338102027773857, + "learning_rate": 8.837149155232364e-05, + "loss": 0.0046, + "step": 1938 + }, + { + "epoch": 1.498165669048079, + "grad_norm": 0.01640329137444496, + "learning_rate": 8.835419094415745e-05, + "loss": 0.0044, + "step": 1939 + }, + { + "epoch": 1.4989380189225718, + "grad_norm": 0.01282061543315649, + "learning_rate": 8.833687917201301e-05, + "loss": 0.0051, + "step": 1940 + }, + { + "epoch": 1.4997103687970652, + "grad_norm": 0.01326051913201809, + "learning_rate": 8.831955624092941e-05, + "loss": 0.0049, + "step": 1941 + }, + { + "epoch": 1.5004827186715581, + "grad_norm": 0.014805569313466549, + "learning_rate": 8.83022221559489e-05, + "loss": 0.004, + "step": 1942 + }, + { + "epoch": 1.5012550685460515, + "grad_norm": 0.009898116812109947, + "learning_rate": 8.828487692211704e-05, + "loss": 0.0046, + "step": 1943 + }, + { + "epoch": 1.5020274184205444, + "grad_norm": 0.025644246488809586, + "learning_rate": 8.826752054448259e-05, + "loss": 0.0049, + "step": 1944 + }, + { + "epoch": 1.5027997682950378, + "grad_norm": 0.01422660518437624, + "learning_rate": 8.825015302809756e-05, + "loss": 0.0042, + "step": 1945 + }, + { + "epoch": 1.5035721181695307, + "grad_norm": 0.019137471914291382, + "learning_rate": 8.823277437801724e-05, + "loss": 0.0048, + "step": 1946 + }, + { + "epoch": 1.504344468044024, + "grad_norm": 0.027513476088643074, + "learning_rate": 8.821538459930013e-05, + "loss": 0.0044, + "step": 1947 + }, + { + "epoch": 1.505116817918517, + "grad_norm": 0.017542105168104172, + "learning_rate": 8.819798369700797e-05, + "loss": 0.0043, + "step": 1948 + }, + { + "epoch": 1.5058891677930102, + "grad_norm": 0.025185147300362587, + "learning_rate": 8.818057167620574e-05, + "loss": 0.0044, + "step": 1949 + }, + { + "epoch": 1.5066615176675033, + "grad_norm": 0.030095340684056282, + "learning_rate": 8.816314854196167e-05, + "loss": 0.0046, + "step": 1950 + }, + { + "epoch": 1.5074338675419965, + "grad_norm": 0.011635826900601387, + "learning_rate": 8.814571429934719e-05, + "loss": 0.0042, + "step": 1951 + }, + { + "epoch": 1.5082062174164896, + "grad_norm": 0.022228319197893143, + "learning_rate": 8.8128268953437e-05, + "loss": 0.0043, + "step": 1952 + }, + { + "epoch": 1.5089785672909828, + "grad_norm": 0.02450932003557682, + "learning_rate": 8.811081250930902e-05, + "loss": 0.0046, + "step": 1953 + }, + { + "epoch": 1.509750917165476, + "grad_norm": 0.01285399030894041, + "learning_rate": 8.80933449720444e-05, + "loss": 0.0047, + "step": 1954 + }, + { + "epoch": 1.510523267039969, + "grad_norm": 0.02027706429362297, + "learning_rate": 8.807586634672751e-05, + "loss": 0.0046, + "step": 1955 + }, + { + "epoch": 1.5112956169144622, + "grad_norm": 0.013969060964882374, + "learning_rate": 8.805837663844598e-05, + "loss": 0.0043, + "step": 1956 + }, + { + "epoch": 1.5120679667889554, + "grad_norm": 0.009368671104311943, + "learning_rate": 8.804087585229061e-05, + "loss": 0.0039, + "step": 1957 + }, + { + "epoch": 1.5128403166634485, + "grad_norm": 0.023244036361575127, + "learning_rate": 8.802336399335547e-05, + "loss": 0.0046, + "step": 1958 + }, + { + "epoch": 1.5136126665379417, + "grad_norm": 0.009645405225455761, + "learning_rate": 8.800584106673784e-05, + "loss": 0.0046, + "step": 1959 + }, + { + "epoch": 1.5143850164124348, + "grad_norm": 0.013144160620868206, + "learning_rate": 8.798830707753823e-05, + "loss": 0.0047, + "step": 1960 + }, + { + "epoch": 1.515157366286928, + "grad_norm": 0.012967154383659363, + "learning_rate": 8.797076203086033e-05, + "loss": 0.0048, + "step": 1961 + }, + { + "epoch": 1.5159297161614211, + "grad_norm": 0.01168031059205532, + "learning_rate": 8.795320593181112e-05, + "loss": 0.0041, + "step": 1962 + }, + { + "epoch": 1.5167020660359143, + "grad_norm": 0.012260856106877327, + "learning_rate": 8.793563878550072e-05, + "loss": 0.0049, + "step": 1963 + }, + { + "epoch": 1.5174744159104074, + "grad_norm": 0.013430275954306126, + "learning_rate": 8.791806059704251e-05, + "loss": 0.004, + "step": 1964 + }, + { + "epoch": 1.5182467657849006, + "grad_norm": 0.02253013476729393, + "learning_rate": 8.79004713715531e-05, + "loss": 0.0044, + "step": 1965 + }, + { + "epoch": 1.5190191156593937, + "grad_norm": 0.009793316014111042, + "learning_rate": 8.788287111415227e-05, + "loss": 0.0045, + "step": 1966 + }, + { + "epoch": 1.5197914655338869, + "grad_norm": 0.016231399029493332, + "learning_rate": 8.786525982996302e-05, + "loss": 0.0038, + "step": 1967 + }, + { + "epoch": 1.52056381540838, + "grad_norm": 0.020186608657240868, + "learning_rate": 8.784763752411159e-05, + "loss": 0.0046, + "step": 1968 + }, + { + "epoch": 1.5213361652828732, + "grad_norm": 0.011528832837939262, + "learning_rate": 8.783000420172738e-05, + "loss": 0.0048, + "step": 1969 + }, + { + "epoch": 1.5221085151573663, + "grad_norm": 0.012485725805163383, + "learning_rate": 8.781235986794305e-05, + "loss": 0.0048, + "step": 1970 + }, + { + "epoch": 1.5228808650318595, + "grad_norm": 0.027413304895162582, + "learning_rate": 8.779470452789445e-05, + "loss": 0.005, + "step": 1971 + }, + { + "epoch": 1.5236532149063526, + "grad_norm": 0.008898314088582993, + "learning_rate": 8.777703818672059e-05, + "loss": 0.0042, + "step": 1972 + }, + { + "epoch": 1.5244255647808456, + "grad_norm": 0.011833777651190758, + "learning_rate": 8.775936084956371e-05, + "loss": 0.0042, + "step": 1973 + }, + { + "epoch": 1.525197914655339, + "grad_norm": 0.016269506886601448, + "learning_rate": 8.77416725215693e-05, + "loss": 0.0047, + "step": 1974 + }, + { + "epoch": 1.5259702645298319, + "grad_norm": 0.013332594186067581, + "learning_rate": 8.772397320788597e-05, + "loss": 0.0042, + "step": 1975 + }, + { + "epoch": 1.5267426144043252, + "grad_norm": 0.011268539354205132, + "learning_rate": 8.770626291366557e-05, + "loss": 0.0042, + "step": 1976 + }, + { + "epoch": 1.5275149642788182, + "grad_norm": 0.013201587833464146, + "learning_rate": 8.768854164406314e-05, + "loss": 0.0045, + "step": 1977 + }, + { + "epoch": 1.5282873141533115, + "grad_norm": 0.022223128005862236, + "learning_rate": 8.767080940423692e-05, + "loss": 0.0046, + "step": 1978 + }, + { + "epoch": 1.5290596640278045, + "grad_norm": 0.010781112127006054, + "learning_rate": 8.765306619934833e-05, + "loss": 0.0048, + "step": 1979 + }, + { + "epoch": 1.5298320139022978, + "grad_norm": 0.02067198045551777, + "learning_rate": 8.763531203456199e-05, + "loss": 0.0048, + "step": 1980 + }, + { + "epoch": 1.5306043637767908, + "grad_norm": 0.025904567912220955, + "learning_rate": 8.76175469150457e-05, + "loss": 0.0049, + "step": 1981 + }, + { + "epoch": 1.5313767136512841, + "grad_norm": 0.023831602185964584, + "learning_rate": 8.759977084597047e-05, + "loss": 0.0048, + "step": 1982 + }, + { + "epoch": 1.532149063525777, + "grad_norm": 0.024097897112369537, + "learning_rate": 8.758198383251047e-05, + "loss": 0.0044, + "step": 1983 + }, + { + "epoch": 1.5329214134002704, + "grad_norm": 0.011627217754721642, + "learning_rate": 8.756418587984307e-05, + "loss": 0.0041, + "step": 1984 + }, + { + "epoch": 1.5336937632747634, + "grad_norm": 0.013759966939687729, + "learning_rate": 8.754637699314885e-05, + "loss": 0.0041, + "step": 1985 + }, + { + "epoch": 1.5344661131492567, + "grad_norm": 0.0151802534237504, + "learning_rate": 8.752855717761152e-05, + "loss": 0.0047, + "step": 1986 + }, + { + "epoch": 1.5352384630237497, + "grad_norm": 0.02687522955238819, + "learning_rate": 8.751072643841803e-05, + "loss": 0.0047, + "step": 1987 + }, + { + "epoch": 1.536010812898243, + "grad_norm": 0.01627619005739689, + "learning_rate": 8.749288478075842e-05, + "loss": 0.0043, + "step": 1988 + }, + { + "epoch": 1.536783162772736, + "grad_norm": 0.022365057840943336, + "learning_rate": 8.747503220982602e-05, + "loss": 0.0047, + "step": 1989 + }, + { + "epoch": 1.5375555126472293, + "grad_norm": 0.010350333526730537, + "learning_rate": 8.745716873081725e-05, + "loss": 0.004, + "step": 1990 + }, + { + "epoch": 1.5383278625217223, + "grad_norm": 0.013772976584732533, + "learning_rate": 8.743929434893176e-05, + "loss": 0.0046, + "step": 1991 + }, + { + "epoch": 1.5391002123962156, + "grad_norm": 0.014426304958760738, + "learning_rate": 8.742140906937233e-05, + "loss": 0.0042, + "step": 1992 + }, + { + "epoch": 1.5398725622707086, + "grad_norm": 0.011211644858121872, + "learning_rate": 8.740351289734495e-05, + "loss": 0.0046, + "step": 1993 + }, + { + "epoch": 1.540644912145202, + "grad_norm": 0.013804316520690918, + "learning_rate": 8.738560583805873e-05, + "loss": 0.0048, + "step": 1994 + }, + { + "epoch": 1.5414172620196949, + "grad_norm": 0.010096611455082893, + "learning_rate": 8.736768789672602e-05, + "loss": 0.0041, + "step": 1995 + }, + { + "epoch": 1.542189611894188, + "grad_norm": 0.019236311316490173, + "learning_rate": 8.73497590785623e-05, + "loss": 0.0047, + "step": 1996 + }, + { + "epoch": 1.5429619617686812, + "grad_norm": 0.019567430019378662, + "learning_rate": 8.73318193887862e-05, + "loss": 0.0042, + "step": 1997 + }, + { + "epoch": 1.5437343116431743, + "grad_norm": 0.008151182904839516, + "learning_rate": 8.731386883261952e-05, + "loss": 0.004, + "step": 1998 + }, + { + "epoch": 1.5445066615176675, + "grad_norm": 0.026208965107798576, + "learning_rate": 8.729590741528726e-05, + "loss": 0.0047, + "step": 1999 + }, + { + "epoch": 1.5452790113921606, + "grad_norm": 0.047962453216314316, + "learning_rate": 8.727793514201752e-05, + "loss": 0.0046, + "step": 2000 + }, + { + "epoch": 1.5460513612666538, + "grad_norm": 0.011168277822434902, + "learning_rate": 8.725995201804163e-05, + "loss": 0.0044, + "step": 2001 + }, + { + "epoch": 1.546823711141147, + "grad_norm": 0.013109843246638775, + "learning_rate": 8.724195804859403e-05, + "loss": 0.0044, + "step": 2002 + }, + { + "epoch": 1.54759606101564, + "grad_norm": 0.02118433639407158, + "learning_rate": 8.722395323891233e-05, + "loss": 0.0041, + "step": 2003 + }, + { + "epoch": 1.5483684108901332, + "grad_norm": 0.012441566213965416, + "learning_rate": 8.720593759423728e-05, + "loss": 0.004, + "step": 2004 + }, + { + "epoch": 1.5491407607646264, + "grad_norm": 0.01646626740694046, + "learning_rate": 8.718791111981282e-05, + "loss": 0.0046, + "step": 2005 + }, + { + "epoch": 1.5499131106391195, + "grad_norm": 0.020530641078948975, + "learning_rate": 8.716987382088602e-05, + "loss": 0.005, + "step": 2006 + }, + { + "epoch": 1.5506854605136127, + "grad_norm": 0.01678456924855709, + "learning_rate": 8.715182570270707e-05, + "loss": 0.0049, + "step": 2007 + }, + { + "epoch": 1.5514578103881058, + "grad_norm": 0.011109710671007633, + "learning_rate": 8.713376677052939e-05, + "loss": 0.0041, + "step": 2008 + }, + { + "epoch": 1.552230160262599, + "grad_norm": 0.016971318051218987, + "learning_rate": 8.711569702960947e-05, + "loss": 0.0043, + "step": 2009 + }, + { + "epoch": 1.5530025101370921, + "grad_norm": 0.027093220502138138, + "learning_rate": 8.709761648520697e-05, + "loss": 0.0041, + "step": 2010 + }, + { + "epoch": 1.5537748600115853, + "grad_norm": 0.011815927922725677, + "learning_rate": 8.707952514258472e-05, + "loss": 0.0041, + "step": 2011 + }, + { + "epoch": 1.5545472098860784, + "grad_norm": 0.04407166689634323, + "learning_rate": 8.706142300700865e-05, + "loss": 0.0048, + "step": 2012 + }, + { + "epoch": 1.5553195597605716, + "grad_norm": 0.013555064797401428, + "learning_rate": 8.704331008374788e-05, + "loss": 0.0052, + "step": 2013 + }, + { + "epoch": 1.5560919096350647, + "grad_norm": 0.02045373059809208, + "learning_rate": 8.702518637807462e-05, + "loss": 0.0043, + "step": 2014 + }, + { + "epoch": 1.5568642595095579, + "grad_norm": 0.02960846945643425, + "learning_rate": 8.700705189526425e-05, + "loss": 0.0045, + "step": 2015 + }, + { + "epoch": 1.557636609384051, + "grad_norm": 0.025553971529006958, + "learning_rate": 8.69889066405953e-05, + "loss": 0.0047, + "step": 2016 + }, + { + "epoch": 1.5584089592585442, + "grad_norm": 0.015526111237704754, + "learning_rate": 8.697075061934937e-05, + "loss": 0.0042, + "step": 2017 + }, + { + "epoch": 1.5591813091330373, + "grad_norm": 0.03279326483607292, + "learning_rate": 8.695258383681128e-05, + "loss": 0.0046, + "step": 2018 + }, + { + "epoch": 1.5599536590075305, + "grad_norm": 0.020039940252900124, + "learning_rate": 8.693440629826893e-05, + "loss": 0.005, + "step": 2019 + }, + { + "epoch": 1.5607260088820234, + "grad_norm": 0.010598140768706799, + "learning_rate": 8.691621800901337e-05, + "loss": 0.0043, + "step": 2020 + }, + { + "epoch": 1.5614983587565168, + "grad_norm": 0.029510769993066788, + "learning_rate": 8.689801897433876e-05, + "loss": 0.0048, + "step": 2021 + }, + { + "epoch": 1.5622707086310097, + "grad_norm": 0.023090941831469536, + "learning_rate": 8.68798091995424e-05, + "loss": 0.0051, + "step": 2022 + }, + { + "epoch": 1.563043058505503, + "grad_norm": 0.01705724187195301, + "learning_rate": 8.68615886899247e-05, + "loss": 0.0044, + "step": 2023 + }, + { + "epoch": 1.563815408379996, + "grad_norm": 0.0226137712597847, + "learning_rate": 8.684335745078925e-05, + "loss": 0.0042, + "step": 2024 + }, + { + "epoch": 1.5645877582544894, + "grad_norm": 0.017362765967845917, + "learning_rate": 8.682511548744267e-05, + "loss": 0.0048, + "step": 2025 + }, + { + "epoch": 1.5653601081289823, + "grad_norm": 0.018477879464626312, + "learning_rate": 8.680686280519481e-05, + "loss": 0.0046, + "step": 2026 + }, + { + "epoch": 1.5661324580034757, + "grad_norm": 0.011460873298346996, + "learning_rate": 8.678859940935856e-05, + "loss": 0.0045, + "step": 2027 + }, + { + "epoch": 1.5669048078779686, + "grad_norm": 0.013441020622849464, + "learning_rate": 8.677032530524994e-05, + "loss": 0.0041, + "step": 2028 + }, + { + "epoch": 1.567677157752462, + "grad_norm": 0.02192874252796173, + "learning_rate": 8.675204049818812e-05, + "loss": 0.0045, + "step": 2029 + }, + { + "epoch": 1.568449507626955, + "grad_norm": 0.009835068136453629, + "learning_rate": 8.673374499349536e-05, + "loss": 0.0036, + "step": 2030 + }, + { + "epoch": 1.5692218575014483, + "grad_norm": 0.01787945069372654, + "learning_rate": 8.671543879649703e-05, + "loss": 0.0046, + "step": 2031 + }, + { + "epoch": 1.5699942073759412, + "grad_norm": 0.012214172631502151, + "learning_rate": 8.669712191252165e-05, + "loss": 0.004, + "step": 2032 + }, + { + "epoch": 1.5707665572504346, + "grad_norm": 0.01092873141169548, + "learning_rate": 8.667879434690078e-05, + "loss": 0.0044, + "step": 2033 + }, + { + "epoch": 1.5715389071249275, + "grad_norm": 0.011023624800145626, + "learning_rate": 8.666045610496916e-05, + "loss": 0.0041, + "step": 2034 + }, + { + "epoch": 1.5723112569994209, + "grad_norm": 0.012606930918991566, + "learning_rate": 8.66421071920646e-05, + "loss": 0.0053, + "step": 2035 + }, + { + "epoch": 1.5730836068739138, + "grad_norm": 0.00925703439861536, + "learning_rate": 8.662374761352804e-05, + "loss": 0.0043, + "step": 2036 + }, + { + "epoch": 1.5738559567484072, + "grad_norm": 0.008895349688827991, + "learning_rate": 8.660537737470348e-05, + "loss": 0.0038, + "step": 2037 + }, + { + "epoch": 1.5746283066229, + "grad_norm": 0.009540137834846973, + "learning_rate": 8.658699648093809e-05, + "loss": 0.0044, + "step": 2038 + }, + { + "epoch": 1.5754006564973935, + "grad_norm": 0.01367781963199377, + "learning_rate": 8.656860493758207e-05, + "loss": 0.0054, + "step": 2039 + }, + { + "epoch": 1.5761730063718864, + "grad_norm": 0.008566140197217464, + "learning_rate": 8.655020274998877e-05, + "loss": 0.0041, + "step": 2040 + }, + { + "epoch": 1.5769453562463798, + "grad_norm": 0.012065630406141281, + "learning_rate": 8.653178992351462e-05, + "loss": 0.0045, + "step": 2041 + }, + { + "epoch": 1.5777177061208727, + "grad_norm": 0.0094382269307971, + "learning_rate": 8.651336646351915e-05, + "loss": 0.0044, + "step": 2042 + }, + { + "epoch": 1.5784900559953658, + "grad_norm": 0.01034244243055582, + "learning_rate": 8.649493237536499e-05, + "loss": 0.0043, + "step": 2043 + }, + { + "epoch": 1.579262405869859, + "grad_norm": 0.020144378766417503, + "learning_rate": 8.647648766441784e-05, + "loss": 0.0047, + "step": 2044 + }, + { + "epoch": 1.5800347557443521, + "grad_norm": 0.014348277822136879, + "learning_rate": 8.645803233604652e-05, + "loss": 0.0042, + "step": 2045 + }, + { + "epoch": 1.5808071056188453, + "grad_norm": 0.021733032539486885, + "learning_rate": 8.643956639562294e-05, + "loss": 0.0049, + "step": 2046 + }, + { + "epoch": 1.5815794554933384, + "grad_norm": 0.021847965195775032, + "learning_rate": 8.642108984852206e-05, + "loss": 0.0046, + "step": 2047 + }, + { + "epoch": 1.5823518053678316, + "grad_norm": 0.010277594439685345, + "learning_rate": 8.640260270012199e-05, + "loss": 0.0043, + "step": 2048 + }, + { + "epoch": 1.5831241552423247, + "grad_norm": 0.013328421860933304, + "learning_rate": 8.638410495580389e-05, + "loss": 0.0042, + "step": 2049 + }, + { + "epoch": 1.583896505116818, + "grad_norm": 0.015377648174762726, + "learning_rate": 8.636559662095199e-05, + "loss": 0.0043, + "step": 2050 + }, + { + "epoch": 1.584668854991311, + "grad_norm": 0.008601309731602669, + "learning_rate": 8.63470777009536e-05, + "loss": 0.0044, + "step": 2051 + }, + { + "epoch": 1.5854412048658042, + "grad_norm": 0.014834502711892128, + "learning_rate": 8.632854820119917e-05, + "loss": 0.0048, + "step": 2052 + }, + { + "epoch": 1.5862135547402973, + "grad_norm": 0.011667091399431229, + "learning_rate": 8.631000812708217e-05, + "loss": 0.0047, + "step": 2053 + }, + { + "epoch": 1.5869859046147905, + "grad_norm": 0.010898937471210957, + "learning_rate": 8.629145748399919e-05, + "loss": 0.0047, + "step": 2054 + }, + { + "epoch": 1.5877582544892836, + "grad_norm": 0.013858029618859291, + "learning_rate": 8.627289627734983e-05, + "loss": 0.0046, + "step": 2055 + }, + { + "epoch": 1.5885306043637768, + "grad_norm": 0.011080384254455566, + "learning_rate": 8.625432451253683e-05, + "loss": 0.0045, + "step": 2056 + }, + { + "epoch": 1.58930295423827, + "grad_norm": 0.020170332863926888, + "learning_rate": 8.6235742194966e-05, + "loss": 0.0042, + "step": 2057 + }, + { + "epoch": 1.590075304112763, + "grad_norm": 0.022068405523896217, + "learning_rate": 8.621714933004619e-05, + "loss": 0.005, + "step": 2058 + }, + { + "epoch": 1.5908476539872562, + "grad_norm": 0.01097305491566658, + "learning_rate": 8.619854592318932e-05, + "loss": 0.0037, + "step": 2059 + }, + { + "epoch": 1.5916200038617494, + "grad_norm": 0.030156239867210388, + "learning_rate": 8.617993197981043e-05, + "loss": 0.0044, + "step": 2060 + }, + { + "epoch": 1.5923923537362425, + "grad_norm": 0.02470502071082592, + "learning_rate": 8.616130750532753e-05, + "loss": 0.0051, + "step": 2061 + }, + { + "epoch": 1.5931647036107357, + "grad_norm": 0.014046196825802326, + "learning_rate": 8.614267250516182e-05, + "loss": 0.0043, + "step": 2062 + }, + { + "epoch": 1.5939370534852288, + "grad_norm": 0.030482210218906403, + "learning_rate": 8.612402698473745e-05, + "loss": 0.0044, + "step": 2063 + }, + { + "epoch": 1.594709403359722, + "grad_norm": 0.02512381598353386, + "learning_rate": 8.61053709494817e-05, + "loss": 0.0043, + "step": 2064 + }, + { + "epoch": 1.5954817532342151, + "grad_norm": 0.00860644318163395, + "learning_rate": 8.608670440482489e-05, + "loss": 0.0042, + "step": 2065 + }, + { + "epoch": 1.5962541031087083, + "grad_norm": 0.012833881191909313, + "learning_rate": 8.606802735620041e-05, + "loss": 0.0046, + "step": 2066 + }, + { + "epoch": 1.5970264529832012, + "grad_norm": 0.04173438251018524, + "learning_rate": 8.604933980904466e-05, + "loss": 0.0047, + "step": 2067 + }, + { + "epoch": 1.5977988028576946, + "grad_norm": 0.021180585026741028, + "learning_rate": 8.603064176879718e-05, + "loss": 0.0044, + "step": 2068 + }, + { + "epoch": 1.5985711527321875, + "grad_norm": 0.016953809186816216, + "learning_rate": 8.601193324090049e-05, + "loss": 0.0037, + "step": 2069 + }, + { + "epoch": 1.599343502606681, + "grad_norm": 0.011930052191019058, + "learning_rate": 8.59932142308002e-05, + "loss": 0.0044, + "step": 2070 + }, + { + "epoch": 1.6001158524811738, + "grad_norm": 0.019897272810339928, + "learning_rate": 8.597448474394496e-05, + "loss": 0.0045, + "step": 2071 + }, + { + "epoch": 1.6008882023556672, + "grad_norm": 0.012094732373952866, + "learning_rate": 8.595574478578647e-05, + "loss": 0.0046, + "step": 2072 + }, + { + "epoch": 1.6016605522301601, + "grad_norm": 0.012607376091182232, + "learning_rate": 8.593699436177949e-05, + "loss": 0.0037, + "step": 2073 + }, + { + "epoch": 1.6024329021046535, + "grad_norm": 0.025926416739821434, + "learning_rate": 8.59182334773818e-05, + "loss": 0.0045, + "step": 2074 + }, + { + "epoch": 1.6032052519791464, + "grad_norm": 0.010309277102351189, + "learning_rate": 8.589946213805422e-05, + "loss": 0.0043, + "step": 2075 + }, + { + "epoch": 1.6039776018536398, + "grad_norm": 0.011180024594068527, + "learning_rate": 8.588068034926069e-05, + "loss": 0.0042, + "step": 2076 + }, + { + "epoch": 1.6047499517281327, + "grad_norm": 0.015378049574792385, + "learning_rate": 8.586188811646809e-05, + "loss": 0.004, + "step": 2077 + }, + { + "epoch": 1.605522301602626, + "grad_norm": 0.018012790009379387, + "learning_rate": 8.584308544514639e-05, + "loss": 0.0041, + "step": 2078 + }, + { + "epoch": 1.606294651477119, + "grad_norm": 0.012710918672382832, + "learning_rate": 8.582427234076861e-05, + "loss": 0.0043, + "step": 2079 + }, + { + "epoch": 1.6070670013516124, + "grad_norm": 0.021130891516804695, + "learning_rate": 8.580544880881079e-05, + "loss": 0.0045, + "step": 2080 + }, + { + "epoch": 1.6078393512261053, + "grad_norm": 0.025349225848913193, + "learning_rate": 8.578661485475199e-05, + "loss": 0.0047, + "step": 2081 + }, + { + "epoch": 1.6086117011005987, + "grad_norm": 0.012351235374808311, + "learning_rate": 8.576777048407432e-05, + "loss": 0.0045, + "step": 2082 + }, + { + "epoch": 1.6093840509750916, + "grad_norm": 0.025645259767770767, + "learning_rate": 8.574891570226292e-05, + "loss": 0.0044, + "step": 2083 + }, + { + "epoch": 1.610156400849585, + "grad_norm": 0.016709905117750168, + "learning_rate": 8.573005051480598e-05, + "loss": 0.005, + "step": 2084 + }, + { + "epoch": 1.610928750724078, + "grad_norm": 0.013472169637680054, + "learning_rate": 8.57111749271947e-05, + "loss": 0.004, + "step": 2085 + }, + { + "epoch": 1.6117011005985713, + "grad_norm": 0.011256473138928413, + "learning_rate": 8.569228894492328e-05, + "loss": 0.0037, + "step": 2086 + }, + { + "epoch": 1.6124734504730642, + "grad_norm": 0.009445869363844395, + "learning_rate": 8.567339257348898e-05, + "loss": 0.0046, + "step": 2087 + }, + { + "epoch": 1.6132458003475576, + "grad_norm": 0.014279279857873917, + "learning_rate": 8.56544858183921e-05, + "loss": 0.0049, + "step": 2088 + }, + { + "epoch": 1.6140181502220505, + "grad_norm": 0.013222617097198963, + "learning_rate": 8.563556868513592e-05, + "loss": 0.0046, + "step": 2089 + }, + { + "epoch": 1.6147905000965437, + "grad_norm": 0.011042260564863682, + "learning_rate": 8.561664117922677e-05, + "loss": 0.0042, + "step": 2090 + }, + { + "epoch": 1.6155628499710368, + "grad_norm": 0.011346302926540375, + "learning_rate": 8.559770330617399e-05, + "loss": 0.0041, + "step": 2091 + }, + { + "epoch": 1.61633519984553, + "grad_norm": 0.012888088822364807, + "learning_rate": 8.557875507148991e-05, + "loss": 0.0042, + "step": 2092 + }, + { + "epoch": 1.6171075497200231, + "grad_norm": 0.010861898772418499, + "learning_rate": 8.555979648068994e-05, + "loss": 0.0044, + "step": 2093 + }, + { + "epoch": 1.6178798995945163, + "grad_norm": 0.008471175096929073, + "learning_rate": 8.554082753929245e-05, + "loss": 0.0047, + "step": 2094 + }, + { + "epoch": 1.6186522494690094, + "grad_norm": 0.018162831664085388, + "learning_rate": 8.552184825281885e-05, + "loss": 0.0047, + "step": 2095 + }, + { + "epoch": 1.6194245993435026, + "grad_norm": 0.010423211380839348, + "learning_rate": 8.550285862679355e-05, + "loss": 0.0047, + "step": 2096 + }, + { + "epoch": 1.6201969492179957, + "grad_norm": 0.009892021305859089, + "learning_rate": 8.548385866674397e-05, + "loss": 0.0047, + "step": 2097 + }, + { + "epoch": 1.6209692990924889, + "grad_norm": 0.01203847024589777, + "learning_rate": 8.546484837820053e-05, + "loss": 0.0043, + "step": 2098 + }, + { + "epoch": 1.621741648966982, + "grad_norm": 0.01343244407325983, + "learning_rate": 8.544582776669665e-05, + "loss": 0.0049, + "step": 2099 + }, + { + "epoch": 1.6225139988414752, + "grad_norm": 0.011868324130773544, + "learning_rate": 8.542679683776881e-05, + "loss": 0.0051, + "step": 2100 + }, + { + "epoch": 1.6232863487159683, + "grad_norm": 0.012159244157373905, + "learning_rate": 8.540775559695645e-05, + "loss": 0.0041, + "step": 2101 + }, + { + "epoch": 1.6240586985904615, + "grad_norm": 0.008571230806410313, + "learning_rate": 8.5388704049802e-05, + "loss": 0.0041, + "step": 2102 + }, + { + "epoch": 1.6248310484649546, + "grad_norm": 0.012841513380408287, + "learning_rate": 8.536964220185089e-05, + "loss": 0.0042, + "step": 2103 + }, + { + "epoch": 1.6256033983394478, + "grad_norm": 0.0088861845433712, + "learning_rate": 8.535057005865158e-05, + "loss": 0.0042, + "step": 2104 + }, + { + "epoch": 1.626375748213941, + "grad_norm": 0.014981401152908802, + "learning_rate": 8.533148762575552e-05, + "loss": 0.0043, + "step": 2105 + }, + { + "epoch": 1.627148098088434, + "grad_norm": 0.01102379895746708, + "learning_rate": 8.531239490871712e-05, + "loss": 0.0048, + "step": 2106 + }, + { + "epoch": 1.6279204479629272, + "grad_norm": 0.010146740823984146, + "learning_rate": 8.529329191309383e-05, + "loss": 0.0045, + "step": 2107 + }, + { + "epoch": 1.6286927978374204, + "grad_norm": 0.02327127754688263, + "learning_rate": 8.527417864444606e-05, + "loss": 0.005, + "step": 2108 + }, + { + "epoch": 1.6294651477119135, + "grad_norm": 0.010763900354504585, + "learning_rate": 8.525505510833724e-05, + "loss": 0.0042, + "step": 2109 + }, + { + "epoch": 1.6302374975864067, + "grad_norm": 0.013578513637185097, + "learning_rate": 8.523592131033376e-05, + "loss": 0.0042, + "step": 2110 + }, + { + "epoch": 1.6310098474608998, + "grad_norm": 0.022221015766263008, + "learning_rate": 8.521677725600497e-05, + "loss": 0.005, + "step": 2111 + }, + { + "epoch": 1.631782197335393, + "grad_norm": 0.018897736445069313, + "learning_rate": 8.519762295092329e-05, + "loss": 0.0048, + "step": 2112 + }, + { + "epoch": 1.6325545472098861, + "grad_norm": 0.011113962158560753, + "learning_rate": 8.517845840066406e-05, + "loss": 0.0039, + "step": 2113 + }, + { + "epoch": 1.633326897084379, + "grad_norm": 0.034275464713573456, + "learning_rate": 8.515928361080558e-05, + "loss": 0.0048, + "step": 2114 + }, + { + "epoch": 1.6340992469588724, + "grad_norm": 0.011763646267354488, + "learning_rate": 8.514009858692924e-05, + "loss": 0.0045, + "step": 2115 + }, + { + "epoch": 1.6348715968333654, + "grad_norm": 0.010240164585411549, + "learning_rate": 8.512090333461929e-05, + "loss": 0.0049, + "step": 2116 + }, + { + "epoch": 1.6356439467078587, + "grad_norm": 0.010845442302525043, + "learning_rate": 8.5101697859463e-05, + "loss": 0.0038, + "step": 2117 + }, + { + "epoch": 1.6364162965823517, + "grad_norm": 0.013729963451623917, + "learning_rate": 8.508248216705065e-05, + "loss": 0.005, + "step": 2118 + }, + { + "epoch": 1.637188646456845, + "grad_norm": 0.016924763098359108, + "learning_rate": 8.506325626297545e-05, + "loss": 0.0052, + "step": 2119 + }, + { + "epoch": 1.637960996331338, + "grad_norm": 0.010555649176239967, + "learning_rate": 8.504402015283358e-05, + "loss": 0.0041, + "step": 2120 + }, + { + "epoch": 1.6387333462058313, + "grad_norm": 0.015738019719719887, + "learning_rate": 8.502477384222423e-05, + "loss": 0.0042, + "step": 2121 + }, + { + "epoch": 1.6395056960803243, + "grad_norm": 0.017977280542254448, + "learning_rate": 8.50055173367495e-05, + "loss": 0.0045, + "step": 2122 + }, + { + "epoch": 1.6402780459548176, + "grad_norm": 0.02719496376812458, + "learning_rate": 8.498625064201455e-05, + "loss": 0.0046, + "step": 2123 + }, + { + "epoch": 1.6410503958293106, + "grad_norm": 0.023861657828092575, + "learning_rate": 8.496697376362742e-05, + "loss": 0.0045, + "step": 2124 + }, + { + "epoch": 1.641822745703804, + "grad_norm": 0.01022212952375412, + "learning_rate": 8.494768670719912e-05, + "loss": 0.0037, + "step": 2125 + }, + { + "epoch": 1.6425950955782969, + "grad_norm": 0.009150752797722816, + "learning_rate": 8.492838947834367e-05, + "loss": 0.0045, + "step": 2126 + }, + { + "epoch": 1.6433674454527902, + "grad_norm": 0.025724977254867554, + "learning_rate": 8.490908208267805e-05, + "loss": 0.0049, + "step": 2127 + }, + { + "epoch": 1.6441397953272832, + "grad_norm": 0.014376234263181686, + "learning_rate": 8.488976452582213e-05, + "loss": 0.0045, + "step": 2128 + }, + { + "epoch": 1.6449121452017765, + "grad_norm": 0.015311934985220432, + "learning_rate": 8.487043681339881e-05, + "loss": 0.0046, + "step": 2129 + }, + { + "epoch": 1.6456844950762695, + "grad_norm": 0.018781229853630066, + "learning_rate": 8.485109895103391e-05, + "loss": 0.0042, + "step": 2130 + }, + { + "epoch": 1.6464568449507628, + "grad_norm": 0.013387559913098812, + "learning_rate": 8.483175094435622e-05, + "loss": 0.0041, + "step": 2131 + }, + { + "epoch": 1.6472291948252558, + "grad_norm": 0.03222404792904854, + "learning_rate": 8.481239279899748e-05, + "loss": 0.0046, + "step": 2132 + }, + { + "epoch": 1.6480015446997491, + "grad_norm": 0.012264180928468704, + "learning_rate": 8.479302452059238e-05, + "loss": 0.0042, + "step": 2133 + }, + { + "epoch": 1.648773894574242, + "grad_norm": 0.014101590029895306, + "learning_rate": 8.477364611477857e-05, + "loss": 0.0042, + "step": 2134 + }, + { + "epoch": 1.6495462444487354, + "grad_norm": 0.02377992682158947, + "learning_rate": 8.475425758719659e-05, + "loss": 0.0049, + "step": 2135 + }, + { + "epoch": 1.6503185943232284, + "grad_norm": 0.022946573793888092, + "learning_rate": 8.473485894349002e-05, + "loss": 0.0048, + "step": 2136 + }, + { + "epoch": 1.6510909441977217, + "grad_norm": 0.010878360830247402, + "learning_rate": 8.471545018930531e-05, + "loss": 0.0041, + "step": 2137 + }, + { + "epoch": 1.6518632940722147, + "grad_norm": 0.011692160740494728, + "learning_rate": 8.46960313302919e-05, + "loss": 0.0041, + "step": 2138 + }, + { + "epoch": 1.6526356439467078, + "grad_norm": 0.011987571604549885, + "learning_rate": 8.467660237210211e-05, + "loss": 0.0045, + "step": 2139 + }, + { + "epoch": 1.653407993821201, + "grad_norm": 0.009922868572175503, + "learning_rate": 8.465716332039128e-05, + "loss": 0.004, + "step": 2140 + }, + { + "epoch": 1.654180343695694, + "grad_norm": 0.02513553760945797, + "learning_rate": 8.463771418081763e-05, + "loss": 0.0044, + "step": 2141 + }, + { + "epoch": 1.6549526935701873, + "grad_norm": 0.009684463031589985, + "learning_rate": 8.461825495904236e-05, + "loss": 0.0042, + "step": 2142 + }, + { + "epoch": 1.6557250434446804, + "grad_norm": 0.011489993892610073, + "learning_rate": 8.459878566072955e-05, + "loss": 0.0042, + "step": 2143 + }, + { + "epoch": 1.6564973933191736, + "grad_norm": 0.021958818659186363, + "learning_rate": 8.457930629154625e-05, + "loss": 0.0041, + "step": 2144 + }, + { + "epoch": 1.6572697431936667, + "grad_norm": 0.019374405965209007, + "learning_rate": 8.455981685716244e-05, + "loss": 0.0043, + "step": 2145 + }, + { + "epoch": 1.6580420930681599, + "grad_norm": 0.010465430095791817, + "learning_rate": 8.454031736325101e-05, + "loss": 0.0043, + "step": 2146 + }, + { + "epoch": 1.658814442942653, + "grad_norm": 0.01048517506569624, + "learning_rate": 8.452080781548781e-05, + "loss": 0.0041, + "step": 2147 + }, + { + "epoch": 1.6595867928171462, + "grad_norm": 0.026124538853764534, + "learning_rate": 8.450128821955159e-05, + "loss": 0.0044, + "step": 2148 + }, + { + "epoch": 1.6603591426916393, + "grad_norm": 0.012944714166224003, + "learning_rate": 8.448175858112402e-05, + "loss": 0.0042, + "step": 2149 + }, + { + "epoch": 1.6611314925661325, + "grad_norm": 0.011114759370684624, + "learning_rate": 8.446221890588972e-05, + "loss": 0.004, + "step": 2150 + }, + { + "epoch": 1.6619038424406256, + "grad_norm": 0.023538824170827866, + "learning_rate": 8.444266919953623e-05, + "loss": 0.0051, + "step": 2151 + }, + { + "epoch": 1.6626761923151188, + "grad_norm": 0.015576600097119808, + "learning_rate": 8.442310946775397e-05, + "loss": 0.005, + "step": 2152 + }, + { + "epoch": 1.663448542189612, + "grad_norm": 0.013887589797377586, + "learning_rate": 8.440353971623631e-05, + "loss": 0.0043, + "step": 2153 + }, + { + "epoch": 1.664220892064105, + "grad_norm": 0.03156410902738571, + "learning_rate": 8.438395995067956e-05, + "loss": 0.0045, + "step": 2154 + }, + { + "epoch": 1.6649932419385982, + "grad_norm": 0.02747179940342903, + "learning_rate": 8.436437017678287e-05, + "loss": 0.0045, + "step": 2155 + }, + { + "epoch": 1.6657655918130914, + "grad_norm": 0.00802780594676733, + "learning_rate": 8.43447704002484e-05, + "loss": 0.0037, + "step": 2156 + }, + { + "epoch": 1.6665379416875845, + "grad_norm": 0.026982294395565987, + "learning_rate": 8.432516062678113e-05, + "loss": 0.0046, + "step": 2157 + }, + { + "epoch": 1.6673102915620777, + "grad_norm": 0.020452868193387985, + "learning_rate": 8.430554086208902e-05, + "loss": 0.0044, + "step": 2158 + }, + { + "epoch": 1.6680826414365708, + "grad_norm": 0.008987967856228352, + "learning_rate": 8.428591111188289e-05, + "loss": 0.0043, + "step": 2159 + }, + { + "epoch": 1.668854991311064, + "grad_norm": 0.008565445430576801, + "learning_rate": 8.426627138187648e-05, + "loss": 0.0044, + "step": 2160 + }, + { + "epoch": 1.6696273411855571, + "grad_norm": 0.0161731019616127, + "learning_rate": 8.424662167778647e-05, + "loss": 0.0043, + "step": 2161 + }, + { + "epoch": 1.6703996910600503, + "grad_norm": 0.01388185191899538, + "learning_rate": 8.42269620053324e-05, + "loss": 0.0042, + "step": 2162 + }, + { + "epoch": 1.6711720409345432, + "grad_norm": 0.010824089869856834, + "learning_rate": 8.420729237023672e-05, + "loss": 0.0045, + "step": 2163 + }, + { + "epoch": 1.6719443908090366, + "grad_norm": 0.018120337277650833, + "learning_rate": 8.418761277822478e-05, + "loss": 0.0047, + "step": 2164 + }, + { + "epoch": 1.6727167406835295, + "grad_norm": 0.01861923560500145, + "learning_rate": 8.416792323502486e-05, + "loss": 0.0046, + "step": 2165 + }, + { + "epoch": 1.6734890905580229, + "grad_norm": 0.012371395714581013, + "learning_rate": 8.414822374636808e-05, + "loss": 0.0039, + "step": 2166 + }, + { + "epoch": 1.6742614404325158, + "grad_norm": 0.012984382919967175, + "learning_rate": 8.41285143179885e-05, + "loss": 0.0039, + "step": 2167 + }, + { + "epoch": 1.6750337903070092, + "grad_norm": 0.015273356810212135, + "learning_rate": 8.410879495562307e-05, + "loss": 0.0044, + "step": 2168 + }, + { + "epoch": 1.675806140181502, + "grad_norm": 0.014091392047703266, + "learning_rate": 8.40890656650116e-05, + "loss": 0.0043, + "step": 2169 + }, + { + "epoch": 1.6765784900559955, + "grad_norm": 0.01114340964704752, + "learning_rate": 8.40693264518968e-05, + "loss": 0.0042, + "step": 2170 + }, + { + "epoch": 1.6773508399304884, + "grad_norm": 0.011085857637226582, + "learning_rate": 8.404957732202431e-05, + "loss": 0.0045, + "step": 2171 + }, + { + "epoch": 1.6781231898049818, + "grad_norm": 0.02794049121439457, + "learning_rate": 8.402981828114261e-05, + "loss": 0.004, + "step": 2172 + }, + { + "epoch": 1.6788955396794747, + "grad_norm": 0.016085123643279076, + "learning_rate": 8.401004933500307e-05, + "loss": 0.0045, + "step": 2173 + }, + { + "epoch": 1.679667889553968, + "grad_norm": 0.010472382418811321, + "learning_rate": 8.399027048935997e-05, + "loss": 0.0042, + "step": 2174 + }, + { + "epoch": 1.680440239428461, + "grad_norm": 0.00927384290844202, + "learning_rate": 8.397048174997044e-05, + "loss": 0.0043, + "step": 2175 + }, + { + "epoch": 1.6812125893029544, + "grad_norm": 0.03752785176038742, + "learning_rate": 8.395068312259451e-05, + "loss": 0.0048, + "step": 2176 + }, + { + "epoch": 1.6819849391774473, + "grad_norm": 0.011899629607796669, + "learning_rate": 8.393087461299508e-05, + "loss": 0.0042, + "step": 2177 + }, + { + "epoch": 1.6827572890519407, + "grad_norm": 0.01649544946849346, + "learning_rate": 8.391105622693793e-05, + "loss": 0.0046, + "step": 2178 + }, + { + "epoch": 1.6835296389264336, + "grad_norm": 0.04666175693273544, + "learning_rate": 8.389122797019172e-05, + "loss": 0.005, + "step": 2179 + }, + { + "epoch": 1.684301988800927, + "grad_norm": 0.008677134290337563, + "learning_rate": 8.387138984852795e-05, + "loss": 0.0039, + "step": 2180 + }, + { + "epoch": 1.68507433867542, + "grad_norm": 0.02470334805548191, + "learning_rate": 8.385154186772106e-05, + "loss": 0.0046, + "step": 2181 + }, + { + "epoch": 1.6858466885499133, + "grad_norm": 0.01587885431945324, + "learning_rate": 8.383168403354827e-05, + "loss": 0.0041, + "step": 2182 + }, + { + "epoch": 1.6866190384244062, + "grad_norm": 0.02103058062493801, + "learning_rate": 8.381181635178976e-05, + "loss": 0.0045, + "step": 2183 + }, + { + "epoch": 1.6873913882988996, + "grad_norm": 0.00931734312325716, + "learning_rate": 8.379193882822851e-05, + "loss": 0.0045, + "step": 2184 + }, + { + "epoch": 1.6881637381733925, + "grad_norm": 0.023588426411151886, + "learning_rate": 8.377205146865038e-05, + "loss": 0.0049, + "step": 2185 + }, + { + "epoch": 1.6889360880478856, + "grad_norm": 0.021287525072693825, + "learning_rate": 8.37521542788441e-05, + "loss": 0.004, + "step": 2186 + }, + { + "epoch": 1.6897084379223788, + "grad_norm": 0.014442606829106808, + "learning_rate": 8.37322472646013e-05, + "loss": 0.0042, + "step": 2187 + }, + { + "epoch": 1.690480787796872, + "grad_norm": 0.013919384218752384, + "learning_rate": 8.371233043171637e-05, + "loss": 0.0051, + "step": 2188 + }, + { + "epoch": 1.691253137671365, + "grad_norm": 0.02148333191871643, + "learning_rate": 8.369240378598667e-05, + "loss": 0.0044, + "step": 2189 + }, + { + "epoch": 1.6920254875458582, + "grad_norm": 0.020780183374881744, + "learning_rate": 8.367246733321235e-05, + "loss": 0.0049, + "step": 2190 + }, + { + "epoch": 1.6927978374203514, + "grad_norm": 0.01798652857542038, + "learning_rate": 8.365252107919641e-05, + "loss": 0.0043, + "step": 2191 + }, + { + "epoch": 1.6935701872948445, + "grad_norm": 0.016136417165398598, + "learning_rate": 8.363256502974474e-05, + "loss": 0.0041, + "step": 2192 + }, + { + "epoch": 1.6943425371693377, + "grad_norm": 0.009567273780703545, + "learning_rate": 8.361259919066606e-05, + "loss": 0.004, + "step": 2193 + }, + { + "epoch": 1.6951148870438308, + "grad_norm": 0.009171852841973305, + "learning_rate": 8.359262356777194e-05, + "loss": 0.0044, + "step": 2194 + }, + { + "epoch": 1.695887236918324, + "grad_norm": 0.009534629993140697, + "learning_rate": 8.357263816687681e-05, + "loss": 0.0042, + "step": 2195 + }, + { + "epoch": 1.6966595867928171, + "grad_norm": 0.025950351729989052, + "learning_rate": 8.355264299379794e-05, + "loss": 0.0044, + "step": 2196 + }, + { + "epoch": 1.6974319366673103, + "grad_norm": 0.00965665653347969, + "learning_rate": 8.353263805435543e-05, + "loss": 0.0045, + "step": 2197 + }, + { + "epoch": 1.6982042865418034, + "grad_norm": 0.02293946035206318, + "learning_rate": 8.351262335437224e-05, + "loss": 0.0046, + "step": 2198 + }, + { + "epoch": 1.6989766364162966, + "grad_norm": 0.018668964505195618, + "learning_rate": 8.349259889967416e-05, + "loss": 0.0043, + "step": 2199 + }, + { + "epoch": 1.6997489862907897, + "grad_norm": 0.011089003644883633, + "learning_rate": 8.347256469608983e-05, + "loss": 0.0043, + "step": 2200 + }, + { + "epoch": 1.700521336165283, + "grad_norm": 0.010769315995275974, + "learning_rate": 8.345252074945071e-05, + "loss": 0.0044, + "step": 2201 + }, + { + "epoch": 1.701293686039776, + "grad_norm": 0.018320759758353233, + "learning_rate": 8.343246706559113e-05, + "loss": 0.0042, + "step": 2202 + }, + { + "epoch": 1.7020660359142692, + "grad_norm": 0.019361697137355804, + "learning_rate": 8.341240365034823e-05, + "loss": 0.0042, + "step": 2203 + }, + { + "epoch": 1.7028383857887623, + "grad_norm": 0.011455011554062366, + "learning_rate": 8.339233050956198e-05, + "loss": 0.0041, + "step": 2204 + }, + { + "epoch": 1.7036107356632555, + "grad_norm": 0.011829971335828304, + "learning_rate": 8.337224764907518e-05, + "loss": 0.0048, + "step": 2205 + }, + { + "epoch": 1.7043830855377486, + "grad_norm": 0.008956990204751492, + "learning_rate": 8.335215507473346e-05, + "loss": 0.004, + "step": 2206 + }, + { + "epoch": 1.7051554354122418, + "grad_norm": 0.009752951562404633, + "learning_rate": 8.333205279238531e-05, + "loss": 0.004, + "step": 2207 + }, + { + "epoch": 1.705927785286735, + "grad_norm": 0.010895447805523872, + "learning_rate": 8.3311940807882e-05, + "loss": 0.0037, + "step": 2208 + }, + { + "epoch": 1.706700135161228, + "grad_norm": 0.007751632947474718, + "learning_rate": 8.329181912707764e-05, + "loss": 0.0037, + "step": 2209 + }, + { + "epoch": 1.707472485035721, + "grad_norm": 0.012226882390677929, + "learning_rate": 8.327168775582916e-05, + "loss": 0.0045, + "step": 2210 + }, + { + "epoch": 1.7082448349102144, + "grad_norm": 0.01452571339905262, + "learning_rate": 8.325154669999634e-05, + "loss": 0.0041, + "step": 2211 + }, + { + "epoch": 1.7090171847847073, + "grad_norm": 0.013364356011152267, + "learning_rate": 8.323139596544174e-05, + "loss": 0.0041, + "step": 2212 + }, + { + "epoch": 1.7097895346592007, + "grad_norm": 0.02150997333228588, + "learning_rate": 8.321123555803074e-05, + "loss": 0.0043, + "step": 2213 + }, + { + "epoch": 1.7105618845336936, + "grad_norm": 0.015584706328809261, + "learning_rate": 8.319106548363156e-05, + "loss": 0.0038, + "step": 2214 + }, + { + "epoch": 1.711334234408187, + "grad_norm": 0.009450695477426052, + "learning_rate": 8.317088574811524e-05, + "loss": 0.004, + "step": 2215 + }, + { + "epoch": 1.71210658428268, + "grad_norm": 0.008295596577227116, + "learning_rate": 8.315069635735557e-05, + "loss": 0.0042, + "step": 2216 + }, + { + "epoch": 1.7128789341571733, + "grad_norm": 0.02395360730588436, + "learning_rate": 8.313049731722924e-05, + "loss": 0.004, + "step": 2217 + }, + { + "epoch": 1.7136512840316662, + "grad_norm": 0.029887588694691658, + "learning_rate": 8.311028863361566e-05, + "loss": 0.0043, + "step": 2218 + }, + { + "epoch": 1.7144236339061596, + "grad_norm": 0.028670670464634895, + "learning_rate": 8.309007031239712e-05, + "loss": 0.0042, + "step": 2219 + }, + { + "epoch": 1.7151959837806525, + "grad_norm": 0.020392004400491714, + "learning_rate": 8.306984235945868e-05, + "loss": 0.0036, + "step": 2220 + }, + { + "epoch": 1.715968333655146, + "grad_norm": 0.028491845354437828, + "learning_rate": 8.304960478068819e-05, + "loss": 0.0045, + "step": 2221 + }, + { + "epoch": 1.7167406835296388, + "grad_norm": 0.01415249053388834, + "learning_rate": 8.302935758197634e-05, + "loss": 0.0044, + "step": 2222 + }, + { + "epoch": 1.7175130334041322, + "grad_norm": 0.03231945261359215, + "learning_rate": 8.30091007692166e-05, + "loss": 0.0046, + "step": 2223 + }, + { + "epoch": 1.7182853832786251, + "grad_norm": 0.023468049243092537, + "learning_rate": 8.29888343483052e-05, + "loss": 0.0045, + "step": 2224 + }, + { + "epoch": 1.7190577331531185, + "grad_norm": 0.009676053188741207, + "learning_rate": 8.296855832514128e-05, + "loss": 0.0036, + "step": 2225 + }, + { + "epoch": 1.7198300830276114, + "grad_norm": 0.012138811871409416, + "learning_rate": 8.294827270562664e-05, + "loss": 0.0041, + "step": 2226 + }, + { + "epoch": 1.7206024329021048, + "grad_norm": 0.02946018986403942, + "learning_rate": 8.292797749566594e-05, + "loss": 0.0046, + "step": 2227 + }, + { + "epoch": 1.7213747827765977, + "grad_norm": 0.01382883358746767, + "learning_rate": 8.290767270116666e-05, + "loss": 0.0043, + "step": 2228 + }, + { + "epoch": 1.722147132651091, + "grad_norm": 0.011325203813612461, + "learning_rate": 8.2887358328039e-05, + "loss": 0.0043, + "step": 2229 + }, + { + "epoch": 1.722919482525584, + "grad_norm": 0.015367789193987846, + "learning_rate": 8.2867034382196e-05, + "loss": 0.004, + "step": 2230 + }, + { + "epoch": 1.7236918324000774, + "grad_norm": 0.019331874325871468, + "learning_rate": 8.284670086955346e-05, + "loss": 0.0043, + "step": 2231 + }, + { + "epoch": 1.7244641822745703, + "grad_norm": 0.009774122387170792, + "learning_rate": 8.282635779602998e-05, + "loss": 0.0044, + "step": 2232 + }, + { + "epoch": 1.7252365321490635, + "grad_norm": 0.00924541987478733, + "learning_rate": 8.280600516754694e-05, + "loss": 0.0036, + "step": 2233 + }, + { + "epoch": 1.7260088820235566, + "grad_norm": 0.011827325448393822, + "learning_rate": 8.278564299002849e-05, + "loss": 0.0043, + "step": 2234 + }, + { + "epoch": 1.7267812318980498, + "grad_norm": 0.016981936991214752, + "learning_rate": 8.276527126940157e-05, + "loss": 0.0048, + "step": 2235 + }, + { + "epoch": 1.727553581772543, + "grad_norm": 0.011866158805787563, + "learning_rate": 8.27448900115959e-05, + "loss": 0.0042, + "step": 2236 + }, + { + "epoch": 1.728325931647036, + "grad_norm": 0.011118290014564991, + "learning_rate": 8.272449922254398e-05, + "loss": 0.0047, + "step": 2237 + }, + { + "epoch": 1.7290982815215292, + "grad_norm": 0.016521891579031944, + "learning_rate": 8.270409890818104e-05, + "loss": 0.005, + "step": 2238 + }, + { + "epoch": 1.7298706313960224, + "grad_norm": 0.009404239244759083, + "learning_rate": 8.268368907444518e-05, + "loss": 0.0046, + "step": 2239 + }, + { + "epoch": 1.7306429812705155, + "grad_norm": 0.009855051524937153, + "learning_rate": 8.266326972727714e-05, + "loss": 0.0044, + "step": 2240 + }, + { + "epoch": 1.7314153311450087, + "grad_norm": 0.01203001756221056, + "learning_rate": 8.264284087262056e-05, + "loss": 0.0051, + "step": 2241 + }, + { + "epoch": 1.7321876810195018, + "grad_norm": 0.015957722440361977, + "learning_rate": 8.262240251642173e-05, + "loss": 0.004, + "step": 2242 + }, + { + "epoch": 1.732960030893995, + "grad_norm": 0.014566673897206783, + "learning_rate": 8.260195466462981e-05, + "loss": 0.0044, + "step": 2243 + }, + { + "epoch": 1.7337323807684881, + "grad_norm": 0.016216455027461052, + "learning_rate": 8.258149732319665e-05, + "loss": 0.0048, + "step": 2244 + }, + { + "epoch": 1.7345047306429813, + "grad_norm": 0.014252996072173119, + "learning_rate": 8.256103049807688e-05, + "loss": 0.0045, + "step": 2245 + }, + { + "epoch": 1.7352770805174744, + "grad_norm": 0.011566858738660812, + "learning_rate": 8.254055419522792e-05, + "loss": 0.0045, + "step": 2246 + }, + { + "epoch": 1.7360494303919676, + "grad_norm": 0.01703554019331932, + "learning_rate": 8.252006842060993e-05, + "loss": 0.0041, + "step": 2247 + }, + { + "epoch": 1.7368217802664607, + "grad_norm": 0.009798984974622726, + "learning_rate": 8.249957318018581e-05, + "loss": 0.0049, + "step": 2248 + }, + { + "epoch": 1.7375941301409539, + "grad_norm": 0.008998863399028778, + "learning_rate": 8.247906847992122e-05, + "loss": 0.0041, + "step": 2249 + }, + { + "epoch": 1.738366480015447, + "grad_norm": 0.009814360179007053, + "learning_rate": 8.24585543257846e-05, + "loss": 0.0046, + "step": 2250 + }, + { + "epoch": 1.7391388298899402, + "grad_norm": 0.016251133754849434, + "learning_rate": 8.243803072374711e-05, + "loss": 0.0047, + "step": 2251 + }, + { + "epoch": 1.7399111797644333, + "grad_norm": 0.007743713911622763, + "learning_rate": 8.24174976797827e-05, + "loss": 0.0038, + "step": 2252 + }, + { + "epoch": 1.7406835296389265, + "grad_norm": 0.00983304250985384, + "learning_rate": 8.239695519986802e-05, + "loss": 0.0042, + "step": 2253 + }, + { + "epoch": 1.7414558795134196, + "grad_norm": 0.01851450838148594, + "learning_rate": 8.237640328998249e-05, + "loss": 0.0048, + "step": 2254 + }, + { + "epoch": 1.7422282293879128, + "grad_norm": 0.011477231048047543, + "learning_rate": 8.235584195610829e-05, + "loss": 0.0041, + "step": 2255 + }, + { + "epoch": 1.743000579262406, + "grad_norm": 0.010229852981865406, + "learning_rate": 8.233527120423031e-05, + "loss": 0.0041, + "step": 2256 + }, + { + "epoch": 1.7437729291368989, + "grad_norm": 0.027905916795134544, + "learning_rate": 8.231469104033621e-05, + "loss": 0.0044, + "step": 2257 + }, + { + "epoch": 1.7445452790113922, + "grad_norm": 0.010942929424345493, + "learning_rate": 8.229410147041639e-05, + "loss": 0.0042, + "step": 2258 + }, + { + "epoch": 1.7453176288858852, + "grad_norm": 0.008921844884753227, + "learning_rate": 8.227350250046393e-05, + "loss": 0.0044, + "step": 2259 + }, + { + "epoch": 1.7460899787603785, + "grad_norm": 0.017951516434550285, + "learning_rate": 8.225289413647475e-05, + "loss": 0.004, + "step": 2260 + }, + { + "epoch": 1.7468623286348715, + "grad_norm": 0.01976814493536949, + "learning_rate": 8.22322763844474e-05, + "loss": 0.004, + "step": 2261 + }, + { + "epoch": 1.7476346785093648, + "grad_norm": 0.014263933524489403, + "learning_rate": 8.221164925038325e-05, + "loss": 0.005, + "step": 2262 + }, + { + "epoch": 1.7484070283838578, + "grad_norm": 0.020307086408138275, + "learning_rate": 8.219101274028634e-05, + "loss": 0.0042, + "step": 2263 + }, + { + "epoch": 1.7491793782583511, + "grad_norm": 0.021188564598560333, + "learning_rate": 8.217036686016344e-05, + "loss": 0.0042, + "step": 2264 + }, + { + "epoch": 1.749951728132844, + "grad_norm": 0.010408424772322178, + "learning_rate": 8.21497116160241e-05, + "loss": 0.0043, + "step": 2265 + }, + { + "epoch": 1.7507240780073374, + "grad_norm": 0.01547803170979023, + "learning_rate": 8.212904701388054e-05, + "loss": 0.0042, + "step": 2266 + }, + { + "epoch": 1.7514964278818304, + "grad_norm": 0.030950602144002914, + "learning_rate": 8.210837305974775e-05, + "loss": 0.0045, + "step": 2267 + }, + { + "epoch": 1.7522687777563237, + "grad_norm": 0.00840567797422409, + "learning_rate": 8.208768975964338e-05, + "loss": 0.0045, + "step": 2268 + }, + { + "epoch": 1.7530411276308167, + "grad_norm": 0.014739587903022766, + "learning_rate": 8.206699711958789e-05, + "loss": 0.0038, + "step": 2269 + }, + { + "epoch": 1.75381347750531, + "grad_norm": 0.027478396892547607, + "learning_rate": 8.204629514560437e-05, + "loss": 0.0043, + "step": 2270 + }, + { + "epoch": 1.754585827379803, + "grad_norm": 0.009460730478167534, + "learning_rate": 8.202558384371868e-05, + "loss": 0.0047, + "step": 2271 + }, + { + "epoch": 1.7553581772542963, + "grad_norm": 0.026636026799678802, + "learning_rate": 8.200486321995936e-05, + "loss": 0.005, + "step": 2272 + }, + { + "epoch": 1.7561305271287893, + "grad_norm": 0.0176105797290802, + "learning_rate": 8.19841332803577e-05, + "loss": 0.0048, + "step": 2273 + }, + { + "epoch": 1.7569028770032826, + "grad_norm": 0.01977243646979332, + "learning_rate": 8.196339403094768e-05, + "loss": 0.0043, + "step": 2274 + }, + { + "epoch": 1.7576752268777756, + "grad_norm": 0.00944372545927763, + "learning_rate": 8.194264547776603e-05, + "loss": 0.0047, + "step": 2275 + }, + { + "epoch": 1.758447576752269, + "grad_norm": 0.01759764365851879, + "learning_rate": 8.192188762685208e-05, + "loss": 0.0043, + "step": 2276 + }, + { + "epoch": 1.7592199266267619, + "grad_norm": 0.014346209354698658, + "learning_rate": 8.190112048424799e-05, + "loss": 0.0043, + "step": 2277 + }, + { + "epoch": 1.7599922765012552, + "grad_norm": 0.01344558596611023, + "learning_rate": 8.188034405599856e-05, + "loss": 0.0045, + "step": 2278 + }, + { + "epoch": 1.7607646263757482, + "grad_norm": 0.010901092551648617, + "learning_rate": 8.18595583481513e-05, + "loss": 0.0047, + "step": 2279 + }, + { + "epoch": 1.7615369762502413, + "grad_norm": 0.012345247901976109, + "learning_rate": 8.183876336675644e-05, + "loss": 0.0046, + "step": 2280 + }, + { + "epoch": 1.7623093261247345, + "grad_norm": 0.012082891538739204, + "learning_rate": 8.18179591178669e-05, + "loss": 0.0045, + "step": 2281 + }, + { + "epoch": 1.7630816759992276, + "grad_norm": 0.010318143293261528, + "learning_rate": 8.179714560753828e-05, + "loss": 0.0043, + "step": 2282 + }, + { + "epoch": 1.7638540258737208, + "grad_norm": 0.01975180394947529, + "learning_rate": 8.177632284182888e-05, + "loss": 0.0048, + "step": 2283 + }, + { + "epoch": 1.764626375748214, + "grad_norm": 0.010247784666717052, + "learning_rate": 8.175549082679973e-05, + "loss": 0.0046, + "step": 2284 + }, + { + "epoch": 1.765398725622707, + "grad_norm": 0.012388781644403934, + "learning_rate": 8.173464956851452e-05, + "loss": 0.0042, + "step": 2285 + }, + { + "epoch": 1.7661710754972002, + "grad_norm": 0.010144729167222977, + "learning_rate": 8.171379907303964e-05, + "loss": 0.004, + "step": 2286 + }, + { + "epoch": 1.7669434253716934, + "grad_norm": 0.019393224269151688, + "learning_rate": 8.169293934644412e-05, + "loss": 0.0041, + "step": 2287 + }, + { + "epoch": 1.7677157752461865, + "grad_norm": 0.010567243210971355, + "learning_rate": 8.16720703947998e-05, + "loss": 0.004, + "step": 2288 + }, + { + "epoch": 1.7684881251206797, + "grad_norm": 0.013309785164892673, + "learning_rate": 8.165119222418107e-05, + "loss": 0.0041, + "step": 2289 + }, + { + "epoch": 1.7692604749951728, + "grad_norm": 0.010740738362073898, + "learning_rate": 8.163030484066508e-05, + "loss": 0.0044, + "step": 2290 + }, + { + "epoch": 1.770032824869666, + "grad_norm": 0.012563997879624367, + "learning_rate": 8.160940825033165e-05, + "loss": 0.0042, + "step": 2291 + }, + { + "epoch": 1.770805174744159, + "grad_norm": 0.008908885531127453, + "learning_rate": 8.158850245926325e-05, + "loss": 0.0043, + "step": 2292 + }, + { + "epoch": 1.7715775246186523, + "grad_norm": 0.013040358200669289, + "learning_rate": 8.156758747354507e-05, + "loss": 0.0043, + "step": 2293 + }, + { + "epoch": 1.7723498744931454, + "grad_norm": 0.01532841194421053, + "learning_rate": 8.154666329926494e-05, + "loss": 0.0046, + "step": 2294 + }, + { + "epoch": 1.7731222243676386, + "grad_norm": 0.009438990615308285, + "learning_rate": 8.152572994251342e-05, + "loss": 0.0041, + "step": 2295 + }, + { + "epoch": 1.7738945742421317, + "grad_norm": 0.01765560731291771, + "learning_rate": 8.150478740938365e-05, + "loss": 0.0038, + "step": 2296 + }, + { + "epoch": 1.7746669241166249, + "grad_norm": 0.009543683379888535, + "learning_rate": 8.148383570597154e-05, + "loss": 0.0041, + "step": 2297 + }, + { + "epoch": 1.775439273991118, + "grad_norm": 0.010798865929245949, + "learning_rate": 8.14628748383756e-05, + "loss": 0.0044, + "step": 2298 + }, + { + "epoch": 1.7762116238656112, + "grad_norm": 0.00954483449459076, + "learning_rate": 8.144190481269702e-05, + "loss": 0.0044, + "step": 2299 + }, + { + "epoch": 1.7769839737401043, + "grad_norm": 0.007039578165858984, + "learning_rate": 8.142092563503972e-05, + "loss": 0.0041, + "step": 2300 + }, + { + "epoch": 1.7777563236145975, + "grad_norm": 0.022727273404598236, + "learning_rate": 8.139993731151017e-05, + "loss": 0.0039, + "step": 2301 + }, + { + "epoch": 1.7785286734890906, + "grad_norm": 0.013155174441635609, + "learning_rate": 8.137893984821761e-05, + "loss": 0.0041, + "step": 2302 + }, + { + "epoch": 1.7793010233635838, + "grad_norm": 0.014248731546103954, + "learning_rate": 8.135793325127388e-05, + "loss": 0.0043, + "step": 2303 + }, + { + "epoch": 1.7800733732380767, + "grad_norm": 0.027962619438767433, + "learning_rate": 8.133691752679347e-05, + "loss": 0.0046, + "step": 2304 + }, + { + "epoch": 1.78084572311257, + "grad_norm": 0.023610763251781464, + "learning_rate": 8.131589268089358e-05, + "loss": 0.0039, + "step": 2305 + }, + { + "epoch": 1.781618072987063, + "grad_norm": 0.012954623438417912, + "learning_rate": 8.129485871969402e-05, + "loss": 0.0041, + "step": 2306 + }, + { + "epoch": 1.7823904228615564, + "grad_norm": 0.03095209412276745, + "learning_rate": 8.127381564931726e-05, + "loss": 0.0047, + "step": 2307 + }, + { + "epoch": 1.7831627727360493, + "grad_norm": 0.020441990345716476, + "learning_rate": 8.125276347588847e-05, + "loss": 0.0041, + "step": 2308 + }, + { + "epoch": 1.7839351226105427, + "grad_norm": 0.014079474844038486, + "learning_rate": 8.123170220553537e-05, + "loss": 0.0042, + "step": 2309 + }, + { + "epoch": 1.7847074724850356, + "grad_norm": 0.03220905736088753, + "learning_rate": 8.121063184438845e-05, + "loss": 0.0043, + "step": 2310 + }, + { + "epoch": 1.785479822359529, + "grad_norm": 0.018410203978419304, + "learning_rate": 8.118955239858072e-05, + "loss": 0.0047, + "step": 2311 + }, + { + "epoch": 1.7862521722340219, + "grad_norm": 0.015284133143723011, + "learning_rate": 8.116846387424794e-05, + "loss": 0.0041, + "step": 2312 + }, + { + "epoch": 1.7870245221085153, + "grad_norm": 0.024698738008737564, + "learning_rate": 8.114736627752846e-05, + "loss": 0.004, + "step": 2313 + }, + { + "epoch": 1.7877968719830082, + "grad_norm": 0.016370631754398346, + "learning_rate": 8.112625961456325e-05, + "loss": 0.0048, + "step": 2314 + }, + { + "epoch": 1.7885692218575016, + "grad_norm": 0.017090700566768646, + "learning_rate": 8.1105143891496e-05, + "loss": 0.0042, + "step": 2315 + }, + { + "epoch": 1.7893415717319945, + "grad_norm": 0.019004611298441887, + "learning_rate": 8.108401911447297e-05, + "loss": 0.0044, + "step": 2316 + }, + { + "epoch": 1.7901139216064879, + "grad_norm": 0.026414448395371437, + "learning_rate": 8.106288528964306e-05, + "loss": 0.0049, + "step": 2317 + }, + { + "epoch": 1.7908862714809808, + "grad_norm": 0.01318669319152832, + "learning_rate": 8.104174242315781e-05, + "loss": 0.0042, + "step": 2318 + }, + { + "epoch": 1.7916586213554742, + "grad_norm": 0.023264247924089432, + "learning_rate": 8.102059052117141e-05, + "loss": 0.0045, + "step": 2319 + }, + { + "epoch": 1.792430971229967, + "grad_norm": 0.010670443065464497, + "learning_rate": 8.099942958984068e-05, + "loss": 0.004, + "step": 2320 + }, + { + "epoch": 1.7932033211044605, + "grad_norm": 0.0195294339209795, + "learning_rate": 8.097825963532504e-05, + "loss": 0.0043, + "step": 2321 + }, + { + "epoch": 1.7939756709789534, + "grad_norm": 0.009845489636063576, + "learning_rate": 8.095708066378653e-05, + "loss": 0.0043, + "step": 2322 + }, + { + "epoch": 1.7947480208534468, + "grad_norm": 0.018464913591742516, + "learning_rate": 8.09358926813899e-05, + "loss": 0.0039, + "step": 2323 + }, + { + "epoch": 1.7955203707279397, + "grad_norm": 0.02056654542684555, + "learning_rate": 8.091469569430238e-05, + "loss": 0.0042, + "step": 2324 + }, + { + "epoch": 1.796292720602433, + "grad_norm": 0.010467185638844967, + "learning_rate": 8.089348970869398e-05, + "loss": 0.0043, + "step": 2325 + }, + { + "epoch": 1.797065070476926, + "grad_norm": 0.01489762682467699, + "learning_rate": 8.087227473073719e-05, + "loss": 0.0042, + "step": 2326 + }, + { + "epoch": 1.7978374203514194, + "grad_norm": 0.017772536724805832, + "learning_rate": 8.085105076660722e-05, + "loss": 0.0045, + "step": 2327 + }, + { + "epoch": 1.7986097702259123, + "grad_norm": 0.011074879206717014, + "learning_rate": 8.082981782248182e-05, + "loss": 0.0044, + "step": 2328 + }, + { + "epoch": 1.7993821201004054, + "grad_norm": 0.0100552998483181, + "learning_rate": 8.080857590454138e-05, + "loss": 0.0043, + "step": 2329 + }, + { + "epoch": 1.8001544699748986, + "grad_norm": 0.019595801830291748, + "learning_rate": 8.078732501896896e-05, + "loss": 0.0043, + "step": 2330 + }, + { + "epoch": 1.8009268198493917, + "grad_norm": 0.01229262538254261, + "learning_rate": 8.076606517195013e-05, + "loss": 0.0039, + "step": 2331 + }, + { + "epoch": 1.8016991697238849, + "grad_norm": 0.009469173848628998, + "learning_rate": 8.074479636967314e-05, + "loss": 0.0038, + "step": 2332 + }, + { + "epoch": 1.802471519598378, + "grad_norm": 0.012940244749188423, + "learning_rate": 8.072351861832883e-05, + "loss": 0.0039, + "step": 2333 + }, + { + "epoch": 1.8032438694728712, + "grad_norm": 0.01948258839547634, + "learning_rate": 8.070223192411061e-05, + "loss": 0.0043, + "step": 2334 + }, + { + "epoch": 1.8040162193473643, + "grad_norm": 0.008973821066319942, + "learning_rate": 8.068093629321456e-05, + "loss": 0.004, + "step": 2335 + }, + { + "epoch": 1.8047885692218575, + "grad_norm": 0.009137239307165146, + "learning_rate": 8.065963173183929e-05, + "loss": 0.0038, + "step": 2336 + }, + { + "epoch": 1.8055609190963506, + "grad_norm": 0.020940445363521576, + "learning_rate": 8.063831824618606e-05, + "loss": 0.0045, + "step": 2337 + }, + { + "epoch": 1.8063332689708438, + "grad_norm": 0.01261752936989069, + "learning_rate": 8.061699584245872e-05, + "loss": 0.0047, + "step": 2338 + }, + { + "epoch": 1.807105618845337, + "grad_norm": 0.01585754007101059, + "learning_rate": 8.05956645268637e-05, + "loss": 0.0042, + "step": 2339 + }, + { + "epoch": 1.80787796871983, + "grad_norm": 0.013830906711518764, + "learning_rate": 8.057432430561e-05, + "loss": 0.0046, + "step": 2340 + }, + { + "epoch": 1.8086503185943232, + "grad_norm": 0.009410025551915169, + "learning_rate": 8.05529751849093e-05, + "loss": 0.0044, + "step": 2341 + }, + { + "epoch": 1.8094226684688164, + "grad_norm": 0.01510144118219614, + "learning_rate": 8.053161717097575e-05, + "loss": 0.0044, + "step": 2342 + }, + { + "epoch": 1.8101950183433095, + "grad_norm": 0.012338819913566113, + "learning_rate": 8.05102502700262e-05, + "loss": 0.0041, + "step": 2343 + }, + { + "epoch": 1.8109673682178027, + "grad_norm": 0.016597097739577293, + "learning_rate": 8.048887448828001e-05, + "loss": 0.0045, + "step": 2344 + }, + { + "epoch": 1.8117397180922958, + "grad_norm": 0.009099415503442287, + "learning_rate": 8.046748983195919e-05, + "loss": 0.0044, + "step": 2345 + }, + { + "epoch": 1.812512067966789, + "grad_norm": 0.014252396300435066, + "learning_rate": 8.044609630728826e-05, + "loss": 0.0047, + "step": 2346 + }, + { + "epoch": 1.8132844178412821, + "grad_norm": 0.009875001385807991, + "learning_rate": 8.042469392049436e-05, + "loss": 0.0038, + "step": 2347 + }, + { + "epoch": 1.8140567677157753, + "grad_norm": 0.009954247623682022, + "learning_rate": 8.040328267780724e-05, + "loss": 0.0042, + "step": 2348 + }, + { + "epoch": 1.8148291175902684, + "grad_norm": 0.008104944601655006, + "learning_rate": 8.038186258545916e-05, + "loss": 0.0037, + "step": 2349 + }, + { + "epoch": 1.8156014674647616, + "grad_norm": 0.020304758101701736, + "learning_rate": 8.0360433649685e-05, + "loss": 0.0045, + "step": 2350 + }, + { + "epoch": 1.8163738173392547, + "grad_norm": 0.013075760565698147, + "learning_rate": 8.033899587672222e-05, + "loss": 0.0049, + "step": 2351 + }, + { + "epoch": 1.8171461672137479, + "grad_norm": 0.011701141484081745, + "learning_rate": 8.031754927281084e-05, + "loss": 0.0041, + "step": 2352 + }, + { + "epoch": 1.8179185170882408, + "grad_norm": 0.008222193457186222, + "learning_rate": 8.029609384419341e-05, + "loss": 0.0039, + "step": 2353 + }, + { + "epoch": 1.8186908669627342, + "grad_norm": 0.010191203095018864, + "learning_rate": 8.027462959711512e-05, + "loss": 0.0043, + "step": 2354 + }, + { + "epoch": 1.8194632168372271, + "grad_norm": 0.009672521613538265, + "learning_rate": 8.02531565378237e-05, + "loss": 0.0039, + "step": 2355 + }, + { + "epoch": 1.8202355667117205, + "grad_norm": 0.017960375174880028, + "learning_rate": 8.023167467256942e-05, + "loss": 0.0043, + "step": 2356 + }, + { + "epoch": 1.8210079165862134, + "grad_norm": 0.01434109266847372, + "learning_rate": 8.021018400760514e-05, + "loss": 0.0043, + "step": 2357 + }, + { + "epoch": 1.8217802664607068, + "grad_norm": 0.012434864416718483, + "learning_rate": 8.018868454918627e-05, + "loss": 0.0038, + "step": 2358 + }, + { + "epoch": 1.8225526163351997, + "grad_norm": 0.024516962468624115, + "learning_rate": 8.016717630357076e-05, + "loss": 0.0043, + "step": 2359 + }, + { + "epoch": 1.823324966209693, + "grad_norm": 0.01232131663709879, + "learning_rate": 8.01456592770192e-05, + "loss": 0.0046, + "step": 2360 + }, + { + "epoch": 1.824097316084186, + "grad_norm": 0.013871725648641586, + "learning_rate": 8.012413347579462e-05, + "loss": 0.0041, + "step": 2361 + }, + { + "epoch": 1.8248696659586794, + "grad_norm": 0.024569762870669365, + "learning_rate": 8.010259890616267e-05, + "loss": 0.0045, + "step": 2362 + }, + { + "epoch": 1.8256420158331723, + "grad_norm": 0.01810004748404026, + "learning_rate": 8.008105557439159e-05, + "loss": 0.004, + "step": 2363 + }, + { + "epoch": 1.8264143657076657, + "grad_norm": 0.01343507133424282, + "learning_rate": 8.005950348675205e-05, + "loss": 0.004, + "step": 2364 + }, + { + "epoch": 1.8271867155821586, + "grad_norm": 0.010591656900942326, + "learning_rate": 8.003794264951741e-05, + "loss": 0.0043, + "step": 2365 + }, + { + "epoch": 1.827959065456652, + "grad_norm": 0.010226280428469181, + "learning_rate": 8.001637306896346e-05, + "loss": 0.004, + "step": 2366 + }, + { + "epoch": 1.828731415331145, + "grad_norm": 0.019120151177048683, + "learning_rate": 7.999479475136859e-05, + "loss": 0.0041, + "step": 2367 + }, + { + "epoch": 1.8295037652056383, + "grad_norm": 0.015317712910473347, + "learning_rate": 7.997320770301377e-05, + "loss": 0.0043, + "step": 2368 + }, + { + "epoch": 1.8302761150801312, + "grad_norm": 0.011360458098351955, + "learning_rate": 7.995161193018241e-05, + "loss": 0.0041, + "step": 2369 + }, + { + "epoch": 1.8310484649546246, + "grad_norm": 0.009333595633506775, + "learning_rate": 7.993000743916056e-05, + "loss": 0.0042, + "step": 2370 + }, + { + "epoch": 1.8318208148291175, + "grad_norm": 0.013634170405566692, + "learning_rate": 7.990839423623675e-05, + "loss": 0.0041, + "step": 2371 + }, + { + "epoch": 1.8325931647036109, + "grad_norm": 0.008756657131016254, + "learning_rate": 7.988677232770205e-05, + "loss": 0.0038, + "step": 2372 + }, + { + "epoch": 1.8333655145781038, + "grad_norm": 0.011406585574150085, + "learning_rate": 7.98651417198501e-05, + "loss": 0.0047, + "step": 2373 + }, + { + "epoch": 1.8341378644525972, + "grad_norm": 0.009512819349765778, + "learning_rate": 7.984350241897703e-05, + "loss": 0.0044, + "step": 2374 + }, + { + "epoch": 1.8349102143270901, + "grad_norm": 0.011950364336371422, + "learning_rate": 7.98218544313815e-05, + "loss": 0.0048, + "step": 2375 + }, + { + "epoch": 1.8356825642015833, + "grad_norm": 0.010683958418667316, + "learning_rate": 7.980019776336475e-05, + "loss": 0.0043, + "step": 2376 + }, + { + "epoch": 1.8364549140760764, + "grad_norm": 0.009281586855649948, + "learning_rate": 7.977853242123052e-05, + "loss": 0.0037, + "step": 2377 + }, + { + "epoch": 1.8372272639505696, + "grad_norm": 0.01649283990263939, + "learning_rate": 7.975685841128502e-05, + "loss": 0.0043, + "step": 2378 + }, + { + "epoch": 1.8379996138250627, + "grad_norm": 0.008996492251753807, + "learning_rate": 7.973517573983707e-05, + "loss": 0.004, + "step": 2379 + }, + { + "epoch": 1.8387719636995559, + "grad_norm": 0.013421095907688141, + "learning_rate": 7.971348441319796e-05, + "loss": 0.0041, + "step": 2380 + }, + { + "epoch": 1.839544313574049, + "grad_norm": 0.012858722358942032, + "learning_rate": 7.969178443768151e-05, + "loss": 0.0039, + "step": 2381 + }, + { + "epoch": 1.8403166634485422, + "grad_norm": 0.00814067106693983, + "learning_rate": 7.967007581960407e-05, + "loss": 0.0039, + "step": 2382 + }, + { + "epoch": 1.8410890133230353, + "grad_norm": 0.014434592798352242, + "learning_rate": 7.964835856528446e-05, + "loss": 0.0036, + "step": 2383 + }, + { + "epoch": 1.8418613631975285, + "grad_norm": 0.010793699882924557, + "learning_rate": 7.962663268104408e-05, + "loss": 0.004, + "step": 2384 + }, + { + "epoch": 1.8426337130720216, + "grad_norm": 0.013711008243262768, + "learning_rate": 7.960489817320682e-05, + "loss": 0.0048, + "step": 2385 + }, + { + "epoch": 1.8434060629465148, + "grad_norm": 0.011649169027805328, + "learning_rate": 7.958315504809903e-05, + "loss": 0.0041, + "step": 2386 + }, + { + "epoch": 1.844178412821008, + "grad_norm": 0.026133539155125618, + "learning_rate": 7.956140331204963e-05, + "loss": 0.0042, + "step": 2387 + }, + { + "epoch": 1.844950762695501, + "grad_norm": 0.009728859178721905, + "learning_rate": 7.953964297139004e-05, + "loss": 0.0041, + "step": 2388 + }, + { + "epoch": 1.8457231125699942, + "grad_norm": 0.0270326379686594, + "learning_rate": 7.951787403245414e-05, + "loss": 0.0054, + "step": 2389 + }, + { + "epoch": 1.8464954624444874, + "grad_norm": 0.018707750365138054, + "learning_rate": 7.949609650157836e-05, + "loss": 0.0046, + "step": 2390 + }, + { + "epoch": 1.8472678123189805, + "grad_norm": 0.014537750743329525, + "learning_rate": 7.947431038510162e-05, + "loss": 0.004, + "step": 2391 + }, + { + "epoch": 1.8480401621934737, + "grad_norm": 0.01314216386526823, + "learning_rate": 7.945251568936529e-05, + "loss": 0.0045, + "step": 2392 + }, + { + "epoch": 1.8488125120679668, + "grad_norm": 0.028566807508468628, + "learning_rate": 7.943071242071334e-05, + "loss": 0.0046, + "step": 2393 + }, + { + "epoch": 1.84958486194246, + "grad_norm": 0.01397473644465208, + "learning_rate": 7.940890058549214e-05, + "loss": 0.0043, + "step": 2394 + }, + { + "epoch": 1.8503572118169531, + "grad_norm": 0.035390984266996384, + "learning_rate": 7.93870801900506e-05, + "loss": 0.0044, + "step": 2395 + }, + { + "epoch": 1.8511295616914463, + "grad_norm": 0.02588469907641411, + "learning_rate": 7.936525124074008e-05, + "loss": 0.0047, + "step": 2396 + }, + { + "epoch": 1.8519019115659394, + "grad_norm": 0.013072910718619823, + "learning_rate": 7.93434137439145e-05, + "loss": 0.0042, + "step": 2397 + }, + { + "epoch": 1.8526742614404326, + "grad_norm": 0.024199657142162323, + "learning_rate": 7.93215677059302e-05, + "loss": 0.0044, + "step": 2398 + }, + { + "epoch": 1.8534466113149257, + "grad_norm": 0.03284648805856705, + "learning_rate": 7.929971313314604e-05, + "loss": 0.0045, + "step": 2399 + }, + { + "epoch": 1.8542189611894186, + "grad_norm": 0.020237479358911514, + "learning_rate": 7.927785003192338e-05, + "loss": 0.0041, + "step": 2400 + }, + { + "epoch": 1.854991311063912, + "grad_norm": 0.012655111029744148, + "learning_rate": 7.925597840862602e-05, + "loss": 0.0042, + "step": 2401 + }, + { + "epoch": 1.855763660938405, + "grad_norm": 0.02779683843255043, + "learning_rate": 7.923409826962025e-05, + "loss": 0.0045, + "step": 2402 + }, + { + "epoch": 1.8565360108128983, + "grad_norm": 0.017316104844212532, + "learning_rate": 7.921220962127487e-05, + "loss": 0.0044, + "step": 2403 + }, + { + "epoch": 1.8573083606873912, + "grad_norm": 0.010716917924582958, + "learning_rate": 7.919031246996114e-05, + "loss": 0.0046, + "step": 2404 + }, + { + "epoch": 1.8580807105618846, + "grad_norm": 0.00959822628647089, + "learning_rate": 7.916840682205278e-05, + "loss": 0.0043, + "step": 2405 + }, + { + "epoch": 1.8588530604363775, + "grad_norm": 0.012936657294631004, + "learning_rate": 7.914649268392598e-05, + "loss": 0.0044, + "step": 2406 + }, + { + "epoch": 1.859625410310871, + "grad_norm": 0.02065490558743477, + "learning_rate": 7.912457006195945e-05, + "loss": 0.0045, + "step": 2407 + }, + { + "epoch": 1.8603977601853638, + "grad_norm": 0.01272338256239891, + "learning_rate": 7.910263896253432e-05, + "loss": 0.0039, + "step": 2408 + }, + { + "epoch": 1.8611701100598572, + "grad_norm": 0.010941894724965096, + "learning_rate": 7.908069939203419e-05, + "loss": 0.0045, + "step": 2409 + }, + { + "epoch": 1.8619424599343501, + "grad_norm": 0.009238572791218758, + "learning_rate": 7.905875135684515e-05, + "loss": 0.0041, + "step": 2410 + }, + { + "epoch": 1.8627148098088435, + "grad_norm": 0.017378829419612885, + "learning_rate": 7.903679486335575e-05, + "loss": 0.0041, + "step": 2411 + }, + { + "epoch": 1.8634871596833364, + "grad_norm": 0.019908621907234192, + "learning_rate": 7.901482991795699e-05, + "loss": 0.0052, + "step": 2412 + }, + { + "epoch": 1.8642595095578298, + "grad_norm": 0.011136463843286037, + "learning_rate": 7.899285652704232e-05, + "loss": 0.0037, + "step": 2413 + }, + { + "epoch": 1.8650318594323227, + "grad_norm": 0.033886104822158813, + "learning_rate": 7.897087469700768e-05, + "loss": 0.0047, + "step": 2414 + }, + { + "epoch": 1.8658042093068161, + "grad_norm": 0.024965351447463036, + "learning_rate": 7.894888443425145e-05, + "loss": 0.0043, + "step": 2415 + }, + { + "epoch": 1.866576559181309, + "grad_norm": 0.012115641497075558, + "learning_rate": 7.892688574517447e-05, + "loss": 0.0043, + "step": 2416 + }, + { + "epoch": 1.8673489090558024, + "grad_norm": 0.013908233493566513, + "learning_rate": 7.890487863618e-05, + "loss": 0.0045, + "step": 2417 + }, + { + "epoch": 1.8681212589302953, + "grad_norm": 0.02759517915546894, + "learning_rate": 7.888286311367379e-05, + "loss": 0.0046, + "step": 2418 + }, + { + "epoch": 1.8688936088047887, + "grad_norm": 0.008342528715729713, + "learning_rate": 7.886083918406404e-05, + "loss": 0.0039, + "step": 2419 + }, + { + "epoch": 1.8696659586792816, + "grad_norm": 0.00872607808560133, + "learning_rate": 7.883880685376137e-05, + "loss": 0.0047, + "step": 2420 + }, + { + "epoch": 1.870438308553775, + "grad_norm": 0.013731162063777447, + "learning_rate": 7.881676612917888e-05, + "loss": 0.004, + "step": 2421 + }, + { + "epoch": 1.871210658428268, + "grad_norm": 0.022072460502386093, + "learning_rate": 7.879471701673204e-05, + "loss": 0.0043, + "step": 2422 + }, + { + "epoch": 1.871983008302761, + "grad_norm": 0.010056992992758751, + "learning_rate": 7.877265952283889e-05, + "loss": 0.0042, + "step": 2423 + }, + { + "epoch": 1.8727553581772542, + "grad_norm": 0.009204492904245853, + "learning_rate": 7.875059365391977e-05, + "loss": 0.0042, + "step": 2424 + }, + { + "epoch": 1.8735277080517474, + "grad_norm": 0.00854427833110094, + "learning_rate": 7.872851941639754e-05, + "loss": 0.0042, + "step": 2425 + }, + { + "epoch": 1.8743000579262405, + "grad_norm": 0.01865958608686924, + "learning_rate": 7.87064368166975e-05, + "loss": 0.0034, + "step": 2426 + }, + { + "epoch": 1.8750724078007337, + "grad_norm": 0.011449846439063549, + "learning_rate": 7.868434586124734e-05, + "loss": 0.0042, + "step": 2427 + }, + { + "epoch": 1.8758447576752268, + "grad_norm": 0.017687395215034485, + "learning_rate": 7.866224655647718e-05, + "loss": 0.0041, + "step": 2428 + }, + { + "epoch": 1.87661710754972, + "grad_norm": 0.013626324012875557, + "learning_rate": 7.864013890881963e-05, + "loss": 0.0045, + "step": 2429 + }, + { + "epoch": 1.8773894574242131, + "grad_norm": 0.010985249653458595, + "learning_rate": 7.86180229247097e-05, + "loss": 0.0042, + "step": 2430 + }, + { + "epoch": 1.8781618072987063, + "grad_norm": 0.009432705119252205, + "learning_rate": 7.859589861058479e-05, + "loss": 0.0042, + "step": 2431 + }, + { + "epoch": 1.8789341571731994, + "grad_norm": 0.009560228325426579, + "learning_rate": 7.857376597288476e-05, + "loss": 0.004, + "step": 2432 + }, + { + "epoch": 1.8797065070476926, + "grad_norm": 0.014136698096990585, + "learning_rate": 7.85516250180519e-05, + "loss": 0.0045, + "step": 2433 + }, + { + "epoch": 1.8804788569221857, + "grad_norm": 0.010503970086574554, + "learning_rate": 7.85294757525309e-05, + "loss": 0.0046, + "step": 2434 + }, + { + "epoch": 1.881251206796679, + "grad_norm": 0.017374323680996895, + "learning_rate": 7.850731818276885e-05, + "loss": 0.0052, + "step": 2435 + }, + { + "epoch": 1.882023556671172, + "grad_norm": 0.011501210741698742, + "learning_rate": 7.848515231521531e-05, + "loss": 0.0043, + "step": 2436 + }, + { + "epoch": 1.8827959065456652, + "grad_norm": 0.009206007234752178, + "learning_rate": 7.846297815632224e-05, + "loss": 0.004, + "step": 2437 + }, + { + "epoch": 1.8835682564201583, + "grad_norm": 0.02441769279539585, + "learning_rate": 7.844079571254397e-05, + "loss": 0.0043, + "step": 2438 + }, + { + "epoch": 1.8843406062946515, + "grad_norm": 0.008784429170191288, + "learning_rate": 7.841860499033731e-05, + "loss": 0.0041, + "step": 2439 + }, + { + "epoch": 1.8851129561691446, + "grad_norm": 0.018666021525859833, + "learning_rate": 7.83964059961614e-05, + "loss": 0.0041, + "step": 2440 + }, + { + "epoch": 1.8858853060436378, + "grad_norm": 0.012539715506136417, + "learning_rate": 7.837419873647787e-05, + "loss": 0.0041, + "step": 2441 + }, + { + "epoch": 1.886657655918131, + "grad_norm": 0.008437935262918472, + "learning_rate": 7.835198321775067e-05, + "loss": 0.0041, + "step": 2442 + }, + { + "epoch": 1.887430005792624, + "grad_norm": 0.00943806767463684, + "learning_rate": 7.832975944644626e-05, + "loss": 0.0046, + "step": 2443 + }, + { + "epoch": 1.8882023556671172, + "grad_norm": 0.009006207808852196, + "learning_rate": 7.830752742903341e-05, + "loss": 0.0038, + "step": 2444 + }, + { + "epoch": 1.8889747055416104, + "grad_norm": 0.01270466111600399, + "learning_rate": 7.828528717198331e-05, + "loss": 0.0044, + "step": 2445 + }, + { + "epoch": 1.8897470554161035, + "grad_norm": 0.008993150666356087, + "learning_rate": 7.826303868176961e-05, + "loss": 0.0038, + "step": 2446 + }, + { + "epoch": 1.8905194052905965, + "grad_norm": 0.009910772554576397, + "learning_rate": 7.824078196486823e-05, + "loss": 0.0044, + "step": 2447 + }, + { + "epoch": 1.8912917551650898, + "grad_norm": 0.018001548945903778, + "learning_rate": 7.821851702775765e-05, + "loss": 0.0045, + "step": 2448 + }, + { + "epoch": 1.8920641050395828, + "grad_norm": 0.008888039737939835, + "learning_rate": 7.81962438769186e-05, + "loss": 0.0042, + "step": 2449 + }, + { + "epoch": 1.8928364549140761, + "grad_norm": 0.011056415736675262, + "learning_rate": 7.817396251883426e-05, + "loss": 0.004, + "step": 2450 + }, + { + "epoch": 1.893608804788569, + "grad_norm": 0.01189905870705843, + "learning_rate": 7.815167295999021e-05, + "loss": 0.0038, + "step": 2451 + }, + { + "epoch": 1.8943811546630624, + "grad_norm": 0.012073645368218422, + "learning_rate": 7.81293752068744e-05, + "loss": 0.0042, + "step": 2452 + }, + { + "epoch": 1.8951535045375554, + "grad_norm": 0.02056579291820526, + "learning_rate": 7.810706926597715e-05, + "loss": 0.0045, + "step": 2453 + }, + { + "epoch": 1.8959258544120487, + "grad_norm": 0.013602050952613354, + "learning_rate": 7.808475514379121e-05, + "loss": 0.0047, + "step": 2454 + }, + { + "epoch": 1.8966982042865417, + "grad_norm": 0.020173611119389534, + "learning_rate": 7.806243284681166e-05, + "loss": 0.0041, + "step": 2455 + }, + { + "epoch": 1.897470554161035, + "grad_norm": 0.013334146700799465, + "learning_rate": 7.804010238153598e-05, + "loss": 0.0034, + "step": 2456 + }, + { + "epoch": 1.898242904035528, + "grad_norm": 0.009373542852699757, + "learning_rate": 7.801776375446406e-05, + "loss": 0.0043, + "step": 2457 + }, + { + "epoch": 1.8990152539100214, + "grad_norm": 0.013425251469016075, + "learning_rate": 7.799541697209809e-05, + "loss": 0.0039, + "step": 2458 + }, + { + "epoch": 1.8997876037845143, + "grad_norm": 0.013179768808186054, + "learning_rate": 7.79730620409427e-05, + "loss": 0.0044, + "step": 2459 + }, + { + "epoch": 1.9005599536590077, + "grad_norm": 0.020629245787858963, + "learning_rate": 7.795069896750487e-05, + "loss": 0.004, + "step": 2460 + }, + { + "epoch": 1.9013323035335006, + "grad_norm": 0.01077171042561531, + "learning_rate": 7.792832775829395e-05, + "loss": 0.0038, + "step": 2461 + }, + { + "epoch": 1.902104653407994, + "grad_norm": 0.016820227727293968, + "learning_rate": 7.790594841982166e-05, + "loss": 0.0047, + "step": 2462 + }, + { + "epoch": 1.9028770032824869, + "grad_norm": 0.01245331671088934, + "learning_rate": 7.788356095860208e-05, + "loss": 0.0048, + "step": 2463 + }, + { + "epoch": 1.9036493531569803, + "grad_norm": 0.010155857540667057, + "learning_rate": 7.786116538115166e-05, + "loss": 0.004, + "step": 2464 + }, + { + "epoch": 1.9044217030314732, + "grad_norm": 0.012097825296223164, + "learning_rate": 7.783876169398921e-05, + "loss": 0.0042, + "step": 2465 + }, + { + "epoch": 1.9051940529059666, + "grad_norm": 0.014586256816983223, + "learning_rate": 7.78163499036359e-05, + "loss": 0.0048, + "step": 2466 + }, + { + "epoch": 1.9059664027804595, + "grad_norm": 0.010004458017647266, + "learning_rate": 7.779393001661529e-05, + "loss": 0.0046, + "step": 2467 + }, + { + "epoch": 1.9067387526549529, + "grad_norm": 0.010745279490947723, + "learning_rate": 7.777150203945322e-05, + "loss": 0.0039, + "step": 2468 + }, + { + "epoch": 1.9075111025294458, + "grad_norm": 0.01012917049229145, + "learning_rate": 7.774906597867797e-05, + "loss": 0.0043, + "step": 2469 + }, + { + "epoch": 1.908283452403939, + "grad_norm": 0.01098201610147953, + "learning_rate": 7.772662184082011e-05, + "loss": 0.004, + "step": 2470 + }, + { + "epoch": 1.909055802278432, + "grad_norm": 0.01820765621960163, + "learning_rate": 7.770416963241261e-05, + "loss": 0.0049, + "step": 2471 + }, + { + "epoch": 1.9098281521529252, + "grad_norm": 0.016260072588920593, + "learning_rate": 7.768170935999074e-05, + "loss": 0.0038, + "step": 2472 + }, + { + "epoch": 1.9106005020274184, + "grad_norm": 0.010583130642771721, + "learning_rate": 7.765924103009216e-05, + "loss": 0.0039, + "step": 2473 + }, + { + "epoch": 1.9113728519019115, + "grad_norm": 0.02265016734600067, + "learning_rate": 7.763676464925685e-05, + "loss": 0.0043, + "step": 2474 + }, + { + "epoch": 1.9121452017764047, + "grad_norm": 0.02282298542559147, + "learning_rate": 7.761428022402715e-05, + "loss": 0.0046, + "step": 2475 + }, + { + "epoch": 1.9129175516508978, + "grad_norm": 0.010374841280281544, + "learning_rate": 7.759178776094772e-05, + "loss": 0.0042, + "step": 2476 + }, + { + "epoch": 1.913689901525391, + "grad_norm": 0.027158847078680992, + "learning_rate": 7.756928726656559e-05, + "loss": 0.0049, + "step": 2477 + }, + { + "epoch": 1.9144622513998841, + "grad_norm": 0.017630072310566902, + "learning_rate": 7.754677874743009e-05, + "loss": 0.0044, + "step": 2478 + }, + { + "epoch": 1.9152346012743773, + "grad_norm": 0.015178913250565529, + "learning_rate": 7.75242622100929e-05, + "loss": 0.0043, + "step": 2479 + }, + { + "epoch": 1.9160069511488704, + "grad_norm": 0.01737389713525772, + "learning_rate": 7.750173766110806e-05, + "loss": 0.0048, + "step": 2480 + }, + { + "epoch": 1.9167793010233636, + "grad_norm": 0.028451379388570786, + "learning_rate": 7.747920510703194e-05, + "loss": 0.0047, + "step": 2481 + }, + { + "epoch": 1.9175516508978567, + "grad_norm": 0.0099767055362463, + "learning_rate": 7.745666455442318e-05, + "loss": 0.0039, + "step": 2482 + }, + { + "epoch": 1.9183240007723499, + "grad_norm": 0.023450423032045364, + "learning_rate": 7.743411600984282e-05, + "loss": 0.0043, + "step": 2483 + }, + { + "epoch": 1.919096350646843, + "grad_norm": 0.01755410246551037, + "learning_rate": 7.741155947985419e-05, + "loss": 0.0038, + "step": 2484 + }, + { + "epoch": 1.9198687005213362, + "grad_norm": 0.012542766518890858, + "learning_rate": 7.738899497102291e-05, + "loss": 0.0041, + "step": 2485 + }, + { + "epoch": 1.9206410503958293, + "grad_norm": 0.008834821172058582, + "learning_rate": 7.736642248991705e-05, + "loss": 0.0043, + "step": 2486 + }, + { + "epoch": 1.9214134002703225, + "grad_norm": 0.013980619609355927, + "learning_rate": 7.734384204310685e-05, + "loss": 0.0043, + "step": 2487 + }, + { + "epoch": 1.9221857501448156, + "grad_norm": 0.016002390533685684, + "learning_rate": 7.732125363716494e-05, + "loss": 0.0049, + "step": 2488 + }, + { + "epoch": 1.9229581000193088, + "grad_norm": 0.008405406959354877, + "learning_rate": 7.729865727866626e-05, + "loss": 0.0043, + "step": 2489 + }, + { + "epoch": 1.923730449893802, + "grad_norm": 0.007769963704049587, + "learning_rate": 7.727605297418808e-05, + "loss": 0.0043, + "step": 2490 + }, + { + "epoch": 1.924502799768295, + "grad_norm": 0.013532442972064018, + "learning_rate": 7.725344073030995e-05, + "loss": 0.0045, + "step": 2491 + }, + { + "epoch": 1.9252751496427882, + "grad_norm": 0.01704113557934761, + "learning_rate": 7.723082055361375e-05, + "loss": 0.0041, + "step": 2492 + }, + { + "epoch": 1.9260474995172814, + "grad_norm": 0.01334497332572937, + "learning_rate": 7.720819245068368e-05, + "loss": 0.004, + "step": 2493 + }, + { + "epoch": 1.9268198493917743, + "grad_norm": 0.008304566144943237, + "learning_rate": 7.718555642810623e-05, + "loss": 0.0041, + "step": 2494 + }, + { + "epoch": 1.9275921992662677, + "grad_norm": 0.014789161272346973, + "learning_rate": 7.716291249247018e-05, + "loss": 0.0043, + "step": 2495 + }, + { + "epoch": 1.9283645491407606, + "grad_norm": 0.01813403330743313, + "learning_rate": 7.714026065036666e-05, + "loss": 0.0041, + "step": 2496 + }, + { + "epoch": 1.929136899015254, + "grad_norm": 0.0111536281183362, + "learning_rate": 7.711760090838905e-05, + "loss": 0.0045, + "step": 2497 + }, + { + "epoch": 1.929909248889747, + "grad_norm": 0.01396595872938633, + "learning_rate": 7.709493327313307e-05, + "loss": 0.0048, + "step": 2498 + }, + { + "epoch": 1.9306815987642403, + "grad_norm": 0.015925578773021698, + "learning_rate": 7.707225775119671e-05, + "loss": 0.0043, + "step": 2499 + }, + { + "epoch": 1.9314539486387332, + "grad_norm": 0.008697424083948135, + "learning_rate": 7.704957434918028e-05, + "loss": 0.0042, + "step": 2500 + }, + { + "epoch": 1.9322262985132266, + "grad_norm": 0.0090946638956666, + "learning_rate": 7.702688307368635e-05, + "loss": 0.0042, + "step": 2501 + }, + { + "epoch": 1.9329986483877195, + "grad_norm": 0.01600305549800396, + "learning_rate": 7.700418393131982e-05, + "loss": 0.0039, + "step": 2502 + }, + { + "epoch": 1.9337709982622129, + "grad_norm": 0.015603674575686455, + "learning_rate": 7.698147692868785e-05, + "loss": 0.0041, + "step": 2503 + }, + { + "epoch": 1.9345433481367058, + "grad_norm": 0.010995871387422085, + "learning_rate": 7.695876207239993e-05, + "loss": 0.0042, + "step": 2504 + }, + { + "epoch": 1.9353156980111992, + "grad_norm": 0.011392833665013313, + "learning_rate": 7.693603936906775e-05, + "loss": 0.004, + "step": 2505 + }, + { + "epoch": 1.936088047885692, + "grad_norm": 0.01025738287717104, + "learning_rate": 7.691330882530539e-05, + "loss": 0.0039, + "step": 2506 + }, + { + "epoch": 1.9368603977601855, + "grad_norm": 0.020514898002147675, + "learning_rate": 7.689057044772914e-05, + "loss": 0.004, + "step": 2507 + }, + { + "epoch": 1.9376327476346784, + "grad_norm": 0.011781363748013973, + "learning_rate": 7.686782424295757e-05, + "loss": 0.0042, + "step": 2508 + }, + { + "epoch": 1.9384050975091718, + "grad_norm": 0.026511946693062782, + "learning_rate": 7.68450702176116e-05, + "loss": 0.0049, + "step": 2509 + }, + { + "epoch": 1.9391774473836647, + "grad_norm": 0.010671273805201054, + "learning_rate": 7.682230837831437e-05, + "loss": 0.0044, + "step": 2510 + }, + { + "epoch": 1.939949797258158, + "grad_norm": 0.013548245653510094, + "learning_rate": 7.679953873169125e-05, + "loss": 0.0046, + "step": 2511 + }, + { + "epoch": 1.940722147132651, + "grad_norm": 0.016395317390561104, + "learning_rate": 7.677676128436999e-05, + "loss": 0.0041, + "step": 2512 + }, + { + "epoch": 1.9414944970071444, + "grad_norm": 0.011841686442494392, + "learning_rate": 7.675397604298053e-05, + "loss": 0.0043, + "step": 2513 + }, + { + "epoch": 1.9422668468816373, + "grad_norm": 0.009677517227828503, + "learning_rate": 7.67311830141551e-05, + "loss": 0.0043, + "step": 2514 + }, + { + "epoch": 1.9430391967561307, + "grad_norm": 0.025067778304219246, + "learning_rate": 7.670838220452821e-05, + "loss": 0.0049, + "step": 2515 + }, + { + "epoch": 1.9438115466306236, + "grad_norm": 0.008173921145498753, + "learning_rate": 7.668557362073663e-05, + "loss": 0.0042, + "step": 2516 + }, + { + "epoch": 1.944583896505117, + "grad_norm": 0.011280354112386703, + "learning_rate": 7.666275726941936e-05, + "loss": 0.004, + "step": 2517 + }, + { + "epoch": 1.94535624637961, + "grad_norm": 0.010628647170960903, + "learning_rate": 7.663993315721771e-05, + "loss": 0.0044, + "step": 2518 + }, + { + "epoch": 1.946128596254103, + "grad_norm": 0.009494693949818611, + "learning_rate": 7.661710129077523e-05, + "loss": 0.0042, + "step": 2519 + }, + { + "epoch": 1.9469009461285962, + "grad_norm": 0.009466594085097313, + "learning_rate": 7.659426167673772e-05, + "loss": 0.004, + "step": 2520 + }, + { + "epoch": 1.9476732960030894, + "grad_norm": 0.007926770485937595, + "learning_rate": 7.657141432175323e-05, + "loss": 0.004, + "step": 2521 + }, + { + "epoch": 1.9484456458775825, + "grad_norm": 0.013132256455719471, + "learning_rate": 7.654855923247208e-05, + "loss": 0.0041, + "step": 2522 + }, + { + "epoch": 1.9492179957520757, + "grad_norm": 0.01026675384491682, + "learning_rate": 7.652569641554687e-05, + "loss": 0.0037, + "step": 2523 + }, + { + "epoch": 1.9499903456265688, + "grad_norm": 0.009410426020622253, + "learning_rate": 7.650282587763236e-05, + "loss": 0.004, + "step": 2524 + }, + { + "epoch": 1.950762695501062, + "grad_norm": 0.010991397313773632, + "learning_rate": 7.64799476253856e-05, + "loss": 0.0048, + "step": 2525 + }, + { + "epoch": 1.951535045375555, + "grad_norm": 0.011427038349211216, + "learning_rate": 7.645706166546596e-05, + "loss": 0.0038, + "step": 2526 + }, + { + "epoch": 1.9523073952500483, + "grad_norm": 0.009793714620172977, + "learning_rate": 7.643416800453495e-05, + "loss": 0.0037, + "step": 2527 + }, + { + "epoch": 1.9530797451245414, + "grad_norm": 0.013073851354420185, + "learning_rate": 7.641126664925637e-05, + "loss": 0.0046, + "step": 2528 + }, + { + "epoch": 1.9538520949990346, + "grad_norm": 0.013127139769494534, + "learning_rate": 7.638835760629626e-05, + "loss": 0.0038, + "step": 2529 + }, + { + "epoch": 1.9546244448735277, + "grad_norm": 0.01772155798971653, + "learning_rate": 7.636544088232284e-05, + "loss": 0.0055, + "step": 2530 + }, + { + "epoch": 1.9553967947480209, + "grad_norm": 0.015408862382173538, + "learning_rate": 7.63425164840067e-05, + "loss": 0.0045, + "step": 2531 + }, + { + "epoch": 1.956169144622514, + "grad_norm": 0.012125047855079174, + "learning_rate": 7.63195844180205e-05, + "loss": 0.004, + "step": 2532 + }, + { + "epoch": 1.9569414944970072, + "grad_norm": 0.015412149019539356, + "learning_rate": 7.629664469103926e-05, + "loss": 0.0035, + "step": 2533 + }, + { + "epoch": 1.9577138443715003, + "grad_norm": 0.008662154898047447, + "learning_rate": 7.627369730974016e-05, + "loss": 0.0041, + "step": 2534 + }, + { + "epoch": 1.9584861942459935, + "grad_norm": 0.008716880343854427, + "learning_rate": 7.625074228080262e-05, + "loss": 0.0042, + "step": 2535 + }, + { + "epoch": 1.9592585441204866, + "grad_norm": 0.01857982575893402, + "learning_rate": 7.62277796109083e-05, + "loss": 0.0039, + "step": 2536 + }, + { + "epoch": 1.9600308939949798, + "grad_norm": 0.015294843353331089, + "learning_rate": 7.62048093067411e-05, + "loss": 0.0039, + "step": 2537 + }, + { + "epoch": 1.960803243869473, + "grad_norm": 0.009465827606618404, + "learning_rate": 7.618183137498709e-05, + "loss": 0.0041, + "step": 2538 + }, + { + "epoch": 1.961575593743966, + "grad_norm": 0.01294454000890255, + "learning_rate": 7.615884582233461e-05, + "loss": 0.004, + "step": 2539 + }, + { + "epoch": 1.9623479436184592, + "grad_norm": 0.008212809450924397, + "learning_rate": 7.613585265547418e-05, + "loss": 0.0038, + "step": 2540 + }, + { + "epoch": 1.9631202934929524, + "grad_norm": 0.0148194320499897, + "learning_rate": 7.611285188109859e-05, + "loss": 0.0041, + "step": 2541 + }, + { + "epoch": 1.9638926433674455, + "grad_norm": 0.010401822626590729, + "learning_rate": 7.608984350590278e-05, + "loss": 0.0043, + "step": 2542 + }, + { + "epoch": 1.9646649932419384, + "grad_norm": 0.014921027235686779, + "learning_rate": 7.606682753658394e-05, + "loss": 0.0046, + "step": 2543 + }, + { + "epoch": 1.9654373431164318, + "grad_norm": 0.010640044696629047, + "learning_rate": 7.604380397984146e-05, + "loss": 0.0046, + "step": 2544 + }, + { + "epoch": 1.9662096929909247, + "grad_norm": 0.010940579697489738, + "learning_rate": 7.602077284237693e-05, + "loss": 0.0044, + "step": 2545 + }, + { + "epoch": 1.9669820428654181, + "grad_norm": 0.007183860521763563, + "learning_rate": 7.599773413089419e-05, + "loss": 0.004, + "step": 2546 + }, + { + "epoch": 1.967754392739911, + "grad_norm": 0.010012193582952023, + "learning_rate": 7.597468785209924e-05, + "loss": 0.0042, + "step": 2547 + }, + { + "epoch": 1.9685267426144044, + "grad_norm": 0.009017789736390114, + "learning_rate": 7.595163401270028e-05, + "loss": 0.0038, + "step": 2548 + }, + { + "epoch": 1.9692990924888973, + "grad_norm": 0.009588037617504597, + "learning_rate": 7.592857261940774e-05, + "loss": 0.0043, + "step": 2549 + }, + { + "epoch": 1.9700714423633907, + "grad_norm": 0.009811597876250744, + "learning_rate": 7.590550367893421e-05, + "loss": 0.0044, + "step": 2550 + }, + { + "epoch": 1.9708437922378836, + "grad_norm": 0.015444842167198658, + "learning_rate": 7.588242719799452e-05, + "loss": 0.0039, + "step": 2551 + }, + { + "epoch": 1.971616142112377, + "grad_norm": 0.01210468914359808, + "learning_rate": 7.585934318330569e-05, + "loss": 0.0039, + "step": 2552 + }, + { + "epoch": 1.97238849198687, + "grad_norm": 0.01698988489806652, + "learning_rate": 7.583625164158689e-05, + "loss": 0.0047, + "step": 2553 + }, + { + "epoch": 1.9731608418613633, + "grad_norm": 0.01659640483558178, + "learning_rate": 7.581315257955954e-05, + "loss": 0.0044, + "step": 2554 + }, + { + "epoch": 1.9739331917358562, + "grad_norm": 0.00832913164049387, + "learning_rate": 7.57900460039472e-05, + "loss": 0.0045, + "step": 2555 + }, + { + "epoch": 1.9747055416103496, + "grad_norm": 0.014485886320471764, + "learning_rate": 7.576693192147564e-05, + "loss": 0.0041, + "step": 2556 + }, + { + "epoch": 1.9754778914848425, + "grad_norm": 0.010839599184691906, + "learning_rate": 7.57438103388728e-05, + "loss": 0.0047, + "step": 2557 + }, + { + "epoch": 1.976250241359336, + "grad_norm": 0.011815196834504604, + "learning_rate": 7.572068126286883e-05, + "loss": 0.0046, + "step": 2558 + }, + { + "epoch": 1.9770225912338288, + "grad_norm": 0.011010092683136463, + "learning_rate": 7.569754470019603e-05, + "loss": 0.0038, + "step": 2559 + }, + { + "epoch": 1.9777949411083222, + "grad_norm": 0.008888042531907558, + "learning_rate": 7.56744006575889e-05, + "loss": 0.004, + "step": 2560 + }, + { + "epoch": 1.9785672909828151, + "grad_norm": 0.02021145448088646, + "learning_rate": 7.565124914178415e-05, + "loss": 0.0043, + "step": 2561 + }, + { + "epoch": 1.9793396408573085, + "grad_norm": 0.0163202416151762, + "learning_rate": 7.562809015952054e-05, + "loss": 0.0042, + "step": 2562 + }, + { + "epoch": 1.9801119907318014, + "grad_norm": 0.01251328643411398, + "learning_rate": 7.560492371753918e-05, + "loss": 0.0047, + "step": 2563 + }, + { + "epoch": 1.9808843406062948, + "grad_norm": 0.008392451331019402, + "learning_rate": 7.558174982258321e-05, + "loss": 0.0035, + "step": 2564 + }, + { + "epoch": 1.9816566904807877, + "grad_norm": 0.012348789721727371, + "learning_rate": 7.555856848139801e-05, + "loss": 0.0047, + "step": 2565 + }, + { + "epoch": 1.982429040355281, + "grad_norm": 0.014394666068255901, + "learning_rate": 7.55353797007311e-05, + "loss": 0.0046, + "step": 2566 + }, + { + "epoch": 1.983201390229774, + "grad_norm": 0.008999601006507874, + "learning_rate": 7.551218348733217e-05, + "loss": 0.0039, + "step": 2567 + }, + { + "epoch": 1.9839737401042672, + "grad_norm": 0.023641925305128098, + "learning_rate": 7.54889798479531e-05, + "loss": 0.0037, + "step": 2568 + }, + { + "epoch": 1.9847460899787603, + "grad_norm": 0.009869282133877277, + "learning_rate": 7.546576878934788e-05, + "loss": 0.0046, + "step": 2569 + }, + { + "epoch": 1.9855184398532535, + "grad_norm": 0.01192814763635397, + "learning_rate": 7.544255031827268e-05, + "loss": 0.0041, + "step": 2570 + }, + { + "epoch": 1.9862907897277466, + "grad_norm": 0.02921842224895954, + "learning_rate": 7.541932444148589e-05, + "loss": 0.0045, + "step": 2571 + }, + { + "epoch": 1.9870631396022398, + "grad_norm": 0.009306002408266068, + "learning_rate": 7.539609116574795e-05, + "loss": 0.0044, + "step": 2572 + }, + { + "epoch": 1.987835489476733, + "grad_norm": 0.009245769120752811, + "learning_rate": 7.537285049782153e-05, + "loss": 0.0045, + "step": 2573 + }, + { + "epoch": 1.988607839351226, + "grad_norm": 0.018734237179160118, + "learning_rate": 7.534960244447141e-05, + "loss": 0.0042, + "step": 2574 + }, + { + "epoch": 1.9893801892257192, + "grad_norm": 0.020502427592873573, + "learning_rate": 7.532634701246454e-05, + "loss": 0.0044, + "step": 2575 + }, + { + "epoch": 1.9901525391002124, + "grad_norm": 0.014067620038986206, + "learning_rate": 7.530308420857004e-05, + "loss": 0.0043, + "step": 2576 + }, + { + "epoch": 1.9909248889747055, + "grad_norm": 0.04353853315114975, + "learning_rate": 7.527981403955913e-05, + "loss": 0.005, + "step": 2577 + }, + { + "epoch": 1.9916972388491987, + "grad_norm": 0.022540874779224396, + "learning_rate": 7.525653651220519e-05, + "loss": 0.0045, + "step": 2578 + }, + { + "epoch": 1.9924695887236918, + "grad_norm": 0.017821505665779114, + "learning_rate": 7.523325163328375e-05, + "loss": 0.0044, + "step": 2579 + }, + { + "epoch": 1.993241938598185, + "grad_norm": 0.026386121287941933, + "learning_rate": 7.520995940957248e-05, + "loss": 0.0044, + "step": 2580 + }, + { + "epoch": 1.9940142884726781, + "grad_norm": 0.03731666877865791, + "learning_rate": 7.518665984785119e-05, + "loss": 0.0047, + "step": 2581 + }, + { + "epoch": 1.9947866383471713, + "grad_norm": 0.016136379912495613, + "learning_rate": 7.516335295490178e-05, + "loss": 0.0047, + "step": 2582 + }, + { + "epoch": 1.9955589882216644, + "grad_norm": 0.00737913278862834, + "learning_rate": 7.514003873750836e-05, + "loss": 0.0037, + "step": 2583 + }, + { + "epoch": 1.9963313380961576, + "grad_norm": 0.027570966631174088, + "learning_rate": 7.511671720245715e-05, + "loss": 0.0037, + "step": 2584 + }, + { + "epoch": 1.9971036879706507, + "grad_norm": 0.03167342394590378, + "learning_rate": 7.509338835653643e-05, + "loss": 0.0036, + "step": 2585 + }, + { + "epoch": 1.997876037845144, + "grad_norm": 0.009573575109243393, + "learning_rate": 7.507005220653673e-05, + "loss": 0.0037, + "step": 2586 + }, + { + "epoch": 1.998648387719637, + "grad_norm": 0.014849287457764149, + "learning_rate": 7.504670875925058e-05, + "loss": 0.0046, + "step": 2587 + }, + { + "epoch": 1.9994207375941302, + "grad_norm": 0.02890627458691597, + "learning_rate": 7.502335802147273e-05, + "loss": 0.0042, + "step": 2588 + }, + { + "epoch": 2.000772349874493, + "grad_norm": 0.038108453154563904, + "learning_rate": 7.500000000000001e-05, + "loss": 0.0079, + "step": 2589 + }, + { + "epoch": 2.0015446997489863, + "grad_norm": 0.006743496749550104, + "learning_rate": 7.497663470163135e-05, + "loss": 0.0034, + "step": 2590 + }, + { + "epoch": 2.0023170496234792, + "grad_norm": 0.026532011106610298, + "learning_rate": 7.495326213316787e-05, + "loss": 0.0042, + "step": 2591 + }, + { + "epoch": 2.0030893994979726, + "grad_norm": 0.03554273024201393, + "learning_rate": 7.492988230141272e-05, + "loss": 0.005, + "step": 2592 + }, + { + "epoch": 2.0038617493724655, + "grad_norm": 0.013866342604160309, + "learning_rate": 7.490649521317121e-05, + "loss": 0.004, + "step": 2593 + }, + { + "epoch": 2.004634099246959, + "grad_norm": 0.015046735294163227, + "learning_rate": 7.488310087525079e-05, + "loss": 0.0038, + "step": 2594 + }, + { + "epoch": 2.005406449121452, + "grad_norm": 0.027027055621147156, + "learning_rate": 7.485969929446094e-05, + "loss": 0.0042, + "step": 2595 + }, + { + "epoch": 2.006178798995945, + "grad_norm": 0.01871171034872532, + "learning_rate": 7.483629047761333e-05, + "loss": 0.0038, + "step": 2596 + }, + { + "epoch": 2.006951148870438, + "grad_norm": 0.010589679703116417, + "learning_rate": 7.481287443152167e-05, + "loss": 0.0035, + "step": 2597 + }, + { + "epoch": 2.0077234987449315, + "grad_norm": 0.016800161451101303, + "learning_rate": 7.478945116300183e-05, + "loss": 0.0041, + "step": 2598 + }, + { + "epoch": 2.0084958486194244, + "grad_norm": 0.009656563401222229, + "learning_rate": 7.476602067887178e-05, + "loss": 0.004, + "step": 2599 + }, + { + "epoch": 2.009268198493918, + "grad_norm": 0.032102540135383606, + "learning_rate": 7.474258298595148e-05, + "loss": 0.004, + "step": 2600 + }, + { + "epoch": 2.0100405483684107, + "grad_norm": 0.012074053287506104, + "learning_rate": 7.471913809106316e-05, + "loss": 0.004, + "step": 2601 + }, + { + "epoch": 2.010812898242904, + "grad_norm": 0.016124553978443146, + "learning_rate": 7.469568600103103e-05, + "loss": 0.0044, + "step": 2602 + }, + { + "epoch": 2.011585248117397, + "grad_norm": 0.02067081443965435, + "learning_rate": 7.467222672268146e-05, + "loss": 0.0038, + "step": 2603 + }, + { + "epoch": 2.0123575979918904, + "grad_norm": 0.038172945380210876, + "learning_rate": 7.464876026284281e-05, + "loss": 0.0038, + "step": 2604 + }, + { + "epoch": 2.0131299478663833, + "grad_norm": 0.012773305177688599, + "learning_rate": 7.462528662834568e-05, + "loss": 0.0042, + "step": 2605 + }, + { + "epoch": 2.0139022977408767, + "grad_norm": 0.03118148073554039, + "learning_rate": 7.460180582602262e-05, + "loss": 0.0044, + "step": 2606 + }, + { + "epoch": 2.0146746476153696, + "grad_norm": 0.02907433547079563, + "learning_rate": 7.457831786270834e-05, + "loss": 0.0039, + "step": 2607 + }, + { + "epoch": 2.015446997489863, + "grad_norm": 0.008807958103716373, + "learning_rate": 7.455482274523963e-05, + "loss": 0.0038, + "step": 2608 + }, + { + "epoch": 2.016219347364356, + "grad_norm": 0.01534703653305769, + "learning_rate": 7.453132048045532e-05, + "loss": 0.0041, + "step": 2609 + }, + { + "epoch": 2.0169916972388493, + "grad_norm": 0.016446243971586227, + "learning_rate": 7.45078110751964e-05, + "loss": 0.0045, + "step": 2610 + }, + { + "epoch": 2.0177640471133422, + "grad_norm": 0.0146534014493227, + "learning_rate": 7.448429453630585e-05, + "loss": 0.0043, + "step": 2611 + }, + { + "epoch": 2.0185363969878356, + "grad_norm": 0.011785686947405338, + "learning_rate": 7.446077087062879e-05, + "loss": 0.0038, + "step": 2612 + }, + { + "epoch": 2.0193087468623285, + "grad_norm": 0.01915571838617325, + "learning_rate": 7.443724008501237e-05, + "loss": 0.0048, + "step": 2613 + }, + { + "epoch": 2.020081096736822, + "grad_norm": 0.015387355349957943, + "learning_rate": 7.441370218630585e-05, + "loss": 0.0039, + "step": 2614 + }, + { + "epoch": 2.020853446611315, + "grad_norm": 0.009541081264615059, + "learning_rate": 7.439015718136055e-05, + "loss": 0.0038, + "step": 2615 + }, + { + "epoch": 2.021625796485808, + "grad_norm": 0.014201180078089237, + "learning_rate": 7.436660507702982e-05, + "loss": 0.0037, + "step": 2616 + }, + { + "epoch": 2.022398146360301, + "grad_norm": 0.010206950828433037, + "learning_rate": 7.434304588016912e-05, + "loss": 0.0038, + "step": 2617 + }, + { + "epoch": 2.0231704962347945, + "grad_norm": 0.020223252475261688, + "learning_rate": 7.431947959763598e-05, + "loss": 0.0041, + "step": 2618 + }, + { + "epoch": 2.0239428461092874, + "grad_norm": 0.010396569967269897, + "learning_rate": 7.429590623628998e-05, + "loss": 0.0044, + "step": 2619 + }, + { + "epoch": 2.024715195983781, + "grad_norm": 0.009901518002152443, + "learning_rate": 7.427232580299272e-05, + "loss": 0.0039, + "step": 2620 + }, + { + "epoch": 2.0254875458582737, + "grad_norm": 0.032874803990125656, + "learning_rate": 7.424873830460793e-05, + "loss": 0.0041, + "step": 2621 + }, + { + "epoch": 2.026259895732767, + "grad_norm": 0.013373463414609432, + "learning_rate": 7.422514374800135e-05, + "loss": 0.0035, + "step": 2622 + }, + { + "epoch": 2.02703224560726, + "grad_norm": 0.011829612776637077, + "learning_rate": 7.420154214004078e-05, + "loss": 0.0036, + "step": 2623 + }, + { + "epoch": 2.0278045954817534, + "grad_norm": 0.02061588130891323, + "learning_rate": 7.417793348759608e-05, + "loss": 0.0039, + "step": 2624 + }, + { + "epoch": 2.0285769453562463, + "grad_norm": 0.017115185037255287, + "learning_rate": 7.415431779753915e-05, + "loss": 0.0036, + "step": 2625 + }, + { + "epoch": 2.0293492952307397, + "grad_norm": 0.009569639340043068, + "learning_rate": 7.413069507674396e-05, + "loss": 0.0041, + "step": 2626 + }, + { + "epoch": 2.0301216451052326, + "grad_norm": 0.019505271688103676, + "learning_rate": 7.410706533208652e-05, + "loss": 0.0035, + "step": 2627 + }, + { + "epoch": 2.030893994979726, + "grad_norm": 0.01109884213656187, + "learning_rate": 7.408342857044484e-05, + "loss": 0.0036, + "step": 2628 + }, + { + "epoch": 2.031666344854219, + "grad_norm": 0.012622921727597713, + "learning_rate": 7.405978479869907e-05, + "loss": 0.0041, + "step": 2629 + }, + { + "epoch": 2.0324386947287123, + "grad_norm": 0.012068958953022957, + "learning_rate": 7.403613402373126e-05, + "loss": 0.004, + "step": 2630 + }, + { + "epoch": 2.0332110446032052, + "grad_norm": 0.010394621640443802, + "learning_rate": 7.401247625242566e-05, + "loss": 0.0037, + "step": 2631 + }, + { + "epoch": 2.0339833944776986, + "grad_norm": 0.013484945520758629, + "learning_rate": 7.398881149166846e-05, + "loss": 0.004, + "step": 2632 + }, + { + "epoch": 2.0347557443521915, + "grad_norm": 0.014950394630432129, + "learning_rate": 7.396513974834784e-05, + "loss": 0.0041, + "step": 2633 + }, + { + "epoch": 2.035528094226685, + "grad_norm": 0.011799067258834839, + "learning_rate": 7.394146102935414e-05, + "loss": 0.0035, + "step": 2634 + }, + { + "epoch": 2.036300444101178, + "grad_norm": 0.019200555980205536, + "learning_rate": 7.391777534157963e-05, + "loss": 0.004, + "step": 2635 + }, + { + "epoch": 2.037072793975671, + "grad_norm": 0.0084306038916111, + "learning_rate": 7.389408269191864e-05, + "loss": 0.0038, + "step": 2636 + }, + { + "epoch": 2.037845143850164, + "grad_norm": 0.007933447137475014, + "learning_rate": 7.387038308726755e-05, + "loss": 0.0036, + "step": 2637 + }, + { + "epoch": 2.038617493724657, + "grad_norm": 0.013884244486689568, + "learning_rate": 7.384667653452472e-05, + "loss": 0.004, + "step": 2638 + }, + { + "epoch": 2.0393898435991504, + "grad_norm": 0.013365771621465683, + "learning_rate": 7.382296304059055e-05, + "loss": 0.0039, + "step": 2639 + }, + { + "epoch": 2.0401621934736434, + "grad_norm": 0.017005208879709244, + "learning_rate": 7.379924261236751e-05, + "loss": 0.0035, + "step": 2640 + }, + { + "epoch": 2.0409345433481367, + "grad_norm": 0.016478469595313072, + "learning_rate": 7.377551525675999e-05, + "loss": 0.0038, + "step": 2641 + }, + { + "epoch": 2.0417068932226297, + "grad_norm": 0.010146384127438068, + "learning_rate": 7.375178098067448e-05, + "loss": 0.004, + "step": 2642 + }, + { + "epoch": 2.042479243097123, + "grad_norm": 0.012967503629624844, + "learning_rate": 7.372803979101945e-05, + "loss": 0.0035, + "step": 2643 + }, + { + "epoch": 2.043251592971616, + "grad_norm": 0.02204027958214283, + "learning_rate": 7.370429169470536e-05, + "loss": 0.004, + "step": 2644 + }, + { + "epoch": 2.0440239428461093, + "grad_norm": 0.009851394221186638, + "learning_rate": 7.368053669864475e-05, + "loss": 0.0037, + "step": 2645 + }, + { + "epoch": 2.0447962927206023, + "grad_norm": 0.01107101235538721, + "learning_rate": 7.365677480975211e-05, + "loss": 0.0038, + "step": 2646 + }, + { + "epoch": 2.0455686425950956, + "grad_norm": 0.011317633092403412, + "learning_rate": 7.363300603494393e-05, + "loss": 0.0039, + "step": 2647 + }, + { + "epoch": 2.0463409924695886, + "grad_norm": 0.008429724723100662, + "learning_rate": 7.360923038113876e-05, + "loss": 0.0039, + "step": 2648 + }, + { + "epoch": 2.047113342344082, + "grad_norm": 0.009489733725786209, + "learning_rate": 7.358544785525708e-05, + "loss": 0.0038, + "step": 2649 + }, + { + "epoch": 2.047885692218575, + "grad_norm": 0.009628896601498127, + "learning_rate": 7.356165846422144e-05, + "loss": 0.0038, + "step": 2650 + }, + { + "epoch": 2.0486580420930682, + "grad_norm": 0.008734790608286858, + "learning_rate": 7.353786221495636e-05, + "loss": 0.004, + "step": 2651 + }, + { + "epoch": 2.049430391967561, + "grad_norm": 0.010296465829014778, + "learning_rate": 7.351405911438833e-05, + "loss": 0.0039, + "step": 2652 + }, + { + "epoch": 2.0502027418420545, + "grad_norm": 0.016222462058067322, + "learning_rate": 7.349024916944586e-05, + "loss": 0.0037, + "step": 2653 + }, + { + "epoch": 2.0509750917165475, + "grad_norm": 0.010780733078718185, + "learning_rate": 7.346643238705946e-05, + "loss": 0.0042, + "step": 2654 + }, + { + "epoch": 2.051747441591041, + "grad_norm": 0.023083990439772606, + "learning_rate": 7.344260877416161e-05, + "loss": 0.0039, + "step": 2655 + }, + { + "epoch": 2.0525197914655338, + "grad_norm": 0.018788602203130722, + "learning_rate": 7.341877833768682e-05, + "loss": 0.0041, + "step": 2656 + }, + { + "epoch": 2.053292141340027, + "grad_norm": 0.01248422171920538, + "learning_rate": 7.33949410845715e-05, + "loss": 0.0042, + "step": 2657 + }, + { + "epoch": 2.05406449121452, + "grad_norm": 0.01570403017103672, + "learning_rate": 7.337109702175413e-05, + "loss": 0.0042, + "step": 2658 + }, + { + "epoch": 2.0548368410890134, + "grad_norm": 0.02470230497419834, + "learning_rate": 7.334724615617517e-05, + "loss": 0.0037, + "step": 2659 + }, + { + "epoch": 2.0556091909635064, + "grad_norm": 0.022730203345417976, + "learning_rate": 7.332338849477696e-05, + "loss": 0.004, + "step": 2660 + }, + { + "epoch": 2.0563815408379997, + "grad_norm": 0.008514747023582458, + "learning_rate": 7.329952404450395e-05, + "loss": 0.0036, + "step": 2661 + }, + { + "epoch": 2.0571538907124927, + "grad_norm": 0.02491837926208973, + "learning_rate": 7.327565281230247e-05, + "loss": 0.0038, + "step": 2662 + }, + { + "epoch": 2.057926240586986, + "grad_norm": 0.03023313358426094, + "learning_rate": 7.325177480512087e-05, + "loss": 0.0044, + "step": 2663 + }, + { + "epoch": 2.058698590461479, + "grad_norm": 0.007855813950300217, + "learning_rate": 7.322789002990948e-05, + "loss": 0.0033, + "step": 2664 + }, + { + "epoch": 2.0594709403359723, + "grad_norm": 0.011223818175494671, + "learning_rate": 7.320399849362055e-05, + "loss": 0.0035, + "step": 2665 + }, + { + "epoch": 2.0602432902104653, + "grad_norm": 0.017690075561404228, + "learning_rate": 7.318010020320833e-05, + "loss": 0.0042, + "step": 2666 + }, + { + "epoch": 2.0610156400849586, + "grad_norm": 0.01451319083571434, + "learning_rate": 7.315619516562908e-05, + "loss": 0.0037, + "step": 2667 + }, + { + "epoch": 2.0617879899594516, + "grad_norm": 0.009698505513370037, + "learning_rate": 7.313228338784091e-05, + "loss": 0.0037, + "step": 2668 + }, + { + "epoch": 2.062560339833945, + "grad_norm": 0.00963345356285572, + "learning_rate": 7.310836487680402e-05, + "loss": 0.0038, + "step": 2669 + }, + { + "epoch": 2.063332689708438, + "grad_norm": 0.022508280351758003, + "learning_rate": 7.308443963948047e-05, + "loss": 0.004, + "step": 2670 + }, + { + "epoch": 2.0641050395829312, + "grad_norm": 0.021821528673171997, + "learning_rate": 7.306050768283434e-05, + "loss": 0.0036, + "step": 2671 + }, + { + "epoch": 2.064877389457424, + "grad_norm": 0.009050424210727215, + "learning_rate": 7.303656901383164e-05, + "loss": 0.0037, + "step": 2672 + }, + { + "epoch": 2.0656497393319175, + "grad_norm": 0.020669065415859222, + "learning_rate": 7.301262363944035e-05, + "loss": 0.0035, + "step": 2673 + }, + { + "epoch": 2.0664220892064105, + "grad_norm": 0.019859908148646355, + "learning_rate": 7.298867156663036e-05, + "loss": 0.0038, + "step": 2674 + }, + { + "epoch": 2.067194439080904, + "grad_norm": 0.009505180642008781, + "learning_rate": 7.296471280237356e-05, + "loss": 0.0036, + "step": 2675 + }, + { + "epoch": 2.0679667889553968, + "grad_norm": 0.012986565008759499, + "learning_rate": 7.294074735364378e-05, + "loss": 0.0043, + "step": 2676 + }, + { + "epoch": 2.06873913882989, + "grad_norm": 0.02292938530445099, + "learning_rate": 7.291677522741676e-05, + "loss": 0.0038, + "step": 2677 + }, + { + "epoch": 2.069511488704383, + "grad_norm": 0.01900501362979412, + "learning_rate": 7.289279643067021e-05, + "loss": 0.0034, + "step": 2678 + }, + { + "epoch": 2.0702838385788764, + "grad_norm": 0.00864870473742485, + "learning_rate": 7.286881097038378e-05, + "loss": 0.0035, + "step": 2679 + }, + { + "epoch": 2.0710561884533694, + "grad_norm": 0.02228534035384655, + "learning_rate": 7.284481885353906e-05, + "loss": 0.0041, + "step": 2680 + }, + { + "epoch": 2.0718285383278627, + "grad_norm": 0.014121807180345058, + "learning_rate": 7.282082008711959e-05, + "loss": 0.0038, + "step": 2681 + }, + { + "epoch": 2.0726008882023557, + "grad_norm": 0.011820808053016663, + "learning_rate": 7.279681467811082e-05, + "loss": 0.0038, + "step": 2682 + }, + { + "epoch": 2.073373238076849, + "grad_norm": 0.007909181527793407, + "learning_rate": 7.277280263350012e-05, + "loss": 0.0035, + "step": 2683 + }, + { + "epoch": 2.074145587951342, + "grad_norm": 0.014459396712481976, + "learning_rate": 7.274878396027685e-05, + "loss": 0.0038, + "step": 2684 + }, + { + "epoch": 2.074917937825835, + "grad_norm": 0.02249385230243206, + "learning_rate": 7.272475866543225e-05, + "loss": 0.0042, + "step": 2685 + }, + { + "epoch": 2.0756902877003283, + "grad_norm": 0.008177523501217365, + "learning_rate": 7.270072675595951e-05, + "loss": 0.0035, + "step": 2686 + }, + { + "epoch": 2.076462637574821, + "grad_norm": 0.010570460930466652, + "learning_rate": 7.267668823885373e-05, + "loss": 0.0038, + "step": 2687 + }, + { + "epoch": 2.0772349874493146, + "grad_norm": 0.01808362826704979, + "learning_rate": 7.265264312111194e-05, + "loss": 0.0037, + "step": 2688 + }, + { + "epoch": 2.0780073373238075, + "grad_norm": 0.008438099175691605, + "learning_rate": 7.26285914097331e-05, + "loss": 0.0039, + "step": 2689 + }, + { + "epoch": 2.078779687198301, + "grad_norm": 0.009861689060926437, + "learning_rate": 7.260453311171809e-05, + "loss": 0.0036, + "step": 2690 + }, + { + "epoch": 2.079552037072794, + "grad_norm": 0.009457158856093884, + "learning_rate": 7.258046823406968e-05, + "loss": 0.0036, + "step": 2691 + }, + { + "epoch": 2.080324386947287, + "grad_norm": 0.012208986096084118, + "learning_rate": 7.25563967837926e-05, + "loss": 0.0038, + "step": 2692 + }, + { + "epoch": 2.08109673682178, + "grad_norm": 0.010008537210524082, + "learning_rate": 7.253231876789343e-05, + "loss": 0.0034, + "step": 2693 + }, + { + "epoch": 2.0818690866962735, + "grad_norm": 0.01255972869694233, + "learning_rate": 7.250823419338073e-05, + "loss": 0.0036, + "step": 2694 + }, + { + "epoch": 2.0826414365707664, + "grad_norm": 0.010346543975174427, + "learning_rate": 7.248414306726492e-05, + "loss": 0.0041, + "step": 2695 + }, + { + "epoch": 2.0834137864452598, + "grad_norm": 0.022845614701509476, + "learning_rate": 7.246004539655836e-05, + "loss": 0.0043, + "step": 2696 + }, + { + "epoch": 2.0841861363197527, + "grad_norm": 0.016264963895082474, + "learning_rate": 7.24359411882753e-05, + "loss": 0.004, + "step": 2697 + }, + { + "epoch": 2.084958486194246, + "grad_norm": 0.01792088896036148, + "learning_rate": 7.241183044943187e-05, + "loss": 0.0034, + "step": 2698 + }, + { + "epoch": 2.085730836068739, + "grad_norm": 0.01296139508485794, + "learning_rate": 7.238771318704615e-05, + "loss": 0.0034, + "step": 2699 + }, + { + "epoch": 2.0865031859432324, + "grad_norm": 0.010842915624380112, + "learning_rate": 7.236358940813807e-05, + "loss": 0.0038, + "step": 2700 + }, + { + "epoch": 2.0872755358177253, + "grad_norm": 0.012911595404148102, + "learning_rate": 7.233945911972948e-05, + "loss": 0.0041, + "step": 2701 + }, + { + "epoch": 2.0880478856922187, + "grad_norm": 0.02002376690506935, + "learning_rate": 7.231532232884417e-05, + "loss": 0.0038, + "step": 2702 + }, + { + "epoch": 2.0888202355667116, + "grad_norm": 0.022102218121290207, + "learning_rate": 7.229117904250771e-05, + "loss": 0.004, + "step": 2703 + }, + { + "epoch": 2.089592585441205, + "grad_norm": 0.008304497227072716, + "learning_rate": 7.226702926774767e-05, + "loss": 0.0039, + "step": 2704 + }, + { + "epoch": 2.090364935315698, + "grad_norm": 0.024397362023591995, + "learning_rate": 7.224287301159345e-05, + "loss": 0.005, + "step": 2705 + }, + { + "epoch": 2.0911372851901913, + "grad_norm": 0.043411292135715485, + "learning_rate": 7.221871028107635e-05, + "loss": 0.0047, + "step": 2706 + }, + { + "epoch": 2.091909635064684, + "grad_norm": 0.010221214033663273, + "learning_rate": 7.219454108322957e-05, + "loss": 0.0041, + "step": 2707 + }, + { + "epoch": 2.0926819849391776, + "grad_norm": 0.020435620099306107, + "learning_rate": 7.217036542508817e-05, + "loss": 0.0042, + "step": 2708 + }, + { + "epoch": 2.0934543348136705, + "grad_norm": 0.0436616912484169, + "learning_rate": 7.21461833136891e-05, + "loss": 0.0041, + "step": 2709 + }, + { + "epoch": 2.094226684688164, + "grad_norm": 0.018153520300984383, + "learning_rate": 7.212199475607119e-05, + "loss": 0.0041, + "step": 2710 + }, + { + "epoch": 2.094999034562657, + "grad_norm": 0.025095542892813683, + "learning_rate": 7.209779975927515e-05, + "loss": 0.0041, + "step": 2711 + }, + { + "epoch": 2.09577138443715, + "grad_norm": 0.03797965496778488, + "learning_rate": 7.207359833034355e-05, + "loss": 0.0043, + "step": 2712 + }, + { + "epoch": 2.096543734311643, + "grad_norm": 0.018812689930200577, + "learning_rate": 7.204939047632085e-05, + "loss": 0.0038, + "step": 2713 + }, + { + "epoch": 2.0973160841861365, + "grad_norm": 0.00882665067911148, + "learning_rate": 7.202517620425335e-05, + "loss": 0.0039, + "step": 2714 + }, + { + "epoch": 2.0980884340606294, + "grad_norm": 0.021644921973347664, + "learning_rate": 7.200095552118927e-05, + "loss": 0.0041, + "step": 2715 + }, + { + "epoch": 2.0988607839351228, + "grad_norm": 0.03702374920248985, + "learning_rate": 7.197672843417865e-05, + "loss": 0.0041, + "step": 2716 + }, + { + "epoch": 2.0996331338096157, + "grad_norm": 0.024662388488650322, + "learning_rate": 7.195249495027343e-05, + "loss": 0.0041, + "step": 2717 + }, + { + "epoch": 2.100405483684109, + "grad_norm": 0.02720011956989765, + "learning_rate": 7.192825507652734e-05, + "loss": 0.0045, + "step": 2718 + }, + { + "epoch": 2.101177833558602, + "grad_norm": 0.027912747114896774, + "learning_rate": 7.190400881999607e-05, + "loss": 0.0038, + "step": 2719 + }, + { + "epoch": 2.1019501834330954, + "grad_norm": 0.03504815697669983, + "learning_rate": 7.18797561877371e-05, + "loss": 0.0039, + "step": 2720 + }, + { + "epoch": 2.1027225333075883, + "grad_norm": 0.010269366204738617, + "learning_rate": 7.18554971868098e-05, + "loss": 0.0035, + "step": 2721 + }, + { + "epoch": 2.1034948831820817, + "grad_norm": 0.02692263200879097, + "learning_rate": 7.183123182427536e-05, + "loss": 0.004, + "step": 2722 + }, + { + "epoch": 2.1042672330565746, + "grad_norm": 0.025207681581377983, + "learning_rate": 7.180696010719683e-05, + "loss": 0.0039, + "step": 2723 + }, + { + "epoch": 2.105039582931068, + "grad_norm": 0.015352983959019184, + "learning_rate": 7.178268204263919e-05, + "loss": 0.0042, + "step": 2724 + }, + { + "epoch": 2.105811932805561, + "grad_norm": 0.009109385311603546, + "learning_rate": 7.175839763766909e-05, + "loss": 0.004, + "step": 2725 + }, + { + "epoch": 2.1065842826800543, + "grad_norm": 0.012240353971719742, + "learning_rate": 7.173410689935521e-05, + "loss": 0.0039, + "step": 2726 + }, + { + "epoch": 2.107356632554547, + "grad_norm": 0.020937219262123108, + "learning_rate": 7.1709809834768e-05, + "loss": 0.0039, + "step": 2727 + }, + { + "epoch": 2.1081289824290406, + "grad_norm": 0.01876830868422985, + "learning_rate": 7.16855064509797e-05, + "loss": 0.0037, + "step": 2728 + }, + { + "epoch": 2.1089013323035335, + "grad_norm": 0.012717272154986858, + "learning_rate": 7.166119675506449e-05, + "loss": 0.0038, + "step": 2729 + }, + { + "epoch": 2.109673682178027, + "grad_norm": 0.007943187840282917, + "learning_rate": 7.163688075409828e-05, + "loss": 0.0037, + "step": 2730 + }, + { + "epoch": 2.11044603205252, + "grad_norm": 0.018614256754517555, + "learning_rate": 7.161255845515891e-05, + "loss": 0.0041, + "step": 2731 + }, + { + "epoch": 2.1112183819270127, + "grad_norm": 0.013146874494850636, + "learning_rate": 7.158822986532601e-05, + "loss": 0.0038, + "step": 2732 + }, + { + "epoch": 2.111990731801506, + "grad_norm": 0.011025556363165379, + "learning_rate": 7.156389499168102e-05, + "loss": 0.0031, + "step": 2733 + }, + { + "epoch": 2.112763081675999, + "grad_norm": 0.01522703468799591, + "learning_rate": 7.153955384130726e-05, + "loss": 0.0037, + "step": 2734 + }, + { + "epoch": 2.1135354315504924, + "grad_norm": 0.01450337190181017, + "learning_rate": 7.151520642128985e-05, + "loss": 0.004, + "step": 2735 + }, + { + "epoch": 2.1143077814249853, + "grad_norm": 0.015826785936951637, + "learning_rate": 7.149085273871572e-05, + "loss": 0.0038, + "step": 2736 + }, + { + "epoch": 2.1150801312994787, + "grad_norm": 0.009047970175743103, + "learning_rate": 7.146649280067365e-05, + "loss": 0.0041, + "step": 2737 + }, + { + "epoch": 2.1158524811739716, + "grad_norm": 0.008611311204731464, + "learning_rate": 7.144212661425422e-05, + "loss": 0.0036, + "step": 2738 + }, + { + "epoch": 2.116624831048465, + "grad_norm": 0.016116051003336906, + "learning_rate": 7.141775418654985e-05, + "loss": 0.0039, + "step": 2739 + }, + { + "epoch": 2.117397180922958, + "grad_norm": 0.024656126275658607, + "learning_rate": 7.139337552465475e-05, + "loss": 0.0049, + "step": 2740 + }, + { + "epoch": 2.1181695307974513, + "grad_norm": 0.015838859602808952, + "learning_rate": 7.136899063566498e-05, + "loss": 0.0039, + "step": 2741 + }, + { + "epoch": 2.1189418806719442, + "grad_norm": 0.025547225028276443, + "learning_rate": 7.134459952667837e-05, + "loss": 0.0041, + "step": 2742 + }, + { + "epoch": 2.1197142305464376, + "grad_norm": 0.030239900574088097, + "learning_rate": 7.132020220479459e-05, + "loss": 0.0038, + "step": 2743 + }, + { + "epoch": 2.1204865804209305, + "grad_norm": 0.011210362426936626, + "learning_rate": 7.129579867711511e-05, + "loss": 0.0043, + "step": 2744 + }, + { + "epoch": 2.121258930295424, + "grad_norm": 0.022186698392033577, + "learning_rate": 7.127138895074322e-05, + "loss": 0.0042, + "step": 2745 + }, + { + "epoch": 2.122031280169917, + "grad_norm": 0.02024666965007782, + "learning_rate": 7.124697303278399e-05, + "loss": 0.0038, + "step": 2746 + }, + { + "epoch": 2.12280363004441, + "grad_norm": 0.012369618751108646, + "learning_rate": 7.12225509303443e-05, + "loss": 0.0042, + "step": 2747 + }, + { + "epoch": 2.123575979918903, + "grad_norm": 0.011102922260761261, + "learning_rate": 7.119812265053286e-05, + "loss": 0.0037, + "step": 2748 + }, + { + "epoch": 2.1243483297933965, + "grad_norm": 0.014041735790669918, + "learning_rate": 7.11736882004601e-05, + "loss": 0.0038, + "step": 2749 + }, + { + "epoch": 2.1251206796678894, + "grad_norm": 0.010623575188219547, + "learning_rate": 7.114924758723833e-05, + "loss": 0.0036, + "step": 2750 + }, + { + "epoch": 2.125893029542383, + "grad_norm": 0.01225269865244627, + "learning_rate": 7.112480081798165e-05, + "loss": 0.0039, + "step": 2751 + }, + { + "epoch": 2.1266653794168757, + "grad_norm": 0.013135985471308231, + "learning_rate": 7.110034789980588e-05, + "loss": 0.0039, + "step": 2752 + }, + { + "epoch": 2.127437729291369, + "grad_norm": 0.00986009743064642, + "learning_rate": 7.107588883982868e-05, + "loss": 0.0034, + "step": 2753 + }, + { + "epoch": 2.128210079165862, + "grad_norm": 0.00858435407280922, + "learning_rate": 7.105142364516952e-05, + "loss": 0.0034, + "step": 2754 + }, + { + "epoch": 2.1289824290403554, + "grad_norm": 0.008631178177893162, + "learning_rate": 7.102695232294958e-05, + "loss": 0.0036, + "step": 2755 + }, + { + "epoch": 2.1297547789148483, + "grad_norm": 0.009610356763005257, + "learning_rate": 7.100247488029192e-05, + "loss": 0.0036, + "step": 2756 + }, + { + "epoch": 2.1305271287893417, + "grad_norm": 0.009580448269844055, + "learning_rate": 7.09779913243213e-05, + "loss": 0.0037, + "step": 2757 + }, + { + "epoch": 2.1312994786638346, + "grad_norm": 0.009013169445097446, + "learning_rate": 7.095350166216431e-05, + "loss": 0.0038, + "step": 2758 + }, + { + "epoch": 2.132071828538328, + "grad_norm": 0.015030079521238804, + "learning_rate": 7.092900590094928e-05, + "loss": 0.0039, + "step": 2759 + }, + { + "epoch": 2.132844178412821, + "grad_norm": 0.011109733954071999, + "learning_rate": 7.090450404780635e-05, + "loss": 0.0037, + "step": 2760 + }, + { + "epoch": 2.1336165282873143, + "grad_norm": 0.009983384981751442, + "learning_rate": 7.087999610986741e-05, + "loss": 0.0036, + "step": 2761 + }, + { + "epoch": 2.1343888781618072, + "grad_norm": 0.01634259894490242, + "learning_rate": 7.085548209426613e-05, + "loss": 0.0043, + "step": 2762 + }, + { + "epoch": 2.1351612280363006, + "grad_norm": 0.009563818573951721, + "learning_rate": 7.083096200813794e-05, + "loss": 0.0038, + "step": 2763 + }, + { + "epoch": 2.1359335779107935, + "grad_norm": 0.011839197017252445, + "learning_rate": 7.080643585862007e-05, + "loss": 0.0043, + "step": 2764 + }, + { + "epoch": 2.136705927785287, + "grad_norm": 0.01338752917945385, + "learning_rate": 7.078190365285147e-05, + "loss": 0.0037, + "step": 2765 + }, + { + "epoch": 2.13747827765978, + "grad_norm": 0.009806690737605095, + "learning_rate": 7.075736539797287e-05, + "loss": 0.0033, + "step": 2766 + }, + { + "epoch": 2.138250627534273, + "grad_norm": 0.01095563918352127, + "learning_rate": 7.073282110112676e-05, + "loss": 0.004, + "step": 2767 + }, + { + "epoch": 2.139022977408766, + "grad_norm": 0.016509409993886948, + "learning_rate": 7.07082707694574e-05, + "loss": 0.0035, + "step": 2768 + }, + { + "epoch": 2.1397953272832595, + "grad_norm": 0.00953350868076086, + "learning_rate": 7.06837144101108e-05, + "loss": 0.0044, + "step": 2769 + }, + { + "epoch": 2.1405676771577524, + "grad_norm": 0.00878667738288641, + "learning_rate": 7.065915203023472e-05, + "loss": 0.0038, + "step": 2770 + }, + { + "epoch": 2.141340027032246, + "grad_norm": 0.01307417917996645, + "learning_rate": 7.063458363697867e-05, + "loss": 0.0043, + "step": 2771 + }, + { + "epoch": 2.1421123769067387, + "grad_norm": 0.0096151577308774, + "learning_rate": 7.061000923749395e-05, + "loss": 0.0039, + "step": 2772 + }, + { + "epoch": 2.142884726781232, + "grad_norm": 0.0076342313550412655, + "learning_rate": 7.058542883893351e-05, + "loss": 0.0035, + "step": 2773 + }, + { + "epoch": 2.143657076655725, + "grad_norm": 0.012591948732733727, + "learning_rate": 7.056084244845216e-05, + "loss": 0.0035, + "step": 2774 + }, + { + "epoch": 2.1444294265302184, + "grad_norm": 0.008794068358838558, + "learning_rate": 7.05362500732064e-05, + "loss": 0.0038, + "step": 2775 + }, + { + "epoch": 2.1452017764047113, + "grad_norm": 0.010124838910996914, + "learning_rate": 7.051165172035444e-05, + "loss": 0.0041, + "step": 2776 + }, + { + "epoch": 2.1459741262792047, + "grad_norm": 0.012167713604867458, + "learning_rate": 7.048704739705632e-05, + "loss": 0.0042, + "step": 2777 + }, + { + "epoch": 2.1467464761536976, + "grad_norm": 0.01282426342368126, + "learning_rate": 7.046243711047372e-05, + "loss": 0.0036, + "step": 2778 + }, + { + "epoch": 2.1475188260281906, + "grad_norm": 0.014842244796454906, + "learning_rate": 7.043782086777011e-05, + "loss": 0.0038, + "step": 2779 + }, + { + "epoch": 2.148291175902684, + "grad_norm": 0.0100597208365798, + "learning_rate": 7.04131986761107e-05, + "loss": 0.0039, + "step": 2780 + }, + { + "epoch": 2.1490635257771773, + "grad_norm": 0.013920299708843231, + "learning_rate": 7.038857054266241e-05, + "loss": 0.0043, + "step": 2781 + }, + { + "epoch": 2.1498358756516702, + "grad_norm": 0.013597175478935242, + "learning_rate": 7.036393647459387e-05, + "loss": 0.0042, + "step": 2782 + }, + { + "epoch": 2.150608225526163, + "grad_norm": 0.009669664315879345, + "learning_rate": 7.03392964790755e-05, + "loss": 0.0037, + "step": 2783 + }, + { + "epoch": 2.1513805754006565, + "grad_norm": 0.013507052324712276, + "learning_rate": 7.031465056327936e-05, + "loss": 0.0034, + "step": 2784 + }, + { + "epoch": 2.1521529252751495, + "grad_norm": 0.00888124294579029, + "learning_rate": 7.028999873437931e-05, + "loss": 0.0037, + "step": 2785 + }, + { + "epoch": 2.152925275149643, + "grad_norm": 0.015375855378806591, + "learning_rate": 7.026534099955094e-05, + "loss": 0.0039, + "step": 2786 + }, + { + "epoch": 2.1536976250241358, + "grad_norm": 0.013118419796228409, + "learning_rate": 7.024067736597145e-05, + "loss": 0.0041, + "step": 2787 + }, + { + "epoch": 2.154469974898629, + "grad_norm": 0.017940763384103775, + "learning_rate": 7.021600784081985e-05, + "loss": 0.0041, + "step": 2788 + }, + { + "epoch": 2.155242324773122, + "grad_norm": 0.016645725816488266, + "learning_rate": 7.019133243127688e-05, + "loss": 0.0041, + "step": 2789 + }, + { + "epoch": 2.1560146746476154, + "grad_norm": 0.010853741317987442, + "learning_rate": 7.016665114452491e-05, + "loss": 0.0032, + "step": 2790 + }, + { + "epoch": 2.1567870245221084, + "grad_norm": 0.015913356095552444, + "learning_rate": 7.014196398774808e-05, + "loss": 0.0042, + "step": 2791 + }, + { + "epoch": 2.1575593743966017, + "grad_norm": 0.014743924140930176, + "learning_rate": 7.011727096813226e-05, + "loss": 0.0032, + "step": 2792 + }, + { + "epoch": 2.1583317242710947, + "grad_norm": 0.007792849093675613, + "learning_rate": 7.009257209286491e-05, + "loss": 0.004, + "step": 2793 + }, + { + "epoch": 2.159104074145588, + "grad_norm": 0.009073421359062195, + "learning_rate": 7.006786736913536e-05, + "loss": 0.0037, + "step": 2794 + }, + { + "epoch": 2.159876424020081, + "grad_norm": 0.011311429552733898, + "learning_rate": 7.00431568041345e-05, + "loss": 0.004, + "step": 2795 + }, + { + "epoch": 2.1606487738945743, + "grad_norm": 0.013861387968063354, + "learning_rate": 7.001844040505501e-05, + "loss": 0.0036, + "step": 2796 + }, + { + "epoch": 2.1614211237690673, + "grad_norm": 0.02032576873898506, + "learning_rate": 6.999371817909124e-05, + "loss": 0.0041, + "step": 2797 + }, + { + "epoch": 2.1621934736435606, + "grad_norm": 0.023006441071629524, + "learning_rate": 6.99689901334392e-05, + "loss": 0.0038, + "step": 2798 + }, + { + "epoch": 2.1629658235180536, + "grad_norm": 0.017457854002714157, + "learning_rate": 6.994425627529666e-05, + "loss": 0.0034, + "step": 2799 + }, + { + "epoch": 2.163738173392547, + "grad_norm": 0.012595701031386852, + "learning_rate": 6.991951661186305e-05, + "loss": 0.0045, + "step": 2800 + }, + { + "epoch": 2.16451052326704, + "grad_norm": 0.010976454243063927, + "learning_rate": 6.989477115033945e-05, + "loss": 0.0036, + "step": 2801 + }, + { + "epoch": 2.1652828731415332, + "grad_norm": 0.009047305211424828, + "learning_rate": 6.987001989792869e-05, + "loss": 0.0035, + "step": 2802 + }, + { + "epoch": 2.166055223016026, + "grad_norm": 0.02184477262198925, + "learning_rate": 6.984526286183528e-05, + "loss": 0.0035, + "step": 2803 + }, + { + "epoch": 2.1668275728905195, + "grad_norm": 0.013075319118797779, + "learning_rate": 6.982050004926537e-05, + "loss": 0.0035, + "step": 2804 + }, + { + "epoch": 2.1675999227650125, + "grad_norm": 0.010313881561160088, + "learning_rate": 6.979573146742682e-05, + "loss": 0.0039, + "step": 2805 + }, + { + "epoch": 2.168372272639506, + "grad_norm": 0.026523558422923088, + "learning_rate": 6.977095712352916e-05, + "loss": 0.0037, + "step": 2806 + }, + { + "epoch": 2.1691446225139988, + "grad_norm": 0.019071469083428383, + "learning_rate": 6.974617702478362e-05, + "loss": 0.0041, + "step": 2807 + }, + { + "epoch": 2.169916972388492, + "grad_norm": 0.010107327252626419, + "learning_rate": 6.972139117840307e-05, + "loss": 0.0038, + "step": 2808 + }, + { + "epoch": 2.170689322262985, + "grad_norm": 0.023329516872763634, + "learning_rate": 6.96965995916021e-05, + "loss": 0.0038, + "step": 2809 + }, + { + "epoch": 2.1714616721374784, + "grad_norm": 0.014047396369278431, + "learning_rate": 6.967180227159691e-05, + "loss": 0.0037, + "step": 2810 + }, + { + "epoch": 2.1722340220119714, + "grad_norm": 0.010431072674691677, + "learning_rate": 6.96469992256054e-05, + "loss": 0.0039, + "step": 2811 + }, + { + "epoch": 2.1730063718864647, + "grad_norm": 0.015764454379677773, + "learning_rate": 6.962219046084717e-05, + "loss": 0.0039, + "step": 2812 + }, + { + "epoch": 2.1737787217609577, + "grad_norm": 0.013384529389441013, + "learning_rate": 6.959737598454342e-05, + "loss": 0.0039, + "step": 2813 + }, + { + "epoch": 2.174551071635451, + "grad_norm": 0.01582462526857853, + "learning_rate": 6.957255580391707e-05, + "loss": 0.0036, + "step": 2814 + }, + { + "epoch": 2.175323421509944, + "grad_norm": 0.008840755559504032, + "learning_rate": 6.954772992619265e-05, + "loss": 0.0038, + "step": 2815 + }, + { + "epoch": 2.1760957713844373, + "grad_norm": 0.01822042465209961, + "learning_rate": 6.952289835859639e-05, + "loss": 0.0035, + "step": 2816 + }, + { + "epoch": 2.1768681212589303, + "grad_norm": 0.02567700669169426, + "learning_rate": 6.949806110835615e-05, + "loss": 0.0035, + "step": 2817 + }, + { + "epoch": 2.1776404711334236, + "grad_norm": 0.018477164208889008, + "learning_rate": 6.947321818270146e-05, + "loss": 0.0037, + "step": 2818 + }, + { + "epoch": 2.1784128210079166, + "grad_norm": 0.009455078281462193, + "learning_rate": 6.944836958886349e-05, + "loss": 0.0037, + "step": 2819 + }, + { + "epoch": 2.17918517088241, + "grad_norm": 0.029945973306894302, + "learning_rate": 6.942351533407507e-05, + "loss": 0.0041, + "step": 2820 + }, + { + "epoch": 2.179957520756903, + "grad_norm": 0.022179346531629562, + "learning_rate": 6.939865542557067e-05, + "loss": 0.0037, + "step": 2821 + }, + { + "epoch": 2.1807298706313962, + "grad_norm": 0.016879554837942123, + "learning_rate": 6.937378987058642e-05, + "loss": 0.0036, + "step": 2822 + }, + { + "epoch": 2.181502220505889, + "grad_norm": 0.015892690047621727, + "learning_rate": 6.934891867636004e-05, + "loss": 0.004, + "step": 2823 + }, + { + "epoch": 2.1822745703803825, + "grad_norm": 0.036032550036907196, + "learning_rate": 6.9324041850131e-05, + "loss": 0.004, + "step": 2824 + }, + { + "epoch": 2.1830469202548755, + "grad_norm": 0.01696144975721836, + "learning_rate": 6.92991593991403e-05, + "loss": 0.0036, + "step": 2825 + }, + { + "epoch": 2.1838192701293684, + "grad_norm": 0.007884988561272621, + "learning_rate": 6.927427133063061e-05, + "loss": 0.0038, + "step": 2826 + }, + { + "epoch": 2.1845916200038618, + "grad_norm": 0.026676487177610397, + "learning_rate": 6.924937765184629e-05, + "loss": 0.0038, + "step": 2827 + }, + { + "epoch": 2.185363969878355, + "grad_norm": 0.019141726195812225, + "learning_rate": 6.922447837003324e-05, + "loss": 0.004, + "step": 2828 + }, + { + "epoch": 2.186136319752848, + "grad_norm": 0.015733450651168823, + "learning_rate": 6.919957349243907e-05, + "loss": 0.0045, + "step": 2829 + }, + { + "epoch": 2.186908669627341, + "grad_norm": 0.012079773470759392, + "learning_rate": 6.9174663026313e-05, + "loss": 0.0032, + "step": 2830 + }, + { + "epoch": 2.1876810195018344, + "grad_norm": 0.013535288162529469, + "learning_rate": 6.914974697890581e-05, + "loss": 0.004, + "step": 2831 + }, + { + "epoch": 2.1884533693763273, + "grad_norm": 0.01917087472975254, + "learning_rate": 6.912482535747002e-05, + "loss": 0.0035, + "step": 2832 + }, + { + "epoch": 2.1892257192508207, + "grad_norm": 0.009686388075351715, + "learning_rate": 6.909989816925967e-05, + "loss": 0.0035, + "step": 2833 + }, + { + "epoch": 2.1899980691253136, + "grad_norm": 0.009863371029496193, + "learning_rate": 6.907496542153049e-05, + "loss": 0.0036, + "step": 2834 + }, + { + "epoch": 2.190770418999807, + "grad_norm": 0.010294755920767784, + "learning_rate": 6.90500271215398e-05, + "loss": 0.0039, + "step": 2835 + }, + { + "epoch": 2.1915427688743, + "grad_norm": 0.01450702641159296, + "learning_rate": 6.902508327654649e-05, + "loss": 0.0035, + "step": 2836 + }, + { + "epoch": 2.1923151187487933, + "grad_norm": 0.020422223955392838, + "learning_rate": 6.900013389381117e-05, + "loss": 0.0043, + "step": 2837 + }, + { + "epoch": 2.193087468623286, + "grad_norm": 0.014234035275876522, + "learning_rate": 6.897517898059597e-05, + "loss": 0.0037, + "step": 2838 + }, + { + "epoch": 2.1938598184977796, + "grad_norm": 0.021411612629890442, + "learning_rate": 6.895021854416467e-05, + "loss": 0.0037, + "step": 2839 + }, + { + "epoch": 2.1946321683722725, + "grad_norm": 0.021227024495601654, + "learning_rate": 6.892525259178265e-05, + "loss": 0.0041, + "step": 2840 + }, + { + "epoch": 2.195404518246766, + "grad_norm": 0.012483715079724789, + "learning_rate": 6.89002811307169e-05, + "loss": 0.0036, + "step": 2841 + }, + { + "epoch": 2.196176868121259, + "grad_norm": 0.011528292670845985, + "learning_rate": 6.8875304168236e-05, + "loss": 0.0035, + "step": 2842 + }, + { + "epoch": 2.196949217995752, + "grad_norm": 0.018989963456988335, + "learning_rate": 6.885032171161014e-05, + "loss": 0.0038, + "step": 2843 + }, + { + "epoch": 2.197721567870245, + "grad_norm": 0.018031930550932884, + "learning_rate": 6.882533376811112e-05, + "loss": 0.0041, + "step": 2844 + }, + { + "epoch": 2.1984939177447385, + "grad_norm": 0.026337895542383194, + "learning_rate": 6.880034034501232e-05, + "loss": 0.0046, + "step": 2845 + }, + { + "epoch": 2.1992662676192314, + "grad_norm": 0.01727277971804142, + "learning_rate": 6.877534144958873e-05, + "loss": 0.0032, + "step": 2846 + }, + { + "epoch": 2.2000386174937248, + "grad_norm": 0.012766748666763306, + "learning_rate": 6.875033708911692e-05, + "loss": 0.0032, + "step": 2847 + }, + { + "epoch": 2.2008109673682177, + "grad_norm": 0.02523941732943058, + "learning_rate": 6.872532727087502e-05, + "loss": 0.0034, + "step": 2848 + }, + { + "epoch": 2.201583317242711, + "grad_norm": 0.014299865812063217, + "learning_rate": 6.870031200214285e-05, + "loss": 0.0041, + "step": 2849 + }, + { + "epoch": 2.202355667117204, + "grad_norm": 0.011945880018174648, + "learning_rate": 6.86752912902017e-05, + "loss": 0.0032, + "step": 2850 + }, + { + "epoch": 2.2031280169916974, + "grad_norm": 0.023927869275212288, + "learning_rate": 6.865026514233452e-05, + "loss": 0.004, + "step": 2851 + }, + { + "epoch": 2.2039003668661903, + "grad_norm": 0.01110320258885622, + "learning_rate": 6.862523356582579e-05, + "loss": 0.0038, + "step": 2852 + }, + { + "epoch": 2.2046727167406837, + "grad_norm": 0.010775089263916016, + "learning_rate": 6.860019656796163e-05, + "loss": 0.0038, + "step": 2853 + }, + { + "epoch": 2.2054450666151766, + "grad_norm": 0.013647436164319515, + "learning_rate": 6.85751541560297e-05, + "loss": 0.0039, + "step": 2854 + }, + { + "epoch": 2.20621741648967, + "grad_norm": 0.014270083047449589, + "learning_rate": 6.855010633731923e-05, + "loss": 0.0038, + "step": 2855 + }, + { + "epoch": 2.206989766364163, + "grad_norm": 0.009960106573998928, + "learning_rate": 6.8525053119121e-05, + "loss": 0.0038, + "step": 2856 + }, + { + "epoch": 2.2077621162386563, + "grad_norm": 0.0098283551633358, + "learning_rate": 6.849999450872745e-05, + "loss": 0.0041, + "step": 2857 + }, + { + "epoch": 2.208534466113149, + "grad_norm": 0.01134185679256916, + "learning_rate": 6.847493051343252e-05, + "loss": 0.0045, + "step": 2858 + }, + { + "epoch": 2.2093068159876426, + "grad_norm": 0.012657348066568375, + "learning_rate": 6.844986114053173e-05, + "loss": 0.0037, + "step": 2859 + }, + { + "epoch": 2.2100791658621355, + "grad_norm": 0.01441334281116724, + "learning_rate": 6.842478639732219e-05, + "loss": 0.0039, + "step": 2860 + }, + { + "epoch": 2.210851515736629, + "grad_norm": 0.011827160604298115, + "learning_rate": 6.83997062911025e-05, + "loss": 0.0041, + "step": 2861 + }, + { + "epoch": 2.211623865611122, + "grad_norm": 0.015119560062885284, + "learning_rate": 6.837462082917295e-05, + "loss": 0.0042, + "step": 2862 + }, + { + "epoch": 2.212396215485615, + "grad_norm": 0.007931212894618511, + "learning_rate": 6.834953001883522e-05, + "loss": 0.004, + "step": 2863 + }, + { + "epoch": 2.213168565360108, + "grad_norm": 0.013704811222851276, + "learning_rate": 6.832443386739269e-05, + "loss": 0.0035, + "step": 2864 + }, + { + "epoch": 2.2139409152346015, + "grad_norm": 0.008806932717561722, + "learning_rate": 6.829933238215028e-05, + "loss": 0.0043, + "step": 2865 + }, + { + "epoch": 2.2147132651090944, + "grad_norm": 0.008302022702991962, + "learning_rate": 6.827422557041433e-05, + "loss": 0.0035, + "step": 2866 + }, + { + "epoch": 2.2154856149835878, + "grad_norm": 0.011451393365859985, + "learning_rate": 6.824911343949291e-05, + "loss": 0.0036, + "step": 2867 + }, + { + "epoch": 2.2162579648580807, + "grad_norm": 0.00943044200539589, + "learning_rate": 6.822399599669552e-05, + "loss": 0.0038, + "step": 2868 + }, + { + "epoch": 2.217030314732574, + "grad_norm": 0.008121364749968052, + "learning_rate": 6.819887324933325e-05, + "loss": 0.0036, + "step": 2869 + }, + { + "epoch": 2.217802664607067, + "grad_norm": 0.009307894855737686, + "learning_rate": 6.81737452047187e-05, + "loss": 0.0048, + "step": 2870 + }, + { + "epoch": 2.2185750144815604, + "grad_norm": 0.010115236043930054, + "learning_rate": 6.814861187016608e-05, + "loss": 0.0044, + "step": 2871 + }, + { + "epoch": 2.2193473643560533, + "grad_norm": 0.009662744589149952, + "learning_rate": 6.812347325299107e-05, + "loss": 0.004, + "step": 2872 + }, + { + "epoch": 2.220119714230546, + "grad_norm": 0.01051376387476921, + "learning_rate": 6.809832936051092e-05, + "loss": 0.0044, + "step": 2873 + }, + { + "epoch": 2.2208920641050396, + "grad_norm": 0.010458176024258137, + "learning_rate": 6.80731802000444e-05, + "loss": 0.0038, + "step": 2874 + }, + { + "epoch": 2.221664413979533, + "grad_norm": 0.007859159260988235, + "learning_rate": 6.804802577891182e-05, + "loss": 0.0044, + "step": 2875 + }, + { + "epoch": 2.222436763854026, + "grad_norm": 0.007987123914062977, + "learning_rate": 6.802286610443506e-05, + "loss": 0.0038, + "step": 2876 + }, + { + "epoch": 2.223209113728519, + "grad_norm": 0.008854716084897518, + "learning_rate": 6.799770118393746e-05, + "loss": 0.0037, + "step": 2877 + }, + { + "epoch": 2.223981463603012, + "grad_norm": 0.008117330260574818, + "learning_rate": 6.797253102474392e-05, + "loss": 0.0038, + "step": 2878 + }, + { + "epoch": 2.224753813477505, + "grad_norm": 0.010898245498538017, + "learning_rate": 6.794735563418087e-05, + "loss": 0.0035, + "step": 2879 + }, + { + "epoch": 2.2255261633519985, + "grad_norm": 0.009254688397049904, + "learning_rate": 6.792217501957626e-05, + "loss": 0.004, + "step": 2880 + }, + { + "epoch": 2.2262985132264914, + "grad_norm": 0.02000666782259941, + "learning_rate": 6.789698918825957e-05, + "loss": 0.0033, + "step": 2881 + }, + { + "epoch": 2.227070863100985, + "grad_norm": 0.009464719332754612, + "learning_rate": 6.787179814756177e-05, + "loss": 0.0033, + "step": 2882 + }, + { + "epoch": 2.2278432129754777, + "grad_norm": 0.010253376327455044, + "learning_rate": 6.784660190481535e-05, + "loss": 0.0045, + "step": 2883 + }, + { + "epoch": 2.228615562849971, + "grad_norm": 0.01094959769397974, + "learning_rate": 6.782140046735439e-05, + "loss": 0.0032, + "step": 2884 + }, + { + "epoch": 2.229387912724464, + "grad_norm": 0.0162676814943552, + "learning_rate": 6.779619384251435e-05, + "loss": 0.0037, + "step": 2885 + }, + { + "epoch": 2.2301602625989574, + "grad_norm": 0.00939108245074749, + "learning_rate": 6.77709820376323e-05, + "loss": 0.0035, + "step": 2886 + }, + { + "epoch": 2.2309326124734503, + "grad_norm": 0.010298709385097027, + "learning_rate": 6.774576506004678e-05, + "loss": 0.004, + "step": 2887 + }, + { + "epoch": 2.2317049623479437, + "grad_norm": 0.011534671299159527, + "learning_rate": 6.772054291709784e-05, + "loss": 0.0041, + "step": 2888 + }, + { + "epoch": 2.2324773122224366, + "grad_norm": 0.011011037975549698, + "learning_rate": 6.769531561612706e-05, + "loss": 0.0037, + "step": 2889 + }, + { + "epoch": 2.23324966209693, + "grad_norm": 0.010238991118967533, + "learning_rate": 6.767008316447747e-05, + "loss": 0.0035, + "step": 2890 + }, + { + "epoch": 2.234022011971423, + "grad_norm": 0.010306601412594318, + "learning_rate": 6.764484556949362e-05, + "loss": 0.0044, + "step": 2891 + }, + { + "epoch": 2.2347943618459163, + "grad_norm": 0.01326883677393198, + "learning_rate": 6.76196028385216e-05, + "loss": 0.0033, + "step": 2892 + }, + { + "epoch": 2.235566711720409, + "grad_norm": 0.00924383569508791, + "learning_rate": 6.759435497890894e-05, + "loss": 0.0037, + "step": 2893 + }, + { + "epoch": 2.2363390615949026, + "grad_norm": 0.01186363585293293, + "learning_rate": 6.756910199800468e-05, + "loss": 0.0041, + "step": 2894 + }, + { + "epoch": 2.2371114114693955, + "grad_norm": 0.007807865273207426, + "learning_rate": 6.754384390315936e-05, + "loss": 0.0036, + "step": 2895 + }, + { + "epoch": 2.237883761343889, + "grad_norm": 0.01037057489156723, + "learning_rate": 6.751858070172499e-05, + "loss": 0.0036, + "step": 2896 + }, + { + "epoch": 2.238656111218382, + "grad_norm": 0.011497294530272484, + "learning_rate": 6.749331240105507e-05, + "loss": 0.0037, + "step": 2897 + }, + { + "epoch": 2.239428461092875, + "grad_norm": 0.02164345420897007, + "learning_rate": 6.746803900850462e-05, + "loss": 0.0037, + "step": 2898 + }, + { + "epoch": 2.240200810967368, + "grad_norm": 0.008693995885550976, + "learning_rate": 6.74427605314301e-05, + "loss": 0.0038, + "step": 2899 + }, + { + "epoch": 2.2409731608418615, + "grad_norm": 0.021689802408218384, + "learning_rate": 6.741747697718946e-05, + "loss": 0.0042, + "step": 2900 + }, + { + "epoch": 2.2417455107163544, + "grad_norm": 0.008022323250770569, + "learning_rate": 6.739218835314213e-05, + "loss": 0.0041, + "step": 2901 + }, + { + "epoch": 2.242517860590848, + "grad_norm": 0.010573562234640121, + "learning_rate": 6.736689466664902e-05, + "loss": 0.0043, + "step": 2902 + }, + { + "epoch": 2.2432902104653407, + "grad_norm": 0.009405065327882767, + "learning_rate": 6.734159592507252e-05, + "loss": 0.0041, + "step": 2903 + }, + { + "epoch": 2.244062560339834, + "grad_norm": 0.009706784039735794, + "learning_rate": 6.731629213577647e-05, + "loss": 0.0042, + "step": 2904 + }, + { + "epoch": 2.244834910214327, + "grad_norm": 0.011170423589646816, + "learning_rate": 6.72909833061262e-05, + "loss": 0.0038, + "step": 2905 + }, + { + "epoch": 2.2456072600888204, + "grad_norm": 0.008561627008020878, + "learning_rate": 6.72656694434885e-05, + "loss": 0.0037, + "step": 2906 + }, + { + "epoch": 2.2463796099633133, + "grad_norm": 0.009911119937896729, + "learning_rate": 6.724035055523161e-05, + "loss": 0.0041, + "step": 2907 + }, + { + "epoch": 2.2471519598378067, + "grad_norm": 0.014809616841375828, + "learning_rate": 6.721502664872526e-05, + "loss": 0.0042, + "step": 2908 + }, + { + "epoch": 2.2479243097122996, + "grad_norm": 0.012895144522190094, + "learning_rate": 6.718969773134062e-05, + "loss": 0.0037, + "step": 2909 + }, + { + "epoch": 2.248696659586793, + "grad_norm": 0.009620807133615017, + "learning_rate": 6.716436381045032e-05, + "loss": 0.0039, + "step": 2910 + }, + { + "epoch": 2.249469009461286, + "grad_norm": 0.014791177585721016, + "learning_rate": 6.713902489342849e-05, + "loss": 0.0042, + "step": 2911 + }, + { + "epoch": 2.2502413593357793, + "grad_norm": 0.00880932342261076, + "learning_rate": 6.711368098765063e-05, + "loss": 0.0033, + "step": 2912 + }, + { + "epoch": 2.251013709210272, + "grad_norm": 0.01417181733995676, + "learning_rate": 6.708833210049374e-05, + "loss": 0.0037, + "step": 2913 + }, + { + "epoch": 2.2517860590847656, + "grad_norm": 0.010888157412409782, + "learning_rate": 6.706297823933631e-05, + "loss": 0.0037, + "step": 2914 + }, + { + "epoch": 2.2525584089592585, + "grad_norm": 0.01013087760657072, + "learning_rate": 6.70376194115582e-05, + "loss": 0.0041, + "step": 2915 + }, + { + "epoch": 2.2533307588337514, + "grad_norm": 0.021921338513493538, + "learning_rate": 6.701225562454077e-05, + "loss": 0.0037, + "step": 2916 + }, + { + "epoch": 2.254103108708245, + "grad_norm": 0.013176465407013893, + "learning_rate": 6.698688688566679e-05, + "loss": 0.0042, + "step": 2917 + }, + { + "epoch": 2.254875458582738, + "grad_norm": 0.012408842332661152, + "learning_rate": 6.69615132023205e-05, + "loss": 0.0041, + "step": 2918 + }, + { + "epoch": 2.255647808457231, + "grad_norm": 0.019068893045186996, + "learning_rate": 6.693613458188756e-05, + "loss": 0.0035, + "step": 2919 + }, + { + "epoch": 2.256420158331724, + "grad_norm": 0.008966639637947083, + "learning_rate": 6.691075103175506e-05, + "loss": 0.0037, + "step": 2920 + }, + { + "epoch": 2.2571925082062174, + "grad_norm": 0.010589229874312878, + "learning_rate": 6.688536255931157e-05, + "loss": 0.0037, + "step": 2921 + }, + { + "epoch": 2.257964858080711, + "grad_norm": 0.012341762892901897, + "learning_rate": 6.685996917194705e-05, + "loss": 0.0039, + "step": 2922 + }, + { + "epoch": 2.2587372079552037, + "grad_norm": 0.01653473637998104, + "learning_rate": 6.683457087705287e-05, + "loss": 0.0038, + "step": 2923 + }, + { + "epoch": 2.2595095578296966, + "grad_norm": 0.010807894170284271, + "learning_rate": 6.68091676820219e-05, + "loss": 0.0039, + "step": 2924 + }, + { + "epoch": 2.26028190770419, + "grad_norm": 0.013531602919101715, + "learning_rate": 6.67837595942484e-05, + "loss": 0.0036, + "step": 2925 + }, + { + "epoch": 2.2610542575786834, + "grad_norm": 0.008627147413790226, + "learning_rate": 6.675834662112801e-05, + "loss": 0.0034, + "step": 2926 + }, + { + "epoch": 2.2618266074531763, + "grad_norm": 0.009081604890525341, + "learning_rate": 6.673292877005786e-05, + "loss": 0.0035, + "step": 2927 + }, + { + "epoch": 2.2625989573276692, + "grad_norm": 0.009991762228310108, + "learning_rate": 6.670750604843646e-05, + "loss": 0.0035, + "step": 2928 + }, + { + "epoch": 2.2633713072021626, + "grad_norm": 0.008216849528253078, + "learning_rate": 6.668207846366377e-05, + "loss": 0.0032, + "step": 2929 + }, + { + "epoch": 2.2641436570766555, + "grad_norm": 0.0134681211784482, + "learning_rate": 6.665664602314112e-05, + "loss": 0.0038, + "step": 2930 + }, + { + "epoch": 2.264916006951149, + "grad_norm": 0.013249974697828293, + "learning_rate": 6.663120873427129e-05, + "loss": 0.004, + "step": 2931 + }, + { + "epoch": 2.265688356825642, + "grad_norm": 0.011910191737115383, + "learning_rate": 6.660576660445846e-05, + "loss": 0.0038, + "step": 2932 + }, + { + "epoch": 2.266460706700135, + "grad_norm": 0.01929398812353611, + "learning_rate": 6.658031964110822e-05, + "loss": 0.004, + "step": 2933 + }, + { + "epoch": 2.267233056574628, + "grad_norm": 0.011442175135016441, + "learning_rate": 6.655486785162758e-05, + "loss": 0.0043, + "step": 2934 + }, + { + "epoch": 2.2680054064491215, + "grad_norm": 0.012157517485320568, + "learning_rate": 6.652941124342492e-05, + "loss": 0.0044, + "step": 2935 + }, + { + "epoch": 2.2687777563236144, + "grad_norm": 0.01320195198059082, + "learning_rate": 6.650394982391004e-05, + "loss": 0.0038, + "step": 2936 + }, + { + "epoch": 2.269550106198108, + "grad_norm": 0.012781853787600994, + "learning_rate": 6.647848360049417e-05, + "loss": 0.0041, + "step": 2937 + }, + { + "epoch": 2.2703224560726007, + "grad_norm": 0.019531317055225372, + "learning_rate": 6.645301258058989e-05, + "loss": 0.0038, + "step": 2938 + }, + { + "epoch": 2.271094805947094, + "grad_norm": 0.010299470275640488, + "learning_rate": 6.642753677161121e-05, + "loss": 0.0034, + "step": 2939 + }, + { + "epoch": 2.271867155821587, + "grad_norm": 0.011393277905881405, + "learning_rate": 6.640205618097352e-05, + "loss": 0.0038, + "step": 2940 + }, + { + "epoch": 2.2726395056960804, + "grad_norm": 0.017499489709734917, + "learning_rate": 6.63765708160936e-05, + "loss": 0.0039, + "step": 2941 + }, + { + "epoch": 2.2734118555705733, + "grad_norm": 0.023332005366683006, + "learning_rate": 6.635108068438962e-05, + "loss": 0.0039, + "step": 2942 + }, + { + "epoch": 2.2741842054450667, + "grad_norm": 0.01747935451567173, + "learning_rate": 6.632558579328114e-05, + "loss": 0.0039, + "step": 2943 + }, + { + "epoch": 2.2749565553195596, + "grad_norm": 0.012404967099428177, + "learning_rate": 6.630008615018914e-05, + "loss": 0.0036, + "step": 2944 + }, + { + "epoch": 2.275728905194053, + "grad_norm": 0.027197325602173805, + "learning_rate": 6.627458176253591e-05, + "loss": 0.0037, + "step": 2945 + }, + { + "epoch": 2.276501255068546, + "grad_norm": 0.025847600772976875, + "learning_rate": 6.624907263774518e-05, + "loss": 0.0041, + "step": 2946 + }, + { + "epoch": 2.2772736049430393, + "grad_norm": 0.010057358071208, + "learning_rate": 6.622355878324203e-05, + "loss": 0.0034, + "step": 2947 + }, + { + "epoch": 2.2780459548175322, + "grad_norm": 0.022725243121385574, + "learning_rate": 6.619804020645292e-05, + "loss": 0.0037, + "step": 2948 + }, + { + "epoch": 2.2788183046920256, + "grad_norm": 0.02578037418425083, + "learning_rate": 6.617251691480572e-05, + "loss": 0.0037, + "step": 2949 + }, + { + "epoch": 2.2795906545665185, + "grad_norm": 0.021287426352500916, + "learning_rate": 6.614698891572962e-05, + "loss": 0.0042, + "step": 2950 + }, + { + "epoch": 2.280363004441012, + "grad_norm": 0.010197608731687069, + "learning_rate": 6.612145621665519e-05, + "loss": 0.0041, + "step": 2951 + }, + { + "epoch": 2.281135354315505, + "grad_norm": 0.020424285903573036, + "learning_rate": 6.609591882501444e-05, + "loss": 0.0036, + "step": 2952 + }, + { + "epoch": 2.281907704189998, + "grad_norm": 0.022551648318767548, + "learning_rate": 6.607037674824062e-05, + "loss": 0.0043, + "step": 2953 + }, + { + "epoch": 2.282680054064491, + "grad_norm": 0.016569821164011955, + "learning_rate": 6.604482999376845e-05, + "loss": 0.0042, + "step": 2954 + }, + { + "epoch": 2.2834524039389845, + "grad_norm": 0.018890995532274246, + "learning_rate": 6.601927856903398e-05, + "loss": 0.0042, + "step": 2955 + }, + { + "epoch": 2.2842247538134774, + "grad_norm": 0.01695701666176319, + "learning_rate": 6.599372248147458e-05, + "loss": 0.0037, + "step": 2956 + }, + { + "epoch": 2.284997103687971, + "grad_norm": 0.018755938857793808, + "learning_rate": 6.596816173852903e-05, + "loss": 0.0036, + "step": 2957 + }, + { + "epoch": 2.2857694535624637, + "grad_norm": 0.009337184019386768, + "learning_rate": 6.594259634763742e-05, + "loss": 0.0042, + "step": 2958 + }, + { + "epoch": 2.286541803436957, + "grad_norm": 0.008161675184965134, + "learning_rate": 6.591702631624126e-05, + "loss": 0.0041, + "step": 2959 + }, + { + "epoch": 2.28731415331145, + "grad_norm": 0.02394683100283146, + "learning_rate": 6.589145165178335e-05, + "loss": 0.0038, + "step": 2960 + }, + { + "epoch": 2.2880865031859434, + "grad_norm": 0.016816599294543266, + "learning_rate": 6.586587236170783e-05, + "loss": 0.0043, + "step": 2961 + }, + { + "epoch": 2.2888588530604363, + "grad_norm": 0.014665575698018074, + "learning_rate": 6.584028845346025e-05, + "loss": 0.0035, + "step": 2962 + }, + { + "epoch": 2.2896312029349293, + "grad_norm": 0.008588436990976334, + "learning_rate": 6.581469993448746e-05, + "loss": 0.0036, + "step": 2963 + }, + { + "epoch": 2.2904035528094226, + "grad_norm": 0.03165439888834953, + "learning_rate": 6.578910681223765e-05, + "loss": 0.0041, + "step": 2964 + }, + { + "epoch": 2.291175902683916, + "grad_norm": 0.024436477571725845, + "learning_rate": 6.576350909416034e-05, + "loss": 0.004, + "step": 2965 + }, + { + "epoch": 2.291948252558409, + "grad_norm": 0.023368055000901222, + "learning_rate": 6.573790678770646e-05, + "loss": 0.0041, + "step": 2966 + }, + { + "epoch": 2.292720602432902, + "grad_norm": 0.013346045278012753, + "learning_rate": 6.571229990032817e-05, + "loss": 0.0037, + "step": 2967 + }, + { + "epoch": 2.2934929523073952, + "grad_norm": 0.009437451139092445, + "learning_rate": 6.568668843947906e-05, + "loss": 0.0034, + "step": 2968 + }, + { + "epoch": 2.2942653021818886, + "grad_norm": 0.011323517188429832, + "learning_rate": 6.566107241261397e-05, + "loss": 0.0035, + "step": 2969 + }, + { + "epoch": 2.2950376520563815, + "grad_norm": 0.012972177937626839, + "learning_rate": 6.563545182718914e-05, + "loss": 0.0034, + "step": 2970 + }, + { + "epoch": 2.2958100019308745, + "grad_norm": 0.008187105879187584, + "learning_rate": 6.560982669066207e-05, + "loss": 0.0036, + "step": 2971 + }, + { + "epoch": 2.296582351805368, + "grad_norm": 0.017039962112903595, + "learning_rate": 6.558419701049163e-05, + "loss": 0.0039, + "step": 2972 + }, + { + "epoch": 2.297354701679861, + "grad_norm": 0.008243520744144917, + "learning_rate": 6.555856279413802e-05, + "loss": 0.0037, + "step": 2973 + }, + { + "epoch": 2.298127051554354, + "grad_norm": 0.012830189429223537, + "learning_rate": 6.553292404906271e-05, + "loss": 0.0035, + "step": 2974 + }, + { + "epoch": 2.298899401428847, + "grad_norm": 0.013081861659884453, + "learning_rate": 6.550728078272855e-05, + "loss": 0.0034, + "step": 2975 + }, + { + "epoch": 2.2996717513033405, + "grad_norm": 0.019632283598184586, + "learning_rate": 6.548163300259966e-05, + "loss": 0.0044, + "step": 2976 + }, + { + "epoch": 2.3004441011778334, + "grad_norm": 0.01079633366316557, + "learning_rate": 6.545598071614148e-05, + "loss": 0.004, + "step": 2977 + }, + { + "epoch": 2.3012164510523268, + "grad_norm": 0.012638948857784271, + "learning_rate": 6.543032393082077e-05, + "loss": 0.0034, + "step": 2978 + }, + { + "epoch": 2.3019888009268197, + "grad_norm": 0.016536030918359756, + "learning_rate": 6.540466265410563e-05, + "loss": 0.0035, + "step": 2979 + }, + { + "epoch": 2.302761150801313, + "grad_norm": 0.011115089990198612, + "learning_rate": 6.537899689346541e-05, + "loss": 0.0044, + "step": 2980 + }, + { + "epoch": 2.303533500675806, + "grad_norm": 0.008431201800704002, + "learning_rate": 6.53533266563708e-05, + "loss": 0.0038, + "step": 2981 + }, + { + "epoch": 2.3043058505502994, + "grad_norm": 0.010215497575700283, + "learning_rate": 6.532765195029379e-05, + "loss": 0.0033, + "step": 2982 + }, + { + "epoch": 2.3050782004247923, + "grad_norm": 0.019033120945096016, + "learning_rate": 6.530197278270765e-05, + "loss": 0.004, + "step": 2983 + }, + { + "epoch": 2.3058505502992857, + "grad_norm": 0.009608306922018528, + "learning_rate": 6.527628916108699e-05, + "loss": 0.0037, + "step": 2984 + }, + { + "epoch": 2.3066229001737786, + "grad_norm": 0.011376178823411465, + "learning_rate": 6.525060109290768e-05, + "loss": 0.0039, + "step": 2985 + }, + { + "epoch": 2.307395250048272, + "grad_norm": 0.012053635902702808, + "learning_rate": 6.522490858564689e-05, + "loss": 0.0039, + "step": 2986 + }, + { + "epoch": 2.308167599922765, + "grad_norm": 0.009816590696573257, + "learning_rate": 6.51992116467831e-05, + "loss": 0.0036, + "step": 2987 + }, + { + "epoch": 2.3089399497972583, + "grad_norm": 0.007648579776287079, + "learning_rate": 6.517351028379603e-05, + "loss": 0.0035, + "step": 2988 + }, + { + "epoch": 2.309712299671751, + "grad_norm": 0.010480094701051712, + "learning_rate": 6.51478045041668e-05, + "loss": 0.0038, + "step": 2989 + }, + { + "epoch": 2.3104846495462446, + "grad_norm": 0.0140578243881464, + "learning_rate": 6.51220943153777e-05, + "loss": 0.0042, + "step": 2990 + }, + { + "epoch": 2.3112569994207375, + "grad_norm": 0.011694060638546944, + "learning_rate": 6.509637972491231e-05, + "loss": 0.004, + "step": 2991 + }, + { + "epoch": 2.312029349295231, + "grad_norm": 0.011438708752393723, + "learning_rate": 6.507066074025557e-05, + "loss": 0.0036, + "step": 2992 + }, + { + "epoch": 2.312801699169724, + "grad_norm": 0.027124110609292984, + "learning_rate": 6.504493736889366e-05, + "loss": 0.0043, + "step": 2993 + }, + { + "epoch": 2.313574049044217, + "grad_norm": 0.01168591808527708, + "learning_rate": 6.5019209618314e-05, + "loss": 0.0041, + "step": 2994 + }, + { + "epoch": 2.31434639891871, + "grad_norm": 0.008750442415475845, + "learning_rate": 6.499347749600533e-05, + "loss": 0.0036, + "step": 2995 + }, + { + "epoch": 2.3151187487932035, + "grad_norm": 0.02237485721707344, + "learning_rate": 6.496774100945766e-05, + "loss": 0.004, + "step": 2996 + }, + { + "epoch": 2.3158910986676964, + "grad_norm": 0.012080367654561996, + "learning_rate": 6.494200016616225e-05, + "loss": 0.0035, + "step": 2997 + }, + { + "epoch": 2.3166634485421898, + "grad_norm": 0.01149061881005764, + "learning_rate": 6.491625497361164e-05, + "loss": 0.0038, + "step": 2998 + }, + { + "epoch": 2.3174357984166827, + "grad_norm": 0.01568903587758541, + "learning_rate": 6.489050543929964e-05, + "loss": 0.0039, + "step": 2999 + }, + { + "epoch": 2.318208148291176, + "grad_norm": 0.011648730374872684, + "learning_rate": 6.486475157072129e-05, + "loss": 0.0033, + "step": 3000 + }, + { + "epoch": 2.318980498165669, + "grad_norm": 0.013448680751025677, + "learning_rate": 6.483899337537295e-05, + "loss": 0.004, + "step": 3001 + }, + { + "epoch": 2.3197528480401624, + "grad_norm": 0.0091966912150383, + "learning_rate": 6.481323086075219e-05, + "loss": 0.0039, + "step": 3002 + }, + { + "epoch": 2.3205251979146553, + "grad_norm": 0.010801075026392937, + "learning_rate": 6.478746403435787e-05, + "loss": 0.0038, + "step": 3003 + }, + { + "epoch": 2.3212975477891487, + "grad_norm": 0.014122814871370792, + "learning_rate": 6.476169290369007e-05, + "loss": 0.0038, + "step": 3004 + }, + { + "epoch": 2.3220698976636416, + "grad_norm": 0.022588754072785378, + "learning_rate": 6.473591747625017e-05, + "loss": 0.0041, + "step": 3005 + }, + { + "epoch": 2.322842247538135, + "grad_norm": 0.008739450946450233, + "learning_rate": 6.471013775954076e-05, + "loss": 0.0043, + "step": 3006 + }, + { + "epoch": 2.323614597412628, + "grad_norm": 0.015635278075933456, + "learning_rate": 6.468435376106568e-05, + "loss": 0.0034, + "step": 3007 + }, + { + "epoch": 2.3243869472871213, + "grad_norm": 0.022441934794187546, + "learning_rate": 6.465856548833005e-05, + "loss": 0.0038, + "step": 3008 + }, + { + "epoch": 2.325159297161614, + "grad_norm": 0.010819089598953724, + "learning_rate": 6.46327729488402e-05, + "loss": 0.0041, + "step": 3009 + }, + { + "epoch": 2.325931647036107, + "grad_norm": 0.010484733618795872, + "learning_rate": 6.460697615010373e-05, + "loss": 0.004, + "step": 3010 + }, + { + "epoch": 2.3267039969106005, + "grad_norm": 0.017068954184651375, + "learning_rate": 6.458117509962944e-05, + "loss": 0.0042, + "step": 3011 + }, + { + "epoch": 2.327476346785094, + "grad_norm": 0.027246003970503807, + "learning_rate": 6.455536980492741e-05, + "loss": 0.0041, + "step": 3012 + }, + { + "epoch": 2.328248696659587, + "grad_norm": 0.00963776558637619, + "learning_rate": 6.452956027350893e-05, + "loss": 0.0037, + "step": 3013 + }, + { + "epoch": 2.3290210465340797, + "grad_norm": 0.01445906050503254, + "learning_rate": 6.450374651288656e-05, + "loss": 0.0033, + "step": 3014 + }, + { + "epoch": 2.329793396408573, + "grad_norm": 0.025476105511188507, + "learning_rate": 6.447792853057402e-05, + "loss": 0.0041, + "step": 3015 + }, + { + "epoch": 2.3305657462830665, + "grad_norm": 0.01671411097049713, + "learning_rate": 6.445210633408631e-05, + "loss": 0.0044, + "step": 3016 + }, + { + "epoch": 2.3313380961575594, + "grad_norm": 0.01059251930564642, + "learning_rate": 6.442627993093966e-05, + "loss": 0.0035, + "step": 3017 + }, + { + "epoch": 2.3321104460320523, + "grad_norm": 0.015922479331493378, + "learning_rate": 6.440044932865151e-05, + "loss": 0.0037, + "step": 3018 + }, + { + "epoch": 2.3328827959065457, + "grad_norm": 0.01611727476119995, + "learning_rate": 6.437461453474052e-05, + "loss": 0.0039, + "step": 3019 + }, + { + "epoch": 2.333655145781039, + "grad_norm": 0.015589396469295025, + "learning_rate": 6.434877555672657e-05, + "loss": 0.0037, + "step": 3020 + }, + { + "epoch": 2.334427495655532, + "grad_norm": 0.008948437869548798, + "learning_rate": 6.432293240213079e-05, + "loss": 0.0037, + "step": 3021 + }, + { + "epoch": 2.335199845530025, + "grad_norm": 0.013299311511218548, + "learning_rate": 6.429708507847544e-05, + "loss": 0.0038, + "step": 3022 + }, + { + "epoch": 2.3359721954045183, + "grad_norm": 0.013574497774243355, + "learning_rate": 6.427123359328413e-05, + "loss": 0.0039, + "step": 3023 + }, + { + "epoch": 2.336744545279011, + "grad_norm": 0.021488720551133156, + "learning_rate": 6.424537795408153e-05, + "loss": 0.004, + "step": 3024 + }, + { + "epoch": 2.3375168951535046, + "grad_norm": 0.012841997668147087, + "learning_rate": 6.421951816839364e-05, + "loss": 0.0041, + "step": 3025 + }, + { + "epoch": 2.3382892450279975, + "grad_norm": 0.018972106277942657, + "learning_rate": 6.41936542437476e-05, + "loss": 0.0041, + "step": 3026 + }, + { + "epoch": 2.339061594902491, + "grad_norm": 0.02803695946931839, + "learning_rate": 6.416778618767175e-05, + "loss": 0.0042, + "step": 3027 + }, + { + "epoch": 2.339833944776984, + "grad_norm": 0.020719368010759354, + "learning_rate": 6.414191400769571e-05, + "loss": 0.0043, + "step": 3028 + }, + { + "epoch": 2.340606294651477, + "grad_norm": 0.007930947467684746, + "learning_rate": 6.411603771135019e-05, + "loss": 0.0038, + "step": 3029 + }, + { + "epoch": 2.34137864452597, + "grad_norm": 0.03803905099630356, + "learning_rate": 6.409015730616719e-05, + "loss": 0.0043, + "step": 3030 + }, + { + "epoch": 2.3421509944004635, + "grad_norm": 0.023514466360211372, + "learning_rate": 6.406427279967987e-05, + "loss": 0.0035, + "step": 3031 + }, + { + "epoch": 2.3429233442749564, + "grad_norm": 0.008395480923354626, + "learning_rate": 6.403838419942256e-05, + "loss": 0.0039, + "step": 3032 + }, + { + "epoch": 2.34369569414945, + "grad_norm": 0.0166980791836977, + "learning_rate": 6.401249151293084e-05, + "loss": 0.004, + "step": 3033 + }, + { + "epoch": 2.3444680440239427, + "grad_norm": 0.029495570808649063, + "learning_rate": 6.398659474774142e-05, + "loss": 0.0048, + "step": 3034 + }, + { + "epoch": 2.345240393898436, + "grad_norm": 0.026838259771466255, + "learning_rate": 6.396069391139223e-05, + "loss": 0.0039, + "step": 3035 + }, + { + "epoch": 2.346012743772929, + "grad_norm": 0.010425945743918419, + "learning_rate": 6.393478901142237e-05, + "loss": 0.0041, + "step": 3036 + }, + { + "epoch": 2.3467850936474224, + "grad_norm": 0.026453660801053047, + "learning_rate": 6.390888005537216e-05, + "loss": 0.0038, + "step": 3037 + }, + { + "epoch": 2.3475574435219153, + "grad_norm": 0.035654086619615555, + "learning_rate": 6.388296705078303e-05, + "loss": 0.0043, + "step": 3038 + }, + { + "epoch": 2.3483297933964087, + "grad_norm": 0.023507380858063698, + "learning_rate": 6.385705000519766e-05, + "loss": 0.004, + "step": 3039 + }, + { + "epoch": 2.3491021432709016, + "grad_norm": 0.031890109181404114, + "learning_rate": 6.383112892615986e-05, + "loss": 0.0036, + "step": 3040 + }, + { + "epoch": 2.349874493145395, + "grad_norm": 0.03831114247441292, + "learning_rate": 6.380520382121463e-05, + "loss": 0.0042, + "step": 3041 + }, + { + "epoch": 2.350646843019888, + "grad_norm": 0.022333258762955666, + "learning_rate": 6.377927469790815e-05, + "loss": 0.0039, + "step": 3042 + }, + { + "epoch": 2.3514191928943813, + "grad_norm": 0.008276746608316898, + "learning_rate": 6.375334156378776e-05, + "loss": 0.0037, + "step": 3043 + }, + { + "epoch": 2.352191542768874, + "grad_norm": 0.0267958901822567, + "learning_rate": 6.372740442640196e-05, + "loss": 0.004, + "step": 3044 + }, + { + "epoch": 2.3529638926433676, + "grad_norm": 0.04083576798439026, + "learning_rate": 6.370146329330043e-05, + "loss": 0.0048, + "step": 3045 + }, + { + "epoch": 2.3537362425178605, + "grad_norm": 0.01393703743815422, + "learning_rate": 6.3675518172034e-05, + "loss": 0.0044, + "step": 3046 + }, + { + "epoch": 2.354508592392354, + "grad_norm": 0.019029032438993454, + "learning_rate": 6.364956907015469e-05, + "loss": 0.0039, + "step": 3047 + }, + { + "epoch": 2.355280942266847, + "grad_norm": 0.029930435121059418, + "learning_rate": 6.362361599521563e-05, + "loss": 0.0041, + "step": 3048 + }, + { + "epoch": 2.35605329214134, + "grad_norm": 0.020898763090372086, + "learning_rate": 6.359765895477114e-05, + "loss": 0.0039, + "step": 3049 + }, + { + "epoch": 2.356825642015833, + "grad_norm": 0.016825007274746895, + "learning_rate": 6.357169795637671e-05, + "loss": 0.0041, + "step": 3050 + }, + { + "epoch": 2.3575979918903265, + "grad_norm": 0.01854357123374939, + "learning_rate": 6.354573300758893e-05, + "loss": 0.0038, + "step": 3051 + }, + { + "epoch": 2.3583703417648194, + "grad_norm": 0.040367867797613144, + "learning_rate": 6.351976411596558e-05, + "loss": 0.0044, + "step": 3052 + }, + { + "epoch": 2.359142691639313, + "grad_norm": 0.021105622872710228, + "learning_rate": 6.349379128906559e-05, + "loss": 0.0047, + "step": 3053 + }, + { + "epoch": 2.3599150415138057, + "grad_norm": 0.022896962240338326, + "learning_rate": 6.346781453444898e-05, + "loss": 0.0034, + "step": 3054 + }, + { + "epoch": 2.360687391388299, + "grad_norm": 0.017875095829367638, + "learning_rate": 6.344183385967702e-05, + "loss": 0.0033, + "step": 3055 + }, + { + "epoch": 2.361459741262792, + "grad_norm": 0.04240044206380844, + "learning_rate": 6.341584927231198e-05, + "loss": 0.0041, + "step": 3056 + }, + { + "epoch": 2.3622320911372854, + "grad_norm": 0.024292636662721634, + "learning_rate": 6.338986077991742e-05, + "loss": 0.0042, + "step": 3057 + }, + { + "epoch": 2.3630044410117783, + "grad_norm": 0.011553112417459488, + "learning_rate": 6.336386839005792e-05, + "loss": 0.0038, + "step": 3058 + }, + { + "epoch": 2.3637767908862717, + "grad_norm": 0.01957235299050808, + "learning_rate": 6.333787211029924e-05, + "loss": 0.0044, + "step": 3059 + }, + { + "epoch": 2.3645491407607646, + "grad_norm": 0.03314711153507233, + "learning_rate": 6.331187194820827e-05, + "loss": 0.0041, + "step": 3060 + }, + { + "epoch": 2.3653214906352575, + "grad_norm": 0.027983160689473152, + "learning_rate": 6.328586791135304e-05, + "loss": 0.0043, + "step": 3061 + }, + { + "epoch": 2.366093840509751, + "grad_norm": 0.009940424002707005, + "learning_rate": 6.32598600073027e-05, + "loss": 0.0041, + "step": 3062 + }, + { + "epoch": 2.3668661903842443, + "grad_norm": 0.022160066291689873, + "learning_rate": 6.323384824362753e-05, + "loss": 0.0036, + "step": 3063 + }, + { + "epoch": 2.367638540258737, + "grad_norm": 0.0382474847137928, + "learning_rate": 6.320783262789887e-05, + "loss": 0.0041, + "step": 3064 + }, + { + "epoch": 2.36841089013323, + "grad_norm": 0.011643902398645878, + "learning_rate": 6.318181316768929e-05, + "loss": 0.0039, + "step": 3065 + }, + { + "epoch": 2.3691832400077235, + "grad_norm": 0.009308322332799435, + "learning_rate": 6.315578987057242e-05, + "loss": 0.004, + "step": 3066 + }, + { + "epoch": 2.369955589882217, + "grad_norm": 0.01695558987557888, + "learning_rate": 6.312976274412299e-05, + "loss": 0.0036, + "step": 3067 + }, + { + "epoch": 2.37072793975671, + "grad_norm": 0.017537007108330727, + "learning_rate": 6.310373179591688e-05, + "loss": 0.0035, + "step": 3068 + }, + { + "epoch": 2.3715002896312027, + "grad_norm": 0.02394968830049038, + "learning_rate": 6.307769703353109e-05, + "loss": 0.0041, + "step": 3069 + }, + { + "epoch": 2.372272639505696, + "grad_norm": 0.009026383981108665, + "learning_rate": 6.305165846454369e-05, + "loss": 0.0035, + "step": 3070 + }, + { + "epoch": 2.373044989380189, + "grad_norm": 0.017562277615070343, + "learning_rate": 6.302561609653387e-05, + "loss": 0.0036, + "step": 3071 + }, + { + "epoch": 2.3738173392546824, + "grad_norm": 0.018362585455179214, + "learning_rate": 6.299956993708196e-05, + "loss": 0.004, + "step": 3072 + }, + { + "epoch": 2.3745896891291753, + "grad_norm": 0.015956103801727295, + "learning_rate": 6.297351999376934e-05, + "loss": 0.0036, + "step": 3073 + }, + { + "epoch": 2.3753620390036687, + "grad_norm": 0.009392969310283661, + "learning_rate": 6.294746627417853e-05, + "loss": 0.0034, + "step": 3074 + }, + { + "epoch": 2.3761343888781616, + "grad_norm": 0.013169806450605392, + "learning_rate": 6.292140878589315e-05, + "loss": 0.0042, + "step": 3075 + }, + { + "epoch": 2.376906738752655, + "grad_norm": 0.023034386336803436, + "learning_rate": 6.289534753649788e-05, + "loss": 0.005, + "step": 3076 + }, + { + "epoch": 2.377679088627148, + "grad_norm": 0.011399516835808754, + "learning_rate": 6.286928253357856e-05, + "loss": 0.0039, + "step": 3077 + }, + { + "epoch": 2.3784514385016413, + "grad_norm": 0.010855674743652344, + "learning_rate": 6.284321378472204e-05, + "loss": 0.0034, + "step": 3078 + }, + { + "epoch": 2.3792237883761342, + "grad_norm": 0.007988915778696537, + "learning_rate": 6.281714129751632e-05, + "loss": 0.0034, + "step": 3079 + }, + { + "epoch": 2.3799961382506276, + "grad_norm": 0.014476333744823933, + "learning_rate": 6.279106507955049e-05, + "loss": 0.0038, + "step": 3080 + }, + { + "epoch": 2.3807684881251205, + "grad_norm": 0.011578804813325405, + "learning_rate": 6.276498513841466e-05, + "loss": 0.0033, + "step": 3081 + }, + { + "epoch": 2.381540837999614, + "grad_norm": 0.01192422304302454, + "learning_rate": 6.273890148170011e-05, + "loss": 0.0041, + "step": 3082 + }, + { + "epoch": 2.382313187874107, + "grad_norm": 0.017417842522263527, + "learning_rate": 6.271281411699916e-05, + "loss": 0.0041, + "step": 3083 + }, + { + "epoch": 2.3830855377486, + "grad_norm": 0.018631063401699066, + "learning_rate": 6.268672305190519e-05, + "loss": 0.004, + "step": 3084 + }, + { + "epoch": 2.383857887623093, + "grad_norm": 0.010208629071712494, + "learning_rate": 6.266062829401271e-05, + "loss": 0.0041, + "step": 3085 + }, + { + "epoch": 2.3846302374975865, + "grad_norm": 0.020590389147400856, + "learning_rate": 6.263452985091725e-05, + "loss": 0.0035, + "step": 3086 + }, + { + "epoch": 2.3854025873720794, + "grad_norm": 0.030716968700289726, + "learning_rate": 6.260842773021543e-05, + "loss": 0.0041, + "step": 3087 + }, + { + "epoch": 2.386174937246573, + "grad_norm": 0.014777534641325474, + "learning_rate": 6.258232193950497e-05, + "loss": 0.0032, + "step": 3088 + }, + { + "epoch": 2.3869472871210657, + "grad_norm": 0.008989118970930576, + "learning_rate": 6.255621248638461e-05, + "loss": 0.0039, + "step": 3089 + }, + { + "epoch": 2.387719636995559, + "grad_norm": 0.020442666485905647, + "learning_rate": 6.253009937845422e-05, + "loss": 0.0039, + "step": 3090 + }, + { + "epoch": 2.388491986870052, + "grad_norm": 0.02745845727622509, + "learning_rate": 6.250398262331463e-05, + "loss": 0.0038, + "step": 3091 + }, + { + "epoch": 2.3892643367445454, + "grad_norm": 0.010492071509361267, + "learning_rate": 6.247786222856786e-05, + "loss": 0.0044, + "step": 3092 + }, + { + "epoch": 2.3900366866190383, + "grad_norm": 0.008296754211187363, + "learning_rate": 6.245173820181691e-05, + "loss": 0.0039, + "step": 3093 + }, + { + "epoch": 2.3908090364935317, + "grad_norm": 0.01965721696615219, + "learning_rate": 6.242561055066581e-05, + "loss": 0.0037, + "step": 3094 + }, + { + "epoch": 2.3915813863680246, + "grad_norm": 0.023441633209586143, + "learning_rate": 6.239947928271974e-05, + "loss": 0.004, + "step": 3095 + }, + { + "epoch": 2.392353736242518, + "grad_norm": 0.010678118094801903, + "learning_rate": 6.237334440558487e-05, + "loss": 0.0041, + "step": 3096 + }, + { + "epoch": 2.393126086117011, + "grad_norm": 0.01498490385711193, + "learning_rate": 6.234720592686842e-05, + "loss": 0.0036, + "step": 3097 + }, + { + "epoch": 2.3938984359915043, + "grad_norm": 0.011694070883095264, + "learning_rate": 6.232106385417866e-05, + "loss": 0.0037, + "step": 3098 + }, + { + "epoch": 2.3946707858659972, + "grad_norm": 0.018943822011351585, + "learning_rate": 6.229491819512494e-05, + "loss": 0.0043, + "step": 3099 + }, + { + "epoch": 2.3954431357404906, + "grad_norm": 0.008188261650502682, + "learning_rate": 6.226876895731761e-05, + "loss": 0.0038, + "step": 3100 + }, + { + "epoch": 2.3962154856149835, + "grad_norm": 0.014174428768455982, + "learning_rate": 6.22426161483681e-05, + "loss": 0.0033, + "step": 3101 + }, + { + "epoch": 2.396987835489477, + "grad_norm": 0.021323859691619873, + "learning_rate": 6.221645977588885e-05, + "loss": 0.0039, + "step": 3102 + }, + { + "epoch": 2.39776018536397, + "grad_norm": 0.017304735258221626, + "learning_rate": 6.219029984749334e-05, + "loss": 0.0031, + "step": 3103 + }, + { + "epoch": 2.398532535238463, + "grad_norm": 0.011139573529362679, + "learning_rate": 6.216413637079611e-05, + "loss": 0.0037, + "step": 3104 + }, + { + "epoch": 2.399304885112956, + "grad_norm": 0.01064131036400795, + "learning_rate": 6.213796935341269e-05, + "loss": 0.0042, + "step": 3105 + }, + { + "epoch": 2.4000772349874495, + "grad_norm": 0.01577679067850113, + "learning_rate": 6.211179880295971e-05, + "loss": 0.0034, + "step": 3106 + }, + { + "epoch": 2.4008495848619424, + "grad_norm": 0.009980639442801476, + "learning_rate": 6.208562472705474e-05, + "loss": 0.0039, + "step": 3107 + }, + { + "epoch": 2.4016219347364354, + "grad_norm": 0.007531393319368362, + "learning_rate": 6.205944713331644e-05, + "loss": 0.0032, + "step": 3108 + }, + { + "epoch": 2.4023942846109287, + "grad_norm": 0.007788199465721846, + "learning_rate": 6.203326602936448e-05, + "loss": 0.0034, + "step": 3109 + }, + { + "epoch": 2.403166634485422, + "grad_norm": 0.01703229732811451, + "learning_rate": 6.200708142281954e-05, + "loss": 0.0042, + "step": 3110 + }, + { + "epoch": 2.403938984359915, + "grad_norm": 0.015591508708894253, + "learning_rate": 6.198089332130332e-05, + "loss": 0.004, + "step": 3111 + }, + { + "epoch": 2.404711334234408, + "grad_norm": 0.008327793329954147, + "learning_rate": 6.195470173243857e-05, + "loss": 0.0037, + "step": 3112 + }, + { + "epoch": 2.4054836841089013, + "grad_norm": 0.013708319514989853, + "learning_rate": 6.1928506663849e-05, + "loss": 0.0034, + "step": 3113 + }, + { + "epoch": 2.4062560339833947, + "grad_norm": 0.016529733315110207, + "learning_rate": 6.190230812315936e-05, + "loss": 0.004, + "step": 3114 + }, + { + "epoch": 2.4070283838578876, + "grad_norm": 0.011277355253696442, + "learning_rate": 6.187610611799544e-05, + "loss": 0.004, + "step": 3115 + }, + { + "epoch": 2.4078007337323806, + "grad_norm": 0.009526095353066921, + "learning_rate": 6.184990065598397e-05, + "loss": 0.0038, + "step": 3116 + }, + { + "epoch": 2.408573083606874, + "grad_norm": 0.008797888644039631, + "learning_rate": 6.182369174475278e-05, + "loss": 0.0035, + "step": 3117 + }, + { + "epoch": 2.409345433481367, + "grad_norm": 0.007940378971397877, + "learning_rate": 6.179747939193062e-05, + "loss": 0.0042, + "step": 3118 + }, + { + "epoch": 2.4101177833558602, + "grad_norm": 0.01933273859322071, + "learning_rate": 6.177126360514725e-05, + "loss": 0.004, + "step": 3119 + }, + { + "epoch": 2.410890133230353, + "grad_norm": 0.011832729913294315, + "learning_rate": 6.174504439203349e-05, + "loss": 0.004, + "step": 3120 + }, + { + "epoch": 2.4116624831048465, + "grad_norm": 0.009274031966924667, + "learning_rate": 6.17188217602211e-05, + "loss": 0.0038, + "step": 3121 + }, + { + "epoch": 2.4124348329793395, + "grad_norm": 0.021085087209939957, + "learning_rate": 6.169259571734286e-05, + "loss": 0.0039, + "step": 3122 + }, + { + "epoch": 2.413207182853833, + "grad_norm": 0.00877199973911047, + "learning_rate": 6.166636627103256e-05, + "loss": 0.0035, + "step": 3123 + }, + { + "epoch": 2.4139795327283258, + "grad_norm": 0.01439269445836544, + "learning_rate": 6.16401334289249e-05, + "loss": 0.0039, + "step": 3124 + }, + { + "epoch": 2.414751882602819, + "grad_norm": 0.009551884606480598, + "learning_rate": 6.16138971986557e-05, + "loss": 0.0043, + "step": 3125 + }, + { + "epoch": 2.415524232477312, + "grad_norm": 0.014275209046900272, + "learning_rate": 6.158765758786162e-05, + "loss": 0.0042, + "step": 3126 + }, + { + "epoch": 2.4162965823518054, + "grad_norm": 0.023969031870365143, + "learning_rate": 6.156141460418042e-05, + "loss": 0.0042, + "step": 3127 + }, + { + "epoch": 2.4170689322262984, + "grad_norm": 0.007481127046048641, + "learning_rate": 6.15351682552508e-05, + "loss": 0.0038, + "step": 3128 + }, + { + "epoch": 2.4178412821007917, + "grad_norm": 0.027742892503738403, + "learning_rate": 6.150891854871243e-05, + "loss": 0.0037, + "step": 3129 + }, + { + "epoch": 2.4186136319752847, + "grad_norm": 0.011066229082643986, + "learning_rate": 6.148266549220594e-05, + "loss": 0.004, + "step": 3130 + }, + { + "epoch": 2.419385981849778, + "grad_norm": 0.008486258797347546, + "learning_rate": 6.145640909337302e-05, + "loss": 0.0039, + "step": 3131 + }, + { + "epoch": 2.420158331724271, + "grad_norm": 0.013606767170131207, + "learning_rate": 6.14301493598562e-05, + "loss": 0.0044, + "step": 3132 + }, + { + "epoch": 2.4209306815987643, + "grad_norm": 0.014068580232560635, + "learning_rate": 6.140388629929912e-05, + "loss": 0.0036, + "step": 3133 + }, + { + "epoch": 2.4217030314732573, + "grad_norm": 0.007964743301272392, + "learning_rate": 6.137761991934628e-05, + "loss": 0.0036, + "step": 3134 + }, + { + "epoch": 2.4224753813477506, + "grad_norm": 0.010003727860748768, + "learning_rate": 6.13513502276432e-05, + "loss": 0.0038, + "step": 3135 + }, + { + "epoch": 2.4232477312222436, + "grad_norm": 0.0116347037255764, + "learning_rate": 6.132507723183635e-05, + "loss": 0.0038, + "step": 3136 + }, + { + "epoch": 2.424020081096737, + "grad_norm": 0.013268062844872475, + "learning_rate": 6.129880093957316e-05, + "loss": 0.0042, + "step": 3137 + }, + { + "epoch": 2.42479243097123, + "grad_norm": 0.009738652966916561, + "learning_rate": 6.127252135850206e-05, + "loss": 0.0037, + "step": 3138 + }, + { + "epoch": 2.4255647808457232, + "grad_norm": 0.009008978493511677, + "learning_rate": 6.124623849627236e-05, + "loss": 0.0031, + "step": 3139 + }, + { + "epoch": 2.426337130720216, + "grad_norm": 0.008294097147881985, + "learning_rate": 6.121995236053437e-05, + "loss": 0.0039, + "step": 3140 + }, + { + "epoch": 2.4271094805947095, + "grad_norm": 0.011128397658467293, + "learning_rate": 6.119366295893935e-05, + "loss": 0.0042, + "step": 3141 + }, + { + "epoch": 2.4278818304692025, + "grad_norm": 0.010329133830964565, + "learning_rate": 6.116737029913954e-05, + "loss": 0.0035, + "step": 3142 + }, + { + "epoch": 2.428654180343696, + "grad_norm": 0.011560085229575634, + "learning_rate": 6.114107438878807e-05, + "loss": 0.004, + "step": 3143 + }, + { + "epoch": 2.4294265302181888, + "grad_norm": 0.008150237612426281, + "learning_rate": 6.111477523553905e-05, + "loss": 0.0035, + "step": 3144 + }, + { + "epoch": 2.430198880092682, + "grad_norm": 0.008369138464331627, + "learning_rate": 6.108847284704752e-05, + "loss": 0.0036, + "step": 3145 + }, + { + "epoch": 2.430971229967175, + "grad_norm": 0.019015392288565636, + "learning_rate": 6.106216723096948e-05, + "loss": 0.004, + "step": 3146 + }, + { + "epoch": 2.4317435798416684, + "grad_norm": 0.013174445368349552, + "learning_rate": 6.103585839496187e-05, + "loss": 0.0031, + "step": 3147 + }, + { + "epoch": 2.4325159297161614, + "grad_norm": 0.009038873016834259, + "learning_rate": 6.1009546346682533e-05, + "loss": 0.0035, + "step": 3148 + }, + { + "epoch": 2.4332882795906547, + "grad_norm": 0.016160445287823677, + "learning_rate": 6.098323109379027e-05, + "loss": 0.0034, + "step": 3149 + }, + { + "epoch": 2.4340606294651477, + "grad_norm": 0.01445263996720314, + "learning_rate": 6.0956912643944843e-05, + "loss": 0.0035, + "step": 3150 + }, + { + "epoch": 2.434832979339641, + "grad_norm": 0.007858852855861187, + "learning_rate": 6.093059100480688e-05, + "loss": 0.0037, + "step": 3151 + }, + { + "epoch": 2.435605329214134, + "grad_norm": 0.008782031945884228, + "learning_rate": 6.090426618403802e-05, + "loss": 0.004, + "step": 3152 + }, + { + "epoch": 2.4363776790886273, + "grad_norm": 0.019404729828238487, + "learning_rate": 6.087793818930074e-05, + "loss": 0.0041, + "step": 3153 + }, + { + "epoch": 2.4371500289631203, + "grad_norm": 0.008168632164597511, + "learning_rate": 6.0851607028258494e-05, + "loss": 0.0037, + "step": 3154 + }, + { + "epoch": 2.437922378837613, + "grad_norm": 0.010241009294986725, + "learning_rate": 6.082527270857565e-05, + "loss": 0.0038, + "step": 3155 + }, + { + "epoch": 2.4386947287121066, + "grad_norm": 0.00815881323069334, + "learning_rate": 6.079893523791751e-05, + "loss": 0.003, + "step": 3156 + }, + { + "epoch": 2.4394670785866, + "grad_norm": 0.009282760322093964, + "learning_rate": 6.077259462395025e-05, + "loss": 0.0039, + "step": 3157 + }, + { + "epoch": 2.440239428461093, + "grad_norm": 0.009555073454976082, + "learning_rate": 6.074625087434101e-05, + "loss": 0.0038, + "step": 3158 + }, + { + "epoch": 2.441011778335586, + "grad_norm": 0.00916551798582077, + "learning_rate": 6.071990399675779e-05, + "loss": 0.0037, + "step": 3159 + }, + { + "epoch": 2.441784128210079, + "grad_norm": 0.009903534315526485, + "learning_rate": 6.069355399886955e-05, + "loss": 0.0038, + "step": 3160 + }, + { + "epoch": 2.4425564780845725, + "grad_norm": 0.01959330402314663, + "learning_rate": 6.066720088834612e-05, + "loss": 0.0037, + "step": 3161 + }, + { + "epoch": 2.4433288279590655, + "grad_norm": 0.011863019317388535, + "learning_rate": 6.064084467285828e-05, + "loss": 0.004, + "step": 3162 + }, + { + "epoch": 2.4441011778335584, + "grad_norm": 0.010247189551591873, + "learning_rate": 6.0614485360077656e-05, + "loss": 0.0039, + "step": 3163 + }, + { + "epoch": 2.4448735277080518, + "grad_norm": 0.012665782123804092, + "learning_rate": 6.058812295767684e-05, + "loss": 0.0036, + "step": 3164 + }, + { + "epoch": 2.4456458775825447, + "grad_norm": 0.01594417728483677, + "learning_rate": 6.0561757473329264e-05, + "loss": 0.0043, + "step": 3165 + }, + { + "epoch": 2.446418227457038, + "grad_norm": 0.008561563678085804, + "learning_rate": 6.05353889147093e-05, + "loss": 0.0034, + "step": 3166 + }, + { + "epoch": 2.447190577331531, + "grad_norm": 0.008343924768269062, + "learning_rate": 6.0509017289492184e-05, + "loss": 0.0034, + "step": 3167 + }, + { + "epoch": 2.4479629272060244, + "grad_norm": 0.012697969563305378, + "learning_rate": 6.0482642605354075e-05, + "loss": 0.0038, + "step": 3168 + }, + { + "epoch": 2.4487352770805173, + "grad_norm": 0.018456846475601196, + "learning_rate": 6.0456264869972e-05, + "loss": 0.0038, + "step": 3169 + }, + { + "epoch": 2.4495076269550107, + "grad_norm": 0.007918601855635643, + "learning_rate": 6.0429884091023867e-05, + "loss": 0.0038, + "step": 3170 + }, + { + "epoch": 2.4502799768295036, + "grad_norm": 0.014714458957314491, + "learning_rate": 6.0403500276188494e-05, + "loss": 0.0037, + "step": 3171 + }, + { + "epoch": 2.451052326703997, + "grad_norm": 0.018335428088903427, + "learning_rate": 6.0377113433145596e-05, + "loss": 0.0038, + "step": 3172 + }, + { + "epoch": 2.45182467657849, + "grad_norm": 0.016228122636675835, + "learning_rate": 6.03507235695757e-05, + "loss": 0.0038, + "step": 3173 + }, + { + "epoch": 2.4525970264529833, + "grad_norm": 0.008822061121463776, + "learning_rate": 6.032433069316028e-05, + "loss": 0.0031, + "step": 3174 + }, + { + "epoch": 2.453369376327476, + "grad_norm": 0.01898917555809021, + "learning_rate": 6.029793481158168e-05, + "loss": 0.0042, + "step": 3175 + }, + { + "epoch": 2.4541417262019696, + "grad_norm": 0.020056406036019325, + "learning_rate": 6.027153593252306e-05, + "loss": 0.004, + "step": 3176 + }, + { + "epoch": 2.4549140760764625, + "grad_norm": 0.011279793456196785, + "learning_rate": 6.024513406366855e-05, + "loss": 0.0035, + "step": 3177 + }, + { + "epoch": 2.455686425950956, + "grad_norm": 0.014008576981723309, + "learning_rate": 6.0218729212703064e-05, + "loss": 0.0033, + "step": 3178 + }, + { + "epoch": 2.456458775825449, + "grad_norm": 0.01047800574451685, + "learning_rate": 6.019232138731241e-05, + "loss": 0.004, + "step": 3179 + }, + { + "epoch": 2.457231125699942, + "grad_norm": 0.009189547039568424, + "learning_rate": 6.016591059518329e-05, + "loss": 0.0039, + "step": 3180 + }, + { + "epoch": 2.458003475574435, + "grad_norm": 0.008353308774530888, + "learning_rate": 6.013949684400323e-05, + "loss": 0.0036, + "step": 3181 + }, + { + "epoch": 2.4587758254489285, + "grad_norm": 0.007851125672459602, + "learning_rate": 6.0113080141460646e-05, + "loss": 0.0032, + "step": 3182 + }, + { + "epoch": 2.4595481753234214, + "grad_norm": 0.011206655763089657, + "learning_rate": 6.008666049524481e-05, + "loss": 0.0037, + "step": 3183 + }, + { + "epoch": 2.4603205251979148, + "grad_norm": 0.013563932850956917, + "learning_rate": 6.00602379130458e-05, + "loss": 0.0034, + "step": 3184 + }, + { + "epoch": 2.4610928750724077, + "grad_norm": 0.008387591689825058, + "learning_rate": 6.003381240255466e-05, + "loss": 0.0039, + "step": 3185 + }, + { + "epoch": 2.461865224946901, + "grad_norm": 0.009753430262207985, + "learning_rate": 6.000738397146315e-05, + "loss": 0.0036, + "step": 3186 + }, + { + "epoch": 2.462637574821394, + "grad_norm": 0.009347907267510891, + "learning_rate": 5.998095262746397e-05, + "loss": 0.0036, + "step": 3187 + }, + { + "epoch": 2.4634099246958874, + "grad_norm": 0.010195978917181492, + "learning_rate": 5.995451837825068e-05, + "loss": 0.0037, + "step": 3188 + }, + { + "epoch": 2.4641822745703803, + "grad_norm": 0.008878756314516068, + "learning_rate": 5.99280812315176e-05, + "loss": 0.0034, + "step": 3189 + }, + { + "epoch": 2.4649546244448737, + "grad_norm": 0.010437872260808945, + "learning_rate": 5.9901641194959976e-05, + "loss": 0.0038, + "step": 3190 + }, + { + "epoch": 2.4657269743193666, + "grad_norm": 0.012879699468612671, + "learning_rate": 5.987519827627387e-05, + "loss": 0.0039, + "step": 3191 + }, + { + "epoch": 2.46649932419386, + "grad_norm": 0.007131462451070547, + "learning_rate": 5.9848752483156135e-05, + "loss": 0.0035, + "step": 3192 + }, + { + "epoch": 2.467271674068353, + "grad_norm": 0.009347299113869667, + "learning_rate": 5.9822303823304545e-05, + "loss": 0.0036, + "step": 3193 + }, + { + "epoch": 2.4680440239428463, + "grad_norm": 0.009807485155761242, + "learning_rate": 5.979585230441764e-05, + "loss": 0.0034, + "step": 3194 + }, + { + "epoch": 2.468816373817339, + "grad_norm": 0.010532918386161327, + "learning_rate": 5.976939793419485e-05, + "loss": 0.0039, + "step": 3195 + }, + { + "epoch": 2.4695887236918326, + "grad_norm": 0.008854050189256668, + "learning_rate": 5.974294072033637e-05, + "loss": 0.0036, + "step": 3196 + }, + { + "epoch": 2.4703610735663255, + "grad_norm": 0.012353861704468727, + "learning_rate": 5.971648067054326e-05, + "loss": 0.0034, + "step": 3197 + }, + { + "epoch": 2.471133423440819, + "grad_norm": 0.01505427435040474, + "learning_rate": 5.969001779251742e-05, + "loss": 0.0035, + "step": 3198 + }, + { + "epoch": 2.471905773315312, + "grad_norm": 0.01005632895976305, + "learning_rate": 5.966355209396154e-05, + "loss": 0.0044, + "step": 3199 + }, + { + "epoch": 2.472678123189805, + "grad_norm": 0.018703538924455643, + "learning_rate": 5.9637083582579136e-05, + "loss": 0.0039, + "step": 3200 + }, + { + "epoch": 2.473450473064298, + "grad_norm": 0.008641044609248638, + "learning_rate": 5.9610612266074585e-05, + "loss": 0.0041, + "step": 3201 + }, + { + "epoch": 2.474222822938791, + "grad_norm": 0.013356814160943031, + "learning_rate": 5.9584138152153004e-05, + "loss": 0.0037, + "step": 3202 + }, + { + "epoch": 2.4749951728132844, + "grad_norm": 0.017683392390608788, + "learning_rate": 5.95576612485204e-05, + "loss": 0.0044, + "step": 3203 + }, + { + "epoch": 2.4757675226877778, + "grad_norm": 0.011670832522213459, + "learning_rate": 5.953118156288355e-05, + "loss": 0.004, + "step": 3204 + }, + { + "epoch": 2.4765398725622707, + "grad_norm": 0.008941936306655407, + "learning_rate": 5.9504699102950055e-05, + "loss": 0.0031, + "step": 3205 + }, + { + "epoch": 2.4773122224367636, + "grad_norm": 0.019912226125597954, + "learning_rate": 5.9478213876428316e-05, + "loss": 0.0035, + "step": 3206 + }, + { + "epoch": 2.478084572311257, + "grad_norm": 0.009334501810371876, + "learning_rate": 5.945172589102754e-05, + "loss": 0.0041, + "step": 3207 + }, + { + "epoch": 2.4788569221857504, + "grad_norm": 0.010612251237034798, + "learning_rate": 5.942523515445775e-05, + "loss": 0.0038, + "step": 3208 + }, + { + "epoch": 2.4796292720602433, + "grad_norm": 0.016206124797463417, + "learning_rate": 5.9398741674429726e-05, + "loss": 0.004, + "step": 3209 + }, + { + "epoch": 2.4804016219347362, + "grad_norm": 0.021283496171236038, + "learning_rate": 5.937224545865513e-05, + "loss": 0.0041, + "step": 3210 + }, + { + "epoch": 2.4811739718092296, + "grad_norm": 0.008566698990762234, + "learning_rate": 5.934574651484633e-05, + "loss": 0.0036, + "step": 3211 + }, + { + "epoch": 2.4819463216837225, + "grad_norm": 0.011161359958350658, + "learning_rate": 5.931924485071655e-05, + "loss": 0.0039, + "step": 3212 + }, + { + "epoch": 2.482718671558216, + "grad_norm": 0.015776904299855232, + "learning_rate": 5.929274047397977e-05, + "loss": 0.004, + "step": 3213 + }, + { + "epoch": 2.483491021432709, + "grad_norm": 0.008733643218874931, + "learning_rate": 5.926623339235078e-05, + "loss": 0.0036, + "step": 3214 + }, + { + "epoch": 2.484263371307202, + "grad_norm": 0.012375583872199059, + "learning_rate": 5.923972361354516e-05, + "loss": 0.0039, + "step": 3215 + }, + { + "epoch": 2.485035721181695, + "grad_norm": 0.00923305656760931, + "learning_rate": 5.921321114527926e-05, + "loss": 0.0038, + "step": 3216 + }, + { + "epoch": 2.4858080710561885, + "grad_norm": 0.009024463593959808, + "learning_rate": 5.918669599527019e-05, + "loss": 0.0039, + "step": 3217 + }, + { + "epoch": 2.4865804209306814, + "grad_norm": 0.014953624457120895, + "learning_rate": 5.9160178171235926e-05, + "loss": 0.0041, + "step": 3218 + }, + { + "epoch": 2.487352770805175, + "grad_norm": 0.011638659983873367, + "learning_rate": 5.9133657680895114e-05, + "loss": 0.0038, + "step": 3219 + }, + { + "epoch": 2.4881251206796677, + "grad_norm": 0.011053145863115788, + "learning_rate": 5.910713453196727e-05, + "loss": 0.0033, + "step": 3220 + }, + { + "epoch": 2.488897470554161, + "grad_norm": 0.01029958389699459, + "learning_rate": 5.9080608732172617e-05, + "loss": 0.0034, + "step": 3221 + }, + { + "epoch": 2.489669820428654, + "grad_norm": 0.008136557415127754, + "learning_rate": 5.905408028923216e-05, + "loss": 0.0038, + "step": 3222 + }, + { + "epoch": 2.4904421703031474, + "grad_norm": 0.008826378732919693, + "learning_rate": 5.9027549210867725e-05, + "loss": 0.0034, + "step": 3223 + }, + { + "epoch": 2.4912145201776403, + "grad_norm": 0.015251542441546917, + "learning_rate": 5.900101550480185e-05, + "loss": 0.0038, + "step": 3224 + }, + { + "epoch": 2.4919868700521337, + "grad_norm": 0.008893954567611217, + "learning_rate": 5.8974479178757845e-05, + "loss": 0.0033, + "step": 3225 + }, + { + "epoch": 2.4927592199266266, + "grad_norm": 0.010919424705207348, + "learning_rate": 5.894794024045982e-05, + "loss": 0.0036, + "step": 3226 + }, + { + "epoch": 2.49353156980112, + "grad_norm": 0.02282760478556156, + "learning_rate": 5.892139869763258e-05, + "loss": 0.004, + "step": 3227 + }, + { + "epoch": 2.494303919675613, + "grad_norm": 0.012096663005650043, + "learning_rate": 5.8894854558001756e-05, + "loss": 0.0033, + "step": 3228 + }, + { + "epoch": 2.4950762695501063, + "grad_norm": 0.022534744814038277, + "learning_rate": 5.88683078292937e-05, + "loss": 0.0042, + "step": 3229 + }, + { + "epoch": 2.4958486194245992, + "grad_norm": 0.01121017150580883, + "learning_rate": 5.884175851923552e-05, + "loss": 0.0038, + "step": 3230 + }, + { + "epoch": 2.4966209692990926, + "grad_norm": 0.013400291092693806, + "learning_rate": 5.881520663555509e-05, + "loss": 0.0037, + "step": 3231 + }, + { + "epoch": 2.4973933191735855, + "grad_norm": 0.008116140030324459, + "learning_rate": 5.878865218598101e-05, + "loss": 0.0034, + "step": 3232 + }, + { + "epoch": 2.498165669048079, + "grad_norm": 0.012089090421795845, + "learning_rate": 5.876209517824264e-05, + "loss": 0.0034, + "step": 3233 + }, + { + "epoch": 2.498938018922572, + "grad_norm": 0.014870635233819485, + "learning_rate": 5.873553562007008e-05, + "loss": 0.0037, + "step": 3234 + }, + { + "epoch": 2.499710368797065, + "grad_norm": 0.010396681725978851, + "learning_rate": 5.8708973519194174e-05, + "loss": 0.004, + "step": 3235 + }, + { + "epoch": 2.500482718671558, + "grad_norm": 0.009356563910841942, + "learning_rate": 5.868240888334653e-05, + "loss": 0.0037, + "step": 3236 + }, + { + "epoch": 2.5012550685460515, + "grad_norm": 0.014082294888794422, + "learning_rate": 5.8655841720259444e-05, + "loss": 0.0036, + "step": 3237 + }, + { + "epoch": 2.5020274184205444, + "grad_norm": 0.013190063647925854, + "learning_rate": 5.8629272037665984e-05, + "loss": 0.0036, + "step": 3238 + }, + { + "epoch": 2.502799768295038, + "grad_norm": 0.011822250671684742, + "learning_rate": 5.860269984329995e-05, + "loss": 0.0045, + "step": 3239 + }, + { + "epoch": 2.5035721181695307, + "grad_norm": 0.01531610544770956, + "learning_rate": 5.857612514489585e-05, + "loss": 0.0036, + "step": 3240 + }, + { + "epoch": 2.504344468044024, + "grad_norm": 0.011635066010057926, + "learning_rate": 5.8549547950188964e-05, + "loss": 0.0037, + "step": 3241 + }, + { + "epoch": 2.505116817918517, + "grad_norm": 0.007300242781639099, + "learning_rate": 5.852296826691525e-05, + "loss": 0.0031, + "step": 3242 + }, + { + "epoch": 2.5058891677930104, + "grad_norm": 0.009953463450074196, + "learning_rate": 5.849638610281141e-05, + "loss": 0.0041, + "step": 3243 + }, + { + "epoch": 2.5066615176675033, + "grad_norm": 0.009325490333139896, + "learning_rate": 5.846980146561486e-05, + "loss": 0.0041, + "step": 3244 + }, + { + "epoch": 2.5074338675419963, + "grad_norm": 0.010487103834748268, + "learning_rate": 5.8443214363063795e-05, + "loss": 0.0037, + "step": 3245 + }, + { + "epoch": 2.5082062174164896, + "grad_norm": 0.01220545545220375, + "learning_rate": 5.8416624802897026e-05, + "loss": 0.0038, + "step": 3246 + }, + { + "epoch": 2.508978567290983, + "grad_norm": 0.012766262516379356, + "learning_rate": 5.8390032792854134e-05, + "loss": 0.0042, + "step": 3247 + }, + { + "epoch": 2.509750917165476, + "grad_norm": 0.007656930014491081, + "learning_rate": 5.836343834067546e-05, + "loss": 0.0046, + "step": 3248 + }, + { + "epoch": 2.510523267039969, + "grad_norm": 0.010122626088559628, + "learning_rate": 5.8336841454101945e-05, + "loss": 0.0043, + "step": 3249 + }, + { + "epoch": 2.5112956169144622, + "grad_norm": 0.011598845943808556, + "learning_rate": 5.831024214087534e-05, + "loss": 0.0038, + "step": 3250 + }, + { + "epoch": 2.5120679667889556, + "grad_norm": 0.01554699894040823, + "learning_rate": 5.828364040873806e-05, + "loss": 0.0041, + "step": 3251 + }, + { + "epoch": 2.5128403166634485, + "grad_norm": 0.010791851207613945, + "learning_rate": 5.82570362654332e-05, + "loss": 0.0042, + "step": 3252 + }, + { + "epoch": 2.5136126665379415, + "grad_norm": 0.009610738605260849, + "learning_rate": 5.8230429718704606e-05, + "loss": 0.0037, + "step": 3253 + }, + { + "epoch": 2.514385016412435, + "grad_norm": 0.011307166889309883, + "learning_rate": 5.820382077629678e-05, + "loss": 0.0041, + "step": 3254 + }, + { + "epoch": 2.515157366286928, + "grad_norm": 0.012049585580825806, + "learning_rate": 5.817720944595497e-05, + "loss": 0.0039, + "step": 3255 + }, + { + "epoch": 2.515929716161421, + "grad_norm": 0.01084988471120596, + "learning_rate": 5.815059573542509e-05, + "loss": 0.0042, + "step": 3256 + }, + { + "epoch": 2.516702066035914, + "grad_norm": 0.0171855129301548, + "learning_rate": 5.812397965245372e-05, + "loss": 0.0032, + "step": 3257 + }, + { + "epoch": 2.5174744159104074, + "grad_norm": 0.014846454374492168, + "learning_rate": 5.809736120478817e-05, + "loss": 0.0042, + "step": 3258 + }, + { + "epoch": 2.518246765784901, + "grad_norm": 0.012940598651766777, + "learning_rate": 5.807074040017645e-05, + "loss": 0.0038, + "step": 3259 + }, + { + "epoch": 2.5190191156593937, + "grad_norm": 0.008697236888110638, + "learning_rate": 5.8044117246367205e-05, + "loss": 0.0035, + "step": 3260 + }, + { + "epoch": 2.5197914655338867, + "grad_norm": 0.023681191727519035, + "learning_rate": 5.80174917511098e-05, + "loss": 0.0039, + "step": 3261 + }, + { + "epoch": 2.52056381540838, + "grad_norm": 0.009607069194316864, + "learning_rate": 5.799086392215427e-05, + "loss": 0.0036, + "step": 3262 + }, + { + "epoch": 2.5213361652828734, + "grad_norm": 0.009420047514140606, + "learning_rate": 5.7964233767251354e-05, + "loss": 0.0036, + "step": 3263 + }, + { + "epoch": 2.5221085151573663, + "grad_norm": 0.0105622923001647, + "learning_rate": 5.793760129415241e-05, + "loss": 0.0041, + "step": 3264 + }, + { + "epoch": 2.5228808650318593, + "grad_norm": 0.012450242415070534, + "learning_rate": 5.791096651060954e-05, + "loss": 0.0038, + "step": 3265 + }, + { + "epoch": 2.5236532149063526, + "grad_norm": 0.008220894262194633, + "learning_rate": 5.788432942437547e-05, + "loss": 0.0036, + "step": 3266 + }, + { + "epoch": 2.5244255647808456, + "grad_norm": 0.009492254815995693, + "learning_rate": 5.785769004320362e-05, + "loss": 0.0035, + "step": 3267 + }, + { + "epoch": 2.525197914655339, + "grad_norm": 0.0072402735240757465, + "learning_rate": 5.7831048374848055e-05, + "loss": 0.0038, + "step": 3268 + }, + { + "epoch": 2.525970264529832, + "grad_norm": 0.01509221363812685, + "learning_rate": 5.780440442706354e-05, + "loss": 0.0034, + "step": 3269 + }, + { + "epoch": 2.5267426144043252, + "grad_norm": 0.008010455407202244, + "learning_rate": 5.777775820760547e-05, + "loss": 0.0031, + "step": 3270 + }, + { + "epoch": 2.527514964278818, + "grad_norm": 0.008865603245794773, + "learning_rate": 5.775110972422992e-05, + "loss": 0.004, + "step": 3271 + }, + { + "epoch": 2.5282873141533115, + "grad_norm": 0.011722778901457787, + "learning_rate": 5.772445898469363e-05, + "loss": 0.0042, + "step": 3272 + }, + { + "epoch": 2.5290596640278045, + "grad_norm": 0.014061705209314823, + "learning_rate": 5.769780599675397e-05, + "loss": 0.0037, + "step": 3273 + }, + { + "epoch": 2.529832013902298, + "grad_norm": 0.011701086536049843, + "learning_rate": 5.767115076816898e-05, + "loss": 0.0037, + "step": 3274 + }, + { + "epoch": 2.5306043637767908, + "grad_norm": 0.0137869818136096, + "learning_rate": 5.764449330669738e-05, + "loss": 0.0039, + "step": 3275 + }, + { + "epoch": 2.531376713651284, + "grad_norm": 0.006596204359084368, + "learning_rate": 5.7617833620098495e-05, + "loss": 0.0034, + "step": 3276 + }, + { + "epoch": 2.532149063525777, + "grad_norm": 0.008754332549870014, + "learning_rate": 5.7591171716132285e-05, + "loss": 0.0036, + "step": 3277 + }, + { + "epoch": 2.5329214134002704, + "grad_norm": 0.011859533376991749, + "learning_rate": 5.7564507602559445e-05, + "loss": 0.0035, + "step": 3278 + }, + { + "epoch": 2.5336937632747634, + "grad_norm": 0.007447488605976105, + "learning_rate": 5.753784128714122e-05, + "loss": 0.004, + "step": 3279 + }, + { + "epoch": 2.5344661131492567, + "grad_norm": 0.01583264209330082, + "learning_rate": 5.751117277763953e-05, + "loss": 0.004, + "step": 3280 + }, + { + "epoch": 2.5352384630237497, + "grad_norm": 0.007798062637448311, + "learning_rate": 5.748450208181694e-05, + "loss": 0.0036, + "step": 3281 + }, + { + "epoch": 2.536010812898243, + "grad_norm": 0.013155490159988403, + "learning_rate": 5.745782920743663e-05, + "loss": 0.0038, + "step": 3282 + }, + { + "epoch": 2.536783162772736, + "grad_norm": 0.008394292555749416, + "learning_rate": 5.743115416226247e-05, + "loss": 0.0034, + "step": 3283 + }, + { + "epoch": 2.5375555126472293, + "grad_norm": 0.00787175353616476, + "learning_rate": 5.7404476954058864e-05, + "loss": 0.0034, + "step": 3284 + }, + { + "epoch": 2.5383278625217223, + "grad_norm": 0.008502156473696232, + "learning_rate": 5.7377797590590954e-05, + "loss": 0.0035, + "step": 3285 + }, + { + "epoch": 2.5391002123962156, + "grad_norm": 0.0073899938724935055, + "learning_rate": 5.7351116079624435e-05, + "loss": 0.0036, + "step": 3286 + }, + { + "epoch": 2.5398725622707086, + "grad_norm": 0.007314924616366625, + "learning_rate": 5.7324432428925643e-05, + "loss": 0.0036, + "step": 3287 + }, + { + "epoch": 2.540644912145202, + "grad_norm": 0.009392407722771168, + "learning_rate": 5.729774664626155e-05, + "loss": 0.0037, + "step": 3288 + }, + { + "epoch": 2.541417262019695, + "grad_norm": 0.009202550165355206, + "learning_rate": 5.727105873939975e-05, + "loss": 0.0037, + "step": 3289 + }, + { + "epoch": 2.5421896118941882, + "grad_norm": 0.01186344213783741, + "learning_rate": 5.7244368716108457e-05, + "loss": 0.0035, + "step": 3290 + }, + { + "epoch": 2.542961961768681, + "grad_norm": 0.00778360478579998, + "learning_rate": 5.7217676584156476e-05, + "loss": 0.0039, + "step": 3291 + }, + { + "epoch": 2.543734311643174, + "grad_norm": 0.010938924737274647, + "learning_rate": 5.7190982351313216e-05, + "loss": 0.0037, + "step": 3292 + }, + { + "epoch": 2.5445066615176675, + "grad_norm": 0.00829948391765356, + "learning_rate": 5.716428602534878e-05, + "loss": 0.0038, + "step": 3293 + }, + { + "epoch": 2.545279011392161, + "grad_norm": 0.01430403534322977, + "learning_rate": 5.7137587614033785e-05, + "loss": 0.0035, + "step": 3294 + }, + { + "epoch": 2.5460513612666538, + "grad_norm": 0.024432053789496422, + "learning_rate": 5.71108871251395e-05, + "loss": 0.0039, + "step": 3295 + }, + { + "epoch": 2.5468237111411467, + "grad_norm": 0.009824837557971478, + "learning_rate": 5.7084184566437794e-05, + "loss": 0.0035, + "step": 3296 + }, + { + "epoch": 2.54759606101564, + "grad_norm": 0.018679151311516762, + "learning_rate": 5.705747994570114e-05, + "loss": 0.0041, + "step": 3297 + }, + { + "epoch": 2.5483684108901334, + "grad_norm": 0.018965978175401688, + "learning_rate": 5.70307732707026e-05, + "loss": 0.0038, + "step": 3298 + }, + { + "epoch": 2.5491407607646264, + "grad_norm": 0.01286038663238287, + "learning_rate": 5.700406454921585e-05, + "loss": 0.0038, + "step": 3299 + }, + { + "epoch": 2.5499131106391193, + "grad_norm": 0.011224661022424698, + "learning_rate": 5.6977353789015154e-05, + "loss": 0.0036, + "step": 3300 + }, + { + "epoch": 2.5506854605136127, + "grad_norm": 0.01046342495828867, + "learning_rate": 5.695064099787537e-05, + "loss": 0.0034, + "step": 3301 + }, + { + "epoch": 2.551457810388106, + "grad_norm": 0.008502374403178692, + "learning_rate": 5.692392618357193e-05, + "loss": 0.004, + "step": 3302 + }, + { + "epoch": 2.552230160262599, + "grad_norm": 0.01280928310006857, + "learning_rate": 5.6897209353880885e-05, + "loss": 0.0043, + "step": 3303 + }, + { + "epoch": 2.553002510137092, + "grad_norm": 0.01096299383789301, + "learning_rate": 5.687049051657885e-05, + "loss": 0.0039, + "step": 3304 + }, + { + "epoch": 2.5537748600115853, + "grad_norm": 0.007984945550560951, + "learning_rate": 5.684376967944306e-05, + "loss": 0.004, + "step": 3305 + }, + { + "epoch": 2.5545472098860786, + "grad_norm": 0.00990123488008976, + "learning_rate": 5.681704685025127e-05, + "loss": 0.0036, + "step": 3306 + }, + { + "epoch": 2.5553195597605716, + "grad_norm": 0.009490500204265118, + "learning_rate": 5.679032203678186e-05, + "loss": 0.0037, + "step": 3307 + }, + { + "epoch": 2.5560919096350645, + "grad_norm": 0.012009241618216038, + "learning_rate": 5.6763595246813786e-05, + "loss": 0.0038, + "step": 3308 + }, + { + "epoch": 2.556864259509558, + "grad_norm": 0.009753881022334099, + "learning_rate": 5.673686648812655e-05, + "loss": 0.0032, + "step": 3309 + }, + { + "epoch": 2.5576366093840512, + "grad_norm": 0.008679524064064026, + "learning_rate": 5.6710135768500294e-05, + "loss": 0.0038, + "step": 3310 + }, + { + "epoch": 2.558408959258544, + "grad_norm": 0.00896353181451559, + "learning_rate": 5.668340309571564e-05, + "loss": 0.0037, + "step": 3311 + }, + { + "epoch": 2.559181309133037, + "grad_norm": 0.010172102600336075, + "learning_rate": 5.665666847755383e-05, + "loss": 0.0038, + "step": 3312 + }, + { + "epoch": 2.5599536590075305, + "grad_norm": 0.022809471935033798, + "learning_rate": 5.6629931921796686e-05, + "loss": 0.0038, + "step": 3313 + }, + { + "epoch": 2.5607260088820234, + "grad_norm": 0.01311578880995512, + "learning_rate": 5.660319343622654e-05, + "loss": 0.0037, + "step": 3314 + }, + { + "epoch": 2.5614983587565168, + "grad_norm": 0.013067226856946945, + "learning_rate": 5.6576453028626354e-05, + "loss": 0.0039, + "step": 3315 + }, + { + "epoch": 2.5622707086310097, + "grad_norm": 0.01835964061319828, + "learning_rate": 5.654971070677961e-05, + "loss": 0.004, + "step": 3316 + }, + { + "epoch": 2.563043058505503, + "grad_norm": 0.034658029675483704, + "learning_rate": 5.652296647847032e-05, + "loss": 0.004, + "step": 3317 + }, + { + "epoch": 2.563815408379996, + "grad_norm": 0.009408431127667427, + "learning_rate": 5.649622035148312e-05, + "loss": 0.0042, + "step": 3318 + }, + { + "epoch": 2.5645877582544894, + "grad_norm": 0.026709269732236862, + "learning_rate": 5.6469472333603136e-05, + "loss": 0.0041, + "step": 3319 + }, + { + "epoch": 2.5653601081289823, + "grad_norm": 0.030673233792185783, + "learning_rate": 5.644272243261608e-05, + "loss": 0.004, + "step": 3320 + }, + { + "epoch": 2.5661324580034757, + "grad_norm": 0.011289527639746666, + "learning_rate": 5.6415970656308213e-05, + "loss": 0.0039, + "step": 3321 + }, + { + "epoch": 2.5669048078779686, + "grad_norm": 0.016593938693404198, + "learning_rate": 5.63892170124663e-05, + "loss": 0.0038, + "step": 3322 + }, + { + "epoch": 2.567677157752462, + "grad_norm": 0.024493860080838203, + "learning_rate": 5.6362461508877704e-05, + "loss": 0.0035, + "step": 3323 + }, + { + "epoch": 2.568449507626955, + "grad_norm": 0.011858409270644188, + "learning_rate": 5.6335704153330305e-05, + "loss": 0.0036, + "step": 3324 + }, + { + "epoch": 2.5692218575014483, + "grad_norm": 0.010189369320869446, + "learning_rate": 5.630894495361252e-05, + "loss": 0.0032, + "step": 3325 + }, + { + "epoch": 2.569994207375941, + "grad_norm": 0.009152532555162907, + "learning_rate": 5.62821839175133e-05, + "loss": 0.0032, + "step": 3326 + }, + { + "epoch": 2.5707665572504346, + "grad_norm": 0.009873981587588787, + "learning_rate": 5.6255421052822134e-05, + "loss": 0.0035, + "step": 3327 + }, + { + "epoch": 2.5715389071249275, + "grad_norm": 0.013396297581493855, + "learning_rate": 5.622865636732906e-05, + "loss": 0.0033, + "step": 3328 + }, + { + "epoch": 2.572311256999421, + "grad_norm": 0.011942288838326931, + "learning_rate": 5.620188986882461e-05, + "loss": 0.0037, + "step": 3329 + }, + { + "epoch": 2.573083606873914, + "grad_norm": 0.008207396604120731, + "learning_rate": 5.617512156509989e-05, + "loss": 0.0033, + "step": 3330 + }, + { + "epoch": 2.573855956748407, + "grad_norm": 0.011415604501962662, + "learning_rate": 5.614835146394648e-05, + "loss": 0.0038, + "step": 3331 + }, + { + "epoch": 2.5746283066229, + "grad_norm": 0.01788908615708351, + "learning_rate": 5.612157957315654e-05, + "loss": 0.0034, + "step": 3332 + }, + { + "epoch": 2.5754006564973935, + "grad_norm": 0.010203823447227478, + "learning_rate": 5.609480590052268e-05, + "loss": 0.0037, + "step": 3333 + }, + { + "epoch": 2.5761730063718864, + "grad_norm": 0.009772097691893578, + "learning_rate": 5.606803045383811e-05, + "loss": 0.0038, + "step": 3334 + }, + { + "epoch": 2.5769453562463798, + "grad_norm": 0.010885546915233135, + "learning_rate": 5.6041253240896495e-05, + "loss": 0.0038, + "step": 3335 + }, + { + "epoch": 2.5777177061208727, + "grad_norm": 0.017759766429662704, + "learning_rate": 5.6014474269492036e-05, + "loss": 0.0038, + "step": 3336 + }, + { + "epoch": 2.578490055995366, + "grad_norm": 0.013944578357040882, + "learning_rate": 5.598769354741945e-05, + "loss": 0.0038, + "step": 3337 + }, + { + "epoch": 2.579262405869859, + "grad_norm": 0.010490084066987038, + "learning_rate": 5.5960911082473956e-05, + "loss": 0.0043, + "step": 3338 + }, + { + "epoch": 2.580034755744352, + "grad_norm": 0.023333175107836723, + "learning_rate": 5.5934126882451266e-05, + "loss": 0.004, + "step": 3339 + }, + { + "epoch": 2.5808071056188453, + "grad_norm": 0.01074046641588211, + "learning_rate": 5.5907340955147645e-05, + "loss": 0.0037, + "step": 3340 + }, + { + "epoch": 2.5815794554933387, + "grad_norm": 0.030795851722359657, + "learning_rate": 5.588055330835981e-05, + "loss": 0.0036, + "step": 3341 + }, + { + "epoch": 2.5823518053678316, + "grad_norm": 0.009321062825620174, + "learning_rate": 5.5853763949884976e-05, + "loss": 0.0037, + "step": 3342 + }, + { + "epoch": 2.5831241552423245, + "grad_norm": 0.011110929772257805, + "learning_rate": 5.5826972887520935e-05, + "loss": 0.0041, + "step": 3343 + }, + { + "epoch": 2.583896505116818, + "grad_norm": 0.010194243863224983, + "learning_rate": 5.580018012906586e-05, + "loss": 0.0037, + "step": 3344 + }, + { + "epoch": 2.5846688549913113, + "grad_norm": 0.011587006971240044, + "learning_rate": 5.577338568231852e-05, + "loss": 0.0036, + "step": 3345 + }, + { + "epoch": 2.585441204865804, + "grad_norm": 0.00983353890478611, + "learning_rate": 5.574658955507811e-05, + "loss": 0.0038, + "step": 3346 + }, + { + "epoch": 2.586213554740297, + "grad_norm": 0.01402513962239027, + "learning_rate": 5.571979175514431e-05, + "loss": 0.0039, + "step": 3347 + }, + { + "epoch": 2.5869859046147905, + "grad_norm": 0.018349068239331245, + "learning_rate": 5.5692992290317366e-05, + "loss": 0.0043, + "step": 3348 + }, + { + "epoch": 2.587758254489284, + "grad_norm": 0.009055593982338905, + "learning_rate": 5.566619116839792e-05, + "loss": 0.0042, + "step": 3349 + }, + { + "epoch": 2.588530604363777, + "grad_norm": 0.01099521853029728, + "learning_rate": 5.5639388397187134e-05, + "loss": 0.0038, + "step": 3350 + }, + { + "epoch": 2.5893029542382697, + "grad_norm": 0.007961034774780273, + "learning_rate": 5.5612583984486666e-05, + "loss": 0.0037, + "step": 3351 + }, + { + "epoch": 2.590075304112763, + "grad_norm": 0.007924498990178108, + "learning_rate": 5.558577793809861e-05, + "loss": 0.004, + "step": 3352 + }, + { + "epoch": 2.5908476539872565, + "grad_norm": 0.010857324115931988, + "learning_rate": 5.555897026582555e-05, + "loss": 0.0037, + "step": 3353 + }, + { + "epoch": 2.5916200038617494, + "grad_norm": 0.011452377773821354, + "learning_rate": 5.553216097547058e-05, + "loss": 0.0042, + "step": 3354 + }, + { + "epoch": 2.5923923537362423, + "grad_norm": 0.010629558935761452, + "learning_rate": 5.550535007483724e-05, + "loss": 0.0039, + "step": 3355 + }, + { + "epoch": 2.5931647036107357, + "grad_norm": 0.011495045386254787, + "learning_rate": 5.547853757172951e-05, + "loss": 0.0035, + "step": 3356 + }, + { + "epoch": 2.593937053485229, + "grad_norm": 0.0070584677159786224, + "learning_rate": 5.545172347395186e-05, + "loss": 0.0034, + "step": 3357 + }, + { + "epoch": 2.594709403359722, + "grad_norm": 0.008315347135066986, + "learning_rate": 5.542490778930924e-05, + "loss": 0.0035, + "step": 3358 + }, + { + "epoch": 2.595481753234215, + "grad_norm": 0.016000311821699142, + "learning_rate": 5.539809052560706e-05, + "loss": 0.0039, + "step": 3359 + }, + { + "epoch": 2.5962541031087083, + "grad_norm": 0.016649696975946426, + "learning_rate": 5.537127169065116e-05, + "loss": 0.0036, + "step": 3360 + }, + { + "epoch": 2.5970264529832012, + "grad_norm": 0.011692811734974384, + "learning_rate": 5.534445129224786e-05, + "loss": 0.0038, + "step": 3361 + }, + { + "epoch": 2.5977988028576946, + "grad_norm": 0.008047428913414478, + "learning_rate": 5.531762933820391e-05, + "loss": 0.0037, + "step": 3362 + }, + { + "epoch": 2.5985711527321875, + "grad_norm": 0.009794252924621105, + "learning_rate": 5.529080583632656e-05, + "loss": 0.0036, + "step": 3363 + }, + { + "epoch": 2.599343502606681, + "grad_norm": 0.013193776831030846, + "learning_rate": 5.5263980794423484e-05, + "loss": 0.0032, + "step": 3364 + }, + { + "epoch": 2.600115852481174, + "grad_norm": 0.010043804533779621, + "learning_rate": 5.523715422030279e-05, + "loss": 0.0039, + "step": 3365 + }, + { + "epoch": 2.600888202355667, + "grad_norm": 0.013375181704759598, + "learning_rate": 5.5210326121773034e-05, + "loss": 0.004, + "step": 3366 + }, + { + "epoch": 2.60166055223016, + "grad_norm": 0.013488434255123138, + "learning_rate": 5.5183496506643264e-05, + "loss": 0.0035, + "step": 3367 + }, + { + "epoch": 2.6024329021046535, + "grad_norm": 0.01303744688630104, + "learning_rate": 5.51566653827229e-05, + "loss": 0.004, + "step": 3368 + }, + { + "epoch": 2.6032052519791464, + "grad_norm": 0.009145365096628666, + "learning_rate": 5.5129832757821834e-05, + "loss": 0.0037, + "step": 3369 + }, + { + "epoch": 2.60397760185364, + "grad_norm": 0.010411336086690426, + "learning_rate": 5.5102998639750424e-05, + "loss": 0.0039, + "step": 3370 + }, + { + "epoch": 2.6047499517281327, + "grad_norm": 0.008222127333283424, + "learning_rate": 5.507616303631941e-05, + "loss": 0.0036, + "step": 3371 + }, + { + "epoch": 2.605522301602626, + "grad_norm": 0.011100389994680882, + "learning_rate": 5.5049325955339984e-05, + "loss": 0.0039, + "step": 3372 + }, + { + "epoch": 2.606294651477119, + "grad_norm": 0.008225099183619022, + "learning_rate": 5.502248740462379e-05, + "loss": 0.0037, + "step": 3373 + }, + { + "epoch": 2.6070670013516124, + "grad_norm": 0.016636086627840996, + "learning_rate": 5.4995647391982875e-05, + "loss": 0.0042, + "step": 3374 + }, + { + "epoch": 2.6078393512261053, + "grad_norm": 0.007184876129031181, + "learning_rate": 5.49688059252297e-05, + "loss": 0.0037, + "step": 3375 + }, + { + "epoch": 2.6086117011005987, + "grad_norm": 0.008062731474637985, + "learning_rate": 5.49419630121772e-05, + "loss": 0.0036, + "step": 3376 + }, + { + "epoch": 2.6093840509750916, + "grad_norm": 0.012067383155226707, + "learning_rate": 5.491511866063865e-05, + "loss": 0.0038, + "step": 3377 + }, + { + "epoch": 2.610156400849585, + "grad_norm": 0.00804864801466465, + "learning_rate": 5.488827287842786e-05, + "loss": 0.0036, + "step": 3378 + }, + { + "epoch": 2.610928750724078, + "grad_norm": 0.009306215681135654, + "learning_rate": 5.486142567335894e-05, + "loss": 0.0038, + "step": 3379 + }, + { + "epoch": 2.6117011005985713, + "grad_norm": 0.012968027032911777, + "learning_rate": 5.483457705324646e-05, + "loss": 0.0037, + "step": 3380 + }, + { + "epoch": 2.6124734504730642, + "grad_norm": 0.009706765413284302, + "learning_rate": 5.480772702590544e-05, + "loss": 0.0039, + "step": 3381 + }, + { + "epoch": 2.6132458003475576, + "grad_norm": 0.009779158048331738, + "learning_rate": 5.478087559915123e-05, + "loss": 0.0039, + "step": 3382 + }, + { + "epoch": 2.6140181502220505, + "grad_norm": 0.008816637098789215, + "learning_rate": 5.4754022780799665e-05, + "loss": 0.0036, + "step": 3383 + }, + { + "epoch": 2.614790500096544, + "grad_norm": 0.010408301837742329, + "learning_rate": 5.4727168578666956e-05, + "loss": 0.0035, + "step": 3384 + }, + { + "epoch": 2.615562849971037, + "grad_norm": 0.00888033676892519, + "learning_rate": 5.470031300056968e-05, + "loss": 0.0035, + "step": 3385 + }, + { + "epoch": 2.6163351998455298, + "grad_norm": 0.00977409165352583, + "learning_rate": 5.4673456054324875e-05, + "loss": 0.0039, + "step": 3386 + }, + { + "epoch": 2.617107549720023, + "grad_norm": 0.01601114310324192, + "learning_rate": 5.4646597747749916e-05, + "loss": 0.0039, + "step": 3387 + }, + { + "epoch": 2.6178798995945165, + "grad_norm": 0.014116347767412663, + "learning_rate": 5.461973808866265e-05, + "loss": 0.0041, + "step": 3388 + }, + { + "epoch": 2.6186522494690094, + "grad_norm": 0.01782173663377762, + "learning_rate": 5.4592877084881254e-05, + "loss": 0.0044, + "step": 3389 + }, + { + "epoch": 2.6194245993435024, + "grad_norm": 0.00860170554369688, + "learning_rate": 5.45660147442243e-05, + "loss": 0.0035, + "step": 3390 + }, + { + "epoch": 2.6201969492179957, + "grad_norm": 0.017179779708385468, + "learning_rate": 5.4539151074510805e-05, + "loss": 0.0036, + "step": 3391 + }, + { + "epoch": 2.620969299092489, + "grad_norm": 0.010571127757430077, + "learning_rate": 5.45122860835601e-05, + "loss": 0.0039, + "step": 3392 + }, + { + "epoch": 2.621741648966982, + "grad_norm": 0.013925640843808651, + "learning_rate": 5.448541977919195e-05, + "loss": 0.0035, + "step": 3393 + }, + { + "epoch": 2.622513998841475, + "grad_norm": 0.030185796320438385, + "learning_rate": 5.4458552169226486e-05, + "loss": 0.0041, + "step": 3394 + }, + { + "epoch": 2.6232863487159683, + "grad_norm": 0.014250488951802254, + "learning_rate": 5.4431683261484224e-05, + "loss": 0.0038, + "step": 3395 + }, + { + "epoch": 2.6240586985904617, + "grad_norm": 0.009191480465233326, + "learning_rate": 5.440481306378604e-05, + "loss": 0.0039, + "step": 3396 + }, + { + "epoch": 2.6248310484649546, + "grad_norm": 0.028713863343000412, + "learning_rate": 5.4377941583953206e-05, + "loss": 0.0042, + "step": 3397 + }, + { + "epoch": 2.6256033983394476, + "grad_norm": 0.019584739580750465, + "learning_rate": 5.4351068829807375e-05, + "loss": 0.0037, + "step": 3398 + }, + { + "epoch": 2.626375748213941, + "grad_norm": 0.009171944111585617, + "learning_rate": 5.432419480917053e-05, + "loss": 0.0037, + "step": 3399 + }, + { + "epoch": 2.6271480980884343, + "grad_norm": 0.008588941767811775, + "learning_rate": 5.429731952986506e-05, + "loss": 0.0038, + "step": 3400 + }, + { + "epoch": 2.6279204479629272, + "grad_norm": 0.01289437897503376, + "learning_rate": 5.427044299971372e-05, + "loss": 0.004, + "step": 3401 + }, + { + "epoch": 2.62869279783742, + "grad_norm": 0.025774981826543808, + "learning_rate": 5.4243565226539613e-05, + "loss": 0.0044, + "step": 3402 + }, + { + "epoch": 2.6294651477119135, + "grad_norm": 0.011850577779114246, + "learning_rate": 5.4216686218166204e-05, + "loss": 0.004, + "step": 3403 + }, + { + "epoch": 2.630237497586407, + "grad_norm": 0.01651032827794552, + "learning_rate": 5.418980598241733e-05, + "loss": 0.0039, + "step": 3404 + }, + { + "epoch": 2.6310098474609, + "grad_norm": 0.01289384812116623, + "learning_rate": 5.416292452711716e-05, + "loss": 0.0035, + "step": 3405 + }, + { + "epoch": 2.6317821973353928, + "grad_norm": 0.018764130771160126, + "learning_rate": 5.413604186009027e-05, + "loss": 0.0035, + "step": 3406 + }, + { + "epoch": 2.632554547209886, + "grad_norm": 0.01137736439704895, + "learning_rate": 5.410915798916151e-05, + "loss": 0.004, + "step": 3407 + }, + { + "epoch": 2.633326897084379, + "grad_norm": 0.00818009115755558, + "learning_rate": 5.4082272922156176e-05, + "loss": 0.0038, + "step": 3408 + }, + { + "epoch": 2.6340992469588724, + "grad_norm": 0.017443781718611717, + "learning_rate": 5.405538666689982e-05, + "loss": 0.0038, + "step": 3409 + }, + { + "epoch": 2.6348715968333654, + "grad_norm": 0.008871911093592644, + "learning_rate": 5.402849923121839e-05, + "loss": 0.0037, + "step": 3410 + }, + { + "epoch": 2.6356439467078587, + "grad_norm": 0.00904800370335579, + "learning_rate": 5.400161062293819e-05, + "loss": 0.0036, + "step": 3411 + }, + { + "epoch": 2.6364162965823517, + "grad_norm": 0.00901111587882042, + "learning_rate": 5.3974720849885807e-05, + "loss": 0.0035, + "step": 3412 + }, + { + "epoch": 2.637188646456845, + "grad_norm": 0.012365750037133694, + "learning_rate": 5.394782991988826e-05, + "loss": 0.0042, + "step": 3413 + }, + { + "epoch": 2.637960996331338, + "grad_norm": 0.009294239804148674, + "learning_rate": 5.39209378407728e-05, + "loss": 0.0037, + "step": 3414 + }, + { + "epoch": 2.6387333462058313, + "grad_norm": 0.01763080060482025, + "learning_rate": 5.3894044620367056e-05, + "loss": 0.0037, + "step": 3415 + }, + { + "epoch": 2.6395056960803243, + "grad_norm": 0.03039529360830784, + "learning_rate": 5.386715026649906e-05, + "loss": 0.0034, + "step": 3416 + }, + { + "epoch": 2.6402780459548176, + "grad_norm": 0.009638811461627483, + "learning_rate": 5.384025478699702e-05, + "loss": 0.0039, + "step": 3417 + }, + { + "epoch": 2.6410503958293106, + "grad_norm": 0.016960902139544487, + "learning_rate": 5.381335818968962e-05, + "loss": 0.0034, + "step": 3418 + }, + { + "epoch": 2.641822745703804, + "grad_norm": 0.02310163341462612, + "learning_rate": 5.378646048240581e-05, + "loss": 0.0032, + "step": 3419 + }, + { + "epoch": 2.642595095578297, + "grad_norm": 0.020672015845775604, + "learning_rate": 5.3759561672974825e-05, + "loss": 0.004, + "step": 3420 + }, + { + "epoch": 2.6433674454527902, + "grad_norm": 0.010249595157802105, + "learning_rate": 5.373266176922629e-05, + "loss": 0.0038, + "step": 3421 + }, + { + "epoch": 2.644139795327283, + "grad_norm": 0.01078090351074934, + "learning_rate": 5.3705760778990114e-05, + "loss": 0.0032, + "step": 3422 + }, + { + "epoch": 2.6449121452017765, + "grad_norm": 0.009476094506680965, + "learning_rate": 5.36788587100965e-05, + "loss": 0.0035, + "step": 3423 + }, + { + "epoch": 2.6456844950762695, + "grad_norm": 0.008092152886092663, + "learning_rate": 5.365195557037602e-05, + "loss": 0.0034, + "step": 3424 + }, + { + "epoch": 2.646456844950763, + "grad_norm": 0.02135201171040535, + "learning_rate": 5.362505136765952e-05, + "loss": 0.004, + "step": 3425 + }, + { + "epoch": 2.6472291948252558, + "grad_norm": 0.00874929130077362, + "learning_rate": 5.359814610977816e-05, + "loss": 0.0037, + "step": 3426 + }, + { + "epoch": 2.648001544699749, + "grad_norm": 0.011419526301324368, + "learning_rate": 5.35712398045634e-05, + "loss": 0.0041, + "step": 3427 + }, + { + "epoch": 2.648773894574242, + "grad_norm": 0.01089134905487299, + "learning_rate": 5.3544332459847034e-05, + "loss": 0.0039, + "step": 3428 + }, + { + "epoch": 2.6495462444487354, + "grad_norm": 0.016542969271540642, + "learning_rate": 5.3517424083461134e-05, + "loss": 0.0033, + "step": 3429 + }, + { + "epoch": 2.6503185943232284, + "grad_norm": 0.011641754768788815, + "learning_rate": 5.349051468323807e-05, + "loss": 0.0037, + "step": 3430 + }, + { + "epoch": 2.6510909441977217, + "grad_norm": 0.00846849475055933, + "learning_rate": 5.346360426701051e-05, + "loss": 0.0037, + "step": 3431 + }, + { + "epoch": 2.6518632940722147, + "grad_norm": 0.013782123103737831, + "learning_rate": 5.343669284261147e-05, + "loss": 0.0035, + "step": 3432 + }, + { + "epoch": 2.6526356439467076, + "grad_norm": 0.014771669171750546, + "learning_rate": 5.340978041787417e-05, + "loss": 0.0044, + "step": 3433 + }, + { + "epoch": 2.653407993821201, + "grad_norm": 0.010918432846665382, + "learning_rate": 5.3382867000632174e-05, + "loss": 0.0034, + "step": 3434 + }, + { + "epoch": 2.6541803436956943, + "grad_norm": 0.008296050131320953, + "learning_rate": 5.335595259871934e-05, + "loss": 0.0041, + "step": 3435 + }, + { + "epoch": 2.6549526935701873, + "grad_norm": 0.013633402064442635, + "learning_rate": 5.33290372199698e-05, + "loss": 0.0037, + "step": 3436 + }, + { + "epoch": 2.65572504344468, + "grad_norm": 0.0165613554418087, + "learning_rate": 5.3302120872217955e-05, + "loss": 0.004, + "step": 3437 + }, + { + "epoch": 2.6564973933191736, + "grad_norm": 0.015982868149876595, + "learning_rate": 5.327520356329853e-05, + "loss": 0.0042, + "step": 3438 + }, + { + "epoch": 2.657269743193667, + "grad_norm": 0.015539965592324734, + "learning_rate": 5.3248285301046476e-05, + "loss": 0.0044, + "step": 3439 + }, + { + "epoch": 2.65804209306816, + "grad_norm": 0.016152702271938324, + "learning_rate": 5.3221366093297066e-05, + "loss": 0.0037, + "step": 3440 + }, + { + "epoch": 2.658814442942653, + "grad_norm": 0.014841979369521141, + "learning_rate": 5.3194445947885816e-05, + "loss": 0.0036, + "step": 3441 + }, + { + "epoch": 2.659586792817146, + "grad_norm": 0.010280294343829155, + "learning_rate": 5.316752487264853e-05, + "loss": 0.0044, + "step": 3442 + }, + { + "epoch": 2.6603591426916395, + "grad_norm": 0.008196298032999039, + "learning_rate": 5.314060287542132e-05, + "loss": 0.0032, + "step": 3443 + }, + { + "epoch": 2.6611314925661325, + "grad_norm": 0.01514330506324768, + "learning_rate": 5.311367996404049e-05, + "loss": 0.0039, + "step": 3444 + }, + { + "epoch": 2.6619038424406254, + "grad_norm": 0.00861449260264635, + "learning_rate": 5.308675614634264e-05, + "loss": 0.0036, + "step": 3445 + }, + { + "epoch": 2.6626761923151188, + "grad_norm": 0.010334886610507965, + "learning_rate": 5.305983143016469e-05, + "loss": 0.004, + "step": 3446 + }, + { + "epoch": 2.663448542189612, + "grad_norm": 0.01663089171051979, + "learning_rate": 5.303290582334372e-05, + "loss": 0.0038, + "step": 3447 + }, + { + "epoch": 2.664220892064105, + "grad_norm": 0.009869327768683434, + "learning_rate": 5.300597933371716e-05, + "loss": 0.0038, + "step": 3448 + }, + { + "epoch": 2.664993241938598, + "grad_norm": 0.011241073720157146, + "learning_rate": 5.297905196912266e-05, + "loss": 0.0039, + "step": 3449 + }, + { + "epoch": 2.6657655918130914, + "grad_norm": 0.02179298736155033, + "learning_rate": 5.29521237373981e-05, + "loss": 0.0035, + "step": 3450 + }, + { + "epoch": 2.6665379416875847, + "grad_norm": 0.01262400671839714, + "learning_rate": 5.292519464638166e-05, + "loss": 0.0033, + "step": 3451 + }, + { + "epoch": 2.6673102915620777, + "grad_norm": 0.008806941099464893, + "learning_rate": 5.289826470391174e-05, + "loss": 0.0042, + "step": 3452 + }, + { + "epoch": 2.6680826414365706, + "grad_norm": 0.01608050800859928, + "learning_rate": 5.287133391782699e-05, + "loss": 0.0041, + "step": 3453 + }, + { + "epoch": 2.668854991311064, + "grad_norm": 0.021533267572522163, + "learning_rate": 5.2844402295966346e-05, + "loss": 0.004, + "step": 3454 + }, + { + "epoch": 2.6696273411855573, + "grad_norm": 0.01042283233255148, + "learning_rate": 5.281746984616889e-05, + "loss": 0.0037, + "step": 3455 + }, + { + "epoch": 2.6703996910600503, + "grad_norm": 0.015076697804033756, + "learning_rate": 5.2790536576274055e-05, + "loss": 0.0041, + "step": 3456 + }, + { + "epoch": 2.671172040934543, + "grad_norm": 0.019963202998042107, + "learning_rate": 5.276360249412144e-05, + "loss": 0.004, + "step": 3457 + }, + { + "epoch": 2.6719443908090366, + "grad_norm": 0.0230875164270401, + "learning_rate": 5.2736667607550925e-05, + "loss": 0.0035, + "step": 3458 + }, + { + "epoch": 2.6727167406835295, + "grad_norm": 0.011236096732318401, + "learning_rate": 5.2709731924402596e-05, + "loss": 0.0037, + "step": 3459 + }, + { + "epoch": 2.673489090558023, + "grad_norm": 0.007921960204839706, + "learning_rate": 5.2682795452516784e-05, + "loss": 0.0034, + "step": 3460 + }, + { + "epoch": 2.674261440432516, + "grad_norm": 0.018090009689331055, + "learning_rate": 5.265585819973403e-05, + "loss": 0.0045, + "step": 3461 + }, + { + "epoch": 2.675033790307009, + "grad_norm": 0.019070502370595932, + "learning_rate": 5.2628920173895134e-05, + "loss": 0.0043, + "step": 3462 + }, + { + "epoch": 2.675806140181502, + "grad_norm": 0.015746183693408966, + "learning_rate": 5.26019813828411e-05, + "loss": 0.0035, + "step": 3463 + }, + { + "epoch": 2.6765784900559955, + "grad_norm": 0.010020890273153782, + "learning_rate": 5.257504183441316e-05, + "loss": 0.0031, + "step": 3464 + }, + { + "epoch": 2.6773508399304884, + "grad_norm": 0.027315745130181313, + "learning_rate": 5.254810153645277e-05, + "loss": 0.004, + "step": 3465 + }, + { + "epoch": 2.6781231898049818, + "grad_norm": 0.02392551489174366, + "learning_rate": 5.2521160496801566e-05, + "loss": 0.0037, + "step": 3466 + }, + { + "epoch": 2.6788955396794747, + "grad_norm": 0.009675184264779091, + "learning_rate": 5.2494218723301483e-05, + "loss": 0.0036, + "step": 3467 + }, + { + "epoch": 2.679667889553968, + "grad_norm": 0.021859876811504364, + "learning_rate": 5.2467276223794595e-05, + "loss": 0.0035, + "step": 3468 + }, + { + "epoch": 2.680440239428461, + "grad_norm": 0.01790783368051052, + "learning_rate": 5.244033300612321e-05, + "loss": 0.0039, + "step": 3469 + }, + { + "epoch": 2.6812125893029544, + "grad_norm": 0.012461425736546516, + "learning_rate": 5.241338907812986e-05, + "loss": 0.0032, + "step": 3470 + }, + { + "epoch": 2.6819849391774473, + "grad_norm": 0.00857500173151493, + "learning_rate": 5.2386444447657256e-05, + "loss": 0.0034, + "step": 3471 + }, + { + "epoch": 2.6827572890519407, + "grad_norm": 0.011089037172496319, + "learning_rate": 5.235949912254834e-05, + "loss": 0.0042, + "step": 3472 + }, + { + "epoch": 2.6835296389264336, + "grad_norm": 0.01732746884226799, + "learning_rate": 5.233255311064625e-05, + "loss": 0.0041, + "step": 3473 + }, + { + "epoch": 2.684301988800927, + "grad_norm": 0.026455463841557503, + "learning_rate": 5.2305606419794305e-05, + "loss": 0.004, + "step": 3474 + }, + { + "epoch": 2.68507433867542, + "grad_norm": 0.006931504234671593, + "learning_rate": 5.227865905783603e-05, + "loss": 0.0033, + "step": 3475 + }, + { + "epoch": 2.6858466885499133, + "grad_norm": 0.022621190175414085, + "learning_rate": 5.225171103261519e-05, + "loss": 0.0039, + "step": 3476 + }, + { + "epoch": 2.686619038424406, + "grad_norm": 0.015190225094556808, + "learning_rate": 5.2224762351975655e-05, + "loss": 0.0035, + "step": 3477 + }, + { + "epoch": 2.6873913882988996, + "grad_norm": 0.010020987130701542, + "learning_rate": 5.2197813023761564e-05, + "loss": 0.0042, + "step": 3478 + }, + { + "epoch": 2.6881637381733925, + "grad_norm": 0.009694269858300686, + "learning_rate": 5.217086305581722e-05, + "loss": 0.0038, + "step": 3479 + }, + { + "epoch": 2.6889360880478854, + "grad_norm": 0.014760488644242287, + "learning_rate": 5.2143912455987075e-05, + "loss": 0.0039, + "step": 3480 + }, + { + "epoch": 2.689708437922379, + "grad_norm": 0.008779791183769703, + "learning_rate": 5.211696123211585e-05, + "loss": 0.0037, + "step": 3481 + }, + { + "epoch": 2.690480787796872, + "grad_norm": 0.015196164138615131, + "learning_rate": 5.209000939204832e-05, + "loss": 0.0041, + "step": 3482 + }, + { + "epoch": 2.691253137671365, + "grad_norm": 0.011902782134711742, + "learning_rate": 5.206305694362959e-05, + "loss": 0.004, + "step": 3483 + }, + { + "epoch": 2.692025487545858, + "grad_norm": 0.008937469683587551, + "learning_rate": 5.2036103894704825e-05, + "loss": 0.0034, + "step": 3484 + }, + { + "epoch": 2.6927978374203514, + "grad_norm": 0.010444491170346737, + "learning_rate": 5.20091502531194e-05, + "loss": 0.0038, + "step": 3485 + }, + { + "epoch": 2.6935701872948448, + "grad_norm": 0.014897272922098637, + "learning_rate": 5.1982196026718896e-05, + "loss": 0.0035, + "step": 3486 + }, + { + "epoch": 2.6943425371693377, + "grad_norm": 0.011194158345460892, + "learning_rate": 5.195524122334903e-05, + "loss": 0.004, + "step": 3487 + }, + { + "epoch": 2.6951148870438306, + "grad_norm": 0.0157900582998991, + "learning_rate": 5.1928285850855676e-05, + "loss": 0.0043, + "step": 3488 + }, + { + "epoch": 2.695887236918324, + "grad_norm": 0.00894899107515812, + "learning_rate": 5.190132991708492e-05, + "loss": 0.0032, + "step": 3489 + }, + { + "epoch": 2.6966595867928174, + "grad_norm": 0.00943822506815195, + "learning_rate": 5.187437342988295e-05, + "loss": 0.0037, + "step": 3490 + }, + { + "epoch": 2.6974319366673103, + "grad_norm": 0.009946034289896488, + "learning_rate": 5.184741639709618e-05, + "loss": 0.0043, + "step": 3491 + }, + { + "epoch": 2.698204286541803, + "grad_norm": 0.010902882553637028, + "learning_rate": 5.1820458826571126e-05, + "loss": 0.0042, + "step": 3492 + }, + { + "epoch": 2.6989766364162966, + "grad_norm": 0.011112508364021778, + "learning_rate": 5.1793500726154506e-05, + "loss": 0.0038, + "step": 3493 + }, + { + "epoch": 2.69974898629079, + "grad_norm": 0.010520882904529572, + "learning_rate": 5.176654210369315e-05, + "loss": 0.0037, + "step": 3494 + }, + { + "epoch": 2.700521336165283, + "grad_norm": 0.01588641107082367, + "learning_rate": 5.173958296703408e-05, + "loss": 0.0042, + "step": 3495 + }, + { + "epoch": 2.701293686039776, + "grad_norm": 0.008596131578087807, + "learning_rate": 5.1712623324024444e-05, + "loss": 0.0034, + "step": 3496 + }, + { + "epoch": 2.702066035914269, + "grad_norm": 0.009026461280882359, + "learning_rate": 5.1685663182511535e-05, + "loss": 0.0037, + "step": 3497 + }, + { + "epoch": 2.7028383857887626, + "grad_norm": 0.01139860600233078, + "learning_rate": 5.165870255034281e-05, + "loss": 0.0039, + "step": 3498 + }, + { + "epoch": 2.7036107356632555, + "grad_norm": 0.010759508237242699, + "learning_rate": 5.1631741435365856e-05, + "loss": 0.0042, + "step": 3499 + }, + { + "epoch": 2.7043830855377484, + "grad_norm": 0.011585521511733532, + "learning_rate": 5.160477984542839e-05, + "loss": 0.0037, + "step": 3500 + }, + { + "epoch": 2.705155435412242, + "grad_norm": 0.008951091207563877, + "learning_rate": 5.157781778837829e-05, + "loss": 0.0038, + "step": 3501 + }, + { + "epoch": 2.705927785286735, + "grad_norm": 0.019819583743810654, + "learning_rate": 5.1550855272063545e-05, + "loss": 0.0041, + "step": 3502 + }, + { + "epoch": 2.706700135161228, + "grad_norm": 0.010994885116815567, + "learning_rate": 5.152389230433232e-05, + "loss": 0.0037, + "step": 3503 + }, + { + "epoch": 2.707472485035721, + "grad_norm": 0.010090935043990612, + "learning_rate": 5.149692889303287e-05, + "loss": 0.0036, + "step": 3504 + }, + { + "epoch": 2.7082448349102144, + "grad_norm": 0.008905458264052868, + "learning_rate": 5.146996504601357e-05, + "loss": 0.0037, + "step": 3505 + }, + { + "epoch": 2.7090171847847073, + "grad_norm": 0.008558421395719051, + "learning_rate": 5.1443000771122995e-05, + "loss": 0.0038, + "step": 3506 + }, + { + "epoch": 2.7097895346592007, + "grad_norm": 0.015720965340733528, + "learning_rate": 5.1416036076209725e-05, + "loss": 0.0036, + "step": 3507 + }, + { + "epoch": 2.7105618845336936, + "grad_norm": 0.0087439501658082, + "learning_rate": 5.138907096912261e-05, + "loss": 0.0037, + "step": 3508 + }, + { + "epoch": 2.711334234408187, + "grad_norm": 0.009726365096867085, + "learning_rate": 5.1362105457710477e-05, + "loss": 0.0035, + "step": 3509 + }, + { + "epoch": 2.71210658428268, + "grad_norm": 0.014834542758762836, + "learning_rate": 5.133513954982235e-05, + "loss": 0.0039, + "step": 3510 + }, + { + "epoch": 2.7128789341571733, + "grad_norm": 0.01073391456156969, + "learning_rate": 5.130817325330738e-05, + "loss": 0.0034, + "step": 3511 + }, + { + "epoch": 2.7136512840316662, + "grad_norm": 0.011810685507953167, + "learning_rate": 5.128120657601477e-05, + "loss": 0.004, + "step": 3512 + }, + { + "epoch": 2.7144236339061596, + "grad_norm": 0.009675235487520695, + "learning_rate": 5.125423952579389e-05, + "loss": 0.0032, + "step": 3513 + }, + { + "epoch": 2.7151959837806525, + "grad_norm": 0.009556422010064125, + "learning_rate": 5.122727211049421e-05, + "loss": 0.0036, + "step": 3514 + }, + { + "epoch": 2.715968333655146, + "grad_norm": 0.00851303432136774, + "learning_rate": 5.120030433796524e-05, + "loss": 0.0037, + "step": 3515 + }, + { + "epoch": 2.716740683529639, + "grad_norm": 0.01430381927639246, + "learning_rate": 5.11733362160567e-05, + "loss": 0.0037, + "step": 3516 + }, + { + "epoch": 2.717513033404132, + "grad_norm": 0.011229856871068478, + "learning_rate": 5.114636775261833e-05, + "loss": 0.0036, + "step": 3517 + }, + { + "epoch": 2.718285383278625, + "grad_norm": 0.01209938246756792, + "learning_rate": 5.111939895550001e-05, + "loss": 0.0039, + "step": 3518 + }, + { + "epoch": 2.7190577331531185, + "grad_norm": 0.009382943622767925, + "learning_rate": 5.109242983255171e-05, + "loss": 0.0039, + "step": 3519 + }, + { + "epoch": 2.7198300830276114, + "grad_norm": 0.021627336740493774, + "learning_rate": 5.106546039162348e-05, + "loss": 0.0043, + "step": 3520 + }, + { + "epoch": 2.720602432902105, + "grad_norm": 0.011568021960556507, + "learning_rate": 5.103849064056546e-05, + "loss": 0.0035, + "step": 3521 + }, + { + "epoch": 2.7213747827765977, + "grad_norm": 0.011675220914185047, + "learning_rate": 5.1011520587227924e-05, + "loss": 0.004, + "step": 3522 + }, + { + "epoch": 2.722147132651091, + "grad_norm": 0.008323492482304573, + "learning_rate": 5.0984550239461184e-05, + "loss": 0.0036, + "step": 3523 + }, + { + "epoch": 2.722919482525584, + "grad_norm": 0.009397026151418686, + "learning_rate": 5.095757960511566e-05, + "loss": 0.0033, + "step": 3524 + }, + { + "epoch": 2.7236918324000774, + "grad_norm": 0.007559982128441334, + "learning_rate": 5.093060869204185e-05, + "loss": 0.0033, + "step": 3525 + }, + { + "epoch": 2.7244641822745703, + "grad_norm": 0.009094711393117905, + "learning_rate": 5.090363750809033e-05, + "loss": 0.0036, + "step": 3526 + }, + { + "epoch": 2.7252365321490633, + "grad_norm": 0.009485473856329918, + "learning_rate": 5.0876666061111774e-05, + "loss": 0.003, + "step": 3527 + }, + { + "epoch": 2.7260088820235566, + "grad_norm": 0.012808065861463547, + "learning_rate": 5.084969435895691e-05, + "loss": 0.0035, + "step": 3528 + }, + { + "epoch": 2.72678123189805, + "grad_norm": 0.009124809876084328, + "learning_rate": 5.082272240947654e-05, + "loss": 0.0036, + "step": 3529 + }, + { + "epoch": 2.727553581772543, + "grad_norm": 0.010432286188006401, + "learning_rate": 5.079575022052157e-05, + "loss": 0.0036, + "step": 3530 + }, + { + "epoch": 2.728325931647036, + "grad_norm": 0.011838859878480434, + "learning_rate": 5.0768777799942934e-05, + "loss": 0.0042, + "step": 3531 + }, + { + "epoch": 2.7290982815215292, + "grad_norm": 0.017166869714856148, + "learning_rate": 5.0741805155591634e-05, + "loss": 0.0044, + "step": 3532 + }, + { + "epoch": 2.7298706313960226, + "grad_norm": 0.010602729395031929, + "learning_rate": 5.0714832295318815e-05, + "loss": 0.0038, + "step": 3533 + }, + { + "epoch": 2.7306429812705155, + "grad_norm": 0.012442387640476227, + "learning_rate": 5.068785922697558e-05, + "loss": 0.0037, + "step": 3534 + }, + { + "epoch": 2.7314153311450085, + "grad_norm": 0.008895857259631157, + "learning_rate": 5.066088595841313e-05, + "loss": 0.0035, + "step": 3535 + }, + { + "epoch": 2.732187681019502, + "grad_norm": 0.007561968639492989, + "learning_rate": 5.063391249748275e-05, + "loss": 0.0038, + "step": 3536 + }, + { + "epoch": 2.732960030893995, + "grad_norm": 0.011665033176541328, + "learning_rate": 5.0606938852035756e-05, + "loss": 0.0034, + "step": 3537 + }, + { + "epoch": 2.733732380768488, + "grad_norm": 0.013502943329513073, + "learning_rate": 5.057996502992355e-05, + "loss": 0.0034, + "step": 3538 + }, + { + "epoch": 2.734504730642981, + "grad_norm": 0.009370699524879456, + "learning_rate": 5.055299103899751e-05, + "loss": 0.0031, + "step": 3539 + }, + { + "epoch": 2.7352770805174744, + "grad_norm": 0.013993117958307266, + "learning_rate": 5.052601688710914e-05, + "loss": 0.0035, + "step": 3540 + }, + { + "epoch": 2.736049430391968, + "grad_norm": 0.013802869245409966, + "learning_rate": 5.049904258210999e-05, + "loss": 0.0039, + "step": 3541 + }, + { + "epoch": 2.7368217802664607, + "grad_norm": 0.0092271463945508, + "learning_rate": 5.047206813185158e-05, + "loss": 0.004, + "step": 3542 + }, + { + "epoch": 2.7375941301409537, + "grad_norm": 0.014459514059126377, + "learning_rate": 5.044509354418555e-05, + "loss": 0.0039, + "step": 3543 + }, + { + "epoch": 2.738366480015447, + "grad_norm": 0.014967890456318855, + "learning_rate": 5.041811882696356e-05, + "loss": 0.004, + "step": 3544 + }, + { + "epoch": 2.7391388298899404, + "grad_norm": 0.007593820337206125, + "learning_rate": 5.039114398803726e-05, + "loss": 0.0039, + "step": 3545 + }, + { + "epoch": 2.7399111797644333, + "grad_norm": 0.009915469214320183, + "learning_rate": 5.0364169035258414e-05, + "loss": 0.0037, + "step": 3546 + }, + { + "epoch": 2.7406835296389263, + "grad_norm": 0.019180258736014366, + "learning_rate": 5.033719397647875e-05, + "loss": 0.0039, + "step": 3547 + }, + { + "epoch": 2.7414558795134196, + "grad_norm": 0.01391974650323391, + "learning_rate": 5.031021881955007e-05, + "loss": 0.0037, + "step": 3548 + }, + { + "epoch": 2.742228229387913, + "grad_norm": 0.007851621136069298, + "learning_rate": 5.028324357232419e-05, + "loss": 0.0038, + "step": 3549 + }, + { + "epoch": 2.743000579262406, + "grad_norm": 0.016175515949726105, + "learning_rate": 5.025626824265294e-05, + "loss": 0.0036, + "step": 3550 + }, + { + "epoch": 2.743772929136899, + "grad_norm": 0.009228870272636414, + "learning_rate": 5.022929283838821e-05, + "loss": 0.0038, + "step": 3551 + }, + { + "epoch": 2.7445452790113922, + "grad_norm": 0.022975003346800804, + "learning_rate": 5.020231736738187e-05, + "loss": 0.004, + "step": 3552 + }, + { + "epoch": 2.745317628885885, + "grad_norm": 0.013293186202645302, + "learning_rate": 5.0175341837485835e-05, + "loss": 0.0039, + "step": 3553 + }, + { + "epoch": 2.7460899787603785, + "grad_norm": 0.020917177200317383, + "learning_rate": 5.014836625655201e-05, + "loss": 0.0035, + "step": 3554 + }, + { + "epoch": 2.7468623286348715, + "grad_norm": 0.027095790952444077, + "learning_rate": 5.0121390632432376e-05, + "loss": 0.0038, + "step": 3555 + }, + { + "epoch": 2.747634678509365, + "grad_norm": 0.027216967195272446, + "learning_rate": 5.0094414972978857e-05, + "loss": 0.0042, + "step": 3556 + }, + { + "epoch": 2.7484070283838578, + "grad_norm": 0.008903938345611095, + "learning_rate": 5.0067439286043414e-05, + "loss": 0.0038, + "step": 3557 + }, + { + "epoch": 2.749179378258351, + "grad_norm": 0.0110005559399724, + "learning_rate": 5.0040463579478036e-05, + "loss": 0.004, + "step": 3558 + }, + { + "epoch": 2.749951728132844, + "grad_norm": 0.026676757261157036, + "learning_rate": 5.001348786113468e-05, + "loss": 0.0038, + "step": 3559 + }, + { + "epoch": 2.7507240780073374, + "grad_norm": 0.0195772722363472, + "learning_rate": 4.998651213886533e-05, + "loss": 0.0041, + "step": 3560 + }, + { + "epoch": 2.7514964278818304, + "grad_norm": 0.015704307705163956, + "learning_rate": 4.995953642052197e-05, + "loss": 0.0036, + "step": 3561 + }, + { + "epoch": 2.7522687777563237, + "grad_norm": 0.009365121833980083, + "learning_rate": 4.993256071395659e-05, + "loss": 0.0041, + "step": 3562 + }, + { + "epoch": 2.7530411276308167, + "grad_norm": 0.013832955621182919, + "learning_rate": 4.990558502702115e-05, + "loss": 0.0032, + "step": 3563 + }, + { + "epoch": 2.75381347750531, + "grad_norm": 0.01772594451904297, + "learning_rate": 4.987860936756762e-05, + "loss": 0.0041, + "step": 3564 + }, + { + "epoch": 2.754585827379803, + "grad_norm": 0.01973775029182434, + "learning_rate": 4.985163374344799e-05, + "loss": 0.0039, + "step": 3565 + }, + { + "epoch": 2.7553581772542963, + "grad_norm": 0.006610201671719551, + "learning_rate": 4.9824658162514183e-05, + "loss": 0.0033, + "step": 3566 + }, + { + "epoch": 2.7561305271287893, + "grad_norm": 0.01677447371184826, + "learning_rate": 4.9797682632618134e-05, + "loss": 0.0037, + "step": 3567 + }, + { + "epoch": 2.7569028770032826, + "grad_norm": 0.019037803635001183, + "learning_rate": 4.9770707161611806e-05, + "loss": 0.0036, + "step": 3568 + }, + { + "epoch": 2.7576752268777756, + "grad_norm": 0.011565999127924442, + "learning_rate": 4.974373175734707e-05, + "loss": 0.0034, + "step": 3569 + }, + { + "epoch": 2.758447576752269, + "grad_norm": 0.019641969352960587, + "learning_rate": 4.9716756427675816e-05, + "loss": 0.0036, + "step": 3570 + }, + { + "epoch": 2.759219926626762, + "grad_norm": 0.017786724492907524, + "learning_rate": 4.9689781180449935e-05, + "loss": 0.0038, + "step": 3571 + }, + { + "epoch": 2.7599922765012552, + "grad_norm": 0.010379564017057419, + "learning_rate": 4.966280602352127e-05, + "loss": 0.004, + "step": 3572 + }, + { + "epoch": 2.760764626375748, + "grad_norm": 0.014459546655416489, + "learning_rate": 4.963583096474159e-05, + "loss": 0.0039, + "step": 3573 + }, + { + "epoch": 2.761536976250241, + "grad_norm": 0.009286866523325443, + "learning_rate": 4.960885601196274e-05, + "loss": 0.0038, + "step": 3574 + }, + { + "epoch": 2.7623093261247345, + "grad_norm": 0.008317566476762295, + "learning_rate": 4.958188117303646e-05, + "loss": 0.0037, + "step": 3575 + }, + { + "epoch": 2.763081675999228, + "grad_norm": 0.009271105751395226, + "learning_rate": 4.955490645581446e-05, + "loss": 0.0036, + "step": 3576 + }, + { + "epoch": 2.7638540258737208, + "grad_norm": 0.012317502871155739, + "learning_rate": 4.952793186814842e-05, + "loss": 0.0042, + "step": 3577 + }, + { + "epoch": 2.7646263757482137, + "grad_norm": 0.015943629667162895, + "learning_rate": 4.950095741789003e-05, + "loss": 0.0041, + "step": 3578 + }, + { + "epoch": 2.765398725622707, + "grad_norm": 0.011819128878414631, + "learning_rate": 4.9473983112890865e-05, + "loss": 0.0037, + "step": 3579 + }, + { + "epoch": 2.7661710754972004, + "grad_norm": 0.012603342533111572, + "learning_rate": 4.9447008961002495e-05, + "loss": 0.0038, + "step": 3580 + }, + { + "epoch": 2.7669434253716934, + "grad_norm": 0.014154767617583275, + "learning_rate": 4.942003497007647e-05, + "loss": 0.004, + "step": 3581 + }, + { + "epoch": 2.7677157752461863, + "grad_norm": 0.01127578690648079, + "learning_rate": 4.939306114796426e-05, + "loss": 0.0039, + "step": 3582 + }, + { + "epoch": 2.7684881251206797, + "grad_norm": 0.009542621672153473, + "learning_rate": 4.936608750251726e-05, + "loss": 0.0037, + "step": 3583 + }, + { + "epoch": 2.769260474995173, + "grad_norm": 0.008893107995390892, + "learning_rate": 4.933911404158688e-05, + "loss": 0.004, + "step": 3584 + }, + { + "epoch": 2.770032824869666, + "grad_norm": 0.01047214213758707, + "learning_rate": 4.931214077302445e-05, + "loss": 0.0034, + "step": 3585 + }, + { + "epoch": 2.770805174744159, + "grad_norm": 0.011454110965132713, + "learning_rate": 4.928516770468119e-05, + "loss": 0.0038, + "step": 3586 + }, + { + "epoch": 2.7715775246186523, + "grad_norm": 0.009302152320742607, + "learning_rate": 4.925819484440836e-05, + "loss": 0.0036, + "step": 3587 + }, + { + "epoch": 2.7723498744931456, + "grad_norm": 0.008845715783536434, + "learning_rate": 4.923122220005709e-05, + "loss": 0.0035, + "step": 3588 + }, + { + "epoch": 2.7731222243676386, + "grad_norm": 0.011453505605459213, + "learning_rate": 4.920424977947844e-05, + "loss": 0.0038, + "step": 3589 + }, + { + "epoch": 2.7738945742421315, + "grad_norm": 0.014364033006131649, + "learning_rate": 4.9177277590523464e-05, + "loss": 0.0038, + "step": 3590 + }, + { + "epoch": 2.774666924116625, + "grad_norm": 0.009715999476611614, + "learning_rate": 4.9150305641043096e-05, + "loss": 0.0037, + "step": 3591 + }, + { + "epoch": 2.7754392739911182, + "grad_norm": 0.011447625234723091, + "learning_rate": 4.912333393888824e-05, + "loss": 0.0039, + "step": 3592 + }, + { + "epoch": 2.776211623865611, + "grad_norm": 0.01904742605984211, + "learning_rate": 4.909636249190968e-05, + "loss": 0.003, + "step": 3593 + }, + { + "epoch": 2.776983973740104, + "grad_norm": 0.010695524513721466, + "learning_rate": 4.906939130795815e-05, + "loss": 0.0046, + "step": 3594 + }, + { + "epoch": 2.7777563236145975, + "grad_norm": 0.007318440359085798, + "learning_rate": 4.904242039488435e-05, + "loss": 0.0034, + "step": 3595 + }, + { + "epoch": 2.778528673489091, + "grad_norm": 0.007491338532418013, + "learning_rate": 4.901544976053882e-05, + "loss": 0.0037, + "step": 3596 + }, + { + "epoch": 2.7793010233635838, + "grad_norm": 0.012006079778075218, + "learning_rate": 4.8988479412772074e-05, + "loss": 0.0039, + "step": 3597 + }, + { + "epoch": 2.7800733732380767, + "grad_norm": 0.009866135194897652, + "learning_rate": 4.896150935943454e-05, + "loss": 0.0039, + "step": 3598 + }, + { + "epoch": 2.78084572311257, + "grad_norm": 0.012143891304731369, + "learning_rate": 4.893453960837653e-05, + "loss": 0.0039, + "step": 3599 + }, + { + "epoch": 2.781618072987063, + "grad_norm": 0.008424376137554646, + "learning_rate": 4.8907570167448294e-05, + "loss": 0.0037, + "step": 3600 + }, + { + "epoch": 2.7823904228615564, + "grad_norm": 0.012690001167356968, + "learning_rate": 4.8880601044499984e-05, + "loss": 0.0035, + "step": 3601 + }, + { + "epoch": 2.7831627727360493, + "grad_norm": 0.006906248163431883, + "learning_rate": 4.885363224738168e-05, + "loss": 0.003, + "step": 3602 + }, + { + "epoch": 2.7839351226105427, + "grad_norm": 0.009611250832676888, + "learning_rate": 4.8826663783943314e-05, + "loss": 0.0042, + "step": 3603 + }, + { + "epoch": 2.7847074724850356, + "grad_norm": 0.01498124934732914, + "learning_rate": 4.8799695662034764e-05, + "loss": 0.0039, + "step": 3604 + }, + { + "epoch": 2.785479822359529, + "grad_norm": 0.009965925477445126, + "learning_rate": 4.877272788950582e-05, + "loss": 0.0037, + "step": 3605 + }, + { + "epoch": 2.786252172234022, + "grad_norm": 0.009315697476267815, + "learning_rate": 4.874576047420612e-05, + "loss": 0.0043, + "step": 3606 + }, + { + "epoch": 2.7870245221085153, + "grad_norm": 0.008523601107299328, + "learning_rate": 4.8718793423985235e-05, + "loss": 0.0033, + "step": 3607 + }, + { + "epoch": 2.787796871983008, + "grad_norm": 0.010219255462288857, + "learning_rate": 4.869182674669264e-05, + "loss": 0.0039, + "step": 3608 + }, + { + "epoch": 2.7885692218575016, + "grad_norm": 0.011321432888507843, + "learning_rate": 4.866486045017766e-05, + "loss": 0.0035, + "step": 3609 + }, + { + "epoch": 2.7893415717319945, + "grad_norm": 0.009535894729197025, + "learning_rate": 4.8637894542289535e-05, + "loss": 0.0035, + "step": 3610 + }, + { + "epoch": 2.790113921606488, + "grad_norm": 0.011316893622279167, + "learning_rate": 4.8610929030877405e-05, + "loss": 0.0034, + "step": 3611 + }, + { + "epoch": 2.790886271480981, + "grad_norm": 0.009314599446952343, + "learning_rate": 4.858396392379028e-05, + "loss": 0.0038, + "step": 3612 + }, + { + "epoch": 2.791658621355474, + "grad_norm": 0.008782761171460152, + "learning_rate": 4.8556999228877023e-05, + "loss": 0.0032, + "step": 3613 + }, + { + "epoch": 2.792430971229967, + "grad_norm": 0.010352780111134052, + "learning_rate": 4.8530034953986426e-05, + "loss": 0.0038, + "step": 3614 + }, + { + "epoch": 2.7932033211044605, + "grad_norm": 0.013563552871346474, + "learning_rate": 4.850307110696715e-05, + "loss": 0.0041, + "step": 3615 + }, + { + "epoch": 2.7939756709789534, + "grad_norm": 0.01034247875213623, + "learning_rate": 4.8476107695667686e-05, + "loss": 0.0034, + "step": 3616 + }, + { + "epoch": 2.7947480208534468, + "grad_norm": 0.008694970980286598, + "learning_rate": 4.844914472793646e-05, + "loss": 0.0039, + "step": 3617 + }, + { + "epoch": 2.7955203707279397, + "grad_norm": 0.02558278851211071, + "learning_rate": 4.842218221162174e-05, + "loss": 0.0038, + "step": 3618 + }, + { + "epoch": 2.796292720602433, + "grad_norm": 0.013983160257339478, + "learning_rate": 4.839522015457162e-05, + "loss": 0.0045, + "step": 3619 + }, + { + "epoch": 2.797065070476926, + "grad_norm": 0.01667140983045101, + "learning_rate": 4.836825856463416e-05, + "loss": 0.0038, + "step": 3620 + }, + { + "epoch": 2.7978374203514194, + "grad_norm": 0.008836453780531883, + "learning_rate": 4.834129744965719e-05, + "loss": 0.0039, + "step": 3621 + }, + { + "epoch": 2.7986097702259123, + "grad_norm": 0.014575183391571045, + "learning_rate": 4.831433681748847e-05, + "loss": 0.0036, + "step": 3622 + }, + { + "epoch": 2.7993821201004057, + "grad_norm": 0.026069698855280876, + "learning_rate": 4.828737667597557e-05, + "loss": 0.0044, + "step": 3623 + }, + { + "epoch": 2.8001544699748986, + "grad_norm": 0.00918895099312067, + "learning_rate": 4.8260417032965915e-05, + "loss": 0.0037, + "step": 3624 + }, + { + "epoch": 2.8009268198493915, + "grad_norm": 0.019233256578445435, + "learning_rate": 4.8233457896306853e-05, + "loss": 0.0033, + "step": 3625 + }, + { + "epoch": 2.801699169723885, + "grad_norm": 0.017233747988939285, + "learning_rate": 4.8206499273845505e-05, + "loss": 0.0039, + "step": 3626 + }, + { + "epoch": 2.8024715195983783, + "grad_norm": 0.018169382587075233, + "learning_rate": 4.817954117342887e-05, + "loss": 0.004, + "step": 3627 + }, + { + "epoch": 2.803243869472871, + "grad_norm": 0.011694537475705147, + "learning_rate": 4.8152583602903846e-05, + "loss": 0.0037, + "step": 3628 + }, + { + "epoch": 2.804016219347364, + "grad_norm": 0.010528466664254665, + "learning_rate": 4.812562657011706e-05, + "loss": 0.0039, + "step": 3629 + }, + { + "epoch": 2.8047885692218575, + "grad_norm": 0.02013496682047844, + "learning_rate": 4.80986700829151e-05, + "loss": 0.0037, + "step": 3630 + }, + { + "epoch": 2.805560919096351, + "grad_norm": 0.023819101974368095, + "learning_rate": 4.807171414914432e-05, + "loss": 0.004, + "step": 3631 + }, + { + "epoch": 2.806333268970844, + "grad_norm": 0.04680509865283966, + "learning_rate": 4.804475877665099e-05, + "loss": 0.004, + "step": 3632 + }, + { + "epoch": 2.8071056188453367, + "grad_norm": 0.0160503126680851, + "learning_rate": 4.8017803973281115e-05, + "loss": 0.0035, + "step": 3633 + }, + { + "epoch": 2.80787796871983, + "grad_norm": 0.02833872102200985, + "learning_rate": 4.79908497468806e-05, + "loss": 0.005, + "step": 3634 + }, + { + "epoch": 2.8086503185943235, + "grad_norm": 0.011657784692943096, + "learning_rate": 4.79638961052952e-05, + "loss": 0.0031, + "step": 3635 + }, + { + "epoch": 2.8094226684688164, + "grad_norm": 0.010503173805773258, + "learning_rate": 4.793694305637043e-05, + "loss": 0.0036, + "step": 3636 + }, + { + "epoch": 2.8101950183433093, + "grad_norm": 0.019909678027033806, + "learning_rate": 4.790999060795168e-05, + "loss": 0.0036, + "step": 3637 + }, + { + "epoch": 2.8109673682178027, + "grad_norm": 0.01406138576567173, + "learning_rate": 4.788303876788418e-05, + "loss": 0.0038, + "step": 3638 + }, + { + "epoch": 2.811739718092296, + "grad_norm": 0.009920360520482063, + "learning_rate": 4.7856087544012936e-05, + "loss": 0.0038, + "step": 3639 + }, + { + "epoch": 2.812512067966789, + "grad_norm": 0.016837509348988533, + "learning_rate": 4.7829136944182786e-05, + "loss": 0.0034, + "step": 3640 + }, + { + "epoch": 2.813284417841282, + "grad_norm": 0.009123590774834156, + "learning_rate": 4.7802186976238434e-05, + "loss": 0.0036, + "step": 3641 + }, + { + "epoch": 2.8140567677157753, + "grad_norm": 0.00945031177252531, + "learning_rate": 4.777523764802435e-05, + "loss": 0.0035, + "step": 3642 + }, + { + "epoch": 2.8148291175902687, + "grad_norm": 0.0078117093071341515, + "learning_rate": 4.7748288967384815e-05, + "loss": 0.004, + "step": 3643 + }, + { + "epoch": 2.8156014674647616, + "grad_norm": 0.01575397700071335, + "learning_rate": 4.772134094216396e-05, + "loss": 0.0038, + "step": 3644 + }, + { + "epoch": 2.8163738173392545, + "grad_norm": 0.025902222841978073, + "learning_rate": 4.7694393580205706e-05, + "loss": 0.0042, + "step": 3645 + }, + { + "epoch": 2.817146167213748, + "grad_norm": 0.008085416629910469, + "learning_rate": 4.766744688935376e-05, + "loss": 0.004, + "step": 3646 + }, + { + "epoch": 2.817918517088241, + "grad_norm": 0.017281895503401756, + "learning_rate": 4.764050087745167e-05, + "loss": 0.0041, + "step": 3647 + }, + { + "epoch": 2.818690866962734, + "grad_norm": 0.009094124659895897, + "learning_rate": 4.7613555552342756e-05, + "loss": 0.0036, + "step": 3648 + }, + { + "epoch": 2.819463216837227, + "grad_norm": 0.008755745366215706, + "learning_rate": 4.758661092187015e-05, + "loss": 0.0034, + "step": 3649 + }, + { + "epoch": 2.8202355667117205, + "grad_norm": 0.01971414126455784, + "learning_rate": 4.75596669938768e-05, + "loss": 0.0035, + "step": 3650 + }, + { + "epoch": 2.8210079165862134, + "grad_norm": 0.020349010825157166, + "learning_rate": 4.7532723776205403e-05, + "loss": 0.0038, + "step": 3651 + }, + { + "epoch": 2.821780266460707, + "grad_norm": 0.01196723897010088, + "learning_rate": 4.750578127669852e-05, + "loss": 0.0046, + "step": 3652 + }, + { + "epoch": 2.8225526163351997, + "grad_norm": 0.0126083018258214, + "learning_rate": 4.747883950319844e-05, + "loss": 0.0036, + "step": 3653 + }, + { + "epoch": 2.823324966209693, + "grad_norm": 0.015116846188902855, + "learning_rate": 4.745189846354724e-05, + "loss": 0.0042, + "step": 3654 + }, + { + "epoch": 2.824097316084186, + "grad_norm": 0.009867295622825623, + "learning_rate": 4.742495816558686e-05, + "loss": 0.0038, + "step": 3655 + }, + { + "epoch": 2.8248696659586794, + "grad_norm": 0.015143278986215591, + "learning_rate": 4.739801861715891e-05, + "loss": 0.0037, + "step": 3656 + }, + { + "epoch": 2.8256420158331723, + "grad_norm": 0.008129570633172989, + "learning_rate": 4.737107982610488e-05, + "loss": 0.0034, + "step": 3657 + }, + { + "epoch": 2.8264143657076657, + "grad_norm": 0.012850960716605186, + "learning_rate": 4.7344141800265987e-05, + "loss": 0.0039, + "step": 3658 + }, + { + "epoch": 2.8271867155821586, + "grad_norm": 0.014351118355989456, + "learning_rate": 4.731720454748323e-05, + "loss": 0.0042, + "step": 3659 + }, + { + "epoch": 2.827959065456652, + "grad_norm": 0.009173419326543808, + "learning_rate": 4.729026807559741e-05, + "loss": 0.0039, + "step": 3660 + }, + { + "epoch": 2.828731415331145, + "grad_norm": 0.0093191834166646, + "learning_rate": 4.7263332392449094e-05, + "loss": 0.004, + "step": 3661 + }, + { + "epoch": 2.8295037652056383, + "grad_norm": 0.007909681648015976, + "learning_rate": 4.723639750587857e-05, + "loss": 0.0032, + "step": 3662 + }, + { + "epoch": 2.830276115080131, + "grad_norm": 0.012290013954043388, + "learning_rate": 4.720946342372596e-05, + "loss": 0.0035, + "step": 3663 + }, + { + "epoch": 2.8310484649546246, + "grad_norm": 0.01055870484560728, + "learning_rate": 4.718253015383111e-05, + "loss": 0.0035, + "step": 3664 + }, + { + "epoch": 2.8318208148291175, + "grad_norm": 0.009200993925333023, + "learning_rate": 4.715559770403368e-05, + "loss": 0.0037, + "step": 3665 + }, + { + "epoch": 2.832593164703611, + "grad_norm": 0.008415855467319489, + "learning_rate": 4.712866608217301e-05, + "loss": 0.0037, + "step": 3666 + }, + { + "epoch": 2.833365514578104, + "grad_norm": 0.016717437654733658, + "learning_rate": 4.710173529608825e-05, + "loss": 0.0037, + "step": 3667 + }, + { + "epoch": 2.834137864452597, + "grad_norm": 0.01344334241002798, + "learning_rate": 4.707480535361835e-05, + "loss": 0.0037, + "step": 3668 + }, + { + "epoch": 2.83491021432709, + "grad_norm": 0.008762449026107788, + "learning_rate": 4.7047876262601906e-05, + "loss": 0.0033, + "step": 3669 + }, + { + "epoch": 2.8356825642015835, + "grad_norm": 0.014935498125851154, + "learning_rate": 4.7020948030877346e-05, + "loss": 0.0033, + "step": 3670 + }, + { + "epoch": 2.8364549140760764, + "grad_norm": 0.011157029308378696, + "learning_rate": 4.699402066628285e-05, + "loss": 0.0037, + "step": 3671 + }, + { + "epoch": 2.8372272639505693, + "grad_norm": 0.008815648965537548, + "learning_rate": 4.696709417665629e-05, + "loss": 0.0035, + "step": 3672 + }, + { + "epoch": 2.8379996138250627, + "grad_norm": 0.012259745970368385, + "learning_rate": 4.6940168569835324e-05, + "loss": 0.0038, + "step": 3673 + }, + { + "epoch": 2.838771963699556, + "grad_norm": 0.013299521990120411, + "learning_rate": 4.6913243853657356e-05, + "loss": 0.0035, + "step": 3674 + }, + { + "epoch": 2.839544313574049, + "grad_norm": 0.012281878851354122, + "learning_rate": 4.688632003595954e-05, + "loss": 0.0041, + "step": 3675 + }, + { + "epoch": 2.840316663448542, + "grad_norm": 0.008334542624652386, + "learning_rate": 4.6859397124578684e-05, + "loss": 0.0035, + "step": 3676 + }, + { + "epoch": 2.8410890133230353, + "grad_norm": 0.009239261038601398, + "learning_rate": 4.683247512735146e-05, + "loss": 0.004, + "step": 3677 + }, + { + "epoch": 2.8418613631975287, + "grad_norm": 0.010223859921097755, + "learning_rate": 4.68055540521142e-05, + "loss": 0.0034, + "step": 3678 + }, + { + "epoch": 2.8426337130720216, + "grad_norm": 0.007821132428944111, + "learning_rate": 4.6778633906702945e-05, + "loss": 0.0032, + "step": 3679 + }, + { + "epoch": 2.8434060629465145, + "grad_norm": 0.009876555763185024, + "learning_rate": 4.6751714698953536e-05, + "loss": 0.0038, + "step": 3680 + }, + { + "epoch": 2.844178412821008, + "grad_norm": 0.010038082487881184, + "learning_rate": 4.6724796436701496e-05, + "loss": 0.0037, + "step": 3681 + }, + { + "epoch": 2.8449507626955013, + "grad_norm": 0.009820781648159027, + "learning_rate": 4.6697879127782064e-05, + "loss": 0.004, + "step": 3682 + }, + { + "epoch": 2.845723112569994, + "grad_norm": 0.008844954892992973, + "learning_rate": 4.667096278003021e-05, + "loss": 0.0034, + "step": 3683 + }, + { + "epoch": 2.846495462444487, + "grad_norm": 0.012067172676324844, + "learning_rate": 4.6644047401280664e-05, + "loss": 0.0041, + "step": 3684 + }, + { + "epoch": 2.8472678123189805, + "grad_norm": 0.009092634543776512, + "learning_rate": 4.6617132999367844e-05, + "loss": 0.0034, + "step": 3685 + }, + { + "epoch": 2.848040162193474, + "grad_norm": 0.009050995111465454, + "learning_rate": 4.659021958212585e-05, + "loss": 0.004, + "step": 3686 + }, + { + "epoch": 2.848812512067967, + "grad_norm": 0.02117002010345459, + "learning_rate": 4.656330715738855e-05, + "loss": 0.0036, + "step": 3687 + }, + { + "epoch": 2.8495848619424597, + "grad_norm": 0.010049187578260899, + "learning_rate": 4.65363957329895e-05, + "loss": 0.0037, + "step": 3688 + }, + { + "epoch": 2.850357211816953, + "grad_norm": 0.011103583499789238, + "learning_rate": 4.650948531676195e-05, + "loss": 0.0034, + "step": 3689 + }, + { + "epoch": 2.8511295616914465, + "grad_norm": 0.009087778627872467, + "learning_rate": 4.6482575916538885e-05, + "loss": 0.0033, + "step": 3690 + }, + { + "epoch": 2.8519019115659394, + "grad_norm": 0.011496257968246937, + "learning_rate": 4.6455667540152984e-05, + "loss": 0.0038, + "step": 3691 + }, + { + "epoch": 2.8526742614404323, + "grad_norm": 0.009799706749618053, + "learning_rate": 4.642876019543661e-05, + "loss": 0.0042, + "step": 3692 + }, + { + "epoch": 2.8534466113149257, + "grad_norm": 0.009193724021315575, + "learning_rate": 4.640185389022186e-05, + "loss": 0.0041, + "step": 3693 + }, + { + "epoch": 2.8542189611894186, + "grad_norm": 0.006750943139195442, + "learning_rate": 4.637494863234048e-05, + "loss": 0.0036, + "step": 3694 + }, + { + "epoch": 2.854991311063912, + "grad_norm": 0.020224595442414284, + "learning_rate": 4.6348044429623986e-05, + "loss": 0.0038, + "step": 3695 + }, + { + "epoch": 2.855763660938405, + "grad_norm": 0.007359965238720179, + "learning_rate": 4.632114128990351e-05, + "loss": 0.0033, + "step": 3696 + }, + { + "epoch": 2.8565360108128983, + "grad_norm": 0.009418971836566925, + "learning_rate": 4.629423922100989e-05, + "loss": 0.004, + "step": 3697 + }, + { + "epoch": 2.8573083606873912, + "grad_norm": 0.007746202405542135, + "learning_rate": 4.626733823077372e-05, + "loss": 0.0034, + "step": 3698 + }, + { + "epoch": 2.8580807105618846, + "grad_norm": 0.011419177986681461, + "learning_rate": 4.624043832702519e-05, + "loss": 0.0042, + "step": 3699 + }, + { + "epoch": 2.8588530604363775, + "grad_norm": 0.014163573272526264, + "learning_rate": 4.62135395175942e-05, + "loss": 0.0037, + "step": 3700 + }, + { + "epoch": 2.859625410310871, + "grad_norm": 0.010520074516534805, + "learning_rate": 4.618664181031039e-05, + "loss": 0.004, + "step": 3701 + }, + { + "epoch": 2.860397760185364, + "grad_norm": 0.01044239941984415, + "learning_rate": 4.615974521300299e-05, + "loss": 0.0041, + "step": 3702 + }, + { + "epoch": 2.861170110059857, + "grad_norm": 0.0111888712272048, + "learning_rate": 4.613284973350096e-05, + "loss": 0.0036, + "step": 3703 + }, + { + "epoch": 2.86194245993435, + "grad_norm": 0.010530322790145874, + "learning_rate": 4.6105955379632936e-05, + "loss": 0.0038, + "step": 3704 + }, + { + "epoch": 2.8627148098088435, + "grad_norm": 0.0078042857348918915, + "learning_rate": 4.6079062159227225e-05, + "loss": 0.0039, + "step": 3705 + }, + { + "epoch": 2.8634871596833364, + "grad_norm": 0.009296247735619545, + "learning_rate": 4.605217008011176e-05, + "loss": 0.0037, + "step": 3706 + }, + { + "epoch": 2.86425950955783, + "grad_norm": 0.012510127387940884, + "learning_rate": 4.6025279150114185e-05, + "loss": 0.0034, + "step": 3707 + }, + { + "epoch": 2.8650318594323227, + "grad_norm": 0.010169142857193947, + "learning_rate": 4.599838937706183e-05, + "loss": 0.0032, + "step": 3708 + }, + { + "epoch": 2.865804209306816, + "grad_norm": 0.007322363089770079, + "learning_rate": 4.597150076878163e-05, + "loss": 0.0032, + "step": 3709 + }, + { + "epoch": 2.866576559181309, + "grad_norm": 0.010280710645020008, + "learning_rate": 4.5944613333100195e-05, + "loss": 0.0038, + "step": 3710 + }, + { + "epoch": 2.8673489090558024, + "grad_norm": 0.008753606118261814, + "learning_rate": 4.5917727077843856e-05, + "loss": 0.0034, + "step": 3711 + }, + { + "epoch": 2.8681212589302953, + "grad_norm": 0.012354779988527298, + "learning_rate": 4.5890842010838504e-05, + "loss": 0.0034, + "step": 3712 + }, + { + "epoch": 2.8688936088047887, + "grad_norm": 0.009628398343920708, + "learning_rate": 4.586395813990974e-05, + "loss": 0.004, + "step": 3713 + }, + { + "epoch": 2.8696659586792816, + "grad_norm": 0.01673782244324684, + "learning_rate": 4.583707547288285e-05, + "loss": 0.0035, + "step": 3714 + }, + { + "epoch": 2.870438308553775, + "grad_norm": 0.018587319180369377, + "learning_rate": 4.58101940175827e-05, + "loss": 0.0042, + "step": 3715 + }, + { + "epoch": 2.871210658428268, + "grad_norm": 0.013515827246010303, + "learning_rate": 4.57833137818338e-05, + "loss": 0.0042, + "step": 3716 + }, + { + "epoch": 2.8719830083027613, + "grad_norm": 0.010271236300468445, + "learning_rate": 4.575643477346039e-05, + "loss": 0.0042, + "step": 3717 + }, + { + "epoch": 2.8727553581772542, + "grad_norm": 0.015272647142410278, + "learning_rate": 4.5729557000286296e-05, + "loss": 0.0037, + "step": 3718 + }, + { + "epoch": 2.873527708051747, + "grad_norm": 0.011950243264436722, + "learning_rate": 4.570268047013495e-05, + "loss": 0.0039, + "step": 3719 + }, + { + "epoch": 2.8743000579262405, + "grad_norm": 0.008711851201951504, + "learning_rate": 4.567580519082948e-05, + "loss": 0.0034, + "step": 3720 + }, + { + "epoch": 2.875072407800734, + "grad_norm": 0.011989584192633629, + "learning_rate": 4.564893117019266e-05, + "loss": 0.0032, + "step": 3721 + }, + { + "epoch": 2.875844757675227, + "grad_norm": 0.016780303791165352, + "learning_rate": 4.5622058416046805e-05, + "loss": 0.0039, + "step": 3722 + }, + { + "epoch": 2.8766171075497198, + "grad_norm": 0.008650211617350578, + "learning_rate": 4.559518693621397e-05, + "loss": 0.0033, + "step": 3723 + }, + { + "epoch": 2.877389457424213, + "grad_norm": 0.007928716950118542, + "learning_rate": 4.556831673851578e-05, + "loss": 0.0039, + "step": 3724 + }, + { + "epoch": 2.8781618072987065, + "grad_norm": 0.014184878207743168, + "learning_rate": 4.554144783077352e-05, + "loss": 0.0037, + "step": 3725 + }, + { + "epoch": 2.8789341571731994, + "grad_norm": 0.00899726152420044, + "learning_rate": 4.551458022080806e-05, + "loss": 0.0033, + "step": 3726 + }, + { + "epoch": 2.8797065070476924, + "grad_norm": 0.006811381317675114, + "learning_rate": 4.54877139164399e-05, + "loss": 0.0036, + "step": 3727 + }, + { + "epoch": 2.8804788569221857, + "grad_norm": 0.007462832145392895, + "learning_rate": 4.5460848925489206e-05, + "loss": 0.0035, + "step": 3728 + }, + { + "epoch": 2.881251206796679, + "grad_norm": 0.006794137414544821, + "learning_rate": 4.5433985255775705e-05, + "loss": 0.0033, + "step": 3729 + }, + { + "epoch": 2.882023556671172, + "grad_norm": 0.014314756728708744, + "learning_rate": 4.540712291511875e-05, + "loss": 0.0039, + "step": 3730 + }, + { + "epoch": 2.882795906545665, + "grad_norm": 0.00948580913245678, + "learning_rate": 4.538026191133736e-05, + "loss": 0.0037, + "step": 3731 + }, + { + "epoch": 2.8835682564201583, + "grad_norm": 0.00665636220946908, + "learning_rate": 4.535340225225009e-05, + "loss": 0.0036, + "step": 3732 + }, + { + "epoch": 2.8843406062946517, + "grad_norm": 0.010609416291117668, + "learning_rate": 4.5326543945675136e-05, + "loss": 0.0039, + "step": 3733 + }, + { + "epoch": 2.8851129561691446, + "grad_norm": 0.009739573113620281, + "learning_rate": 4.529968699943033e-05, + "loss": 0.0042, + "step": 3734 + }, + { + "epoch": 2.8858853060436376, + "grad_norm": 0.011525544337928295, + "learning_rate": 4.527283142133306e-05, + "loss": 0.0039, + "step": 3735 + }, + { + "epoch": 2.886657655918131, + "grad_norm": 0.009056229144334793, + "learning_rate": 4.524597721920034e-05, + "loss": 0.0038, + "step": 3736 + }, + { + "epoch": 2.8874300057926243, + "grad_norm": 0.010327644646167755, + "learning_rate": 4.521912440084877e-05, + "loss": 0.0032, + "step": 3737 + }, + { + "epoch": 2.8882023556671172, + "grad_norm": 0.00854728277772665, + "learning_rate": 4.519227297409458e-05, + "loss": 0.0034, + "step": 3738 + }, + { + "epoch": 2.88897470554161, + "grad_norm": 0.016212280839681625, + "learning_rate": 4.5165422946753546e-05, + "loss": 0.003, + "step": 3739 + }, + { + "epoch": 2.8897470554161035, + "grad_norm": 0.010443037375807762, + "learning_rate": 4.513857432664107e-05, + "loss": 0.0037, + "step": 3740 + }, + { + "epoch": 2.8905194052905965, + "grad_norm": 0.008367817848920822, + "learning_rate": 4.5111727121572156e-05, + "loss": 0.0036, + "step": 3741 + }, + { + "epoch": 2.89129175516509, + "grad_norm": 0.010264886543154716, + "learning_rate": 4.508488133936135e-05, + "loss": 0.0035, + "step": 3742 + }, + { + "epoch": 2.8920641050395828, + "grad_norm": 0.014999981969594955, + "learning_rate": 4.505803698782281e-05, + "loss": 0.0037, + "step": 3743 + }, + { + "epoch": 2.892836454914076, + "grad_norm": 0.009613145142793655, + "learning_rate": 4.50311940747703e-05, + "loss": 0.0037, + "step": 3744 + }, + { + "epoch": 2.893608804788569, + "grad_norm": 0.008537041023373604, + "learning_rate": 4.500435260801715e-05, + "loss": 0.0037, + "step": 3745 + }, + { + "epoch": 2.8943811546630624, + "grad_norm": 0.013094227761030197, + "learning_rate": 4.497751259537622e-05, + "loss": 0.0034, + "step": 3746 + }, + { + "epoch": 2.8951535045375554, + "grad_norm": 0.014482134021818638, + "learning_rate": 4.495067404466002e-05, + "loss": 0.0035, + "step": 3747 + }, + { + "epoch": 2.8959258544120487, + "grad_norm": 0.011909635737538338, + "learning_rate": 4.492383696368061e-05, + "loss": 0.0035, + "step": 3748 + }, + { + "epoch": 2.8966982042865417, + "grad_norm": 0.01069470215588808, + "learning_rate": 4.489700136024959e-05, + "loss": 0.0036, + "step": 3749 + }, + { + "epoch": 2.897470554161035, + "grad_norm": 0.008906307630240917, + "learning_rate": 4.487016724217817e-05, + "loss": 0.0034, + "step": 3750 + }, + { + "epoch": 2.898242904035528, + "grad_norm": 0.009430951438844204, + "learning_rate": 4.484333461727712e-05, + "loss": 0.0035, + "step": 3751 + }, + { + "epoch": 2.8990152539100214, + "grad_norm": 0.009873777627944946, + "learning_rate": 4.481650349335675e-05, + "loss": 0.0033, + "step": 3752 + }, + { + "epoch": 2.8997876037845143, + "grad_norm": 0.0071570733562111855, + "learning_rate": 4.478967387822697e-05, + "loss": 0.0036, + "step": 3753 + }, + { + "epoch": 2.9005599536590077, + "grad_norm": 0.010955625213682652, + "learning_rate": 4.476284577969722e-05, + "loss": 0.0037, + "step": 3754 + }, + { + "epoch": 2.9013323035335006, + "grad_norm": 0.008684576489031315, + "learning_rate": 4.473601920557653e-05, + "loss": 0.0035, + "step": 3755 + }, + { + "epoch": 2.902104653407994, + "grad_norm": 0.0068272012285888195, + "learning_rate": 4.470919416367344e-05, + "loss": 0.0035, + "step": 3756 + }, + { + "epoch": 2.902877003282487, + "grad_norm": 0.008168086409568787, + "learning_rate": 4.468237066179609e-05, + "loss": 0.0039, + "step": 3757 + }, + { + "epoch": 2.9036493531569803, + "grad_norm": 0.010575056076049805, + "learning_rate": 4.465554870775216e-05, + "loss": 0.0033, + "step": 3758 + }, + { + "epoch": 2.904421703031473, + "grad_norm": 0.008518901653587818, + "learning_rate": 4.462872830934886e-05, + "loss": 0.0036, + "step": 3759 + }, + { + "epoch": 2.9051940529059666, + "grad_norm": 0.010122411884367466, + "learning_rate": 4.460190947439294e-05, + "loss": 0.0036, + "step": 3760 + }, + { + "epoch": 2.9059664027804595, + "grad_norm": 0.014623790979385376, + "learning_rate": 4.457509221069077e-05, + "loss": 0.004, + "step": 3761 + }, + { + "epoch": 2.906738752654953, + "grad_norm": 0.008349210023880005, + "learning_rate": 4.454827652604815e-05, + "loss": 0.0035, + "step": 3762 + }, + { + "epoch": 2.907511102529446, + "grad_norm": 0.015230577439069748, + "learning_rate": 4.452146242827051e-05, + "loss": 0.003, + "step": 3763 + }, + { + "epoch": 2.908283452403939, + "grad_norm": 0.013238787651062012, + "learning_rate": 4.4494649925162765e-05, + "loss": 0.0036, + "step": 3764 + }, + { + "epoch": 2.909055802278432, + "grad_norm": 0.011168868280947208, + "learning_rate": 4.4467839024529425e-05, + "loss": 0.0034, + "step": 3765 + }, + { + "epoch": 2.909828152152925, + "grad_norm": 0.009999548085033894, + "learning_rate": 4.4441029734174456e-05, + "loss": 0.0035, + "step": 3766 + }, + { + "epoch": 2.9106005020274184, + "grad_norm": 0.024452844634652138, + "learning_rate": 4.44142220619014e-05, + "loss": 0.0037, + "step": 3767 + }, + { + "epoch": 2.9113728519019118, + "grad_norm": 0.018125806003808975, + "learning_rate": 4.438741601551335e-05, + "loss": 0.0039, + "step": 3768 + }, + { + "epoch": 2.9121452017764047, + "grad_norm": 0.009755297563970089, + "learning_rate": 4.436061160281287e-05, + "loss": 0.0036, + "step": 3769 + }, + { + "epoch": 2.9129175516508976, + "grad_norm": 0.008294413797557354, + "learning_rate": 4.433380883160208e-05, + "loss": 0.0037, + "step": 3770 + }, + { + "epoch": 2.913689901525391, + "grad_norm": 0.01363943051546812, + "learning_rate": 4.4307007709682645e-05, + "loss": 0.0037, + "step": 3771 + }, + { + "epoch": 2.9144622513998844, + "grad_norm": 0.010448621585965157, + "learning_rate": 4.4280208244855695e-05, + "loss": 0.0037, + "step": 3772 + }, + { + "epoch": 2.9152346012743773, + "grad_norm": 0.015610828064382076, + "learning_rate": 4.4253410444921904e-05, + "loss": 0.0035, + "step": 3773 + }, + { + "epoch": 2.91600695114887, + "grad_norm": 0.008082563057541847, + "learning_rate": 4.422661431768149e-05, + "loss": 0.0041, + "step": 3774 + }, + { + "epoch": 2.9167793010233636, + "grad_norm": 0.01200641319155693, + "learning_rate": 4.419981987093415e-05, + "loss": 0.0039, + "step": 3775 + }, + { + "epoch": 2.917551650897857, + "grad_norm": 0.01190141774713993, + "learning_rate": 4.4173027112479076e-05, + "loss": 0.0039, + "step": 3776 + }, + { + "epoch": 2.91832400077235, + "grad_norm": 0.01024235412478447, + "learning_rate": 4.414623605011502e-05, + "loss": 0.0038, + "step": 3777 + }, + { + "epoch": 2.919096350646843, + "grad_norm": 0.009192753583192825, + "learning_rate": 4.411944669164022e-05, + "loss": 0.0043, + "step": 3778 + }, + { + "epoch": 2.919868700521336, + "grad_norm": 0.011779602617025375, + "learning_rate": 4.4092659044852366e-05, + "loss": 0.0036, + "step": 3779 + }, + { + "epoch": 2.9206410503958296, + "grad_norm": 0.011895395815372467, + "learning_rate": 4.406587311754874e-05, + "loss": 0.0038, + "step": 3780 + }, + { + "epoch": 2.9214134002703225, + "grad_norm": 0.007252044975757599, + "learning_rate": 4.4039088917526075e-05, + "loss": 0.0036, + "step": 3781 + }, + { + "epoch": 2.9221857501448154, + "grad_norm": 0.009333361871540546, + "learning_rate": 4.401230645258056e-05, + "loss": 0.0039, + "step": 3782 + }, + { + "epoch": 2.922958100019309, + "grad_norm": 0.008951723575592041, + "learning_rate": 4.398552573050797e-05, + "loss": 0.0033, + "step": 3783 + }, + { + "epoch": 2.923730449893802, + "grad_norm": 0.009833808057010174, + "learning_rate": 4.39587467591035e-05, + "loss": 0.0034, + "step": 3784 + }, + { + "epoch": 2.924502799768295, + "grad_norm": 0.009983672760426998, + "learning_rate": 4.39319695461619e-05, + "loss": 0.0034, + "step": 3785 + }, + { + "epoch": 2.925275149642788, + "grad_norm": 0.007449030876159668, + "learning_rate": 4.390519409947732e-05, + "loss": 0.0038, + "step": 3786 + }, + { + "epoch": 2.9260474995172814, + "grad_norm": 0.015035904943943024, + "learning_rate": 4.387842042684346e-05, + "loss": 0.0039, + "step": 3787 + }, + { + "epoch": 2.9268198493917743, + "grad_norm": 0.022417740896344185, + "learning_rate": 4.385164853605354e-05, + "loss": 0.0035, + "step": 3788 + }, + { + "epoch": 2.9275921992662677, + "grad_norm": 0.011493900790810585, + "learning_rate": 4.382487843490012e-05, + "loss": 0.0036, + "step": 3789 + }, + { + "epoch": 2.9283645491407606, + "grad_norm": 0.016776161268353462, + "learning_rate": 4.3798110131175396e-05, + "loss": 0.0036, + "step": 3790 + }, + { + "epoch": 2.929136899015254, + "grad_norm": 0.021847503259778023, + "learning_rate": 4.377134363267097e-05, + "loss": 0.0038, + "step": 3791 + }, + { + "epoch": 2.929909248889747, + "grad_norm": 0.015478880144655704, + "learning_rate": 4.374457894717788e-05, + "loss": 0.0043, + "step": 3792 + }, + { + "epoch": 2.9306815987642403, + "grad_norm": 0.009824762120842934, + "learning_rate": 4.371781608248672e-05, + "loss": 0.0036, + "step": 3793 + }, + { + "epoch": 2.931453948638733, + "grad_norm": 0.011714638210833073, + "learning_rate": 4.3691055046387484e-05, + "loss": 0.0041, + "step": 3794 + }, + { + "epoch": 2.9322262985132266, + "grad_norm": 0.020869752392172813, + "learning_rate": 4.366429584666971e-05, + "loss": 0.0037, + "step": 3795 + }, + { + "epoch": 2.9329986483877195, + "grad_norm": 0.023330258205533028, + "learning_rate": 4.363753849112231e-05, + "loss": 0.0037, + "step": 3796 + }, + { + "epoch": 2.933770998262213, + "grad_norm": 0.011580456048250198, + "learning_rate": 4.361078298753371e-05, + "loss": 0.004, + "step": 3797 + }, + { + "epoch": 2.934543348136706, + "grad_norm": 0.017411785200238228, + "learning_rate": 4.3584029343691805e-05, + "loss": 0.0036, + "step": 3798 + }, + { + "epoch": 2.935315698011199, + "grad_norm": 0.01577589102089405, + "learning_rate": 4.355727756738393e-05, + "loss": 0.0041, + "step": 3799 + }, + { + "epoch": 2.936088047885692, + "grad_norm": 0.02870682254433632, + "learning_rate": 4.353052766639687e-05, + "loss": 0.0041, + "step": 3800 + }, + { + "epoch": 2.9368603977601855, + "grad_norm": 0.008758428506553173, + "learning_rate": 4.3503779648516896e-05, + "loss": 0.0042, + "step": 3801 + }, + { + "epoch": 2.9376327476346784, + "grad_norm": 0.018319813534617424, + "learning_rate": 4.3477033521529686e-05, + "loss": 0.0041, + "step": 3802 + }, + { + "epoch": 2.938405097509172, + "grad_norm": 0.012039601802825928, + "learning_rate": 4.34502892932204e-05, + "loss": 0.0032, + "step": 3803 + }, + { + "epoch": 2.9391774473836647, + "grad_norm": 0.021439632400870323, + "learning_rate": 4.342354697137364e-05, + "loss": 0.0037, + "step": 3804 + }, + { + "epoch": 2.939949797258158, + "grad_norm": 0.011071378365159035, + "learning_rate": 4.339680656377347e-05, + "loss": 0.0038, + "step": 3805 + }, + { + "epoch": 2.940722147132651, + "grad_norm": 0.011408895254135132, + "learning_rate": 4.3370068078203326e-05, + "loss": 0.0038, + "step": 3806 + }, + { + "epoch": 2.9414944970071444, + "grad_norm": 0.01955719292163849, + "learning_rate": 4.3343331522446175e-05, + "loss": 0.004, + "step": 3807 + }, + { + "epoch": 2.9422668468816373, + "grad_norm": 0.01151325274258852, + "learning_rate": 4.331659690428438e-05, + "loss": 0.0035, + "step": 3808 + }, + { + "epoch": 2.9430391967561307, + "grad_norm": 0.01454999204725027, + "learning_rate": 4.328986423149972e-05, + "loss": 0.0034, + "step": 3809 + }, + { + "epoch": 2.9438115466306236, + "grad_norm": 0.010412359610199928, + "learning_rate": 4.326313351187344e-05, + "loss": 0.0035, + "step": 3810 + }, + { + "epoch": 2.944583896505117, + "grad_norm": 0.008726815693080425, + "learning_rate": 4.323640475318623e-05, + "loss": 0.0035, + "step": 3811 + }, + { + "epoch": 2.94535624637961, + "grad_norm": 0.010784812271595001, + "learning_rate": 4.320967796321815e-05, + "loss": 0.0037, + "step": 3812 + }, + { + "epoch": 2.946128596254103, + "grad_norm": 0.008588474243879318, + "learning_rate": 4.3182953149748745e-05, + "loss": 0.0037, + "step": 3813 + }, + { + "epoch": 2.946900946128596, + "grad_norm": 0.013325064443051815, + "learning_rate": 4.315623032055694e-05, + "loss": 0.0037, + "step": 3814 + }, + { + "epoch": 2.9476732960030896, + "grad_norm": 0.007062042597681284, + "learning_rate": 4.3129509483421157e-05, + "loss": 0.0033, + "step": 3815 + }, + { + "epoch": 2.9484456458775825, + "grad_norm": 0.00899225752800703, + "learning_rate": 4.310279064611912e-05, + "loss": 0.0039, + "step": 3816 + }, + { + "epoch": 2.9492179957520754, + "grad_norm": 0.008381965570151806, + "learning_rate": 4.307607381642808e-05, + "loss": 0.0035, + "step": 3817 + }, + { + "epoch": 2.949990345626569, + "grad_norm": 0.009166887030005455, + "learning_rate": 4.304935900212466e-05, + "loss": 0.0036, + "step": 3818 + }, + { + "epoch": 2.950762695501062, + "grad_norm": 0.010656706057488918, + "learning_rate": 4.302264621098486e-05, + "loss": 0.0036, + "step": 3819 + }, + { + "epoch": 2.951535045375555, + "grad_norm": 0.007658713962882757, + "learning_rate": 4.299593545078416e-05, + "loss": 0.0039, + "step": 3820 + }, + { + "epoch": 2.952307395250048, + "grad_norm": 0.0119070615619421, + "learning_rate": 4.296922672929742e-05, + "loss": 0.0036, + "step": 3821 + }, + { + "epoch": 2.9530797451245414, + "grad_norm": 0.015731122344732285, + "learning_rate": 4.294252005429888e-05, + "loss": 0.0036, + "step": 3822 + }, + { + "epoch": 2.953852094999035, + "grad_norm": 0.008103919215500355, + "learning_rate": 4.2915815433562224e-05, + "loss": 0.0037, + "step": 3823 + }, + { + "epoch": 2.9546244448735277, + "grad_norm": 0.010937350802123547, + "learning_rate": 4.28891128748605e-05, + "loss": 0.0037, + "step": 3824 + }, + { + "epoch": 2.9553967947480206, + "grad_norm": 0.023103781044483185, + "learning_rate": 4.2862412385966233e-05, + "loss": 0.0037, + "step": 3825 + }, + { + "epoch": 2.956169144622514, + "grad_norm": 0.009802715852856636, + "learning_rate": 4.283571397465124e-05, + "loss": 0.0035, + "step": 3826 + }, + { + "epoch": 2.9569414944970074, + "grad_norm": 0.010085641406476498, + "learning_rate": 4.2809017648686775e-05, + "loss": 0.0032, + "step": 3827 + }, + { + "epoch": 2.9577138443715003, + "grad_norm": 0.026056107133626938, + "learning_rate": 4.278232341584355e-05, + "loss": 0.0038, + "step": 3828 + }, + { + "epoch": 2.9584861942459932, + "grad_norm": 0.015221530571579933, + "learning_rate": 4.2755631283891555e-05, + "loss": 0.0041, + "step": 3829 + }, + { + "epoch": 2.9592585441204866, + "grad_norm": 0.009898537769913673, + "learning_rate": 4.272894126060024e-05, + "loss": 0.004, + "step": 3830 + }, + { + "epoch": 2.96003089399498, + "grad_norm": 0.019119389355182648, + "learning_rate": 4.270225335373846e-05, + "loss": 0.0037, + "step": 3831 + }, + { + "epoch": 2.960803243869473, + "grad_norm": 0.01502426527440548, + "learning_rate": 4.267556757107437e-05, + "loss": 0.0042, + "step": 3832 + }, + { + "epoch": 2.961575593743966, + "grad_norm": 0.01574239507317543, + "learning_rate": 4.264888392037557e-05, + "loss": 0.0037, + "step": 3833 + }, + { + "epoch": 2.962347943618459, + "grad_norm": 0.008315333165228367, + "learning_rate": 4.262220240940905e-05, + "loss": 0.0034, + "step": 3834 + }, + { + "epoch": 2.9631202934929526, + "grad_norm": 0.022397365421056747, + "learning_rate": 4.259552304594114e-05, + "loss": 0.0037, + "step": 3835 + }, + { + "epoch": 2.9638926433674455, + "grad_norm": 0.011859522201120853, + "learning_rate": 4.256884583773754e-05, + "loss": 0.0039, + "step": 3836 + }, + { + "epoch": 2.9646649932419384, + "grad_norm": 0.010500779375433922, + "learning_rate": 4.254217079256337e-05, + "loss": 0.0037, + "step": 3837 + }, + { + "epoch": 2.965437343116432, + "grad_norm": 0.009014081209897995, + "learning_rate": 4.2515497918183086e-05, + "loss": 0.0043, + "step": 3838 + }, + { + "epoch": 2.9662096929909247, + "grad_norm": 0.007724730763584375, + "learning_rate": 4.2488827222360487e-05, + "loss": 0.0037, + "step": 3839 + }, + { + "epoch": 2.966982042865418, + "grad_norm": 0.010915543884038925, + "learning_rate": 4.246215871285879e-05, + "loss": 0.004, + "step": 3840 + }, + { + "epoch": 2.967754392739911, + "grad_norm": 0.021229470148682594, + "learning_rate": 4.243549239744057e-05, + "loss": 0.0037, + "step": 3841 + }, + { + "epoch": 2.9685267426144044, + "grad_norm": 0.011718028225004673, + "learning_rate": 4.2408828283867727e-05, + "loss": 0.004, + "step": 3842 + }, + { + "epoch": 2.9692990924888973, + "grad_norm": 0.007771989796310663, + "learning_rate": 4.238216637990152e-05, + "loss": 0.0035, + "step": 3843 + }, + { + "epoch": 2.9700714423633907, + "grad_norm": 0.017838140949606895, + "learning_rate": 4.2355506693302635e-05, + "loss": 0.0034, + "step": 3844 + }, + { + "epoch": 2.9708437922378836, + "grad_norm": 0.030527664348483086, + "learning_rate": 4.232884923183103e-05, + "loss": 0.0037, + "step": 3845 + }, + { + "epoch": 2.971616142112377, + "grad_norm": 0.009560693055391312, + "learning_rate": 4.230219400324604e-05, + "loss": 0.0034, + "step": 3846 + }, + { + "epoch": 2.97238849198687, + "grad_norm": 0.022195350378751755, + "learning_rate": 4.2275541015306384e-05, + "loss": 0.004, + "step": 3847 + }, + { + "epoch": 2.9731608418613633, + "grad_norm": 0.021256253123283386, + "learning_rate": 4.2248890275770096e-05, + "loss": 0.0041, + "step": 3848 + }, + { + "epoch": 2.9739331917358562, + "grad_norm": 0.007295470684766769, + "learning_rate": 4.222224179239455e-05, + "loss": 0.0033, + "step": 3849 + }, + { + "epoch": 2.9747055416103496, + "grad_norm": 0.009077059105038643, + "learning_rate": 4.219559557293647e-05, + "loss": 0.0036, + "step": 3850 + }, + { + "epoch": 2.9754778914848425, + "grad_norm": 0.009278069250285625, + "learning_rate": 4.216895162515197e-05, + "loss": 0.0039, + "step": 3851 + }, + { + "epoch": 2.976250241359336, + "grad_norm": 0.014763458631932735, + "learning_rate": 4.21423099567964e-05, + "loss": 0.0036, + "step": 3852 + }, + { + "epoch": 2.977022591233829, + "grad_norm": 0.017561612650752068, + "learning_rate": 4.211567057562454e-05, + "loss": 0.0041, + "step": 3853 + }, + { + "epoch": 2.977794941108322, + "grad_norm": 0.016626615077257156, + "learning_rate": 4.2089033489390483e-05, + "loss": 0.0038, + "step": 3854 + }, + { + "epoch": 2.978567290982815, + "grad_norm": 0.010037774220108986, + "learning_rate": 4.20623987058476e-05, + "loss": 0.004, + "step": 3855 + }, + { + "epoch": 2.9793396408573085, + "grad_norm": 0.008447550237178802, + "learning_rate": 4.2035766232748664e-05, + "loss": 0.0037, + "step": 3856 + }, + { + "epoch": 2.9801119907318014, + "grad_norm": 0.01665206253528595, + "learning_rate": 4.2009136077845725e-05, + "loss": 0.004, + "step": 3857 + }, + { + "epoch": 2.980884340606295, + "grad_norm": 0.009643926285207272, + "learning_rate": 4.198250824889021e-05, + "loss": 0.0036, + "step": 3858 + }, + { + "epoch": 2.9816566904807877, + "grad_norm": 0.0100821228697896, + "learning_rate": 4.1955882753632806e-05, + "loss": 0.0038, + "step": 3859 + }, + { + "epoch": 2.9824290403552807, + "grad_norm": 0.013579235412180424, + "learning_rate": 4.1929259599823556e-05, + "loss": 0.0039, + "step": 3860 + }, + { + "epoch": 2.983201390229774, + "grad_norm": 0.007406320888549089, + "learning_rate": 4.1902638795211836e-05, + "loss": 0.0032, + "step": 3861 + }, + { + "epoch": 2.9839737401042674, + "grad_norm": 0.014929025433957577, + "learning_rate": 4.18760203475463e-05, + "loss": 0.0038, + "step": 3862 + }, + { + "epoch": 2.9847460899787603, + "grad_norm": 0.010060267522931099, + "learning_rate": 4.184940426457492e-05, + "loss": 0.0037, + "step": 3863 + }, + { + "epoch": 2.9855184398532533, + "grad_norm": 0.00800973642617464, + "learning_rate": 4.182279055404504e-05, + "loss": 0.0034, + "step": 3864 + }, + { + "epoch": 2.9862907897277466, + "grad_norm": 0.012781900353729725, + "learning_rate": 4.1796179223703225e-05, + "loss": 0.0033, + "step": 3865 + }, + { + "epoch": 2.98706313960224, + "grad_norm": 0.007149236276745796, + "learning_rate": 4.17695702812954e-05, + "loss": 0.0034, + "step": 3866 + }, + { + "epoch": 2.987835489476733, + "grad_norm": 0.007939104922115803, + "learning_rate": 4.174296373456681e-05, + "loss": 0.0035, + "step": 3867 + }, + { + "epoch": 2.988607839351226, + "grad_norm": 0.008486042730510235, + "learning_rate": 4.1716359591261964e-05, + "loss": 0.0034, + "step": 3868 + }, + { + "epoch": 2.9893801892257192, + "grad_norm": 0.01805216819047928, + "learning_rate": 4.168975785912467e-05, + "loss": 0.0042, + "step": 3869 + }, + { + "epoch": 2.9901525391002126, + "grad_norm": 0.013067138381302357, + "learning_rate": 4.166315854589805e-05, + "loss": 0.0036, + "step": 3870 + }, + { + "epoch": 2.9909248889747055, + "grad_norm": 0.01489514485001564, + "learning_rate": 4.1636561659324565e-05, + "loss": 0.0039, + "step": 3871 + }, + { + "epoch": 2.9916972388491985, + "grad_norm": 0.015756510198116302, + "learning_rate": 4.160996720714587e-05, + "loss": 0.0041, + "step": 3872 + }, + { + "epoch": 2.992469588723692, + "grad_norm": 0.012170984409749508, + "learning_rate": 4.1583375197102985e-05, + "loss": 0.0038, + "step": 3873 + }, + { + "epoch": 2.993241938598185, + "grad_norm": 0.013005238026380539, + "learning_rate": 4.155678563693623e-05, + "loss": 0.0039, + "step": 3874 + }, + { + "epoch": 2.994014288472678, + "grad_norm": 0.013325825333595276, + "learning_rate": 4.153019853438515e-05, + "loss": 0.0038, + "step": 3875 + }, + { + "epoch": 2.994786638347171, + "grad_norm": 0.01133064366877079, + "learning_rate": 4.15036138971886e-05, + "loss": 0.0036, + "step": 3876 + }, + { + "epoch": 2.9955589882216644, + "grad_norm": 0.00813497044146061, + "learning_rate": 4.147703173308477e-05, + "loss": 0.0039, + "step": 3877 + }, + { + "epoch": 2.996331338096158, + "grad_norm": 0.011179421097040176, + "learning_rate": 4.145045204981106e-05, + "loss": 0.004, + "step": 3878 + }, + { + "epoch": 2.9971036879706507, + "grad_norm": 0.011623677797615528, + "learning_rate": 4.142387485510416e-05, + "loss": 0.0039, + "step": 3879 + }, + { + "epoch": 2.9978760378451437, + "grad_norm": 0.009855561889708042, + "learning_rate": 4.139730015670006e-05, + "loss": 0.0039, + "step": 3880 + }, + { + "epoch": 2.998648387719637, + "grad_norm": 0.006677902769297361, + "learning_rate": 4.137072796233404e-05, + "loss": 0.0034, + "step": 3881 + }, + { + "epoch": 2.9994207375941304, + "grad_norm": 0.008960848674178123, + "learning_rate": 4.1344158279740574e-05, + "loss": 0.0037, + "step": 3882 + }, + { + "epoch": 3.000772349874493, + "grad_norm": 0.04119117185473442, + "learning_rate": 4.131759111665349e-05, + "loss": 0.0073, + "step": 3883 + }, + { + "epoch": 3.0015446997489863, + "grad_norm": 0.0093689551576972, + "learning_rate": 4.1291026480805845e-05, + "loss": 0.0033, + "step": 3884 + }, + { + "epoch": 3.0023170496234792, + "grad_norm": 0.013623661361634731, + "learning_rate": 4.126446437992993e-05, + "loss": 0.0033, + "step": 3885 + }, + { + "epoch": 3.0030893994979726, + "grad_norm": 0.013883264735341072, + "learning_rate": 4.1237904821757374e-05, + "loss": 0.0034, + "step": 3886 + }, + { + "epoch": 3.0038617493724655, + "grad_norm": 0.010895649902522564, + "learning_rate": 4.121134781401899e-05, + "loss": 0.0034, + "step": 3887 + }, + { + "epoch": 3.004634099246959, + "grad_norm": 0.010138518176972866, + "learning_rate": 4.118479336444492e-05, + "loss": 0.0034, + "step": 3888 + }, + { + "epoch": 3.005406449121452, + "grad_norm": 0.010731861926615238, + "learning_rate": 4.1158241480764483e-05, + "loss": 0.0034, + "step": 3889 + }, + { + "epoch": 3.006178798995945, + "grad_norm": 0.009778124280273914, + "learning_rate": 4.113169217070629e-05, + "loss": 0.0038, + "step": 3890 + }, + { + "epoch": 3.006951148870438, + "grad_norm": 0.011736907996237278, + "learning_rate": 4.110514544199825e-05, + "loss": 0.0035, + "step": 3891 + }, + { + "epoch": 3.0077234987449315, + "grad_norm": 0.01224558986723423, + "learning_rate": 4.107860130236743e-05, + "loss": 0.0034, + "step": 3892 + }, + { + "epoch": 3.0084958486194244, + "grad_norm": 0.009522565640509129, + "learning_rate": 4.105205975954019e-05, + "loss": 0.0033, + "step": 3893 + }, + { + "epoch": 3.009268198493918, + "grad_norm": 0.01398923434317112, + "learning_rate": 4.102552082124217e-05, + "loss": 0.0031, + "step": 3894 + }, + { + "epoch": 3.0100405483684107, + "grad_norm": 0.014189448207616806, + "learning_rate": 4.099898449519817e-05, + "loss": 0.0037, + "step": 3895 + }, + { + "epoch": 3.010812898242904, + "grad_norm": 0.011617259122431278, + "learning_rate": 4.097245078913229e-05, + "loss": 0.0037, + "step": 3896 + }, + { + "epoch": 3.011585248117397, + "grad_norm": 0.00984710082411766, + "learning_rate": 4.094591971076783e-05, + "loss": 0.003, + "step": 3897 + }, + { + "epoch": 3.0123575979918904, + "grad_norm": 0.012181985192000866, + "learning_rate": 4.09193912678274e-05, + "loss": 0.0029, + "step": 3898 + }, + { + "epoch": 3.0131299478663833, + "grad_norm": 0.010762302204966545, + "learning_rate": 4.089286546803275e-05, + "loss": 0.0035, + "step": 3899 + }, + { + "epoch": 3.0139022977408767, + "grad_norm": 0.00949389673769474, + "learning_rate": 4.0866342319104884e-05, + "loss": 0.0033, + "step": 3900 + }, + { + "epoch": 3.0146746476153696, + "grad_norm": 0.014336561784148216, + "learning_rate": 4.083982182876409e-05, + "loss": 0.0033, + "step": 3901 + }, + { + "epoch": 3.015446997489863, + "grad_norm": 0.010391092859208584, + "learning_rate": 4.081330400472982e-05, + "loss": 0.0031, + "step": 3902 + }, + { + "epoch": 3.016219347364356, + "grad_norm": 0.010914616286754608, + "learning_rate": 4.078678885472076e-05, + "loss": 0.0037, + "step": 3903 + }, + { + "epoch": 3.0169916972388493, + "grad_norm": 0.00893909391015768, + "learning_rate": 4.0760276386454856e-05, + "loss": 0.0032, + "step": 3904 + }, + { + "epoch": 3.0177640471133422, + "grad_norm": 0.008087982423603535, + "learning_rate": 4.073376660764924e-05, + "loss": 0.0032, + "step": 3905 + }, + { + "epoch": 3.0185363969878356, + "grad_norm": 0.008801883086562157, + "learning_rate": 4.0707259526020244e-05, + "loss": 0.0033, + "step": 3906 + }, + { + "epoch": 3.0193087468623285, + "grad_norm": 0.007589471992105246, + "learning_rate": 4.068075514928347e-05, + "loss": 0.003, + "step": 3907 + }, + { + "epoch": 3.020081096736822, + "grad_norm": 0.009592815302312374, + "learning_rate": 4.065425348515369e-05, + "loss": 0.0036, + "step": 3908 + }, + { + "epoch": 3.020853446611315, + "grad_norm": 0.014320937916636467, + "learning_rate": 4.062775454134489e-05, + "loss": 0.003, + "step": 3909 + }, + { + "epoch": 3.021625796485808, + "grad_norm": 0.009289335459470749, + "learning_rate": 4.060125832557028e-05, + "loss": 0.0031, + "step": 3910 + }, + { + "epoch": 3.022398146360301, + "grad_norm": 0.00905862171202898, + "learning_rate": 4.0574764845542276e-05, + "loss": 0.0034, + "step": 3911 + }, + { + "epoch": 3.0231704962347945, + "grad_norm": 0.006691919639706612, + "learning_rate": 4.0548274108972464e-05, + "loss": 0.0031, + "step": 3912 + }, + { + "epoch": 3.0239428461092874, + "grad_norm": 0.014596517197787762, + "learning_rate": 4.052178612357169e-05, + "loss": 0.0031, + "step": 3913 + }, + { + "epoch": 3.024715195983781, + "grad_norm": 0.01569315977394581, + "learning_rate": 4.0495300897049957e-05, + "loss": 0.003, + "step": 3914 + }, + { + "epoch": 3.0254875458582737, + "grad_norm": 0.009716901928186417, + "learning_rate": 4.046881843711645e-05, + "loss": 0.0033, + "step": 3915 + }, + { + "epoch": 3.026259895732767, + "grad_norm": 0.00683940015733242, + "learning_rate": 4.04423387514796e-05, + "loss": 0.003, + "step": 3916 + }, + { + "epoch": 3.02703224560726, + "grad_norm": 0.009737824089825153, + "learning_rate": 4.0415861847846994e-05, + "loss": 0.0032, + "step": 3917 + }, + { + "epoch": 3.0278045954817534, + "grad_norm": 0.008718915283679962, + "learning_rate": 4.0389387733925434e-05, + "loss": 0.003, + "step": 3918 + }, + { + "epoch": 3.0285769453562463, + "grad_norm": 0.010790186002850533, + "learning_rate": 4.036291641742087e-05, + "loss": 0.0031, + "step": 3919 + }, + { + "epoch": 3.0293492952307397, + "grad_norm": 0.02050062268972397, + "learning_rate": 4.0336447906038466e-05, + "loss": 0.0032, + "step": 3920 + }, + { + "epoch": 3.0301216451052326, + "grad_norm": 0.010391199961304665, + "learning_rate": 4.030998220748261e-05, + "loss": 0.0029, + "step": 3921 + }, + { + "epoch": 3.030893994979726, + "grad_norm": 0.009953645057976246, + "learning_rate": 4.028351932945675e-05, + "loss": 0.003, + "step": 3922 + }, + { + "epoch": 3.031666344854219, + "grad_norm": 0.017743706703186035, + "learning_rate": 4.025705927966365e-05, + "loss": 0.0034, + "step": 3923 + }, + { + "epoch": 3.0324386947287123, + "grad_norm": 0.01259995810687542, + "learning_rate": 4.0230602065805176e-05, + "loss": 0.0033, + "step": 3924 + }, + { + "epoch": 3.0332110446032052, + "grad_norm": 0.007997802458703518, + "learning_rate": 4.0204147695582364e-05, + "loss": 0.003, + "step": 3925 + }, + { + "epoch": 3.0339833944776986, + "grad_norm": 0.009885910898447037, + "learning_rate": 4.0177696176695466e-05, + "loss": 0.0032, + "step": 3926 + }, + { + "epoch": 3.0347557443521915, + "grad_norm": 0.015047162771224976, + "learning_rate": 4.015124751684386e-05, + "loss": 0.0035, + "step": 3927 + }, + { + "epoch": 3.035528094226685, + "grad_norm": 0.010488315485417843, + "learning_rate": 4.0124801723726155e-05, + "loss": 0.0029, + "step": 3928 + }, + { + "epoch": 3.036300444101178, + "grad_norm": 0.011665630154311657, + "learning_rate": 4.0098358805040036e-05, + "loss": 0.003, + "step": 3929 + }, + { + "epoch": 3.037072793975671, + "grad_norm": 0.010001543909311295, + "learning_rate": 4.0071918768482406e-05, + "loss": 0.0035, + "step": 3930 + }, + { + "epoch": 3.037845143850164, + "grad_norm": 0.0158031415194273, + "learning_rate": 4.0045481621749345e-05, + "loss": 0.0031, + "step": 3931 + }, + { + "epoch": 3.038617493724657, + "grad_norm": 0.008115153759717941, + "learning_rate": 4.001904737253604e-05, + "loss": 0.003, + "step": 3932 + }, + { + "epoch": 3.0393898435991504, + "grad_norm": 0.010430651716887951, + "learning_rate": 3.999261602853686e-05, + "loss": 0.0031, + "step": 3933 + }, + { + "epoch": 3.0401621934736434, + "grad_norm": 0.008416597731411457, + "learning_rate": 3.9966187597445373e-05, + "loss": 0.003, + "step": 3934 + }, + { + "epoch": 3.0409345433481367, + "grad_norm": 0.007000258192420006, + "learning_rate": 3.993976208695421e-05, + "loss": 0.0035, + "step": 3935 + }, + { + "epoch": 3.0417068932226297, + "grad_norm": 0.016911238431930542, + "learning_rate": 3.99133395047552e-05, + "loss": 0.0028, + "step": 3936 + }, + { + "epoch": 3.042479243097123, + "grad_norm": 0.007231170777231455, + "learning_rate": 3.988691985853936e-05, + "loss": 0.0034, + "step": 3937 + }, + { + "epoch": 3.043251592971616, + "grad_norm": 0.009564080275595188, + "learning_rate": 3.986050315599678e-05, + "loss": 0.003, + "step": 3938 + }, + { + "epoch": 3.0440239428461093, + "grad_norm": 0.010936416685581207, + "learning_rate": 3.983408940481672e-05, + "loss": 0.0033, + "step": 3939 + }, + { + "epoch": 3.0447962927206023, + "grad_norm": 0.012983507476747036, + "learning_rate": 3.9807678612687596e-05, + "loss": 0.0037, + "step": 3940 + }, + { + "epoch": 3.0455686425950956, + "grad_norm": 0.009794589132070541, + "learning_rate": 3.9781270787296954e-05, + "loss": 0.0034, + "step": 3941 + }, + { + "epoch": 3.0463409924695886, + "grad_norm": 0.007537867408245802, + "learning_rate": 3.9754865936331455e-05, + "loss": 0.0032, + "step": 3942 + }, + { + "epoch": 3.047113342344082, + "grad_norm": 0.010640479624271393, + "learning_rate": 3.972846406747694e-05, + "loss": 0.0035, + "step": 3943 + }, + { + "epoch": 3.047885692218575, + "grad_norm": 0.02730044722557068, + "learning_rate": 3.9702065188418344e-05, + "loss": 0.0032, + "step": 3944 + }, + { + "epoch": 3.0486580420930682, + "grad_norm": 0.014430705457925797, + "learning_rate": 3.9675669306839724e-05, + "loss": 0.0029, + "step": 3945 + }, + { + "epoch": 3.049430391967561, + "grad_norm": 0.0166519396007061, + "learning_rate": 3.9649276430424306e-05, + "loss": 0.0036, + "step": 3946 + }, + { + "epoch": 3.0502027418420545, + "grad_norm": 0.01753905788064003, + "learning_rate": 3.962288656685441e-05, + "loss": 0.0034, + "step": 3947 + }, + { + "epoch": 3.0509750917165475, + "grad_norm": 0.027564140036702156, + "learning_rate": 3.959649972381152e-05, + "loss": 0.0032, + "step": 3948 + }, + { + "epoch": 3.051747441591041, + "grad_norm": 0.008522110059857368, + "learning_rate": 3.957011590897614e-05, + "loss": 0.0033, + "step": 3949 + }, + { + "epoch": 3.0525197914655338, + "grad_norm": 0.013643836602568626, + "learning_rate": 3.9543735130028015e-05, + "loss": 0.0034, + "step": 3950 + }, + { + "epoch": 3.053292141340027, + "grad_norm": 0.009889103472232819, + "learning_rate": 3.951735739464594e-05, + "loss": 0.0031, + "step": 3951 + }, + { + "epoch": 3.05406449121452, + "grad_norm": 0.023804178461432457, + "learning_rate": 3.949098271050782e-05, + "loss": 0.0034, + "step": 3952 + }, + { + "epoch": 3.0548368410890134, + "grad_norm": 0.02104552462697029, + "learning_rate": 3.9464611085290714e-05, + "loss": 0.004, + "step": 3953 + }, + { + "epoch": 3.0556091909635064, + "grad_norm": 0.0075318883173167706, + "learning_rate": 3.9438242526670754e-05, + "loss": 0.0028, + "step": 3954 + }, + { + "epoch": 3.0563815408379997, + "grad_norm": 0.023547343909740448, + "learning_rate": 3.941187704232318e-05, + "loss": 0.0033, + "step": 3955 + }, + { + "epoch": 3.0571538907124927, + "grad_norm": 0.030767524614930153, + "learning_rate": 3.938551463992235e-05, + "loss": 0.0035, + "step": 3956 + }, + { + "epoch": 3.057926240586986, + "grad_norm": 0.01460769958794117, + "learning_rate": 3.935915532714173e-05, + "loss": 0.0037, + "step": 3957 + }, + { + "epoch": 3.058698590461479, + "grad_norm": 0.008200748823583126, + "learning_rate": 3.933279911165389e-05, + "loss": 0.003, + "step": 3958 + }, + { + "epoch": 3.0594709403359723, + "grad_norm": 0.011066111735999584, + "learning_rate": 3.930644600113047e-05, + "loss": 0.0033, + "step": 3959 + }, + { + "epoch": 3.0602432902104653, + "grad_norm": 0.015494465827941895, + "learning_rate": 3.928009600324222e-05, + "loss": 0.0033, + "step": 3960 + }, + { + "epoch": 3.0610156400849586, + "grad_norm": 0.015704691410064697, + "learning_rate": 3.9253749125659005e-05, + "loss": 0.0034, + "step": 3961 + }, + { + "epoch": 3.0617879899594516, + "grad_norm": 0.011681852862238884, + "learning_rate": 3.9227405376049754e-05, + "loss": 0.003, + "step": 3962 + }, + { + "epoch": 3.062560339833945, + "grad_norm": 0.011715327389538288, + "learning_rate": 3.920106476208248e-05, + "loss": 0.0032, + "step": 3963 + }, + { + "epoch": 3.063332689708438, + "grad_norm": 0.008908935822546482, + "learning_rate": 3.917472729142435e-05, + "loss": 0.0027, + "step": 3964 + }, + { + "epoch": 3.0641050395829312, + "grad_norm": 0.0069875093176960945, + "learning_rate": 3.914839297174152e-05, + "loss": 0.003, + "step": 3965 + }, + { + "epoch": 3.064877389457424, + "grad_norm": 0.017726223915815353, + "learning_rate": 3.912206181069927e-05, + "loss": 0.0029, + "step": 3966 + }, + { + "epoch": 3.0656497393319175, + "grad_norm": 0.01484143827110529, + "learning_rate": 3.9095733815961986e-05, + "loss": 0.0032, + "step": 3967 + }, + { + "epoch": 3.0664220892064105, + "grad_norm": 0.013785186223685741, + "learning_rate": 3.906940899519312e-05, + "loss": 0.0031, + "step": 3968 + }, + { + "epoch": 3.067194439080904, + "grad_norm": 0.010157103650271893, + "learning_rate": 3.904308735605516e-05, + "loss": 0.0035, + "step": 3969 + }, + { + "epoch": 3.0679667889553968, + "grad_norm": 0.011927854269742966, + "learning_rate": 3.901676890620973e-05, + "loss": 0.0033, + "step": 3970 + }, + { + "epoch": 3.06873913882989, + "grad_norm": 0.02505023591220379, + "learning_rate": 3.8990453653317485e-05, + "loss": 0.0033, + "step": 3971 + }, + { + "epoch": 3.069511488704383, + "grad_norm": 0.0361270047724247, + "learning_rate": 3.896414160503814e-05, + "loss": 0.0034, + "step": 3972 + }, + { + "epoch": 3.0702838385788764, + "grad_norm": 0.010872379876673222, + "learning_rate": 3.8937832769030516e-05, + "loss": 0.0032, + "step": 3973 + }, + { + "epoch": 3.0710561884533694, + "grad_norm": 0.015281944535672665, + "learning_rate": 3.8911527152952496e-05, + "loss": 0.0036, + "step": 3974 + }, + { + "epoch": 3.0718285383278627, + "grad_norm": 0.007319580763578415, + "learning_rate": 3.888522476446097e-05, + "loss": 0.0033, + "step": 3975 + }, + { + "epoch": 3.0726008882023557, + "grad_norm": 0.022427208721637726, + "learning_rate": 3.885892561121194e-05, + "loss": 0.0035, + "step": 3976 + }, + { + "epoch": 3.073373238076849, + "grad_norm": 0.019485650584101677, + "learning_rate": 3.883262970086045e-05, + "loss": 0.0035, + "step": 3977 + }, + { + "epoch": 3.074145587951342, + "grad_norm": 0.009180407039821148, + "learning_rate": 3.880633704106066e-05, + "loss": 0.0031, + "step": 3978 + }, + { + "epoch": 3.074917937825835, + "grad_norm": 0.010681082494556904, + "learning_rate": 3.878004763946564e-05, + "loss": 0.0033, + "step": 3979 + }, + { + "epoch": 3.0756902877003283, + "grad_norm": 0.02105492167174816, + "learning_rate": 3.8753761503727656e-05, + "loss": 0.0036, + "step": 3980 + }, + { + "epoch": 3.076462637574821, + "grad_norm": 0.025038886815309525, + "learning_rate": 3.872747864149797e-05, + "loss": 0.0032, + "step": 3981 + }, + { + "epoch": 3.0772349874493146, + "grad_norm": 0.009126921184360981, + "learning_rate": 3.870119906042684e-05, + "loss": 0.0032, + "step": 3982 + }, + { + "epoch": 3.0780073373238075, + "grad_norm": 0.015841223299503326, + "learning_rate": 3.867492276816366e-05, + "loss": 0.0035, + "step": 3983 + }, + { + "epoch": 3.078779687198301, + "grad_norm": 0.02008405700325966, + "learning_rate": 3.8648649772356824e-05, + "loss": 0.0033, + "step": 3984 + }, + { + "epoch": 3.079552037072794, + "grad_norm": 0.013557328842580318, + "learning_rate": 3.862238008065374e-05, + "loss": 0.0034, + "step": 3985 + }, + { + "epoch": 3.080324386947287, + "grad_norm": 0.012391114607453346, + "learning_rate": 3.85961137007009e-05, + "loss": 0.0032, + "step": 3986 + }, + { + "epoch": 3.08109673682178, + "grad_norm": 0.011869030073285103, + "learning_rate": 3.8569850640143796e-05, + "loss": 0.004, + "step": 3987 + }, + { + "epoch": 3.0818690866962735, + "grad_norm": 0.01839050091803074, + "learning_rate": 3.8543590906627e-05, + "loss": 0.0033, + "step": 3988 + }, + { + "epoch": 3.0826414365707664, + "grad_norm": 0.022180140018463135, + "learning_rate": 3.851733450779406e-05, + "loss": 0.0033, + "step": 3989 + }, + { + "epoch": 3.0834137864452598, + "grad_norm": 0.010339627042412758, + "learning_rate": 3.8491081451287577e-05, + "loss": 0.0033, + "step": 3990 + }, + { + "epoch": 3.0841861363197527, + "grad_norm": 0.015834074467420578, + "learning_rate": 3.846483174474921e-05, + "loss": 0.0032, + "step": 3991 + }, + { + "epoch": 3.084958486194246, + "grad_norm": 0.01692846417427063, + "learning_rate": 3.843858539581959e-05, + "loss": 0.0035, + "step": 3992 + }, + { + "epoch": 3.085730836068739, + "grad_norm": 0.011141604743897915, + "learning_rate": 3.841234241213838e-05, + "loss": 0.0034, + "step": 3993 + }, + { + "epoch": 3.0865031859432324, + "grad_norm": 0.010867138393223286, + "learning_rate": 3.838610280134432e-05, + "loss": 0.0032, + "step": 3994 + }, + { + "epoch": 3.0872755358177253, + "grad_norm": 0.010458271950483322, + "learning_rate": 3.835986657107511e-05, + "loss": 0.003, + "step": 3995 + }, + { + "epoch": 3.0880478856922187, + "grad_norm": 0.010842096991837025, + "learning_rate": 3.8333633728967456e-05, + "loss": 0.0033, + "step": 3996 + }, + { + "epoch": 3.0888202355667116, + "grad_norm": 0.01667606830596924, + "learning_rate": 3.8307404282657134e-05, + "loss": 0.0037, + "step": 3997 + }, + { + "epoch": 3.089592585441205, + "grad_norm": 0.011396470479667187, + "learning_rate": 3.8281178239778915e-05, + "loss": 0.0033, + "step": 3998 + }, + { + "epoch": 3.090364935315698, + "grad_norm": 0.0088053522631526, + "learning_rate": 3.825495560796651e-05, + "loss": 0.0032, + "step": 3999 + }, + { + "epoch": 3.0911372851901913, + "grad_norm": 0.012660624459385872, + "learning_rate": 3.822873639485276e-05, + "loss": 0.0029, + "step": 4000 + }, + { + "epoch": 3.091909635064684, + "grad_norm": 0.010186905972659588, + "learning_rate": 3.820252060806941e-05, + "loss": 0.0032, + "step": 4001 + }, + { + "epoch": 3.0926819849391776, + "grad_norm": 0.01091783307492733, + "learning_rate": 3.817630825524723e-05, + "loss": 0.0034, + "step": 4002 + }, + { + "epoch": 3.0934543348136705, + "grad_norm": 0.008735897950828075, + "learning_rate": 3.8150099344016024e-05, + "loss": 0.003, + "step": 4003 + }, + { + "epoch": 3.094226684688164, + "grad_norm": 0.011613654904067516, + "learning_rate": 3.812389388200458e-05, + "loss": 0.0036, + "step": 4004 + }, + { + "epoch": 3.094999034562657, + "grad_norm": 0.013497716747224331, + "learning_rate": 3.8097691876840655e-05, + "loss": 0.0034, + "step": 4005 + }, + { + "epoch": 3.09577138443715, + "grad_norm": 0.00860733911395073, + "learning_rate": 3.807149333615101e-05, + "loss": 0.0029, + "step": 4006 + }, + { + "epoch": 3.096543734311643, + "grad_norm": 0.010404443368315697, + "learning_rate": 3.804529826756144e-05, + "loss": 0.003, + "step": 4007 + }, + { + "epoch": 3.0973160841861365, + "grad_norm": 0.010187176056206226, + "learning_rate": 3.8019106678696695e-05, + "loss": 0.0033, + "step": 4008 + }, + { + "epoch": 3.0980884340606294, + "grad_norm": 0.015230610966682434, + "learning_rate": 3.799291857718047e-05, + "loss": 0.0033, + "step": 4009 + }, + { + "epoch": 3.0988607839351228, + "grad_norm": 0.007961034774780273, + "learning_rate": 3.7966733970635526e-05, + "loss": 0.0032, + "step": 4010 + }, + { + "epoch": 3.0996331338096157, + "grad_norm": 0.014632215723395348, + "learning_rate": 3.794055286668358e-05, + "loss": 0.0031, + "step": 4011 + }, + { + "epoch": 3.100405483684109, + "grad_norm": 0.01650458574295044, + "learning_rate": 3.791437527294527e-05, + "loss": 0.0037, + "step": 4012 + }, + { + "epoch": 3.101177833558602, + "grad_norm": 0.01053455751389265, + "learning_rate": 3.7888201197040304e-05, + "loss": 0.0028, + "step": 4013 + }, + { + "epoch": 3.1019501834330954, + "grad_norm": 0.009459201246500015, + "learning_rate": 3.786203064658732e-05, + "loss": 0.0035, + "step": 4014 + }, + { + "epoch": 3.1027225333075883, + "grad_norm": 0.012510925531387329, + "learning_rate": 3.7835863629203904e-05, + "loss": 0.0031, + "step": 4015 + }, + { + "epoch": 3.1034948831820817, + "grad_norm": 0.013898443430662155, + "learning_rate": 3.780970015250667e-05, + "loss": 0.0038, + "step": 4016 + }, + { + "epoch": 3.1042672330565746, + "grad_norm": 0.017648473381996155, + "learning_rate": 3.778354022411115e-05, + "loss": 0.0029, + "step": 4017 + }, + { + "epoch": 3.105039582931068, + "grad_norm": 0.013399518094956875, + "learning_rate": 3.775738385163191e-05, + "loss": 0.0035, + "step": 4018 + }, + { + "epoch": 3.105811932805561, + "grad_norm": 0.010458532720804214, + "learning_rate": 3.773123104268239e-05, + "loss": 0.0031, + "step": 4019 + }, + { + "epoch": 3.1065842826800543, + "grad_norm": 0.01588922180235386, + "learning_rate": 3.770508180487506e-05, + "loss": 0.0032, + "step": 4020 + }, + { + "epoch": 3.107356632554547, + "grad_norm": 0.008511990308761597, + "learning_rate": 3.7678936145821344e-05, + "loss": 0.0035, + "step": 4021 + }, + { + "epoch": 3.1081289824290406, + "grad_norm": 0.01115653570741415, + "learning_rate": 3.7652794073131595e-05, + "loss": 0.0031, + "step": 4022 + }, + { + "epoch": 3.1089013323035335, + "grad_norm": 0.015124349854886532, + "learning_rate": 3.762665559441513e-05, + "loss": 0.0031, + "step": 4023 + }, + { + "epoch": 3.109673682178027, + "grad_norm": 0.012387467548251152, + "learning_rate": 3.760052071728026e-05, + "loss": 0.0037, + "step": 4024 + }, + { + "epoch": 3.11044603205252, + "grad_norm": 0.010778908617794514, + "learning_rate": 3.757438944933419e-05, + "loss": 0.0031, + "step": 4025 + }, + { + "epoch": 3.1112183819270127, + "grad_norm": 0.022256221622228622, + "learning_rate": 3.75482617981831e-05, + "loss": 0.0032, + "step": 4026 + }, + { + "epoch": 3.111990731801506, + "grad_norm": 0.0103048300370574, + "learning_rate": 3.752213777143214e-05, + "loss": 0.0034, + "step": 4027 + }, + { + "epoch": 3.112763081675999, + "grad_norm": 0.009700235910713673, + "learning_rate": 3.749601737668538e-05, + "loss": 0.0029, + "step": 4028 + }, + { + "epoch": 3.1135354315504924, + "grad_norm": 0.015991652384400368, + "learning_rate": 3.7469900621545796e-05, + "loss": 0.0036, + "step": 4029 + }, + { + "epoch": 3.1143077814249853, + "grad_norm": 0.013409771025180817, + "learning_rate": 3.744378751361539e-05, + "loss": 0.0035, + "step": 4030 + }, + { + "epoch": 3.1150801312994787, + "grad_norm": 0.009320034645497799, + "learning_rate": 3.7417678060495045e-05, + "loss": 0.0031, + "step": 4031 + }, + { + "epoch": 3.1158524811739716, + "grad_norm": 0.009379249997437, + "learning_rate": 3.739157226978458e-05, + "loss": 0.0029, + "step": 4032 + }, + { + "epoch": 3.116624831048465, + "grad_norm": 0.00768303731456399, + "learning_rate": 3.736547014908276e-05, + "loss": 0.0032, + "step": 4033 + }, + { + "epoch": 3.117397180922958, + "grad_norm": 0.007516008801758289, + "learning_rate": 3.73393717059873e-05, + "loss": 0.0032, + "step": 4034 + }, + { + "epoch": 3.1181695307974513, + "grad_norm": 0.008907015435397625, + "learning_rate": 3.731327694809481e-05, + "loss": 0.0032, + "step": 4035 + }, + { + "epoch": 3.1189418806719442, + "grad_norm": 0.011298132129013538, + "learning_rate": 3.728718588300084e-05, + "loss": 0.003, + "step": 4036 + }, + { + "epoch": 3.1197142305464376, + "grad_norm": 0.008996719494462013, + "learning_rate": 3.72610985182999e-05, + "loss": 0.0032, + "step": 4037 + }, + { + "epoch": 3.1204865804209305, + "grad_norm": 0.009170308709144592, + "learning_rate": 3.7235014861585356e-05, + "loss": 0.0031, + "step": 4038 + }, + { + "epoch": 3.121258930295424, + "grad_norm": 0.012968270108103752, + "learning_rate": 3.720893492044953e-05, + "loss": 0.0031, + "step": 4039 + }, + { + "epoch": 3.122031280169917, + "grad_norm": 0.008478841744363308, + "learning_rate": 3.718285870248368e-05, + "loss": 0.0034, + "step": 4040 + }, + { + "epoch": 3.12280363004441, + "grad_norm": 0.00782487541437149, + "learning_rate": 3.715678621527798e-05, + "loss": 0.0027, + "step": 4041 + }, + { + "epoch": 3.123575979918903, + "grad_norm": 0.010733025148510933, + "learning_rate": 3.713071746642145e-05, + "loss": 0.0033, + "step": 4042 + }, + { + "epoch": 3.1243483297933965, + "grad_norm": 0.010959485545754433, + "learning_rate": 3.710465246350212e-05, + "loss": 0.0029, + "step": 4043 + }, + { + "epoch": 3.1251206796678894, + "grad_norm": 0.009859909303486347, + "learning_rate": 3.707859121410687e-05, + "loss": 0.0032, + "step": 4044 + }, + { + "epoch": 3.125893029542383, + "grad_norm": 0.01753888465464115, + "learning_rate": 3.7052533725821484e-05, + "loss": 0.0029, + "step": 4045 + }, + { + "epoch": 3.1266653794168757, + "grad_norm": 0.010642552748322487, + "learning_rate": 3.702648000623067e-05, + "loss": 0.0033, + "step": 4046 + }, + { + "epoch": 3.127437729291369, + "grad_norm": 0.012091566808521748, + "learning_rate": 3.700043006291807e-05, + "loss": 0.0029, + "step": 4047 + }, + { + "epoch": 3.128210079165862, + "grad_norm": 0.008897331543266773, + "learning_rate": 3.6974383903466134e-05, + "loss": 0.0034, + "step": 4048 + }, + { + "epoch": 3.1289824290403554, + "grad_norm": 0.009017202071845531, + "learning_rate": 3.694834153545632e-05, + "loss": 0.0034, + "step": 4049 + }, + { + "epoch": 3.1297547789148483, + "grad_norm": 0.011252693831920624, + "learning_rate": 3.692230296646891e-05, + "loss": 0.0037, + "step": 4050 + }, + { + "epoch": 3.1305271287893417, + "grad_norm": 0.010351852513849735, + "learning_rate": 3.689626820408312e-05, + "loss": 0.0033, + "step": 4051 + }, + { + "epoch": 3.1312994786638346, + "grad_norm": 0.010088905692100525, + "learning_rate": 3.6870237255877025e-05, + "loss": 0.0028, + "step": 4052 + }, + { + "epoch": 3.132071828538328, + "grad_norm": 0.008366243913769722, + "learning_rate": 3.684421012942759e-05, + "loss": 0.003, + "step": 4053 + }, + { + "epoch": 3.132844178412821, + "grad_norm": 0.01793886534869671, + "learning_rate": 3.6818186832310716e-05, + "loss": 0.0033, + "step": 4054 + }, + { + "epoch": 3.1336165282873143, + "grad_norm": 0.011342871934175491, + "learning_rate": 3.6792167372101146e-05, + "loss": 0.0031, + "step": 4055 + }, + { + "epoch": 3.1343888781618072, + "grad_norm": 0.008785545825958252, + "learning_rate": 3.676615175637249e-05, + "loss": 0.0029, + "step": 4056 + }, + { + "epoch": 3.1351612280363006, + "grad_norm": 0.010612391866743565, + "learning_rate": 3.6740139992697316e-05, + "loss": 0.003, + "step": 4057 + }, + { + "epoch": 3.1359335779107935, + "grad_norm": 0.016039269044995308, + "learning_rate": 3.671413208864696e-05, + "loss": 0.0035, + "step": 4058 + }, + { + "epoch": 3.136705927785287, + "grad_norm": 0.01777689903974533, + "learning_rate": 3.668812805179173e-05, + "loss": 0.003, + "step": 4059 + }, + { + "epoch": 3.13747827765978, + "grad_norm": 0.010719449259340763, + "learning_rate": 3.666212788970076e-05, + "loss": 0.0031, + "step": 4060 + }, + { + "epoch": 3.138250627534273, + "grad_norm": 0.015978340059518814, + "learning_rate": 3.6636131609942094e-05, + "loss": 0.0035, + "step": 4061 + }, + { + "epoch": 3.139022977408766, + "grad_norm": 0.010096929967403412, + "learning_rate": 3.6610139220082596e-05, + "loss": 0.0036, + "step": 4062 + }, + { + "epoch": 3.1397953272832595, + "grad_norm": 0.02071520686149597, + "learning_rate": 3.6584150727688015e-05, + "loss": 0.0037, + "step": 4063 + }, + { + "epoch": 3.1405676771577524, + "grad_norm": 0.009962460026144981, + "learning_rate": 3.655816614032301e-05, + "loss": 0.0031, + "step": 4064 + }, + { + "epoch": 3.141340027032246, + "grad_norm": 0.009225048124790192, + "learning_rate": 3.653218546555103e-05, + "loss": 0.0036, + "step": 4065 + }, + { + "epoch": 3.1421123769067387, + "grad_norm": 0.009466026909649372, + "learning_rate": 3.6506208710934426e-05, + "loss": 0.0032, + "step": 4066 + }, + { + "epoch": 3.142884726781232, + "grad_norm": 0.012480610981583595, + "learning_rate": 3.648023588403443e-05, + "loss": 0.0036, + "step": 4067 + }, + { + "epoch": 3.143657076655725, + "grad_norm": 0.018842605873942375, + "learning_rate": 3.6454266992411084e-05, + "loss": 0.0032, + "step": 4068 + }, + { + "epoch": 3.1444294265302184, + "grad_norm": 0.013484551571309566, + "learning_rate": 3.64283020436233e-05, + "loss": 0.003, + "step": 4069 + }, + { + "epoch": 3.1452017764047113, + "grad_norm": 0.008349167183041573, + "learning_rate": 3.6402341045228857e-05, + "loss": 0.0031, + "step": 4070 + }, + { + "epoch": 3.1459741262792047, + "grad_norm": 0.007275492884218693, + "learning_rate": 3.6376384004784383e-05, + "loss": 0.0031, + "step": 4071 + }, + { + "epoch": 3.1467464761536976, + "grad_norm": 0.012611397542059422, + "learning_rate": 3.635043092984531e-05, + "loss": 0.0031, + "step": 4072 + }, + { + "epoch": 3.1475188260281906, + "grad_norm": 0.011832364834845066, + "learning_rate": 3.6324481827966e-05, + "loss": 0.003, + "step": 4073 + }, + { + "epoch": 3.148291175902684, + "grad_norm": 0.017449690029025078, + "learning_rate": 3.6298536706699595e-05, + "loss": 0.003, + "step": 4074 + }, + { + "epoch": 3.1490635257771773, + "grad_norm": 0.014712398871779442, + "learning_rate": 3.627259557359805e-05, + "loss": 0.0029, + "step": 4075 + }, + { + "epoch": 3.1498358756516702, + "grad_norm": 0.018742509186267853, + "learning_rate": 3.6246658436212256e-05, + "loss": 0.003, + "step": 4076 + }, + { + "epoch": 3.150608225526163, + "grad_norm": 0.012672246433794498, + "learning_rate": 3.622072530209187e-05, + "loss": 0.0033, + "step": 4077 + }, + { + "epoch": 3.1513805754006565, + "grad_norm": 0.01303433533757925, + "learning_rate": 3.6194796178785376e-05, + "loss": 0.0035, + "step": 4078 + }, + { + "epoch": 3.1521529252751495, + "grad_norm": 0.014445452019572258, + "learning_rate": 3.616887107384015e-05, + "loss": 0.0029, + "step": 4079 + }, + { + "epoch": 3.152925275149643, + "grad_norm": 0.01622670516371727, + "learning_rate": 3.614294999480234e-05, + "loss": 0.0035, + "step": 4080 + }, + { + "epoch": 3.1536976250241358, + "grad_norm": 0.01022647600620985, + "learning_rate": 3.611703294921698e-05, + "loss": 0.0033, + "step": 4081 + }, + { + "epoch": 3.154469974898629, + "grad_norm": 0.008269752375781536, + "learning_rate": 3.609111994462785e-05, + "loss": 0.0034, + "step": 4082 + }, + { + "epoch": 3.155242324773122, + "grad_norm": 0.014312672428786755, + "learning_rate": 3.606521098857762e-05, + "loss": 0.0032, + "step": 4083 + }, + { + "epoch": 3.1560146746476154, + "grad_norm": 0.01480252668261528, + "learning_rate": 3.603930608860778e-05, + "loss": 0.0032, + "step": 4084 + }, + { + "epoch": 3.1567870245221084, + "grad_norm": 0.014581169001758099, + "learning_rate": 3.6013405252258585e-05, + "loss": 0.0035, + "step": 4085 + }, + { + "epoch": 3.1575593743966017, + "grad_norm": 0.008861055597662926, + "learning_rate": 3.598750848706917e-05, + "loss": 0.0029, + "step": 4086 + }, + { + "epoch": 3.1583317242710947, + "grad_norm": 0.009202736429870129, + "learning_rate": 3.596161580057745e-05, + "loss": 0.0032, + "step": 4087 + }, + { + "epoch": 3.159104074145588, + "grad_norm": 0.013064440339803696, + "learning_rate": 3.5935727200320144e-05, + "loss": 0.0039, + "step": 4088 + }, + { + "epoch": 3.159876424020081, + "grad_norm": 0.011419693939387798, + "learning_rate": 3.5909842693832817e-05, + "loss": 0.0039, + "step": 4089 + }, + { + "epoch": 3.1606487738945743, + "grad_norm": 0.015743782743811607, + "learning_rate": 3.588396228864981e-05, + "loss": 0.0035, + "step": 4090 + }, + { + "epoch": 3.1614211237690673, + "grad_norm": 0.009340745396912098, + "learning_rate": 3.5858085992304316e-05, + "loss": 0.0034, + "step": 4091 + }, + { + "epoch": 3.1621934736435606, + "grad_norm": 0.012455140240490437, + "learning_rate": 3.583221381232826e-05, + "loss": 0.0031, + "step": 4092 + }, + { + "epoch": 3.1629658235180536, + "grad_norm": 0.008808081969618797, + "learning_rate": 3.5806345756252414e-05, + "loss": 0.0032, + "step": 4093 + }, + { + "epoch": 3.163738173392547, + "grad_norm": 0.016623718664050102, + "learning_rate": 3.578048183160638e-05, + "loss": 0.0036, + "step": 4094 + }, + { + "epoch": 3.16451052326704, + "grad_norm": 0.009980283677577972, + "learning_rate": 3.575462204591848e-05, + "loss": 0.0031, + "step": 4095 + }, + { + "epoch": 3.1652828731415332, + "grad_norm": 0.00870469119399786, + "learning_rate": 3.5728766406715876e-05, + "loss": 0.0033, + "step": 4096 + }, + { + "epoch": 3.166055223016026, + "grad_norm": 0.009748588316142559, + "learning_rate": 3.570291492152455e-05, + "loss": 0.0037, + "step": 4097 + }, + { + "epoch": 3.1668275728905195, + "grad_norm": 0.009119248017668724, + "learning_rate": 3.567706759786923e-05, + "loss": 0.0033, + "step": 4098 + }, + { + "epoch": 3.1675999227650125, + "grad_norm": 0.009126069024205208, + "learning_rate": 3.565122444327342e-05, + "loss": 0.0036, + "step": 4099 + }, + { + "epoch": 3.168372272639506, + "grad_norm": 0.01185399480164051, + "learning_rate": 3.562538546525949e-05, + "loss": 0.0033, + "step": 4100 + }, + { + "epoch": 3.1691446225139988, + "grad_norm": 0.007178458850830793, + "learning_rate": 3.55995506713485e-05, + "loss": 0.0035, + "step": 4101 + }, + { + "epoch": 3.169916972388492, + "grad_norm": 0.009440583176910877, + "learning_rate": 3.557372006906035e-05, + "loss": 0.0033, + "step": 4102 + }, + { + "epoch": 3.170689322262985, + "grad_norm": 0.010119931772351265, + "learning_rate": 3.55478936659137e-05, + "loss": 0.0034, + "step": 4103 + }, + { + "epoch": 3.1714616721374784, + "grad_norm": 0.00834848452359438, + "learning_rate": 3.552207146942601e-05, + "loss": 0.0029, + "step": 4104 + }, + { + "epoch": 3.1722340220119714, + "grad_norm": 0.012394800782203674, + "learning_rate": 3.5496253487113455e-05, + "loss": 0.0034, + "step": 4105 + }, + { + "epoch": 3.1730063718864647, + "grad_norm": 0.011674412526190281, + "learning_rate": 3.547043972649107e-05, + "loss": 0.0037, + "step": 4106 + }, + { + "epoch": 3.1737787217609577, + "grad_norm": 0.011983788572251797, + "learning_rate": 3.544463019507261e-05, + "loss": 0.0036, + "step": 4107 + }, + { + "epoch": 3.174551071635451, + "grad_norm": 0.017204612493515015, + "learning_rate": 3.541882490037057e-05, + "loss": 0.0033, + "step": 4108 + }, + { + "epoch": 3.175323421509944, + "grad_norm": 0.009163384325802326, + "learning_rate": 3.539302384989629e-05, + "loss": 0.0033, + "step": 4109 + }, + { + "epoch": 3.1760957713844373, + "grad_norm": 0.007902873679995537, + "learning_rate": 3.53672270511598e-05, + "loss": 0.0031, + "step": 4110 + }, + { + "epoch": 3.1768681212589303, + "grad_norm": 0.008102684281766415, + "learning_rate": 3.534143451166997e-05, + "loss": 0.003, + "step": 4111 + }, + { + "epoch": 3.1776404711334236, + "grad_norm": 0.010337292216718197, + "learning_rate": 3.531564623893433e-05, + "loss": 0.0037, + "step": 4112 + }, + { + "epoch": 3.1784128210079166, + "grad_norm": 0.008929918520152569, + "learning_rate": 3.5289862240459255e-05, + "loss": 0.0034, + "step": 4113 + }, + { + "epoch": 3.17918517088241, + "grad_norm": 0.009258911944925785, + "learning_rate": 3.526408252374985e-05, + "loss": 0.0033, + "step": 4114 + }, + { + "epoch": 3.179957520756903, + "grad_norm": 0.01704230345785618, + "learning_rate": 3.523830709630993e-05, + "loss": 0.003, + "step": 4115 + }, + { + "epoch": 3.1807298706313962, + "grad_norm": 0.00898750964552164, + "learning_rate": 3.521253596564214e-05, + "loss": 0.0033, + "step": 4116 + }, + { + "epoch": 3.181502220505889, + "grad_norm": 0.007785670459270477, + "learning_rate": 3.518676913924783e-05, + "loss": 0.0034, + "step": 4117 + }, + { + "epoch": 3.1822745703803825, + "grad_norm": 0.013011530973017216, + "learning_rate": 3.516100662462706e-05, + "loss": 0.003, + "step": 4118 + }, + { + "epoch": 3.1830469202548755, + "grad_norm": 0.009192707017064095, + "learning_rate": 3.5135248429278724e-05, + "loss": 0.0033, + "step": 4119 + }, + { + "epoch": 3.1838192701293684, + "grad_norm": 0.012670222669839859, + "learning_rate": 3.510949456070037e-05, + "loss": 0.0031, + "step": 4120 + }, + { + "epoch": 3.1845916200038618, + "grad_norm": 0.009181042201817036, + "learning_rate": 3.508374502638837e-05, + "loss": 0.0032, + "step": 4121 + }, + { + "epoch": 3.185363969878355, + "grad_norm": 0.01356884092092514, + "learning_rate": 3.505799983383776e-05, + "loss": 0.003, + "step": 4122 + }, + { + "epoch": 3.186136319752848, + "grad_norm": 0.01130085252225399, + "learning_rate": 3.5032258990542336e-05, + "loss": 0.0034, + "step": 4123 + }, + { + "epoch": 3.186908669627341, + "grad_norm": 0.010613438673317432, + "learning_rate": 3.500652250399468e-05, + "loss": 0.0028, + "step": 4124 + }, + { + "epoch": 3.1876810195018344, + "grad_norm": 0.01142602413892746, + "learning_rate": 3.498079038168601e-05, + "loss": 0.0029, + "step": 4125 + }, + { + "epoch": 3.1884533693763273, + "grad_norm": 0.008843295276165009, + "learning_rate": 3.495506263110635e-05, + "loss": 0.003, + "step": 4126 + }, + { + "epoch": 3.1892257192508207, + "grad_norm": 0.01249662134796381, + "learning_rate": 3.492933925974444e-05, + "loss": 0.0029, + "step": 4127 + }, + { + "epoch": 3.1899980691253136, + "grad_norm": 0.007697202730923891, + "learning_rate": 3.490362027508771e-05, + "loss": 0.0028, + "step": 4128 + }, + { + "epoch": 3.190770418999807, + "grad_norm": 0.01017635315656662, + "learning_rate": 3.487790568462232e-05, + "loss": 0.0028, + "step": 4129 + }, + { + "epoch": 3.1915427688743, + "grad_norm": 0.011190581135451794, + "learning_rate": 3.4852195495833204e-05, + "loss": 0.0036, + "step": 4130 + }, + { + "epoch": 3.1923151187487933, + "grad_norm": 0.012526176869869232, + "learning_rate": 3.482648971620397e-05, + "loss": 0.0031, + "step": 4131 + }, + { + "epoch": 3.193087468623286, + "grad_norm": 0.008728506974875927, + "learning_rate": 3.4800788353216914e-05, + "loss": 0.0033, + "step": 4132 + }, + { + "epoch": 3.1938598184977796, + "grad_norm": 0.01112066674977541, + "learning_rate": 3.477509141435312e-05, + "loss": 0.0032, + "step": 4133 + }, + { + "epoch": 3.1946321683722725, + "grad_norm": 0.010328928008675575, + "learning_rate": 3.4749398907092344e-05, + "loss": 0.0033, + "step": 4134 + }, + { + "epoch": 3.195404518246766, + "grad_norm": 0.009153681807219982, + "learning_rate": 3.472371083891301e-05, + "loss": 0.0032, + "step": 4135 + }, + { + "epoch": 3.196176868121259, + "grad_norm": 0.009757447056472301, + "learning_rate": 3.4698027217292356e-05, + "loss": 0.0031, + "step": 4136 + }, + { + "epoch": 3.196949217995752, + "grad_norm": 0.011577093042433262, + "learning_rate": 3.467234804970624e-05, + "loss": 0.004, + "step": 4137 + }, + { + "epoch": 3.197721567870245, + "grad_norm": 0.011520899832248688, + "learning_rate": 3.464667334362922e-05, + "loss": 0.0036, + "step": 4138 + }, + { + "epoch": 3.1984939177447385, + "grad_norm": 0.007746752351522446, + "learning_rate": 3.46210031065346e-05, + "loss": 0.0034, + "step": 4139 + }, + { + "epoch": 3.1992662676192314, + "grad_norm": 0.01598169095814228, + "learning_rate": 3.459533734589438e-05, + "loss": 0.0038, + "step": 4140 + }, + { + "epoch": 3.2000386174937248, + "grad_norm": 0.010562562383711338, + "learning_rate": 3.4569676069179244e-05, + "loss": 0.0035, + "step": 4141 + }, + { + "epoch": 3.2008109673682177, + "grad_norm": 0.007787355221807957, + "learning_rate": 3.4544019283858534e-05, + "loss": 0.0029, + "step": 4142 + }, + { + "epoch": 3.201583317242711, + "grad_norm": 0.01091032288968563, + "learning_rate": 3.4518366997400355e-05, + "loss": 0.0037, + "step": 4143 + }, + { + "epoch": 3.202355667117204, + "grad_norm": 0.006965094245970249, + "learning_rate": 3.4492719217271474e-05, + "loss": 0.0032, + "step": 4144 + }, + { + "epoch": 3.2031280169916974, + "grad_norm": 0.015202803537249565, + "learning_rate": 3.446707595093729e-05, + "loss": 0.0029, + "step": 4145 + }, + { + "epoch": 3.2039003668661903, + "grad_norm": 0.012899445369839668, + "learning_rate": 3.444143720586199e-05, + "loss": 0.0033, + "step": 4146 + }, + { + "epoch": 3.2046727167406837, + "grad_norm": 0.008736785501241684, + "learning_rate": 3.4415802989508384e-05, + "loss": 0.0029, + "step": 4147 + }, + { + "epoch": 3.2054450666151766, + "grad_norm": 0.008177345618605614, + "learning_rate": 3.4390173309337944e-05, + "loss": 0.0028, + "step": 4148 + }, + { + "epoch": 3.20621741648967, + "grad_norm": 0.015837090089917183, + "learning_rate": 3.436454817281088e-05, + "loss": 0.0032, + "step": 4149 + }, + { + "epoch": 3.206989766364163, + "grad_norm": 0.009351897053420544, + "learning_rate": 3.433892758738603e-05, + "loss": 0.0033, + "step": 4150 + }, + { + "epoch": 3.2077621162386563, + "grad_norm": 0.010843559168279171, + "learning_rate": 3.4313311560520955e-05, + "loss": 0.0027, + "step": 4151 + }, + { + "epoch": 3.208534466113149, + "grad_norm": 0.017377877607941628, + "learning_rate": 3.428770009967183e-05, + "loss": 0.0031, + "step": 4152 + }, + { + "epoch": 3.2093068159876426, + "grad_norm": 0.008192894980311394, + "learning_rate": 3.426209321229355e-05, + "loss": 0.0033, + "step": 4153 + }, + { + "epoch": 3.2100791658621355, + "grad_norm": 0.014300121925771236, + "learning_rate": 3.4236490905839656e-05, + "loss": 0.0029, + "step": 4154 + }, + { + "epoch": 3.210851515736629, + "grad_norm": 0.008843852207064629, + "learning_rate": 3.421089318776237e-05, + "loss": 0.0035, + "step": 4155 + }, + { + "epoch": 3.211623865611122, + "grad_norm": 0.010597603395581245, + "learning_rate": 3.418530006551255e-05, + "loss": 0.003, + "step": 4156 + }, + { + "epoch": 3.212396215485615, + "grad_norm": 0.016319219022989273, + "learning_rate": 3.415971154653976e-05, + "loss": 0.0034, + "step": 4157 + }, + { + "epoch": 3.213168565360108, + "grad_norm": 0.010809916071593761, + "learning_rate": 3.413412763829218e-05, + "loss": 0.0031, + "step": 4158 + }, + { + "epoch": 3.2139409152346015, + "grad_norm": 0.010344364680349827, + "learning_rate": 3.410854834821666e-05, + "loss": 0.003, + "step": 4159 + }, + { + "epoch": 3.2147132651090944, + "grad_norm": 0.013101106509566307, + "learning_rate": 3.408297368375874e-05, + "loss": 0.0035, + "step": 4160 + }, + { + "epoch": 3.2154856149835878, + "grad_norm": 0.015558012761175632, + "learning_rate": 3.405740365236258e-05, + "loss": 0.0032, + "step": 4161 + }, + { + "epoch": 3.2162579648580807, + "grad_norm": 0.011329291388392448, + "learning_rate": 3.4031838261470986e-05, + "loss": 0.0031, + "step": 4162 + }, + { + "epoch": 3.217030314732574, + "grad_norm": 0.008331413380801678, + "learning_rate": 3.400627751852543e-05, + "loss": 0.0027, + "step": 4163 + }, + { + "epoch": 3.217802664607067, + "grad_norm": 0.009807384572923183, + "learning_rate": 3.398072143096604e-05, + "loss": 0.0028, + "step": 4164 + }, + { + "epoch": 3.2185750144815604, + "grad_norm": 0.008590004406869411, + "learning_rate": 3.3955170006231555e-05, + "loss": 0.0032, + "step": 4165 + }, + { + "epoch": 3.2193473643560533, + "grad_norm": 0.009014099836349487, + "learning_rate": 3.392962325175938e-05, + "loss": 0.0032, + "step": 4166 + }, + { + "epoch": 3.220119714230546, + "grad_norm": 0.01811886765062809, + "learning_rate": 3.390408117498558e-05, + "loss": 0.0034, + "step": 4167 + }, + { + "epoch": 3.2208920641050396, + "grad_norm": 0.010788844898343086, + "learning_rate": 3.3878543783344806e-05, + "loss": 0.003, + "step": 4168 + }, + { + "epoch": 3.221664413979533, + "grad_norm": 0.013252652250230312, + "learning_rate": 3.385301108427039e-05, + "loss": 0.0031, + "step": 4169 + }, + { + "epoch": 3.222436763854026, + "grad_norm": 0.014306911267340183, + "learning_rate": 3.382748308519429e-05, + "loss": 0.0031, + "step": 4170 + }, + { + "epoch": 3.223209113728519, + "grad_norm": 0.01070572528988123, + "learning_rate": 3.38019597935471e-05, + "loss": 0.0029, + "step": 4171 + }, + { + "epoch": 3.223981463603012, + "grad_norm": 0.008928976953029633, + "learning_rate": 3.3776441216757984e-05, + "loss": 0.0034, + "step": 4172 + }, + { + "epoch": 3.224753813477505, + "grad_norm": 0.01367044635117054, + "learning_rate": 3.375092736225484e-05, + "loss": 0.0034, + "step": 4173 + }, + { + "epoch": 3.2255261633519985, + "grad_norm": 0.007461641449481249, + "learning_rate": 3.372541823746411e-05, + "loss": 0.0031, + "step": 4174 + }, + { + "epoch": 3.2262985132264914, + "grad_norm": 0.008332383818924427, + "learning_rate": 3.369991384981087e-05, + "loss": 0.0033, + "step": 4175 + }, + { + "epoch": 3.227070863100985, + "grad_norm": 0.02094440907239914, + "learning_rate": 3.3674414206718854e-05, + "loss": 0.0033, + "step": 4176 + }, + { + "epoch": 3.2278432129754777, + "grad_norm": 0.013396904803812504, + "learning_rate": 3.36489193156104e-05, + "loss": 0.0035, + "step": 4177 + }, + { + "epoch": 3.228615562849971, + "grad_norm": 0.00786169059574604, + "learning_rate": 3.362342918390641e-05, + "loss": 0.0029, + "step": 4178 + }, + { + "epoch": 3.229387912724464, + "grad_norm": 0.011862647719681263, + "learning_rate": 3.359794381902649e-05, + "loss": 0.0029, + "step": 4179 + }, + { + "epoch": 3.2301602625989574, + "grad_norm": 0.010191472247242928, + "learning_rate": 3.357246322838878e-05, + "loss": 0.0031, + "step": 4180 + }, + { + "epoch": 3.2309326124734503, + "grad_norm": 0.010995290242135525, + "learning_rate": 3.354698741941011e-05, + "loss": 0.0029, + "step": 4181 + }, + { + "epoch": 3.2317049623479437, + "grad_norm": 0.00857754610478878, + "learning_rate": 3.352151639950584e-05, + "loss": 0.0032, + "step": 4182 + }, + { + "epoch": 3.2324773122224366, + "grad_norm": 0.011650115251541138, + "learning_rate": 3.3496050176089946e-05, + "loss": 0.0031, + "step": 4183 + }, + { + "epoch": 3.23324966209693, + "grad_norm": 0.009595906361937523, + "learning_rate": 3.347058875657509e-05, + "loss": 0.003, + "step": 4184 + }, + { + "epoch": 3.234022011971423, + "grad_norm": 0.014852471649646759, + "learning_rate": 3.344513214837243e-05, + "loss": 0.0033, + "step": 4185 + }, + { + "epoch": 3.2347943618459163, + "grad_norm": 0.009725824929773808, + "learning_rate": 3.341968035889177e-05, + "loss": 0.0035, + "step": 4186 + }, + { + "epoch": 3.235566711720409, + "grad_norm": 0.01160483993589878, + "learning_rate": 3.339423339554155e-05, + "loss": 0.0032, + "step": 4187 + }, + { + "epoch": 3.2363390615949026, + "grad_norm": 0.010148034431040287, + "learning_rate": 3.336879126572872e-05, + "loss": 0.0028, + "step": 4188 + }, + { + "epoch": 3.2371114114693955, + "grad_norm": 0.012596935965120792, + "learning_rate": 3.334335397685889e-05, + "loss": 0.0039, + "step": 4189 + }, + { + "epoch": 3.237883761343889, + "grad_norm": 0.011795789934694767, + "learning_rate": 3.331792153633624e-05, + "loss": 0.0033, + "step": 4190 + }, + { + "epoch": 3.238656111218382, + "grad_norm": 0.009832086972892284, + "learning_rate": 3.329249395156355e-05, + "loss": 0.0034, + "step": 4191 + }, + { + "epoch": 3.239428461092875, + "grad_norm": 0.010105843655765057, + "learning_rate": 3.3267071229942157e-05, + "loss": 0.003, + "step": 4192 + }, + { + "epoch": 3.240200810967368, + "grad_norm": 0.010233977809548378, + "learning_rate": 3.3241653378872e-05, + "loss": 0.0034, + "step": 4193 + }, + { + "epoch": 3.2409731608418615, + "grad_norm": 0.011225282214581966, + "learning_rate": 3.321624040575162e-05, + "loss": 0.0034, + "step": 4194 + }, + { + "epoch": 3.2417455107163544, + "grad_norm": 0.008810597471892834, + "learning_rate": 3.31908323179781e-05, + "loss": 0.0031, + "step": 4195 + }, + { + "epoch": 3.242517860590848, + "grad_norm": 0.008012303151190281, + "learning_rate": 3.316542912294712e-05, + "loss": 0.003, + "step": 4196 + }, + { + "epoch": 3.2432902104653407, + "grad_norm": 0.0090979328379035, + "learning_rate": 3.314003082805297e-05, + "loss": 0.0036, + "step": 4197 + }, + { + "epoch": 3.244062560339834, + "grad_norm": 0.010045964270830154, + "learning_rate": 3.3114637440688445e-05, + "loss": 0.0031, + "step": 4198 + }, + { + "epoch": 3.244834910214327, + "grad_norm": 0.012850887142121792, + "learning_rate": 3.308924896824494e-05, + "loss": 0.0037, + "step": 4199 + }, + { + "epoch": 3.2456072600888204, + "grad_norm": 0.01774417981505394, + "learning_rate": 3.3063865418112456e-05, + "loss": 0.0032, + "step": 4200 + }, + { + "epoch": 3.2463796099633133, + "grad_norm": 0.009460978209972382, + "learning_rate": 3.303848679767952e-05, + "loss": 0.003, + "step": 4201 + }, + { + "epoch": 3.2471519598378067, + "grad_norm": 0.01683228649199009, + "learning_rate": 3.301311311433322e-05, + "loss": 0.0037, + "step": 4202 + }, + { + "epoch": 3.2479243097122996, + "grad_norm": 0.010335755534470081, + "learning_rate": 3.298774437545924e-05, + "loss": 0.0032, + "step": 4203 + }, + { + "epoch": 3.248696659586793, + "grad_norm": 0.018849408254027367, + "learning_rate": 3.296238058844182e-05, + "loss": 0.0033, + "step": 4204 + }, + { + "epoch": 3.249469009461286, + "grad_norm": 0.008381340652704239, + "learning_rate": 3.29370217606637e-05, + "loss": 0.0031, + "step": 4205 + }, + { + "epoch": 3.2502413593357793, + "grad_norm": 0.0117068225517869, + "learning_rate": 3.291166789950626e-05, + "loss": 0.0032, + "step": 4206 + }, + { + "epoch": 3.251013709210272, + "grad_norm": 0.01788029633462429, + "learning_rate": 3.28863190123494e-05, + "loss": 0.0031, + "step": 4207 + }, + { + "epoch": 3.2517860590847656, + "grad_norm": 0.017351066693663597, + "learning_rate": 3.2860975106571525e-05, + "loss": 0.0032, + "step": 4208 + }, + { + "epoch": 3.2525584089592585, + "grad_norm": 0.01061096228659153, + "learning_rate": 3.2835636189549676e-05, + "loss": 0.0029, + "step": 4209 + }, + { + "epoch": 3.2533307588337514, + "grad_norm": 0.02109973505139351, + "learning_rate": 3.2810302268659375e-05, + "loss": 0.0029, + "step": 4210 + }, + { + "epoch": 3.254103108708245, + "grad_norm": 0.014287742786109447, + "learning_rate": 3.278497335127475e-05, + "loss": 0.0032, + "step": 4211 + }, + { + "epoch": 3.254875458582738, + "grad_norm": 0.014870606362819672, + "learning_rate": 3.2759649444768406e-05, + "loss": 0.0034, + "step": 4212 + }, + { + "epoch": 3.255647808457231, + "grad_norm": 0.007436896208673716, + "learning_rate": 3.273433055651151e-05, + "loss": 0.0031, + "step": 4213 + }, + { + "epoch": 3.256420158331724, + "grad_norm": 0.01197846606373787, + "learning_rate": 3.2709016693873803e-05, + "loss": 0.0036, + "step": 4214 + }, + { + "epoch": 3.2571925082062174, + "grad_norm": 0.012271657586097717, + "learning_rate": 3.2683707864223534e-05, + "loss": 0.0039, + "step": 4215 + }, + { + "epoch": 3.257964858080711, + "grad_norm": 0.020635711029171944, + "learning_rate": 3.265840407492748e-05, + "loss": 0.0031, + "step": 4216 + }, + { + "epoch": 3.2587372079552037, + "grad_norm": 0.007999719120562077, + "learning_rate": 3.2633105333351e-05, + "loss": 0.0032, + "step": 4217 + }, + { + "epoch": 3.2595095578296966, + "grad_norm": 0.010632477700710297, + "learning_rate": 3.260781164685788e-05, + "loss": 0.0033, + "step": 4218 + }, + { + "epoch": 3.26028190770419, + "grad_norm": 0.012916402891278267, + "learning_rate": 3.2582523022810555e-05, + "loss": 0.003, + "step": 4219 + }, + { + "epoch": 3.2610542575786834, + "grad_norm": 0.010575252585113049, + "learning_rate": 3.255723946856992e-05, + "loss": 0.003, + "step": 4220 + }, + { + "epoch": 3.2618266074531763, + "grad_norm": 0.010021179914474487, + "learning_rate": 3.253196099149539e-05, + "loss": 0.0032, + "step": 4221 + }, + { + "epoch": 3.2625989573276692, + "grad_norm": 0.006727861240506172, + "learning_rate": 3.2506687598944935e-05, + "loss": 0.003, + "step": 4222 + }, + { + "epoch": 3.2633713072021626, + "grad_norm": 0.0075446320697665215, + "learning_rate": 3.248141929827502e-05, + "loss": 0.0031, + "step": 4223 + }, + { + "epoch": 3.2641436570766555, + "grad_norm": 0.010262253694236279, + "learning_rate": 3.2456156096840656e-05, + "loss": 0.0031, + "step": 4224 + }, + { + "epoch": 3.264916006951149, + "grad_norm": 0.009761231951415539, + "learning_rate": 3.2430898001995335e-05, + "loss": 0.003, + "step": 4225 + }, + { + "epoch": 3.265688356825642, + "grad_norm": 0.008487430401146412, + "learning_rate": 3.2405645021091056e-05, + "loss": 0.0031, + "step": 4226 + }, + { + "epoch": 3.266460706700135, + "grad_norm": 0.011206555180251598, + "learning_rate": 3.238039716147841e-05, + "loss": 0.0033, + "step": 4227 + }, + { + "epoch": 3.267233056574628, + "grad_norm": 0.008502183482050896, + "learning_rate": 3.2355154430506385e-05, + "loss": 0.003, + "step": 4228 + }, + { + "epoch": 3.2680054064491215, + "grad_norm": 0.008844464085996151, + "learning_rate": 3.232991683552254e-05, + "loss": 0.0034, + "step": 4229 + }, + { + "epoch": 3.2687777563236144, + "grad_norm": 0.0106724938377738, + "learning_rate": 3.2304684383872966e-05, + "loss": 0.0036, + "step": 4230 + }, + { + "epoch": 3.269550106198108, + "grad_norm": 0.00910135731101036, + "learning_rate": 3.2279457082902175e-05, + "loss": 0.0035, + "step": 4231 + }, + { + "epoch": 3.2703224560726007, + "grad_norm": 0.01070884894579649, + "learning_rate": 3.2254234939953234e-05, + "loss": 0.003, + "step": 4232 + }, + { + "epoch": 3.271094805947094, + "grad_norm": 0.013538029044866562, + "learning_rate": 3.222901796236771e-05, + "loss": 0.0029, + "step": 4233 + }, + { + "epoch": 3.271867155821587, + "grad_norm": 0.009791580028831959, + "learning_rate": 3.220380615748567e-05, + "loss": 0.0032, + "step": 4234 + }, + { + "epoch": 3.2726395056960804, + "grad_norm": 0.012030337005853653, + "learning_rate": 3.2178599532645625e-05, + "loss": 0.0028, + "step": 4235 + }, + { + "epoch": 3.2734118555705733, + "grad_norm": 0.010877392254769802, + "learning_rate": 3.215339809518464e-05, + "loss": 0.0037, + "step": 4236 + }, + { + "epoch": 3.2741842054450667, + "grad_norm": 0.00963950902223587, + "learning_rate": 3.2128201852438255e-05, + "loss": 0.0029, + "step": 4237 + }, + { + "epoch": 3.2749565553195596, + "grad_norm": 0.009462360292673111, + "learning_rate": 3.210301081174044e-05, + "loss": 0.0029, + "step": 4238 + }, + { + "epoch": 3.275728905194053, + "grad_norm": 0.0111409155651927, + "learning_rate": 3.207782498042374e-05, + "loss": 0.0029, + "step": 4239 + }, + { + "epoch": 3.276501255068546, + "grad_norm": 0.009481188841164112, + "learning_rate": 3.205264436581915e-05, + "loss": 0.0037, + "step": 4240 + }, + { + "epoch": 3.2772736049430393, + "grad_norm": 0.008347220718860626, + "learning_rate": 3.2027468975256094e-05, + "loss": 0.0028, + "step": 4241 + }, + { + "epoch": 3.2780459548175322, + "grad_norm": 0.01125909760594368, + "learning_rate": 3.200229881606256e-05, + "loss": 0.0032, + "step": 4242 + }, + { + "epoch": 3.2788183046920256, + "grad_norm": 0.011121939867734909, + "learning_rate": 3.197713389556494e-05, + "loss": 0.0029, + "step": 4243 + }, + { + "epoch": 3.2795906545665185, + "grad_norm": 0.007758612744510174, + "learning_rate": 3.195197422108819e-05, + "loss": 0.0029, + "step": 4244 + }, + { + "epoch": 3.280363004441012, + "grad_norm": 0.009592241607606411, + "learning_rate": 3.192681979995561e-05, + "loss": 0.0027, + "step": 4245 + }, + { + "epoch": 3.281135354315505, + "grad_norm": 0.010534889996051788, + "learning_rate": 3.190167063948909e-05, + "loss": 0.0036, + "step": 4246 + }, + { + "epoch": 3.281907704189998, + "grad_norm": 0.009834305383265018, + "learning_rate": 3.187652674700895e-05, + "loss": 0.003, + "step": 4247 + }, + { + "epoch": 3.282680054064491, + "grad_norm": 0.008079975843429565, + "learning_rate": 3.185138812983393e-05, + "loss": 0.0029, + "step": 4248 + }, + { + "epoch": 3.2834524039389845, + "grad_norm": 0.008383155800402164, + "learning_rate": 3.18262547952813e-05, + "loss": 0.003, + "step": 4249 + }, + { + "epoch": 3.2842247538134774, + "grad_norm": 0.008969396352767944, + "learning_rate": 3.1801126750666775e-05, + "loss": 0.0031, + "step": 4250 + }, + { + "epoch": 3.284997103687971, + "grad_norm": 0.008961372077465057, + "learning_rate": 3.177600400330449e-05, + "loss": 0.0028, + "step": 4251 + }, + { + "epoch": 3.2857694535624637, + "grad_norm": 0.009899982251226902, + "learning_rate": 3.17508865605071e-05, + "loss": 0.0035, + "step": 4252 + }, + { + "epoch": 3.286541803436957, + "grad_norm": 0.01790609210729599, + "learning_rate": 3.1725774429585664e-05, + "loss": 0.0029, + "step": 4253 + }, + { + "epoch": 3.28731415331145, + "grad_norm": 0.008014030754566193, + "learning_rate": 3.1700667617849755e-05, + "loss": 0.0034, + "step": 4254 + }, + { + "epoch": 3.2880865031859434, + "grad_norm": 0.00846839789301157, + "learning_rate": 3.167556613260732e-05, + "loss": 0.003, + "step": 4255 + }, + { + "epoch": 3.2888588530604363, + "grad_norm": 0.00876067578792572, + "learning_rate": 3.165046998116479e-05, + "loss": 0.0029, + "step": 4256 + }, + { + "epoch": 3.2896312029349293, + "grad_norm": 0.010461987927556038, + "learning_rate": 3.1625379170827084e-05, + "loss": 0.0035, + "step": 4257 + }, + { + "epoch": 3.2904035528094226, + "grad_norm": 0.008245227858424187, + "learning_rate": 3.1600293708897504e-05, + "loss": 0.0034, + "step": 4258 + }, + { + "epoch": 3.291175902683916, + "grad_norm": 0.01120483223348856, + "learning_rate": 3.1575213602677824e-05, + "loss": 0.0033, + "step": 4259 + }, + { + "epoch": 3.291948252558409, + "grad_norm": 0.00878420751541853, + "learning_rate": 3.155013885946827e-05, + "loss": 0.0034, + "step": 4260 + }, + { + "epoch": 3.292720602432902, + "grad_norm": 0.012798376381397247, + "learning_rate": 3.152506948656749e-05, + "loss": 0.0031, + "step": 4261 + }, + { + "epoch": 3.2934929523073952, + "grad_norm": 0.01274291705340147, + "learning_rate": 3.150000549127255e-05, + "loss": 0.0031, + "step": 4262 + }, + { + "epoch": 3.2942653021818886, + "grad_norm": 0.01563984714448452, + "learning_rate": 3.1474946880879e-05, + "loss": 0.0036, + "step": 4263 + }, + { + "epoch": 3.2950376520563815, + "grad_norm": 0.014731112867593765, + "learning_rate": 3.1449893662680806e-05, + "loss": 0.0031, + "step": 4264 + }, + { + "epoch": 3.2958100019308745, + "grad_norm": 0.01007825881242752, + "learning_rate": 3.1424845843970314e-05, + "loss": 0.0034, + "step": 4265 + }, + { + "epoch": 3.296582351805368, + "grad_norm": 0.0087623605504632, + "learning_rate": 3.139980343203837e-05, + "loss": 0.0025, + "step": 4266 + }, + { + "epoch": 3.297354701679861, + "grad_norm": 0.017609968781471252, + "learning_rate": 3.137476643417422e-05, + "loss": 0.0032, + "step": 4267 + }, + { + "epoch": 3.298127051554354, + "grad_norm": 0.007911840453743935, + "learning_rate": 3.134973485766549e-05, + "loss": 0.0029, + "step": 4268 + }, + { + "epoch": 3.298899401428847, + "grad_norm": 0.012556039728224277, + "learning_rate": 3.13247087097983e-05, + "loss": 0.003, + "step": 4269 + }, + { + "epoch": 3.2996717513033405, + "grad_norm": 0.007746555842459202, + "learning_rate": 3.129968799785717e-05, + "loss": 0.0026, + "step": 4270 + }, + { + "epoch": 3.3004441011778334, + "grad_norm": 0.009839855134487152, + "learning_rate": 3.127467272912499e-05, + "loss": 0.0031, + "step": 4271 + }, + { + "epoch": 3.3012164510523268, + "grad_norm": 0.015597268007695675, + "learning_rate": 3.12496629108831e-05, + "loss": 0.0036, + "step": 4272 + }, + { + "epoch": 3.3019888009268197, + "grad_norm": 0.010683227330446243, + "learning_rate": 3.122465855041128e-05, + "loss": 0.003, + "step": 4273 + }, + { + "epoch": 3.302761150801313, + "grad_norm": 0.019207023084163666, + "learning_rate": 3.119965965498769e-05, + "loss": 0.0033, + "step": 4274 + }, + { + "epoch": 3.303533500675806, + "grad_norm": 0.013781500048935413, + "learning_rate": 3.117466623188888e-05, + "loss": 0.0032, + "step": 4275 + }, + { + "epoch": 3.3043058505502994, + "grad_norm": 0.00863671489059925, + "learning_rate": 3.114967828838987e-05, + "loss": 0.0031, + "step": 4276 + }, + { + "epoch": 3.3050782004247923, + "grad_norm": 0.009823620319366455, + "learning_rate": 3.112469583176402e-05, + "loss": 0.0036, + "step": 4277 + }, + { + "epoch": 3.3058505502992857, + "grad_norm": 0.010731927119195461, + "learning_rate": 3.109971886928311e-05, + "loss": 0.0033, + "step": 4278 + }, + { + "epoch": 3.3066229001737786, + "grad_norm": 0.017347775399684906, + "learning_rate": 3.107474740821736e-05, + "loss": 0.0031, + "step": 4279 + }, + { + "epoch": 3.307395250048272, + "grad_norm": 0.009366197511553764, + "learning_rate": 3.104978145583535e-05, + "loss": 0.0033, + "step": 4280 + }, + { + "epoch": 3.308167599922765, + "grad_norm": 0.014173255302011967, + "learning_rate": 3.1024821019404047e-05, + "loss": 0.0033, + "step": 4281 + }, + { + "epoch": 3.3089399497972583, + "grad_norm": 0.01867513731122017, + "learning_rate": 3.099986610618885e-05, + "loss": 0.0035, + "step": 4282 + }, + { + "epoch": 3.309712299671751, + "grad_norm": 0.01063159853219986, + "learning_rate": 3.097491672345351e-05, + "loss": 0.0029, + "step": 4283 + }, + { + "epoch": 3.3104846495462446, + "grad_norm": 0.010607903823256493, + "learning_rate": 3.094997287846023e-05, + "loss": 0.003, + "step": 4284 + }, + { + "epoch": 3.3112569994207375, + "grad_norm": 0.007606419734656811, + "learning_rate": 3.092503457846952e-05, + "loss": 0.0032, + "step": 4285 + }, + { + "epoch": 3.312029349295231, + "grad_norm": 0.015880458056926727, + "learning_rate": 3.0900101830740325e-05, + "loss": 0.0033, + "step": 4286 + }, + { + "epoch": 3.312801699169724, + "grad_norm": 0.008047875016927719, + "learning_rate": 3.087517464252999e-05, + "loss": 0.0034, + "step": 4287 + }, + { + "epoch": 3.313574049044217, + "grad_norm": 0.00905267708003521, + "learning_rate": 3.08502530210942e-05, + "loss": 0.0033, + "step": 4288 + }, + { + "epoch": 3.31434639891871, + "grad_norm": 0.009174262173473835, + "learning_rate": 3.0825336973687015e-05, + "loss": 0.0031, + "step": 4289 + }, + { + "epoch": 3.3151187487932035, + "grad_norm": 0.010227401740849018, + "learning_rate": 3.0800426507560934e-05, + "loss": 0.0032, + "step": 4290 + }, + { + "epoch": 3.3158910986676964, + "grad_norm": 0.009878808632493019, + "learning_rate": 3.077552162996677e-05, + "loss": 0.0033, + "step": 4291 + }, + { + "epoch": 3.3166634485421898, + "grad_norm": 0.009305208921432495, + "learning_rate": 3.0750622348153726e-05, + "loss": 0.0031, + "step": 4292 + }, + { + "epoch": 3.3174357984166827, + "grad_norm": 0.00841295626014471, + "learning_rate": 3.072572866936939e-05, + "loss": 0.0036, + "step": 4293 + }, + { + "epoch": 3.318208148291176, + "grad_norm": 0.008652658201754093, + "learning_rate": 3.0700840600859727e-05, + "loss": 0.0033, + "step": 4294 + }, + { + "epoch": 3.318980498165669, + "grad_norm": 0.01883789338171482, + "learning_rate": 3.067595814986901e-05, + "loss": 0.003, + "step": 4295 + }, + { + "epoch": 3.3197528480401624, + "grad_norm": 0.010518788360059261, + "learning_rate": 3.0651081323639956e-05, + "loss": 0.0037, + "step": 4296 + }, + { + "epoch": 3.3205251979146553, + "grad_norm": 0.010141740553081036, + "learning_rate": 3.0626210129413606e-05, + "loss": 0.0032, + "step": 4297 + }, + { + "epoch": 3.3212975477891487, + "grad_norm": 0.010843572206795216, + "learning_rate": 3.060134457442935e-05, + "loss": 0.003, + "step": 4298 + }, + { + "epoch": 3.3220698976636416, + "grad_norm": 0.009291105903685093, + "learning_rate": 3.057648466592494e-05, + "loss": 0.0028, + "step": 4299 + }, + { + "epoch": 3.322842247538135, + "grad_norm": 0.010753841139376163, + "learning_rate": 3.055163041113653e-05, + "loss": 0.0032, + "step": 4300 + }, + { + "epoch": 3.323614597412628, + "grad_norm": 0.01016042847186327, + "learning_rate": 3.052678181729856e-05, + "loss": 0.003, + "step": 4301 + }, + { + "epoch": 3.3243869472871213, + "grad_norm": 0.010187407955527306, + "learning_rate": 3.0501938891643856e-05, + "loss": 0.0035, + "step": 4302 + }, + { + "epoch": 3.325159297161614, + "grad_norm": 0.009946313686668873, + "learning_rate": 3.047710164140362e-05, + "loss": 0.0034, + "step": 4303 + }, + { + "epoch": 3.325931647036107, + "grad_norm": 0.009504447691142559, + "learning_rate": 3.0452270073807366e-05, + "loss": 0.0028, + "step": 4304 + }, + { + "epoch": 3.3267039969106005, + "grad_norm": 0.010526880621910095, + "learning_rate": 3.0427444196082943e-05, + "loss": 0.0028, + "step": 4305 + }, + { + "epoch": 3.327476346785094, + "grad_norm": 0.010826265439391136, + "learning_rate": 3.040262401545658e-05, + "loss": 0.0032, + "step": 4306 + }, + { + "epoch": 3.328248696659587, + "grad_norm": 0.0090884268283844, + "learning_rate": 3.0377809539152846e-05, + "loss": 0.0034, + "step": 4307 + }, + { + "epoch": 3.3290210465340797, + "grad_norm": 0.008787116967141628, + "learning_rate": 3.0353000774394602e-05, + "loss": 0.0032, + "step": 4308 + }, + { + "epoch": 3.329793396408573, + "grad_norm": 0.020502891391515732, + "learning_rate": 3.0328197728403107e-05, + "loss": 0.0038, + "step": 4309 + }, + { + "epoch": 3.3305657462830665, + "grad_norm": 0.011007401160895824, + "learning_rate": 3.030340040839793e-05, + "loss": 0.0034, + "step": 4310 + }, + { + "epoch": 3.3313380961575594, + "grad_norm": 0.008363964967429638, + "learning_rate": 3.027860882159693e-05, + "loss": 0.0032, + "step": 4311 + }, + { + "epoch": 3.3321104460320523, + "grad_norm": 0.01503025647252798, + "learning_rate": 3.0253822975216397e-05, + "loss": 0.0034, + "step": 4312 + }, + { + "epoch": 3.3328827959065457, + "grad_norm": 0.014644940383732319, + "learning_rate": 3.0229042876470835e-05, + "loss": 0.0039, + "step": 4313 + }, + { + "epoch": 3.333655145781039, + "grad_norm": 0.012132090516388416, + "learning_rate": 3.02042685325732e-05, + "loss": 0.0031, + "step": 4314 + }, + { + "epoch": 3.334427495655532, + "grad_norm": 0.012613854371011257, + "learning_rate": 3.0179499950734647e-05, + "loss": 0.0036, + "step": 4315 + }, + { + "epoch": 3.335199845530025, + "grad_norm": 0.008916930295526981, + "learning_rate": 3.0154737138164723e-05, + "loss": 0.0035, + "step": 4316 + }, + { + "epoch": 3.3359721954045183, + "grad_norm": 0.02021050825715065, + "learning_rate": 3.012998010207131e-05, + "loss": 0.0032, + "step": 4317 + }, + { + "epoch": 3.336744545279011, + "grad_norm": 0.014060622081160545, + "learning_rate": 3.010522884966056e-05, + "loss": 0.003, + "step": 4318 + }, + { + "epoch": 3.3375168951535046, + "grad_norm": 0.015764638781547546, + "learning_rate": 3.008048338813696e-05, + "loss": 0.0034, + "step": 4319 + }, + { + "epoch": 3.3382892450279975, + "grad_norm": 0.010799824260175228, + "learning_rate": 3.0055743724703343e-05, + "loss": 0.0036, + "step": 4320 + }, + { + "epoch": 3.339061594902491, + "grad_norm": 0.018081430345773697, + "learning_rate": 3.00310098665608e-05, + "loss": 0.0036, + "step": 4321 + }, + { + "epoch": 3.339833944776984, + "grad_norm": 0.014368054457008839, + "learning_rate": 3.0006281820908766e-05, + "loss": 0.0032, + "step": 4322 + }, + { + "epoch": 3.340606294651477, + "grad_norm": 0.016936352476477623, + "learning_rate": 2.9981559594944987e-05, + "loss": 0.0034, + "step": 4323 + }, + { + "epoch": 3.34137864452597, + "grad_norm": 0.012375048361718655, + "learning_rate": 2.9956843195865515e-05, + "loss": 0.0027, + "step": 4324 + }, + { + "epoch": 3.3421509944004635, + "grad_norm": 0.013704188168048859, + "learning_rate": 2.9932132630864662e-05, + "loss": 0.003, + "step": 4325 + }, + { + "epoch": 3.3429233442749564, + "grad_norm": 0.012674598023295403, + "learning_rate": 2.990742790713509e-05, + "loss": 0.0028, + "step": 4326 + }, + { + "epoch": 3.34369569414945, + "grad_norm": 0.016746561974287033, + "learning_rate": 2.9882729031867772e-05, + "loss": 0.0034, + "step": 4327 + }, + { + "epoch": 3.3444680440239427, + "grad_norm": 0.012405228801071644, + "learning_rate": 2.9858036012251928e-05, + "loss": 0.0028, + "step": 4328 + }, + { + "epoch": 3.345240393898436, + "grad_norm": 0.011292995885014534, + "learning_rate": 2.9833348855475097e-05, + "loss": 0.0032, + "step": 4329 + }, + { + "epoch": 3.346012743772929, + "grad_norm": 0.009587439708411694, + "learning_rate": 2.9808667568723136e-05, + "loss": 0.0029, + "step": 4330 + }, + { + "epoch": 3.3467850936474224, + "grad_norm": 0.008292783051729202, + "learning_rate": 2.978399215918015e-05, + "loss": 0.0029, + "step": 4331 + }, + { + "epoch": 3.3475574435219153, + "grad_norm": 0.015449166297912598, + "learning_rate": 2.9759322634028557e-05, + "loss": 0.0033, + "step": 4332 + }, + { + "epoch": 3.3483297933964087, + "grad_norm": 0.007792491000145674, + "learning_rate": 2.9734659000449073e-05, + "loss": 0.0031, + "step": 4333 + }, + { + "epoch": 3.3491021432709016, + "grad_norm": 0.008889835327863693, + "learning_rate": 2.971000126562069e-05, + "loss": 0.003, + "step": 4334 + }, + { + "epoch": 3.349874493145395, + "grad_norm": 0.007819905877113342, + "learning_rate": 2.9685349436720646e-05, + "loss": 0.0032, + "step": 4335 + }, + { + "epoch": 3.350646843019888, + "grad_norm": 0.008440805599093437, + "learning_rate": 2.9660703520924528e-05, + "loss": 0.0029, + "step": 4336 + }, + { + "epoch": 3.3514191928943813, + "grad_norm": 0.012632417492568493, + "learning_rate": 2.9636063525406156e-05, + "loss": 0.0035, + "step": 4337 + }, + { + "epoch": 3.352191542768874, + "grad_norm": 0.010322070680558681, + "learning_rate": 2.9611429457337613e-05, + "loss": 0.0034, + "step": 4338 + }, + { + "epoch": 3.3529638926433676, + "grad_norm": 0.009829297661781311, + "learning_rate": 2.9586801323889314e-05, + "loss": 0.0034, + "step": 4339 + }, + { + "epoch": 3.3537362425178605, + "grad_norm": 0.010246274061501026, + "learning_rate": 2.9562179132229906e-05, + "loss": 0.0037, + "step": 4340 + }, + { + "epoch": 3.354508592392354, + "grad_norm": 0.010691756382584572, + "learning_rate": 2.9537562889526292e-05, + "loss": 0.0035, + "step": 4341 + }, + { + "epoch": 3.355280942266847, + "grad_norm": 0.012063859961926937, + "learning_rate": 2.95129526029437e-05, + "loss": 0.003, + "step": 4342 + }, + { + "epoch": 3.35605329214134, + "grad_norm": 0.009755225852131844, + "learning_rate": 2.948834827964555e-05, + "loss": 0.0035, + "step": 4343 + }, + { + "epoch": 3.356825642015833, + "grad_norm": 0.010563291609287262, + "learning_rate": 2.9463749926793616e-05, + "loss": 0.0032, + "step": 4344 + }, + { + "epoch": 3.3575979918903265, + "grad_norm": 0.009643464349210262, + "learning_rate": 2.9439157551547848e-05, + "loss": 0.0035, + "step": 4345 + }, + { + "epoch": 3.3583703417648194, + "grad_norm": 0.007082138676196337, + "learning_rate": 2.9414571161066485e-05, + "loss": 0.0027, + "step": 4346 + }, + { + "epoch": 3.359142691639313, + "grad_norm": 0.014375030994415283, + "learning_rate": 2.938999076250607e-05, + "loss": 0.0032, + "step": 4347 + }, + { + "epoch": 3.3599150415138057, + "grad_norm": 0.00861175823956728, + "learning_rate": 2.9365416363021325e-05, + "loss": 0.0027, + "step": 4348 + }, + { + "epoch": 3.360687391388299, + "grad_norm": 0.011097020469605923, + "learning_rate": 2.9340847969765278e-05, + "loss": 0.0032, + "step": 4349 + }, + { + "epoch": 3.361459741262792, + "grad_norm": 0.01772846095263958, + "learning_rate": 2.931628558988922e-05, + "loss": 0.0032, + "step": 4350 + }, + { + "epoch": 3.3622320911372854, + "grad_norm": 0.011551863513886929, + "learning_rate": 2.9291729230542607e-05, + "loss": 0.0032, + "step": 4351 + }, + { + "epoch": 3.3630044410117783, + "grad_norm": 0.0144035080447793, + "learning_rate": 2.926717889887326e-05, + "loss": 0.0033, + "step": 4352 + }, + { + "epoch": 3.3637767908862717, + "grad_norm": 0.01008586585521698, + "learning_rate": 2.9242634602027142e-05, + "loss": 0.0029, + "step": 4353 + }, + { + "epoch": 3.3645491407607646, + "grad_norm": 0.013119881972670555, + "learning_rate": 2.921809634714855e-05, + "loss": 0.0032, + "step": 4354 + }, + { + "epoch": 3.3653214906352575, + "grad_norm": 0.019614320248365402, + "learning_rate": 2.9193564141379942e-05, + "loss": 0.003, + "step": 4355 + }, + { + "epoch": 3.366093840509751, + "grad_norm": 0.016630122438073158, + "learning_rate": 2.9169037991862052e-05, + "loss": 0.0032, + "step": 4356 + }, + { + "epoch": 3.3668661903842443, + "grad_norm": 0.013291397131979465, + "learning_rate": 2.914451790573389e-05, + "loss": 0.0031, + "step": 4357 + }, + { + "epoch": 3.367638540258737, + "grad_norm": 0.010741799138486385, + "learning_rate": 2.9120003890132596e-05, + "loss": 0.0028, + "step": 4358 + }, + { + "epoch": 3.36841089013323, + "grad_norm": 0.008894594386219978, + "learning_rate": 2.9095495952193652e-05, + "loss": 0.0037, + "step": 4359 + }, + { + "epoch": 3.3691832400077235, + "grad_norm": 0.020004941150546074, + "learning_rate": 2.9070994099050742e-05, + "loss": 0.0035, + "step": 4360 + }, + { + "epoch": 3.369955589882217, + "grad_norm": 0.022412752732634544, + "learning_rate": 2.9046498337835698e-05, + "loss": 0.0038, + "step": 4361 + }, + { + "epoch": 3.37072793975671, + "grad_norm": 0.013329553417861462, + "learning_rate": 2.9022008675678703e-05, + "loss": 0.0031, + "step": 4362 + }, + { + "epoch": 3.3715002896312027, + "grad_norm": 0.009267443791031837, + "learning_rate": 2.8997525119708086e-05, + "loss": 0.0031, + "step": 4363 + }, + { + "epoch": 3.372272639505696, + "grad_norm": 0.01385226659476757, + "learning_rate": 2.897304767705042e-05, + "loss": 0.0037, + "step": 4364 + }, + { + "epoch": 3.373044989380189, + "grad_norm": 0.015692459419369698, + "learning_rate": 2.894857635483049e-05, + "loss": 0.0031, + "step": 4365 + }, + { + "epoch": 3.3738173392546824, + "grad_norm": 0.02431158907711506, + "learning_rate": 2.8924111160171318e-05, + "loss": 0.0032, + "step": 4366 + }, + { + "epoch": 3.3745896891291753, + "grad_norm": 0.014064965769648552, + "learning_rate": 2.889965210019413e-05, + "loss": 0.0032, + "step": 4367 + }, + { + "epoch": 3.3753620390036687, + "grad_norm": 0.008142712526023388, + "learning_rate": 2.8875199182018363e-05, + "loss": 0.0028, + "step": 4368 + }, + { + "epoch": 3.3761343888781616, + "grad_norm": 0.02171209640800953, + "learning_rate": 2.8850752412761666e-05, + "loss": 0.0033, + "step": 4369 + }, + { + "epoch": 3.376906738752655, + "grad_norm": 0.01805691048502922, + "learning_rate": 2.8826311799539907e-05, + "loss": 0.003, + "step": 4370 + }, + { + "epoch": 3.377679088627148, + "grad_norm": 0.016302617266774178, + "learning_rate": 2.8801877349467166e-05, + "loss": 0.0029, + "step": 4371 + }, + { + "epoch": 3.3784514385016413, + "grad_norm": 0.011869368143379688, + "learning_rate": 2.8777449069655704e-05, + "loss": 0.0031, + "step": 4372 + }, + { + "epoch": 3.3792237883761342, + "grad_norm": 0.011161305010318756, + "learning_rate": 2.8753026967216025e-05, + "loss": 0.0033, + "step": 4373 + }, + { + "epoch": 3.3799961382506276, + "grad_norm": 0.013350460678339005, + "learning_rate": 2.8728611049256794e-05, + "loss": 0.0033, + "step": 4374 + }, + { + "epoch": 3.3807684881251205, + "grad_norm": 0.01783025823533535, + "learning_rate": 2.8704201322884895e-05, + "loss": 0.0035, + "step": 4375 + }, + { + "epoch": 3.381540837999614, + "grad_norm": 0.017704160884022713, + "learning_rate": 2.867979779520542e-05, + "loss": 0.0034, + "step": 4376 + }, + { + "epoch": 3.382313187874107, + "grad_norm": 0.015627730637788773, + "learning_rate": 2.865540047332165e-05, + "loss": 0.0029, + "step": 4377 + }, + { + "epoch": 3.3830855377486, + "grad_norm": 0.009404877200722694, + "learning_rate": 2.8631009364335036e-05, + "loss": 0.0028, + "step": 4378 + }, + { + "epoch": 3.383857887623093, + "grad_norm": 0.011657997965812683, + "learning_rate": 2.8606624475345255e-05, + "loss": 0.0034, + "step": 4379 + }, + { + "epoch": 3.3846302374975865, + "grad_norm": 0.029644042253494263, + "learning_rate": 2.858224581345016e-05, + "loss": 0.0035, + "step": 4380 + }, + { + "epoch": 3.3854025873720794, + "grad_norm": 0.012362591922283173, + "learning_rate": 2.8557873385745793e-05, + "loss": 0.0032, + "step": 4381 + }, + { + "epoch": 3.386174937246573, + "grad_norm": 0.011876262724399567, + "learning_rate": 2.8533507199326358e-05, + "loss": 0.0033, + "step": 4382 + }, + { + "epoch": 3.3869472871210657, + "grad_norm": 0.015863923355937004, + "learning_rate": 2.8509147261284287e-05, + "loss": 0.003, + "step": 4383 + }, + { + "epoch": 3.387719636995559, + "grad_norm": 0.011141281574964523, + "learning_rate": 2.848479357871016e-05, + "loss": 0.0034, + "step": 4384 + }, + { + "epoch": 3.388491986870052, + "grad_norm": 0.012105172500014305, + "learning_rate": 2.8460446158692743e-05, + "loss": 0.0031, + "step": 4385 + }, + { + "epoch": 3.3892643367445454, + "grad_norm": 0.01008552499115467, + "learning_rate": 2.8436105008318993e-05, + "loss": 0.0036, + "step": 4386 + }, + { + "epoch": 3.3900366866190383, + "grad_norm": 0.010281000286340714, + "learning_rate": 2.841177013467401e-05, + "loss": 0.0036, + "step": 4387 + }, + { + "epoch": 3.3908090364935317, + "grad_norm": 0.013542928732931614, + "learning_rate": 2.8387441544841103e-05, + "loss": 0.0035, + "step": 4388 + }, + { + "epoch": 3.3915813863680246, + "grad_norm": 0.01576226018369198, + "learning_rate": 2.8363119245901737e-05, + "loss": 0.0035, + "step": 4389 + }, + { + "epoch": 3.392353736242518, + "grad_norm": 0.016558324918150902, + "learning_rate": 2.8338803244935537e-05, + "loss": 0.0034, + "step": 4390 + }, + { + "epoch": 3.393126086117011, + "grad_norm": 0.009921234101057053, + "learning_rate": 2.831449354902031e-05, + "loss": 0.0034, + "step": 4391 + }, + { + "epoch": 3.3938984359915043, + "grad_norm": 0.011850431561470032, + "learning_rate": 2.8290190165232023e-05, + "loss": 0.0033, + "step": 4392 + }, + { + "epoch": 3.3946707858659972, + "grad_norm": 0.014793440699577332, + "learning_rate": 2.8265893100644774e-05, + "loss": 0.0032, + "step": 4393 + }, + { + "epoch": 3.3954431357404906, + "grad_norm": 0.02236754447221756, + "learning_rate": 2.824160236233092e-05, + "loss": 0.0039, + "step": 4394 + }, + { + "epoch": 3.3962154856149835, + "grad_norm": 0.008079919032752514, + "learning_rate": 2.8217317957360843e-05, + "loss": 0.003, + "step": 4395 + }, + { + "epoch": 3.396987835489477, + "grad_norm": 0.015229464508593082, + "learning_rate": 2.819303989280315e-05, + "loss": 0.0033, + "step": 4396 + }, + { + "epoch": 3.39776018536397, + "grad_norm": 0.008884802460670471, + "learning_rate": 2.8168768175724662e-05, + "loss": 0.0032, + "step": 4397 + }, + { + "epoch": 3.398532535238463, + "grad_norm": 0.01749601401388645, + "learning_rate": 2.814450281319022e-05, + "loss": 0.0035, + "step": 4398 + }, + { + "epoch": 3.399304885112956, + "grad_norm": 0.021450838074088097, + "learning_rate": 2.8120243812262892e-05, + "loss": 0.0037, + "step": 4399 + }, + { + "epoch": 3.4000772349874495, + "grad_norm": 0.006622872315347195, + "learning_rate": 2.809599118000395e-05, + "loss": 0.0032, + "step": 4400 + }, + { + "epoch": 3.4008495848619424, + "grad_norm": 0.024547424167394638, + "learning_rate": 2.8071744923472675e-05, + "loss": 0.0035, + "step": 4401 + }, + { + "epoch": 3.4016219347364354, + "grad_norm": 0.020672762766480446, + "learning_rate": 2.8047505049726586e-05, + "loss": 0.0034, + "step": 4402 + }, + { + "epoch": 3.4023942846109287, + "grad_norm": 0.007940291427075863, + "learning_rate": 2.8023271565821347e-05, + "loss": 0.0028, + "step": 4403 + }, + { + "epoch": 3.403166634485422, + "grad_norm": 0.00886029377579689, + "learning_rate": 2.799904447881074e-05, + "loss": 0.003, + "step": 4404 + }, + { + "epoch": 3.403938984359915, + "grad_norm": 0.011499281041324139, + "learning_rate": 2.7974823795746642e-05, + "loss": 0.0031, + "step": 4405 + }, + { + "epoch": 3.404711334234408, + "grad_norm": 0.030732352286577225, + "learning_rate": 2.795060952367916e-05, + "loss": 0.0029, + "step": 4406 + }, + { + "epoch": 3.4054836841089013, + "grad_norm": 0.011285774409770966, + "learning_rate": 2.7926401669656476e-05, + "loss": 0.0037, + "step": 4407 + }, + { + "epoch": 3.4062560339833947, + "grad_norm": 0.010116951540112495, + "learning_rate": 2.7902200240724875e-05, + "loss": 0.0034, + "step": 4408 + }, + { + "epoch": 3.4070283838578876, + "grad_norm": 0.014413068071007729, + "learning_rate": 2.7878005243928813e-05, + "loss": 0.0031, + "step": 4409 + }, + { + "epoch": 3.4078007337323806, + "grad_norm": 0.009910531342029572, + "learning_rate": 2.7853816686310924e-05, + "loss": 0.0033, + "step": 4410 + }, + { + "epoch": 3.408573083606874, + "grad_norm": 0.007654004730284214, + "learning_rate": 2.7829634574911854e-05, + "loss": 0.003, + "step": 4411 + }, + { + "epoch": 3.409345433481367, + "grad_norm": 0.009115166030824184, + "learning_rate": 2.7805458916770432e-05, + "loss": 0.0035, + "step": 4412 + }, + { + "epoch": 3.4101177833558602, + "grad_norm": 0.010139040648937225, + "learning_rate": 2.7781289718923674e-05, + "loss": 0.0033, + "step": 4413 + }, + { + "epoch": 3.410890133230353, + "grad_norm": 0.009890168905258179, + "learning_rate": 2.7757126988406578e-05, + "loss": 0.003, + "step": 4414 + }, + { + "epoch": 3.4116624831048465, + "grad_norm": 0.00818744394928217, + "learning_rate": 2.7732970732252337e-05, + "loss": 0.003, + "step": 4415 + }, + { + "epoch": 3.4124348329793395, + "grad_norm": 0.008600006811320782, + "learning_rate": 2.7708820957492287e-05, + "loss": 0.0033, + "step": 4416 + }, + { + "epoch": 3.413207182853833, + "grad_norm": 0.007462901528924704, + "learning_rate": 2.7684677671155856e-05, + "loss": 0.0029, + "step": 4417 + }, + { + "epoch": 3.4139795327283258, + "grad_norm": 0.010175359435379505, + "learning_rate": 2.7660540880270507e-05, + "loss": 0.0033, + "step": 4418 + }, + { + "epoch": 3.414751882602819, + "grad_norm": 0.01211362425237894, + "learning_rate": 2.7636410591861927e-05, + "loss": 0.0033, + "step": 4419 + }, + { + "epoch": 3.415524232477312, + "grad_norm": 0.007548519875854254, + "learning_rate": 2.7612286812953876e-05, + "loss": 0.0027, + "step": 4420 + }, + { + "epoch": 3.4162965823518054, + "grad_norm": 0.011262309737503529, + "learning_rate": 2.7588169550568132e-05, + "loss": 0.003, + "step": 4421 + }, + { + "epoch": 3.4170689322262984, + "grad_norm": 0.011649170890450478, + "learning_rate": 2.756405881172471e-05, + "loss": 0.003, + "step": 4422 + }, + { + "epoch": 3.4178412821007917, + "grad_norm": 0.011346418410539627, + "learning_rate": 2.753995460344166e-05, + "loss": 0.0029, + "step": 4423 + }, + { + "epoch": 3.4186136319752847, + "grad_norm": 0.014188813045620918, + "learning_rate": 2.751585693273508e-05, + "loss": 0.0039, + "step": 4424 + }, + { + "epoch": 3.419385981849778, + "grad_norm": 0.00839192047715187, + "learning_rate": 2.7491765806619275e-05, + "loss": 0.0034, + "step": 4425 + }, + { + "epoch": 3.420158331724271, + "grad_norm": 0.00843577366322279, + "learning_rate": 2.7467681232106573e-05, + "loss": 0.003, + "step": 4426 + }, + { + "epoch": 3.4209306815987643, + "grad_norm": 0.009509515017271042, + "learning_rate": 2.7443603216207414e-05, + "loss": 0.0029, + "step": 4427 + }, + { + "epoch": 3.4217030314732573, + "grad_norm": 0.008844722993671894, + "learning_rate": 2.7419531765930324e-05, + "loss": 0.0031, + "step": 4428 + }, + { + "epoch": 3.4224753813477506, + "grad_norm": 0.010160795412957668, + "learning_rate": 2.7395466888281916e-05, + "loss": 0.003, + "step": 4429 + }, + { + "epoch": 3.4232477312222436, + "grad_norm": 0.01939941942691803, + "learning_rate": 2.7371408590266902e-05, + "loss": 0.0033, + "step": 4430 + }, + { + "epoch": 3.424020081096737, + "grad_norm": 0.01017013005912304, + "learning_rate": 2.7347356878888066e-05, + "loss": 0.0034, + "step": 4431 + }, + { + "epoch": 3.42479243097123, + "grad_norm": 0.007695165928453207, + "learning_rate": 2.732331176114628e-05, + "loss": 0.0034, + "step": 4432 + }, + { + "epoch": 3.4255647808457232, + "grad_norm": 0.007821792736649513, + "learning_rate": 2.7299273244040523e-05, + "loss": 0.0031, + "step": 4433 + }, + { + "epoch": 3.426337130720216, + "grad_norm": 0.015497798100113869, + "learning_rate": 2.727524133456776e-05, + "loss": 0.0032, + "step": 4434 + }, + { + "epoch": 3.4271094805947095, + "grad_norm": 0.01589919440448284, + "learning_rate": 2.725121603972316e-05, + "loss": 0.0028, + "step": 4435 + }, + { + "epoch": 3.4278818304692025, + "grad_norm": 0.012722725979983807, + "learning_rate": 2.722719736649989e-05, + "loss": 0.0037, + "step": 4436 + }, + { + "epoch": 3.428654180343696, + "grad_norm": 0.010282904841005802, + "learning_rate": 2.7203185321889203e-05, + "loss": 0.0034, + "step": 4437 + }, + { + "epoch": 3.4294265302181888, + "grad_norm": 0.010863185860216618, + "learning_rate": 2.7179179912880416e-05, + "loss": 0.0033, + "step": 4438 + }, + { + "epoch": 3.430198880092682, + "grad_norm": 0.013260203413665295, + "learning_rate": 2.715518114646094e-05, + "loss": 0.003, + "step": 4439 + }, + { + "epoch": 3.430971229967175, + "grad_norm": 0.018179895356297493, + "learning_rate": 2.713118902961622e-05, + "loss": 0.0031, + "step": 4440 + }, + { + "epoch": 3.4317435798416684, + "grad_norm": 0.021851636469364166, + "learning_rate": 2.71072035693298e-05, + "loss": 0.0032, + "step": 4441 + }, + { + "epoch": 3.4325159297161614, + "grad_norm": 0.01149999350309372, + "learning_rate": 2.7083224772583255e-05, + "loss": 0.0027, + "step": 4442 + }, + { + "epoch": 3.4332882795906547, + "grad_norm": 0.009091292507946491, + "learning_rate": 2.705925264635623e-05, + "loss": 0.0031, + "step": 4443 + }, + { + "epoch": 3.4340606294651477, + "grad_norm": 0.01148899830877781, + "learning_rate": 2.703528719762644e-05, + "loss": 0.0032, + "step": 4444 + }, + { + "epoch": 3.434832979339641, + "grad_norm": 0.01642238348722458, + "learning_rate": 2.701132843336965e-05, + "loss": 0.0035, + "step": 4445 + }, + { + "epoch": 3.435605329214134, + "grad_norm": 0.011737337335944176, + "learning_rate": 2.698737636055967e-05, + "loss": 0.003, + "step": 4446 + }, + { + "epoch": 3.4363776790886273, + "grad_norm": 0.013013274408876896, + "learning_rate": 2.696343098616837e-05, + "loss": 0.003, + "step": 4447 + }, + { + "epoch": 3.4371500289631203, + "grad_norm": 0.0073637585155665874, + "learning_rate": 2.693949231716567e-05, + "loss": 0.0032, + "step": 4448 + }, + { + "epoch": 3.437922378837613, + "grad_norm": 0.01137218065559864, + "learning_rate": 2.6915560360519544e-05, + "loss": 0.0031, + "step": 4449 + }, + { + "epoch": 3.4386947287121066, + "grad_norm": 0.00971105694770813, + "learning_rate": 2.6891635123196003e-05, + "loss": 0.0034, + "step": 4450 + }, + { + "epoch": 3.4394670785866, + "grad_norm": 0.013804982416331768, + "learning_rate": 2.68677166121591e-05, + "loss": 0.0036, + "step": 4451 + }, + { + "epoch": 3.440239428461093, + "grad_norm": 0.00971175916492939, + "learning_rate": 2.6843804834370946e-05, + "loss": 0.0034, + "step": 4452 + }, + { + "epoch": 3.441011778335586, + "grad_norm": 0.010609552264213562, + "learning_rate": 2.681989979679168e-05, + "loss": 0.0031, + "step": 4453 + }, + { + "epoch": 3.441784128210079, + "grad_norm": 0.009030761197209358, + "learning_rate": 2.679600150637947e-05, + "loss": 0.0033, + "step": 4454 + }, + { + "epoch": 3.4425564780845725, + "grad_norm": 0.013067315332591534, + "learning_rate": 2.6772109970090543e-05, + "loss": 0.0033, + "step": 4455 + }, + { + "epoch": 3.4433288279590655, + "grad_norm": 0.01771661825478077, + "learning_rate": 2.6748225194879117e-05, + "loss": 0.0033, + "step": 4456 + }, + { + "epoch": 3.4441011778335584, + "grad_norm": 0.008268121629953384, + "learning_rate": 2.6724347187697547e-05, + "loss": 0.0029, + "step": 4457 + }, + { + "epoch": 3.4448735277080518, + "grad_norm": 0.009157293476164341, + "learning_rate": 2.6700475955496073e-05, + "loss": 0.003, + "step": 4458 + }, + { + "epoch": 3.4456458775825447, + "grad_norm": 0.016499847173690796, + "learning_rate": 2.667661150522304e-05, + "loss": 0.0031, + "step": 4459 + }, + { + "epoch": 3.446418227457038, + "grad_norm": 0.010825825855135918, + "learning_rate": 2.6652753843824862e-05, + "loss": 0.003, + "step": 4460 + }, + { + "epoch": 3.447190577331531, + "grad_norm": 0.008508339524269104, + "learning_rate": 2.662890297824588e-05, + "loss": 0.0035, + "step": 4461 + }, + { + "epoch": 3.4479629272060244, + "grad_norm": 0.009811155498027802, + "learning_rate": 2.6605058915428515e-05, + "loss": 0.003, + "step": 4462 + }, + { + "epoch": 3.4487352770805173, + "grad_norm": 0.009760680608451366, + "learning_rate": 2.658122166231321e-05, + "loss": 0.003, + "step": 4463 + }, + { + "epoch": 3.4495076269550107, + "grad_norm": 0.012883470393717289, + "learning_rate": 2.65573912258384e-05, + "loss": 0.0033, + "step": 4464 + }, + { + "epoch": 3.4502799768295036, + "grad_norm": 0.009158185683190823, + "learning_rate": 2.6533567612940558e-05, + "loss": 0.003, + "step": 4465 + }, + { + "epoch": 3.451052326703997, + "grad_norm": 0.0091365035623312, + "learning_rate": 2.6509750830554143e-05, + "loss": 0.003, + "step": 4466 + }, + { + "epoch": 3.45182467657849, + "grad_norm": 0.015592905692756176, + "learning_rate": 2.6485940885611692e-05, + "loss": 0.0033, + "step": 4467 + }, + { + "epoch": 3.4525970264529833, + "grad_norm": 0.011113499291241169, + "learning_rate": 2.646213778504366e-05, + "loss": 0.0032, + "step": 4468 + }, + { + "epoch": 3.453369376327476, + "grad_norm": 0.012405202724039555, + "learning_rate": 2.6438341535778554e-05, + "loss": 0.0035, + "step": 4469 + }, + { + "epoch": 3.4541417262019696, + "grad_norm": 0.009333050809800625, + "learning_rate": 2.6414552144742933e-05, + "loss": 0.0029, + "step": 4470 + }, + { + "epoch": 3.4549140760764625, + "grad_norm": 0.010121024213731289, + "learning_rate": 2.6390769618861265e-05, + "loss": 0.0033, + "step": 4471 + }, + { + "epoch": 3.455686425950956, + "grad_norm": 0.009522978216409683, + "learning_rate": 2.6366993965056063e-05, + "loss": 0.003, + "step": 4472 + }, + { + "epoch": 3.456458775825449, + "grad_norm": 0.007180164568126202, + "learning_rate": 2.634322519024791e-05, + "loss": 0.0028, + "step": 4473 + }, + { + "epoch": 3.457231125699942, + "grad_norm": 0.01016936358064413, + "learning_rate": 2.6319463301355264e-05, + "loss": 0.0032, + "step": 4474 + }, + { + "epoch": 3.458003475574435, + "grad_norm": 0.007608390878885984, + "learning_rate": 2.629570830529463e-05, + "loss": 0.003, + "step": 4475 + }, + { + "epoch": 3.4587758254489285, + "grad_norm": 0.008880065754055977, + "learning_rate": 2.627196020898056e-05, + "loss": 0.0031, + "step": 4476 + }, + { + "epoch": 3.4595481753234214, + "grad_norm": 0.011755947023630142, + "learning_rate": 2.6248219019325544e-05, + "loss": 0.0027, + "step": 4477 + }, + { + "epoch": 3.4603205251979148, + "grad_norm": 0.010038244538009167, + "learning_rate": 2.622448474324001e-05, + "loss": 0.0031, + "step": 4478 + }, + { + "epoch": 3.4610928750724077, + "grad_norm": 0.01359093002974987, + "learning_rate": 2.6200757387632496e-05, + "loss": 0.0029, + "step": 4479 + }, + { + "epoch": 3.461865224946901, + "grad_norm": 0.008809817023575306, + "learning_rate": 2.617703695940946e-05, + "loss": 0.0027, + "step": 4480 + }, + { + "epoch": 3.462637574821394, + "grad_norm": 0.010269363410770893, + "learning_rate": 2.615332346547529e-05, + "loss": 0.0032, + "step": 4481 + }, + { + "epoch": 3.4634099246958874, + "grad_norm": 0.00966144073754549, + "learning_rate": 2.6129616912732457e-05, + "loss": 0.0032, + "step": 4482 + }, + { + "epoch": 3.4641822745703803, + "grad_norm": 0.014523585326969624, + "learning_rate": 2.6105917308081384e-05, + "loss": 0.003, + "step": 4483 + }, + { + "epoch": 3.4649546244448737, + "grad_norm": 0.014877774752676487, + "learning_rate": 2.6082224658420385e-05, + "loss": 0.0029, + "step": 4484 + }, + { + "epoch": 3.4657269743193666, + "grad_norm": 0.009811308234930038, + "learning_rate": 2.6058538970645875e-05, + "loss": 0.003, + "step": 4485 + }, + { + "epoch": 3.46649932419386, + "grad_norm": 0.016033053398132324, + "learning_rate": 2.603486025165216e-05, + "loss": 0.0031, + "step": 4486 + }, + { + "epoch": 3.467271674068353, + "grad_norm": 0.011928362771868706, + "learning_rate": 2.601118850833158e-05, + "loss": 0.0027, + "step": 4487 + }, + { + "epoch": 3.4680440239428463, + "grad_norm": 0.007985375821590424, + "learning_rate": 2.598752374757434e-05, + "loss": 0.003, + "step": 4488 + }, + { + "epoch": 3.468816373817339, + "grad_norm": 0.011764492839574814, + "learning_rate": 2.596386597626873e-05, + "loss": 0.0031, + "step": 4489 + }, + { + "epoch": 3.4695887236918326, + "grad_norm": 0.016873568296432495, + "learning_rate": 2.5940215201300965e-05, + "loss": 0.0027, + "step": 4490 + }, + { + "epoch": 3.4703610735663255, + "grad_norm": 0.012676803395152092, + "learning_rate": 2.5916571429555157e-05, + "loss": 0.0029, + "step": 4491 + }, + { + "epoch": 3.471133423440819, + "grad_norm": 0.009303895756602287, + "learning_rate": 2.58929346679135e-05, + "loss": 0.0034, + "step": 4492 + }, + { + "epoch": 3.471905773315312, + "grad_norm": 0.0083675691857934, + "learning_rate": 2.5869304923256067e-05, + "loss": 0.003, + "step": 4493 + }, + { + "epoch": 3.472678123189805, + "grad_norm": 0.021918458864092827, + "learning_rate": 2.5845682202460857e-05, + "loss": 0.0037, + "step": 4494 + }, + { + "epoch": 3.473450473064298, + "grad_norm": 0.013355713337659836, + "learning_rate": 2.582206651240393e-05, + "loss": 0.0031, + "step": 4495 + }, + { + "epoch": 3.474222822938791, + "grad_norm": 0.008953472599387169, + "learning_rate": 2.579845785995923e-05, + "loss": 0.0027, + "step": 4496 + }, + { + "epoch": 3.4749951728132844, + "grad_norm": 0.007987641729414463, + "learning_rate": 2.5774856251998657e-05, + "loss": 0.0034, + "step": 4497 + }, + { + "epoch": 3.4757675226877778, + "grad_norm": 0.010496587492525578, + "learning_rate": 2.575126169539207e-05, + "loss": 0.0031, + "step": 4498 + }, + { + "epoch": 3.4765398725622707, + "grad_norm": 0.010220379568636417, + "learning_rate": 2.572767419700728e-05, + "loss": 0.0033, + "step": 4499 + }, + { + "epoch": 3.4773122224367636, + "grad_norm": 0.013059530407190323, + "learning_rate": 2.570409376371003e-05, + "loss": 0.0032, + "step": 4500 + }, + { + "epoch": 3.478084572311257, + "grad_norm": 0.009420751594007015, + "learning_rate": 2.5680520402364015e-05, + "loss": 0.0033, + "step": 4501 + }, + { + "epoch": 3.4788569221857504, + "grad_norm": 0.010078287683427334, + "learning_rate": 2.5656954119830885e-05, + "loss": 0.0031, + "step": 4502 + }, + { + "epoch": 3.4796292720602433, + "grad_norm": 0.00803342368453741, + "learning_rate": 2.5633394922970194e-05, + "loss": 0.0029, + "step": 4503 + }, + { + "epoch": 3.4804016219347362, + "grad_norm": 0.00801512785255909, + "learning_rate": 2.5609842818639473e-05, + "loss": 0.0031, + "step": 4504 + }, + { + "epoch": 3.4811739718092296, + "grad_norm": 0.009992556646466255, + "learning_rate": 2.558629781369416e-05, + "loss": 0.0036, + "step": 4505 + }, + { + "epoch": 3.4819463216837225, + "grad_norm": 0.010577374137938023, + "learning_rate": 2.5562759914987632e-05, + "loss": 0.0033, + "step": 4506 + }, + { + "epoch": 3.482718671558216, + "grad_norm": 0.008530070073902607, + "learning_rate": 2.5539229129371222e-05, + "loss": 0.0028, + "step": 4507 + }, + { + "epoch": 3.483491021432709, + "grad_norm": 0.009210624732077122, + "learning_rate": 2.551570546369415e-05, + "loss": 0.0032, + "step": 4508 + }, + { + "epoch": 3.484263371307202, + "grad_norm": 0.012428054586052895, + "learning_rate": 2.5492188924803606e-05, + "loss": 0.0028, + "step": 4509 + }, + { + "epoch": 3.485035721181695, + "grad_norm": 0.009001705795526505, + "learning_rate": 2.5468679519544682e-05, + "loss": 0.0033, + "step": 4510 + }, + { + "epoch": 3.4858080710561885, + "grad_norm": 0.012225321494042873, + "learning_rate": 2.5445177254760387e-05, + "loss": 0.0036, + "step": 4511 + }, + { + "epoch": 3.4865804209306814, + "grad_norm": 0.00832716841250658, + "learning_rate": 2.542168213729167e-05, + "loss": 0.0031, + "step": 4512 + }, + { + "epoch": 3.487352770805175, + "grad_norm": 0.01162449549883604, + "learning_rate": 2.5398194173977397e-05, + "loss": 0.0034, + "step": 4513 + }, + { + "epoch": 3.4881251206796677, + "grad_norm": 0.010331190191209316, + "learning_rate": 2.5374713371654335e-05, + "loss": 0.0029, + "step": 4514 + }, + { + "epoch": 3.488897470554161, + "grad_norm": 0.01203492097556591, + "learning_rate": 2.5351239737157184e-05, + "loss": 0.0031, + "step": 4515 + }, + { + "epoch": 3.489669820428654, + "grad_norm": 0.013049033470451832, + "learning_rate": 2.5327773277318558e-05, + "loss": 0.0033, + "step": 4516 + }, + { + "epoch": 3.4904421703031474, + "grad_norm": 0.00960687268525362, + "learning_rate": 2.5304313998968964e-05, + "loss": 0.0032, + "step": 4517 + }, + { + "epoch": 3.4912145201776403, + "grad_norm": 0.009507626295089722, + "learning_rate": 2.5280861908936843e-05, + "loss": 0.0029, + "step": 4518 + }, + { + "epoch": 3.4919868700521337, + "grad_norm": 0.011058974079787731, + "learning_rate": 2.5257417014048522e-05, + "loss": 0.0031, + "step": 4519 + }, + { + "epoch": 3.4927592199266266, + "grad_norm": 0.016918856650590897, + "learning_rate": 2.5233979321128254e-05, + "loss": 0.0035, + "step": 4520 + }, + { + "epoch": 3.49353156980112, + "grad_norm": 0.009435974061489105, + "learning_rate": 2.521054883699817e-05, + "loss": 0.0032, + "step": 4521 + }, + { + "epoch": 3.494303919675613, + "grad_norm": 0.00975219439715147, + "learning_rate": 2.5187125568478332e-05, + "loss": 0.003, + "step": 4522 + }, + { + "epoch": 3.4950762695501063, + "grad_norm": 0.013011807575821877, + "learning_rate": 2.5163709522386685e-05, + "loss": 0.0035, + "step": 4523 + }, + { + "epoch": 3.4958486194245992, + "grad_norm": 0.010547825135290623, + "learning_rate": 2.5140300705539065e-05, + "loss": 0.0028, + "step": 4524 + }, + { + "epoch": 3.4966209692990926, + "grad_norm": 0.00943104736506939, + "learning_rate": 2.5116899124749228e-05, + "loss": 0.0031, + "step": 4525 + }, + { + "epoch": 3.4973933191735855, + "grad_norm": 0.00984232034534216, + "learning_rate": 2.509350478682877e-05, + "loss": 0.0029, + "step": 4526 + }, + { + "epoch": 3.498165669048079, + "grad_norm": 0.007884285412728786, + "learning_rate": 2.507011769858729e-05, + "loss": 0.0033, + "step": 4527 + }, + { + "epoch": 3.498938018922572, + "grad_norm": 0.011110519990324974, + "learning_rate": 2.504673786683215e-05, + "loss": 0.0029, + "step": 4528 + }, + { + "epoch": 3.499710368797065, + "grad_norm": 0.012727515771985054, + "learning_rate": 2.502336529836864e-05, + "loss": 0.0032, + "step": 4529 + }, + { + "epoch": 3.500482718671558, + "grad_norm": 0.011709541082382202, + "learning_rate": 2.500000000000001e-05, + "loss": 0.003, + "step": 4530 + }, + { + "epoch": 3.5012550685460515, + "grad_norm": 0.006993581540882587, + "learning_rate": 2.4976641978527287e-05, + "loss": 0.0029, + "step": 4531 + }, + { + "epoch": 3.5020274184205444, + "grad_norm": 0.008528415113687515, + "learning_rate": 2.4953291240749416e-05, + "loss": 0.0033, + "step": 4532 + }, + { + "epoch": 3.502799768295038, + "grad_norm": 0.010851817205548286, + "learning_rate": 2.49299477934633e-05, + "loss": 0.0034, + "step": 4533 + }, + { + "epoch": 3.5035721181695307, + "grad_norm": 0.015704330056905746, + "learning_rate": 2.4906611643463577e-05, + "loss": 0.0036, + "step": 4534 + }, + { + "epoch": 3.504344468044024, + "grad_norm": 0.02083994261920452, + "learning_rate": 2.4883282797542862e-05, + "loss": 0.0036, + "step": 4535 + }, + { + "epoch": 3.505116817918517, + "grad_norm": 0.010550920851528645, + "learning_rate": 2.4859961262491633e-05, + "loss": 0.0032, + "step": 4536 + }, + { + "epoch": 3.5058891677930104, + "grad_norm": 0.00966651551425457, + "learning_rate": 2.4836647045098244e-05, + "loss": 0.0031, + "step": 4537 + }, + { + "epoch": 3.5066615176675033, + "grad_norm": 0.012379195541143417, + "learning_rate": 2.481334015214883e-05, + "loss": 0.0033, + "step": 4538 + }, + { + "epoch": 3.5074338675419963, + "grad_norm": 0.01680929958820343, + "learning_rate": 2.4790040590427522e-05, + "loss": 0.0035, + "step": 4539 + }, + { + "epoch": 3.5082062174164896, + "grad_norm": 0.011752322316169739, + "learning_rate": 2.476674836671627e-05, + "loss": 0.0031, + "step": 4540 + }, + { + "epoch": 3.508978567290983, + "grad_norm": 0.014056556858122349, + "learning_rate": 2.4743463487794826e-05, + "loss": 0.0032, + "step": 4541 + }, + { + "epoch": 3.509750917165476, + "grad_norm": 0.008536022156476974, + "learning_rate": 2.4720185960440867e-05, + "loss": 0.0034, + "step": 4542 + }, + { + "epoch": 3.510523267039969, + "grad_norm": 0.009675630368292332, + "learning_rate": 2.469691579142997e-05, + "loss": 0.0037, + "step": 4543 + }, + { + "epoch": 3.5112956169144622, + "grad_norm": 0.015018140897154808, + "learning_rate": 2.467365298753547e-05, + "loss": 0.0033, + "step": 4544 + }, + { + "epoch": 3.5120679667889556, + "grad_norm": 0.015164218842983246, + "learning_rate": 2.4650397555528594e-05, + "loss": 0.0032, + "step": 4545 + }, + { + "epoch": 3.5128403166634485, + "grad_norm": 0.00981762446463108, + "learning_rate": 2.4627149502178476e-05, + "loss": 0.0028, + "step": 4546 + }, + { + "epoch": 3.5136126665379415, + "grad_norm": 0.010019616223871708, + "learning_rate": 2.460390883425207e-05, + "loss": 0.0025, + "step": 4547 + }, + { + "epoch": 3.514385016412435, + "grad_norm": 0.011683452874422073, + "learning_rate": 2.4580675558514116e-05, + "loss": 0.0029, + "step": 4548 + }, + { + "epoch": 3.515157366286928, + "grad_norm": 0.012275191023945808, + "learning_rate": 2.4557449681727312e-05, + "loss": 0.0034, + "step": 4549 + }, + { + "epoch": 3.515929716161421, + "grad_norm": 0.009555431082844734, + "learning_rate": 2.4534231210652152e-05, + "loss": 0.0029, + "step": 4550 + }, + { + "epoch": 3.516702066035914, + "grad_norm": 0.01259004045277834, + "learning_rate": 2.4511020152046914e-05, + "loss": 0.0034, + "step": 4551 + }, + { + "epoch": 3.5174744159104074, + "grad_norm": 0.008695184253156185, + "learning_rate": 2.4487816512667833e-05, + "loss": 0.0035, + "step": 4552 + }, + { + "epoch": 3.518246765784901, + "grad_norm": 0.010859877802431583, + "learning_rate": 2.4464620299268925e-05, + "loss": 0.0033, + "step": 4553 + }, + { + "epoch": 3.5190191156593937, + "grad_norm": 0.011442749761044979, + "learning_rate": 2.444143151860199e-05, + "loss": 0.0031, + "step": 4554 + }, + { + "epoch": 3.5197914655338867, + "grad_norm": 0.009345135651528835, + "learning_rate": 2.441825017741679e-05, + "loss": 0.0029, + "step": 4555 + }, + { + "epoch": 3.52056381540838, + "grad_norm": 0.009865041822195053, + "learning_rate": 2.4395076282460823e-05, + "loss": 0.0032, + "step": 4556 + }, + { + "epoch": 3.5213361652828734, + "grad_norm": 0.008451285772025585, + "learning_rate": 2.437190984047945e-05, + "loss": 0.0036, + "step": 4557 + }, + { + "epoch": 3.5221085151573663, + "grad_norm": 0.023029906675219536, + "learning_rate": 2.434875085821587e-05, + "loss": 0.0032, + "step": 4558 + }, + { + "epoch": 3.5228808650318593, + "grad_norm": 0.013170885853469372, + "learning_rate": 2.4325599342411094e-05, + "loss": 0.003, + "step": 4559 + }, + { + "epoch": 3.5236532149063526, + "grad_norm": 0.008758633397519588, + "learning_rate": 2.430245529980397e-05, + "loss": 0.0032, + "step": 4560 + }, + { + "epoch": 3.5244255647808456, + "grad_norm": 0.011905158869922161, + "learning_rate": 2.427931873713118e-05, + "loss": 0.0033, + "step": 4561 + }, + { + "epoch": 3.525197914655339, + "grad_norm": 0.013737745583057404, + "learning_rate": 2.425618966112721e-05, + "loss": 0.0033, + "step": 4562 + }, + { + "epoch": 3.525970264529832, + "grad_norm": 0.013534064404666424, + "learning_rate": 2.4233068078524375e-05, + "loss": 0.0035, + "step": 4563 + }, + { + "epoch": 3.5267426144043252, + "grad_norm": 0.008490565232932568, + "learning_rate": 2.4209953996052807e-05, + "loss": 0.0032, + "step": 4564 + }, + { + "epoch": 3.527514964278818, + "grad_norm": 0.008811703883111477, + "learning_rate": 2.4186847420440462e-05, + "loss": 0.003, + "step": 4565 + }, + { + "epoch": 3.5282873141533115, + "grad_norm": 0.01871240697801113, + "learning_rate": 2.4163748358413106e-05, + "loss": 0.0034, + "step": 4566 + }, + { + "epoch": 3.5290596640278045, + "grad_norm": 0.012067742645740509, + "learning_rate": 2.4140656816694318e-05, + "loss": 0.003, + "step": 4567 + }, + { + "epoch": 3.529832013902298, + "grad_norm": 0.012779579497873783, + "learning_rate": 2.4117572802005483e-05, + "loss": 0.0031, + "step": 4568 + }, + { + "epoch": 3.5306043637767908, + "grad_norm": 0.008804393000900745, + "learning_rate": 2.4094496321065803e-05, + "loss": 0.0031, + "step": 4569 + }, + { + "epoch": 3.531376713651284, + "grad_norm": 0.011759286746382713, + "learning_rate": 2.407142738059228e-05, + "loss": 0.0029, + "step": 4570 + }, + { + "epoch": 3.532149063525777, + "grad_norm": 0.017655447125434875, + "learning_rate": 2.4048365987299732e-05, + "loss": 0.0034, + "step": 4571 + }, + { + "epoch": 3.5329214134002704, + "grad_norm": 0.009172452613711357, + "learning_rate": 2.402531214790077e-05, + "loss": 0.0028, + "step": 4572 + }, + { + "epoch": 3.5336937632747634, + "grad_norm": 0.0073136230930686, + "learning_rate": 2.4002265869105813e-05, + "loss": 0.0027, + "step": 4573 + }, + { + "epoch": 3.5344661131492567, + "grad_norm": 0.009583295322954655, + "learning_rate": 2.397922715762307e-05, + "loss": 0.0028, + "step": 4574 + }, + { + "epoch": 3.5352384630237497, + "grad_norm": 0.009433571249246597, + "learning_rate": 2.395619602015855e-05, + "loss": 0.003, + "step": 4575 + }, + { + "epoch": 3.536010812898243, + "grad_norm": 0.010750960558652878, + "learning_rate": 2.393317246341607e-05, + "loss": 0.003, + "step": 4576 + }, + { + "epoch": 3.536783162772736, + "grad_norm": 0.01642046682536602, + "learning_rate": 2.391015649409723e-05, + "loss": 0.0029, + "step": 4577 + }, + { + "epoch": 3.5375555126472293, + "grad_norm": 0.012440281920135021, + "learning_rate": 2.388714811890142e-05, + "loss": 0.0033, + "step": 4578 + }, + { + "epoch": 3.5383278625217223, + "grad_norm": 0.011066598817706108, + "learning_rate": 2.3864147344525823e-05, + "loss": 0.003, + "step": 4579 + }, + { + "epoch": 3.5391002123962156, + "grad_norm": 0.010555957444012165, + "learning_rate": 2.3841154177665403e-05, + "loss": 0.0038, + "step": 4580 + }, + { + "epoch": 3.5398725622707086, + "grad_norm": 0.011121445335447788, + "learning_rate": 2.381816862501292e-05, + "loss": 0.0034, + "step": 4581 + }, + { + "epoch": 3.540644912145202, + "grad_norm": 0.009262949228286743, + "learning_rate": 2.3795190693258918e-05, + "loss": 0.0029, + "step": 4582 + }, + { + "epoch": 3.541417262019695, + "grad_norm": 0.013648148626089096, + "learning_rate": 2.377222038909171e-05, + "loss": 0.0034, + "step": 4583 + }, + { + "epoch": 3.5421896118941882, + "grad_norm": 0.01162190455943346, + "learning_rate": 2.37492577191974e-05, + "loss": 0.0034, + "step": 4584 + }, + { + "epoch": 3.542961961768681, + "grad_norm": 0.008952927775681019, + "learning_rate": 2.3726302690259867e-05, + "loss": 0.0034, + "step": 4585 + }, + { + "epoch": 3.543734311643174, + "grad_norm": 0.011494430713355541, + "learning_rate": 2.370335530896074e-05, + "loss": 0.0035, + "step": 4586 + }, + { + "epoch": 3.5445066615176675, + "grad_norm": 0.007824796251952648, + "learning_rate": 2.3680415581979508e-05, + "loss": 0.0032, + "step": 4587 + }, + { + "epoch": 3.545279011392161, + "grad_norm": 0.008753915317356586, + "learning_rate": 2.3657483515993324e-05, + "loss": 0.0033, + "step": 4588 + }, + { + "epoch": 3.5460513612666538, + "grad_norm": 0.008178495801985264, + "learning_rate": 2.3634559117677142e-05, + "loss": 0.003, + "step": 4589 + }, + { + "epoch": 3.5468237111411467, + "grad_norm": 0.010274248197674751, + "learning_rate": 2.361164239370377e-05, + "loss": 0.0032, + "step": 4590 + }, + { + "epoch": 3.54759606101564, + "grad_norm": 0.009580101817846298, + "learning_rate": 2.358873335074365e-05, + "loss": 0.0032, + "step": 4591 + }, + { + "epoch": 3.5483684108901334, + "grad_norm": 0.00803334079682827, + "learning_rate": 2.3565831995465045e-05, + "loss": 0.0032, + "step": 4592 + }, + { + "epoch": 3.5491407607646264, + "grad_norm": 0.01077238842844963, + "learning_rate": 2.3542938334534055e-05, + "loss": 0.0031, + "step": 4593 + }, + { + "epoch": 3.5499131106391193, + "grad_norm": 0.012328112497925758, + "learning_rate": 2.3520052374614403e-05, + "loss": 0.0038, + "step": 4594 + }, + { + "epoch": 3.5506854605136127, + "grad_norm": 0.010183589532971382, + "learning_rate": 2.349717412236767e-05, + "loss": 0.0037, + "step": 4595 + }, + { + "epoch": 3.551457810388106, + "grad_norm": 0.011016666889190674, + "learning_rate": 2.3474303584453155e-05, + "loss": 0.0024, + "step": 4596 + }, + { + "epoch": 3.552230160262599, + "grad_norm": 0.008774489164352417, + "learning_rate": 2.345144076752792e-05, + "loss": 0.0036, + "step": 4597 + }, + { + "epoch": 3.553002510137092, + "grad_norm": 0.013731655664741993, + "learning_rate": 2.342858567824678e-05, + "loss": 0.0033, + "step": 4598 + }, + { + "epoch": 3.5537748600115853, + "grad_norm": 0.00853305496275425, + "learning_rate": 2.3405738323262277e-05, + "loss": 0.003, + "step": 4599 + }, + { + "epoch": 3.5545472098860786, + "grad_norm": 0.0098152756690979, + "learning_rate": 2.3382898709224787e-05, + "loss": 0.0029, + "step": 4600 + }, + { + "epoch": 3.5553195597605716, + "grad_norm": 0.008680333383381367, + "learning_rate": 2.3360066842782306e-05, + "loss": 0.0033, + "step": 4601 + }, + { + "epoch": 3.5560919096350645, + "grad_norm": 0.009033185429871082, + "learning_rate": 2.3337242730580644e-05, + "loss": 0.0033, + "step": 4602 + }, + { + "epoch": 3.556864259509558, + "grad_norm": 0.013320722617208958, + "learning_rate": 2.33144263792634e-05, + "loss": 0.0034, + "step": 4603 + }, + { + "epoch": 3.5576366093840512, + "grad_norm": 0.0077407159842550755, + "learning_rate": 2.329161779547181e-05, + "loss": 0.003, + "step": 4604 + }, + { + "epoch": 3.558408959258544, + "grad_norm": 0.009022112935781479, + "learning_rate": 2.32688169858449e-05, + "loss": 0.0029, + "step": 4605 + }, + { + "epoch": 3.559181309133037, + "grad_norm": 0.008708495646715164, + "learning_rate": 2.324602395701949e-05, + "loss": 0.0033, + "step": 4606 + }, + { + "epoch": 3.5599536590075305, + "grad_norm": 0.01210398692637682, + "learning_rate": 2.3223238715630025e-05, + "loss": 0.0027, + "step": 4607 + }, + { + "epoch": 3.5607260088820234, + "grad_norm": 0.008263209834694862, + "learning_rate": 2.3200461268308744e-05, + "loss": 0.003, + "step": 4608 + }, + { + "epoch": 3.5614983587565168, + "grad_norm": 0.009071472100913525, + "learning_rate": 2.3177691621685638e-05, + "loss": 0.0035, + "step": 4609 + }, + { + "epoch": 3.5622707086310097, + "grad_norm": 0.007416809909045696, + "learning_rate": 2.3154929782388406e-05, + "loss": 0.0026, + "step": 4610 + }, + { + "epoch": 3.563043058505503, + "grad_norm": 0.010057068429887295, + "learning_rate": 2.313217575704242e-05, + "loss": 0.0032, + "step": 4611 + }, + { + "epoch": 3.563815408379996, + "grad_norm": 0.010340395383536816, + "learning_rate": 2.3109429552270874e-05, + "loss": 0.0034, + "step": 4612 + }, + { + "epoch": 3.5645877582544894, + "grad_norm": 0.012999104335904121, + "learning_rate": 2.3086691174694636e-05, + "loss": 0.0033, + "step": 4613 + }, + { + "epoch": 3.5653601081289823, + "grad_norm": 0.010097958147525787, + "learning_rate": 2.3063960630932252e-05, + "loss": 0.0033, + "step": 4614 + }, + { + "epoch": 3.5661324580034757, + "grad_norm": 0.01187471579760313, + "learning_rate": 2.3041237927600085e-05, + "loss": 0.0034, + "step": 4615 + }, + { + "epoch": 3.5669048078779686, + "grad_norm": 0.009638137184083462, + "learning_rate": 2.3018523071312164e-05, + "loss": 0.0028, + "step": 4616 + }, + { + "epoch": 3.567677157752462, + "grad_norm": 0.011620833538472652, + "learning_rate": 2.299581606868018e-05, + "loss": 0.0033, + "step": 4617 + }, + { + "epoch": 3.568449507626955, + "grad_norm": 0.008225577883422375, + "learning_rate": 2.2973116926313655e-05, + "loss": 0.0026, + "step": 4618 + }, + { + "epoch": 3.5692218575014483, + "grad_norm": 0.010084412060678005, + "learning_rate": 2.2950425650819728e-05, + "loss": 0.0037, + "step": 4619 + }, + { + "epoch": 3.569994207375941, + "grad_norm": 0.010329210199415684, + "learning_rate": 2.2927742248803313e-05, + "loss": 0.003, + "step": 4620 + }, + { + "epoch": 3.5707665572504346, + "grad_norm": 0.009796016849577427, + "learning_rate": 2.2905066726866937e-05, + "loss": 0.003, + "step": 4621 + }, + { + "epoch": 3.5715389071249275, + "grad_norm": 0.010266847908496857, + "learning_rate": 2.2882399091610957e-05, + "loss": 0.0029, + "step": 4622 + }, + { + "epoch": 3.572311256999421, + "grad_norm": 0.008089693263173103, + "learning_rate": 2.2859739349633368e-05, + "loss": 0.0031, + "step": 4623 + }, + { + "epoch": 3.573083606873914, + "grad_norm": 0.009946372359991074, + "learning_rate": 2.2837087507529826e-05, + "loss": 0.003, + "step": 4624 + }, + { + "epoch": 3.573855956748407, + "grad_norm": 0.009956549853086472, + "learning_rate": 2.2814443571893783e-05, + "loss": 0.0031, + "step": 4625 + }, + { + "epoch": 3.5746283066229, + "grad_norm": 0.007649792358279228, + "learning_rate": 2.279180754931634e-05, + "loss": 0.003, + "step": 4626 + }, + { + "epoch": 3.5754006564973935, + "grad_norm": 0.009878559038043022, + "learning_rate": 2.2769179446386253e-05, + "loss": 0.0035, + "step": 4627 + }, + { + "epoch": 3.5761730063718864, + "grad_norm": 0.009770630858838558, + "learning_rate": 2.274655926969006e-05, + "loss": 0.0031, + "step": 4628 + }, + { + "epoch": 3.5769453562463798, + "grad_norm": 0.009213805198669434, + "learning_rate": 2.2723947025811933e-05, + "loss": 0.003, + "step": 4629 + }, + { + "epoch": 3.5777177061208727, + "grad_norm": 0.013433797284960747, + "learning_rate": 2.2701342721333746e-05, + "loss": 0.0032, + "step": 4630 + }, + { + "epoch": 3.578490055995366, + "grad_norm": 0.01271948404610157, + "learning_rate": 2.2678746362835073e-05, + "loss": 0.0028, + "step": 4631 + }, + { + "epoch": 3.579262405869859, + "grad_norm": 0.009294467978179455, + "learning_rate": 2.2656157956893165e-05, + "loss": 0.0028, + "step": 4632 + }, + { + "epoch": 3.580034755744352, + "grad_norm": 0.007768784649670124, + "learning_rate": 2.2633577510082953e-05, + "loss": 0.0026, + "step": 4633 + }, + { + "epoch": 3.5808071056188453, + "grad_norm": 0.010570479556918144, + "learning_rate": 2.261100502897708e-05, + "loss": 0.0029, + "step": 4634 + }, + { + "epoch": 3.5815794554933387, + "grad_norm": 0.010714959353208542, + "learning_rate": 2.2588440520145824e-05, + "loss": 0.003, + "step": 4635 + }, + { + "epoch": 3.5823518053678316, + "grad_norm": 0.008927794173359871, + "learning_rate": 2.256588399015719e-05, + "loss": 0.0034, + "step": 4636 + }, + { + "epoch": 3.5831241552423245, + "grad_norm": 0.011611155234277248, + "learning_rate": 2.2543335445576824e-05, + "loss": 0.0034, + "step": 4637 + }, + { + "epoch": 3.583896505116818, + "grad_norm": 0.016657888889312744, + "learning_rate": 2.2520794892968072e-05, + "loss": 0.0035, + "step": 4638 + }, + { + "epoch": 3.5846688549913113, + "grad_norm": 0.009623616002500057, + "learning_rate": 2.249826233889194e-05, + "loss": 0.0033, + "step": 4639 + }, + { + "epoch": 3.585441204865804, + "grad_norm": 0.009875661693513393, + "learning_rate": 2.2475737789907107e-05, + "loss": 0.0031, + "step": 4640 + }, + { + "epoch": 3.586213554740297, + "grad_norm": 0.010192835703492165, + "learning_rate": 2.2453221252569932e-05, + "loss": 0.0027, + "step": 4641 + }, + { + "epoch": 3.5869859046147905, + "grad_norm": 0.00782682653516531, + "learning_rate": 2.2430712733434433e-05, + "loss": 0.0032, + "step": 4642 + }, + { + "epoch": 3.587758254489284, + "grad_norm": 0.009048606269061565, + "learning_rate": 2.2408212239052294e-05, + "loss": 0.0033, + "step": 4643 + }, + { + "epoch": 3.588530604363777, + "grad_norm": 0.011508859694004059, + "learning_rate": 2.2385719775972863e-05, + "loss": 0.0031, + "step": 4644 + }, + { + "epoch": 3.5893029542382697, + "grad_norm": 0.007973534055054188, + "learning_rate": 2.2363235350743162e-05, + "loss": 0.0027, + "step": 4645 + }, + { + "epoch": 3.590075304112763, + "grad_norm": 0.008861503563821316, + "learning_rate": 2.234075896990785e-05, + "loss": 0.0034, + "step": 4646 + }, + { + "epoch": 3.5908476539872565, + "grad_norm": 0.014525181613862514, + "learning_rate": 2.2318290640009264e-05, + "loss": 0.0035, + "step": 4647 + }, + { + "epoch": 3.5916200038617494, + "grad_norm": 0.0103535745292902, + "learning_rate": 2.22958303675874e-05, + "loss": 0.0031, + "step": 4648 + }, + { + "epoch": 3.5923923537362423, + "grad_norm": 0.006877128500491381, + "learning_rate": 2.2273378159179892e-05, + "loss": 0.0029, + "step": 4649 + }, + { + "epoch": 3.5931647036107357, + "grad_norm": 0.01847553625702858, + "learning_rate": 2.225093402132204e-05, + "loss": 0.0035, + "step": 4650 + }, + { + "epoch": 3.593937053485229, + "grad_norm": 0.010287228040397167, + "learning_rate": 2.2228497960546778e-05, + "loss": 0.0031, + "step": 4651 + }, + { + "epoch": 3.594709403359722, + "grad_norm": 0.01384812779724598, + "learning_rate": 2.220606998338472e-05, + "loss": 0.003, + "step": 4652 + }, + { + "epoch": 3.595481753234215, + "grad_norm": 0.008724525570869446, + "learning_rate": 2.2183650096364095e-05, + "loss": 0.0031, + "step": 4653 + }, + { + "epoch": 3.5962541031087083, + "grad_norm": 0.010740946978330612, + "learning_rate": 2.21612383060108e-05, + "loss": 0.0037, + "step": 4654 + }, + { + "epoch": 3.5970264529832012, + "grad_norm": 0.012365031987428665, + "learning_rate": 2.213883461884835e-05, + "loss": 0.0031, + "step": 4655 + }, + { + "epoch": 3.5977988028576946, + "grad_norm": 0.01322922669351101, + "learning_rate": 2.2116439041397936e-05, + "loss": 0.0033, + "step": 4656 + }, + { + "epoch": 3.5985711527321875, + "grad_norm": 0.012405367568135262, + "learning_rate": 2.209405158017836e-05, + "loss": 0.0034, + "step": 4657 + }, + { + "epoch": 3.599343502606681, + "grad_norm": 0.010477261617779732, + "learning_rate": 2.2071672241706067e-05, + "loss": 0.0031, + "step": 4658 + }, + { + "epoch": 3.600115852481174, + "grad_norm": 0.009934378787875175, + "learning_rate": 2.204930103249513e-05, + "loss": 0.0032, + "step": 4659 + }, + { + "epoch": 3.600888202355667, + "grad_norm": 0.013846274465322495, + "learning_rate": 2.202693795905732e-05, + "loss": 0.0032, + "step": 4660 + }, + { + "epoch": 3.60166055223016, + "grad_norm": 0.011807041242718697, + "learning_rate": 2.2004583027901932e-05, + "loss": 0.0028, + "step": 4661 + }, + { + "epoch": 3.6024329021046535, + "grad_norm": 0.009982018731534481, + "learning_rate": 2.198223624553595e-05, + "loss": 0.0031, + "step": 4662 + }, + { + "epoch": 3.6032052519791464, + "grad_norm": 0.00852767750620842, + "learning_rate": 2.1959897618464025e-05, + "loss": 0.0024, + "step": 4663 + }, + { + "epoch": 3.60397760185364, + "grad_norm": 0.016557959839701653, + "learning_rate": 2.1937567153188353e-05, + "loss": 0.003, + "step": 4664 + }, + { + "epoch": 3.6047499517281327, + "grad_norm": 0.00804685615003109, + "learning_rate": 2.1915244856208788e-05, + "loss": 0.0029, + "step": 4665 + }, + { + "epoch": 3.605522301602626, + "grad_norm": 0.007616143673658371, + "learning_rate": 2.189293073402286e-05, + "loss": 0.003, + "step": 4666 + }, + { + "epoch": 3.606294651477119, + "grad_norm": 0.010534284636378288, + "learning_rate": 2.187062479312562e-05, + "loss": 0.0034, + "step": 4667 + }, + { + "epoch": 3.6070670013516124, + "grad_norm": 0.01100563257932663, + "learning_rate": 2.184832704000979e-05, + "loss": 0.0033, + "step": 4668 + }, + { + "epoch": 3.6078393512261053, + "grad_norm": 0.010521691292524338, + "learning_rate": 2.182603748116574e-05, + "loss": 0.0031, + "step": 4669 + }, + { + "epoch": 3.6086117011005987, + "grad_norm": 0.008225660771131516, + "learning_rate": 2.1803756123081425e-05, + "loss": 0.0029, + "step": 4670 + }, + { + "epoch": 3.6093840509750916, + "grad_norm": 0.00809970311820507, + "learning_rate": 2.1781482972242352e-05, + "loss": 0.0031, + "step": 4671 + }, + { + "epoch": 3.610156400849585, + "grad_norm": 0.008778408169746399, + "learning_rate": 2.1759218035131758e-05, + "loss": 0.0028, + "step": 4672 + }, + { + "epoch": 3.610928750724078, + "grad_norm": 0.010100818239152431, + "learning_rate": 2.173696131823042e-05, + "loss": 0.0028, + "step": 4673 + }, + { + "epoch": 3.6117011005985713, + "grad_norm": 0.007120962720364332, + "learning_rate": 2.17147128280167e-05, + "loss": 0.0031, + "step": 4674 + }, + { + "epoch": 3.6124734504730642, + "grad_norm": 0.009938807226717472, + "learning_rate": 2.1692472570966593e-05, + "loss": 0.0026, + "step": 4675 + }, + { + "epoch": 3.6132458003475576, + "grad_norm": 0.009664575569331646, + "learning_rate": 2.1670240553553755e-05, + "loss": 0.0033, + "step": 4676 + }, + { + "epoch": 3.6140181502220505, + "grad_norm": 0.010297637432813644, + "learning_rate": 2.1648016782249337e-05, + "loss": 0.0034, + "step": 4677 + }, + { + "epoch": 3.614790500096544, + "grad_norm": 0.008731442503631115, + "learning_rate": 2.1625801263522143e-05, + "loss": 0.003, + "step": 4678 + }, + { + "epoch": 3.615562849971037, + "grad_norm": 0.013439536094665527, + "learning_rate": 2.1603594003838602e-05, + "loss": 0.0033, + "step": 4679 + }, + { + "epoch": 3.6163351998455298, + "grad_norm": 0.007052401080727577, + "learning_rate": 2.158139500966272e-05, + "loss": 0.0021, + "step": 4680 + }, + { + "epoch": 3.617107549720023, + "grad_norm": 0.0082322983071208, + "learning_rate": 2.155920428745603e-05, + "loss": 0.0025, + "step": 4681 + }, + { + "epoch": 3.6178798995945165, + "grad_norm": 0.01100889965891838, + "learning_rate": 2.153702184367777e-05, + "loss": 0.0034, + "step": 4682 + }, + { + "epoch": 3.6186522494690094, + "grad_norm": 0.022772731259465218, + "learning_rate": 2.1514847684784706e-05, + "loss": 0.0032, + "step": 4683 + }, + { + "epoch": 3.6194245993435024, + "grad_norm": 0.0094189727678895, + "learning_rate": 2.1492681817231153e-05, + "loss": 0.0027, + "step": 4684 + }, + { + "epoch": 3.6201969492179957, + "grad_norm": 0.008233190514147282, + "learning_rate": 2.1470524247469115e-05, + "loss": 0.0032, + "step": 4685 + }, + { + "epoch": 3.620969299092489, + "grad_norm": 0.011170231737196445, + "learning_rate": 2.1448374981948123e-05, + "loss": 0.003, + "step": 4686 + }, + { + "epoch": 3.621741648966982, + "grad_norm": 0.010109986178576946, + "learning_rate": 2.1426234027115237e-05, + "loss": 0.0031, + "step": 4687 + }, + { + "epoch": 3.622513998841475, + "grad_norm": 0.010346760973334312, + "learning_rate": 2.140410138941521e-05, + "loss": 0.0033, + "step": 4688 + }, + { + "epoch": 3.6232863487159683, + "grad_norm": 0.00939234159886837, + "learning_rate": 2.13819770752903e-05, + "loss": 0.0032, + "step": 4689 + }, + { + "epoch": 3.6240586985904617, + "grad_norm": 0.009421358816325665, + "learning_rate": 2.135986109118036e-05, + "loss": 0.0028, + "step": 4690 + }, + { + "epoch": 3.6248310484649546, + "grad_norm": 0.007759148720651865, + "learning_rate": 2.1337753443522818e-05, + "loss": 0.0033, + "step": 4691 + }, + { + "epoch": 3.6256033983394476, + "grad_norm": 0.009183508343994617, + "learning_rate": 2.1315654138752678e-05, + "loss": 0.0031, + "step": 4692 + }, + { + "epoch": 3.626375748213941, + "grad_norm": 0.016507580876350403, + "learning_rate": 2.129356318330251e-05, + "loss": 0.003, + "step": 4693 + }, + { + "epoch": 3.6271480980884343, + "grad_norm": 0.00906956847757101, + "learning_rate": 2.127148058360246e-05, + "loss": 0.0034, + "step": 4694 + }, + { + "epoch": 3.6279204479629272, + "grad_norm": 0.008150842040777206, + "learning_rate": 2.124940634608024e-05, + "loss": 0.0037, + "step": 4695 + }, + { + "epoch": 3.62869279783742, + "grad_norm": 0.008811009116470814, + "learning_rate": 2.1227340477161122e-05, + "loss": 0.003, + "step": 4696 + }, + { + "epoch": 3.6294651477119135, + "grad_norm": 0.012078574858605862, + "learning_rate": 2.1205282983267954e-05, + "loss": 0.0034, + "step": 4697 + }, + { + "epoch": 3.630237497586407, + "grad_norm": 0.01125948503613472, + "learning_rate": 2.118323387082114e-05, + "loss": 0.0032, + "step": 4698 + }, + { + "epoch": 3.6310098474609, + "grad_norm": 0.017745889723300934, + "learning_rate": 2.1161193146238633e-05, + "loss": 0.0032, + "step": 4699 + }, + { + "epoch": 3.6317821973353928, + "grad_norm": 0.010856762528419495, + "learning_rate": 2.1139160815935965e-05, + "loss": 0.0034, + "step": 4700 + }, + { + "epoch": 3.632554547209886, + "grad_norm": 0.008668830618262291, + "learning_rate": 2.1117136886326216e-05, + "loss": 0.0029, + "step": 4701 + }, + { + "epoch": 3.633326897084379, + "grad_norm": 0.019786952063441277, + "learning_rate": 2.1095121363820015e-05, + "loss": 0.0035, + "step": 4702 + }, + { + "epoch": 3.6340992469588724, + "grad_norm": 0.015364222228527069, + "learning_rate": 2.1073114254825547e-05, + "loss": 0.0029, + "step": 4703 + }, + { + "epoch": 3.6348715968333654, + "grad_norm": 0.007902778685092926, + "learning_rate": 2.105111556574856e-05, + "loss": 0.0027, + "step": 4704 + }, + { + "epoch": 3.6356439467078587, + "grad_norm": 0.0083285728469491, + "learning_rate": 2.1029125302992325e-05, + "loss": 0.0033, + "step": 4705 + }, + { + "epoch": 3.6364162965823517, + "grad_norm": 0.01187330111861229, + "learning_rate": 2.100714347295769e-05, + "loss": 0.0032, + "step": 4706 + }, + { + "epoch": 3.637188646456845, + "grad_norm": 0.010464648716151714, + "learning_rate": 2.0985170082043027e-05, + "loss": 0.0032, + "step": 4707 + }, + { + "epoch": 3.637960996331338, + "grad_norm": 0.010964437387883663, + "learning_rate": 2.096320513664426e-05, + "loss": 0.0028, + "step": 4708 + }, + { + "epoch": 3.6387333462058313, + "grad_norm": 0.008737288415431976, + "learning_rate": 2.0941248643154858e-05, + "loss": 0.0031, + "step": 4709 + }, + { + "epoch": 3.6395056960803243, + "grad_norm": 0.012885338626801968, + "learning_rate": 2.0919300607965824e-05, + "loss": 0.0034, + "step": 4710 + }, + { + "epoch": 3.6402780459548176, + "grad_norm": 0.00732586532831192, + "learning_rate": 2.0897361037465695e-05, + "loss": 0.0028, + "step": 4711 + }, + { + "epoch": 3.6410503958293106, + "grad_norm": 0.012714467942714691, + "learning_rate": 2.087542993804056e-05, + "loss": 0.003, + "step": 4712 + }, + { + "epoch": 3.641822745703804, + "grad_norm": 0.009296673350036144, + "learning_rate": 2.085350731607403e-05, + "loss": 0.0031, + "step": 4713 + }, + { + "epoch": 3.642595095578297, + "grad_norm": 0.009091252461075783, + "learning_rate": 2.083159317794724e-05, + "loss": 0.0034, + "step": 4714 + }, + { + "epoch": 3.6433674454527902, + "grad_norm": 0.010541771538555622, + "learning_rate": 2.0809687530038872e-05, + "loss": 0.0031, + "step": 4715 + }, + { + "epoch": 3.644139795327283, + "grad_norm": 0.014411487616598606, + "learning_rate": 2.0787790378725135e-05, + "loss": 0.0031, + "step": 4716 + }, + { + "epoch": 3.6449121452017765, + "grad_norm": 0.009617066010832787, + "learning_rate": 2.0765901730379756e-05, + "loss": 0.0031, + "step": 4717 + }, + { + "epoch": 3.6456844950762695, + "grad_norm": 0.00811022613197565, + "learning_rate": 2.0744021591373996e-05, + "loss": 0.0028, + "step": 4718 + }, + { + "epoch": 3.646456844950763, + "grad_norm": 0.01548534631729126, + "learning_rate": 2.0722149968076615e-05, + "loss": 0.0032, + "step": 4719 + }, + { + "epoch": 3.6472291948252558, + "grad_norm": 0.008539475500583649, + "learning_rate": 2.0700286866853963e-05, + "loss": 0.0029, + "step": 4720 + }, + { + "epoch": 3.648001544699749, + "grad_norm": 0.009630308486521244, + "learning_rate": 2.0678432294069815e-05, + "loss": 0.0032, + "step": 4721 + }, + { + "epoch": 3.648773894574242, + "grad_norm": 0.01124610099941492, + "learning_rate": 2.0656586256085504e-05, + "loss": 0.0034, + "step": 4722 + }, + { + "epoch": 3.6495462444487354, + "grad_norm": 0.012255601584911346, + "learning_rate": 2.0634748759259936e-05, + "loss": 0.0032, + "step": 4723 + }, + { + "epoch": 3.6503185943232284, + "grad_norm": 0.015464738011360168, + "learning_rate": 2.0612919809949427e-05, + "loss": 0.0035, + "step": 4724 + }, + { + "epoch": 3.6510909441977217, + "grad_norm": 0.009021823294460773, + "learning_rate": 2.059109941450786e-05, + "loss": 0.0032, + "step": 4725 + }, + { + "epoch": 3.6518632940722147, + "grad_norm": 0.012433023191988468, + "learning_rate": 2.0569287579286674e-05, + "loss": 0.0036, + "step": 4726 + }, + { + "epoch": 3.6526356439467076, + "grad_norm": 0.008983463048934937, + "learning_rate": 2.054748431063472e-05, + "loss": 0.003, + "step": 4727 + }, + { + "epoch": 3.653407993821201, + "grad_norm": 0.008886398747563362, + "learning_rate": 2.0525689614898407e-05, + "loss": 0.0031, + "step": 4728 + }, + { + "epoch": 3.6541803436956943, + "grad_norm": 0.01756923831999302, + "learning_rate": 2.050390349842164e-05, + "loss": 0.0029, + "step": 4729 + }, + { + "epoch": 3.6549526935701873, + "grad_norm": 0.010268906131386757, + "learning_rate": 2.0482125967545878e-05, + "loss": 0.0027, + "step": 4730 + }, + { + "epoch": 3.65572504344468, + "grad_norm": 0.007442606147378683, + "learning_rate": 2.046035702860998e-05, + "loss": 0.003, + "step": 4731 + }, + { + "epoch": 3.6564973933191736, + "grad_norm": 0.017092349007725716, + "learning_rate": 2.043859668795036e-05, + "loss": 0.0035, + "step": 4732 + }, + { + "epoch": 3.657269743193667, + "grad_norm": 0.007818550802767277, + "learning_rate": 2.0416844951900987e-05, + "loss": 0.003, + "step": 4733 + }, + { + "epoch": 3.65804209306816, + "grad_norm": 0.010054933838546276, + "learning_rate": 2.0395101826793207e-05, + "loss": 0.0031, + "step": 4734 + }, + { + "epoch": 3.658814442942653, + "grad_norm": 0.010219238698482513, + "learning_rate": 2.037336731895591e-05, + "loss": 0.0034, + "step": 4735 + }, + { + "epoch": 3.659586792817146, + "grad_norm": 0.009758195839822292, + "learning_rate": 2.0351641434715553e-05, + "loss": 0.0029, + "step": 4736 + }, + { + "epoch": 3.6603591426916395, + "grad_norm": 0.011908262968063354, + "learning_rate": 2.0329924180395953e-05, + "loss": 0.003, + "step": 4737 + }, + { + "epoch": 3.6611314925661325, + "grad_norm": 0.008491923101246357, + "learning_rate": 2.030821556231849e-05, + "loss": 0.0033, + "step": 4738 + }, + { + "epoch": 3.6619038424406254, + "grad_norm": 0.01156783476471901, + "learning_rate": 2.0286515586802034e-05, + "loss": 0.0027, + "step": 4739 + }, + { + "epoch": 3.6626761923151188, + "grad_norm": 0.011989074759185314, + "learning_rate": 2.0264824260162946e-05, + "loss": 0.0032, + "step": 4740 + }, + { + "epoch": 3.663448542189612, + "grad_norm": 0.01123703084886074, + "learning_rate": 2.0243141588714974e-05, + "loss": 0.0033, + "step": 4741 + }, + { + "epoch": 3.664220892064105, + "grad_norm": 0.008847612887620926, + "learning_rate": 2.0221467578769487e-05, + "loss": 0.0031, + "step": 4742 + }, + { + "epoch": 3.664993241938598, + "grad_norm": 0.008390386588871479, + "learning_rate": 2.0199802236635257e-05, + "loss": 0.0028, + "step": 4743 + }, + { + "epoch": 3.6657655918130914, + "grad_norm": 0.010448389686644077, + "learning_rate": 2.0178145568618495e-05, + "loss": 0.0032, + "step": 4744 + }, + { + "epoch": 3.6665379416875847, + "grad_norm": 0.011151538230478764, + "learning_rate": 2.0156497581022983e-05, + "loss": 0.003, + "step": 4745 + }, + { + "epoch": 3.6673102915620777, + "grad_norm": 0.011992346495389938, + "learning_rate": 2.0134858280149928e-05, + "loss": 0.0035, + "step": 4746 + }, + { + "epoch": 3.6680826414365706, + "grad_norm": 0.010773863643407822, + "learning_rate": 2.011322767229795e-05, + "loss": 0.0031, + "step": 4747 + }, + { + "epoch": 3.668854991311064, + "grad_norm": 0.014904526993632317, + "learning_rate": 2.0091605763763255e-05, + "loss": 0.0029, + "step": 4748 + }, + { + "epoch": 3.6696273411855573, + "grad_norm": 0.011230251751840115, + "learning_rate": 2.006999256083944e-05, + "loss": 0.0034, + "step": 4749 + }, + { + "epoch": 3.6703996910600503, + "grad_norm": 0.009951038286089897, + "learning_rate": 2.0048388069817585e-05, + "loss": 0.003, + "step": 4750 + }, + { + "epoch": 3.671172040934543, + "grad_norm": 0.014170085079967976, + "learning_rate": 2.0026792296986242e-05, + "loss": 0.0032, + "step": 4751 + }, + { + "epoch": 3.6719443908090366, + "grad_norm": 0.0095702288672328, + "learning_rate": 2.0005205248631404e-05, + "loss": 0.0031, + "step": 4752 + }, + { + "epoch": 3.6727167406835295, + "grad_norm": 0.014286153018474579, + "learning_rate": 1.998362693103657e-05, + "loss": 0.0036, + "step": 4753 + }, + { + "epoch": 3.673489090558023, + "grad_norm": 0.01710379868745804, + "learning_rate": 1.9962057350482605e-05, + "loss": 0.0029, + "step": 4754 + }, + { + "epoch": 3.674261440432516, + "grad_norm": 0.010805051773786545, + "learning_rate": 1.994049651324795e-05, + "loss": 0.0029, + "step": 4755 + }, + { + "epoch": 3.675033790307009, + "grad_norm": 0.008329571224749088, + "learning_rate": 1.9918944425608442e-05, + "loss": 0.0028, + "step": 4756 + }, + { + "epoch": 3.675806140181502, + "grad_norm": 0.008502728305757046, + "learning_rate": 1.9897401093837325e-05, + "loss": 0.0032, + "step": 4757 + }, + { + "epoch": 3.6765784900559955, + "grad_norm": 0.013527288101613522, + "learning_rate": 1.9875866524205395e-05, + "loss": 0.0033, + "step": 4758 + }, + { + "epoch": 3.6773508399304884, + "grad_norm": 0.012341136112809181, + "learning_rate": 1.985434072298081e-05, + "loss": 0.0031, + "step": 4759 + }, + { + "epoch": 3.6781231898049818, + "grad_norm": 0.011203516274690628, + "learning_rate": 1.9832823696429236e-05, + "loss": 0.003, + "step": 4760 + }, + { + "epoch": 3.6788955396794747, + "grad_norm": 0.013294699601829052, + "learning_rate": 1.981131545081375e-05, + "loss": 0.0031, + "step": 4761 + }, + { + "epoch": 3.679667889553968, + "grad_norm": 0.008355311118066311, + "learning_rate": 1.9789815992394873e-05, + "loss": 0.003, + "step": 4762 + }, + { + "epoch": 3.680440239428461, + "grad_norm": 0.013429731130599976, + "learning_rate": 1.9768325327430588e-05, + "loss": 0.0033, + "step": 4763 + }, + { + "epoch": 3.6812125893029544, + "grad_norm": 0.015098560601472855, + "learning_rate": 1.9746843462176307e-05, + "loss": 0.003, + "step": 4764 + }, + { + "epoch": 3.6819849391774473, + "grad_norm": 0.010638771578669548, + "learning_rate": 1.972537040288488e-05, + "loss": 0.0027, + "step": 4765 + }, + { + "epoch": 3.6827572890519407, + "grad_norm": 0.010106007568538189, + "learning_rate": 1.9703906155806594e-05, + "loss": 0.0035, + "step": 4766 + }, + { + "epoch": 3.6835296389264336, + "grad_norm": 0.009524368681013584, + "learning_rate": 1.968245072718918e-05, + "loss": 0.0034, + "step": 4767 + }, + { + "epoch": 3.684301988800927, + "grad_norm": 0.014750136993825436, + "learning_rate": 1.9661004123277783e-05, + "loss": 0.0032, + "step": 4768 + }, + { + "epoch": 3.68507433867542, + "grad_norm": 0.01715848408639431, + "learning_rate": 1.9639566350315003e-05, + "loss": 0.003, + "step": 4769 + }, + { + "epoch": 3.6858466885499133, + "grad_norm": 0.009163351729512215, + "learning_rate": 1.961813741454085e-05, + "loss": 0.0031, + "step": 4770 + }, + { + "epoch": 3.686619038424406, + "grad_norm": 0.008157082833349705, + "learning_rate": 1.959671732219277e-05, + "loss": 0.0032, + "step": 4771 + }, + { + "epoch": 3.6873913882988996, + "grad_norm": 0.008473200723528862, + "learning_rate": 1.9575306079505638e-05, + "loss": 0.0036, + "step": 4772 + }, + { + "epoch": 3.6881637381733925, + "grad_norm": 0.011803386732935905, + "learning_rate": 1.955390369271175e-05, + "loss": 0.0033, + "step": 4773 + }, + { + "epoch": 3.6889360880478854, + "grad_norm": 0.015068240463733673, + "learning_rate": 1.9532510168040824e-05, + "loss": 0.0033, + "step": 4774 + }, + { + "epoch": 3.689708437922379, + "grad_norm": 0.008709238842129707, + "learning_rate": 1.9511125511719992e-05, + "loss": 0.003, + "step": 4775 + }, + { + "epoch": 3.690480787796872, + "grad_norm": 0.00890977494418621, + "learning_rate": 1.9489749729973812e-05, + "loss": 0.0032, + "step": 4776 + }, + { + "epoch": 3.691253137671365, + "grad_norm": 0.006033789366483688, + "learning_rate": 1.9468382829024263e-05, + "loss": 0.0024, + "step": 4777 + }, + { + "epoch": 3.692025487545858, + "grad_norm": 0.012680809944868088, + "learning_rate": 1.9447024815090726e-05, + "loss": 0.0031, + "step": 4778 + }, + { + "epoch": 3.6927978374203514, + "grad_norm": 0.009670529514551163, + "learning_rate": 1.9425675694389995e-05, + "loss": 0.0032, + "step": 4779 + }, + { + "epoch": 3.6935701872948448, + "grad_norm": 0.014543136581778526, + "learning_rate": 1.9404335473136327e-05, + "loss": 0.0027, + "step": 4780 + }, + { + "epoch": 3.6943425371693377, + "grad_norm": 0.008619261905550957, + "learning_rate": 1.9383004157541296e-05, + "loss": 0.003, + "step": 4781 + }, + { + "epoch": 3.6951148870438306, + "grad_norm": 0.014257128350436687, + "learning_rate": 1.936168175381395e-05, + "loss": 0.003, + "step": 4782 + }, + { + "epoch": 3.695887236918324, + "grad_norm": 0.009102889336645603, + "learning_rate": 1.9340368268160725e-05, + "loss": 0.0029, + "step": 4783 + }, + { + "epoch": 3.6966595867928174, + "grad_norm": 0.00875469483435154, + "learning_rate": 1.9319063706785462e-05, + "loss": 0.0033, + "step": 4784 + }, + { + "epoch": 3.6974319366673103, + "grad_norm": 0.009385279379785061, + "learning_rate": 1.92977680758894e-05, + "loss": 0.0031, + "step": 4785 + }, + { + "epoch": 3.698204286541803, + "grad_norm": 0.01086689718067646, + "learning_rate": 1.927648138167119e-05, + "loss": 0.0029, + "step": 4786 + }, + { + "epoch": 3.6989766364162966, + "grad_norm": 0.01208088081330061, + "learning_rate": 1.9255203630326872e-05, + "loss": 0.0028, + "step": 4787 + }, + { + "epoch": 3.69974898629079, + "grad_norm": 0.008676470257341862, + "learning_rate": 1.9233934828049884e-05, + "loss": 0.0031, + "step": 4788 + }, + { + "epoch": 3.700521336165283, + "grad_norm": 0.008346854709088802, + "learning_rate": 1.9212674981031054e-05, + "loss": 0.003, + "step": 4789 + }, + { + "epoch": 3.701293686039776, + "grad_norm": 0.00872017815709114, + "learning_rate": 1.919142409545862e-05, + "loss": 0.0029, + "step": 4790 + }, + { + "epoch": 3.702066035914269, + "grad_norm": 0.00791302789002657, + "learning_rate": 1.9170182177518203e-05, + "loss": 0.003, + "step": 4791 + }, + { + "epoch": 3.7028383857887626, + "grad_norm": 0.010449966415762901, + "learning_rate": 1.9148949233392783e-05, + "loss": 0.0028, + "step": 4792 + }, + { + "epoch": 3.7036107356632555, + "grad_norm": 0.012326085940003395, + "learning_rate": 1.9127725269262814e-05, + "loss": 0.0033, + "step": 4793 + }, + { + "epoch": 3.7043830855377484, + "grad_norm": 0.01011696644127369, + "learning_rate": 1.9106510291306033e-05, + "loss": 0.003, + "step": 4794 + }, + { + "epoch": 3.705155435412242, + "grad_norm": 0.009063688106834888, + "learning_rate": 1.90853043056976e-05, + "loss": 0.003, + "step": 4795 + }, + { + "epoch": 3.705927785286735, + "grad_norm": 0.010787200182676315, + "learning_rate": 1.906410731861012e-05, + "loss": 0.003, + "step": 4796 + }, + { + "epoch": 3.706700135161228, + "grad_norm": 0.009205954149365425, + "learning_rate": 1.904291933621347e-05, + "loss": 0.0034, + "step": 4797 + }, + { + "epoch": 3.707472485035721, + "grad_norm": 0.013273040764033794, + "learning_rate": 1.9021740364674966e-05, + "loss": 0.003, + "step": 4798 + }, + { + "epoch": 3.7082448349102144, + "grad_norm": 0.02008615992963314, + "learning_rate": 1.900057041015934e-05, + "loss": 0.0037, + "step": 4799 + }, + { + "epoch": 3.7090171847847073, + "grad_norm": 0.006965349894016981, + "learning_rate": 1.8979409478828604e-05, + "loss": 0.003, + "step": 4800 + }, + { + "epoch": 3.7097895346592007, + "grad_norm": 0.011672982014715672, + "learning_rate": 1.8958257576842194e-05, + "loss": 0.0032, + "step": 4801 + }, + { + "epoch": 3.7105618845336936, + "grad_norm": 0.010026531293988228, + "learning_rate": 1.893711471035695e-05, + "loss": 0.0033, + "step": 4802 + }, + { + "epoch": 3.711334234408187, + "grad_norm": 0.007827379740774632, + "learning_rate": 1.8915980885527053e-05, + "loss": 0.0032, + "step": 4803 + }, + { + "epoch": 3.71210658428268, + "grad_norm": 0.012139009311795235, + "learning_rate": 1.8894856108503993e-05, + "loss": 0.0033, + "step": 4804 + }, + { + "epoch": 3.7128789341571733, + "grad_norm": 0.008282513357698917, + "learning_rate": 1.8873740385436743e-05, + "loss": 0.0029, + "step": 4805 + }, + { + "epoch": 3.7136512840316662, + "grad_norm": 0.00895603746175766, + "learning_rate": 1.885263372247157e-05, + "loss": 0.0028, + "step": 4806 + }, + { + "epoch": 3.7144236339061596, + "grad_norm": 0.008988351561129093, + "learning_rate": 1.8831536125752086e-05, + "loss": 0.0031, + "step": 4807 + }, + { + "epoch": 3.7151959837806525, + "grad_norm": 0.007388737518340349, + "learning_rate": 1.8810447601419285e-05, + "loss": 0.0027, + "step": 4808 + }, + { + "epoch": 3.715968333655146, + "grad_norm": 0.010159352794289589, + "learning_rate": 1.878936815561158e-05, + "loss": 0.0033, + "step": 4809 + }, + { + "epoch": 3.716740683529639, + "grad_norm": 0.011480720713734627, + "learning_rate": 1.876829779446464e-05, + "loss": 0.0028, + "step": 4810 + }, + { + "epoch": 3.717513033404132, + "grad_norm": 0.01044251024723053, + "learning_rate": 1.8747236524111534e-05, + "loss": 0.0029, + "step": 4811 + }, + { + "epoch": 3.718285383278625, + "grad_norm": 0.010147932916879654, + "learning_rate": 1.872618435068273e-05, + "loss": 0.0035, + "step": 4812 + }, + { + "epoch": 3.7190577331531185, + "grad_norm": 0.008173985406756401, + "learning_rate": 1.8705141280305998e-05, + "loss": 0.0029, + "step": 4813 + }, + { + "epoch": 3.7198300830276114, + "grad_norm": 0.014352631755173206, + "learning_rate": 1.8684107319106424e-05, + "loss": 0.0031, + "step": 4814 + }, + { + "epoch": 3.720602432902105, + "grad_norm": 0.013318825513124466, + "learning_rate": 1.8663082473206535e-05, + "loss": 0.0029, + "step": 4815 + }, + { + "epoch": 3.7213747827765977, + "grad_norm": 0.014817917719483376, + "learning_rate": 1.864206674872615e-05, + "loss": 0.0031, + "step": 4816 + }, + { + "epoch": 3.722147132651091, + "grad_norm": 0.010158638469874859, + "learning_rate": 1.8621060151782393e-05, + "loss": 0.0033, + "step": 4817 + }, + { + "epoch": 3.722919482525584, + "grad_norm": 0.014668983407318592, + "learning_rate": 1.8600062688489827e-05, + "loss": 0.0038, + "step": 4818 + }, + { + "epoch": 3.7236918324000774, + "grad_norm": 0.009824811480939388, + "learning_rate": 1.857907436496031e-05, + "loss": 0.0028, + "step": 4819 + }, + { + "epoch": 3.7244641822745703, + "grad_norm": 0.008027785457670689, + "learning_rate": 1.8558095187302977e-05, + "loss": 0.003, + "step": 4820 + }, + { + "epoch": 3.7252365321490633, + "grad_norm": 0.012118781916797161, + "learning_rate": 1.8537125161624414e-05, + "loss": 0.0034, + "step": 4821 + }, + { + "epoch": 3.7260088820235566, + "grad_norm": 0.008490202017128468, + "learning_rate": 1.8516164294028472e-05, + "loss": 0.0028, + "step": 4822 + }, + { + "epoch": 3.72678123189805, + "grad_norm": 0.008115014061331749, + "learning_rate": 1.8495212590616357e-05, + "loss": 0.0026, + "step": 4823 + }, + { + "epoch": 3.727553581772543, + "grad_norm": 0.011491495184600353, + "learning_rate": 1.8474270057486593e-05, + "loss": 0.0032, + "step": 4824 + }, + { + "epoch": 3.728325931647036, + "grad_norm": 0.010381430387496948, + "learning_rate": 1.8453336700735057e-05, + "loss": 0.0025, + "step": 4825 + }, + { + "epoch": 3.7290982815215292, + "grad_norm": 0.008572252467274666, + "learning_rate": 1.843241252645494e-05, + "loss": 0.0032, + "step": 4826 + }, + { + "epoch": 3.7298706313960226, + "grad_norm": 0.011093181557953358, + "learning_rate": 1.8411497540736757e-05, + "loss": 0.0032, + "step": 4827 + }, + { + "epoch": 3.7306429812705155, + "grad_norm": 0.01273476891219616, + "learning_rate": 1.8390591749668362e-05, + "loss": 0.003, + "step": 4828 + }, + { + "epoch": 3.7314153311450085, + "grad_norm": 0.009482769295573235, + "learning_rate": 1.8369695159334925e-05, + "loss": 0.003, + "step": 4829 + }, + { + "epoch": 3.732187681019502, + "grad_norm": 0.008260451257228851, + "learning_rate": 1.8348807775818932e-05, + "loss": 0.0031, + "step": 4830 + }, + { + "epoch": 3.732960030893995, + "grad_norm": 0.008538060821592808, + "learning_rate": 1.8327929605200206e-05, + "loss": 0.0028, + "step": 4831 + }, + { + "epoch": 3.733732380768488, + "grad_norm": 0.00773623026907444, + "learning_rate": 1.8307060653555874e-05, + "loss": 0.0029, + "step": 4832 + }, + { + "epoch": 3.734504730642981, + "grad_norm": 0.010263267904520035, + "learning_rate": 1.828620092696038e-05, + "loss": 0.0037, + "step": 4833 + }, + { + "epoch": 3.7352770805174744, + "grad_norm": 0.01147653441876173, + "learning_rate": 1.8265350431485488e-05, + "loss": 0.0029, + "step": 4834 + }, + { + "epoch": 3.736049430391968, + "grad_norm": 0.011068695224821568, + "learning_rate": 1.8244509173200276e-05, + "loss": 0.0031, + "step": 4835 + }, + { + "epoch": 3.7368217802664607, + "grad_norm": 0.009672621265053749, + "learning_rate": 1.8223677158171128e-05, + "loss": 0.0031, + "step": 4836 + }, + { + "epoch": 3.7375941301409537, + "grad_norm": 0.008995790034532547, + "learning_rate": 1.8202854392461738e-05, + "loss": 0.003, + "step": 4837 + }, + { + "epoch": 3.738366480015447, + "grad_norm": 0.00977084506303072, + "learning_rate": 1.8182040882133118e-05, + "loss": 0.0032, + "step": 4838 + }, + { + "epoch": 3.7391388298899404, + "grad_norm": 0.016638806089758873, + "learning_rate": 1.8161236633243566e-05, + "loss": 0.0031, + "step": 4839 + }, + { + "epoch": 3.7399111797644333, + "grad_norm": 0.008934970945119858, + "learning_rate": 1.814044165184871e-05, + "loss": 0.0031, + "step": 4840 + }, + { + "epoch": 3.7406835296389263, + "grad_norm": 0.008013790473341942, + "learning_rate": 1.8119655944001458e-05, + "loss": 0.0031, + "step": 4841 + }, + { + "epoch": 3.7414558795134196, + "grad_norm": 0.008421190083026886, + "learning_rate": 1.8098879515752025e-05, + "loss": 0.0028, + "step": 4842 + }, + { + "epoch": 3.742228229387913, + "grad_norm": 0.009693178348243237, + "learning_rate": 1.807811237314794e-05, + "loss": 0.0028, + "step": 4843 + }, + { + "epoch": 3.743000579262406, + "grad_norm": 0.016526630148291588, + "learning_rate": 1.8057354522233998e-05, + "loss": 0.0027, + "step": 4844 + }, + { + "epoch": 3.743772929136899, + "grad_norm": 0.008722502738237381, + "learning_rate": 1.8036605969052322e-05, + "loss": 0.0031, + "step": 4845 + }, + { + "epoch": 3.7445452790113922, + "grad_norm": 0.012371312826871872, + "learning_rate": 1.801586671964231e-05, + "loss": 0.0031, + "step": 4846 + }, + { + "epoch": 3.745317628885885, + "grad_norm": 0.009250137954950333, + "learning_rate": 1.7995136780040656e-05, + "loss": 0.0032, + "step": 4847 + }, + { + "epoch": 3.7460899787603785, + "grad_norm": 0.008596301078796387, + "learning_rate": 1.7974416156281342e-05, + "loss": 0.0037, + "step": 4848 + }, + { + "epoch": 3.7468623286348715, + "grad_norm": 0.014486918225884438, + "learning_rate": 1.7953704854395647e-05, + "loss": 0.0032, + "step": 4849 + }, + { + "epoch": 3.747634678509365, + "grad_norm": 0.015768401324748993, + "learning_rate": 1.7933002880412125e-05, + "loss": 0.0035, + "step": 4850 + }, + { + "epoch": 3.7484070283838578, + "grad_norm": 0.015920784324407578, + "learning_rate": 1.791231024035663e-05, + "loss": 0.0033, + "step": 4851 + }, + { + "epoch": 3.749179378258351, + "grad_norm": 0.008338225074112415, + "learning_rate": 1.7891626940252255e-05, + "loss": 0.003, + "step": 4852 + }, + { + "epoch": 3.749951728132844, + "grad_norm": 0.007774571422487497, + "learning_rate": 1.787095298611947e-05, + "loss": 0.0028, + "step": 4853 + }, + { + "epoch": 3.7507240780073374, + "grad_norm": 0.008573639206588268, + "learning_rate": 1.7850288383975923e-05, + "loss": 0.0029, + "step": 4854 + }, + { + "epoch": 3.7514964278818304, + "grad_norm": 0.014735306613147259, + "learning_rate": 1.782963313983656e-05, + "loss": 0.0033, + "step": 4855 + }, + { + "epoch": 3.7522687777563237, + "grad_norm": 0.010564186610281467, + "learning_rate": 1.7808987259713688e-05, + "loss": 0.003, + "step": 4856 + }, + { + "epoch": 3.7530411276308167, + "grad_norm": 0.008973659947514534, + "learning_rate": 1.7788350749616773e-05, + "loss": 0.0028, + "step": 4857 + }, + { + "epoch": 3.75381347750531, + "grad_norm": 0.017529862001538277, + "learning_rate": 1.7767723615552594e-05, + "loss": 0.0028, + "step": 4858 + }, + { + "epoch": 3.754585827379803, + "grad_norm": 0.009524044580757618, + "learning_rate": 1.774710586352527e-05, + "loss": 0.0032, + "step": 4859 + }, + { + "epoch": 3.7553581772542963, + "grad_norm": 0.015119528397917747, + "learning_rate": 1.7726497499536084e-05, + "loss": 0.0027, + "step": 4860 + }, + { + "epoch": 3.7561305271287893, + "grad_norm": 0.015329336747527122, + "learning_rate": 1.770589852958362e-05, + "loss": 0.0035, + "step": 4861 + }, + { + "epoch": 3.7569028770032826, + "grad_norm": 0.013769777491688728, + "learning_rate": 1.768530895966379e-05, + "loss": 0.0034, + "step": 4862 + }, + { + "epoch": 3.7576752268777756, + "grad_norm": 0.009363507851958275, + "learning_rate": 1.7664728795769704e-05, + "loss": 0.0031, + "step": 4863 + }, + { + "epoch": 3.758447576752269, + "grad_norm": 0.008630231022834778, + "learning_rate": 1.7644158043891727e-05, + "loss": 0.0029, + "step": 4864 + }, + { + "epoch": 3.759219926626762, + "grad_norm": 0.010387644171714783, + "learning_rate": 1.7623596710017505e-05, + "loss": 0.0032, + "step": 4865 + }, + { + "epoch": 3.7599922765012552, + "grad_norm": 0.014387257397174835, + "learning_rate": 1.7603044800131995e-05, + "loss": 0.0037, + "step": 4866 + }, + { + "epoch": 3.760764626375748, + "grad_norm": 0.009594297036528587, + "learning_rate": 1.7582502320217315e-05, + "loss": 0.0032, + "step": 4867 + }, + { + "epoch": 3.761536976250241, + "grad_norm": 0.008206100203096867, + "learning_rate": 1.756196927625288e-05, + "loss": 0.0028, + "step": 4868 + }, + { + "epoch": 3.7623093261247345, + "grad_norm": 0.008791467174887657, + "learning_rate": 1.7541445674215418e-05, + "loss": 0.0026, + "step": 4869 + }, + { + "epoch": 3.763081675999228, + "grad_norm": 0.010609994642436504, + "learning_rate": 1.7520931520078797e-05, + "loss": 0.0034, + "step": 4870 + }, + { + "epoch": 3.7638540258737208, + "grad_norm": 0.009034925140440464, + "learning_rate": 1.7500426819814198e-05, + "loss": 0.0032, + "step": 4871 + }, + { + "epoch": 3.7646263757482137, + "grad_norm": 0.010067600756883621, + "learning_rate": 1.747993157939007e-05, + "loss": 0.0032, + "step": 4872 + }, + { + "epoch": 3.765398725622707, + "grad_norm": 0.009893916547298431, + "learning_rate": 1.745944580477209e-05, + "loss": 0.003, + "step": 4873 + }, + { + "epoch": 3.7661710754972004, + "grad_norm": 0.008948578499257565, + "learning_rate": 1.7438969501923114e-05, + "loss": 0.0029, + "step": 4874 + }, + { + "epoch": 3.7669434253716934, + "grad_norm": 0.009708845056593418, + "learning_rate": 1.7418502676803362e-05, + "loss": 0.003, + "step": 4875 + }, + { + "epoch": 3.7677157752461863, + "grad_norm": 0.009008213877677917, + "learning_rate": 1.7398045335370215e-05, + "loss": 0.003, + "step": 4876 + }, + { + "epoch": 3.7684881251206797, + "grad_norm": 0.0089599983766675, + "learning_rate": 1.737759748357827e-05, + "loss": 0.0029, + "step": 4877 + }, + { + "epoch": 3.769260474995173, + "grad_norm": 0.007111499086022377, + "learning_rate": 1.735715912737946e-05, + "loss": 0.0025, + "step": 4878 + }, + { + "epoch": 3.770032824869666, + "grad_norm": 0.012843016535043716, + "learning_rate": 1.733673027272288e-05, + "loss": 0.0031, + "step": 4879 + }, + { + "epoch": 3.770805174744159, + "grad_norm": 0.00939163751900196, + "learning_rate": 1.7316310925554836e-05, + "loss": 0.0026, + "step": 4880 + }, + { + "epoch": 3.7715775246186523, + "grad_norm": 0.010984611697494984, + "learning_rate": 1.7295901091818955e-05, + "loss": 0.0032, + "step": 4881 + }, + { + "epoch": 3.7723498744931456, + "grad_norm": 0.012250718660652637, + "learning_rate": 1.7275500777456032e-05, + "loss": 0.0029, + "step": 4882 + }, + { + "epoch": 3.7731222243676386, + "grad_norm": 0.008747139945626259, + "learning_rate": 1.72551099884041e-05, + "loss": 0.0028, + "step": 4883 + }, + { + "epoch": 3.7738945742421315, + "grad_norm": 0.008452476002275944, + "learning_rate": 1.7234728730598432e-05, + "loss": 0.0027, + "step": 4884 + }, + { + "epoch": 3.774666924116625, + "grad_norm": 0.010798325762152672, + "learning_rate": 1.7214357009971517e-05, + "loss": 0.0028, + "step": 4885 + }, + { + "epoch": 3.7754392739911182, + "grad_norm": 0.0076159583404660225, + "learning_rate": 1.719399483245307e-05, + "loss": 0.0028, + "step": 4886 + }, + { + "epoch": 3.776211623865611, + "grad_norm": 0.009635258466005325, + "learning_rate": 1.7173642203970026e-05, + "loss": 0.0028, + "step": 4887 + }, + { + "epoch": 3.776983973740104, + "grad_norm": 0.010172554291784763, + "learning_rate": 1.7153299130446545e-05, + "loss": 0.0032, + "step": 4888 + }, + { + "epoch": 3.7777563236145975, + "grad_norm": 0.009352128021419048, + "learning_rate": 1.7132965617804027e-05, + "loss": 0.0029, + "step": 4889 + }, + { + "epoch": 3.778528673489091, + "grad_norm": 0.08108187466859818, + "learning_rate": 1.7112641671961007e-05, + "loss": 0.0034, + "step": 4890 + }, + { + "epoch": 3.7793010233635838, + "grad_norm": 0.009638349525630474, + "learning_rate": 1.7092327298833345e-05, + "loss": 0.0031, + "step": 4891 + }, + { + "epoch": 3.7800733732380767, + "grad_norm": 0.011018278077244759, + "learning_rate": 1.7072022504334058e-05, + "loss": 0.0033, + "step": 4892 + }, + { + "epoch": 3.78084572311257, + "grad_norm": 0.007833022624254227, + "learning_rate": 1.7051727294373367e-05, + "loss": 0.003, + "step": 4893 + }, + { + "epoch": 3.781618072987063, + "grad_norm": 0.008461980149149895, + "learning_rate": 1.7031441674858728e-05, + "loss": 0.0028, + "step": 4894 + }, + { + "epoch": 3.7823904228615564, + "grad_norm": 0.010157615877687931, + "learning_rate": 1.7011165651694795e-05, + "loss": 0.003, + "step": 4895 + }, + { + "epoch": 3.7831627727360493, + "grad_norm": 0.007885764352977276, + "learning_rate": 1.6990899230783418e-05, + "loss": 0.0027, + "step": 4896 + }, + { + "epoch": 3.7839351226105427, + "grad_norm": 0.011371986009180546, + "learning_rate": 1.697064241802367e-05, + "loss": 0.0032, + "step": 4897 + }, + { + "epoch": 3.7847074724850356, + "grad_norm": 0.010999456979334354, + "learning_rate": 1.6950395219311822e-05, + "loss": 0.0029, + "step": 4898 + }, + { + "epoch": 3.785479822359529, + "grad_norm": 0.012473049573600292, + "learning_rate": 1.693015764054134e-05, + "loss": 0.0031, + "step": 4899 + }, + { + "epoch": 3.786252172234022, + "grad_norm": 0.01124456524848938, + "learning_rate": 1.690992968760289e-05, + "loss": 0.0036, + "step": 4900 + }, + { + "epoch": 3.7870245221085153, + "grad_norm": 0.012125209905207157, + "learning_rate": 1.6889711366384347e-05, + "loss": 0.003, + "step": 4901 + }, + { + "epoch": 3.787796871983008, + "grad_norm": 0.01850360631942749, + "learning_rate": 1.6869502682770776e-05, + "loss": 0.003, + "step": 4902 + }, + { + "epoch": 3.7885692218575016, + "grad_norm": 0.013725490309298038, + "learning_rate": 1.684930364264444e-05, + "loss": 0.0033, + "step": 4903 + }, + { + "epoch": 3.7893415717319945, + "grad_norm": 0.009691640734672546, + "learning_rate": 1.6829114251884776e-05, + "loss": 0.0027, + "step": 4904 + }, + { + "epoch": 3.790113921606488, + "grad_norm": 0.010694482363760471, + "learning_rate": 1.6808934516368446e-05, + "loss": 0.0031, + "step": 4905 + }, + { + "epoch": 3.790886271480981, + "grad_norm": 0.013220801018178463, + "learning_rate": 1.678876444196927e-05, + "loss": 0.0035, + "step": 4906 + }, + { + "epoch": 3.791658621355474, + "grad_norm": 0.008447596803307533, + "learning_rate": 1.676860403455828e-05, + "loss": 0.0029, + "step": 4907 + }, + { + "epoch": 3.792430971229967, + "grad_norm": 0.012366946786642075, + "learning_rate": 1.674845330000367e-05, + "loss": 0.003, + "step": 4908 + }, + { + "epoch": 3.7932033211044605, + "grad_norm": 0.012804819270968437, + "learning_rate": 1.6728312244170847e-05, + "loss": 0.003, + "step": 4909 + }, + { + "epoch": 3.7939756709789534, + "grad_norm": 0.010311468504369259, + "learning_rate": 1.6708180872922373e-05, + "loss": 0.0029, + "step": 4910 + }, + { + "epoch": 3.7947480208534468, + "grad_norm": 0.00922460202127695, + "learning_rate": 1.6688059192118018e-05, + "loss": 0.0032, + "step": 4911 + }, + { + "epoch": 3.7955203707279397, + "grad_norm": 0.011730996891856194, + "learning_rate": 1.6667947207614682e-05, + "loss": 0.0034, + "step": 4912 + }, + { + "epoch": 3.796292720602433, + "grad_norm": 0.008386842906475067, + "learning_rate": 1.6647844925266544e-05, + "loss": 0.0031, + "step": 4913 + }, + { + "epoch": 3.797065070476926, + "grad_norm": 0.009484478272497654, + "learning_rate": 1.662775235092483e-05, + "loss": 0.0029, + "step": 4914 + }, + { + "epoch": 3.7978374203514194, + "grad_norm": 0.009336920455098152, + "learning_rate": 1.6607669490438015e-05, + "loss": 0.0029, + "step": 4915 + }, + { + "epoch": 3.7986097702259123, + "grad_norm": 0.008269627578556538, + "learning_rate": 1.6587596349651774e-05, + "loss": 0.0027, + "step": 4916 + }, + { + "epoch": 3.7993821201004057, + "grad_norm": 0.008038083091378212, + "learning_rate": 1.6567532934408876e-05, + "loss": 0.003, + "step": 4917 + }, + { + "epoch": 3.8001544699748986, + "grad_norm": 0.013694526627659798, + "learning_rate": 1.6547479250549296e-05, + "loss": 0.0028, + "step": 4918 + }, + { + "epoch": 3.8009268198493915, + "grad_norm": 0.009768110699951649, + "learning_rate": 1.652743530391019e-05, + "loss": 0.003, + "step": 4919 + }, + { + "epoch": 3.801699169723885, + "grad_norm": 0.00901896320283413, + "learning_rate": 1.650740110032586e-05, + "loss": 0.0033, + "step": 4920 + }, + { + "epoch": 3.8024715195983783, + "grad_norm": 0.011361799202859402, + "learning_rate": 1.648737664562778e-05, + "loss": 0.0028, + "step": 4921 + }, + { + "epoch": 3.803243869472871, + "grad_norm": 0.009843561798334122, + "learning_rate": 1.646736194564457e-05, + "loss": 0.0026, + "step": 4922 + }, + { + "epoch": 3.804016219347364, + "grad_norm": 0.010585188865661621, + "learning_rate": 1.6447357006202074e-05, + "loss": 0.0037, + "step": 4923 + }, + { + "epoch": 3.8047885692218575, + "grad_norm": 0.008782930672168732, + "learning_rate": 1.64273618331232e-05, + "loss": 0.0033, + "step": 4924 + }, + { + "epoch": 3.805560919096351, + "grad_norm": 0.012191285379230976, + "learning_rate": 1.6407376432228054e-05, + "loss": 0.0034, + "step": 4925 + }, + { + "epoch": 3.806333268970844, + "grad_norm": 0.01644955947995186, + "learning_rate": 1.638740080933396e-05, + "loss": 0.0031, + "step": 4926 + }, + { + "epoch": 3.8071056188453367, + "grad_norm": 0.012493155896663666, + "learning_rate": 1.636743497025528e-05, + "loss": 0.0031, + "step": 4927 + }, + { + "epoch": 3.80787796871983, + "grad_norm": 0.012926139868795872, + "learning_rate": 1.6347478920803593e-05, + "loss": 0.0029, + "step": 4928 + }, + { + "epoch": 3.8086503185943235, + "grad_norm": 0.011629858054220676, + "learning_rate": 1.632753266678767e-05, + "loss": 0.0028, + "step": 4929 + }, + { + "epoch": 3.8094226684688164, + "grad_norm": 0.010302181355655193, + "learning_rate": 1.6307596214013342e-05, + "loss": 0.003, + "step": 4930 + }, + { + "epoch": 3.8101950183433093, + "grad_norm": 0.011941662058234215, + "learning_rate": 1.628766956828362e-05, + "loss": 0.003, + "step": 4931 + }, + { + "epoch": 3.8109673682178027, + "grad_norm": 0.013709473423659801, + "learning_rate": 1.6267752735398707e-05, + "loss": 0.0031, + "step": 4932 + }, + { + "epoch": 3.811739718092296, + "grad_norm": 0.009955592453479767, + "learning_rate": 1.6247845721155903e-05, + "loss": 0.003, + "step": 4933 + }, + { + "epoch": 3.812512067966789, + "grad_norm": 0.00988897867500782, + "learning_rate": 1.6227948531349625e-05, + "loss": 0.0029, + "step": 4934 + }, + { + "epoch": 3.813284417841282, + "grad_norm": 0.00757649214938283, + "learning_rate": 1.62080611717715e-05, + "loss": 0.0034, + "step": 4935 + }, + { + "epoch": 3.8140567677157753, + "grad_norm": 0.010603350587189198, + "learning_rate": 1.6188183648210258e-05, + "loss": 0.0032, + "step": 4936 + }, + { + "epoch": 3.8148291175902687, + "grad_norm": 0.010256056673824787, + "learning_rate": 1.6168315966451725e-05, + "loss": 0.0027, + "step": 4937 + }, + { + "epoch": 3.8156014674647616, + "grad_norm": 0.015140022151172161, + "learning_rate": 1.6148458132278948e-05, + "loss": 0.0034, + "step": 4938 + }, + { + "epoch": 3.8163738173392545, + "grad_norm": 0.011739297769963741, + "learning_rate": 1.6128610151472063e-05, + "loss": 0.003, + "step": 4939 + }, + { + "epoch": 3.817146167213748, + "grad_norm": 0.01055013108998537, + "learning_rate": 1.610877202980829e-05, + "loss": 0.0028, + "step": 4940 + }, + { + "epoch": 3.817918517088241, + "grad_norm": 0.008143901824951172, + "learning_rate": 1.608894377306207e-05, + "loss": 0.0028, + "step": 4941 + }, + { + "epoch": 3.818690866962734, + "grad_norm": 0.018344135954976082, + "learning_rate": 1.606912538700492e-05, + "loss": 0.0033, + "step": 4942 + }, + { + "epoch": 3.819463216837227, + "grad_norm": 0.012720978818833828, + "learning_rate": 1.604931687740551e-05, + "loss": 0.0032, + "step": 4943 + }, + { + "epoch": 3.8202355667117205, + "grad_norm": 0.012700404971837997, + "learning_rate": 1.6029518250029563e-05, + "loss": 0.003, + "step": 4944 + }, + { + "epoch": 3.8210079165862134, + "grad_norm": 0.01139888260513544, + "learning_rate": 1.6009729510640032e-05, + "loss": 0.0029, + "step": 4945 + }, + { + "epoch": 3.821780266460707, + "grad_norm": 0.010537398979067802, + "learning_rate": 1.5989950664996945e-05, + "loss": 0.0027, + "step": 4946 + }, + { + "epoch": 3.8225526163351997, + "grad_norm": 0.008864232338964939, + "learning_rate": 1.597018171885739e-05, + "loss": 0.0026, + "step": 4947 + }, + { + "epoch": 3.823324966209693, + "grad_norm": 0.010028931312263012, + "learning_rate": 1.595042267797569e-05, + "loss": 0.0033, + "step": 4948 + }, + { + "epoch": 3.824097316084186, + "grad_norm": 0.010445699095726013, + "learning_rate": 1.5930673548103215e-05, + "loss": 0.0033, + "step": 4949 + }, + { + "epoch": 3.8248696659586794, + "grad_norm": 0.008232830092310905, + "learning_rate": 1.5910934334988415e-05, + "loss": 0.0028, + "step": 4950 + }, + { + "epoch": 3.8256420158331723, + "grad_norm": 0.009088825434446335, + "learning_rate": 1.589120504437694e-05, + "loss": 0.0031, + "step": 4951 + }, + { + "epoch": 3.8264143657076657, + "grad_norm": 0.009279578924179077, + "learning_rate": 1.58714856820115e-05, + "loss": 0.0027, + "step": 4952 + }, + { + "epoch": 3.8271867155821586, + "grad_norm": 0.01180476974695921, + "learning_rate": 1.585177625363192e-05, + "loss": 0.0033, + "step": 4953 + }, + { + "epoch": 3.827959065456652, + "grad_norm": 0.009612584486603737, + "learning_rate": 1.583207676497515e-05, + "loss": 0.0033, + "step": 4954 + }, + { + "epoch": 3.828731415331145, + "grad_norm": 0.011616314761340618, + "learning_rate": 1.581238722177522e-05, + "loss": 0.0029, + "step": 4955 + }, + { + "epoch": 3.8295037652056383, + "grad_norm": 0.00945865735411644, + "learning_rate": 1.579270762976329e-05, + "loss": 0.0032, + "step": 4956 + }, + { + "epoch": 3.830276115080131, + "grad_norm": 0.01082943007349968, + "learning_rate": 1.577303799466761e-05, + "loss": 0.0033, + "step": 4957 + }, + { + "epoch": 3.8310484649546246, + "grad_norm": 0.009206010028719902, + "learning_rate": 1.5753378322213536e-05, + "loss": 0.0031, + "step": 4958 + }, + { + "epoch": 3.8318208148291175, + "grad_norm": 0.006946722976863384, + "learning_rate": 1.573372861812352e-05, + "loss": 0.0029, + "step": 4959 + }, + { + "epoch": 3.832593164703611, + "grad_norm": 0.010826830752193928, + "learning_rate": 1.5714088888117125e-05, + "loss": 0.003, + "step": 4960 + }, + { + "epoch": 3.833365514578104, + "grad_norm": 0.00945520494133234, + "learning_rate": 1.5694459137910996e-05, + "loss": 0.0026, + "step": 4961 + }, + { + "epoch": 3.834137864452597, + "grad_norm": 0.006998469587415457, + "learning_rate": 1.567483937321888e-05, + "loss": 0.0025, + "step": 4962 + }, + { + "epoch": 3.83491021432709, + "grad_norm": 0.008991682901978493, + "learning_rate": 1.5655229599751614e-05, + "loss": 0.0028, + "step": 4963 + }, + { + "epoch": 3.8356825642015835, + "grad_norm": 0.008646678179502487, + "learning_rate": 1.563562982321713e-05, + "loss": 0.0029, + "step": 4964 + }, + { + "epoch": 3.8364549140760764, + "grad_norm": 0.008845383301377296, + "learning_rate": 1.561604004932046e-05, + "loss": 0.003, + "step": 4965 + }, + { + "epoch": 3.8372272639505693, + "grad_norm": 0.01046202052384615, + "learning_rate": 1.559646028376369e-05, + "loss": 0.003, + "step": 4966 + }, + { + "epoch": 3.8379996138250627, + "grad_norm": 0.009699558839201927, + "learning_rate": 1.557689053224604e-05, + "loss": 0.0026, + "step": 4967 + }, + { + "epoch": 3.838771963699556, + "grad_norm": 0.009032976813614368, + "learning_rate": 1.555733080046378e-05, + "loss": 0.0029, + "step": 4968 + }, + { + "epoch": 3.839544313574049, + "grad_norm": 0.012526086531579494, + "learning_rate": 1.553778109411028e-05, + "loss": 0.003, + "step": 4969 + }, + { + "epoch": 3.840316663448542, + "grad_norm": 0.013793490827083588, + "learning_rate": 1.5518241418875985e-05, + "loss": 0.0033, + "step": 4970 + }, + { + "epoch": 3.8410890133230353, + "grad_norm": 0.008525853976607323, + "learning_rate": 1.549871178044842e-05, + "loss": 0.0033, + "step": 4971 + }, + { + "epoch": 3.8418613631975287, + "grad_norm": 0.009102314710617065, + "learning_rate": 1.54791921845122e-05, + "loss": 0.0024, + "step": 4972 + }, + { + "epoch": 3.8426337130720216, + "grad_norm": 0.012971078976988792, + "learning_rate": 1.5459682636748996e-05, + "loss": 0.0031, + "step": 4973 + }, + { + "epoch": 3.8434060629465145, + "grad_norm": 0.0119175398722291, + "learning_rate": 1.5440183142837573e-05, + "loss": 0.0033, + "step": 4974 + }, + { + "epoch": 3.844178412821008, + "grad_norm": 0.013856014236807823, + "learning_rate": 1.542069370845376e-05, + "loss": 0.0032, + "step": 4975 + }, + { + "epoch": 3.8449507626955013, + "grad_norm": 0.008841684088110924, + "learning_rate": 1.5401214339270464e-05, + "loss": 0.0033, + "step": 4976 + }, + { + "epoch": 3.845723112569994, + "grad_norm": 0.008778228424489498, + "learning_rate": 1.538174504095765e-05, + "loss": 0.0028, + "step": 4977 + }, + { + "epoch": 3.846495462444487, + "grad_norm": 0.012533146888017654, + "learning_rate": 1.536228581918237e-05, + "loss": 0.0028, + "step": 4978 + }, + { + "epoch": 3.8472678123189805, + "grad_norm": 0.009665487334132195, + "learning_rate": 1.534283667960873e-05, + "loss": 0.0028, + "step": 4979 + }, + { + "epoch": 3.848040162193474, + "grad_norm": 0.009867614135146141, + "learning_rate": 1.5323397627897905e-05, + "loss": 0.0031, + "step": 4980 + }, + { + "epoch": 3.848812512067967, + "grad_norm": 0.012394512072205544, + "learning_rate": 1.5303968669708128e-05, + "loss": 0.0032, + "step": 4981 + }, + { + "epoch": 3.8495848619424597, + "grad_norm": 0.007355043664574623, + "learning_rate": 1.5284549810694705e-05, + "loss": 0.0026, + "step": 4982 + }, + { + "epoch": 3.850357211816953, + "grad_norm": 0.008631830103695393, + "learning_rate": 1.5265141056509996e-05, + "loss": 0.0029, + "step": 4983 + }, + { + "epoch": 3.8511295616914465, + "grad_norm": 0.008126567117869854, + "learning_rate": 1.5245742412803421e-05, + "loss": 0.0028, + "step": 4984 + }, + { + "epoch": 3.8519019115659394, + "grad_norm": 0.01578478515148163, + "learning_rate": 1.5226353885221433e-05, + "loss": 0.0032, + "step": 4985 + }, + { + "epoch": 3.8526742614404323, + "grad_norm": 0.009282187558710575, + "learning_rate": 1.5206975479407626e-05, + "loss": 0.0029, + "step": 4986 + }, + { + "epoch": 3.8534466113149257, + "grad_norm": 0.008422215469181538, + "learning_rate": 1.5187607201002524e-05, + "loss": 0.0032, + "step": 4987 + }, + { + "epoch": 3.8542189611894186, + "grad_norm": 0.008351623080670834, + "learning_rate": 1.5168249055643768e-05, + "loss": 0.0032, + "step": 4988 + }, + { + "epoch": 3.854991311063912, + "grad_norm": 0.010155430994927883, + "learning_rate": 1.5148901048966102e-05, + "loss": 0.003, + "step": 4989 + }, + { + "epoch": 3.855763660938405, + "grad_norm": 0.007980355992913246, + "learning_rate": 1.512956318660121e-05, + "loss": 0.0031, + "step": 4990 + }, + { + "epoch": 3.8565360108128983, + "grad_norm": 0.011200337670743465, + "learning_rate": 1.5110235474177876e-05, + "loss": 0.0031, + "step": 4991 + }, + { + "epoch": 3.8573083606873912, + "grad_norm": 0.01221932377666235, + "learning_rate": 1.5090917917321974e-05, + "loss": 0.0034, + "step": 4992 + }, + { + "epoch": 3.8580807105618846, + "grad_norm": 0.009794541634619236, + "learning_rate": 1.5071610521656337e-05, + "loss": 0.0028, + "step": 4993 + }, + { + "epoch": 3.8588530604363775, + "grad_norm": 0.012350419536232948, + "learning_rate": 1.505231329280088e-05, + "loss": 0.0029, + "step": 4994 + }, + { + "epoch": 3.859625410310871, + "grad_norm": 0.007574809715151787, + "learning_rate": 1.5033026236372588e-05, + "loss": 0.0031, + "step": 4995 + }, + { + "epoch": 3.860397760185364, + "grad_norm": 0.008688594214618206, + "learning_rate": 1.5013749357985462e-05, + "loss": 0.003, + "step": 4996 + }, + { + "epoch": 3.861170110059857, + "grad_norm": 0.0109042227268219, + "learning_rate": 1.4994482663250504e-05, + "loss": 0.0027, + "step": 4997 + }, + { + "epoch": 3.86194245993435, + "grad_norm": 0.010012637823820114, + "learning_rate": 1.497522615777578e-05, + "loss": 0.0033, + "step": 4998 + }, + { + "epoch": 3.8627148098088435, + "grad_norm": 0.00960414670407772, + "learning_rate": 1.4955979847166434e-05, + "loss": 0.0027, + "step": 4999 + }, + { + "epoch": 3.8634871596833364, + "grad_norm": 0.007731981575489044, + "learning_rate": 1.4936743737024572e-05, + "loss": 0.0026, + "step": 5000 + }, + { + "epoch": 3.86425950955783, + "grad_norm": 0.010577595792710781, + "learning_rate": 1.4917517832949346e-05, + "loss": 0.003, + "step": 5001 + }, + { + "epoch": 3.8650318594323227, + "grad_norm": 0.009672565385699272, + "learning_rate": 1.4898302140537008e-05, + "loss": 0.0026, + "step": 5002 + }, + { + "epoch": 3.865804209306816, + "grad_norm": 0.009176766499876976, + "learning_rate": 1.4879096665380727e-05, + "loss": 0.003, + "step": 5003 + }, + { + "epoch": 3.866576559181309, + "grad_norm": 0.018057817593216896, + "learning_rate": 1.485990141307076e-05, + "loss": 0.0029, + "step": 5004 + }, + { + "epoch": 3.8673489090558024, + "grad_norm": 0.019623538479208946, + "learning_rate": 1.484071638919441e-05, + "loss": 0.0029, + "step": 5005 + }, + { + "epoch": 3.8681212589302953, + "grad_norm": 0.009910893626511097, + "learning_rate": 1.4821541599335976e-05, + "loss": 0.0024, + "step": 5006 + }, + { + "epoch": 3.8688936088047887, + "grad_norm": 0.00982673466205597, + "learning_rate": 1.4802377049076721e-05, + "loss": 0.003, + "step": 5007 + }, + { + "epoch": 3.8696659586792816, + "grad_norm": 0.015128890983760357, + "learning_rate": 1.4783222743995034e-05, + "loss": 0.0031, + "step": 5008 + }, + { + "epoch": 3.870438308553775, + "grad_norm": 0.008337433449923992, + "learning_rate": 1.4764078689666272e-05, + "loss": 0.0029, + "step": 5009 + }, + { + "epoch": 3.871210658428268, + "grad_norm": 0.014315987937152386, + "learning_rate": 1.474494489166276e-05, + "loss": 0.0034, + "step": 5010 + }, + { + "epoch": 3.8719830083027613, + "grad_norm": 0.014011272229254246, + "learning_rate": 1.4725821355553931e-05, + "loss": 0.0028, + "step": 5011 + }, + { + "epoch": 3.8727553581772542, + "grad_norm": 0.010705845430493355, + "learning_rate": 1.4706708086906185e-05, + "loss": 0.0026, + "step": 5012 + }, + { + "epoch": 3.873527708051747, + "grad_norm": 0.008886613883078098, + "learning_rate": 1.468760509128288e-05, + "loss": 0.0031, + "step": 5013 + }, + { + "epoch": 3.8743000579262405, + "grad_norm": 0.016048626974225044, + "learning_rate": 1.4668512374244492e-05, + "loss": 0.0031, + "step": 5014 + }, + { + "epoch": 3.875072407800734, + "grad_norm": 0.016248665750026703, + "learning_rate": 1.4649429941348424e-05, + "loss": 0.0031, + "step": 5015 + }, + { + "epoch": 3.875844757675227, + "grad_norm": 0.009499805979430676, + "learning_rate": 1.463035779814912e-05, + "loss": 0.0032, + "step": 5016 + }, + { + "epoch": 3.8766171075497198, + "grad_norm": 0.02029765024781227, + "learning_rate": 1.4611295950198018e-05, + "loss": 0.0035, + "step": 5017 + }, + { + "epoch": 3.877389457424213, + "grad_norm": 0.008975028991699219, + "learning_rate": 1.459224440304356e-05, + "loss": 0.0031, + "step": 5018 + }, + { + "epoch": 3.8781618072987065, + "grad_norm": 0.010319342836737633, + "learning_rate": 1.4573203162231187e-05, + "loss": 0.0034, + "step": 5019 + }, + { + "epoch": 3.8789341571731994, + "grad_norm": 0.008209176361560822, + "learning_rate": 1.4554172233303349e-05, + "loss": 0.003, + "step": 5020 + }, + { + "epoch": 3.8797065070476924, + "grad_norm": 0.0124753937125206, + "learning_rate": 1.4535151621799487e-05, + "loss": 0.0031, + "step": 5021 + }, + { + "epoch": 3.8804788569221857, + "grad_norm": 0.01752236858010292, + "learning_rate": 1.4516141333256062e-05, + "loss": 0.0028, + "step": 5022 + }, + { + "epoch": 3.881251206796679, + "grad_norm": 0.016599994152784348, + "learning_rate": 1.4497141373206458e-05, + "loss": 0.0029, + "step": 5023 + }, + { + "epoch": 3.882023556671172, + "grad_norm": 0.008728167973458767, + "learning_rate": 1.4478151747181152e-05, + "loss": 0.0031, + "step": 5024 + }, + { + "epoch": 3.882795906545665, + "grad_norm": 0.009795735590159893, + "learning_rate": 1.4459172460707555e-05, + "loss": 0.003, + "step": 5025 + }, + { + "epoch": 3.8835682564201583, + "grad_norm": 0.008861212991178036, + "learning_rate": 1.444020351931007e-05, + "loss": 0.0031, + "step": 5026 + }, + { + "epoch": 3.8843406062946517, + "grad_norm": 0.017795542255043983, + "learning_rate": 1.4421244928510097e-05, + "loss": 0.0032, + "step": 5027 + }, + { + "epoch": 3.8851129561691446, + "grad_norm": 0.01518005970865488, + "learning_rate": 1.4402296693826034e-05, + "loss": 0.0029, + "step": 5028 + }, + { + "epoch": 3.8858853060436376, + "grad_norm": 0.014253837987780571, + "learning_rate": 1.4383358820773246e-05, + "loss": 0.0027, + "step": 5029 + }, + { + "epoch": 3.886657655918131, + "grad_norm": 0.008239194750785828, + "learning_rate": 1.436443131486409e-05, + "loss": 0.0027, + "step": 5030 + }, + { + "epoch": 3.8874300057926243, + "grad_norm": 0.01097941491752863, + "learning_rate": 1.4345514181607916e-05, + "loss": 0.0032, + "step": 5031 + }, + { + "epoch": 3.8882023556671172, + "grad_norm": 0.010556716471910477, + "learning_rate": 1.432660742651103e-05, + "loss": 0.003, + "step": 5032 + }, + { + "epoch": 3.88897470554161, + "grad_norm": 0.011672616004943848, + "learning_rate": 1.4307711055076739e-05, + "loss": 0.003, + "step": 5033 + }, + { + "epoch": 3.8897470554161035, + "grad_norm": 0.016163257881999016, + "learning_rate": 1.4288825072805322e-05, + "loss": 0.0029, + "step": 5034 + }, + { + "epoch": 3.8905194052905965, + "grad_norm": 0.012452706694602966, + "learning_rate": 1.426994948519403e-05, + "loss": 0.0028, + "step": 5035 + }, + { + "epoch": 3.89129175516509, + "grad_norm": 0.009630842134356499, + "learning_rate": 1.4251084297737088e-05, + "loss": 0.003, + "step": 5036 + }, + { + "epoch": 3.8920641050395828, + "grad_norm": 0.010833237320184708, + "learning_rate": 1.4232229515925693e-05, + "loss": 0.0032, + "step": 5037 + }, + { + "epoch": 3.892836454914076, + "grad_norm": 0.009995395317673683, + "learning_rate": 1.4213385145248032e-05, + "loss": 0.003, + "step": 5038 + }, + { + "epoch": 3.893608804788569, + "grad_norm": 0.01041726116091013, + "learning_rate": 1.4194551191189226e-05, + "loss": 0.0027, + "step": 5039 + }, + { + "epoch": 3.8943811546630624, + "grad_norm": 0.010740031488239765, + "learning_rate": 1.4175727659231397e-05, + "loss": 0.0026, + "step": 5040 + }, + { + "epoch": 3.8951535045375554, + "grad_norm": 0.008188420906662941, + "learning_rate": 1.4156914554853618e-05, + "loss": 0.0027, + "step": 5041 + }, + { + "epoch": 3.8959258544120487, + "grad_norm": 0.01434328407049179, + "learning_rate": 1.4138111883531923e-05, + "loss": 0.003, + "step": 5042 + }, + { + "epoch": 3.8966982042865417, + "grad_norm": 0.007261700462549925, + "learning_rate": 1.4119319650739327e-05, + "loss": 0.0031, + "step": 5043 + }, + { + "epoch": 3.897470554161035, + "grad_norm": 0.013153079897165298, + "learning_rate": 1.4100537861945783e-05, + "loss": 0.0031, + "step": 5044 + }, + { + "epoch": 3.898242904035528, + "grad_norm": 0.008110249415040016, + "learning_rate": 1.4081766522618207e-05, + "loss": 0.0027, + "step": 5045 + }, + { + "epoch": 3.8990152539100214, + "grad_norm": 0.007599998731166124, + "learning_rate": 1.4063005638220528e-05, + "loss": 0.0031, + "step": 5046 + }, + { + "epoch": 3.8997876037845143, + "grad_norm": 0.009179320186376572, + "learning_rate": 1.4044255214213537e-05, + "loss": 0.0028, + "step": 5047 + }, + { + "epoch": 3.9005599536590077, + "grad_norm": 0.008315013721585274, + "learning_rate": 1.4025515256055034e-05, + "loss": 0.003, + "step": 5048 + }, + { + "epoch": 3.9013323035335006, + "grad_norm": 0.011417336761951447, + "learning_rate": 1.400678576919981e-05, + "loss": 0.0034, + "step": 5049 + }, + { + "epoch": 3.902104653407994, + "grad_norm": 0.011364213190972805, + "learning_rate": 1.3988066759099516e-05, + "loss": 0.0031, + "step": 5050 + }, + { + "epoch": 3.902877003282487, + "grad_norm": 0.00905610527843237, + "learning_rate": 1.3969358231202827e-05, + "loss": 0.0032, + "step": 5051 + }, + { + "epoch": 3.9036493531569803, + "grad_norm": 0.009831871837377548, + "learning_rate": 1.3950660190955345e-05, + "loss": 0.0028, + "step": 5052 + }, + { + "epoch": 3.904421703031473, + "grad_norm": 0.008532781153917313, + "learning_rate": 1.3931972643799612e-05, + "loss": 0.0031, + "step": 5053 + }, + { + "epoch": 3.9051940529059666, + "grad_norm": 0.009511567652225494, + "learning_rate": 1.391329559517512e-05, + "loss": 0.0028, + "step": 5054 + }, + { + "epoch": 3.9059664027804595, + "grad_norm": 0.01154781598597765, + "learning_rate": 1.3894629050518294e-05, + "loss": 0.003, + "step": 5055 + }, + { + "epoch": 3.906738752654953, + "grad_norm": 0.011973300948739052, + "learning_rate": 1.3875973015262561e-05, + "loss": 0.0031, + "step": 5056 + }, + { + "epoch": 3.907511102529446, + "grad_norm": 0.009173572063446045, + "learning_rate": 1.3857327494838195e-05, + "loss": 0.0032, + "step": 5057 + }, + { + "epoch": 3.908283452403939, + "grad_norm": 0.010813619941473007, + "learning_rate": 1.3838692494672462e-05, + "loss": 0.0032, + "step": 5058 + }, + { + "epoch": 3.909055802278432, + "grad_norm": 0.011844763532280922, + "learning_rate": 1.3820068020189592e-05, + "loss": 0.003, + "step": 5059 + }, + { + "epoch": 3.909828152152925, + "grad_norm": 0.010590963065624237, + "learning_rate": 1.3801454076810688e-05, + "loss": 0.0029, + "step": 5060 + }, + { + "epoch": 3.9106005020274184, + "grad_norm": 0.015272004529833794, + "learning_rate": 1.3782850669953811e-05, + "loss": 0.0031, + "step": 5061 + }, + { + "epoch": 3.9113728519019118, + "grad_norm": 0.008821163326501846, + "learning_rate": 1.3764257805034015e-05, + "loss": 0.003, + "step": 5062 + }, + { + "epoch": 3.9121452017764047, + "grad_norm": 0.008121635764837265, + "learning_rate": 1.3745675487463183e-05, + "loss": 0.003, + "step": 5063 + }, + { + "epoch": 3.9129175516508976, + "grad_norm": 0.00942297838628292, + "learning_rate": 1.3727103722650175e-05, + "loss": 0.003, + "step": 5064 + }, + { + "epoch": 3.913689901525391, + "grad_norm": 0.008933554403483868, + "learning_rate": 1.3708542516000827e-05, + "loss": 0.0034, + "step": 5065 + }, + { + "epoch": 3.9144622513998844, + "grad_norm": 0.011289010755717754, + "learning_rate": 1.3689991872917845e-05, + "loss": 0.0033, + "step": 5066 + }, + { + "epoch": 3.9152346012743773, + "grad_norm": 0.013694976456463337, + "learning_rate": 1.3671451798800833e-05, + "loss": 0.0034, + "step": 5067 + }, + { + "epoch": 3.91600695114887, + "grad_norm": 0.010910279117524624, + "learning_rate": 1.36529222990464e-05, + "loss": 0.0031, + "step": 5068 + }, + { + "epoch": 3.9167793010233636, + "grad_norm": 0.011041252873837948, + "learning_rate": 1.3634403379048038e-05, + "loss": 0.0031, + "step": 5069 + }, + { + "epoch": 3.917551650897857, + "grad_norm": 0.009513885714113712, + "learning_rate": 1.361589504419612e-05, + "loss": 0.0035, + "step": 5070 + }, + { + "epoch": 3.91832400077235, + "grad_norm": 0.010692187584936619, + "learning_rate": 1.3597397299878006e-05, + "loss": 0.0032, + "step": 5071 + }, + { + "epoch": 3.919096350646843, + "grad_norm": 0.009557679295539856, + "learning_rate": 1.3578910151477947e-05, + "loss": 0.0027, + "step": 5072 + }, + { + "epoch": 3.919868700521336, + "grad_norm": 0.009597002528607845, + "learning_rate": 1.3560433604377064e-05, + "loss": 0.0034, + "step": 5073 + }, + { + "epoch": 3.9206410503958296, + "grad_norm": 0.009098115377128124, + "learning_rate": 1.354196766395348e-05, + "loss": 0.0032, + "step": 5074 + }, + { + "epoch": 3.9214134002703225, + "grad_norm": 0.012747055850923061, + "learning_rate": 1.3523512335582166e-05, + "loss": 0.0028, + "step": 5075 + }, + { + "epoch": 3.9221857501448154, + "grad_norm": 0.008872752077877522, + "learning_rate": 1.3505067624635032e-05, + "loss": 0.0035, + "step": 5076 + }, + { + "epoch": 3.922958100019309, + "grad_norm": 0.007852994836866856, + "learning_rate": 1.3486633536480852e-05, + "loss": 0.003, + "step": 5077 + }, + { + "epoch": 3.923730449893802, + "grad_norm": 0.00892792921513319, + "learning_rate": 1.3468210076485383e-05, + "loss": 0.0032, + "step": 5078 + }, + { + "epoch": 3.924502799768295, + "grad_norm": 0.011076129972934723, + "learning_rate": 1.3449797250011247e-05, + "loss": 0.0036, + "step": 5079 + }, + { + "epoch": 3.925275149642788, + "grad_norm": 0.008503065444529057, + "learning_rate": 1.3431395062417934e-05, + "loss": 0.0029, + "step": 5080 + }, + { + "epoch": 3.9260474995172814, + "grad_norm": 0.013432437554001808, + "learning_rate": 1.3413003519061917e-05, + "loss": 0.0031, + "step": 5081 + }, + { + "epoch": 3.9268198493917743, + "grad_norm": 0.008086584508419037, + "learning_rate": 1.3394622625296533e-05, + "loss": 0.0028, + "step": 5082 + }, + { + "epoch": 3.9275921992662677, + "grad_norm": 0.009846203029155731, + "learning_rate": 1.3376252386471965e-05, + "loss": 0.003, + "step": 5083 + }, + { + "epoch": 3.9283645491407606, + "grad_norm": 0.0073504154570400715, + "learning_rate": 1.3357892807935397e-05, + "loss": 0.003, + "step": 5084 + }, + { + "epoch": 3.929136899015254, + "grad_norm": 0.009216235019266605, + "learning_rate": 1.3339543895030843e-05, + "loss": 0.0028, + "step": 5085 + }, + { + "epoch": 3.929909248889747, + "grad_norm": 0.009160725399851799, + "learning_rate": 1.3321205653099222e-05, + "loss": 0.0032, + "step": 5086 + }, + { + "epoch": 3.9306815987642403, + "grad_norm": 0.009034167975187302, + "learning_rate": 1.330287808747836e-05, + "loss": 0.003, + "step": 5087 + }, + { + "epoch": 3.931453948638733, + "grad_norm": 0.01226490642875433, + "learning_rate": 1.3284561203502965e-05, + "loss": 0.0028, + "step": 5088 + }, + { + "epoch": 3.9322262985132266, + "grad_norm": 0.008918531239032745, + "learning_rate": 1.3266255006504646e-05, + "loss": 0.0032, + "step": 5089 + }, + { + "epoch": 3.9329986483877195, + "grad_norm": 0.010417668148875237, + "learning_rate": 1.3247959501811885e-05, + "loss": 0.0031, + "step": 5090 + }, + { + "epoch": 3.933770998262213, + "grad_norm": 0.010061853565275669, + "learning_rate": 1.3229674694750066e-05, + "loss": 0.0034, + "step": 5091 + }, + { + "epoch": 3.934543348136706, + "grad_norm": 0.008060588501393795, + "learning_rate": 1.3211400590641448e-05, + "loss": 0.0029, + "step": 5092 + }, + { + "epoch": 3.935315698011199, + "grad_norm": 0.007736505940556526, + "learning_rate": 1.3193137194805195e-05, + "loss": 0.0029, + "step": 5093 + }, + { + "epoch": 3.936088047885692, + "grad_norm": 0.012635236606001854, + "learning_rate": 1.3174884512557329e-05, + "loss": 0.0029, + "step": 5094 + }, + { + "epoch": 3.9368603977601855, + "grad_norm": 0.00976151879876852, + "learning_rate": 1.3156642549210768e-05, + "loss": 0.0028, + "step": 5095 + }, + { + "epoch": 3.9376327476346784, + "grad_norm": 0.013500361703336239, + "learning_rate": 1.313841131007531e-05, + "loss": 0.0029, + "step": 5096 + }, + { + "epoch": 3.938405097509172, + "grad_norm": 0.012697826139628887, + "learning_rate": 1.3120190800457622e-05, + "loss": 0.0031, + "step": 5097 + }, + { + "epoch": 3.9391774473836647, + "grad_norm": 0.01049091387540102, + "learning_rate": 1.3101981025661258e-05, + "loss": 0.0032, + "step": 5098 + }, + { + "epoch": 3.939949797258158, + "grad_norm": 0.009147342294454575, + "learning_rate": 1.3083781990986644e-05, + "loss": 0.0024, + "step": 5099 + }, + { + "epoch": 3.940722147132651, + "grad_norm": 0.012354012578725815, + "learning_rate": 1.3065593701731077e-05, + "loss": 0.0028, + "step": 5100 + }, + { + "epoch": 3.9414944970071444, + "grad_norm": 0.012569538317620754, + "learning_rate": 1.3047416163188724e-05, + "loss": 0.0025, + "step": 5101 + }, + { + "epoch": 3.9422668468816373, + "grad_norm": 0.008328719064593315, + "learning_rate": 1.3029249380650638e-05, + "loss": 0.0028, + "step": 5102 + }, + { + "epoch": 3.9430391967561307, + "grad_norm": 0.009669464081525803, + "learning_rate": 1.3011093359404725e-05, + "loss": 0.0029, + "step": 5103 + }, + { + "epoch": 3.9438115466306236, + "grad_norm": 0.013122282922267914, + "learning_rate": 1.2992948104735763e-05, + "loss": 0.003, + "step": 5104 + }, + { + "epoch": 3.944583896505117, + "grad_norm": 0.00925926398485899, + "learning_rate": 1.2974813621925397e-05, + "loss": 0.0026, + "step": 5105 + }, + { + "epoch": 3.94535624637961, + "grad_norm": 0.010746442712843418, + "learning_rate": 1.295668991625214e-05, + "loss": 0.0029, + "step": 5106 + }, + { + "epoch": 3.946128596254103, + "grad_norm": 0.010360931977629662, + "learning_rate": 1.2938576992991364e-05, + "loss": 0.0028, + "step": 5107 + }, + { + "epoch": 3.946900946128596, + "grad_norm": 0.0118419723585248, + "learning_rate": 1.29204748574153e-05, + "loss": 0.0032, + "step": 5108 + }, + { + "epoch": 3.9476732960030896, + "grad_norm": 0.007602566387504339, + "learning_rate": 1.2902383514793043e-05, + "loss": 0.0029, + "step": 5109 + }, + { + "epoch": 3.9484456458775825, + "grad_norm": 0.01111143920570612, + "learning_rate": 1.288430297039055e-05, + "loss": 0.0035, + "step": 5110 + }, + { + "epoch": 3.9492179957520754, + "grad_norm": 0.010026851668953896, + "learning_rate": 1.286623322947062e-05, + "loss": 0.0035, + "step": 5111 + }, + { + "epoch": 3.949990345626569, + "grad_norm": 0.008931624703109264, + "learning_rate": 1.2848174297292936e-05, + "loss": 0.003, + "step": 5112 + }, + { + "epoch": 3.950762695501062, + "grad_norm": 0.013498594984412193, + "learning_rate": 1.2830126179114e-05, + "loss": 0.0031, + "step": 5113 + }, + { + "epoch": 3.951535045375555, + "grad_norm": 0.008173106238245964, + "learning_rate": 1.281208888018719e-05, + "loss": 0.0028, + "step": 5114 + }, + { + "epoch": 3.952307395250048, + "grad_norm": 0.008225626312196255, + "learning_rate": 1.2794062405762713e-05, + "loss": 0.0026, + "step": 5115 + }, + { + "epoch": 3.9530797451245414, + "grad_norm": 0.00897209718823433, + "learning_rate": 1.2776046761087684e-05, + "loss": 0.0027, + "step": 5116 + }, + { + "epoch": 3.953852094999035, + "grad_norm": 0.009104445576667786, + "learning_rate": 1.275804195140598e-05, + "loss": 0.0028, + "step": 5117 + }, + { + "epoch": 3.9546244448735277, + "grad_norm": 0.008700024336576462, + "learning_rate": 1.2740047981958364e-05, + "loss": 0.0029, + "step": 5118 + }, + { + "epoch": 3.9553967947480206, + "grad_norm": 0.007199831306934357, + "learning_rate": 1.2722064857982486e-05, + "loss": 0.0025, + "step": 5119 + }, + { + "epoch": 3.956169144622514, + "grad_norm": 0.013268791139125824, + "learning_rate": 1.2704092584712762e-05, + "loss": 0.0029, + "step": 5120 + }, + { + "epoch": 3.9569414944970074, + "grad_norm": 0.009150993078947067, + "learning_rate": 1.2686131167380477e-05, + "loss": 0.003, + "step": 5121 + }, + { + "epoch": 3.9577138443715003, + "grad_norm": 0.013867981731891632, + "learning_rate": 1.266818061121382e-05, + "loss": 0.0027, + "step": 5122 + }, + { + "epoch": 3.9584861942459932, + "grad_norm": 0.008802138268947601, + "learning_rate": 1.2650240921437716e-05, + "loss": 0.0031, + "step": 5123 + }, + { + "epoch": 3.9592585441204866, + "grad_norm": 0.008668835274875164, + "learning_rate": 1.2632312103273974e-05, + "loss": 0.0026, + "step": 5124 + }, + { + "epoch": 3.96003089399498, + "grad_norm": 0.014852354303002357, + "learning_rate": 1.2614394161941267e-05, + "loss": 0.0029, + "step": 5125 + }, + { + "epoch": 3.960803243869473, + "grad_norm": 0.018547803163528442, + "learning_rate": 1.259648710265508e-05, + "loss": 0.0036, + "step": 5126 + }, + { + "epoch": 3.961575593743966, + "grad_norm": 0.008425499312579632, + "learning_rate": 1.2578590930627677e-05, + "loss": 0.003, + "step": 5127 + }, + { + "epoch": 3.962347943618459, + "grad_norm": 0.007943299598991871, + "learning_rate": 1.256070565106825e-05, + "loss": 0.003, + "step": 5128 + }, + { + "epoch": 3.9631202934929526, + "grad_norm": 0.00888790562748909, + "learning_rate": 1.2542831269182764e-05, + "loss": 0.003, + "step": 5129 + }, + { + "epoch": 3.9638926433674455, + "grad_norm": 0.009093926288187504, + "learning_rate": 1.2524967790174003e-05, + "loss": 0.0028, + "step": 5130 + }, + { + "epoch": 3.9646649932419384, + "grad_norm": 0.016038067638874054, + "learning_rate": 1.2507115219241577e-05, + "loss": 0.003, + "step": 5131 + }, + { + "epoch": 3.965437343116432, + "grad_norm": 0.012677005492150784, + "learning_rate": 1.2489273561581999e-05, + "loss": 0.0031, + "step": 5132 + }, + { + "epoch": 3.9662096929909247, + "grad_norm": 0.018096787855029106, + "learning_rate": 1.2471442822388485e-05, + "loss": 0.0034, + "step": 5133 + }, + { + "epoch": 3.966982042865418, + "grad_norm": 0.01353723555803299, + "learning_rate": 1.2453623006851145e-05, + "loss": 0.003, + "step": 5134 + }, + { + "epoch": 3.967754392739911, + "grad_norm": 0.009085814468562603, + "learning_rate": 1.243581412015692e-05, + "loss": 0.0034, + "step": 5135 + }, + { + "epoch": 3.9685267426144044, + "grad_norm": 0.009171192534267902, + "learning_rate": 1.241801616748955e-05, + "loss": 0.0034, + "step": 5136 + }, + { + "epoch": 3.9692990924888973, + "grad_norm": 0.008186507038772106, + "learning_rate": 1.2400229154029541e-05, + "loss": 0.0026, + "step": 5137 + }, + { + "epoch": 3.9700714423633907, + "grad_norm": 0.010795575566589832, + "learning_rate": 1.238245308495431e-05, + "loss": 0.0028, + "step": 5138 + }, + { + "epoch": 3.9708437922378836, + "grad_norm": 0.010350009426474571, + "learning_rate": 1.2364687965438033e-05, + "loss": 0.0026, + "step": 5139 + }, + { + "epoch": 3.971616142112377, + "grad_norm": 0.008702244609594345, + "learning_rate": 1.2346933800651678e-05, + "loss": 0.0028, + "step": 5140 + }, + { + "epoch": 3.97238849198687, + "grad_norm": 0.010398447513580322, + "learning_rate": 1.2329190595763085e-05, + "loss": 0.0029, + "step": 5141 + }, + { + "epoch": 3.9731608418613633, + "grad_norm": 0.01263085100799799, + "learning_rate": 1.2311458355936872e-05, + "loss": 0.0032, + "step": 5142 + }, + { + "epoch": 3.9739331917358562, + "grad_norm": 0.015385749749839306, + "learning_rate": 1.2293737086334433e-05, + "loss": 0.0032, + "step": 5143 + }, + { + "epoch": 3.9747055416103496, + "grad_norm": 0.012905635870993137, + "learning_rate": 1.2276026792114037e-05, + "loss": 0.0032, + "step": 5144 + }, + { + "epoch": 3.9754778914848425, + "grad_norm": 0.009418494068086147, + "learning_rate": 1.2258327478430704e-05, + "loss": 0.003, + "step": 5145 + }, + { + "epoch": 3.976250241359336, + "grad_norm": 0.01325889304280281, + "learning_rate": 1.224063915043629e-05, + "loss": 0.0032, + "step": 5146 + }, + { + "epoch": 3.977022591233829, + "grad_norm": 0.00991750042885542, + "learning_rate": 1.2222961813279426e-05, + "loss": 0.0028, + "step": 5147 + }, + { + "epoch": 3.977794941108322, + "grad_norm": 0.009794146753847599, + "learning_rate": 1.220529547210556e-05, + "loss": 0.0029, + "step": 5148 + }, + { + "epoch": 3.978567290982815, + "grad_norm": 0.011819394305348396, + "learning_rate": 1.2187640132056949e-05, + "loss": 0.0027, + "step": 5149 + }, + { + "epoch": 3.9793396408573085, + "grad_norm": 0.010317685082554817, + "learning_rate": 1.2169995798272622e-05, + "loss": 0.0034, + "step": 5150 + }, + { + "epoch": 3.9801119907318014, + "grad_norm": 0.00700108427554369, + "learning_rate": 1.2152362475888424e-05, + "loss": 0.0028, + "step": 5151 + }, + { + "epoch": 3.980884340606295, + "grad_norm": 0.00885722879320383, + "learning_rate": 1.213474017003699e-05, + "loss": 0.0031, + "step": 5152 + }, + { + "epoch": 3.9816566904807877, + "grad_norm": 0.007584693841636181, + "learning_rate": 1.2117128885847745e-05, + "loss": 0.0029, + "step": 5153 + }, + { + "epoch": 3.9824290403552807, + "grad_norm": 0.010611428879201412, + "learning_rate": 1.2099528628446905e-05, + "loss": 0.0033, + "step": 5154 + }, + { + "epoch": 3.983201390229774, + "grad_norm": 0.00826264638453722, + "learning_rate": 1.2081939402957487e-05, + "loss": 0.0029, + "step": 5155 + }, + { + "epoch": 3.9839737401042674, + "grad_norm": 0.008506223559379578, + "learning_rate": 1.2064361214499292e-05, + "loss": 0.0027, + "step": 5156 + }, + { + "epoch": 3.9847460899787603, + "grad_norm": 0.011393384076654911, + "learning_rate": 1.2046794068188893e-05, + "loss": 0.003, + "step": 5157 + }, + { + "epoch": 3.9855184398532533, + "grad_norm": 0.006410202011466026, + "learning_rate": 1.2029237969139673e-05, + "loss": 0.0028, + "step": 5158 + }, + { + "epoch": 3.9862907897277466, + "grad_norm": 0.008192823268473148, + "learning_rate": 1.2011692922461782e-05, + "loss": 0.0028, + "step": 5159 + }, + { + "epoch": 3.98706313960224, + "grad_norm": 0.009260269813239574, + "learning_rate": 1.1994158933262161e-05, + "loss": 0.0031, + "step": 5160 + }, + { + "epoch": 3.987835489476733, + "grad_norm": 0.00856191385537386, + "learning_rate": 1.1976636006644531e-05, + "loss": 0.0031, + "step": 5161 + }, + { + "epoch": 3.988607839351226, + "grad_norm": 0.008363443426787853, + "learning_rate": 1.19591241477094e-05, + "loss": 0.0029, + "step": 5162 + }, + { + "epoch": 3.9893801892257192, + "grad_norm": 0.01415453851222992, + "learning_rate": 1.1941623361554034e-05, + "loss": 0.003, + "step": 5163 + }, + { + "epoch": 3.9901525391002126, + "grad_norm": 0.011211784556508064, + "learning_rate": 1.192413365327249e-05, + "loss": 0.0035, + "step": 5164 + }, + { + "epoch": 3.9909248889747055, + "grad_norm": 0.008803755044937134, + "learning_rate": 1.190665502795561e-05, + "loss": 0.0027, + "step": 5165 + }, + { + "epoch": 3.9916972388491985, + "grad_norm": 0.010068275965750217, + "learning_rate": 1.188918749069099e-05, + "loss": 0.0032, + "step": 5166 + }, + { + "epoch": 3.992469588723692, + "grad_norm": 0.011156435124576092, + "learning_rate": 1.1871731046563017e-05, + "loss": 0.003, + "step": 5167 + }, + { + "epoch": 3.993241938598185, + "grad_norm": 0.015244616195559502, + "learning_rate": 1.1854285700652828e-05, + "loss": 0.0033, + "step": 5168 + }, + { + "epoch": 3.994014288472678, + "grad_norm": 0.007877008058130741, + "learning_rate": 1.1836851458038351e-05, + "loss": 0.0028, + "step": 5169 + }, + { + "epoch": 3.994786638347171, + "grad_norm": 0.012196575291454792, + "learning_rate": 1.1819428323794274e-05, + "loss": 0.0035, + "step": 5170 + }, + { + "epoch": 3.9955589882216644, + "grad_norm": 0.00742301857098937, + "learning_rate": 1.1802016302992042e-05, + "loss": 0.0033, + "step": 5171 + }, + { + "epoch": 3.996331338096158, + "grad_norm": 0.008889246731996536, + "learning_rate": 1.1784615400699878e-05, + "loss": 0.0029, + "step": 5172 + }, + { + "epoch": 3.9971036879706507, + "grad_norm": 0.009742755442857742, + "learning_rate": 1.1767225621982764e-05, + "loss": 0.0033, + "step": 5173 + }, + { + "epoch": 3.9978760378451437, + "grad_norm": 0.00847399327903986, + "learning_rate": 1.1749846971902446e-05, + "loss": 0.0033, + "step": 5174 + }, + { + "epoch": 3.998648387719637, + "grad_norm": 0.010470501147210598, + "learning_rate": 1.1732479455517425e-05, + "loss": 0.0029, + "step": 5175 + }, + { + "epoch": 3.9994207375941304, + "grad_norm": 0.008755206130445004, + "learning_rate": 1.1715123077882972e-05, + "loss": 0.0029, + "step": 5176 + } + ], + "logging_steps": 1, + "max_steps": 6470, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.604065328444211e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}