{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0159010600706715, "eval_steps": 500, "global_step": 3405, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0176678445229682, "grad_norm": 8.093123435974121, "learning_rate": 1e-05, "loss": 3.4829, "step": 10 }, { "epoch": 0.0353356890459364, "grad_norm": 0.5401085615158081, "learning_rate": 2e-05, "loss": 3.4785, "step": 20 }, { "epoch": 0.053003533568904596, "grad_norm": 0.6342999339103699, "learning_rate": 1.9999569325372924e-05, "loss": 2.9312, "step": 30 }, { "epoch": 0.0706713780918728, "grad_norm": 0.7516262531280518, "learning_rate": 1.9998277338587826e-05, "loss": 2.9621, "step": 40 }, { "epoch": 0.08833922261484099, "grad_norm": 0.4993094205856323, "learning_rate": 1.9996124150929886e-05, "loss": 2.5796, "step": 50 }, { "epoch": 0.10600706713780919, "grad_norm": 1.0893628597259521, "learning_rate": 1.9993109947863768e-05, "loss": 2.5345, "step": 60 }, { "epoch": 0.12367491166077739, "grad_norm": 0.7348760962486267, "learning_rate": 1.9989234989017622e-05, "loss": 2.1804, "step": 70 }, { "epoch": 0.1413427561837456, "grad_norm": 0.7738515734672546, "learning_rate": 1.9984499608160744e-05, "loss": 1.9985, "step": 80 }, { "epoch": 0.15901060070671377, "grad_norm": 5.662350654602051, "learning_rate": 1.9978904213174812e-05, "loss": 2.0175, "step": 90 }, { "epoch": 0.17667844522968199, "grad_norm": 0.9150586724281311, "learning_rate": 1.997244928601875e-05, "loss": 1.8988, "step": 100 }, { "epoch": 0.19434628975265017, "grad_norm": 8.785883903503418, "learning_rate": 1.9965135382687235e-05, "loss": 1.7734, "step": 110 }, { "epoch": 0.21201413427561838, "grad_norm": 0.5746702551841736, "learning_rate": 1.9956963133162776e-05, "loss": 1.683, "step": 120 }, { "epoch": 0.22968197879858657, "grad_norm": 0.7510441541671753, "learning_rate": 1.9947933241361482e-05, "loss": 1.6329, "step": 130 }, { "epoch": 0.24734982332155478, "grad_norm": 0.7327967882156372, "learning_rate": 1.993804648507241e-05, "loss": 1.5906, "step": 140 }, { "epoch": 0.26501766784452296, "grad_norm": 0.7026442885398865, "learning_rate": 1.9927303715890573e-05, "loss": 1.5329, "step": 150 }, { "epoch": 0.2826855123674912, "grad_norm": 0.689364492893219, "learning_rate": 1.9915705859143597e-05, "loss": 1.3812, "step": 160 }, { "epoch": 0.3003533568904594, "grad_norm": 0.577924370765686, "learning_rate": 1.9903253913812003e-05, "loss": 1.3581, "step": 170 }, { "epoch": 0.31802120141342755, "grad_norm": 0.5152636766433716, "learning_rate": 1.9889948952443174e-05, "loss": 1.3432, "step": 180 }, { "epoch": 0.33568904593639576, "grad_norm": 0.5361068844795227, "learning_rate": 1.987579212105897e-05, "loss": 1.2808, "step": 190 }, { "epoch": 0.35335689045936397, "grad_norm": 0.562074601650238, "learning_rate": 1.9860784639057e-05, "loss": 1.2738, "step": 200 }, { "epoch": 0.3710247349823322, "grad_norm": 0.6112928986549377, "learning_rate": 1.9844927799105615e-05, "loss": 1.258, "step": 210 }, { "epoch": 0.38869257950530034, "grad_norm": 0.6768578290939331, "learning_rate": 1.9828222967032533e-05, "loss": 1.2198, "step": 220 }, { "epoch": 0.40636042402826855, "grad_norm": 0.5859869122505188, "learning_rate": 1.9810671581707223e-05, "loss": 1.2073, "step": 230 }, { "epoch": 0.42402826855123676, "grad_norm": 0.5085058808326721, "learning_rate": 1.979227515491695e-05, "loss": 1.2316, "step": 240 }, { "epoch": 0.4416961130742049, "grad_norm": 0.5284629464149475, "learning_rate": 1.9773035271236566e-05, "loss": 1.1577, "step": 250 }, { "epoch": 0.45936395759717313, "grad_norm": 0.5618858933448792, "learning_rate": 1.9752953587892013e-05, "loss": 1.1296, "step": 260 }, { "epoch": 0.47703180212014135, "grad_norm": 0.5541247725486755, "learning_rate": 1.973203183461759e-05, "loss": 1.1492, "step": 270 }, { "epoch": 0.49469964664310956, "grad_norm": 0.5731683969497681, "learning_rate": 1.9710271813506954e-05, "loss": 1.2129, "step": 280 }, { "epoch": 0.5123674911660777, "grad_norm": 1.798783302307129, "learning_rate": 1.96876753988579e-05, "loss": 1.1975, "step": 290 }, { "epoch": 0.5300353356890459, "grad_norm": 0.6141424179077148, "learning_rate": 1.9664244537010924e-05, "loss": 1.1168, "step": 300 }, { "epoch": 0.5477031802120141, "grad_norm": 0.49130862951278687, "learning_rate": 1.9639981246181557e-05, "loss": 1.1087, "step": 310 }, { "epoch": 0.5653710247349824, "grad_norm": 0.518574059009552, "learning_rate": 1.9614887616286544e-05, "loss": 1.0381, "step": 320 }, { "epoch": 0.5830388692579506, "grad_norm": 0.5865027904510498, "learning_rate": 1.958896580876383e-05, "loss": 1.0836, "step": 330 }, { "epoch": 0.6007067137809188, "grad_norm": 0.7070803046226501, "learning_rate": 1.9562218056386366e-05, "loss": 1.1234, "step": 340 }, { "epoch": 0.6183745583038869, "grad_norm": 0.5792056322097778, "learning_rate": 1.9534646663069816e-05, "loss": 1.0202, "step": 350 }, { "epoch": 0.6360424028268551, "grad_norm": 0.6844910383224487, "learning_rate": 1.9506254003674084e-05, "loss": 1.1047, "step": 360 }, { "epoch": 0.6537102473498233, "grad_norm": 0.5521841049194336, "learning_rate": 1.9477042523798762e-05, "loss": 1.0834, "step": 370 }, { "epoch": 0.6713780918727915, "grad_norm": 0.578198254108429, "learning_rate": 1.9447014739572503e-05, "loss": 0.9927, "step": 380 }, { "epoch": 0.6890459363957597, "grad_norm": 0.47503796219825745, "learning_rate": 1.9416173237436252e-05, "loss": 1.051, "step": 390 }, { "epoch": 0.7067137809187279, "grad_norm": 0.5625858902931213, "learning_rate": 1.9384520673920502e-05, "loss": 1.0239, "step": 400 }, { "epoch": 0.7243816254416962, "grad_norm": 0.535770833492279, "learning_rate": 1.9352059775416442e-05, "loss": 1.0245, "step": 410 }, { "epoch": 0.7420494699646644, "grad_norm": 0.5579429864883423, "learning_rate": 1.931879333794115e-05, "loss": 0.9748, "step": 420 }, { "epoch": 0.7597173144876325, "grad_norm": 0.5888713598251343, "learning_rate": 1.928472422689674e-05, "loss": 1.0526, "step": 430 }, { "epoch": 0.7773851590106007, "grad_norm": 0.562355637550354, "learning_rate": 1.9249855376823542e-05, "loss": 0.9808, "step": 440 }, { "epoch": 0.7950530035335689, "grad_norm": 0.45564863085746765, "learning_rate": 1.9214189791147363e-05, "loss": 0.9943, "step": 450 }, { "epoch": 0.8127208480565371, "grad_norm": 0.5389313101768494, "learning_rate": 1.917773054192076e-05, "loss": 0.9906, "step": 460 }, { "epoch": 0.8303886925795053, "grad_norm": 0.5588345527648926, "learning_rate": 1.9140480769558448e-05, "loss": 1.0077, "step": 470 }, { "epoch": 0.8480565371024735, "grad_norm": 0.6023375988006592, "learning_rate": 1.9102443682566792e-05, "loss": 0.9995, "step": 480 }, { "epoch": 0.8657243816254417, "grad_norm": 0.5838338732719421, "learning_rate": 1.9063622557267443e-05, "loss": 0.9542, "step": 490 }, { "epoch": 0.8833922261484098, "grad_norm": 0.6777709722518921, "learning_rate": 1.9024020737515135e-05, "loss": 1.0053, "step": 500 }, { "epoch": 0.901060070671378, "grad_norm": 0.6219585537910461, "learning_rate": 1.8983641634409657e-05, "loss": 0.9962, "step": 510 }, { "epoch": 0.9187279151943463, "grad_norm": 0.6651351451873779, "learning_rate": 1.894248872600204e-05, "loss": 0.9596, "step": 520 }, { "epoch": 0.9363957597173145, "grad_norm": 0.5591141581535339, "learning_rate": 1.8900565556994986e-05, "loss": 0.9638, "step": 530 }, { "epoch": 0.9540636042402827, "grad_norm": 0.6066205501556396, "learning_rate": 1.8857875738437526e-05, "loss": 0.9468, "step": 540 }, { "epoch": 0.9717314487632509, "grad_norm": 0.584261417388916, "learning_rate": 1.8814422947414e-05, "loss": 0.9694, "step": 550 }, { "epoch": 0.9893992932862191, "grad_norm": 0.553644061088562, "learning_rate": 1.877021092672732e-05, "loss": 0.9886, "step": 560 }, { "epoch": 1.0070671378091873, "grad_norm": 0.877346396446228, "learning_rate": 1.872524348457659e-05, "loss": 1.0424, "step": 570 }, { "epoch": 1.0247349823321554, "grad_norm": 0.5153814554214478, "learning_rate": 1.867952449422909e-05, "loss": 0.9541, "step": 580 }, { "epoch": 1.0424028268551238, "grad_norm": 0.6072525382041931, "learning_rate": 1.863305789368664e-05, "loss": 0.9835, "step": 590 }, { "epoch": 1.0600706713780919, "grad_norm": 2.0284619331359863, "learning_rate": 1.8585847685346415e-05, "loss": 0.9128, "step": 600 }, { "epoch": 1.0777385159010602, "grad_norm": 0.5690125226974487, "learning_rate": 1.853789793565618e-05, "loss": 0.9317, "step": 610 }, { "epoch": 1.0954063604240283, "grad_norm": 0.5882884860038757, "learning_rate": 1.8489212774764064e-05, "loss": 0.9644, "step": 620 }, { "epoch": 1.1130742049469964, "grad_norm": 0.7221271991729736, "learning_rate": 1.8439796396162756e-05, "loss": 1.024, "step": 630 }, { "epoch": 1.1307420494699647, "grad_norm": 0.6504462361335754, "learning_rate": 1.8389653056328344e-05, "loss": 0.9529, "step": 640 }, { "epoch": 1.1484098939929328, "grad_norm": 0.5796203017234802, "learning_rate": 1.833878707435367e-05, "loss": 0.9081, "step": 650 }, { "epoch": 1.1660777385159011, "grad_norm": 0.5094664096832275, "learning_rate": 1.8287202831576292e-05, "loss": 0.9413, "step": 660 }, { "epoch": 1.1837455830388692, "grad_norm": 0.5529056191444397, "learning_rate": 1.8234904771201115e-05, "loss": 0.9494, "step": 670 }, { "epoch": 1.2014134275618376, "grad_norm": 0.5121185779571533, "learning_rate": 1.8181897397917672e-05, "loss": 0.8958, "step": 680 }, { "epoch": 1.2190812720848057, "grad_norm": 0.6205973029136658, "learning_rate": 1.8128185277512106e-05, "loss": 0.9541, "step": 690 }, { "epoch": 1.2367491166077738, "grad_norm": 0.4824579954147339, "learning_rate": 1.80737730364739e-05, "loss": 0.9269, "step": 700 }, { "epoch": 1.254416961130742, "grad_norm": 0.6322911381721497, "learning_rate": 1.8018665361597378e-05, "loss": 0.9151, "step": 710 }, { "epoch": 1.2720848056537102, "grad_norm": 0.5373772382736206, "learning_rate": 1.7962866999578005e-05, "loss": 0.8693, "step": 720 }, { "epoch": 1.2897526501766785, "grad_norm": 0.640831470489502, "learning_rate": 1.7906382756603536e-05, "loss": 0.9646, "step": 730 }, { "epoch": 1.3074204946996466, "grad_norm": 0.6338053941726685, "learning_rate": 1.784921749794002e-05, "loss": 0.9903, "step": 740 }, { "epoch": 1.325088339222615, "grad_norm": 0.692629337310791, "learning_rate": 1.7791376147512754e-05, "loss": 0.9846, "step": 750 }, { "epoch": 1.342756183745583, "grad_norm": 0.7451064586639404, "learning_rate": 1.773286368748214e-05, "loss": 0.9787, "step": 760 }, { "epoch": 1.3604240282685511, "grad_norm": 0.7188582420349121, "learning_rate": 1.7673685157814556e-05, "loss": 0.9304, "step": 770 }, { "epoch": 1.3780918727915195, "grad_norm": 0.5633394718170166, "learning_rate": 1.761384565584825e-05, "loss": 0.9507, "step": 780 }, { "epoch": 1.3957597173144876, "grad_norm": 0.5163728594779968, "learning_rate": 1.7553350335854253e-05, "loss": 0.9334, "step": 790 }, { "epoch": 1.4134275618374559, "grad_norm": 0.5537968277931213, "learning_rate": 1.7492204408592447e-05, "loss": 0.8753, "step": 800 }, { "epoch": 1.431095406360424, "grad_norm": 0.5419259071350098, "learning_rate": 1.7430413140862705e-05, "loss": 0.8902, "step": 810 }, { "epoch": 1.4487632508833923, "grad_norm": 0.5655599236488342, "learning_rate": 1.7367981855051275e-05, "loss": 0.8664, "step": 820 }, { "epoch": 1.4664310954063604, "grad_norm": 0.6520758867263794, "learning_rate": 1.73049159286723e-05, "loss": 0.8684, "step": 830 }, { "epoch": 1.4840989399293285, "grad_norm": 0.5785622596740723, "learning_rate": 1.7241220793904644e-05, "loss": 0.9331, "step": 840 }, { "epoch": 1.5017667844522968, "grad_norm": 0.5621282458305359, "learning_rate": 1.717690193712399e-05, "loss": 0.9055, "step": 850 }, { "epoch": 1.5194346289752652, "grad_norm": 0.5962981581687927, "learning_rate": 1.711196489843027e-05, "loss": 0.8962, "step": 860 }, { "epoch": 1.5371024734982333, "grad_norm": 1.2902908325195312, "learning_rate": 1.704641527117047e-05, "loss": 0.9052, "step": 870 }, { "epoch": 1.5547703180212014, "grad_norm": 0.5082823038101196, "learning_rate": 1.6980258701456843e-05, "loss": 0.8873, "step": 880 }, { "epoch": 1.5724381625441697, "grad_norm": 0.5958746075630188, "learning_rate": 1.6913500887680588e-05, "loss": 0.9228, "step": 890 }, { "epoch": 1.5901060070671378, "grad_norm": 0.7719606161117554, "learning_rate": 1.6846147580021016e-05, "loss": 0.921, "step": 900 }, { "epoch": 1.6077738515901059, "grad_norm": 2.6116905212402344, "learning_rate": 1.6778204579950258e-05, "loss": 0.8997, "step": 910 }, { "epoch": 1.6254416961130742, "grad_norm": 0.5398052930831909, "learning_rate": 1.6709677739733555e-05, "loss": 0.8912, "step": 920 }, { "epoch": 1.6431095406360425, "grad_norm": 0.6152315735816956, "learning_rate": 1.6640572961925182e-05, "loss": 0.893, "step": 930 }, { "epoch": 1.6607773851590106, "grad_norm": 0.5041419863700867, "learning_rate": 1.657089619886002e-05, "loss": 0.8951, "step": 940 }, { "epoch": 1.6784452296819787, "grad_norm": 0.5210781693458557, "learning_rate": 1.650065345214086e-05, "loss": 0.9144, "step": 950 }, { "epoch": 1.696113074204947, "grad_norm": 0.6072030067443848, "learning_rate": 1.6429850772121448e-05, "loss": 0.9113, "step": 960 }, { "epoch": 1.7137809187279152, "grad_norm": 0.6967266201972961, "learning_rate": 1.635849425738535e-05, "loss": 0.9556, "step": 970 }, { "epoch": 1.7314487632508833, "grad_norm": 0.5619155168533325, "learning_rate": 1.6286590054220643e-05, "loss": 0.8637, "step": 980 }, { "epoch": 1.7491166077738516, "grad_norm": 0.6261915564537048, "learning_rate": 1.6214144356090494e-05, "loss": 0.877, "step": 990 }, { "epoch": 1.76678445229682, "grad_norm": 0.5935543775558472, "learning_rate": 1.6141163403099716e-05, "loss": 0.9611, "step": 1000 }, { "epoch": 1.784452296819788, "grad_norm": 0.6743911504745483, "learning_rate": 1.6067653481457254e-05, "loss": 0.9007, "step": 1010 }, { "epoch": 1.802120141342756, "grad_norm": 0.6876866221427917, "learning_rate": 1.5993620922934716e-05, "loss": 0.8605, "step": 1020 }, { "epoch": 1.8197879858657244, "grad_norm": 2.510390043258667, "learning_rate": 1.591907210432102e-05, "loss": 0.864, "step": 1030 }, { "epoch": 1.8374558303886925, "grad_norm": 1.431142807006836, "learning_rate": 1.5844013446873087e-05, "loss": 0.9, "step": 1040 }, { "epoch": 1.8551236749116606, "grad_norm": 0.6549361348152161, "learning_rate": 1.5768451415762784e-05, "loss": 0.8726, "step": 1050 }, { "epoch": 1.872791519434629, "grad_norm": 0.5458822846412659, "learning_rate": 1.5692392519520022e-05, "loss": 0.9468, "step": 1060 }, { "epoch": 1.8904593639575973, "grad_norm": 1.6610078811645508, "learning_rate": 1.5615843309472162e-05, "loss": 0.8806, "step": 1070 }, { "epoch": 1.9081272084805654, "grad_norm": 0.5835666656494141, "learning_rate": 1.5538810379179694e-05, "loss": 0.8934, "step": 1080 }, { "epoch": 1.9257950530035335, "grad_norm": 0.5027960538864136, "learning_rate": 1.5461300363868326e-05, "loss": 0.8641, "step": 1090 }, { "epoch": 1.9434628975265018, "grad_norm": 0.5919517278671265, "learning_rate": 1.538331993985745e-05, "loss": 0.8959, "step": 1100 }, { "epoch": 1.96113074204947, "grad_norm": 0.7145887017250061, "learning_rate": 1.5304875823985067e-05, "loss": 0.9284, "step": 1110 }, { "epoch": 1.978798586572438, "grad_norm": 0.6099452376365662, "learning_rate": 1.5225974773029246e-05, "loss": 0.8896, "step": 1120 }, { "epoch": 1.9964664310954063, "grad_norm": 0.5641235709190369, "learning_rate": 1.5146623583126134e-05, "loss": 0.8888, "step": 1130 }, { "epoch": 2.0141342756183747, "grad_norm": 3.2261722087860107, "learning_rate": 1.5066829089184545e-05, "loss": 0.8495, "step": 1140 }, { "epoch": 2.0318021201413425, "grad_norm": 0.5612754821777344, "learning_rate": 1.498659816429727e-05, "loss": 0.8446, "step": 1150 }, { "epoch": 2.049469964664311, "grad_norm": 0.5648794174194336, "learning_rate": 1.4905937719149038e-05, "loss": 0.9185, "step": 1160 }, { "epoch": 2.067137809187279, "grad_norm": 0.5706191658973694, "learning_rate": 1.4824854701421277e-05, "loss": 0.8588, "step": 1170 }, { "epoch": 2.0848056537102475, "grad_norm": 0.5422844290733337, "learning_rate": 1.4743356095193665e-05, "loss": 0.8906, "step": 1180 }, { "epoch": 2.1024734982332154, "grad_norm": 0.6885594725608826, "learning_rate": 1.4661448920342585e-05, "loss": 0.871, "step": 1190 }, { "epoch": 2.1201413427561837, "grad_norm": 3.2255871295928955, "learning_rate": 1.4579140231936415e-05, "loss": 0.8621, "step": 1200 }, { "epoch": 2.137809187279152, "grad_norm": 0.5476865172386169, "learning_rate": 1.4496437119627907e-05, "loss": 0.8193, "step": 1210 }, { "epoch": 2.1554770318021204, "grad_norm": 0.6046047210693359, "learning_rate": 1.4413346707043467e-05, "loss": 0.8533, "step": 1220 }, { "epoch": 2.1731448763250882, "grad_norm": 0.5473910570144653, "learning_rate": 1.4329876151169581e-05, "loss": 0.8563, "step": 1230 }, { "epoch": 2.1908127208480566, "grad_norm": 0.5437634587287903, "learning_rate": 1.4246032641736362e-05, "loss": 0.8801, "step": 1240 }, { "epoch": 2.208480565371025, "grad_norm": 0.5767475366592407, "learning_rate": 1.4161823400598234e-05, "loss": 0.8802, "step": 1250 }, { "epoch": 2.2261484098939928, "grad_norm": 0.5995182991027832, "learning_rate": 1.4077255681111905e-05, "loss": 0.921, "step": 1260 }, { "epoch": 2.243816254416961, "grad_norm": 1.4571260213851929, "learning_rate": 1.3992336767511585e-05, "loss": 0.8508, "step": 1270 }, { "epoch": 2.2614840989399294, "grad_norm": 0.6092621088027954, "learning_rate": 1.3907073974281562e-05, "loss": 0.8886, "step": 1280 }, { "epoch": 2.2791519434628977, "grad_norm": 0.6601437926292419, "learning_rate": 1.3821474645526174e-05, "loss": 0.9181, "step": 1290 }, { "epoch": 2.2968197879858656, "grad_norm": 0.5523033738136292, "learning_rate": 1.3735546154337218e-05, "loss": 0.87, "step": 1300 }, { "epoch": 2.314487632508834, "grad_norm": 0.5680422782897949, "learning_rate": 1.3649295902158874e-05, "loss": 0.9032, "step": 1310 }, { "epoch": 2.3321554770318023, "grad_norm": 0.5610331892967224, "learning_rate": 1.3562731318150177e-05, "loss": 0.8743, "step": 1320 }, { "epoch": 2.34982332155477, "grad_norm": 0.6986725926399231, "learning_rate": 1.3475859858545121e-05, "loss": 0.8976, "step": 1330 }, { "epoch": 2.3674911660777385, "grad_norm": 0.6468251943588257, "learning_rate": 1.3388689006010394e-05, "loss": 0.8701, "step": 1340 }, { "epoch": 2.385159010600707, "grad_norm": 0.6808676719665527, "learning_rate": 1.330122626900088e-05, "loss": 0.8447, "step": 1350 }, { "epoch": 2.402826855123675, "grad_norm": 0.9036700129508972, "learning_rate": 1.3213479181112906e-05, "loss": 0.8917, "step": 1360 }, { "epoch": 2.420494699646643, "grad_norm": 0.5925520062446594, "learning_rate": 1.3125455300435343e-05, "loss": 0.8911, "step": 1370 }, { "epoch": 2.4381625441696113, "grad_norm": 0.610488772392273, "learning_rate": 1.303716220889859e-05, "loss": 0.8397, "step": 1380 }, { "epoch": 2.4558303886925796, "grad_norm": 0.7890755534172058, "learning_rate": 1.2948607511621498e-05, "loss": 0.9072, "step": 1390 }, { "epoch": 2.4734982332155475, "grad_norm": 0.5136253237724304, "learning_rate": 1.2859798836256316e-05, "loss": 0.847, "step": 1400 }, { "epoch": 2.491166077738516, "grad_norm": 0.5742533206939697, "learning_rate": 1.277074383233167e-05, "loss": 0.8632, "step": 1410 }, { "epoch": 2.508833922261484, "grad_norm": 0.6180789470672607, "learning_rate": 1.2681450170593683e-05, "loss": 0.8637, "step": 1420 }, { "epoch": 2.5265017667844525, "grad_norm": 0.604252815246582, "learning_rate": 1.2591925542345244e-05, "loss": 0.865, "step": 1430 }, { "epoch": 2.5441696113074204, "grad_norm": 0.565585732460022, "learning_rate": 1.2502177658783538e-05, "loss": 0.8873, "step": 1440 }, { "epoch": 2.5618374558303887, "grad_norm": 0.5197896957397461, "learning_rate": 1.2412214250335815e-05, "loss": 0.8654, "step": 1450 }, { "epoch": 2.579505300353357, "grad_norm": 0.5932325720787048, "learning_rate": 1.2322043065993556e-05, "loss": 0.8402, "step": 1460 }, { "epoch": 2.597173144876325, "grad_norm": 0.4967307150363922, "learning_rate": 1.2231671872644995e-05, "loss": 0.8662, "step": 1470 }, { "epoch": 2.614840989399293, "grad_norm": 0.5910283327102661, "learning_rate": 1.214110845440613e-05, "loss": 0.8501, "step": 1480 }, { "epoch": 2.6325088339222615, "grad_norm": 0.5208580493927002, "learning_rate": 1.2050360611950245e-05, "loss": 0.8566, "step": 1490 }, { "epoch": 2.65017667844523, "grad_norm": 0.6784213781356812, "learning_rate": 1.1959436161835971e-05, "loss": 0.8649, "step": 1500 }, { "epoch": 2.6678445229681977, "grad_norm": 0.6994915008544922, "learning_rate": 1.1868342935834043e-05, "loss": 0.8613, "step": 1510 }, { "epoch": 2.685512367491166, "grad_norm": 0.5287041068077087, "learning_rate": 1.1777088780252688e-05, "loss": 0.8911, "step": 1520 }, { "epoch": 2.7031802120141344, "grad_norm": 0.5094373822212219, "learning_rate": 1.1685681555261788e-05, "loss": 0.8625, "step": 1530 }, { "epoch": 2.7208480565371023, "grad_norm": 0.6024549603462219, "learning_rate": 1.1594129134215852e-05, "loss": 0.8821, "step": 1540 }, { "epoch": 2.7385159010600706, "grad_norm": 0.5689032673835754, "learning_rate": 1.1502439402975842e-05, "loss": 0.8474, "step": 1550 }, { "epoch": 2.756183745583039, "grad_norm": 0.5587700009346008, "learning_rate": 1.141062025922991e-05, "loss": 0.8609, "step": 1560 }, { "epoch": 2.7738515901060072, "grad_norm": 0.6656576991081238, "learning_rate": 1.1318679611813166e-05, "loss": 0.8577, "step": 1570 }, { "epoch": 2.791519434628975, "grad_norm": 0.6215320825576782, "learning_rate": 1.1226625380026407e-05, "loss": 0.8226, "step": 1580 }, { "epoch": 2.8091872791519434, "grad_norm": 0.5720105171203613, "learning_rate": 1.1134465492954028e-05, "loss": 0.8981, "step": 1590 }, { "epoch": 2.8268551236749118, "grad_norm": 0.5954132080078125, "learning_rate": 1.1042207888781031e-05, "loss": 0.8478, "step": 1600 }, { "epoch": 2.8445229681978796, "grad_norm": 0.6065002083778381, "learning_rate": 1.0949860514109265e-05, "loss": 0.8569, "step": 1610 }, { "epoch": 2.862190812720848, "grad_norm": 0.5527032017707825, "learning_rate": 1.085743132327296e-05, "loss": 0.868, "step": 1620 }, { "epoch": 2.8798586572438163, "grad_norm": 0.5579487681388855, "learning_rate": 1.0764928277653577e-05, "loss": 0.8946, "step": 1630 }, { "epoch": 2.8975265017667846, "grad_norm": 0.5760249495506287, "learning_rate": 1.067235934499405e-05, "loss": 0.8359, "step": 1640 }, { "epoch": 2.9151943462897525, "grad_norm": 0.6175970435142517, "learning_rate": 1.057973249871249e-05, "loss": 0.8355, "step": 1650 }, { "epoch": 2.932862190812721, "grad_norm": 0.5847891569137573, "learning_rate": 1.0487055717215394e-05, "loss": 0.8035, "step": 1660 }, { "epoch": 2.950530035335689, "grad_norm": 1.2150473594665527, "learning_rate": 1.039433698321042e-05, "loss": 0.8211, "step": 1670 }, { "epoch": 2.968197879858657, "grad_norm": 0.663726270198822, "learning_rate": 1.0301584283018813e-05, "loss": 0.8213, "step": 1680 }, { "epoch": 2.9858657243816253, "grad_norm": 0.5659819841384888, "learning_rate": 1.020880560588748e-05, "loss": 0.8943, "step": 1690 }, { "epoch": 3.0035335689045937, "grad_norm": 0.5880382657051086, "learning_rate": 1.0116008943300852e-05, "loss": 0.8894, "step": 1700 }, { "epoch": 3.021201413427562, "grad_norm": 0.5816975831985474, "learning_rate": 1.0023202288292552e-05, "loss": 0.8256, "step": 1710 }, { "epoch": 3.03886925795053, "grad_norm": 0.6161662936210632, "learning_rate": 9.930393634756877e-06, "loss": 0.8375, "step": 1720 }, { "epoch": 3.056537102473498, "grad_norm": 0.5282244682312012, "learning_rate": 9.837590976760283e-06, "loss": 0.8738, "step": 1730 }, { "epoch": 3.0742049469964665, "grad_norm": 0.5946238040924072, "learning_rate": 9.744802307852794e-06, "loss": 0.8679, "step": 1740 }, { "epoch": 3.091872791519435, "grad_norm": 0.5638343095779419, "learning_rate": 9.652035620379481e-06, "loss": 0.8447, "step": 1750 }, { "epoch": 3.1095406360424027, "grad_norm": 1.077945351600647, "learning_rate": 9.559298904792054e-06, "loss": 0.8454, "step": 1760 }, { "epoch": 3.127208480565371, "grad_norm": 0.6322164535522461, "learning_rate": 9.466600148960597e-06, "loss": 0.8682, "step": 1770 }, { "epoch": 3.1448763250883394, "grad_norm": 0.5520142912864685, "learning_rate": 9.373947337485521e-06, "loss": 0.7946, "step": 1780 }, { "epoch": 3.1625441696113072, "grad_norm": 0.5950481295585632, "learning_rate": 9.281348451009837e-06, "loss": 0.8241, "step": 1790 }, { "epoch": 3.1802120141342756, "grad_norm": 0.614825427532196, "learning_rate": 9.188811465531725e-06, "loss": 0.8697, "step": 1800 }, { "epoch": 3.197879858657244, "grad_norm": 0.5201759934425354, "learning_rate": 9.096344351717528e-06, "loss": 0.9154, "step": 1810 }, { "epoch": 3.215547703180212, "grad_norm": 0.6227285861968994, "learning_rate": 9.003955074215198e-06, "loss": 0.8995, "step": 1820 }, { "epoch": 3.23321554770318, "grad_norm": 0.5124209523200989, "learning_rate": 8.911651590968259e-06, "loss": 0.8432, "step": 1830 }, { "epoch": 3.2508833922261484, "grad_norm": 0.5678322911262512, "learning_rate": 8.819441852530358e-06, "loss": 0.837, "step": 1840 }, { "epoch": 3.2685512367491167, "grad_norm": 0.7585355639457703, "learning_rate": 8.72733380138044e-06, "loss": 0.8434, "step": 1850 }, { "epoch": 3.2862190812720846, "grad_norm": 0.6334419250488281, "learning_rate": 8.63533537123861e-06, "loss": 0.8203, "step": 1860 }, { "epoch": 3.303886925795053, "grad_norm": 0.7814059853553772, "learning_rate": 8.543454486382803e-06, "loss": 0.8214, "step": 1870 }, { "epoch": 3.3215547703180213, "grad_norm": 0.5241668820381165, "learning_rate": 8.451699060966174e-06, "loss": 0.8379, "step": 1880 }, { "epoch": 3.3392226148409896, "grad_norm": 0.5536594390869141, "learning_rate": 8.360076998335447e-06, "loss": 0.8338, "step": 1890 }, { "epoch": 3.3568904593639575, "grad_norm": 0.5642170310020447, "learning_rate": 8.268596190350158e-06, "loss": 0.8628, "step": 1900 }, { "epoch": 3.374558303886926, "grad_norm": 0.5817282199859619, "learning_rate": 8.177264516702875e-06, "loss": 0.8496, "step": 1910 }, { "epoch": 3.392226148409894, "grad_norm": 0.6194543838500977, "learning_rate": 8.086089844240495e-06, "loss": 0.8059, "step": 1920 }, { "epoch": 3.4098939929328624, "grad_norm": 0.737491250038147, "learning_rate": 7.995080026286632e-06, "loss": 0.8449, "step": 1930 }, { "epoch": 3.4275618374558303, "grad_norm": 0.5627506971359253, "learning_rate": 7.904242901965171e-06, "loss": 0.8297, "step": 1940 }, { "epoch": 3.4452296819787986, "grad_norm": 0.8182918429374695, "learning_rate": 7.81358629552504e-06, "loss": 0.8627, "step": 1950 }, { "epoch": 3.462897526501767, "grad_norm": 0.5409490466117859, "learning_rate": 7.723118015666266e-06, "loss": 0.8837, "step": 1960 }, { "epoch": 3.480565371024735, "grad_norm": 0.5023618340492249, "learning_rate": 7.632845854867393e-06, "loss": 0.8592, "step": 1970 }, { "epoch": 3.498233215547703, "grad_norm": 0.5892724394798279, "learning_rate": 7.542777588714256e-06, "loss": 0.8318, "step": 1980 }, { "epoch": 3.5159010600706715, "grad_norm": 0.5022056698799133, "learning_rate": 7.452920975230247e-06, "loss": 0.8231, "step": 1990 }, { "epoch": 3.53356890459364, "grad_norm": 0.6234381198883057, "learning_rate": 7.363283754208061e-06, "loss": 0.7936, "step": 2000 }, { "epoch": 3.5512367491166077, "grad_norm": 0.5600544214248657, "learning_rate": 7.273873646543044e-06, "loss": 0.9023, "step": 2010 }, { "epoch": 3.568904593639576, "grad_norm": 0.5818344354629517, "learning_rate": 7.184698353568157e-06, "loss": 0.8629, "step": 2020 }, { "epoch": 3.586572438162544, "grad_norm": 0.5921304225921631, "learning_rate": 7.095765556390606e-06, "loss": 0.8372, "step": 2030 }, { "epoch": 3.604240282685512, "grad_norm": 0.6511384844779968, "learning_rate": 7.007082915230247e-06, "loss": 0.8143, "step": 2040 }, { "epoch": 3.6219081272084805, "grad_norm": 0.5591942071914673, "learning_rate": 6.918658068759754e-06, "loss": 0.8483, "step": 2050 }, { "epoch": 3.639575971731449, "grad_norm": 0.5008967518806458, "learning_rate": 6.8304986334466884e-06, "loss": 0.8219, "step": 2060 }, { "epoch": 3.657243816254417, "grad_norm": 0.5557330250740051, "learning_rate": 6.742612202897436e-06, "loss": 0.8225, "step": 2070 }, { "epoch": 3.674911660777385, "grad_norm": 0.524795413017273, "learning_rate": 6.655006347203128e-06, "loss": 0.8677, "step": 2080 }, { "epoch": 3.6925795053003534, "grad_norm": 0.5734356641769409, "learning_rate": 6.567688612287625e-06, "loss": 0.835, "step": 2090 }, { "epoch": 3.7102473498233217, "grad_norm": 0.4921402037143707, "learning_rate": 6.480666519257501e-06, "loss": 0.8554, "step": 2100 }, { "epoch": 3.7279151943462896, "grad_norm": 0.5890780687332153, "learning_rate": 6.393947563754253e-06, "loss": 0.7895, "step": 2110 }, { "epoch": 3.745583038869258, "grad_norm": 0.7719208598136902, "learning_rate": 6.307539215308644e-06, "loss": 0.8313, "step": 2120 }, { "epoch": 3.7632508833922262, "grad_norm": 0.5820373296737671, "learning_rate": 6.2214489166973235e-06, "loss": 0.8796, "step": 2130 }, { "epoch": 3.7809187279151946, "grad_norm": 0.5791803598403931, "learning_rate": 6.135684083301738e-06, "loss": 0.8228, "step": 2140 }, { "epoch": 3.7985865724381624, "grad_norm": 0.4887305498123169, "learning_rate": 6.050252102469417e-06, "loss": 0.8304, "step": 2150 }, { "epoch": 3.8162544169611308, "grad_norm": 0.7655726671218872, "learning_rate": 5.9651603328776606e-06, "loss": 0.8734, "step": 2160 }, { "epoch": 3.833922261484099, "grad_norm": 0.5560609698295593, "learning_rate": 5.880416103899696e-06, "loss": 0.8274, "step": 2170 }, { "epoch": 3.851590106007067, "grad_norm": 0.5723090171813965, "learning_rate": 5.796026714973359e-06, "loss": 0.8734, "step": 2180 }, { "epoch": 3.8692579505300353, "grad_norm": 0.5722220540046692, "learning_rate": 5.711999434972378e-06, "loss": 0.8711, "step": 2190 }, { "epoch": 3.8869257950530036, "grad_norm": 0.48828595876693726, "learning_rate": 5.628341501580246e-06, "loss": 0.7963, "step": 2200 }, { "epoch": 3.904593639575972, "grad_norm": 0.6057942509651184, "learning_rate": 5.545060120666812e-06, "loss": 0.8188, "step": 2210 }, { "epoch": 3.92226148409894, "grad_norm": 0.5612859129905701, "learning_rate": 5.462162465667614e-06, "loss": 0.8261, "step": 2220 }, { "epoch": 3.939929328621908, "grad_norm": 0.5768514275550842, "learning_rate": 5.379655676965984e-06, "loss": 0.8937, "step": 2230 }, { "epoch": 3.9575971731448765, "grad_norm": 0.8001552820205688, "learning_rate": 5.297546861278013e-06, "loss": 0.8398, "step": 2240 }, { "epoch": 3.9752650176678443, "grad_norm": 0.5258967876434326, "learning_rate": 5.215843091040409e-06, "loss": 0.84, "step": 2250 }, { "epoch": 3.9929328621908127, "grad_norm": 0.5343855023384094, "learning_rate": 5.134551403801336e-06, "loss": 0.8507, "step": 2260 }, { "epoch": 4.010600706713781, "grad_norm": 0.5460066199302673, "learning_rate": 5.053678801614205e-06, "loss": 0.8737, "step": 2270 }, { "epoch": 4.028268551236749, "grad_norm": 0.6175852417945862, "learning_rate": 4.973232250434579e-06, "loss": 0.8511, "step": 2280 }, { "epoch": 4.045936395759718, "grad_norm": 0.6113592982292175, "learning_rate": 4.893218679520137e-06, "loss": 0.8204, "step": 2290 }, { "epoch": 4.063604240282685, "grad_norm": 0.9578297138214111, "learning_rate": 4.813644980833851e-06, "loss": 0.8181, "step": 2300 }, { "epoch": 4.081272084805653, "grad_norm": 0.5546936988830566, "learning_rate": 4.734518008450312e-06, "loss": 0.789, "step": 2310 }, { "epoch": 4.098939929328622, "grad_norm": 0.680998682975769, "learning_rate": 4.6558445779653946e-06, "loss": 0.8605, "step": 2320 }, { "epoch": 4.11660777385159, "grad_norm": 0.6179676055908203, "learning_rate": 4.57763146590916e-06, "loss": 0.8559, "step": 2330 }, { "epoch": 4.134275618374558, "grad_norm": 0.6207692623138428, "learning_rate": 4.49988540916218e-06, "loss": 0.8733, "step": 2340 }, { "epoch": 4.151943462897527, "grad_norm": 0.7949970960617065, "learning_rate": 4.422613104375259e-06, "loss": 0.8521, "step": 2350 }, { "epoch": 4.169611307420495, "grad_norm": 0.5593691468238831, "learning_rate": 4.345821207392605e-06, "loss": 0.8163, "step": 2360 }, { "epoch": 4.1872791519434625, "grad_norm": 0.7125366926193237, "learning_rate": 4.269516332678529e-06, "loss": 0.8531, "step": 2370 }, { "epoch": 4.204946996466431, "grad_norm": 0.5829963088035583, "learning_rate": 4.193705052747737e-06, "loss": 0.8855, "step": 2380 }, { "epoch": 4.222614840989399, "grad_norm": 0.6220327615737915, "learning_rate": 4.1183938975991644e-06, "loss": 0.8152, "step": 2390 }, { "epoch": 4.240282685512367, "grad_norm": 0.5507341623306274, "learning_rate": 4.043589354153541e-06, "loss": 0.805, "step": 2400 }, { "epoch": 4.257950530035336, "grad_norm": 0.5985105633735657, "learning_rate": 3.969297865694641e-06, "loss": 0.8132, "step": 2410 }, { "epoch": 4.275618374558304, "grad_norm": 0.8096944093704224, "learning_rate": 3.895525831314282e-06, "loss": 0.8731, "step": 2420 }, { "epoch": 4.293286219081272, "grad_norm": 0.6014131307601929, "learning_rate": 3.822279605361138e-06, "loss": 0.8497, "step": 2430 }, { "epoch": 4.310954063604241, "grad_norm": 0.5453154444694519, "learning_rate": 3.74956549689342e-06, "loss": 0.8213, "step": 2440 }, { "epoch": 4.328621908127208, "grad_norm": 0.4849812090396881, "learning_rate": 3.677389769135444e-06, "loss": 0.8196, "step": 2450 }, { "epoch": 4.3462897526501765, "grad_norm": 0.5745995044708252, "learning_rate": 3.6057586389381326e-06, "loss": 0.8202, "step": 2460 }, { "epoch": 4.363957597173145, "grad_norm": 0.5921375155448914, "learning_rate": 3.5346782762435383e-06, "loss": 0.8623, "step": 2470 }, { "epoch": 4.381625441696113, "grad_norm": 1.4667394161224365, "learning_rate": 3.464154803553408e-06, "loss": 0.8438, "step": 2480 }, { "epoch": 4.3992932862190814, "grad_norm": 0.6135275959968567, "learning_rate": 3.394194295401796e-06, "loss": 0.886, "step": 2490 }, { "epoch": 4.41696113074205, "grad_norm": 0.5192072987556458, "learning_rate": 3.3248027778318593e-06, "loss": 0.8115, "step": 2500 }, { "epoch": 4.434628975265017, "grad_norm": 0.4936811327934265, "learning_rate": 3.255986227876782e-06, "loss": 0.8178, "step": 2510 }, { "epoch": 4.4522968197879855, "grad_norm": 0.5864696502685547, "learning_rate": 3.1877505730449677e-06, "loss": 0.8582, "step": 2520 }, { "epoch": 4.469964664310954, "grad_norm": 0.5196579694747925, "learning_rate": 3.1201016908094518e-06, "loss": 0.8544, "step": 2530 }, { "epoch": 4.487632508833922, "grad_norm": 0.7537738084793091, "learning_rate": 3.0530454081016637e-06, "loss": 0.897, "step": 2540 }, { "epoch": 4.5053003533568905, "grad_norm": 0.5827525854110718, "learning_rate": 2.9865875008095114e-06, "loss": 0.8611, "step": 2550 }, { "epoch": 4.522968197879859, "grad_norm": 0.6028100848197937, "learning_rate": 2.920733693279879e-06, "loss": 0.7945, "step": 2560 }, { "epoch": 4.540636042402827, "grad_norm": 0.6480982899665833, "learning_rate": 2.855489657825573e-06, "loss": 0.8389, "step": 2570 }, { "epoch": 4.5583038869257955, "grad_norm": 0.6590670347213745, "learning_rate": 2.7908610142367144e-06, "loss": 0.8061, "step": 2580 }, { "epoch": 4.575971731448763, "grad_norm": 0.6050382852554321, "learning_rate": 2.7268533292967026e-06, "loss": 0.8249, "step": 2590 }, { "epoch": 4.593639575971731, "grad_norm": 0.6227623224258423, "learning_rate": 2.6634721163027076e-06, "loss": 0.8171, "step": 2600 }, { "epoch": 4.6113074204946995, "grad_norm": 0.4837830662727356, "learning_rate": 2.600722834590781e-06, "loss": 0.7877, "step": 2610 }, { "epoch": 4.628975265017668, "grad_norm": 0.5412755608558655, "learning_rate": 2.538610889065619e-06, "loss": 0.8309, "step": 2620 }, { "epoch": 4.646643109540636, "grad_norm": 0.5128341317176819, "learning_rate": 2.477141629735025e-06, "loss": 0.8903, "step": 2630 }, { "epoch": 4.6643109540636045, "grad_norm": 0.5534719228744507, "learning_rate": 2.416320351249062e-06, "loss": 0.8418, "step": 2640 }, { "epoch": 4.681978798586572, "grad_norm": 0.6710190773010254, "learning_rate": 2.3561522924440127e-06, "loss": 0.829, "step": 2650 }, { "epoch": 4.69964664310954, "grad_norm": 0.5560805797576904, "learning_rate": 2.2966426358911387e-06, "loss": 0.8572, "step": 2660 }, { "epoch": 4.717314487632509, "grad_norm": 0.5557115077972412, "learning_rate": 2.237796507450272e-06, "loss": 0.8317, "step": 2670 }, { "epoch": 4.734982332155477, "grad_norm": 0.7195934653282166, "learning_rate": 2.1796189758282917e-06, "loss": 0.832, "step": 2680 }, { "epoch": 4.752650176678445, "grad_norm": 0.5557926893234253, "learning_rate": 2.122115052142545e-06, "loss": 0.7884, "step": 2690 }, { "epoch": 4.770318021201414, "grad_norm": 0.5075626373291016, "learning_rate": 2.065289689489213e-06, "loss": 0.7874, "step": 2700 }, { "epoch": 4.787985865724382, "grad_norm": 0.7519310712814331, "learning_rate": 2.0091477825166637e-06, "loss": 0.8451, "step": 2710 }, { "epoch": 4.80565371024735, "grad_norm": 0.5555654764175415, "learning_rate": 1.9536941670038745e-06, "loss": 0.8339, "step": 2720 }, { "epoch": 4.823321554770318, "grad_norm": 0.5968272686004639, "learning_rate": 1.8989336194438756e-06, "loss": 0.8268, "step": 2730 }, { "epoch": 4.840989399293286, "grad_norm": 0.7066864967346191, "learning_rate": 1.8448708566323504e-06, "loss": 0.8263, "step": 2740 }, { "epoch": 4.858657243816254, "grad_norm": 0.46801695227622986, "learning_rate": 1.7915105352613382e-06, "loss": 0.8091, "step": 2750 }, { "epoch": 4.876325088339223, "grad_norm": 0.7215123772621155, "learning_rate": 1.7388572515181445e-06, "loss": 0.8339, "step": 2760 }, { "epoch": 4.893992932862191, "grad_norm": 0.6271361708641052, "learning_rate": 1.6869155406894344e-06, "loss": 0.8223, "step": 2770 }, { "epoch": 4.911660777385159, "grad_norm": 0.9216979742050171, "learning_rate": 1.6356898767705954e-06, "loss": 0.8052, "step": 2780 }, { "epoch": 4.929328621908128, "grad_norm": 0.5909148454666138, "learning_rate": 1.585184672080371e-06, "loss": 0.8026, "step": 2790 }, { "epoch": 4.946996466431095, "grad_norm": 0.5720986723899841, "learning_rate": 1.5354042768807976e-06, "loss": 0.831, "step": 2800 }, { "epoch": 4.964664310954063, "grad_norm": 0.5404096245765686, "learning_rate": 1.4863529790025033e-06, "loss": 0.8301, "step": 2810 }, { "epoch": 4.982332155477032, "grad_norm": 0.5945255756378174, "learning_rate": 1.4380350034753766e-06, "loss": 0.8786, "step": 2820 }, { "epoch": 5.0, "grad_norm": 1.2441853284835815, "learning_rate": 1.3904545121646319e-06, "loss": 0.8814, "step": 2830 }, { "epoch": 5.017667844522968, "grad_norm": 0.655418872833252, "learning_rate": 1.3436156034123383e-06, "loss": 0.8598, "step": 2840 }, { "epoch": 5.035335689045937, "grad_norm": 0.5294393301010132, "learning_rate": 1.2975223116844115e-06, "loss": 0.831, "step": 2850 }, { "epoch": 5.053003533568905, "grad_norm": 0.5677767395973206, "learning_rate": 1.2521786072230935e-06, "loss": 0.838, "step": 2860 }, { "epoch": 5.070671378091872, "grad_norm": 0.6083729267120361, "learning_rate": 1.2075883957049862e-06, "loss": 0.8866, "step": 2870 }, { "epoch": 5.088339222614841, "grad_norm": 0.6290938854217529, "learning_rate": 1.1637555179046344e-06, "loss": 0.8751, "step": 2880 }, { "epoch": 5.106007067137809, "grad_norm": 0.5518789291381836, "learning_rate": 1.1206837493636992e-06, "loss": 0.8597, "step": 2890 }, { "epoch": 5.123674911660777, "grad_norm": 0.5620294809341431, "learning_rate": 1.078376800065749e-06, "loss": 0.8105, "step": 2900 }, { "epoch": 5.141342756183746, "grad_norm": 0.6572288870811462, "learning_rate": 1.036838314116706e-06, "loss": 0.8057, "step": 2910 }, { "epoch": 5.159010600706714, "grad_norm": 0.5638129115104675, "learning_rate": 9.960718694309623e-07, "loss": 0.8214, "step": 2920 }, { "epoch": 5.176678445229682, "grad_norm": 0.7003311514854431, "learning_rate": 9.560809774231872e-07, "loss": 0.8544, "step": 2930 }, { "epoch": 5.19434628975265, "grad_norm": 0.5848519802093506, "learning_rate": 9.168690827058813e-07, "loss": 0.8273, "step": 2940 }, { "epoch": 5.212014134275618, "grad_norm": 0.5222687721252441, "learning_rate": 8.784395627926734e-07, "loss": 0.8567, "step": 2950 }, { "epoch": 5.229681978798586, "grad_norm": 0.5781293511390686, "learning_rate": 8.407957278073952e-07, "loss": 0.8236, "step": 2960 }, { "epoch": 5.247349823321555, "grad_norm": 0.6143786311149597, "learning_rate": 8.039408201989618e-07, "loss": 0.8415, "step": 2970 }, { "epoch": 5.265017667844523, "grad_norm": 0.5558165311813354, "learning_rate": 7.678780144620956e-07, "loss": 0.7737, "step": 2980 }, { "epoch": 5.282685512367491, "grad_norm": 0.6024080514907837, "learning_rate": 7.32610416863877e-07, "loss": 0.8291, "step": 2990 }, { "epoch": 5.30035335689046, "grad_norm": 0.5599005818367004, "learning_rate": 6.981410651761933e-07, "loss": 0.869, "step": 3000 }, { "epoch": 5.318021201413427, "grad_norm": 0.640021562576294, "learning_rate": 6.644729284140828e-07, "loss": 0.8184, "step": 3010 }, { "epoch": 5.3356890459363955, "grad_norm": 0.5697632431983948, "learning_rate": 6.316089065799958e-07, "loss": 0.841, "step": 3020 }, { "epoch": 5.353356890459364, "grad_norm": 0.5342921018600464, "learning_rate": 5.995518304139991e-07, "loss": 0.8244, "step": 3030 }, { "epoch": 5.371024734982332, "grad_norm": 0.5991451144218445, "learning_rate": 5.6830446114996e-07, "loss": 0.8567, "step": 3040 }, { "epoch": 5.3886925795053005, "grad_norm": 0.5282171964645386, "learning_rate": 5.37869490277697e-07, "loss": 0.8427, "step": 3050 }, { "epoch": 5.406360424028269, "grad_norm": 0.6702190637588501, "learning_rate": 5.082495393111564e-07, "loss": 0.817, "step": 3060 }, { "epoch": 5.424028268551237, "grad_norm": 0.5918974280357361, "learning_rate": 4.794471595626071e-07, "loss": 0.8249, "step": 3070 }, { "epoch": 5.4416961130742045, "grad_norm": 0.6387214064598083, "learning_rate": 4.514648319228798e-07, "loss": 0.8695, "step": 3080 }, { "epoch": 5.459363957597173, "grad_norm": 0.5860491394996643, "learning_rate": 4.243049666476784e-07, "loss": 0.7571, "step": 3090 }, { "epoch": 5.477031802120141, "grad_norm": 0.64732426404953, "learning_rate": 3.9796990314997176e-07, "loss": 0.8065, "step": 3100 }, { "epoch": 5.4946996466431095, "grad_norm": 0.4854739010334015, "learning_rate": 3.7246190979849164e-07, "loss": 0.8417, "step": 3110 }, { "epoch": 5.512367491166078, "grad_norm": 0.5119847059249878, "learning_rate": 3.477831837223433e-07, "loss": 0.8222, "step": 3120 }, { "epoch": 5.530035335689046, "grad_norm": 1.499524474143982, "learning_rate": 3.239358506217549e-07, "loss": 0.8278, "step": 3130 }, { "epoch": 5.5477031802120145, "grad_norm": 0.6199241280555725, "learning_rate": 3.009219645849859e-07, "loss": 0.8005, "step": 3140 }, { "epoch": 5.565371024734983, "grad_norm": 0.6573869585990906, "learning_rate": 2.7874350791139203e-07, "loss": 0.8338, "step": 3150 }, { "epoch": 5.58303886925795, "grad_norm": 0.5395724773406982, "learning_rate": 2.574023909406853e-07, "loss": 0.8363, "step": 3160 }, { "epoch": 5.6007067137809186, "grad_norm": 0.834705114364624, "learning_rate": 2.369004518883855e-07, "loss": 0.8207, "step": 3170 }, { "epoch": 5.618374558303887, "grad_norm": 0.5336258411407471, "learning_rate": 2.1723945668748248e-07, "loss": 0.8153, "step": 3180 }, { "epoch": 5.636042402826855, "grad_norm": 0.7850385904312134, "learning_rate": 1.984210988363311e-07, "loss": 0.8114, "step": 3190 }, { "epoch": 5.6537102473498235, "grad_norm": 0.597079336643219, "learning_rate": 1.8044699925278242e-07, "loss": 0.834, "step": 3200 }, { "epoch": 5.671378091872792, "grad_norm": 0.6741916537284851, "learning_rate": 1.6331870613456423e-07, "loss": 0.822, "step": 3210 }, { "epoch": 5.689045936395759, "grad_norm": 0.7204360961914062, "learning_rate": 1.4703769482592335e-07, "loss": 0.8481, "step": 3220 }, { "epoch": 5.706713780918728, "grad_norm": 0.6026734113693237, "learning_rate": 1.3160536769055708e-07, "loss": 0.845, "step": 3230 }, { "epoch": 5.724381625441696, "grad_norm": 0.568855345249176, "learning_rate": 1.1702305399081349e-07, "loss": 0.8179, "step": 3240 }, { "epoch": 5.742049469964664, "grad_norm": 0.5504450798034668, "learning_rate": 1.0329200977319265e-07, "loss": 0.8236, "step": 3250 }, { "epoch": 5.759717314487633, "grad_norm": 0.60431307554245, "learning_rate": 9.041341776016565e-08, "loss": 0.8486, "step": 3260 }, { "epoch": 5.777385159010601, "grad_norm": 0.5151515007019043, "learning_rate": 7.83883872482949e-08, "loss": 0.8344, "step": 3270 }, { "epoch": 5.795053003533569, "grad_norm": 0.6697881817817688, "learning_rate": 6.721795401268493e-08, "loss": 0.8232, "step": 3280 }, { "epoch": 5.8127208480565375, "grad_norm": 0.5603944659233093, "learning_rate": 5.69030802177728e-08, "loss": 0.8249, "step": 3290 }, { "epoch": 5.830388692579505, "grad_norm": 0.5996559262275696, "learning_rate": 4.744465433443979e-08, "loss": 0.8186, "step": 3300 }, { "epoch": 5.848056537102473, "grad_norm": 0.5992182493209839, "learning_rate": 3.884349106349716e-08, "loss": 0.8801, "step": 3310 }, { "epoch": 5.865724381625442, "grad_norm": 0.5472813248634338, "learning_rate": 3.110033126549894e-08, "loss": 0.8299, "step": 3320 }, { "epoch": 5.88339222614841, "grad_norm": 0.6012226343154907, "learning_rate": 2.4215841896938486e-08, "loss": 0.8276, "step": 3330 }, { "epoch": 5.901060070671378, "grad_norm": 0.5417101979255676, "learning_rate": 1.8190615952794477e-08, "loss": 0.8673, "step": 3340 }, { "epoch": 5.918727915194347, "grad_norm": 0.5450870990753174, "learning_rate": 1.3025172415451758e-08, "loss": 0.8526, "step": 3350 }, { "epoch": 5.936395759717314, "grad_norm": 0.5599738359451294, "learning_rate": 8.719956210007096e-09, "loss": 0.7772, "step": 3360 }, { "epoch": 5.954063604240282, "grad_norm": 0.5253913998603821, "learning_rate": 5.275338165935395e-09, "loss": 0.8169, "step": 3370 }, { "epoch": 5.971731448763251, "grad_norm": 0.6051644682884216, "learning_rate": 2.6916149851563542e-09, "loss": 0.8312, "step": 3380 }, { "epoch": 5.989399293286219, "grad_norm": 0.6155297160148621, "learning_rate": 9.690092164715835e-10, "loss": 0.8493, "step": 3390 }, { "epoch": 6.007067137809187, "grad_norm": 0.7502905130386353, "learning_rate": 1.0766923640215254e-10, "loss": 0.7903, "step": 3400 } ], "logging_steps": 10, "max_steps": 3405, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.855684396713247e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }