{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.941747572815534, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009708737864077669, "grad_norm": 3.141545295715332, "learning_rate": 1.5e-06, "loss": 0.3406, "step": 10 }, { "epoch": 0.019417475728155338, "grad_norm": 3.8397164344787598, "learning_rate": 3.166666666666667e-06, "loss": 0.3078, "step": 20 }, { "epoch": 0.02912621359223301, "grad_norm": 1.7836706638336182, "learning_rate": 4.833333333333333e-06, "loss": 0.1926, "step": 30 }, { "epoch": 0.038834951456310676, "grad_norm": 1.257520079612732, "learning_rate": 6.5000000000000004e-06, "loss": 0.1476, "step": 40 }, { "epoch": 0.04854368932038835, "grad_norm": 1.0694407224655151, "learning_rate": 8.166666666666668e-06, "loss": 0.135, "step": 50 }, { "epoch": 0.05825242718446602, "grad_norm": 0.9249787926673889, "learning_rate": 9.833333333333333e-06, "loss": 0.1516, "step": 60 }, { "epoch": 0.06796116504854369, "grad_norm": 1.0610548257827759, "learning_rate": 1.1500000000000002e-05, "loss": 0.1073, "step": 70 }, { "epoch": 0.07766990291262135, "grad_norm": 0.9249606132507324, "learning_rate": 1.3166666666666665e-05, "loss": 0.0894, "step": 80 }, { "epoch": 0.08737864077669903, "grad_norm": 0.6027079224586487, "learning_rate": 1.4833333333333336e-05, "loss": 0.0814, "step": 90 }, { "epoch": 0.0970873786407767, "grad_norm": 0.8849421739578247, "learning_rate": 1.65e-05, "loss": 0.0963, "step": 100 }, { "epoch": 0.10679611650485436, "grad_norm": 1.2120862007141113, "learning_rate": 1.8166666666666667e-05, "loss": 0.0817, "step": 110 }, { "epoch": 0.11650485436893204, "grad_norm": 0.781768262386322, "learning_rate": 1.9833333333333335e-05, "loss": 0.087, "step": 120 }, { "epoch": 0.1262135922330097, "grad_norm": 0.5335855484008789, "learning_rate": 2.15e-05, "loss": 0.0502, "step": 130 }, { "epoch": 0.13592233009708737, "grad_norm": 0.6909231543540955, "learning_rate": 2.3166666666666666e-05, "loss": 0.0813, "step": 140 }, { "epoch": 0.14563106796116504, "grad_norm": 0.8511887192726135, "learning_rate": 2.4833333333333335e-05, "loss": 0.0823, "step": 150 }, { "epoch": 0.1553398058252427, "grad_norm": 1.1109554767608643, "learning_rate": 2.6500000000000004e-05, "loss": 0.0638, "step": 160 }, { "epoch": 0.1650485436893204, "grad_norm": 0.7318443655967712, "learning_rate": 2.816666666666667e-05, "loss": 0.0649, "step": 170 }, { "epoch": 0.17475728155339806, "grad_norm": 0.7571411728858948, "learning_rate": 2.9833333333333335e-05, "loss": 0.064, "step": 180 }, { "epoch": 0.18446601941747573, "grad_norm": 0.8281847834587097, "learning_rate": 3.15e-05, "loss": 0.0731, "step": 190 }, { "epoch": 0.1941747572815534, "grad_norm": 0.7228130102157593, "learning_rate": 3.316666666666667e-05, "loss": 0.0674, "step": 200 }, { "epoch": 0.20388349514563106, "grad_norm": 0.9129646420478821, "learning_rate": 3.483333333333334e-05, "loss": 0.0674, "step": 210 }, { "epoch": 0.21359223300970873, "grad_norm": 0.7561999559402466, "learning_rate": 3.65e-05, "loss": 0.0632, "step": 220 }, { "epoch": 0.22330097087378642, "grad_norm": 0.6391080617904663, "learning_rate": 3.816666666666667e-05, "loss": 0.0748, "step": 230 }, { "epoch": 0.23300970873786409, "grad_norm": 0.8094485402107239, "learning_rate": 3.983333333333333e-05, "loss": 0.0822, "step": 240 }, { "epoch": 0.24271844660194175, "grad_norm": 0.5360986590385437, "learning_rate": 4.15e-05, "loss": 0.0617, "step": 250 }, { "epoch": 0.2524271844660194, "grad_norm": 0.7553185224533081, "learning_rate": 4.316666666666667e-05, "loss": 0.0666, "step": 260 }, { "epoch": 0.2621359223300971, "grad_norm": 0.6815693378448486, "learning_rate": 4.483333333333333e-05, "loss": 0.061, "step": 270 }, { "epoch": 0.27184466019417475, "grad_norm": 0.5073156356811523, "learning_rate": 4.6500000000000005e-05, "loss": 0.0473, "step": 280 }, { "epoch": 0.2815533980582524, "grad_norm": 0.8138278722763062, "learning_rate": 4.8166666666666674e-05, "loss": 0.0652, "step": 290 }, { "epoch": 0.2912621359223301, "grad_norm": 0.8348583579063416, "learning_rate": 4.9833333333333336e-05, "loss": 0.0746, "step": 300 }, { "epoch": 0.30097087378640774, "grad_norm": 1.0741016864776611, "learning_rate": 4.999969242985639e-05, "loss": 0.0718, "step": 310 }, { "epoch": 0.3106796116504854, "grad_norm": 1.182652473449707, "learning_rate": 4.999862923413781e-05, "loss": 0.06, "step": 320 }, { "epoch": 0.32038834951456313, "grad_norm": 0.835841953754425, "learning_rate": 4.999680664797127e-05, "loss": 0.0607, "step": 330 }, { "epoch": 0.3300970873786408, "grad_norm": 0.6821876764297485, "learning_rate": 4.999422472672202e-05, "loss": 0.0554, "step": 340 }, { "epoch": 0.33980582524271846, "grad_norm": 0.9348113536834717, "learning_rate": 4.99908835488218e-05, "loss": 0.0629, "step": 350 }, { "epoch": 0.34951456310679613, "grad_norm": 0.5926679372787476, "learning_rate": 4.998678321576651e-05, "loss": 0.0506, "step": 360 }, { "epoch": 0.3592233009708738, "grad_norm": 1.0646535158157349, "learning_rate": 4.9981923852113145e-05, "loss": 0.0782, "step": 370 }, { "epoch": 0.36893203883495146, "grad_norm": 1.0149277448654175, "learning_rate": 4.997630560547597e-05, "loss": 0.0665, "step": 380 }, { "epoch": 0.3786407766990291, "grad_norm": 0.9700633883476257, "learning_rate": 4.996992864652204e-05, "loss": 0.067, "step": 390 }, { "epoch": 0.3883495145631068, "grad_norm": 1.1201013326644897, "learning_rate": 4.996279316896606e-05, "loss": 0.0656, "step": 400 }, { "epoch": 0.39805825242718446, "grad_norm": 0.7025376558303833, "learning_rate": 4.9954899389564455e-05, "loss": 0.0711, "step": 410 }, { "epoch": 0.4077669902912621, "grad_norm": 0.6064212918281555, "learning_rate": 4.9946247548108794e-05, "loss": 0.0572, "step": 420 }, { "epoch": 0.4174757281553398, "grad_norm": 0.5238557457923889, "learning_rate": 4.993683790741852e-05, "loss": 0.0633, "step": 430 }, { "epoch": 0.42718446601941745, "grad_norm": 0.599403977394104, "learning_rate": 4.992667075333296e-05, "loss": 0.0508, "step": 440 }, { "epoch": 0.4368932038834951, "grad_norm": 0.55897057056427, "learning_rate": 4.991574639470263e-05, "loss": 0.0539, "step": 450 }, { "epoch": 0.44660194174757284, "grad_norm": 0.628349244594574, "learning_rate": 4.990406516337987e-05, "loss": 0.0496, "step": 460 }, { "epoch": 0.4563106796116505, "grad_norm": 0.7520539164543152, "learning_rate": 4.989162741420876e-05, "loss": 0.0504, "step": 470 }, { "epoch": 0.46601941747572817, "grad_norm": 1.0347453355789185, "learning_rate": 4.9878433525014335e-05, "loss": 0.0646, "step": 480 }, { "epoch": 0.47572815533980584, "grad_norm": 1.04426109790802, "learning_rate": 4.9864483896591094e-05, "loss": 0.0679, "step": 490 }, { "epoch": 0.4854368932038835, "grad_norm": 0.9814009070396423, "learning_rate": 4.984977895269087e-05, "loss": 0.0607, "step": 500 }, { "epoch": 0.49514563106796117, "grad_norm": 0.9229623079299927, "learning_rate": 4.983431914000991e-05, "loss": 0.0667, "step": 510 }, { "epoch": 0.5048543689320388, "grad_norm": 1.0306048393249512, "learning_rate": 4.981810492817532e-05, "loss": 0.0658, "step": 520 }, { "epoch": 0.5145631067961165, "grad_norm": 0.9046841859817505, "learning_rate": 4.980113680973082e-05, "loss": 0.0746, "step": 530 }, { "epoch": 0.5242718446601942, "grad_norm": 0.6024751663208008, "learning_rate": 4.978341530012175e-05, "loss": 0.0525, "step": 540 }, { "epoch": 0.5339805825242718, "grad_norm": 1.0224015712738037, "learning_rate": 4.976494093767943e-05, "loss": 0.0725, "step": 550 }, { "epoch": 0.5436893203883495, "grad_norm": 1.0598809719085693, "learning_rate": 4.9745714283604803e-05, "loss": 0.0853, "step": 560 }, { "epoch": 0.5533980582524272, "grad_norm": 0.6167446970939636, "learning_rate": 4.972573592195139e-05, "loss": 0.0612, "step": 570 }, { "epoch": 0.5631067961165048, "grad_norm": 0.6139333844184875, "learning_rate": 4.970500645960756e-05, "loss": 0.0525, "step": 580 }, { "epoch": 0.5728155339805825, "grad_norm": 0.7935957908630371, "learning_rate": 4.968352652627806e-05, "loss": 0.0526, "step": 590 }, { "epoch": 0.5825242718446602, "grad_norm": 0.5704646706581116, "learning_rate": 4.966129677446492e-05, "loss": 0.0614, "step": 600 }, { "epoch": 0.5922330097087378, "grad_norm": 0.5081838369369507, "learning_rate": 4.9638317879447606e-05, "loss": 0.0549, "step": 610 }, { "epoch": 0.6019417475728155, "grad_norm": 0.5273293256759644, "learning_rate": 4.961459053926252e-05, "loss": 0.0523, "step": 620 }, { "epoch": 0.6116504854368932, "grad_norm": 0.423949271440506, "learning_rate": 4.9590115474681816e-05, "loss": 0.0571, "step": 630 }, { "epoch": 0.6213592233009708, "grad_norm": 0.8299949169158936, "learning_rate": 4.956489342919147e-05, "loss": 0.0548, "step": 640 }, { "epoch": 0.6310679611650486, "grad_norm": 0.8283652067184448, "learning_rate": 4.95389251689687e-05, "loss": 0.0495, "step": 650 }, { "epoch": 0.6407766990291263, "grad_norm": 0.828658938407898, "learning_rate": 4.9512211482858714e-05, "loss": 0.0642, "step": 660 }, { "epoch": 0.6504854368932039, "grad_norm": 0.7801154851913452, "learning_rate": 4.948475318235073e-05, "loss": 0.0467, "step": 670 }, { "epoch": 0.6601941747572816, "grad_norm": 0.7108743786811829, "learning_rate": 4.945655110155333e-05, "loss": 0.0624, "step": 680 }, { "epoch": 0.6699029126213593, "grad_norm": 0.5172573328018188, "learning_rate": 4.9427606097169117e-05, "loss": 0.0601, "step": 690 }, { "epoch": 0.6796116504854369, "grad_norm": 0.7694447636604309, "learning_rate": 4.939791904846869e-05, "loss": 0.0585, "step": 700 }, { "epoch": 0.6893203883495146, "grad_norm": 0.651713490486145, "learning_rate": 4.9367490857263944e-05, "loss": 0.0521, "step": 710 }, { "epoch": 0.6990291262135923, "grad_norm": 1.2775520086288452, "learning_rate": 4.9336322447880676e-05, "loss": 0.0687, "step": 720 }, { "epoch": 0.7087378640776699, "grad_norm": 0.6819589138031006, "learning_rate": 4.930441476713049e-05, "loss": 0.047, "step": 730 }, { "epoch": 0.7184466019417476, "grad_norm": 0.8026332855224609, "learning_rate": 4.927176878428206e-05, "loss": 0.0419, "step": 740 }, { "epoch": 0.7281553398058253, "grad_norm": 0.7379328608512878, "learning_rate": 4.923838549103166e-05, "loss": 0.0547, "step": 750 }, { "epoch": 0.7378640776699029, "grad_norm": 0.5019276738166809, "learning_rate": 4.920426590147304e-05, "loss": 0.0492, "step": 760 }, { "epoch": 0.7475728155339806, "grad_norm": 0.8244302272796631, "learning_rate": 4.916941105206666e-05, "loss": 0.0542, "step": 770 }, { "epoch": 0.7572815533980582, "grad_norm": 0.4618038535118103, "learning_rate": 4.9133822001608164e-05, "loss": 0.0676, "step": 780 }, { "epoch": 0.7669902912621359, "grad_norm": 0.5262795686721802, "learning_rate": 4.9097499831196216e-05, "loss": 0.0415, "step": 790 }, { "epoch": 0.7766990291262136, "grad_norm": 0.537287175655365, "learning_rate": 4.906044564419969e-05, "loss": 0.0511, "step": 800 }, { "epoch": 0.7864077669902912, "grad_norm": 0.9852665066719055, "learning_rate": 4.902266056622414e-05, "loss": 0.055, "step": 810 }, { "epoch": 0.7961165048543689, "grad_norm": 0.6320510506629944, "learning_rate": 4.8984145745077584e-05, "loss": 0.0554, "step": 820 }, { "epoch": 0.8058252427184466, "grad_norm": 0.6188237071037292, "learning_rate": 4.894490235073566e-05, "loss": 0.0783, "step": 830 }, { "epoch": 0.8155339805825242, "grad_norm": 0.6136628985404968, "learning_rate": 4.890493157530609e-05, "loss": 0.0532, "step": 840 }, { "epoch": 0.8252427184466019, "grad_norm": 0.6563002467155457, "learning_rate": 4.8864234632992457e-05, "loss": 0.0583, "step": 850 }, { "epoch": 0.8349514563106796, "grad_norm": 0.779821515083313, "learning_rate": 4.88228127600573e-05, "loss": 0.0608, "step": 860 }, { "epoch": 0.8446601941747572, "grad_norm": 0.3783950209617615, "learning_rate": 4.878066721478461e-05, "loss": 0.0599, "step": 870 }, { "epoch": 0.8543689320388349, "grad_norm": 0.7626412510871887, "learning_rate": 4.8737799277441566e-05, "loss": 0.0526, "step": 880 }, { "epoch": 0.8640776699029126, "grad_norm": 0.5785266160964966, "learning_rate": 4.869421025023965e-05, "loss": 0.0623, "step": 890 }, { "epoch": 0.8737864077669902, "grad_norm": 0.959281861782074, "learning_rate": 4.8649901457295096e-05, "loss": 0.0509, "step": 900 }, { "epoch": 0.883495145631068, "grad_norm": 0.6331967115402222, "learning_rate": 4.860487424458867e-05, "loss": 0.0498, "step": 910 }, { "epoch": 0.8932038834951457, "grad_norm": 0.7830513119697571, "learning_rate": 4.8559129979924787e-05, "loss": 0.0457, "step": 920 }, { "epoch": 0.9029126213592233, "grad_norm": 0.8294515013694763, "learning_rate": 4.8512670052889955e-05, "loss": 0.0528, "step": 930 }, { "epoch": 0.912621359223301, "grad_norm": 0.7075780034065247, "learning_rate": 4.846549587481052e-05, "loss": 0.0474, "step": 940 }, { "epoch": 0.9223300970873787, "grad_norm": 0.4232042729854584, "learning_rate": 4.841760887870988e-05, "loss": 0.042, "step": 950 }, { "epoch": 0.9320388349514563, "grad_norm": 0.6930996179580688, "learning_rate": 4.836901051926489e-05, "loss": 0.0616, "step": 960 }, { "epoch": 0.941747572815534, "grad_norm": 1.1047744750976562, "learning_rate": 4.831970227276171e-05, "loss": 0.0606, "step": 970 }, { "epoch": 0.9514563106796117, "grad_norm": 0.6250813603401184, "learning_rate": 4.82696856370509e-05, "loss": 0.0419, "step": 980 }, { "epoch": 0.9611650485436893, "grad_norm": 0.5649757385253906, "learning_rate": 4.8218962131502e-05, "loss": 0.0434, "step": 990 }, { "epoch": 0.970873786407767, "grad_norm": 0.5142524242401123, "learning_rate": 4.81675332969573e-05, "loss": 0.0455, "step": 1000 }, { "epoch": 0.9805825242718447, "grad_norm": 0.437963604927063, "learning_rate": 4.811540069568512e-05, "loss": 0.0477, "step": 1010 }, { "epoch": 0.9902912621359223, "grad_norm": 0.9865050315856934, "learning_rate": 4.8062565911332235e-05, "loss": 0.0636, "step": 1020 }, { "epoch": 1.0, "grad_norm": 0.4108821749687195, "learning_rate": 4.8009030548875896e-05, "loss": 0.0386, "step": 1030 }, { "epoch": 1.0097087378640777, "grad_norm": 0.40395405888557434, "learning_rate": 4.795479623457497e-05, "loss": 0.0406, "step": 1040 }, { "epoch": 1.0194174757281553, "grad_norm": 0.6007186770439148, "learning_rate": 4.789986461592061e-05, "loss": 0.05, "step": 1050 }, { "epoch": 1.029126213592233, "grad_norm": 0.7217828631401062, "learning_rate": 4.784423736158616e-05, "loss": 0.051, "step": 1060 }, { "epoch": 1.0388349514563107, "grad_norm": 0.6370289325714111, "learning_rate": 4.7787916161376515e-05, "loss": 0.0641, "step": 1070 }, { "epoch": 1.0485436893203883, "grad_norm": 0.6220316886901855, "learning_rate": 4.773090272617672e-05, "loss": 0.0624, "step": 1080 }, { "epoch": 1.058252427184466, "grad_norm": 0.7248903512954712, "learning_rate": 4.7673198787900063e-05, "loss": 0.0567, "step": 1090 }, { "epoch": 1.0679611650485437, "grad_norm": 0.7164710164070129, "learning_rate": 4.761480609943546e-05, "loss": 0.0424, "step": 1100 }, { "epoch": 1.0776699029126213, "grad_norm": 0.6365995407104492, "learning_rate": 4.755572643459414e-05, "loss": 0.062, "step": 1110 }, { "epoch": 1.087378640776699, "grad_norm": 0.5840019583702087, "learning_rate": 4.7495961588055836e-05, "loss": 0.0344, "step": 1120 }, { "epoch": 1.0970873786407767, "grad_norm": 0.6124367713928223, "learning_rate": 4.7435513375314253e-05, "loss": 0.0524, "step": 1130 }, { "epoch": 1.1067961165048543, "grad_norm": 0.6590617299079895, "learning_rate": 4.737438363262187e-05, "loss": 0.0615, "step": 1140 }, { "epoch": 1.116504854368932, "grad_norm": 0.6731277108192444, "learning_rate": 4.7312574216934225e-05, "loss": 0.0536, "step": 1150 }, { "epoch": 1.1262135922330097, "grad_norm": 0.4859127700328827, "learning_rate": 4.7250087005853446e-05, "loss": 0.0456, "step": 1160 }, { "epoch": 1.1359223300970873, "grad_norm": 0.48708751797676086, "learning_rate": 4.718692389757128e-05, "loss": 0.0499, "step": 1170 }, { "epoch": 1.145631067961165, "grad_norm": 0.7826804518699646, "learning_rate": 4.7123086810811356e-05, "loss": 0.0437, "step": 1180 }, { "epoch": 1.1553398058252426, "grad_norm": 0.550841748714447, "learning_rate": 4.705857768477098e-05, "loss": 0.0507, "step": 1190 }, { "epoch": 1.1650485436893203, "grad_norm": 0.7365569472312927, "learning_rate": 4.699339847906215e-05, "loss": 0.0624, "step": 1200 }, { "epoch": 1.174757281553398, "grad_norm": 1.0098744630813599, "learning_rate": 4.6927551173652075e-05, "loss": 0.0619, "step": 1210 }, { "epoch": 1.1844660194174756, "grad_norm": 0.738837718963623, "learning_rate": 4.6861037768803016e-05, "loss": 0.0503, "step": 1220 }, { "epoch": 1.1941747572815533, "grad_norm": 0.5268725752830505, "learning_rate": 4.679386028501156e-05, "loss": 0.0489, "step": 1230 }, { "epoch": 1.203883495145631, "grad_norm": 0.8031307458877563, "learning_rate": 4.672602076294714e-05, "loss": 0.0595, "step": 1240 }, { "epoch": 1.2135922330097086, "grad_norm": 0.6441630721092224, "learning_rate": 4.665752126339018e-05, "loss": 0.0425, "step": 1250 }, { "epoch": 1.2233009708737863, "grad_norm": 0.5169068574905396, "learning_rate": 4.658836386716938e-05, "loss": 0.0536, "step": 1260 }, { "epoch": 1.233009708737864, "grad_norm": 0.6084913015365601, "learning_rate": 4.65185506750986e-05, "loss": 0.0349, "step": 1270 }, { "epoch": 1.2427184466019416, "grad_norm": 0.7514814734458923, "learning_rate": 4.6448083807912934e-05, "loss": 0.053, "step": 1280 }, { "epoch": 1.2524271844660193, "grad_norm": 0.49506863951683044, "learning_rate": 4.637696540620441e-05, "loss": 0.0497, "step": 1290 }, { "epoch": 1.262135922330097, "grad_norm": 0.6199485659599304, "learning_rate": 4.630519763035687e-05, "loss": 0.0517, "step": 1300 }, { "epoch": 1.2718446601941746, "grad_norm": 0.6022672653198242, "learning_rate": 4.623278266048039e-05, "loss": 0.0439, "step": 1310 }, { "epoch": 1.2815533980582523, "grad_norm": 0.7462307810783386, "learning_rate": 4.6159722696345045e-05, "loss": 0.0454, "step": 1320 }, { "epoch": 1.29126213592233, "grad_norm": 0.40218812227249146, "learning_rate": 4.608601995731407e-05, "loss": 0.0326, "step": 1330 }, { "epoch": 1.3009708737864076, "grad_norm": 0.7571138739585876, "learning_rate": 4.601167668227648e-05, "loss": 0.0364, "step": 1340 }, { "epoch": 1.3106796116504853, "grad_norm": 0.77698814868927, "learning_rate": 4.593669512957901e-05, "loss": 0.058, "step": 1350 }, { "epoch": 1.3203883495145632, "grad_norm": 0.7244229316711426, "learning_rate": 4.586107757695755e-05, "loss": 0.056, "step": 1360 }, { "epoch": 1.3300970873786409, "grad_norm": 0.5994431972503662, "learning_rate": 4.578482632146793e-05, "loss": 0.038, "step": 1370 }, { "epoch": 1.3398058252427185, "grad_norm": 0.7894623279571533, "learning_rate": 4.570794367941616e-05, "loss": 0.0425, "step": 1380 }, { "epoch": 1.3495145631067962, "grad_norm": 0.7884004712104797, "learning_rate": 4.563043198628806e-05, "loss": 0.0495, "step": 1390 }, { "epoch": 1.3592233009708738, "grad_norm": 0.6530587673187256, "learning_rate": 4.5552293596678294e-05, "loss": 0.0414, "step": 1400 }, { "epoch": 1.3689320388349515, "grad_norm": 0.8399489521980286, "learning_rate": 4.5473530884218886e-05, "loss": 0.0493, "step": 1410 }, { "epoch": 1.3786407766990292, "grad_norm": 0.6782814860343933, "learning_rate": 4.539414624150708e-05, "loss": 0.0472, "step": 1420 }, { "epoch": 1.3883495145631068, "grad_norm": 0.6846131682395935, "learning_rate": 4.5314142080032696e-05, "loss": 0.0335, "step": 1430 }, { "epoch": 1.3980582524271845, "grad_norm": 0.4258232116699219, "learning_rate": 4.5233520830104805e-05, "loss": 0.0576, "step": 1440 }, { "epoch": 1.4077669902912622, "grad_norm": 0.6838470101356506, "learning_rate": 4.515228494077798e-05, "loss": 0.0606, "step": 1450 }, { "epoch": 1.4174757281553398, "grad_norm": 0.5958576798439026, "learning_rate": 4.5070436879777865e-05, "loss": 0.0444, "step": 1460 }, { "epoch": 1.4271844660194175, "grad_norm": 0.48623862862586975, "learning_rate": 4.4987979133426215e-05, "loss": 0.0449, "step": 1470 }, { "epoch": 1.4368932038834952, "grad_norm": 0.5873615741729736, "learning_rate": 4.490491420656537e-05, "loss": 0.0442, "step": 1480 }, { "epoch": 1.4466019417475728, "grad_norm": 0.4631931781768799, "learning_rate": 4.482124462248217e-05, "loss": 0.0423, "step": 1490 }, { "epoch": 1.4563106796116505, "grad_norm": 0.5900189280509949, "learning_rate": 4.473697292283129e-05, "loss": 0.0485, "step": 1500 }, { "epoch": 1.4660194174757282, "grad_norm": 0.36097192764282227, "learning_rate": 4.465210166755803e-05, "loss": 0.0364, "step": 1510 }, { "epoch": 1.4757281553398058, "grad_norm": 0.6405830383300781, "learning_rate": 4.456663343482059e-05, "loss": 0.0416, "step": 1520 }, { "epoch": 1.4854368932038835, "grad_norm": 0.5862817168235779, "learning_rate": 4.44805708209117e-05, "loss": 0.0475, "step": 1530 }, { "epoch": 1.4951456310679612, "grad_norm": 0.7803816199302673, "learning_rate": 4.4393916440179786e-05, "loss": 0.0472, "step": 1540 }, { "epoch": 1.5048543689320388, "grad_norm": 0.5544291138648987, "learning_rate": 4.430667292494955e-05, "loss": 0.0361, "step": 1550 }, { "epoch": 1.5145631067961165, "grad_norm": 0.5777389407157898, "learning_rate": 4.4218842925441966e-05, "loss": 0.0439, "step": 1560 }, { "epoch": 1.5242718446601942, "grad_norm": 0.6668662428855896, "learning_rate": 4.413042910969385e-05, "loss": 0.0312, "step": 1570 }, { "epoch": 1.5339805825242718, "grad_norm": 0.8618906736373901, "learning_rate": 4.404143416347675e-05, "loss": 0.0481, "step": 1580 }, { "epoch": 1.5436893203883495, "grad_norm": 0.5840690732002258, "learning_rate": 4.395186079021537e-05, "loss": 0.0485, "step": 1590 }, { "epoch": 1.5533980582524272, "grad_norm": 0.3441118001937866, "learning_rate": 4.386171171090547e-05, "loss": 0.0401, "step": 1600 }, { "epoch": 1.5631067961165048, "grad_norm": 0.27396291494369507, "learning_rate": 4.37709896640312e-05, "loss": 0.0379, "step": 1610 }, { "epoch": 1.5728155339805825, "grad_norm": 0.4788952171802521, "learning_rate": 4.367969740548189e-05, "loss": 0.0398, "step": 1620 }, { "epoch": 1.5825242718446602, "grad_norm": 0.6418083906173706, "learning_rate": 4.358783770846836e-05, "loss": 0.0657, "step": 1630 }, { "epoch": 1.5922330097087378, "grad_norm": 0.6678286790847778, "learning_rate": 4.349541336343867e-05, "loss": 0.0466, "step": 1640 }, { "epoch": 1.6019417475728155, "grad_norm": 1.0702563524246216, "learning_rate": 4.3402427177993366e-05, "loss": 0.0452, "step": 1650 }, { "epoch": 1.6116504854368932, "grad_norm": 0.6941045522689819, "learning_rate": 4.3308881976800146e-05, "loss": 0.0521, "step": 1660 }, { "epoch": 1.6213592233009708, "grad_norm": 0.5572572350502014, "learning_rate": 4.321478060150813e-05, "loss": 0.0336, "step": 1670 }, { "epoch": 1.6310679611650487, "grad_norm": 0.5476228594779968, "learning_rate": 4.312012591066146e-05, "loss": 0.0375, "step": 1680 }, { "epoch": 1.6407766990291264, "grad_norm": 0.6187303066253662, "learning_rate": 4.302492077961253e-05, "loss": 0.041, "step": 1690 }, { "epoch": 1.650485436893204, "grad_norm": 0.8198221921920776, "learning_rate": 4.292916810043459e-05, "loss": 0.0463, "step": 1700 }, { "epoch": 1.6601941747572817, "grad_norm": 0.6056732535362244, "learning_rate": 4.283287078183392e-05, "loss": 0.0494, "step": 1710 }, { "epoch": 1.6699029126213594, "grad_norm": 0.6936320662498474, "learning_rate": 4.273603174906149e-05, "loss": 0.0413, "step": 1720 }, { "epoch": 1.679611650485437, "grad_norm": 0.6517147421836853, "learning_rate": 4.2638653943824026e-05, "loss": 0.0462, "step": 1730 }, { "epoch": 1.6893203883495147, "grad_norm": 0.5960354804992676, "learning_rate": 4.254074032419474e-05, "loss": 0.0403, "step": 1740 }, { "epoch": 1.6990291262135924, "grad_norm": 0.5907045602798462, "learning_rate": 4.244229386452342e-05, "loss": 0.0378, "step": 1750 }, { "epoch": 1.70873786407767, "grad_norm": 0.5899525284767151, "learning_rate": 4.2343317555346084e-05, "loss": 0.0457, "step": 1760 }, { "epoch": 1.7184466019417477, "grad_norm": 0.4403168261051178, "learning_rate": 4.2243814403294126e-05, "loss": 0.0317, "step": 1770 }, { "epoch": 1.7281553398058254, "grad_norm": 0.4933370351791382, "learning_rate": 4.214378743100302e-05, "loss": 0.0497, "step": 1780 }, { "epoch": 1.737864077669903, "grad_norm": 0.6774694919586182, "learning_rate": 4.204323967702045e-05, "loss": 0.0383, "step": 1790 }, { "epoch": 1.7475728155339807, "grad_norm": 0.5610571503639221, "learning_rate": 4.1942174195714066e-05, "loss": 0.0469, "step": 1800 }, { "epoch": 1.7572815533980584, "grad_norm": 0.8438130617141724, "learning_rate": 4.184059405717863e-05, "loss": 0.0478, "step": 1810 }, { "epoch": 1.766990291262136, "grad_norm": 0.8638678789138794, "learning_rate": 4.173850234714282e-05, "loss": 0.0489, "step": 1820 }, { "epoch": 1.7766990291262137, "grad_norm": 0.8042917251586914, "learning_rate": 4.1635902166875456e-05, "loss": 0.0527, "step": 1830 }, { "epoch": 1.7864077669902914, "grad_norm": 1.2712494134902954, "learning_rate": 4.1532796633091296e-05, "loss": 0.0512, "step": 1840 }, { "epoch": 1.796116504854369, "grad_norm": 0.5284692645072937, "learning_rate": 4.142918887785638e-05, "loss": 0.061, "step": 1850 }, { "epoch": 1.8058252427184467, "grad_norm": 0.6059673428535461, "learning_rate": 4.1325082048492866e-05, "loss": 0.0513, "step": 1860 }, { "epoch": 1.8155339805825244, "grad_norm": 0.6848878860473633, "learning_rate": 4.122047930748343e-05, "loss": 0.0395, "step": 1870 }, { "epoch": 1.825242718446602, "grad_norm": 0.8983065485954285, "learning_rate": 4.1115383832375174e-05, "loss": 0.0334, "step": 1880 }, { "epoch": 1.8349514563106797, "grad_norm": 0.5672741532325745, "learning_rate": 4.100979881568316e-05, "loss": 0.0384, "step": 1890 }, { "epoch": 1.8446601941747574, "grad_norm": 0.3233070969581604, "learning_rate": 4.090372746479337e-05, "loss": 0.0416, "step": 1900 }, { "epoch": 1.854368932038835, "grad_norm": 0.6278699040412903, "learning_rate": 4.0797173001865305e-05, "loss": 0.0528, "step": 1910 }, { "epoch": 1.8640776699029127, "grad_norm": 0.7250891923904419, "learning_rate": 4.069013866373409e-05, "loss": 0.0361, "step": 1920 }, { "epoch": 1.8737864077669903, "grad_norm": 1.1977667808532715, "learning_rate": 4.058262770181217e-05, "loss": 0.0455, "step": 1930 }, { "epoch": 1.883495145631068, "grad_norm": 0.7614680528640747, "learning_rate": 4.0474643381990505e-05, "loss": 0.0453, "step": 1940 }, { "epoch": 1.8932038834951457, "grad_norm": 0.7502397298812866, "learning_rate": 4.036618898453941e-05, "loss": 0.0366, "step": 1950 }, { "epoch": 1.9029126213592233, "grad_norm": 1.1674546003341675, "learning_rate": 4.025726780400886e-05, "loss": 0.0536, "step": 1960 }, { "epoch": 1.912621359223301, "grad_norm": 0.6499914526939392, "learning_rate": 4.0147883149128433e-05, "loss": 0.0355, "step": 1970 }, { "epoch": 1.9223300970873787, "grad_norm": 0.5907668471336365, "learning_rate": 4.003803834270681e-05, "loss": 0.0436, "step": 1980 }, { "epoch": 1.9320388349514563, "grad_norm": 0.6319796442985535, "learning_rate": 3.9927736721530805e-05, "loss": 0.0356, "step": 1990 }, { "epoch": 1.941747572815534, "grad_norm": 0.49555259943008423, "learning_rate": 3.981698163626406e-05, "loss": 0.0394, "step": 2000 } ], "logging_steps": 10, "max_steps": 6000, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }